path: root/net
Diffstat (limited to 'net')
-rw-r--r--  net/8021q/vlan.c | 2
-rw-r--r--  net/9p/trans_virtio.c | 4
-rw-r--r--  net/bluetooth/af_bluetooth.c | 2
-rw-r--r--  net/bluetooth/hci_request.c | 2
-rw-r--r--  net/bluetooth/hci_sock.c | 2
-rw-r--r--  net/bluetooth/l2cap_core.c | 8
-rw-r--r--  net/bluetooth/l2cap_sock.c | 14
-rw-r--r--  net/bridge/br_fdb.c | 52
-rw-r--r--  net/ceph/Makefile | 2
-rw-r--r--  net/ceph/ceph_common.c | 2
-rw-r--r--  net/ceph/ceph_fs.c | 30
-rw-r--r--  net/ceph/debugfs.c | 12
-rw-r--r--  net/ceph/mon_client.c | 6
-rw-r--r--  net/ceph/msgpool.c | 1
-rw-r--r--  net/ceph/osd_client.c | 51
-rw-r--r--  net/ceph/osdmap.c | 58
-rw-r--r--  net/ceph/string_table.c | 105
-rw-r--r--  net/core/dev.c | 10
-rw-r--r--  net/core/filter.c | 109
-rw-r--r--  net/ipv4/fib_trie.c | 12
-rw-r--r--  net/ipv4/ip_gre.c | 1
-rw-r--r--  net/ipv4/ip_tunnel_core.c | 8
-rw-r--r--  net/ipv4/ip_vti.c | 31
-rw-r--r--  net/ipv4/tcp.c | 2
-rw-r--r--  net/ipv4/tcp_diag.c | 7
-rw-r--r--  net/ipv4/tcp_ipv4.c | 8
-rw-r--r--  net/ipv4/udp.c | 13
-rw-r--r--  net/ipv4/udplite.c | 1
-rw-r--r--  net/ipv6/addrconf.c | 38
-rw-r--r--  net/ipv6/calipso.c | 4
-rw-r--r--  net/ipv6/ip6_gre.c | 2
-rw-r--r--  net/ipv6/ping.c | 33
-rw-r--r--  net/ipv6/tcp_ipv6.c | 8
-rw-r--r--  net/ipv6/udp.c | 1
-rw-r--r--  net/ipv6/udplite.c | 1
-rw-r--r--  net/irda/iriap.c | 8
-rw-r--r--  net/l2tp/l2tp_ppp.c | 2
-rw-r--r--  net/netfilter/nf_conntrack_expect.c | 2
-rw-r--r--  net/netfilter/nf_conntrack_h323_main.c | 3
-rw-r--r--  net/netfilter/nf_conntrack_netlink.c | 10
-rw-r--r--  net/netfilter/nf_conntrack_sip.c | 4
-rw-r--r--  net/netfilter/nf_conntrack_standalone.c | 4
-rw-r--r--  net/netfilter/nfnetlink_acct.c | 17
-rw-r--r--  net/netfilter/nfnetlink_cttimeout.c | 16
-rw-r--r--  net/netfilter/nfnetlink_log.c | 1
-rw-r--r--  net/netfilter/nfnetlink_queue.c | 6
-rw-r--r--  net/netfilter/nft_exthdr.c | 11
-rw-r--r--  net/netfilter/nft_rbtree.c | 10
-rw-r--r--  net/netfilter/xt_TPROXY.c | 4
-rw-r--r--  net/netfilter/xt_nfacct.c | 2
-rw-r--r--  net/openvswitch/conntrack.c | 8
-rw-r--r--  net/openvswitch/vport-geneve.c | 9
-rw-r--r--  net/openvswitch/vport-gre.c | 11
-rw-r--r--  net/openvswitch/vport-internal_dev.c | 2
-rw-r--r--  net/openvswitch/vport-vxlan.c | 9
-rw-r--r--  net/rxrpc/ar-internal.h | 1
-rw-r--r--  net/rxrpc/call_accept.c | 1
-rw-r--r--  net/rxrpc/call_event.c | 7
-rw-r--r--  net/rxrpc/call_object.c | 11
-rw-r--r--  net/rxrpc/input.c | 39
-rw-r--r--  net/rxrpc/recvmsg.c | 25
-rw-r--r--  net/rxrpc/skbuff.c | 41
-rw-r--r--  net/sched/act_api.c | 34
-rw-r--r--  net/sched/act_ife.c | 18
-rw-r--r--  net/sched/act_police.c | 62
-rw-r--r--  net/sched/cls_api.c | 51
-rw-r--r--  net/sched/sch_generic.c | 9
-rw-r--r--  net/sctp/input.c | 11
-rw-r--r--  net/sctp/inqueue.c | 13
-rw-r--r--  net/sctp/proc.c | 1
-rw-r--r--  net/sctp/sctp_diag.c | 24
-rw-r--r--  net/sctp/ulpevent.c | 4
-rw-r--r--  net/sunrpc/auth.c | 8
-rw-r--r--  net/sunrpc/auth_generic.c | 9
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c | 11
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_mech.c | 2
-rw-r--r--  net/sunrpc/auth_gss/gss_mech_switch.c | 12
-rw-r--r--  net/sunrpc/auth_gss/svcauth_gss.c | 5
-rw-r--r--  net/sunrpc/auth_null.c | 1
-rw-r--r--  net/sunrpc/auth_unix.c | 1
-rw-r--r--  net/sunrpc/cache.c | 2
-rw-r--r--  net/sunrpc/clnt.c | 26
-rw-r--r--  net/sunrpc/sched.c | 67
-rw-r--r--  net/sunrpc/svc.c | 8
-rw-r--r--  net/sunrpc/svc_xprt.c | 51
-rw-r--r--  net/sunrpc/svcsock.c | 150
-rw-r--r--  net/sunrpc/xprt.c | 40
-rw-r--r--  net/sunrpc/xprtmultipath.c | 8
-rw-r--r--  net/sunrpc/xprtrdma/Makefile | 2
-rw-r--r--  net/sunrpc/xprtrdma/fmr_ops.c | 378
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c | 369
-rw-r--r--  net/sunrpc/xprtrdma/physical_ops.c | 122
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c | 274
-rw-r--r--  net/sunrpc/xprtrdma/transport.c | 40
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c | 242
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h | 118
-rw-r--r--  net/sunrpc/xprtsock.c | 181
-rw-r--r--  net/tipc/monitor.c | 3
-rw-r--r--  net/tipc/socket.c | 3
-rw-r--r--  net/tipc/udp_media.c | 5
-rw-r--r--  net/vmw_vsock/Kconfig | 20
-rw-r--r--  net/vmw_vsock/Makefile | 6
-rw-r--r--  net/vmw_vsock/af_vsock.c | 25
-rw-r--r--  net/vmw_vsock/virtio_transport.c | 620
-rw-r--r--  net/vmw_vsock/virtio_transport_common.c | 992
-rw-r--r--  net/vmw_vsock/vmci_transport.c | 2
-rw-r--r--  net/wireless/chan.c | 2
107 files changed, 3418 insertions, 1520 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 82a116ba590e..8de138d3306b 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -169,7 +169,7 @@ int register_vlan_dev(struct net_device *dev)
if (err < 0)
goto out_uninit_mvrp;
- vlan->nest_level = dev_get_nest_level(real_dev, is_vlan_dev) + 1;
+ vlan->nest_level = dev_get_nest_level(real_dev) + 1;
err = register_netdevice(dev);
if (err < 0)
goto out_uninit_mvrp;
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 4acb1d5417aa..f24b25c25106 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -507,8 +507,8 @@ err_out:
/* wakeup anybody waiting for slots to pin pages */
wake_up(&vp_wq);
}
- kfree(in_pages);
- kfree(out_pages);
+ kvfree(in_pages);
+ kvfree(out_pages);
return err;
}
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index ece45e0683fd..0b5f729d08d2 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -250,7 +250,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
skb_free_datagram(sk, skb);
- if (msg->msg_flags & MSG_TRUNC)
+ if (flags & MSG_TRUNC)
copied = skblen;
return err ? : copied;
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index c045b3c54768..b0e23dfc5c34 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -262,6 +262,8 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
break;
}
+ kfree_skb(hdev->req_skb);
+ hdev->req_skb = NULL;
hdev->req_status = hdev->req_result = 0;
BT_DBG("%s end: err %d", hdev->name, err);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 6ef8a01a9ad4..96f04b7b9556 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -1091,7 +1091,7 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg,
skb_free_datagram(sk, skb);
- if (msg->msg_flags & MSG_TRUNC)
+ if (flags & MSG_TRUNC)
copied = skblen;
return err ? : copied;
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 54ceb1f2cc9a..d4cad29b033f 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -32,6 +32,7 @@
#include <linux/debugfs.h>
#include <linux/crc16.h>
+#include <linux/filter.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
@@ -5835,6 +5836,9 @@ static int l2cap_reassemble_sdu(struct l2cap_chan *chan, struct sk_buff *skb,
if (chan->sdu)
break;
+ if (!pskb_may_pull(skb, L2CAP_SDULEN_SIZE))
+ break;
+
chan->sdu_len = get_unaligned_le16(skb->data);
skb_pull(skb, L2CAP_SDULEN_SIZE);
@@ -6610,6 +6614,10 @@ static int l2cap_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
goto drop;
}
+ if ((chan->mode == L2CAP_MODE_ERTM ||
+ chan->mode == L2CAP_MODE_STREAMING) && sk_filter(chan->data, skb))
+ goto drop;
+
if (!control->sframe) {
int err;
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 1842141baedb..a8ba752732c9 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1019,7 +1019,7 @@ static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg,
goto done;
if (pi->rx_busy_skb) {
- if (!sock_queue_rcv_skb(sk, pi->rx_busy_skb))
+ if (!__sock_queue_rcv_skb(sk, pi->rx_busy_skb))
pi->rx_busy_skb = NULL;
else
goto done;
@@ -1270,7 +1270,17 @@ static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
goto done;
}
- err = sock_queue_rcv_skb(sk, skb);
+ if (chan->mode != L2CAP_MODE_ERTM &&
+ chan->mode != L2CAP_MODE_STREAMING) {
+ /* Even if no filter is attached, we could potentially
+ * get errors from security modules, etc.
+ */
+ err = sk_filter(sk, skb);
+ if (err)
+ goto done;
+ }
+
+ err = __sock_queue_rcv_skb(sk, skb);
/* For ERTM, handle one skb that doesn't fit into the recv
* buffer. This is important to do because the data frames
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index c18080ad4085..cd620fab41b0 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -267,7 +267,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
/* If old entry was unassociated with any port, then delete it. */
f = __br_fdb_get(br, br->dev->dev_addr, 0);
- if (f && f->is_local && !f->dst)
+ if (f && f->is_local && !f->dst && !f->added_by_user)
fdb_delete_local(br, NULL, f);
fdb_insert(br, NULL, newaddr, 0);
@@ -282,7 +282,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
if (!br_vlan_should_use(v))
continue;
f = __br_fdb_get(br, br->dev->dev_addr, v->vid);
- if (f && f->is_local && !f->dst)
+ if (f && f->is_local && !f->dst && !f->added_by_user)
fdb_delete_local(br, NULL, f);
fdb_insert(br, NULL, newaddr, v->vid);
}
@@ -764,20 +764,25 @@ out:
}
/* Update (create or replace) forwarding database entry */
-static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr,
- __u16 state, __u16 flags, __u16 vid)
+static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
+ const __u8 *addr, __u16 state, __u16 flags, __u16 vid)
{
- struct net_bridge *br = source->br;
struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
struct net_bridge_fdb_entry *fdb;
bool modified = false;
/* If the port cannot learn allow only local and static entries */
- if (!(state & NUD_PERMANENT) && !(state & NUD_NOARP) &&
+ if (source && !(state & NUD_PERMANENT) && !(state & NUD_NOARP) &&
!(source->state == BR_STATE_LEARNING ||
source->state == BR_STATE_FORWARDING))
return -EPERM;
+ if (!source && !(state & NUD_PERMANENT)) {
+ pr_info("bridge: RTM_NEWNEIGH %s without NUD_PERMANENT\n",
+ br->dev->name);
+ return -EINVAL;
+ }
+
fdb = fdb_find(head, addr, vid);
if (fdb == NULL) {
if (!(flags & NLM_F_CREATE))
@@ -832,22 +837,28 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr,
return 0;
}
-static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge_port *p,
- const unsigned char *addr, u16 nlh_flags, u16 vid)
+static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
+ struct net_bridge_port *p, const unsigned char *addr,
+ u16 nlh_flags, u16 vid)
{
int err = 0;
if (ndm->ndm_flags & NTF_USE) {
+ if (!p) {
+ pr_info("bridge: RTM_NEWNEIGH %s with NTF_USE is not supported\n",
+ br->dev->name);
+ return -EINVAL;
+ }
local_bh_disable();
rcu_read_lock();
- br_fdb_update(p->br, p, addr, vid, true);
+ br_fdb_update(br, p, addr, vid, true);
rcu_read_unlock();
local_bh_enable();
} else {
- spin_lock_bh(&p->br->hash_lock);
- err = fdb_add_entry(p, addr, ndm->ndm_state,
+ spin_lock_bh(&br->hash_lock);
+ err = fdb_add_entry(br, p, addr, ndm->ndm_state,
nlh_flags, vid);
- spin_unlock_bh(&p->br->hash_lock);
+ spin_unlock_bh(&br->hash_lock);
}
return err;
@@ -884,6 +895,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
dev->name);
return -EINVAL;
}
+ br = p->br;
vg = nbp_vlan_group(p);
}
@@ -895,15 +907,9 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
}
/* VID was specified, so use it. */
- if (dev->priv_flags & IFF_EBRIDGE)
- err = br_fdb_insert(br, NULL, addr, vid);
- else
- err = __br_fdb_add(ndm, p, addr, nlh_flags, vid);
+ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid);
} else {
- if (dev->priv_flags & IFF_EBRIDGE)
- err = br_fdb_insert(br, NULL, addr, 0);
- else
- err = __br_fdb_add(ndm, p, addr, nlh_flags, 0);
+ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0);
if (err || !vg || !vg->num_vlans)
goto out;
@@ -914,11 +920,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
list_for_each_entry(v, &vg->vlan_list, vlist) {
if (!br_vlan_should_use(v))
continue;
- if (dev->priv_flags & IFF_EBRIDGE)
- err = br_fdb_insert(br, NULL, addr, v->vid);
- else
- err = __br_fdb_add(ndm, p, addr, nlh_flags,
- v->vid);
+ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid);
if (err)
goto out;
}
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index 958d9856912c..84cbed630c4b 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
crypto.o armor.o \
auth_x.o \
ceph_fs.o ceph_strings.o ceph_hash.o \
- pagevec.o snapshot.o
+ pagevec.o snapshot.o string_table.o
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 55d2bfee16d7..bddfcf6f09c2 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -747,6 +747,8 @@ out:
static void __exit exit_ceph_lib(void)
{
dout("exit_ceph_lib\n");
+ WARN_ON(!ceph_strings_empty());
+
ceph_osdc_cleanup();
ceph_msgr_exit();
ceph_crypto_shutdown();
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
index 41466ccb972a..7d54e944de5e 100644
--- a/net/ceph/ceph_fs.c
+++ b/net/ceph/ceph_fs.c
@@ -9,9 +9,9 @@
*/
int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
{
- __u32 su = le32_to_cpu(layout->fl_stripe_unit);
- __u32 sc = le32_to_cpu(layout->fl_stripe_count);
- __u32 os = le32_to_cpu(layout->fl_object_size);
+ __u32 su = layout->stripe_unit;
+ __u32 sc = layout->stripe_count;
+ __u32 os = layout->object_size;
/* stripe unit, object size must be non-zero, 64k increment */
if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
@@ -27,6 +27,30 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
return 1;
}
+void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy)
+{
+ fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit);
+ fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count);
+ fl->object_size = le32_to_cpu(legacy->fl_object_size);
+ fl->pool_id = le32_to_cpu(legacy->fl_pg_pool);
+ if (fl->pool_id == 0)
+ fl->pool_id = -1;
+}
+EXPORT_SYMBOL(ceph_file_layout_from_legacy);
+
+void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy)
+{
+ legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit);
+ legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count);
+ legacy->fl_object_size = cpu_to_le32(fl->object_size);
+ if (fl->pool_id >= 0)
+ legacy->fl_pg_pool = cpu_to_le32(fl->pool_id);
+ else
+ legacy->fl_pg_pool = 0;
+}
+EXPORT_SYMBOL(ceph_file_layout_to_legacy);
int ceph_flags_to_mode(int flags)
{
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index e77b04ca7802..c62b2b029a6e 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -156,8 +156,16 @@ static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
seq_printf(s, "]/%d\t[", t->up.primary);
for (i = 0; i < t->acting.size; i++)
seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
- seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
- t->target_oid.name_len, t->target_oid.name, t->flags);
+ seq_printf(s, "]/%d\t", t->acting.primary);
+ if (t->target_oloc.pool_ns) {
+ seq_printf(s, "%*pE/%*pE\t0x%x",
+ (int)t->target_oloc.pool_ns->len,
+ t->target_oloc.pool_ns->str,
+ t->target_oid.name_len, t->target_oid.name, t->flags);
+ } else {
+ seq_printf(s, "%*pE\t0x%x", t->target_oid.name_len,
+ t->target_oid.name, t->flags);
+ }
if (t->paused)
seq_puts(s, "\tP");
}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 37c38a7fb5c5..ef34a02719d7 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -227,9 +227,10 @@ static void __schedule_delayed(struct ceph_mon_client *monc)
}
const char *ceph_sub_str[] = {
- [CEPH_SUB_MDSMAP] = "mdsmap",
[CEPH_SUB_MONMAP] = "monmap",
[CEPH_SUB_OSDMAP] = "osdmap",
+ [CEPH_SUB_FSMAP] = "fsmap.user",
+ [CEPH_SUB_MDSMAP] = "mdsmap",
};
/*
@@ -573,7 +574,7 @@ static void complete_generic_request(struct ceph_mon_generic_request *req)
put_generic_request(req);
}
-void cancel_generic_request(struct ceph_mon_generic_request *req)
+static void cancel_generic_request(struct ceph_mon_generic_request *req)
{
struct ceph_mon_client *monc = req->monc;
struct ceph_mon_generic_request *lookup_req;
@@ -1193,6 +1194,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
case CEPH_MSG_MON_MAP:
case CEPH_MSG_MDS_MAP:
case CEPH_MSG_OSD_MAP:
+ case CEPH_MSG_FS_MAP_USER:
m = ceph_msg_new(type, front_len, GFP_NOFS, false);
if (!m)
return NULL; /* ENOMEM--return skip == 0 */
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
index ddec1c10ac80..aaed59a47b1d 100644
--- a/net/ceph/msgpool.c
+++ b/net/ceph/msgpool.c
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/vmalloc.h>
+#include <linux/ceph/messenger.h>
#include <linux/ceph/msgpool.h>
static void *msgpool_alloc(gfp_t gfp_mask, void *arg)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 89469592076c..a97e7b506612 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -387,7 +387,9 @@ static void target_copy(struct ceph_osd_request_target *dest,
static void target_destroy(struct ceph_osd_request_target *t)
{
ceph_oid_destroy(&t->base_oid);
+ ceph_oloc_destroy(&t->base_oloc);
ceph_oid_destroy(&t->target_oid);
+ ceph_oloc_destroy(&t->target_oloc);
}
/*
@@ -533,6 +535,11 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
}
EXPORT_SYMBOL(ceph_osdc_alloc_request);
+static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc)
+{
+ return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
+}
+
int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
{
struct ceph_osd_client *osdc = req->r_osdc;
@@ -540,11 +547,13 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
int msg_size;
WARN_ON(ceph_oid_empty(&req->r_base_oid));
+ WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
/* create request message */
msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
- msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
+ msg_size += CEPH_ENCODING_START_BLK_LEN +
+ ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
msg_size += 1 + 8 + 4 + 4; /* pgid */
msg_size += 4 + req->r_base_oid.name_len; /* oid */
msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
@@ -932,7 +941,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
osd_req_op_init(req, which, opcode, 0);
} else {
- u32 object_size = le32_to_cpu(layout->fl_object_size);
+ u32 object_size = layout->object_size;
u32 object_base = off - objoff;
if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
if (truncate_size <= object_base) {
@@ -948,7 +957,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
}
req->r_flags = flags;
- req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
+ req->r_base_oloc.pool = layout->pool_id;
+ req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
req->r_snapid = vino.snap;
@@ -1489,12 +1499,16 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
p += sizeof(req->r_replay_version);
/* oloc */
- ceph_encode_8(&p, 4);
- ceph_encode_8(&p, 4);
- ceph_encode_32(&p, 8 + 4 + 4);
+ ceph_start_encoding(&p, 5, 4,
+ ceph_oloc_encoding_size(&req->r_t.target_oloc));
ceph_encode_64(&p, req->r_t.target_oloc.pool);
ceph_encode_32(&p, -1); /* preferred */
ceph_encode_32(&p, 0); /* key len */
+ if (req->r_t.target_oloc.pool_ns)
+ ceph_encode_string(&p, end, req->r_t.target_oloc.pool_ns->str,
+ req->r_t.target_oloc.pool_ns->len);
+ else
+ ceph_encode_32(&p, 0);
/* pgid */
ceph_encode_8(&p, 1);
@@ -2594,9 +2608,22 @@ static int ceph_oloc_decode(void **p, void *end,
}
if (struct_v >= 5) {
+ bool changed = false;
+
len = ceph_decode_32(p);
if (len > 0) {
- pr_warn("ceph_object_locator::nspace is set\n");
+ ceph_decode_need(p, end, len, e_inval);
+ if (!oloc->pool_ns ||
+ ceph_compare_string(oloc->pool_ns, *p, len))
+ changed = true;
+ *p += len;
+ } else {
+ if (oloc->pool_ns)
+ changed = true;
+ }
+ if (changed) {
+ /* redirect changes namespace */
+ pr_warn("ceph_object_locator::nspace is changed\n");
goto e_inval;
}
}
@@ -2806,7 +2833,9 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
goto out_unlock_session;
}
+ m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns;
ret = decode_MOSDOpReply(msg, &m);
+ m.redirect.oloc.pool_ns = NULL;
if (ret) {
pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
req->r_tid, ret);
@@ -2835,7 +2864,11 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
unlink_request(osd, req);
mutex_unlock(&osd->lock);
- ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
+ /*
+ * Not ceph_oloc_copy() - changing pool_ns is not
+ * supported.
+ */
+ req->r_t.target_oloc.pool = m.redirect.oloc.pool;
req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
req->r_tid = 0;
__submit_request(req, false);
@@ -4187,7 +4220,7 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
GFP_NOIO);
- if (!pages) {
+ if (IS_ERR(pages)) {
ceph_msg_put(m);
return NULL;
}
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 7e480bf75bcf..d2436880b305 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1510,6 +1510,24 @@ bad:
return ERR_PTR(err);
}
+void ceph_oloc_copy(struct ceph_object_locator *dest,
+ const struct ceph_object_locator *src)
+{
+ WARN_ON(!ceph_oloc_empty(dest));
+ WARN_ON(dest->pool_ns); /* empty() only covers ->pool */
+
+ dest->pool = src->pool;
+ if (src->pool_ns)
+ dest->pool_ns = ceph_get_string(src->pool_ns);
+}
+EXPORT_SYMBOL(ceph_oloc_copy);
+
+void ceph_oloc_destroy(struct ceph_object_locator *oloc)
+{
+ ceph_put_string(oloc->pool_ns);
+}
+EXPORT_SYMBOL(ceph_oloc_destroy);
+
void ceph_oid_copy(struct ceph_object_id *dest,
const struct ceph_object_id *src)
{
@@ -1770,9 +1788,9 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
u64 *ono,
u64 *oxoff, u64 *oxlen)
{
- u32 osize = le32_to_cpu(layout->fl_object_size);
- u32 su = le32_to_cpu(layout->fl_stripe_unit);
- u32 sc = le32_to_cpu(layout->fl_stripe_count);
+ u32 osize = layout->object_size;
+ u32 su = layout->stripe_unit;
+ u32 sc = layout->stripe_count;
u32 bl, stripeno, stripepos, objsetno;
u32 su_per_object;
u64 t, su_offset;
@@ -1844,12 +1862,34 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
if (!pi)
return -ENOENT;
- raw_pgid->pool = oloc->pool;
- raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
- oid->name_len);
-
- dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
- raw_pgid->pool, raw_pgid->seed);
+ if (!oloc->pool_ns) {
+ raw_pgid->pool = oloc->pool;
+ raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
+ oid->name_len);
+ dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
+ raw_pgid->pool, raw_pgid->seed);
+ } else {
+ char stack_buf[256];
+ char *buf = stack_buf;
+ int nsl = oloc->pool_ns->len;
+ size_t total = nsl + 1 + oid->name_len;
+
+ if (total > sizeof(stack_buf)) {
+ buf = kmalloc(total, GFP_NOIO);
+ if (!buf)
+ return -ENOMEM;
+ }
+ memcpy(buf, oloc->pool_ns->str, nsl);
+ buf[nsl] = '\037';
+ memcpy(buf + nsl + 1, oid->name, oid->name_len);
+ raw_pgid->pool = oloc->pool;
+ raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total);
+ if (buf != stack_buf)
+ kfree(buf);
+ dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__,
+ oid->name, nsl, oloc->pool_ns->str,
+ raw_pgid->pool, raw_pgid->seed);
+ }
return 0;
}
EXPORT_SYMBOL(ceph_object_locator_to_pg);
diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c
new file mode 100644
index 000000000000..22fb96efcf34
--- /dev/null
+++ b/net/ceph/string_table.c
@@ -0,0 +1,105 @@
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/ceph/string_table.h>
+
+static DEFINE_SPINLOCK(string_tree_lock);
+static struct rb_root string_tree = RB_ROOT;
+
+struct ceph_string *ceph_find_or_create_string(const char* str, size_t len)
+{
+ struct ceph_string *cs, *exist;
+ struct rb_node **p, *parent;
+ int ret;
+
+ exist = NULL;
+ spin_lock(&string_tree_lock);
+ p = &string_tree.rb_node;
+ while (*p) {
+ exist = rb_entry(*p, struct ceph_string, node);
+ ret = ceph_compare_string(exist, str, len);
+ if (ret > 0)
+ p = &(*p)->rb_left;
+ else if (ret < 0)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ if (exist && !kref_get_unless_zero(&exist->kref)) {
+ rb_erase(&exist->node, &string_tree);
+ RB_CLEAR_NODE(&exist->node);
+ exist = NULL;
+ }
+ spin_unlock(&string_tree_lock);
+ if (exist)
+ return exist;
+
+ cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS);
+ if (!cs)
+ return NULL;
+
+ kref_init(&cs->kref);
+ cs->len = len;
+ memcpy(cs->str, str, len);
+ cs->str[len] = 0;
+
+retry:
+ exist = NULL;
+ parent = NULL;
+ p = &string_tree.rb_node;
+ spin_lock(&string_tree_lock);
+ while (*p) {
+ parent = *p;
+ exist = rb_entry(*p, struct ceph_string, node);
+ ret = ceph_compare_string(exist, str, len);
+ if (ret > 0)
+ p = &(*p)->rb_left;
+ else if (ret < 0)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ ret = 0;
+ if (!exist) {
+ rb_link_node(&cs->node, parent, p);
+ rb_insert_color(&cs->node, &string_tree);
+ } else if (!kref_get_unless_zero(&exist->kref)) {
+ rb_erase(&exist->node, &string_tree);
+ RB_CLEAR_NODE(&exist->node);
+ ret = -EAGAIN;
+ }
+ spin_unlock(&string_tree_lock);
+ if (ret == -EAGAIN)
+ goto retry;
+
+ if (exist) {
+ kfree(cs);
+ cs = exist;
+ }
+
+ return cs;
+}
+EXPORT_SYMBOL(ceph_find_or_create_string);
+
+void ceph_release_string(struct kref *ref)
+{
+ struct ceph_string *cs = container_of(ref, struct ceph_string, kref);
+
+ spin_lock(&string_tree_lock);
+ if (!RB_EMPTY_NODE(&cs->node)) {
+ rb_erase(&cs->node, &string_tree);
+ RB_CLEAR_NODE(&cs->node);
+ }
+ spin_unlock(&string_tree_lock);
+
+ kfree_rcu(cs, rcu);
+}
+EXPORT_SYMBOL(ceph_release_string);
+
+bool ceph_strings_empty(void)
+{
+ return RB_EMPTY_ROOT(&string_tree);
+}
diff --git a/net/core/dev.c b/net/core/dev.c
index 4ce07dc25573..dd6ce598de89 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6045,8 +6045,7 @@ void *netdev_lower_dev_get_private(struct net_device *dev,
EXPORT_SYMBOL(netdev_lower_dev_get_private);
-int dev_get_nest_level(struct net_device *dev,
- bool (*type_check)(const struct net_device *dev))
+int dev_get_nest_level(struct net_device *dev)
{
struct net_device *lower = NULL;
struct list_head *iter;
@@ -6056,15 +6055,12 @@ int dev_get_nest_level(struct net_device *dev,
ASSERT_RTNL();
netdev_for_each_lower_dev(dev, lower, iter) {
- nest = dev_get_nest_level(lower, type_check);
+ nest = dev_get_nest_level(lower);
if (max_nest < nest)
max_nest = nest;
}
- if (type_check(dev))
- max_nest++;
-
- return max_nest;
+ return max_nest + 1;
}
EXPORT_SYMBOL(dev_get_nest_level);
diff --git a/net/core/filter.c b/net/core/filter.c
index 5708999f8a79..cb06aceb512a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1355,56 +1355,47 @@ static inline int bpf_try_make_writable(struct sk_buff *skb,
{
int err;
- if (!skb_cloned(skb))
- return 0;
- if (skb_clone_writable(skb, write_len))
- return 0;
- err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
- if (!err)
- bpf_compute_data_end(skb);
+ err = skb_ensure_writable(skb, write_len);
+ bpf_compute_data_end(skb);
+
return err;
}
+static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
+{
+ if (skb_at_tc_ingress(skb))
+ skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
+}
+
+static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
+{
+ if (skb_at_tc_ingress(skb))
+ skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
+}
+
static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
{
- struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
struct sk_buff *skb = (struct sk_buff *) (long) r1;
- int offset = (int) r2;
+ unsigned int offset = (unsigned int) r2;
void *from = (void *) (long) r3;
unsigned int len = (unsigned int) r4;
void *ptr;
if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
return -EINVAL;
-
- /* bpf verifier guarantees that:
- * 'from' pointer points to bpf program stack
- * 'len' bytes of it were initialized
- * 'len' > 0
- * 'skb' is a valid pointer to 'struct sk_buff'
- *
- * so check for invalid 'offset' and too large 'len'
- */
- if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff)))
+ if (unlikely(offset > 0xffff))
return -EFAULT;
if (unlikely(bpf_try_make_writable(skb, offset + len)))
return -EFAULT;
- ptr = skb_header_pointer(skb, offset, len, sp->buff);
- if (unlikely(!ptr))
- return -EFAULT;
-
+ ptr = skb->data + offset;
if (flags & BPF_F_RECOMPUTE_CSUM)
- skb_postpull_rcsum(skb, ptr, len);
+ __skb_postpull_rcsum(skb, ptr, len, offset);
memcpy(ptr, from, len);
- if (ptr == sp->buff)
- /* skb_store_bits cannot return -EFAULT here */
- skb_store_bits(skb, offset, ptr, len);
-
if (flags & BPF_F_RECOMPUTE_CSUM)
- skb_postpush_rcsum(skb, ptr, len);
+ __skb_postpush_rcsum(skb, ptr, len, offset);
if (flags & BPF_F_INVALIDATE_HASH)
skb_clear_hash(skb);
@@ -1425,12 +1416,12 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1;
- int offset = (int) r2;
+ unsigned int offset = (unsigned int) r2;
void *to = (void *)(unsigned long) r3;
unsigned int len = (unsigned int) r4;
void *ptr;
- if (unlikely((u32) offset > 0xffff))
+ if (unlikely(offset > 0xffff))
goto err_clear;
ptr = skb_header_pointer(skb, offset, len, to);
@@ -1458,20 +1449,17 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
- int offset = (int) r2;
- __sum16 sum, *ptr;
+ unsigned int offset = (unsigned int) r2;
+ __sum16 *ptr;
if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
return -EINVAL;
- if (unlikely((u32) offset > 0xffff))
+ if (unlikely(offset > 0xffff || offset & 1))
return -EFAULT;
- if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum))))
- return -EFAULT;
-
- ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
- if (unlikely(!ptr))
+ if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
return -EFAULT;
+ ptr = (__sum16 *)(skb->data + offset);
switch (flags & BPF_F_HDR_FIELD_MASK) {
case 0:
if (unlikely(from != 0))
@@ -1489,10 +1477,6 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return -EINVAL;
}
- if (ptr == &sum)
- /* skb_store_bits guaranteed to not return -EFAULT here */
- skb_store_bits(skb, offset, ptr, sizeof(sum));
-
return 0;
}
@@ -1512,20 +1496,18 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
struct sk_buff *skb = (struct sk_buff *) (long) r1;
bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
- int offset = (int) r2;
- __sum16 sum, *ptr;
+ unsigned int offset = (unsigned int) r2;
+ __sum16 *ptr;
if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
BPF_F_HDR_FIELD_MASK)))
return -EINVAL;
- if (unlikely((u32) offset > 0xffff))
+ if (unlikely(offset > 0xffff || offset & 1))
return -EFAULT;
- if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum))))
+ if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
return -EFAULT;
- ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
- if (unlikely(!ptr))
- return -EFAULT;
+ ptr = (__sum16 *)(skb->data + offset);
if (is_mmzero && !*ptr)
return 0;
@@ -1548,10 +1530,6 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
if (is_mmzero && !*ptr)
*ptr = CSUM_MANGLED_0;
- if (ptr == &sum)
- /* skb_store_bits guaranteed to not return -EFAULT here */
- skb_store_bits(skb, offset, ptr, sizeof(sum));
-
return 0;
}
@@ -1607,9 +1585,6 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
- if (skb_at_tc_ingress(skb))
- skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
-
return dev_forward_skb(dev, skb);
}
@@ -1648,6 +1623,8 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
if (unlikely(!skb))
return -ENOMEM;
+ bpf_push_mac_rcsum(skb);
+
return flags & BPF_F_INGRESS ?
__bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
}
@@ -1693,6 +1670,8 @@ int skb_do_redirect(struct sk_buff *skb)
return -EINVAL;
}
+ bpf_push_mac_rcsum(skb);
+
return ri->flags & BPF_F_INGRESS ?
__bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
}
@@ -1756,7 +1735,10 @@ static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
vlan_proto != htons(ETH_P_8021AD)))
vlan_proto = htons(ETH_P_8021Q);
+ bpf_push_mac_rcsum(skb);
ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
+ bpf_pull_mac_rcsum(skb);
+
bpf_compute_data_end(skb);
return ret;
}
@@ -1776,7 +1758,10 @@ static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
struct sk_buff *skb = (struct sk_buff *) (long) r1;
int ret;
+ bpf_push_mac_rcsum(skb);
ret = skb_vlan_pop(skb);
+ bpf_pull_mac_rcsum(skb);
+
bpf_compute_data_end(skb);
return ret;
}
@@ -2298,7 +2283,7 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
}
#ifdef CONFIG_SOCK_CGROUP_DATA
-static u64 bpf_skb_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
struct sk_buff *skb = (struct sk_buff *)(long)r1;
struct bpf_map *map = (struct bpf_map *)(long)r2;
@@ -2321,8 +2306,8 @@ static u64 bpf_skb_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp);
}
-static const struct bpf_func_proto bpf_skb_in_cgroup_proto = {
- .func = bpf_skb_in_cgroup,
+static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
+ .func = bpf_skb_under_cgroup,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
@@ -2402,8 +2387,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
#ifdef CONFIG_SOCK_CGROUP_DATA
- case BPF_FUNC_skb_in_cgroup:
- return &bpf_skb_in_cgroup_proto;
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
#endif
default:
return sk_filter_func_proto(func_id);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index d07fc076bea0..e2ffc2a5c7db 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -249,7 +249,7 @@ static inline unsigned long get_index(t_key key, struct key_vector *kv)
* index into the parent's child array. That is, they will be used to find
* 'n' among tp's children.
*
- * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
+ * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
* for the node n.
*
* All the bits we have seen so far are significant to the node n. The rest
@@ -258,7 +258,7 @@ static inline unsigned long get_index(t_key key, struct key_vector *kv)
* The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
* n's child array, and will of course be different for each child.
*
- * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
+ * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown
* at this point.
*/
@@ -2452,9 +2452,7 @@ struct fib_route_iter {
static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
loff_t pos)
{
- struct fib_table *tb = iter->main_tb;
struct key_vector *l, **tp = &iter->tnode;
- struct trie *t;
t_key key;
/* use cache location of next-to-find key */
@@ -2462,8 +2460,6 @@ static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
pos -= iter->pos;
key = iter->key;
} else {
- t = (struct trie *)tb->tb_data;
- iter->tnode = t->kv;
iter->pos = 0;
key = 0;
}
@@ -2504,12 +2500,12 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
return NULL;
iter->main_tb = tb;
+ t = (struct trie *)tb->tb_data;
+ iter->tnode = t->kv;
if (*pos != 0)
return fib_route_get_idx(iter, *pos);
- t = (struct trie *)tb->tb_data;
- iter->tnode = t->kv;
iter->pos = 0;
iter->key = 0;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 5b1481be0282..113cc43df789 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -370,7 +370,6 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
tunnel->parms.o_flags, proto, tunnel->parms.o_key,
htonl(tunnel->o_seqno));
- skb_set_inner_protocol(skb, proto);
ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 9d847c302551..0f227db0e9ac 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -73,9 +73,11 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
skb_dst_set(skb, &rt->dst);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
- if (skb_iif && proto == IPPROTO_UDP) {
- /* Arrived from an ingress interface and got udp encapuslated.
- * The encapsulated network segment length may exceed dst mtu.
+ if (skb_iif && !(df & htons(IP_DF))) {
+ /* Arrived from an ingress interface, got encapsulated, with
+ * fragmentation of encapulating frames allowed.
+ * If skb is gso, the resulting encapsulated network segments
+ * may exceed dst mtu.
* Allow IP Fragmentation of segments.
*/
IPCB(skb)->flags |= IPSKB_FRAG_SEGS;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index a917903d5e97..cc701fa70b12 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -557,6 +557,33 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
.get_link_net = ip_tunnel_get_link_net,
};
+static bool is_vti_tunnel(const struct net_device *dev)
+{
+ return dev->netdev_ops == &vti_netdev_ops;
+}
+
+static int vti_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ if (!is_vti_tunnel(dev))
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_DOWN:
+ if (!net_eq(tunnel->net, dev_net(dev)))
+ xfrm_garbage_collect(tunnel->net);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block vti_notifier_block __read_mostly = {
+ .notifier_call = vti_device_event,
+};
+
static int __init vti_init(void)
{
const char *msg;
@@ -564,6 +591,8 @@ static int __init vti_init(void)
pr_info("IPv4 over IPsec tunneling driver\n");
+ register_netdevice_notifier(&vti_notifier_block);
+
msg = "tunnel device";
err = register_pernet_device(&vti_net_ops);
if (err < 0)
@@ -596,6 +625,7 @@ xfrm_proto_ah_failed:
xfrm_proto_esp_failed:
unregister_pernet_device(&vti_net_ops);
pernet_dev_failed:
+ unregister_netdevice_notifier(&vti_notifier_block);
pr_err("vti init: failed to register %s\n", msg);
return err;
}
@@ -607,6 +637,7 @@ static void __exit vti_fini(void)
xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
unregister_pernet_device(&vti_net_ops);
+ unregister_netdevice_notifier(&vti_notifier_block);
}
module_init(vti_init);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 032a96d78c99..ffbb218de520 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3193,7 +3193,6 @@ int tcp_abort(struct sock *sk, int err)
local_bh_enable();
return 0;
}
- sock_gen_put(sk);
return -EOPNOTSUPP;
}
@@ -3222,7 +3221,6 @@ int tcp_abort(struct sock *sk, int err)
bh_unlock_sock(sk);
local_bh_enable();
release_sock(sk);
- sock_put(sk);
return 0;
}
EXPORT_SYMBOL_GPL(tcp_abort);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 4d610934fb39..a748c74aa8b7 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -54,11 +54,16 @@ static int tcp_diag_destroy(struct sk_buff *in_skb,
{
struct net *net = sock_net(in_skb->sk);
struct sock *sk = inet_diag_find_one_icsk(net, &tcp_hashinfo, req);
+ int err;
if (IS_ERR(sk))
return PTR_ERR(sk);
- return sock_diag_destroy(sk, ECONNABORTED);
+ err = sock_diag_destroy(sk, ECONNABORTED);
+
+ sock_gen_put(sk);
+
+ return err;
}
#endif
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 32b048e524d6..7158d4f8dae4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -814,8 +814,14 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
tcp_sk(sk)->snd_nxt;
+ /* RFC 7323 2.3
+ * The window field (SEG.WND) of every outgoing segment, with the
+ * exception of <SYN> segments, MUST be right-shifted by
+ * Rcv.Wind.Shift bits:
+ */
tcp_v4_send_ack(sock_net(sk), skb, seq,
- tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
+ tcp_rsk(req)->rcv_nxt,
+ req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
tcp_time_stamp,
req->ts_recent,
0,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e61f7cd65d08..5fdcb8d108d4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1182,13 +1182,13 @@ out:
* @sk: socket
*
* Drops all bad checksum frames, until a valid one is found.
- * Returns the length of found skb, or 0 if none is found.
+ * Returns the length of found skb, or -1 if none is found.
*/
-static unsigned int first_packet_length(struct sock *sk)
+static int first_packet_length(struct sock *sk)
{
struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
struct sk_buff *skb;
- unsigned int res;
+ int res;
__skb_queue_head_init(&list_kill);
@@ -1203,7 +1203,7 @@ static unsigned int first_packet_length(struct sock *sk)
__skb_unlink(skb, rcvq);
__skb_queue_tail(&list_kill, skb);
}
- res = skb ? skb->len : 0;
+ res = skb ? skb->len : -1;
spin_unlock_bh(&rcvq->lock);
if (!skb_queue_empty(&list_kill)) {
@@ -1232,7 +1232,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
case SIOCINQ:
{
- unsigned int amount = first_packet_length(sk);
+ int amount = max_t(int, 0, first_packet_length(sk));
return put_user(amount, (int __user *)arg);
}
@@ -2184,7 +2184,7 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
/* Check for false positives due to checksum errors */
if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
- !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk))
+ !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
mask &= ~(POLLIN | POLLRDNORM);
return mask;
@@ -2216,7 +2216,6 @@ struct proto udp_prot = {
.sysctl_wmem = &sysctl_udp_wmem_min,
.sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
.h.udp_table = &udp_table,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 3b3efbda48e1..2eea073e27ef 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -55,7 +55,6 @@ struct proto udplite_prot = {
.unhash = udp_lib_unhash,
.get_port = udp_v4_get_port,
.obj_size = sizeof(struct udp_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
.h.udp_table = &udplite_table,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ab3e796596b1..f418d2eaeddd 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1872,7 +1872,6 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp)
void addrconf_dad_failure(struct inet6_ifaddr *ifp)
{
- struct in6_addr addr;
struct inet6_dev *idev = ifp->idev;
struct net *net = dev_net(ifp->idev->dev);
@@ -1934,18 +1933,6 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp)
in6_ifa_put(ifp2);
lock_errdad:
spin_lock_bh(&ifp->lock);
- } else if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) {
- addr.s6_addr32[0] = htonl(0xfe800000);
- addr.s6_addr32[1] = 0;
-
- if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) &&
- ipv6_addr_equal(&ifp->addr, &addr)) {
- /* DAD failed for link-local based on MAC address */
- idev->cnf.disable_ipv6 = 1;
-
- pr_info("%s: IPv6 being disabled!\n",
- ifp->idev->dev->name);
- }
}
errdad:
@@ -3543,7 +3530,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
/* combine the user config with event to determine if permanent
* addresses are to be removed from address hash table
*/
- keep_addr = !(how || _keep_addr <= 0);
+ keep_addr = !(how || _keep_addr <= 0 || idev->cnf.disable_ipv6);
/* Step 2: clear hash table */
for (i = 0; i < IN6_ADDR_HSIZE; i++) {
@@ -3599,7 +3586,7 @@ restart:
/* re-combine the user config with event to determine if permanent
* addresses are to be removed from the interface list
*/
- keep_addr = (!how && _keep_addr > 0);
+ keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6);
INIT_LIST_HEAD(&del_list);
list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
@@ -3821,6 +3808,7 @@ static void addrconf_dad_work(struct work_struct *w)
dad_work);
struct inet6_dev *idev = ifp->idev;
struct in6_addr mcaddr;
+ bool disable_ipv6 = false;
enum {
DAD_PROCESS,
@@ -3837,6 +3825,24 @@ static void addrconf_dad_work(struct work_struct *w)
} else if (ifp->state == INET6_IFADDR_STATE_ERRDAD) {
action = DAD_ABORT;
ifp->state = INET6_IFADDR_STATE_POSTDAD;
+
+ if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6 &&
+ !(ifp->flags & IFA_F_STABLE_PRIVACY)) {
+ struct in6_addr addr;
+
+ addr.s6_addr32[0] = htonl(0xfe800000);
+ addr.s6_addr32[1] = 0;
+
+ if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) &&
+ ipv6_addr_equal(&ifp->addr, &addr)) {
+ /* DAD failed for link-local based on MAC */
+ idev->cnf.disable_ipv6 = 1;
+
+ pr_info("%s: IPv6 being disabled!\n",
+ ifp->idev->dev->name);
+ disable_ipv6 = true;
+ }
+ }
}
spin_unlock_bh(&ifp->lock);
@@ -3845,6 +3851,8 @@ static void addrconf_dad_work(struct work_struct *w)
goto out;
} else if (action == DAD_ABORT) {
addrconf_dad_stop(ifp, 1);
+ if (disable_ipv6)
+ addrconf_ifdown(idev->dev, 0);
goto out;
}
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index c53b92c617c5..37ac9de713c6 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -952,8 +952,10 @@ calipso_opt_insert(struct ipv6_opt_hdr *hop,
memcpy(new, hop, start);
ret_val = calipso_genopt((unsigned char *)new, start, buf_len, doi_def,
secattr);
- if (ret_val < 0)
+ if (ret_val < 0) {
+ kfree(new);
return ERR_PTR(ret_val);
+ }
buf_len = start + ret_val;
/* At this point buf_len aligns to 4n, so (buf_len & 4) pads to 8n */
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 776d145113e1..704274cbd495 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -519,8 +519,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno));
- skb_set_inner_protocol(skb, protocol);
-
return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu,
NEXTHDR_GRE);
}
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index fed40d1ec29b..0900352c924c 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -55,7 +55,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct icmp6hdr user_icmph;
int addr_type;
struct in6_addr *daddr;
- int iif = 0;
+ int oif = 0;
struct flowi6 fl6;
int err;
struct dst_entry *dst;
@@ -78,25 +78,30 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (u->sin6_family != AF_INET6) {
return -EAFNOSUPPORT;
}
- if (sk->sk_bound_dev_if &&
- sk->sk_bound_dev_if != u->sin6_scope_id) {
- return -EINVAL;
- }
daddr = &(u->sin6_addr);
- iif = u->sin6_scope_id;
+ if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr)))
+ oif = u->sin6_scope_id;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
daddr = &sk->sk_v6_daddr;
}
- if (!iif)
- iif = sk->sk_bound_dev_if;
+ if (!oif)
+ oif = sk->sk_bound_dev_if;
+
+ if (!oif)
+ oif = np->sticky_pktinfo.ipi6_ifindex;
+
+ if (!oif && ipv6_addr_is_multicast(daddr))
+ oif = np->mcast_oif;
+ else if (!oif)
+ oif = np->ucast_oif;
addr_type = ipv6_addr_type(daddr);
- if (__ipv6_addr_needs_scope_id(addr_type) && !iif)
- return -EINVAL;
- if (addr_type & IPV6_ADDR_MAPPED)
+ if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) ||
+ (addr_type & IPV6_ADDR_MAPPED) ||
+ (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if))
return -EINVAL;
/* TODO: use ip6_datagram_send_ctl to get options from cmsg */
@@ -106,16 +111,12 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_proto = IPPROTO_ICMPV6;
fl6.saddr = np->saddr;
fl6.daddr = *daddr;
+ fl6.flowi6_oif = oif;
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_icmp_type = user_icmph.icmp6_type;
fl6.fl6_icmp_code = user_icmph.icmp6_code;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
- fl6.flowi6_oif = np->mcast_oif;
- else if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->ucast_oif;
-
ipc6.tclass = np->tclass;
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 33df8b8575cc..94f4f89d73e7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -944,9 +944,15 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
+ /* RFC 7323 2.3
+ * The window field (SEG.WND) of every outgoing segment, with the
+ * exception of <SYN> segments, MUST be right-shifted by
+ * Rcv.Wind.Shift bits:
+ */
tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
- tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
+ tcp_rsk(req)->rcv_nxt,
+ req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
0, 0);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 81e2f98b958d..19ac3a1c308d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1460,7 +1460,6 @@ struct proto udpv6_prot = {
.sysctl_wmem = &sysctl_udp_wmem_min,
.sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp6_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
.h.udp_table = &udp_table,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udpv6_setsockopt,
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 9cf097e206e9..fd6ef414899b 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -50,7 +50,6 @@ struct proto udplitev6_prot = {
.unhash = udp_lib_unhash,
.get_port = udp_v6_get_port,
.obj_size = sizeof(struct udp6_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
.h.udp_table = &udplite_table,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udpv6_setsockopt,
diff --git a/net/irda/iriap.c b/net/irda/iriap.c
index 4a7ae32afa09..1138eaf5c682 100644
--- a/net/irda/iriap.c
+++ b/net/irda/iriap.c
@@ -185,8 +185,12 @@ struct iriap_cb *iriap_open(__u8 slsap_sel, int mode, void *priv,
self->magic = IAS_MAGIC;
self->mode = mode;
- if (mode == IAS_CLIENT)
- iriap_register_lsap(self, slsap_sel, mode);
+ if (mode == IAS_CLIENT) {
+ if (iriap_register_lsap(self, slsap_sel, mode)) {
+ kfree(self);
+ return NULL;
+ }
+ }
self->confirm = callback;
self->priv = priv;
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index d9560aa2dba3..232cb92033e8 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -856,7 +856,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
error = -ENOTCONN;
if (sk == NULL)
goto end;
- if (sk->sk_state != PPPOX_CONNECTED)
+ if (!(sk->sk_state & PPPOX_CONNECTED))
goto end;
error = -EBADF;
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 9e3693128313..f8dbacf66795 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -574,7 +574,7 @@ static int exp_seq_show(struct seq_file *s, void *v)
helper = rcu_dereference(nfct_help(expect->master)->helper);
if (helper) {
seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name);
- if (helper->expect_policy[expect->class].name)
+ if (helper->expect_policy[expect->class].name[0])
seq_printf(s, "/%s",
helper->expect_policy[expect->class].name);
}
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index bb77a97961bf..5c0db5c64734 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -1473,7 +1473,8 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
"timeout to %u seconds for",
info->timeout);
nf_ct_dump_tuple(&exp->tuple);
- mod_timer(&exp->timeout, jiffies + info->timeout * HZ);
+ mod_timer_pending(&exp->timeout,
+ jiffies + info->timeout * HZ);
}
spin_unlock_bh(&nf_conntrack_expect_lock);
}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 050bb3420a6b..fdfc71f416b7 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1894,6 +1894,8 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
return -EINVAL;
+ if (otuple.dst.protonum != rtuple.dst.protonum)
+ return -EINVAL;
ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple,
&rtuple, u3);
@@ -2362,12 +2364,8 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
return PTR_ERR(exp);
err = nf_ct_expect_related_report(exp, portid, report);
- if (err < 0) {
- nf_ct_expect_put(exp);
- return err;
- }
-
- return 0;
+ nf_ct_expect_put(exp);
+ return err;
}
static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct,
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 8d9db9d4702b..7d77217de6a3 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1383,7 +1383,7 @@ static int process_sip_response(struct sk_buff *skb, unsigned int protoff,
return NF_DROP;
}
cseq = simple_strtoul(*dptr + matchoff, NULL, 10);
- if (!cseq) {
+ if (!cseq && *(*dptr + matchoff) != '0') {
nf_ct_helper_log(skb, ct, "cannot get cseq");
return NF_DROP;
}
@@ -1446,7 +1446,7 @@ static int process_sip_request(struct sk_buff *skb, unsigned int protoff,
return NF_DROP;
}
cseq = simple_strtoul(*dptr + matchoff, NULL, 10);
- if (!cseq) {
+ if (!cseq && *(*dptr + matchoff) != '0') {
nf_ct_helper_log(skb, ct, "cannot get cseq");
return NF_DROP;
}
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 958a1455ca7f..9f267c3ffb39 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -205,6 +205,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
+ struct net *net = seq_file_net(s);
int ret = 0;
NF_CT_ASSERT(ct);
@@ -215,6 +216,9 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (NF_CT_DIRECTION(hash))
goto release;
+ if (!net_eq(nf_ct_net(ct), net))
+ goto release;
+
l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
NF_CT_ASSERT(l3proto);
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 1b4de4bd6958..70eb2f6a3b01 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -326,14 +326,14 @@ static int nfnl_acct_try_del(struct nf_acct *cur)
{
int ret = 0;
- /* we want to avoid races with nfnl_acct_find_get. */
- if (atomic_dec_and_test(&cur->refcnt)) {
+ /* We want to avoid races with nfnl_acct_put. So only when the current
+ * refcnt is 1, we decrease it to 0.
+ */
+ if (atomic_cmpxchg(&cur->refcnt, 1, 0) == 1) {
/* We are protected by nfnl mutex. */
list_del_rcu(&cur->head);
kfree_rcu(cur, rcu_head);
} else {
- /* still in use, restore reference counter. */
- atomic_inc(&cur->refcnt);
ret = -EBUSY;
}
return ret;
@@ -443,7 +443,7 @@ void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct)
}
EXPORT_SYMBOL_GPL(nfnl_acct_update);
-static void nfnl_overquota_report(struct nf_acct *nfacct)
+static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct)
{
int ret;
struct sk_buff *skb;
@@ -458,11 +458,12 @@ static void nfnl_overquota_report(struct nf_acct *nfacct)
kfree_skb(skb);
return;
}
- netlink_broadcast(init_net.nfnl, skb, 0, NFNLGRP_ACCT_QUOTA,
+ netlink_broadcast(net->nfnl, skb, 0, NFNLGRP_ACCT_QUOTA,
GFP_ATOMIC);
}
-int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct)
+int nfnl_acct_overquota(struct net *net, const struct sk_buff *skb,
+ struct nf_acct *nfacct)
{
u64 now;
u64 *quota;
@@ -480,7 +481,7 @@ int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct)
if (now >= *quota &&
!test_and_set_bit(NFACCT_OVERQUOTA_BIT, &nfacct->flags)) {
- nfnl_overquota_report(nfacct);
+ nfnl_overquota_report(net, nfacct);
}
return ret;
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 4cdcd969b64c..68216cdc7083 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -330,16 +330,16 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout)
{
int ret = 0;
- /* we want to avoid races with nf_ct_timeout_find_get. */
- if (atomic_dec_and_test(&timeout->refcnt)) {
+ /* We want to avoid races with ctnl_timeout_put. So only when the
+ * current refcnt is 1, we decrease it to 0.
+ */
+ if (atomic_cmpxchg(&timeout->refcnt, 1, 0) == 1) {
/* We are protected by nfnl mutex. */
list_del_rcu(&timeout->head);
nf_ct_l4proto_put(timeout->l4proto);
ctnl_untimeout(net, timeout);
kfree_rcu(timeout, rcu_head);
} else {
- /* still in use, restore reference counter. */
- atomic_inc(&timeout->refcnt);
ret = -EBUSY;
}
return ret;
@@ -543,7 +543,9 @@ err:
static void ctnl_timeout_put(struct ctnl_timeout *timeout)
{
- atomic_dec(&timeout->refcnt);
+ if (atomic_dec_and_test(&timeout->refcnt))
+ kfree_rcu(timeout, rcu_head);
+
module_put(THIS_MODULE);
}
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
@@ -591,7 +593,9 @@ static void __net_exit cttimeout_net_exit(struct net *net)
list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) {
list_del_rcu(&cur->head);
nf_ct_l4proto_put(cur->l4proto);
- kfree_rcu(cur, rcu_head);
+
+ if (atomic_dec_and_test(&cur->refcnt))
+ kfree_rcu(cur, rcu_head);
}
}
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index cbcfdfb586a6..6577db524ef6 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -1147,6 +1147,7 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG);
MODULE_ALIAS_NF_LOGGER(AF_INET, 1);
MODULE_ALIAS_NF_LOGGER(AF_INET6, 1);
MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1);
+MODULE_ALIAS_NF_LOGGER(3, 1); /* NFPROTO_ARP */
module_init(nfnetlink_log_init);
module_exit(nfnetlink_log_fini);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 5d36a0926b4a..f49f45081acb 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1145,10 +1145,8 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
int err;
- queue = instance_lookup(q, queue_num);
- if (!queue)
- queue = verdict_instance_lookup(q, queue_num,
- NETLINK_CB(skb).portid);
+ queue = verdict_instance_lookup(q, queue_num,
+ NETLINK_CB(skb).portid);
if (IS_ERR(queue))
return PTR_ERR(queue);
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index ba7aed13e174..82c264e40278 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -59,6 +59,7 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_exthdr *priv = nft_expr_priv(expr);
+ u32 offset, len;
if (tb[NFTA_EXTHDR_DREG] == NULL ||
tb[NFTA_EXTHDR_TYPE] == NULL ||
@@ -66,9 +67,15 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
tb[NFTA_EXTHDR_LEN] == NULL)
return -EINVAL;
+ offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET]));
+ len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN]));
+
+ if (offset > U8_MAX || len > U8_MAX)
+ return -ERANGE;
+
priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
- priv->offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET]));
- priv->len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN]));
+ priv->offset = offset;
+ priv->len = len;
priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]);
return nft_validate_register_store(ctx, priv->dreg, NULL,
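
The nft_exthdr hunk reads the 32-bit OFFSET/LEN attributes into local u32 variables and rejects anything above U8_MAX with -ERANGE before storing them into the expression's u8 fields, instead of silently truncating. A hedged userspace sketch of the same validate-then-narrow idea, with plain integers standing in for netlink attributes:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct exthdr_priv {
	uint8_t type;
	uint8_t offset;		/* narrow storage: must be range-checked */
	uint8_t len;
};

/* Validate the wide, attacker-controlled values before narrowing them.
 * Returns 0 on success or a negative errno, mirroring the kernel style. */
static int exthdr_init(struct exthdr_priv *priv, uint8_t type,
		       uint32_t offset, uint32_t len)
{
	if (offset > UINT8_MAX || len > UINT8_MAX)
		return -ERANGE;	/* would have been silently truncated before */

	priv->type = type;
	priv->offset = (uint8_t)offset;
	priv->len = (uint8_t)len;
	return 0;
}

int main(void)
{
	struct exthdr_priv p;

	printf("offset=300 -> %d\n", exthdr_init(&p, 8, 300, 4));	/* -ERANGE */
	printf("offset=40  -> %d\n", exthdr_init(&p, 8, 40, 4));	/* 0 */
	return 0;
}
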
diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c
index 6473936d05c6..ffe9ae062d23 100644
--- a/net/netfilter/nft_rbtree.c
+++ b/net/netfilter/nft_rbtree.c
@@ -70,7 +70,6 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
} else if (d > 0)
parent = parent->rb_right;
else {
-found:
if (!nft_set_elem_active(&rbe->ext, genmask)) {
parent = parent->rb_left;
continue;
@@ -84,9 +83,12 @@ found:
}
}
- if (set->flags & NFT_SET_INTERVAL && interval != NULL) {
- rbe = interval;
- goto found;
+ if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
+ nft_set_elem_active(&interval->ext, genmask) &&
+ !nft_rbtree_interval_end(interval)) {
+ spin_unlock_bh(&nft_rbtree_lock);
+ *ext = &interval->ext;
+ return true;
}
out:
spin_unlock_bh(&nft_rbtree_lock);
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 7f4414d26a66..663c4c3c9072 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -127,6 +127,8 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
daddr, dport,
in->ifindex);
+ if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
/* NOTE: we return listeners even if bound to
* 0.0.0.0, those are filtered out in
* xt_socket, since xt_TPROXY needs 0 bound
@@ -195,6 +197,8 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
daddr, ntohs(dport),
in->ifindex);
+ if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
/* NOTE: we return listeners even if bound to
* 0.0.0.0, those are filtered out in
* xt_socket, since xt_TPROXY needs 0 bound
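
Both TPROXY hunks keep the socket returned by the lockless lookup only if atomic_inc_not_zero() on sk->sk_refcnt succeeds; an object whose count has already reached zero is being freed and must be treated as "not found". A small userspace sketch of that acquire-only-if-live pattern with C11 atomics (illustrative names, not the kernel socket API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct sock_obj {
	atomic_int refcnt;
};

/* Take a reference only if the object is still live (refcnt > 0).
 * Loops on the CAS because another thread may change the count between
 * our load and our attempt to increment it. */
static bool get_if_live(struct sock_obj *sk)
{
	int old = atomic_load(&sk->refcnt);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&sk->refcnt, &old, old + 1))
			return true;	/* reference taken */
		/* old was reloaded by the failed CAS; retry */
	}
	return false;			/* already dying: caller treats as NULL */
}

int main(void)
{
	struct sock_obj live = { .refcnt = 1 };
	struct sock_obj dead = { .refcnt = 0 };

	printf("live: %s\n", get_if_live(&live) ? "got ref" : "skipped");
	printf("dead: %s\n", get_if_live(&dead) ? "got ref" : "skipped");
	return 0;
}
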
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index 3048a7e3a90a..cf327593852a 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -26,7 +26,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
nfnl_acct_update(skb, info->nfacct);
- overquota = nfnl_acct_overquota(skb, info->nfacct);
+ overquota = nfnl_acct_overquota(par->net, skb, info->nfacct);
return overquota == NFACCT_UNDERQUOTA ? false : true;
}
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index c644c78ed485..e054a748ff25 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -433,7 +433,6 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
struct nf_conntrack_l4proto *l4proto;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
- enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
unsigned int dataoff;
u8 protonum;
@@ -458,13 +457,8 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
ct = nf_ct_tuplehash_to_ctrack(h);
- ctinfo = ovs_ct_get_info(h);
- if (ctinfo == IP_CT_NEW) {
- /* This should not happen. */
- WARN_ONCE(1, "ovs_ct_find_existing: new packet for %p\n", ct);
- }
skb->nfct = &ct->ct_general;
- skb->nfctinfo = ctinfo;
+ skb->nfctinfo = ovs_ct_get_info(h);
return ct;
}
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 1a1fcec88695..5aaf3babfc3f 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -93,7 +93,14 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms)
return ERR_CAST(dev);
}
- dev_change_flags(dev, dev->flags | IFF_UP);
+ err = dev_change_flags(dev, dev->flags | IFF_UP);
+ if (err < 0) {
+ rtnl_delete_link(dev);
+ rtnl_unlock();
+ ovs_vport_free(vport);
+ goto error;
+ }
+
rtnl_unlock();
return vport;
error:
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 7f8897f33a67..0e72d95b0e8f 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -54,6 +54,7 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms)
struct net *net = ovs_dp_get_net(parms->dp);
struct net_device *dev;
struct vport *vport;
+ int err;
vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms);
if (IS_ERR(vport))
@@ -67,9 +68,15 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms)
return ERR_CAST(dev);
}
- dev_change_flags(dev, dev->flags | IFF_UP);
- rtnl_unlock();
+ err = dev_change_flags(dev, dev->flags | IFF_UP);
+ if (err < 0) {
+ rtnl_delete_link(dev);
+ rtnl_unlock();
+ ovs_vport_free(vport);
+ return ERR_PTR(err);
+ }
+ rtnl_unlock();
return vport;
}
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 434e04c3a189..95c36147a6e1 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -140,7 +140,7 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
{
- dev->needed_headroom = new_hr;
+ dev->needed_headroom = new_hr < 0 ? 0 : new_hr;
}
static const struct net_device_ops internal_dev_netdev_ops = {
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 5eb7694348b5..7eb955e453e6 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -130,7 +130,14 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
return ERR_CAST(dev);
}
- dev_change_flags(dev, dev->flags | IFF_UP);
+ err = dev_change_flags(dev, dev->flags | IFF_UP);
+ if (err < 0) {
+ rtnl_delete_link(dev);
+ rtnl_unlock();
+ ovs_vport_free(vport);
+ goto error;
+ }
+
rtnl_unlock();
return vport;
error:
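
The geneve, GRE and vxlan vport hunks all stop ignoring the return value of dev_change_flags(): if bringing the freshly created device up fails, the link is deleted under RTNL, the lock is dropped, and the vport is freed, so nothing leaks on the error path. A hedged sketch of that create-then-unwind shape with stand-in functions (none of these are real kernel calls):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct vport { int id; };
struct netdev { int flags; };

/* Stand-ins for the primitives used in the hunks above. */
static struct netdev *create_link(void)   { return calloc(1, sizeof(struct netdev)); }
static void delete_link(struct netdev *d) { free(d); }
static int  bring_up(struct netdev *d)    { (void)d; return -ENODEV; /* simulate failure */ }
static void vport_free(struct vport *v)   { free(v); }

static struct vport *tnl_create(void)
{
	struct vport *vport = calloc(1, sizeof(*vport));
	struct netdev *dev;
	int err;

	if (!vport)
		return NULL;

	dev = create_link();
	if (!dev) {
		vport_free(vport);
		return NULL;
	}

	err = bring_up(dev);		/* previously the result was ignored */
	if (err < 0) {
		delete_link(dev);	/* undo everything done so far, in reverse */
		vport_free(vport);
		errno = -err;
		return NULL;
	}
	return vport;
}

int main(void)
{
	struct vport *v = tnl_create();

	printf("create: %s (errno=%d)\n", v ? "ok" : "failed", v ? 0 : errno);
	free(v);
	return 0;
}
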
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 1bb9e7ac9e14..ff83fb1ddd47 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -425,6 +425,7 @@ struct rxrpc_call {
spinlock_t lock;
rwlock_t state_lock; /* lock for state transition */
atomic_t usage;
+ atomic_t skb_count; /* Outstanding packets on this call */
atomic_t sequence; /* Tx data packet sequence counter */
u32 local_abort; /* local abort code */
u32 remote_abort; /* remote abort code */
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 0b2832141bd0..9bae21e66d65 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -130,6 +130,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local,
call->state = RXRPC_CALL_SERVER_ACCEPTING;
list_add_tail(&call->accept_link, &rx->acceptq);
rxrpc_get_call(call);
+ atomic_inc(&call->skb_count);
nsp = rxrpc_skb(notification);
nsp->call = call;
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index fc32aa5764a2..e60cf65c2232 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -460,6 +460,7 @@ static void rxrpc_insert_oos_packet(struct rxrpc_call *call,
ASSERTCMP(sp->call, ==, NULL);
sp->call = call;
rxrpc_get_call(call);
+ atomic_inc(&call->skb_count);
/* insert into the buffer in sequence order */
spin_lock_bh(&call->lock);
@@ -734,6 +735,7 @@ all_acked:
skb->mark = RXRPC_SKB_MARK_FINAL_ACK;
sp->call = call;
rxrpc_get_call(call);
+ atomic_inc(&call->skb_count);
spin_lock_bh(&call->lock);
if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0)
BUG();
@@ -793,6 +795,7 @@ static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error,
sp->error = error;
sp->call = call;
rxrpc_get_call(call);
+ atomic_inc(&call->skb_count);
spin_lock_bh(&call->lock);
ret = rxrpc_queue_rcv_skb(call, skb, true, fatal);
@@ -834,6 +837,9 @@ void rxrpc_process_call(struct work_struct *work)
return;
}
+ if (!call->conn)
+ goto skip_msg_init;
+
/* there's a good chance we're going to have to send a message, so set
* one up in advance */
msg.msg_name = &call->conn->params.peer->srx.transport;
@@ -856,6 +862,7 @@ void rxrpc_process_call(struct work_struct *work)
memset(iov, 0, sizeof(iov));
iov[0].iov_base = &whdr;
iov[0].iov_len = sizeof(whdr);
+skip_msg_init:
/* deal with events of a final nature */
if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) {
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 91287c9d01bb..ae057e0740f3 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -275,6 +275,7 @@ error:
list_del_init(&call->link);
write_unlock_bh(&rxrpc_call_lock);
+ set_bit(RXRPC_CALL_RELEASED, &call->flags);
call->state = RXRPC_CALL_DEAD;
rxrpc_put_call(call);
_leave(" = %d", ret);
@@ -287,6 +288,7 @@ error:
*/
found_user_ID_now_present:
write_unlock(&rx->call_lock);
+ set_bit(RXRPC_CALL_RELEASED, &call->flags);
call->state = RXRPC_CALL_DEAD;
rxrpc_put_call(call);
_leave(" = -EEXIST [%p]", call);
@@ -491,15 +493,9 @@ void rxrpc_release_call(struct rxrpc_call *call)
spin_lock_bh(&call->lock);
while ((skb = skb_dequeue(&call->rx_queue)) ||
(skb = skb_dequeue(&call->rx_oos_queue))) {
- sp = rxrpc_skb(skb);
- if (sp->call) {
- ASSERTCMP(sp->call, ==, call);
- rxrpc_put_call(call);
- sp->call = NULL;
- }
- skb->destructor = NULL;
spin_unlock_bh(&call->lock);
+ sp = rxrpc_skb(skb);
_debug("- zap %s %%%u #%u",
rxrpc_pkts[sp->hdr.type],
sp->hdr.serial, sp->hdr.seq);
@@ -605,6 +601,7 @@ void __rxrpc_put_call(struct rxrpc_call *call)
if (atomic_dec_and_test(&call->usage)) {
_debug("call %d dead", call->debug_id);
+ WARN_ON(atomic_read(&call->skb_count) != 0);
ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
rxrpc_queue_work(&call->destroyer);
}
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 991a20d25093..70bb77818dea 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -55,9 +55,6 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb,
if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) {
_debug("already terminated");
ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE);
- skb->destructor = NULL;
- sp->call = NULL;
- rxrpc_put_call(call);
rxrpc_free_skb(skb);
return 0;
}
@@ -111,13 +108,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb,
ret = 0;
out:
- /* release the socket buffer */
- if (skb) {
- skb->destructor = NULL;
- sp->call = NULL;
- rxrpc_put_call(call);
- rxrpc_free_skb(skb);
- }
+ rxrpc_free_skb(skb);
_leave(" = %d", ret);
return ret;
@@ -133,11 +124,15 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call,
struct rxrpc_skb_priv *sp;
bool terminal;
int ret, ackbit, ack;
+ u32 serial;
+ u8 flags;
_enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq);
sp = rxrpc_skb(skb);
ASSERTCMP(sp->call, ==, NULL);
+ flags = sp->hdr.flags;
+ serial = sp->hdr.serial;
spin_lock(&call->lock);
@@ -200,8 +195,9 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call,
sp->call = call;
rxrpc_get_call(call);
- terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) &&
- !(sp->hdr.flags & RXRPC_CLIENT_INITIATED));
+ atomic_inc(&call->skb_count);
+ terminal = ((flags & RXRPC_LAST_PACKET) &&
+ !(flags & RXRPC_CLIENT_INITIATED));
ret = rxrpc_queue_rcv_skb(call, skb, false, terminal);
if (ret < 0) {
if (ret == -ENOMEM || ret == -ENOBUFS) {
@@ -213,12 +209,13 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call,
}
skb = NULL;
+ sp = NULL;
_debug("post #%u", seq);
ASSERTCMP(call->rx_data_post, ==, seq);
call->rx_data_post++;
- if (sp->hdr.flags & RXRPC_LAST_PACKET)
+ if (flags & RXRPC_LAST_PACKET)
set_bit(RXRPC_CALL_RCVD_LAST, &call->flags);
/* if we've reached an out of sequence packet then we need to drain
@@ -234,7 +231,7 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call,
spin_unlock(&call->lock);
atomic_inc(&call->ackr_not_idle);
- rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, sp->hdr.serial, false);
+ rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, false);
_leave(" = 0 [posted]");
return 0;
@@ -247,7 +244,7 @@ out:
discard_and_ack:
_debug("discard and ACK packet %p", skb);
- __rxrpc_propose_ACK(call, ack, sp->hdr.serial, true);
+ __rxrpc_propose_ACK(call, ack, serial, true);
discard:
spin_unlock(&call->lock);
rxrpc_free_skb(skb);
@@ -255,7 +252,7 @@ discard:
return 0;
enqueue_and_ack:
- __rxrpc_propose_ACK(call, ack, sp->hdr.serial, true);
+ __rxrpc_propose_ACK(call, ack, serial, true);
enqueue_packet:
_net("defer skb %p", skb);
spin_unlock(&call->lock);
@@ -575,13 +572,13 @@ done:
* post connection-level events to the connection
* - this includes challenges, responses and some aborts
*/
-static bool rxrpc_post_packet_to_conn(struct rxrpc_connection *conn,
+static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn,
struct sk_buff *skb)
{
_enter("%p,%p", conn, skb);
skb_queue_tail(&conn->rx_queue, skb);
- return rxrpc_queue_conn(conn);
+ rxrpc_queue_conn(conn);
}
/*
@@ -702,7 +699,6 @@ void rxrpc_data_ready(struct sock *sk)
rcu_read_lock();
-retry_find_conn:
conn = rxrpc_find_connection_rcu(local, skb);
if (!conn)
goto cant_route_call;
@@ -710,8 +706,7 @@ retry_find_conn:
if (sp->hdr.callNumber == 0) {
/* Connection-level packet */
_debug("CONN %p {%d}", conn, conn->debug_id);
- if (!rxrpc_post_packet_to_conn(conn, skb))
- goto retry_find_conn;
+ rxrpc_post_packet_to_conn(conn, skb);
} else {
/* Call-bound packets are routed by connection channel. */
unsigned int channel = sp->hdr.cid & RXRPC_CHANNELMASK;
@@ -749,6 +744,8 @@ cant_route_call:
if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) {
_debug("reject type %d",sp->hdr.type);
rxrpc_reject_packet(local, skb);
+ } else {
+ rxrpc_free_skb(skb);
}
_leave(" [no call]");
return;
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index a3fa2ed85d63..9ed66d533002 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -203,6 +203,9 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
}
/* we transferred the whole data packet */
+ if (!(flags & MSG_PEEK))
+ rxrpc_kernel_data_consumed(call, skb);
+
if (sp->hdr.flags & RXRPC_LAST_PACKET) {
_debug("last");
if (rxrpc_conn_is_client(call->conn)) {
@@ -360,28 +363,6 @@ wait_error:
}
/**
- * rxrpc_kernel_data_delivered - Record delivery of data message
- * @skb: Message holding data
- *
- * Record the delivery of a data message. This permits RxRPC to keep its
- * tracking correct. The socket buffer will be deleted.
- */
-void rxrpc_kernel_data_delivered(struct sk_buff *skb)
-{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct rxrpc_call *call = sp->call;
-
- ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
- ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
- call->rx_data_recv = sp->hdr.seq;
-
- ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
- rxrpc_free_skb(skb);
-}
-
-EXPORT_SYMBOL(rxrpc_kernel_data_delivered);
-
-/**
* rxrpc_kernel_is_data_last - Determine if data message is last one
* @skb: Message holding data
*
diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c
index eee0cfd9ac8c..06c51d4b622d 100644
--- a/net/rxrpc/skbuff.c
+++ b/net/rxrpc/skbuff.c
@@ -98,11 +98,39 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call,
spin_unlock_bh(&call->lock);
}
+/**
+ * rxrpc_kernel_data_consumed - Record consumption of data message
+ * @call: The call to which the message pertains.
+ * @skb: Message holding data
+ *
+ * Record the consumption of a data message and generate an ACK if appropriate.
+ * The call state is shifted if this was the final packet. The caller must be
+ * in process context with no spinlocks held.
+ *
+ * TODO: Actually generate the ACK here rather than punting this to the
+ * workqueue.
+ */
+void rxrpc_kernel_data_consumed(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ _enter("%d,%p{%u}", call->debug_id, skb, sp->hdr.seq);
+
+ ASSERTCMP(sp->call, ==, call);
+ ASSERTCMP(sp->hdr.type, ==, RXRPC_PACKET_TYPE_DATA);
+
+ /* TODO: Fix the sequence number tracking */
+ ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
+ ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
+ ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
+
+ call->rx_data_recv = sp->hdr.seq;
+ rxrpc_hard_ACK_data(call, sp);
+}
+EXPORT_SYMBOL(rxrpc_kernel_data_consumed);
+
/*
- * destroy a packet that has an RxRPC control buffer
- * - advance the hard-ACK state of the parent call (done here in case something
- * in the kernel bypasses recvmsg() and steals the packet directly off of the
- * socket receive queue)
+ * Destroy a packet that has an RxRPC control buffer
*/
void rxrpc_packet_destructor(struct sk_buff *skb)
{
@@ -112,9 +140,8 @@ void rxrpc_packet_destructor(struct sk_buff *skb)
_enter("%p{%p}", skb, call);
if (call) {
- /* send the final ACK on a client call */
- if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA)
- rxrpc_hard_ACK_data(call, sp);
+ if (atomic_dec_return(&call->skb_count) < 0)
+ BUG();
rxrpc_put_call(call);
sp->call = NULL;
}
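
The rxrpc changes pair every rxrpc_get_call() taken on behalf of a queued packet with an atomic_inc(&call->skb_count), and the packet destructor decrements that count (BUGging on underflow) before dropping the call reference, so leaked queue buffers show up as a non-zero skb_count when the call dies. A tiny userspace sketch of that paired-counter idea (illustrative names only, not the rxrpc API):

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

struct call {
	atomic_int usage;	/* references on the call itself */
	atomic_int skb_count;	/* packets currently charged to the call */
};

/* Charge a queued packet to the call: one call ref plus one skb count. */
static void charge_skb(struct call *c)
{
	atomic_fetch_add(&c->usage, 1);
	atomic_fetch_add(&c->skb_count, 1);
}

/* Destructor side: release the charge, catching double-frees early. */
static void release_skb(struct call *c)
{
	assert(atomic_fetch_sub(&c->skb_count, 1) > 0);	/* kernel uses BUG() */
	atomic_fetch_sub(&c->usage, 1);
}

int main(void)
{
	struct call c = { .usage = 1, .skb_count = 0 };

	charge_skb(&c);
	charge_skb(&c);
	release_skb(&c);
	release_skb(&c);
	/* at teardown the count must be back at zero, as the WARN_ON checks */
	printf("outstanding skbs: %d\n", atomic_load(&c.skb_count));
	return 0;
}
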
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index e4a5f2607ffa..d09d0687594b 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -64,7 +64,6 @@ int __tcf_hash_release(struct tc_action *p, bool bind, bool strict)
if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
if (p->ops->cleanup)
p->ops->cleanup(p, bind);
- list_del(&p->list);
tcf_hash_destroy(p->hinfo, p);
ret = ACT_P_DELETED;
}
@@ -421,18 +420,19 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)
return res;
}
-int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,
- struct tcf_result *res)
+int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
+ int nr_actions, struct tcf_result *res)
{
- const struct tc_action *a;
- int ret = -1;
+ int ret = -1, i;
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
ret = TC_ACT_OK;
goto exec_done;
}
- list_for_each_entry(a, actions, list) {
+ for (i = 0; i < nr_actions; i++) {
+ const struct tc_action *a = actions[i];
+
repeat:
ret = a->ops->act(skb, a, res);
if (ret == TC_ACT_REPEAT)
@@ -754,16 +754,6 @@ err_out:
return ERR_PTR(err);
}
-static void cleanup_a(struct list_head *actions)
-{
- struct tc_action *a, *tmp;
-
- list_for_each_entry_safe(a, tmp, actions, list) {
- list_del(&a->list);
- kfree(a);
- }
-}
-
static int tca_action_flush(struct net *net, struct nlattr *nla,
struct nlmsghdr *n, u32 portid)
{
@@ -905,7 +895,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
return ret;
}
err:
- cleanup_a(&actions);
+ tcf_action_destroy(&actions, 0);
return ret;
}
@@ -942,15 +932,9 @@ tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions);
if (ret)
- goto done;
+ return ret;
- /* dump then free all the actions after update; inserted policy
- * stays intact
- */
- ret = tcf_add_notify(net, n, &actions, portid);
- cleanup_a(&actions);
-done:
- return ret;
+ return tcf_add_notify(net, n, &actions, portid);
}
static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n)
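
tcf_action_exec() now walks a fixed array of tc_action pointers with an explicit count instead of a linked list, which is what allows the classifier extensions to drop their per-action list_head. A compact sketch of the same dispatch shape over an array of function-pointer "actions" (hypothetical types, not the kernel ones):

#include <stdio.h>

#define ACT_OK      0
#define ACT_SHOT    2

struct action {
	const char *name;
	int (*act)(const struct action *a);	/* returns an ACT_* verdict */
};

static int act_log(const struct action *a)  { printf("log via %s\n", a->name); return ACT_OK; }
static int act_drop(const struct action *a) { printf("drop via %s\n", a->name); return ACT_SHOT; }

/* Walk the array in order; a terminating verdict (here ACT_SHOT) stops
 * the chain, mirroring how tcf_action_exec() breaks out of its loop. */
static int exec_actions(struct action **actions, int nr_actions)
{
	int ret = ACT_OK;

	for (int i = 0; i < nr_actions; i++) {
		ret = actions[i]->act(actions[i]);
		if (ret != ACT_OK)
			break;
	}
	return ret;
}

int main(void)
{
	struct action log = { "logger", act_log };
	struct action drop = { "policer", act_drop };
	struct action *chain[] = { &log, &drop };

	printf("verdict = %d\n", exec_actions(chain, 2));
	return 0;
}
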
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 141a06eeb1e5..e87cd81315e1 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -53,7 +53,7 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
u32 *tlv = (u32 *)(skbdata);
u16 totlen = nla_total_size(dlen); /*alignment + hdr */
char *dptr = (char *)tlv + NLA_HDRLEN;
- u32 htlv = attrtype << 16 | totlen;
+ u32 htlv = attrtype << 16 | dlen;
*tlv = htonl(htlv);
memset(dptr, 0, totlen - NLA_HDRLEN);
@@ -135,7 +135,7 @@ EXPORT_SYMBOL_GPL(ife_release_meta_gen);
int ife_validate_meta_u32(void *val, int len)
{
- if (len == 4)
+ if (len == sizeof(u32))
return 0;
return -EINVAL;
@@ -144,8 +144,8 @@ EXPORT_SYMBOL_GPL(ife_validate_meta_u32);
int ife_validate_meta_u16(void *val, int len)
{
- /* length will include padding */
- if (len == NLA_ALIGN(2))
+ /* length will not include padding */
+ if (len == sizeof(u16))
return 0;
return -EINVAL;
@@ -652,12 +652,14 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
u8 *tlvdata = (u8 *)tlv;
u16 mtype = tlv->type;
u16 mlen = tlv->len;
+ u16 alen;
mtype = ntohs(mtype);
mlen = ntohs(mlen);
+ alen = NLA_ALIGN(mlen);
- if (find_decode_metaid(skb, ife, mtype, (mlen - 4),
- (void *)(tlvdata + 4))) {
+ if (find_decode_metaid(skb, ife, mtype, (mlen - NLA_HDRLEN),
+ (void *)(tlvdata + NLA_HDRLEN))) {
/* abuse overlimits to count when we receive metadata
* but dont have an ops for it
*/
@@ -666,8 +668,8 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
ife->tcf_qstats.overlimits++;
}
- tlvdata += mlen;
- ifehdrln -= mlen;
+ tlvdata += alen;
+ ifehdrln -= alen;
tlv = (struct meta_tlvhdr *)tlvdata;
}
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index b3c7e975fc9e..8a3be1d99775 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -63,49 +63,8 @@ static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
const struct tc_action_ops *ops)
{
struct tc_action_net *tn = net_generic(net, police_net_id);
- struct tcf_hashinfo *hinfo = tn->hinfo;
- int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
- struct nlattr *nest;
-
- spin_lock_bh(&hinfo->lock);
-
- s_i = cb->args[0];
-
- for (i = 0; i < (POL_TAB_MASK + 1); i++) {
- struct hlist_head *head;
- struct tc_action *p;
-
- head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)];
-
- hlist_for_each_entry_rcu(p, head, tcfa_head) {
- index++;
- if (index < s_i)
- continue;
- nest = nla_nest_start(skb, index);
- if (nest == NULL)
- goto nla_put_failure;
- if (type == RTM_DELACTION)
- err = tcf_action_dump_1(skb, p, 0, 1);
- else
- err = tcf_action_dump_1(skb, p, 0, 0);
- if (err < 0) {
- index--;
- nla_nest_cancel(skb, nest);
- goto done;
- }
- nla_nest_end(skb, nest);
- n_i++;
- }
- }
-done:
- spin_unlock_bh(&hinfo->lock);
- if (n_i)
- cb->args[0] += n_i;
- return n_i;
-nla_put_failure:
- nla_nest_cancel(skb, nest);
- goto done;
+ return tcf_generic_walker(tn, skb, cb, type, ops);
}
static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
@@ -125,6 +84,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
struct tcf_police *police;
struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
struct tc_action_net *tn = net_generic(net, police_net_id);
+ bool exists = false;
int size;
if (nla == NULL)
@@ -139,24 +99,24 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
size = nla_len(tb[TCA_POLICE_TBF]);
if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat))
return -EINVAL;
+
parm = nla_data(tb[TCA_POLICE_TBF]);
+ exists = tcf_hash_check(tn, parm->index, a, bind);
+ if (exists && bind)
+ return 0;
- if (parm->index) {
- if (tcf_hash_check(tn, parm->index, a, bind)) {
- if (ovr)
- goto override;
- /* not replacing */
- return -EEXIST;
- }
- } else {
+ if (!exists) {
ret = tcf_hash_create(tn, parm->index, NULL, a,
&act_police_ops, bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
+ } else {
+ tcf_hash_release(*a, bind);
+ if (!ovr)
+ return -EEXIST;
}
-override:
police = to_police(*a);
if (parm->rate.rate) {
err = -ENOMEM;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 843a716a4303..a7c5645373af 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -541,8 +541,12 @@ out:
void tcf_exts_destroy(struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
- tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND);
- INIT_LIST_HEAD(&exts->actions);
+ LIST_HEAD(actions);
+
+ tcf_exts_to_list(exts, &actions);
+ tcf_action_destroy(&actions, TCA_ACT_UNBIND);
+ kfree(exts->actions);
+ exts->nr_actions = 0;
#endif
}
EXPORT_SYMBOL(tcf_exts_destroy);
@@ -554,7 +558,6 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
{
struct tc_action *act;
- INIT_LIST_HEAD(&exts->actions);
if (exts->police && tb[exts->police]) {
act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
"police", ovr,
@@ -563,14 +566,20 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
return PTR_ERR(act);
act->type = exts->type = TCA_OLD_COMPAT;
- list_add(&act->list, &exts->actions);
+ exts->actions[0] = act;
+ exts->nr_actions = 1;
} else if (exts->action && tb[exts->action]) {
- int err;
+ LIST_HEAD(actions);
+ int err, i = 0;
+
err = tcf_action_init(net, tb[exts->action], rate_tlv,
NULL, ovr,
- TCA_ACT_BIND, &exts->actions);
+ TCA_ACT_BIND, &actions);
if (err)
return err;
+ list_for_each_entry(act, &actions, list)
+ exts->actions[i++] = act;
+ exts->nr_actions = i;
}
}
#else
@@ -587,37 +596,49 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
struct tcf_exts *src)
{
#ifdef CONFIG_NET_CLS_ACT
- LIST_HEAD(tmp);
+ struct tcf_exts old = *dst;
+
tcf_tree_lock(tp);
- list_splice_init(&dst->actions, &tmp);
- list_splice(&src->actions, &dst->actions);
+ dst->nr_actions = src->nr_actions;
+ dst->actions = src->actions;
dst->type = src->type;
tcf_tree_unlock(tp);
- tcf_action_destroy(&tmp, TCA_ACT_UNBIND);
+
+ tcf_exts_destroy(&old);
#endif
}
EXPORT_SYMBOL(tcf_exts_change);
-#define tcf_exts_first_act(ext) \
- list_first_entry_or_null(&(exts)->actions, \
- struct tc_action, list)
+#ifdef CONFIG_NET_CLS_ACT
+static struct tc_action *tcf_exts_first_act(struct tcf_exts *exts)
+{
+ if (exts->nr_actions == 0)
+ return NULL;
+ else
+ return exts->actions[0];
+}
+#endif
int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
struct nlattr *nest;
- if (exts->action && !list_empty(&exts->actions)) {
+ if (exts->action && exts->nr_actions) {
/*
* again for backward compatible mode - we want
* to work with both old and new modes of entering
* tc data even if iproute2 was newer - jhs
*/
if (exts->type != TCA_OLD_COMPAT) {
+ LIST_HEAD(actions);
+
nest = nla_nest_start(skb, exts->action);
if (nest == NULL)
goto nla_put_failure;
- if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0)
+
+ tcf_exts_to_list(exts, &actions);
+ if (tcf_action_dump(skb, &actions, 0, 0) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
} else if (exts->police) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index e95b67cd5718..657c13362b19 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -643,18 +643,19 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
struct Qdisc *sch;
if (!try_module_get(ops->owner))
- goto errout;
+ return NULL;
sch = qdisc_alloc(dev_queue, ops);
- if (IS_ERR(sch))
- goto errout;
+ if (IS_ERR(sch)) {
+ module_put(ops->owner);
+ return NULL;
+ }
sch->parent = parentid;
if (!ops->init || ops->init(sch, NULL) == 0)
return sch;
qdisc_destroy(sch);
-errout:
return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);
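
The qdisc_create_dflt() fix ensures that when qdisc_alloc() fails, the module reference taken by try_module_get() is dropped again before returning NULL; previously the shared errout label skipped the module_put(). A minimal sketch of that acquire/undo pairing with stand-in functions (not the real module API):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static int module_refs;

static bool try_module_get(void) { module_refs++; return true; }
static void module_put(void)     { module_refs--; }

/* Simulated allocator that can fail on demand. */
static void *qdisc_alloc(bool fail) { return fail ? NULL : malloc(64); }

static void *qdisc_create_dflt(bool alloc_fails)
{
	void *sch;

	if (!try_module_get())
		return NULL;

	sch = qdisc_alloc(alloc_fails);
	if (!sch) {
		module_put();	/* undo the reference taken above */
		return NULL;
	}
	return sch;
}

int main(void)
{
	free(qdisc_create_dflt(true));	/* failure path: ref must be dropped */
	free(qdisc_create_dflt(false));	/* success path: ref stays held */
	printf("module refs still held: %d\n", module_refs);	/* expect 1 */
	return 0;
}
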
diff --git a/net/sctp/input.c b/net/sctp/input.c
index c182db7d691f..69444d32ecda 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -119,7 +119,13 @@ int sctp_rcv(struct sk_buff *skb)
skb_transport_offset(skb))
goto discard_it;
- if (!pskb_may_pull(skb, sizeof(struct sctphdr)))
+ /* If the packet is fragmented and we need to do crc checking,
+ * it's better to just linearize it otherwise crc computing
+ * takes longer.
+ */
+ if ((!(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) &&
+ skb_linearize(skb)) ||
+ !pskb_may_pull(skb, sizeof(struct sctphdr)))
goto discard_it;
/* Pull up the IP header. */
@@ -1177,9 +1183,6 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net,
if ((skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP)
return NULL;
- if (skb_linearize(skb))
- return NULL;
-
ch = (sctp_chunkhdr_t *) skb->data;
/* The code below will attempt to walk the chunk and extract
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index c30ddb0f3190..6437aa97cfd7 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -170,19 +170,6 @@ next_chunk:
chunk = list_entry(entry, struct sctp_chunk, list);
- /* Linearize if it's not GSO */
- if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) != SKB_GSO_SCTP &&
- skb_is_nonlinear(chunk->skb)) {
- if (skb_linearize(chunk->skb)) {
- __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS);
- sctp_chunk_free(chunk);
- goto next_chunk;
- }
-
- /* Update sctp_hdr as it probably changed */
- chunk->sctp_hdr = sctp_hdr(chunk->skb);
- }
-
if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP) {
/* GSO-marked skbs but without frags, handle
* them normally
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 4cb5aedfe3ee..ef8ba77a5bea 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -293,6 +293,7 @@ static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
return ERR_PTR(err);
}
+ iter->start_fail = 0;
return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
}
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index f69edcf219e5..f3508aa75815 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -13,6 +13,7 @@ static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r,
{
union sctp_addr laddr, paddr;
struct dst_entry *dst;
+ struct timer_list *t3_rtx = &asoc->peer.primary_path->T3_rtx_timer;
laddr = list_entry(asoc->base.bind_addr.address_list.next,
struct sctp_sockaddr_entry, list)->a;
@@ -40,10 +41,15 @@ static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r,
}
r->idiag_state = asoc->state;
- r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX;
- r->idiag_retrans = asoc->rtx_data_chunks;
- r->idiag_expires = jiffies_to_msecs(
- asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] - jiffies);
+ if (timer_pending(t3_rtx)) {
+ r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX;
+ r->idiag_retrans = asoc->rtx_data_chunks;
+ r->idiag_expires = jiffies_to_msecs(t3_rtx->expires - jiffies);
+ } else {
+ r->idiag_timer = 0;
+ r->idiag_retrans = 0;
+ r->idiag_expires = 0;
+ }
}
static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb,
@@ -350,7 +356,7 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
if (cb->args[4] < cb->args[1])
goto next;
- if ((r->idiag_states & ~TCPF_LISTEN) && !list_empty(&ep->asocs))
+ if (!(r->idiag_states & TCPF_LISTEN) && !list_empty(&ep->asocs))
goto next;
if (r->sdiag_family != AF_UNSPEC &&
@@ -418,11 +424,13 @@ static int sctp_diag_dump_one(struct sk_buff *in_skb,
paddr.v4.sin_family = AF_INET;
} else {
laddr.v6.sin6_port = req->id.idiag_sport;
- memcpy(&laddr.v6.sin6_addr, req->id.idiag_src, 64);
+ memcpy(&laddr.v6.sin6_addr, req->id.idiag_src,
+ sizeof(laddr.v6.sin6_addr));
laddr.v6.sin6_family = AF_INET6;
paddr.v6.sin6_port = req->id.idiag_dport;
- memcpy(&paddr.v6.sin6_addr, req->id.idiag_dst, 64);
+ memcpy(&paddr.v6.sin6_addr, req->id.idiag_dst,
+ sizeof(paddr.v6.sin6_addr));
paddr.v6.sin6_family = AF_INET6;
}
@@ -465,7 +473,7 @@ skip:
* 3 : to mark if we have dumped the ep info of the current asoc
* 4 : to work as a temporary variable to traversal list
*/
- if (!(idiag_states & ~TCPF_LISTEN))
+ if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE)))
goto done;
sctp_for_each_transport(sctp_tsp_dump, net, cb->args[2], &commp);
done:
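
The sctp_diag hunk only reports a T3-RTX timer to userspace when timer_pending() says the timer is actually armed, and computes the remaining time from the timer's own expires value; otherwise it reports zeros instead of a bogus "expires - jiffies" on an idle association. A hedged sketch of that "report only if armed" logic, with plain integers standing in for jiffies:

#include <stdbool.h>
#include <stdio.h>

struct timer {
	bool pending;
	unsigned long expires;	/* absolute time, same clock as "now" */
};

struct diag_msg {
	int timer_id;
	unsigned long expires_ms;
};

/* Fill the diag fields only from a live timer; otherwise report zeros
 * rather than an underflowed "expires - now". The extra expires > now
 * check just guards the subtraction in this toy. */
static void fill_timer(struct diag_msg *r, const struct timer *t,
		       unsigned long now)
{
	if (t->pending && t->expires > now) {
		r->timer_id = 3;			 /* T3-RTX, per the hunk */
		r->expires_ms = (t->expires - now) * 10; /* pretend 1 tick = 10ms */
	} else {
		r->timer_id = 0;
		r->expires_ms = 0;
	}
}

int main(void)
{
	struct timer armed = { true, 1500 }, idle = { false, 0 };
	struct diag_msg r;

	fill_timer(&r, &armed, 1000);
	printf("armed: timer=%d expires=%lums\n", r.timer_id, r.expires_ms);
	fill_timer(&r, &idle, 1000);
	printf("idle:  timer=%d expires=%lums\n", r.timer_id, r.expires_ms);
	return 0;
}
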
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 1bc4f71aaba8..d85b803da11d 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -702,14 +702,14 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
*/
sctp_ulpevent_init(event, 0, skb->len + sizeof(struct sk_buff));
- sctp_ulpevent_receive_data(event, asoc);
-
/* And hold the chunk as we need it for getting the IP headers
* later in recvmsg
*/
sctp_chunk_hold(chunk);
event->chunk = chunk;
+ sctp_ulpevent_receive_data(event, asoc);
+
event->stream = ntohs(chunk->subh.data_hdr->stream);
event->ssn = ntohs(chunk->subh.data_hdr->ssn);
event->ppid = chunk->subh.data_hdr->ppid;
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 040ff627c18a..a7e42f9a405c 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -51,9 +51,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
ret = kstrtoul(val, 0, &num);
if (ret == -EINVAL)
goto out_inval;
- nbits = fls(num);
- if (num > (1U << nbits))
- nbits++;
+ nbits = fls(num - 1);
if (nbits > MAX_HASHTABLE_BITS || nbits < 2)
goto out_inval;
*(unsigned int *)kp->arg = nbits;
@@ -359,8 +357,10 @@ rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred)
EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify);
bool
-rpcauth_cred_key_to_expire(struct rpc_cred *cred)
+rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred)
{
+ if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
+ return false;
if (!cred->cr_ops->crkey_to_expire)
return false;
return cred->cr_ops->crkey_to_expire(cred);
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 54dd3fdead54..168219535a34 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -224,7 +224,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
/* Fast track for non crkey_timeout (no key) underlying credentials */
- if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags))
+ if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
return 0;
/* Fast track for the normal case */
@@ -236,12 +236,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
if (IS_ERR(tcred))
return -EACCES;
- if (!tcred->cr_ops->crkey_timeout) {
- set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags);
- ret = 0;
- goto out_put;
- }
-
/* Test for the almost error case */
ret = tcred->cr_ops->crkey_timeout(tcred);
if (ret != 0) {
@@ -257,7 +251,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags);
}
-out_put:
put_rpccred(tcred);
return ret;
}
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index e64ae93d5b4f..976c7812bbd5 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -340,12 +340,14 @@ gss_release_msg(struct gss_upcall_msg *gss_msg)
}
static struct gss_upcall_msg *
-__gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid)
+__gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid, const struct gss_auth *auth)
{
struct gss_upcall_msg *pos;
list_for_each_entry(pos, &pipe->in_downcall, list) {
if (!uid_eq(pos->uid, uid))
continue;
+ if (auth && pos->auth->service != auth->service)
+ continue;
atomic_inc(&pos->count);
dprintk("RPC: %s found msg %p\n", __func__, pos);
return pos;
@@ -365,7 +367,7 @@ gss_add_msg(struct gss_upcall_msg *gss_msg)
struct gss_upcall_msg *old;
spin_lock(&pipe->lock);
- old = __gss_find_upcall(pipe, gss_msg->uid);
+ old = __gss_find_upcall(pipe, gss_msg->uid, gss_msg->auth);
if (old == NULL) {
atomic_inc(&gss_msg->count);
list_add(&gss_msg->list, &pipe->in_downcall);
@@ -714,7 +716,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
err = -ENOENT;
/* Find a matching upcall */
spin_lock(&pipe->lock);
- gss_msg = __gss_find_upcall(pipe, uid);
+ gss_msg = __gss_find_upcall(pipe, uid, NULL);
if (gss_msg == NULL) {
spin_unlock(&pipe->lock);
goto err_put_ctx;
@@ -1015,8 +1017,11 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
auth = &gss_auth->rpc_auth;
auth->au_cslack = GSS_CRED_SLACK >> 2;
auth->au_rslack = GSS_VERF_SLACK >> 2;
+ auth->au_flags = 0;
auth->au_ops = &authgss_ops;
auth->au_flavor = flavor;
+ if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor))
+ auth->au_flags |= RPCAUTH_AUTH_DATATOUCH;
atomic_set(&auth->au_count, 1);
kref_init(&gss_auth->kref);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 65427492b1c9..60595835317a 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = {
.qop = GSS_C_QOP_DEFAULT,
.service = RPC_GSS_SVC_INTEGRITY,
.name = "krb5i",
+ .datatouch = true,
},
[2] = {
.pseudoflavor = RPC_AUTH_GSS_KRB5P,
.qop = GSS_C_QOP_DEFAULT,
.service = RPC_GSS_SVC_PRIVACY,
.name = "krb5p",
+ .datatouch = true,
},
};
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 7063d856a598..5fec3abbe19b 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor)
}
EXPORT_SYMBOL(gss_pseudoflavor_to_service);
+bool
+gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor)
+{
+ int i;
+
+ for (i = 0; i < gm->gm_pf_num; i++) {
+ if (gm->gm_pfs[i].pseudoflavor == pseudoflavor)
+ return gm->gm_pfs[i].datatouch;
+ }
+ return false;
+}
+
char *
gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service)
{
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index e085f5ae1548..1d281816f2bf 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1230,8 +1230,9 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
if (status)
goto out;
- dprintk("RPC: svcauth_gss: gss major status = %d\n",
- ud.major_status);
+ dprintk("RPC: svcauth_gss: gss major status = %d "
+ "minor status = %d\n",
+ ud.major_status, ud.minor_status);
switch (ud.major_status) {
case GSS_S_CONTINUE_NEEDED:
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 8d9eb4d5ddd8..4d17376b2acb 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -115,6 +115,7 @@ static
struct rpc_auth null_auth = {
.au_cslack = NUL_CALLSLACK,
.au_rslack = NUL_REPLYSLACK,
+ .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
.au_ops = &authnull_ops,
.au_flavor = RPC_AUTH_NULL,
.au_count = ATOMIC_INIT(0),
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 9f65452b7cbc..a99278c984e8 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -228,6 +228,7 @@ static
struct rpc_auth unix_auth = {
.au_cslack = UNX_CALLSLACK,
.au_rslack = NUL_REPLYSLACK,
+ .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
.au_ops = &authunix_ops,
.au_flavor = RPC_AUTH_UNIX,
.au_count = ATOMIC_INIT(0),
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 553bf95f7003..4d8e11f94a35 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -362,7 +362,7 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
cache_purge(cd);
spin_lock(&cache_list_lock);
write_lock(&cd->hash_lock);
- if (cd->entries || atomic_read(&cd->inuse)) {
+ if (cd->entries) {
write_unlock(&cd->hash_lock);
spin_unlock(&cache_list_lock);
goto out;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2808d550d273..7f79fb7dc6a0 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2577,7 +2577,7 @@ static void rpc_cb_add_xprt_release(void *calldata)
kfree(data);
}
-const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
+static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
.rpc_call_done = rpc_cb_add_xprt_done,
.rpc_release = rpc_cb_add_xprt_release,
};
@@ -2638,6 +2638,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
{
struct rpc_xprt_switch *xps;
struct rpc_xprt *xprt;
+ unsigned long reconnect_timeout;
unsigned char resvport;
int ret = 0;
@@ -2649,6 +2650,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
return -EAGAIN;
}
resvport = xprt->resvport;
+ reconnect_timeout = xprt->max_reconnect_timeout;
rcu_read_unlock();
xprt = xprt_create_transport(xprtargs);
@@ -2657,6 +2659,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
goto out_put_switch;
}
xprt->resvport = resvport;
+ xprt->max_reconnect_timeout = reconnect_timeout;
rpc_xprt_switch_set_roundrobin(xps);
if (setup) {
@@ -2673,6 +2676,27 @@ out_put_switch:
}
EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
+static int
+rpc_xprt_cap_max_reconnect_timeout(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ void *data)
+{
+ unsigned long timeout = *((unsigned long *)data);
+
+ if (timeout < xprt->max_reconnect_timeout)
+ xprt->max_reconnect_timeout = timeout;
+ return 0;
+}
+
+void
+rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo)
+{
+ rpc_clnt_iterate_for_each_xprt(clnt,
+ rpc_xprt_cap_max_reconnect_timeout,
+ &timeo);
+}
+EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout);
+
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static void rpc_show_header(void)
{
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index fcfd48d263f6..9ae588511aaf 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -54,7 +54,8 @@ static struct rpc_wait_queue delay_queue;
/*
* rpciod-related stuff
*/
-struct workqueue_struct *rpciod_workqueue;
+struct workqueue_struct *rpciod_workqueue __read_mostly;
+struct workqueue_struct *xprtiod_workqueue __read_mostly;
/*
* Disable the timer for a given RPC task. Should be called with
@@ -329,7 +330,8 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
* lockless RPC_IS_QUEUED() test) before we've had a chance to test
* the RPC_TASK_RUNNING flag.
*/
-static void rpc_make_runnable(struct rpc_task *task)
+static void rpc_make_runnable(struct workqueue_struct *wq,
+ struct rpc_task *task)
{
bool need_wakeup = !rpc_test_and_set_running(task);
@@ -338,7 +340,7 @@ static void rpc_make_runnable(struct rpc_task *task)
return;
if (RPC_IS_ASYNC(task)) {
INIT_WORK(&task->u.tk_work, rpc_async_schedule);
- queue_work(rpciod_workqueue, &task->u.tk_work);
+ queue_work(wq, &task->u.tk_work);
} else
wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
}
@@ -407,13 +409,16 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task,
EXPORT_SYMBOL_GPL(rpc_sleep_on_priority);
/**
- * __rpc_do_wake_up_task - wake up a single rpc_task
+ * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task
+ * @wq: workqueue on which to run task
* @queue: wait queue
* @task: task to be woken up
*
* Caller must hold queue->lock, and have cleared the task queued flag.
*/
-static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task)
+static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
+ struct rpc_wait_queue *queue,
+ struct rpc_task *task)
{
dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
task->tk_pid, jiffies);
@@ -428,7 +433,7 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
__rpc_remove_wait_queue(queue, task);
- rpc_make_runnable(task);
+ rpc_make_runnable(wq, task);
dprintk("RPC: __rpc_wake_up_task done\n");
}
@@ -436,16 +441,25 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
/*
* Wake up a queued task while the queue lock is being held
*/
-static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
+static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
+ struct rpc_wait_queue *queue, struct rpc_task *task)
{
if (RPC_IS_QUEUED(task)) {
smp_rmb();
if (task->tk_waitqueue == queue)
- __rpc_do_wake_up_task(queue, task);
+ __rpc_do_wake_up_task_on_wq(wq, queue, task);
}
}
/*
+ * Wake up a queued task while the queue lock is being held
+ */
+static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+ rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
+}
+
+/*
* Wake up a task on a specific queue
*/
void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task)
@@ -518,7 +532,8 @@ static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue)
/*
* Wake up the first task on the wait queue.
*/
-struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
+struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
+ struct rpc_wait_queue *queue,
bool (*func)(struct rpc_task *, void *), void *data)
{
struct rpc_task *task = NULL;
@@ -529,7 +544,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
task = __rpc_find_next_queued(queue);
if (task != NULL) {
if (func(task, data))
- rpc_wake_up_task_queue_locked(queue, task);
+ rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
else
task = NULL;
}
@@ -537,6 +552,15 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
return task;
}
+
+/*
+ * Wake up the first task on the wait queue.
+ */
+struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
+ bool (*func)(struct rpc_task *, void *), void *data)
+{
+ return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data);
+}
EXPORT_SYMBOL_GPL(rpc_wake_up_first);
static bool rpc_wake_up_next_func(struct rpc_task *task, void *data)
@@ -814,7 +838,7 @@ void rpc_execute(struct rpc_task *task)
bool is_async = RPC_IS_ASYNC(task);
rpc_set_active(task);
- rpc_make_runnable(task);
+ rpc_make_runnable(rpciod_workqueue, task);
if (!is_async)
__rpc_execute(task);
}
@@ -1071,10 +1095,22 @@ static int rpciod_start(void)
* Create the rpciod thread and wait for it to start.
*/
dprintk("RPC: creating workqueue rpciod\n");
- /* Note: highpri because network receive is latency sensitive */
- wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+ wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0);
+ if (!wq)
+ goto out_failed;
rpciod_workqueue = wq;
- return rpciod_workqueue != NULL;
+ /* Note: highpri because network receive is latency sensitive */
+ wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+ if (!wq)
+ goto free_rpciod;
+ xprtiod_workqueue = wq;
+ return 1;
+free_rpciod:
+ wq = rpciod_workqueue;
+ rpciod_workqueue = NULL;
+ destroy_workqueue(wq);
+out_failed:
+ return 0;
}
static void rpciod_stop(void)
@@ -1088,6 +1124,9 @@ static void rpciod_stop(void)
wq = rpciod_workqueue;
rpciod_workqueue = NULL;
destroy_workqueue(wq);
+ wq = xprtiod_workqueue;
+ xprtiod_workqueue = NULL;
+ destroy_workqueue(wq);
}
void
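
The sched.c changes split the single rpciod workqueue into rpciod plus a high-priority xprtiod, and rpciod_start() now unwinds the first queue if allocating the second one fails. A small sketch of that two-resource init/teardown ordering with stand-in allocation functions (not the real workqueue API):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static void *rpciod_wq;
static void *xprtiod_wq;

static void *alloc_wq(const char *name, bool fail)
{
	printf("alloc %s\n", name);
	return fail ? NULL : malloc(1);
}

static void destroy_wq(void *wq) { free(wq); }

/* Bring up both queues, or neither: the second failure path tears down
 * the first queue before reporting failure, like the hunk above. */
static int iod_start(bool fail_second)
{
	rpciod_wq = alloc_wq("rpciod", false);
	if (!rpciod_wq)
		return 0;

	xprtiod_wq = alloc_wq("xprtiod", fail_second);
	if (!xprtiod_wq) {
		destroy_wq(rpciod_wq);
		rpciod_wq = NULL;
		return 0;
	}
	return 1;
}

int main(void)
{
	printf("start (second alloc fails): %d\n", iod_start(true));
	printf("start (both succeed):       %d\n", iod_start(false));
	destroy_wq(xprtiod_wq);
	destroy_wq(rpciod_wq);
	return 0;
}
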
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index cc9852897395..c5b0cb4f4056 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1188,11 +1188,17 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
*statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
/* Encode reply */
- if (test_bit(RQ_DROPME, &rqstp->rq_flags)) {
+ if (*statp == rpc_drop_reply ||
+ test_bit(RQ_DROPME, &rqstp->rq_flags)) {
if (procp->pc_release)
procp->pc_release(rqstp, NULL, rqstp->rq_resp);
goto dropit;
}
+ if (*statp == rpc_autherr_badcred) {
+ if (procp->pc_release)
+ procp->pc_release(rqstp, NULL, rqstp->rq_resp);
+ goto err_bad_auth;
+ }
if (*statp == rpc_success &&
(xdr = procp->pc_encode) &&
!xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) {
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 4f01f63102ee..c3f652395a80 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -21,6 +21,10 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+static unsigned int svc_rpc_per_connection_limit __read_mostly;
+module_param(svc_rpc_per_connection_limit, uint, 0644);
+
+
static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt);
static int svc_deferred_recv(struct svc_rqst *rqstp);
static struct cache_deferred_req *svc_defer(struct cache_req *req);
@@ -329,12 +333,45 @@ char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
}
EXPORT_SYMBOL_GPL(svc_print_addr);
+static bool svc_xprt_slots_in_range(struct svc_xprt *xprt)
+{
+ unsigned int limit = svc_rpc_per_connection_limit;
+ int nrqsts = atomic_read(&xprt->xpt_nr_rqsts);
+
+ return limit == 0 || (nrqsts >= 0 && nrqsts < limit);
+}
+
+static bool svc_xprt_reserve_slot(struct svc_rqst *rqstp, struct svc_xprt *xprt)
+{
+ if (!test_bit(RQ_DATA, &rqstp->rq_flags)) {
+ if (!svc_xprt_slots_in_range(xprt))
+ return false;
+ atomic_inc(&xprt->xpt_nr_rqsts);
+ set_bit(RQ_DATA, &rqstp->rq_flags);
+ }
+ return true;
+}
+
+static void svc_xprt_release_slot(struct svc_rqst *rqstp)
+{
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ if (test_and_clear_bit(RQ_DATA, &rqstp->rq_flags)) {
+ atomic_dec(&xprt->xpt_nr_rqsts);
+ svc_xprt_enqueue(xprt);
+ }
+}
+
static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
{
if (xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_CLOSE)))
return true;
- if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED)))
- return xprt->xpt_ops->xpo_has_wspace(xprt);
+ if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED))) {
+ if (xprt->xpt_ops->xpo_has_wspace(xprt) &&
+ svc_xprt_slots_in_range(xprt))
+ return true;
+ trace_svc_xprt_no_write_space(xprt);
+ return false;
+ }
return false;
}
@@ -480,8 +517,6 @@ void svc_reserve(struct svc_rqst *rqstp, int space)
atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
rqstp->rq_reserved = space;
- if (xprt->xpt_ops->xpo_adjust_wspace)
- xprt->xpt_ops->xpo_adjust_wspace(xprt);
svc_xprt_enqueue(xprt);
}
}
@@ -512,8 +547,8 @@ static void svc_xprt_release(struct svc_rqst *rqstp)
rqstp->rq_res.head[0].iov_len = 0;
svc_reserve(rqstp, 0);
+ svc_xprt_release_slot(rqstp);
rqstp->rq_xprt = NULL;
-
svc_xprt_put(xprt);
}
@@ -781,7 +816,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
svc_add_new_temp_xprt(serv, newxpt);
else
module_put(xprt->xpt_class->xcl_owner);
- } else {
+ } else if (svc_xprt_reserve_slot(rqstp, xprt)) {
/* XPT_DATA|XPT_DEFERRED case: */
dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
rqstp, rqstp->rq_pool->sp_id, xprt,
@@ -871,6 +906,7 @@ EXPORT_SYMBOL_GPL(svc_recv);
*/
void svc_drop(struct svc_rqst *rqstp)
{
+ trace_svc_drop(rqstp);
dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
svc_xprt_release(rqstp);
}
@@ -1148,6 +1184,7 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
spin_unlock(&xprt->xpt_lock);
dprintk("revisit canceled\n");
svc_xprt_put(xprt);
+ trace_svc_drop_deferred(dr);
kfree(dr);
return;
}
@@ -1205,6 +1242,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req)
set_bit(RQ_DROPME, &rqstp->rq_flags);
dr->handle.revisit = svc_revisit;
+ trace_svc_defer(rqstp);
return &dr->handle;
}
@@ -1245,6 +1283,7 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
struct svc_deferred_req,
handle.recent);
list_del_init(&dr->handle.recent);
+ trace_svc_revisit_deferred(dr);
} else
clear_bit(XPT_DEFERRED, &xprt->xpt_flags);
spin_unlock(&xprt->xpt_lock);
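
The svc_xprt changes add an optional per-connection cap on in-flight requests: a request takes a "slot" on its transport the first time it carries data (the RQ_DATA flag), releases it when the reply is sent, and the transport is not picked up for more work while it sits at the limit. A hedged sketch of that slot accounting with C11 atomics (illustrative names, not the sunrpc API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static unsigned int per_conn_limit = 2;		/* 0 means "no limit" */

struct xprt {
	atomic_int nr_rqsts;	/* requests currently holding a slot */
};

struct rqst {
	struct xprt *xprt;
	bool has_slot;		/* mirrors the RQ_DATA flag */
};

static bool slots_in_range(struct xprt *x)
{
	int n = atomic_load(&x->nr_rqsts);

	return per_conn_limit == 0 || (n >= 0 && n < (int)per_conn_limit);
}

/* Take a slot once per request; further data on the same request is free. */
static bool reserve_slot(struct rqst *r)
{
	if (!r->has_slot) {
		if (!slots_in_range(r->xprt))
			return false;		/* connection is at its cap */
		atomic_fetch_add(&r->xprt->nr_rqsts, 1);
		r->has_slot = true;
	}
	return true;
}

static void release_slot(struct rqst *r)
{
	if (r->has_slot) {
		r->has_slot = false;
		atomic_fetch_sub(&r->xprt->nr_rqsts, 1);
		/* the kernel also re-enqueues the transport here */
	}
}

int main(void)
{
	struct xprt x = { .nr_rqsts = 0 };
	struct rqst a = { &x, false }, b = { &x, false }, c = { &x, false };

	printf("a: %d  b: %d  c: %d\n",
	       reserve_slot(&a), reserve_slot(&b), reserve_slot(&c));
	release_slot(&a);
	printf("c after release: %d\n", reserve_slot(&c));
	return 0;
}
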
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index dadfec66dbd8..57625f64efd5 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -60,7 +60,6 @@
static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
int flags);
-static void svc_udp_data_ready(struct sock *);
static int svc_udp_recvfrom(struct svc_rqst *);
static int svc_udp_sendto(struct svc_rqst *);
static void svc_sock_detach(struct svc_xprt *);
@@ -398,48 +397,21 @@ static int svc_sock_secure_port(struct svc_rqst *rqstp)
return svc_port_is_privileged(svc_addr(rqstp));
}
-static bool sunrpc_waitqueue_active(wait_queue_head_t *wq)
-{
- if (!wq)
- return false;
- /*
- * There should normally be a memory * barrier here--see
- * wq_has_sleeper().
- *
- * It appears that isn't currently necessary, though, basically
- * because callers all appear to have sufficient memory barriers
- * between the time the relevant change is made and the
- * time they call these callbacks.
- *
- * The nfsd code itself doesn't actually explicitly wait on
- * these waitqueues, but it may wait on them for example in
- * sendpage() or sendmsg() calls. (And those may be the only
- * places, since it it uses nonblocking reads.)
- *
- * Maybe we should add the memory barriers anyway, but these are
- * hot paths so we'd need to be convinced there's no sigificant
- * penalty.
- */
- return waitqueue_active(wq);
-}
-
/*
* INET callback when data has been received on the socket.
*/
-static void svc_udp_data_ready(struct sock *sk)
+static void svc_data_ready(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- wait_queue_head_t *wq = sk_sleep(sk);
if (svsk) {
dprintk("svc: socket %p(inet %p), busy=%d\n",
svsk, sk,
test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
- set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- svc_xprt_enqueue(&svsk->sk_xprt);
+ svsk->sk_odata(sk);
+ if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags))
+ svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible(wq);
}
/*
@@ -448,56 +420,22 @@ static void svc_udp_data_ready(struct sock *sk)
static void svc_write_space(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
- wait_queue_head_t *wq = sk_sleep(sk);
if (svsk) {
dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
+ svsk->sk_owspace(sk);
svc_xprt_enqueue(&svsk->sk_xprt);
}
-
- if (sunrpc_waitqueue_active(wq)) {
- dprintk("RPC svc_write_space: someone sleeping on %p\n",
- svsk);
- wake_up_interruptible(wq);
- }
}
static int svc_tcp_has_wspace(struct svc_xprt *xprt)
{
- struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
- struct svc_serv *serv = svsk->sk_xprt.xpt_server;
- int required;
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
if (test_bit(XPT_LISTENER, &xprt->xpt_flags))
return 1;
- required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg;
- if (sk_stream_wspace(svsk->sk_sk) >= required ||
- (sk_stream_min_wspace(svsk->sk_sk) == 0 &&
- atomic_read(&xprt->xpt_reserved) == 0))
- return 1;
- set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
- return 0;
-}
-
-static void svc_tcp_write_space(struct sock *sk)
-{
- struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
- struct socket *sock = sk->sk_socket;
-
- if (!sk_stream_is_writeable(sk) || !sock)
- return;
- if (!svsk || svc_tcp_has_wspace(&svsk->sk_xprt))
- clear_bit(SOCK_NOSPACE, &sock->flags);
- svc_write_space(sk);
-}
-
-static void svc_tcp_adjust_wspace(struct svc_xprt *xprt)
-{
- struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-
- if (svc_tcp_has_wspace(xprt))
- clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+ return !test_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
}
/*
@@ -746,7 +684,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class,
&svsk->sk_xprt, serv);
clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
- svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
+ svsk->sk_sk->sk_data_ready = svc_data_ready;
svsk->sk_sk->sk_write_space = svc_write_space;
/* initialise setting must have enough space to
@@ -786,11 +724,12 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
static void svc_tcp_listen_data_ready(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- wait_queue_head_t *wq;
dprintk("svc: socket %p TCP (listen) state change %d\n",
sk, sk->sk_state);
+ if (svsk)
+ svsk->sk_odata(sk);
/*
	 * This callback may be called twice when a new connection
* is established as a child socket inherits everything
@@ -808,10 +747,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
} else
printk("svc: socket %p: no user data\n", sk);
}
-
- wq = sk_sleep(sk);
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible_all(wq);
}
/*
@@ -820,7 +755,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
static void svc_tcp_state_change(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- wait_queue_head_t *wq = sk_sleep(sk);
dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
sk, sk->sk_state, sk->sk_user_data);
@@ -828,26 +762,12 @@ static void svc_tcp_state_change(struct sock *sk)
if (!svsk)
printk("svc: socket %p: no user data\n", sk);
else {
- set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
- svc_xprt_enqueue(&svsk->sk_xprt);
- }
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible_all(wq);
-}
-
-static void svc_tcp_data_ready(struct sock *sk)
-{
- struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- wait_queue_head_t *wq = sk_sleep(sk);
-
- dprintk("svc: socket %p TCP data ready (svsk %p)\n",
- sk, sk->sk_user_data);
- if (svsk) {
- set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- svc_xprt_enqueue(&svsk->sk_xprt);
+ svsk->sk_ostate(sk);
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+ svc_xprt_enqueue(&svsk->sk_xprt);
+ }
}
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible(wq);
}
/*
@@ -901,6 +821,11 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
dprintk("%s: connect from %s\n", serv->sv_name,
__svc_print_addr(sin, buf, sizeof(buf)));
+ /* Reset the inherited callbacks before calling svc_setup_socket */
+ newsock->sk->sk_state_change = svsk->sk_ostate;
+ newsock->sk->sk_data_ready = svsk->sk_odata;
+ newsock->sk->sk_write_space = svsk->sk_owspace;
+
/* make sure that a write doesn't block forever when
* low on memory
*/
@@ -1317,7 +1242,6 @@ static struct svc_xprt_ops svc_tcp_ops = {
.xpo_has_wspace = svc_tcp_has_wspace,
.xpo_accept = svc_tcp_accept,
.xpo_secure_port = svc_sock_secure_port,
- .xpo_adjust_wspace = svc_tcp_adjust_wspace,
};
static struct svc_xprt_class svc_tcp_class = {
@@ -1357,8 +1281,8 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
} else {
dprintk("setting up TCP socket for reading\n");
sk->sk_state_change = svc_tcp_state_change;
- sk->sk_data_ready = svc_tcp_data_ready;
- sk->sk_write_space = svc_tcp_write_space;
+ sk->sk_data_ready = svc_data_ready;
+ sk->sk_write_space = svc_write_space;
svsk->sk_reclen = 0;
svsk->sk_tcplen = 0;
@@ -1368,8 +1292,13 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- if (sk->sk_state != TCP_ESTABLISHED)
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ case TCP_ESTABLISHED:
+ break;
+ default:
set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+ }
}
}
@@ -1428,17 +1357,14 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
/* Initialize the socket */
if (sock->type == SOCK_DGRAM)
svc_udp_init(svsk, serv);
- else {
- /* initialise setting must have enough space to
- * receive and respond to one request.
- */
- svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg,
- 4 * serv->sv_max_mesg);
+ else
svc_tcp_init(svsk, serv);
- }
- dprintk("svc: svc_setup_socket created %p (inet %p)\n",
- svsk, svsk->sk_sk);
+ dprintk("svc: svc_setup_socket created %p (inet %p), "
+ "listen %d close %d\n",
+ svsk, svsk->sk_sk,
+ test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags),
+ test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
return svsk;
}
@@ -1606,18 +1532,16 @@ static void svc_sock_detach(struct svc_xprt *xprt)
{
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
struct sock *sk = svsk->sk_sk;
- wait_queue_head_t *wq;
dprintk("svc: svc_sock_detach(%p)\n", svsk);
/* put back the old socket callbacks */
+ lock_sock(sk);
sk->sk_state_change = svsk->sk_ostate;
sk->sk_data_ready = svsk->sk_odata;
sk->sk_write_space = svsk->sk_owspace;
-
- wq = sk_sleep(sk);
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible(wq);
+ sk->sk_user_data = NULL;
+ release_sock(sk);
}
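
A compact sketch of the detach path above, assuming a pthread mutex as a stand-in for lock_sock()/release_sock(): the saved callback is restored and sk_user_data cleared while the lock is held, so a concurrent callback sees either the fully installed state or the fully restored one. All struct and field names here are hypothetical.

#include <pthread.h>
#include <stdio.h>

struct sock_stub {
	pthread_mutex_t lock;
	void (*data_ready)(struct sock_stub *);
	void *user_data;
};

struct svsk_stub {
	void (*odata)(struct sock_stub *);	/* saved original callback */
};

static void original_data_ready(struct sock_stub *sk) { (void)sk; }
static void svc_data_ready_stub(struct sock_stub *sk) { (void)sk; }

static void detach(struct sock_stub *sk, struct svsk_stub *svsk)
{
	pthread_mutex_lock(&sk->lock);		/* lock_sock() */
	sk->data_ready = svsk->odata;		/* put the original back */
	sk->user_data = NULL;			/* callbacks no longer see the svsk */
	pthread_mutex_unlock(&sk->lock);	/* release_sock() */
}

int main(void)
{
	struct svsk_stub svsk = { .odata = original_data_ready };
	struct sock_stub sk = {
		.lock		= PTHREAD_MUTEX_INITIALIZER,
		.data_ready	= svc_data_ready_stub,
		.user_data	= &svsk,
	};

	detach(&sk, &svsk);
	printf("restored=%d user_data=%p\n",
	       sk.data_ready == original_data_ready, sk.user_data);
	return 0;
}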
/*
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 216a1385718a..ea244b29138b 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -220,7 +220,7 @@ static void xprt_clear_locked(struct rpc_xprt *xprt)
clear_bit(XPRT_LOCKED, &xprt->state);
smp_mb__after_atomic();
} else
- queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ queue_work(xprtiod_workqueue, &xprt->task_cleanup);
}
/*
@@ -295,7 +295,8 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt)
if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
return;
- if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt))
+ if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
+ __xprt_lock_write_func, xprt))
return;
xprt_clear_locked(xprt);
}
@@ -324,7 +325,8 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
return;
if (RPCXPRT_CONGESTED(xprt))
goto out_unlock;
- if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt))
+ if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
+ __xprt_lock_write_cong_func, xprt))
return;
out_unlock:
xprt_clear_locked(xprt);
@@ -645,7 +647,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
/* Try to schedule an autoclose RPC call */
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
- queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ queue_work(xprtiod_workqueue, &xprt->task_cleanup);
xprt_wake_pending_tasks(xprt, -EAGAIN);
spin_unlock_bh(&xprt->transport_lock);
}
@@ -672,12 +674,26 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
/* Try to schedule an autoclose RPC call */
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
- queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ queue_work(xprtiod_workqueue, &xprt->task_cleanup);
xprt_wake_pending_tasks(xprt, -EAGAIN);
out:
spin_unlock_bh(&xprt->transport_lock);
}
+static bool
+xprt_has_timer(const struct rpc_xprt *xprt)
+{
+ return xprt->idle_timeout != 0;
+}
+
+static void
+xprt_schedule_autodisconnect(struct rpc_xprt *xprt)
+ __must_hold(&xprt->transport_lock)
+{
+ if (list_empty(&xprt->recv) && xprt_has_timer(xprt))
+ mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout);
+}
+
static void
xprt_init_autodisconnect(unsigned long data)
{
@@ -686,10 +702,12 @@ xprt_init_autodisconnect(unsigned long data)
spin_lock(&xprt->transport_lock);
if (!list_empty(&xprt->recv))
goto out_abort;
+ /* Reset xprt->last_used to avoid connect/autodisconnect cycling */
+ xprt->last_used = jiffies;
if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
goto out_abort;
spin_unlock(&xprt->transport_lock);
- queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ queue_work(xprtiod_workqueue, &xprt->task_cleanup);
return;
out_abort:
spin_unlock(&xprt->transport_lock);
@@ -723,6 +741,7 @@ void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie)
goto out;
	xprt->snd_task = NULL;
xprt->ops->release_xprt(xprt, NULL);
+ xprt_schedule_autodisconnect(xprt);
out:
spin_unlock_bh(&xprt->transport_lock);
wake_up_bit(&xprt->state, XPRT_LOCKED);
@@ -886,11 +905,6 @@ static void xprt_timer(struct rpc_task *task)
spin_unlock_bh(&xprt->transport_lock);
}
-static inline int xprt_has_timer(struct rpc_xprt *xprt)
-{
- return xprt->idle_timeout != 0;
-}
-
/**
* xprt_prepare_transmit - reserve the transport before sending a request
* @task: RPC task about to send a request
@@ -1278,9 +1292,7 @@ void xprt_release(struct rpc_task *task)
if (!list_empty(&req->rq_list))
list_del(&req->rq_list);
xprt->last_used = jiffies;
- if (list_empty(&xprt->recv) && xprt_has_timer(xprt))
- mod_timer(&xprt->timer,
- xprt->last_used + xprt->idle_timeout);
+ xprt_schedule_autodisconnect(xprt);
spin_unlock_bh(&xprt->transport_lock);
if (req->rq_buffer)
xprt->ops->buf_free(req->rq_buffer);
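
The new xprt_schedule_autodisconnect() helper introduced in this file centralizes the rule that previously lived only in xprt_release(): arm the idle timer only when no replies are outstanding and the transport actually has an idle timeout. A small self-contained sketch of that rule, with hypothetical field names and a plain deadline in place of mod_timer():

#include <stdbool.h>
#include <stdio.h>

struct xprt_stub {
	unsigned long last_used;	/* time of last activity ("jiffies") */
	unsigned long idle_timeout;	/* 0 means never autodisconnect */
	int recv_outstanding;		/* models list_empty(&xprt->recv) */
	unsigned long timer_expires;
	bool timer_armed;
};

static bool xprt_has_timer(const struct xprt_stub *xprt)
{
	return xprt->idle_timeout != 0;
}

static void xprt_schedule_autodisconnect(struct xprt_stub *xprt)
{
	if (xprt->recv_outstanding == 0 && xprt_has_timer(xprt)) {
		xprt->timer_expires = xprt->last_used + xprt->idle_timeout;
		xprt->timer_armed = true;	/* models mod_timer() */
	}
}

int main(void)
{
	struct xprt_stub xprt = { .last_used = 1000, .idle_timeout = 300 };

	xprt_schedule_autodisconnect(&xprt);
	printf("armed=%d expires=%lu\n", xprt.timer_armed, xprt.timer_expires);
	return 0;
}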
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index e7fd76975d86..66c9d63f4797 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -271,14 +271,12 @@ struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi,
xprt_switch_find_xprt_t find_next)
{
struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
- struct list_head *head;
if (xps == NULL)
return NULL;
- head = &xps->xps_xprt_list;
- if (xps->xps_nxprts < 2)
- return xprt_switch_find_first_entry(head);
- return xprt_switch_set_next_cursor(head, &xpi->xpi_cursor, find_next);
+ return xprt_switch_set_next_cursor(&xps->xps_xprt_list,
+ &xpi->xpi_cursor,
+ find_next);
}
static
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index dc9f3b513a05..ef19fa42c50f 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,7 +1,7 @@
obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
rpcrdma-y := transport.o rpc_rdma.o verbs.o \
- fmr_ops.o frwr_ops.o physical_ops.o \
+ fmr_ops.o frwr_ops.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
module.o
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 6326ebe8b595..21cb3b150b37 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -19,13 +19,6 @@
* verb (fmr_op_unmap).
*/
-/* Transport recovery
- *
- * After a transport reconnect, fmr_op_map re-uses the MR already
- * allocated for the RPC, but generates a fresh rkey then maps the
- * MR again. This process is synchronous.
- */
-
#include "xprt_rdma.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -35,62 +28,132 @@
/* Maximum scatter/gather per FMR */
#define RPCRDMA_MAX_FMR_SGES (64)
-static struct workqueue_struct *fmr_recovery_wq;
-
-#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND)
+/* Access mode of externally registered pages */
+enum {
+ RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ,
+};
-int
-fmr_alloc_recovery_wq(void)
+bool
+fmr_is_supported(struct rpcrdma_ia *ia)
{
- fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
- return !fmr_recovery_wq ? -ENOMEM : 0;
+ if (!ia->ri_device->alloc_fmr) {
+ pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
+ ia->ri_device->name);
+ return false;
+ }
+ return true;
}
-void
-fmr_destroy_recovery_wq(void)
+static int
+fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
{
- struct workqueue_struct *wq;
+ static struct ib_fmr_attr fmr_attr = {
+ .max_pages = RPCRDMA_MAX_FMR_SGES,
+ .max_maps = 1,
+ .page_shift = PAGE_SHIFT
+ };
- if (!fmr_recovery_wq)
- return;
+ mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
+ sizeof(u64), GFP_KERNEL);
+ if (!mw->fmr.fm_physaddrs)
+ goto out_free;
- wq = fmr_recovery_wq;
- fmr_recovery_wq = NULL;
- destroy_workqueue(wq);
+ mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
+ sizeof(*mw->mw_sg), GFP_KERNEL);
+ if (!mw->mw_sg)
+ goto out_free;
+
+ sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
+
+ mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
+ &fmr_attr);
+ if (IS_ERR(mw->fmr.fm_mr))
+ goto out_fmr_err;
+
+ return 0;
+
+out_fmr_err:
+ dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__,
+ PTR_ERR(mw->fmr.fm_mr));
+
+out_free:
+ kfree(mw->mw_sg);
+ kfree(mw->fmr.fm_physaddrs);
+ return -ENOMEM;
}
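
fmr_op_init_mr() above relies on a two-stage unwind: allocate the physical-address array and the scatterlist in order, and on any failure fall through to a single cleanup label where kfree(NULL) keeps the error path simple. A userspace sketch of that shape, under the assumption (true for the kzalloc'd kernel object) that both pointers start out NULL:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct sg_stub { int dummy; };

struct mr_stub {
	unsigned long long *physaddrs;
	struct sg_stub *sg;
};

static int mr_stub_init(struct mr_stub *mr, int nsegs)
{
	mr->physaddrs = NULL;			/* kzalloc() guarantees this */
	mr->sg = NULL;

	mr->physaddrs = calloc(nsegs, sizeof(*mr->physaddrs));
	if (!mr->physaddrs)
		goto out_free;

	mr->sg = calloc(nsegs, sizeof(*mr->sg));
	if (!mr->sg)
		goto out_free;

	return 0;

out_free:
	free(mr->sg);				/* free(NULL) is a no-op */
	free(mr->physaddrs);
	return -ENOMEM;
}

int main(void)
{
	struct mr_stub mr;

	printf("%d\n", mr_stub_init(&mr, 64));
	free(mr.sg);
	free(mr.physaddrs);
	return 0;
}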
static int
__fmr_unmap(struct rpcrdma_mw *mw)
{
LIST_HEAD(l);
+ int rc;
- list_add(&mw->fmr.fmr->list, &l);
- return ib_unmap_fmr(&l);
+ list_add(&mw->fmr.fm_mr->list, &l);
+ rc = ib_unmap_fmr(&l);
+ list_del_init(&mw->fmr.fm_mr->list);
+ return rc;
}
-/* Deferred reset of a single FMR. Generate a fresh rkey by
- * replacing the MR. There's no recovery if this fails.
- */
static void
-__fmr_recovery_worker(struct work_struct *work)
+fmr_op_release_mr(struct rpcrdma_mw *r)
{
- struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
- mw_work);
- struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ LIST_HEAD(unmap_list);
+ int rc;
- __fmr_unmap(mw);
- rpcrdma_put_mw(r_xprt, mw);
- return;
+ /* Ensure MW is not on any rl_registered list */
+ if (!list_empty(&r->mw_list))
+ list_del(&r->mw_list);
+
+ kfree(r->fmr.fm_physaddrs);
+ kfree(r->mw_sg);
+
+ /* In case this one was left mapped, try to unmap it
+ * to prevent dealloc_fmr from failing with EBUSY
+ */
+ rc = __fmr_unmap(r);
+ if (rc)
+ pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
+ r, rc);
+
+ rc = ib_dealloc_fmr(r->fmr.fm_mr);
+ if (rc)
+ pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
+ r, rc);
+
+ kfree(r);
}
-/* A broken MR was discovered in a context that can't sleep.
- * Defer recovery to the recovery worker.
+/* Reset of a single FMR.
*/
static void
-__fmr_queue_recovery(struct rpcrdma_mw *mw)
+fmr_op_recover_mr(struct rpcrdma_mw *mw)
{
- INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
- queue_work(fmr_recovery_wq, &mw->mw_work);
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ int rc;
+
+ /* ORDER: invalidate first */
+ rc = __fmr_unmap(mw);
+
+ /* ORDER: then DMA unmap */
+ ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ if (rc)
+ goto out_release;
+
+ rpcrdma_put_mw(r_xprt, mw);
+ r_xprt->rx_stats.mrs_recovered++;
+ return;
+
+out_release:
+ pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw);
+ r_xprt->rx_stats.mrs_orphaned++;
+
+ spin_lock(&r_xprt->rx_buf.rb_mwlock);
+ list_del(&mw->mw_all);
+ spin_unlock(&r_xprt->rx_buf.rb_mwlock);
+
+ fmr_op_release_mr(mw);
}
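
fmr_op_recover_mr() above follows a strict order: invalidate the handle, DMA-unmap the scatterlist, then either return the MR to the free pool or, if invalidation failed, count it as orphaned and release it for good. A schematic sketch of that branch, with a hypothetical pool structure standing in for rpcrdma_buffer and the rx_stats counters:

#include <stdbool.h>
#include <stdio.h>

struct mr_pool_stub {
	int free_count;				/* MRs available for reuse */
	unsigned long mrs_recovered;
	unsigned long mrs_orphaned;
};

static void recover_mr(struct mr_pool_stub *pool, bool invalidated)
{
	/* ORDER: the handle is invalidated first, then the scatterlist
	 * is DMA-unmapped (both elided here).
	 */
	if (invalidated) {
		pool->free_count++;		/* back onto the free list */
		pool->mrs_recovered++;
		return;
	}
	pool->mrs_orphaned++;			/* released, never reused */
}

int main(void)
{
	struct mr_pool_stub pool = { 0 };

	recover_mr(&pool, true);
	recover_mr(&pool, false);
	printf("free=%d recovered=%lu orphaned=%lu\n",
	       pool.free_count, pool.mrs_recovered, pool.mrs_orphaned);
	return 0;
}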
static int
@@ -112,86 +175,21 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
}
-static int
-fmr_op_init(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
- struct ib_fmr_attr fmr_attr = {
- .max_pages = RPCRDMA_MAX_FMR_SGES,
- .max_maps = 1,
- .page_shift = PAGE_SHIFT
- };
- struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
- struct rpcrdma_mw *r;
- int i, rc;
-
- spin_lock_init(&buf->rb_mwlock);
- INIT_LIST_HEAD(&buf->rb_mws);
- INIT_LIST_HEAD(&buf->rb_all);
-
- i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
- i += 2; /* head + tail */
- i *= buf->rb_max_requests; /* one set for each RPC slot */
- dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
-
- rc = -ENOMEM;
- while (i--) {
- r = kzalloc(sizeof(*r), GFP_KERNEL);
- if (!r)
- goto out;
-
- r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
- sizeof(u64), GFP_KERNEL);
- if (!r->fmr.physaddrs)
- goto out_free;
-
- r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
- if (IS_ERR(r->fmr.fmr))
- goto out_fmr_err;
-
- r->mw_xprt = r_xprt;
- list_add(&r->mw_list, &buf->rb_mws);
- list_add(&r->mw_all, &buf->rb_all);
- }
- return 0;
-
-out_fmr_err:
- rc = PTR_ERR(r->fmr.fmr);
- dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
- kfree(r->fmr.physaddrs);
-out_free:
- kfree(r);
-out:
- return rc;
-}
-
/* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
static int
fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing)
+ int nsegs, bool writing, struct rpcrdma_mw **out)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct ib_device *device = ia->ri_device;
- enum dma_data_direction direction = rpcrdma_data_dir(writing);
struct rpcrdma_mr_seg *seg1 = seg;
int len, pageoff, i, rc;
struct rpcrdma_mw *mw;
+ u64 *dma_pages;
- mw = seg1->rl_mw;
- seg1->rl_mw = NULL;
- if (!mw) {
- mw = rpcrdma_get_mw(r_xprt);
- if (!mw)
- return -ENOMEM;
- } else {
- /* this is a retransmit; generate a fresh rkey */
- rc = __fmr_unmap(mw);
- if (rc)
- return rc;
- }
+ mw = rpcrdma_get_mw(r_xprt);
+ if (!mw)
+ return -ENOBUFS;
pageoff = offset_in_page(seg1->mr_offset);
seg1->mr_offset -= pageoff; /* start of page */
@@ -200,8 +198,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (nsegs > RPCRDMA_MAX_FMR_SGES)
nsegs = RPCRDMA_MAX_FMR_SGES;
for (i = 0; i < nsegs;) {
- rpcrdma_map_one(device, seg, direction);
- mw->fmr.physaddrs[i] = seg->mr_dma;
+ if (seg->mr_page)
+ sg_set_page(&mw->mw_sg[i],
+ seg->mr_page,
+ seg->mr_len,
+ offset_in_page(seg->mr_offset));
+ else
+ sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
+ seg->mr_len);
len += seg->mr_len;
++seg;
++i;
@@ -210,49 +214,54 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
-
- rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs,
- i, seg1->mr_dma);
+ mw->mw_nents = i;
+ mw->mw_dir = rpcrdma_data_dir(writing);
+ if (i == 0)
+ goto out_dmamap_err;
+
+ if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir))
+ goto out_dmamap_err;
+
+ for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
+ dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
+ rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
+ dma_pages[0]);
if (rc)
goto out_maperr;
- seg1->rl_mw = mw;
- seg1->mr_rkey = mw->fmr.fmr->rkey;
- seg1->mr_base = seg1->mr_dma + pageoff;
- seg1->mr_nsegs = i;
- seg1->mr_len = len;
- return i;
+ mw->mw_handle = mw->fmr.fm_mr->rkey;
+ mw->mw_length = len;
+ mw->mw_offset = dma_pages[0] + pageoff;
-out_maperr:
- dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
- __func__, len, (unsigned long long)seg1->mr_dma,
- pageoff, i, rc);
- while (i--)
- rpcrdma_unmap_one(device, --seg);
- return rc;
-}
+ *out = mw;
+ return mw->mw_nents;
-static void
-__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
- struct ib_device *device = r_xprt->rx_ia.ri_device;
- int nsegs = seg->mr_nsegs;
+out_dmamap_err:
+ pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
+ mw->mw_sg, mw->mw_nents);
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
- while (nsegs--)
- rpcrdma_unmap_one(device, seg++);
+out_maperr:
+ pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
+ len, (unsigned long long)dma_pages[0],
+ pageoff, mw->mw_nents, rc);
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
}
/* Invalidate all memory regions that were registered for "req".
*
* Sleeps until it is safe for the host CPU to access the
* previously mapped memory regions.
+ *
+ * Caller ensures that req->rl_registered is not empty.
*/
static void
fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
- struct rpcrdma_mr_seg *seg;
- unsigned int i, nchunks;
- struct rpcrdma_mw *mw;
+ struct rpcrdma_mw *mw, *tmp;
LIST_HEAD(unmap_list);
int rc;
@@ -261,90 +270,54 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* ORDER: Invalidate all of the req's MRs first
*
* ib_unmap_fmr() is slow, so use a single call instead
- * of one call per mapped MR.
+ * of one call per mapped FMR.
*/
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
-
- list_add(&mw->fmr.fmr->list, &unmap_list);
-
- i += seg->mr_nsegs;
- }
+ list_for_each_entry(mw, &req->rl_registered, mw_list)
+ list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
rc = ib_unmap_fmr(&unmap_list);
if (rc)
- pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc);
+ goto out_reset;
/* ORDER: Now DMA unmap all of the req's MRs, and return
* them to the free MW list.
*/
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
+ list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+ list_del_init(&mw->mw_list);
+ list_del_init(&mw->fmr.fm_mr->list);
+ ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ rpcrdma_put_mw(r_xprt, mw);
+ }
- __fmr_dma_unmap(r_xprt, seg);
- rpcrdma_put_mw(r_xprt, seg->rl_mw);
+ return;
- i += seg->mr_nsegs;
- seg->mr_nsegs = 0;
- seg->rl_mw = NULL;
- }
+out_reset:
+ pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
- req->rl_nchunks = 0;
+ list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+ list_del_init(&mw->fmr.fm_mr->list);
+ fmr_op_recover_mr(mw);
+ }
}
/* Use a slow, safe mechanism to invalidate all memory regions
* that were registered for "req".
- *
- * In the asynchronous case, DMA unmapping occurs first here
- * because the rpcrdma_mr_seg is released immediately after this
- * call. Its contents won't be available in __fmr_dma_unmap later.
- * FIXME.
*/
static void
fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
bool sync)
{
- struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
- unsigned int i;
-
- for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
-
- if (sync) {
- /* ORDER */
- __fmr_unmap(mw);
- __fmr_dma_unmap(r_xprt, seg);
- rpcrdma_put_mw(r_xprt, mw);
- } else {
- __fmr_dma_unmap(r_xprt, seg);
- __fmr_queue_recovery(mw);
- }
-
- i += seg->mr_nsegs;
- seg->mr_nsegs = 0;
- seg->rl_mw = NULL;
- }
-}
-
-static void
-fmr_op_destroy(struct rpcrdma_buffer *buf)
-{
- struct rpcrdma_mw *r;
- int rc;
-
- while (!list_empty(&buf->rb_all)) {
- r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
- list_del(&r->mw_all);
- kfree(r->fmr.physaddrs);
- rc = ib_dealloc_fmr(r->fmr.fmr);
- if (rc)
- dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
- __func__, rc);
+ while (!list_empty(&req->rl_registered)) {
+ mw = list_first_entry(&req->rl_registered,
+ struct rpcrdma_mw, mw_list);
+ list_del_init(&mw->mw_list);
- kfree(r);
+ if (sync)
+ fmr_op_recover_mr(mw);
+ else
+ rpcrdma_defer_mr_recovery(mw);
}
}
@@ -352,9 +325,10 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map,
.ro_unmap_sync = fmr_op_unmap_sync,
.ro_unmap_safe = fmr_op_unmap_safe,
+ .ro_recover_mr = fmr_op_recover_mr,
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
- .ro_init = fmr_op_init,
- .ro_destroy = fmr_op_destroy,
+ .ro_init_mr = fmr_op_init_mr,
+ .ro_release_mr = fmr_op_release_mr,
.ro_displayname = "fmr",
};
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c0947544babe..892b5e1d9b09 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -73,29 +73,71 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
-static struct workqueue_struct *frwr_recovery_wq;
-
-#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM)
+bool
+frwr_is_supported(struct rpcrdma_ia *ia)
+{
+ struct ib_device_attr *attrs = &ia->ri_device->attrs;
+
+ if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
+ goto out_not_supported;
+ if (attrs->max_fast_reg_page_list_len == 0)
+ goto out_not_supported;
+ return true;
+
+out_not_supported:
+ pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
+ ia->ri_device->name);
+ return false;
+}
-int
-frwr_alloc_recovery_wq(void)
+static int
+frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
{
- frwr_recovery_wq = alloc_workqueue("frwr_recovery",
- FRWR_RECOVERY_WQ_FLAGS, 0);
- return !frwr_recovery_wq ? -ENOMEM : 0;
+ unsigned int depth = ia->ri_max_frmr_depth;
+ struct rpcrdma_frmr *f = &r->frmr;
+ int rc;
+
+ f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth);
+ if (IS_ERR(f->fr_mr))
+ goto out_mr_err;
+
+ r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL);
+ if (!r->mw_sg)
+ goto out_list_err;
+
+ sg_init_table(r->mw_sg, depth);
+ init_completion(&f->fr_linv_done);
+ return 0;
+
+out_mr_err:
+ rc = PTR_ERR(f->fr_mr);
+ dprintk("RPC: %s: ib_alloc_mr status %i\n",
+ __func__, rc);
+ return rc;
+
+out_list_err:
+ rc = -ENOMEM;
+ dprintk("RPC: %s: sg allocation failure\n",
+ __func__);
+ ib_dereg_mr(f->fr_mr);
+ return rc;
}
-void
-frwr_destroy_recovery_wq(void)
+static void
+frwr_op_release_mr(struct rpcrdma_mw *r)
{
- struct workqueue_struct *wq;
+ int rc;
- if (!frwr_recovery_wq)
- return;
+ /* Ensure MW is not on any rl_registered list */
+ if (!list_empty(&r->mw_list))
+ list_del(&r->mw_list);
- wq = frwr_recovery_wq;
- frwr_recovery_wq = NULL;
- destroy_workqueue(wq);
+ rc = ib_dereg_mr(r->frmr.fr_mr);
+ if (rc)
+ pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
+ r, rc);
+ kfree(r->mw_sg);
+ kfree(r);
}
static int
@@ -124,93 +166,37 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
return 0;
}
-static void
-__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_frmr *f = &mw->frmr;
- int rc;
-
- rc = __frwr_reset_mr(ia, mw);
- ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
- if (rc)
- return;
-
- rpcrdma_put_mw(r_xprt, mw);
-}
-
-/* Deferred reset of a single FRMR. Generate a fresh rkey by
- * replacing the MR.
+/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
*
* There's no recovery if this fails. The FRMR is abandoned, but
* remains in rb_all. It will be cleaned up when the transport is
* destroyed.
*/
static void
-__frwr_recovery_worker(struct work_struct *work)
-{
- struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
- mw_work);
-
- __frwr_reset_and_unmap(r->mw_xprt, r);
- return;
-}
-
-/* A broken MR was discovered in a context that can't sleep.
- * Defer recovery to the recovery worker.
- */
-static void
-__frwr_queue_recovery(struct rpcrdma_mw *r)
-{
- INIT_WORK(&r->mw_work, __frwr_recovery_worker);
- queue_work(frwr_recovery_wq, &r->mw_work);
-}
-
-static int
-__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
- unsigned int depth)
+frwr_op_recover_mr(struct rpcrdma_mw *mw)
{
- struct rpcrdma_frmr *f = &r->frmr;
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
int rc;
- f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
- if (IS_ERR(f->fr_mr))
- goto out_mr_err;
-
- f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
- if (!f->fr_sg)
- goto out_list_err;
-
- sg_init_table(f->fr_sg, depth);
-
- init_completion(&f->fr_linv_done);
-
- return 0;
+ rc = __frwr_reset_mr(ia, mw);
+ ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ if (rc)
+ goto out_release;
-out_mr_err:
- rc = PTR_ERR(f->fr_mr);
- dprintk("RPC: %s: ib_alloc_mr status %i\n",
- __func__, rc);
- return rc;
+ rpcrdma_put_mw(r_xprt, mw);
+ r_xprt->rx_stats.mrs_recovered++;
+ return;
-out_list_err:
- rc = -ENOMEM;
- dprintk("RPC: %s: sg allocation failure\n",
- __func__);
- ib_dereg_mr(f->fr_mr);
- return rc;
-}
+out_release:
+ pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw);
+ r_xprt->rx_stats.mrs_orphaned++;
-static void
-__frwr_release(struct rpcrdma_mw *r)
-{
- int rc;
+ spin_lock(&r_xprt->rx_buf.rb_mwlock);
+ list_del(&mw->mw_all);
+ spin_unlock(&r_xprt->rx_buf.rb_mwlock);
- rc = ib_dereg_mr(r->frmr.fr_mr);
- if (rc)
- dprintk("RPC: %s: ib_dereg_mr status %i\n",
- __func__, rc);
- kfree(r->frmr.fr_sg);
+ frwr_op_release_mr(mw);
}
static int
@@ -346,57 +332,14 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
complete_all(&frmr->fr_linv_done);
}
-static int
-frwr_op_init(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct ib_device *device = r_xprt->rx_ia.ri_device;
- unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
- struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
- int i;
-
- spin_lock_init(&buf->rb_mwlock);
- INIT_LIST_HEAD(&buf->rb_mws);
- INIT_LIST_HEAD(&buf->rb_all);
-
- i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
- i += 2; /* head + tail */
- i *= buf->rb_max_requests; /* one set for each RPC slot */
- dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
-
- while (i--) {
- struct rpcrdma_mw *r;
- int rc;
-
- r = kzalloc(sizeof(*r), GFP_KERNEL);
- if (!r)
- return -ENOMEM;
-
- rc = __frwr_init(r, pd, device, depth);
- if (rc) {
- kfree(r);
- return rc;
- }
-
- r->mw_xprt = r_xprt;
- list_add(&r->mw_list, &buf->rb_mws);
- list_add(&r->mw_all, &buf->rb_all);
- }
-
- return 0;
-}
-
-/* Post a FAST_REG Work Request to register a memory region
+/* Post a REG_MR Work Request to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
static int
frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing)
+ int nsegs, bool writing, struct rpcrdma_mw **out)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct ib_device *device = ia->ri_device;
- enum dma_data_direction direction = rpcrdma_data_dir(writing);
- struct rpcrdma_mr_seg *seg1 = seg;
struct rpcrdma_mw *mw;
struct rpcrdma_frmr *frmr;
struct ib_mr *mr;
@@ -405,14 +348,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int rc, i, n, dma_nents;
u8 key;
- mw = seg1->rl_mw;
- seg1->rl_mw = NULL;
+ mw = NULL;
do {
if (mw)
- __frwr_queue_recovery(mw);
+ rpcrdma_defer_mr_recovery(mw);
mw = rpcrdma_get_mw(r_xprt);
if (!mw)
- return -ENOMEM;
+ return -ENOBUFS;
} while (mw->frmr.fr_state != FRMR_IS_INVALID);
frmr = &mw->frmr;
frmr->fr_state = FRMR_IS_VALID;
@@ -421,15 +363,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (nsegs > ia->ri_max_frmr_depth)
nsegs = ia->ri_max_frmr_depth;
-
for (i = 0; i < nsegs;) {
if (seg->mr_page)
- sg_set_page(&frmr->fr_sg[i],
+ sg_set_page(&mw->mw_sg[i],
seg->mr_page,
seg->mr_len,
offset_in_page(seg->mr_offset));
else
- sg_set_buf(&frmr->fr_sg[i], seg->mr_offset,
+ sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
seg->mr_len);
++seg;
@@ -440,26 +381,22 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
- frmr->fr_nents = i;
- frmr->fr_dir = direction;
-
- dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction);
- if (!dma_nents) {
- pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n",
- __func__, frmr->fr_sg, frmr->fr_nents);
- return -ENOMEM;
- }
+ mw->mw_nents = i;
+ mw->mw_dir = rpcrdma_data_dir(writing);
+ if (i == 0)
+ goto out_dmamap_err;
- n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE);
- if (unlikely(n != frmr->fr_nents)) {
- pr_err("RPC: %s: failed to map mr %p (%u/%u)\n",
- __func__, frmr->fr_mr, n, frmr->fr_nents);
- rc = n < 0 ? n : -EINVAL;
- goto out_senderr;
- }
+ dma_nents = ib_dma_map_sg(ia->ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ if (!dma_nents)
+ goto out_dmamap_err;
+
+ n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE);
+ if (unlikely(n != mw->mw_nents))
+ goto out_mapmr_err;
dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
- __func__, mw, frmr->fr_nents, mr->length);
+ __func__, mw, mw->mw_nents, mr->length);
key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
@@ -481,24 +418,34 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (rc)
goto out_senderr;
- seg1->rl_mw = mw;
- seg1->mr_rkey = mr->rkey;
- seg1->mr_base = mr->iova;
- seg1->mr_nsegs = frmr->fr_nents;
- seg1->mr_len = mr->length;
+ mw->mw_handle = mr->rkey;
+ mw->mw_length = mr->length;
+ mw->mw_offset = mr->iova;
+
+ *out = mw;
+ return mw->mw_nents;
- return frmr->fr_nents;
+out_dmamap_err:
+ pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
+ mw->mw_sg, mw->mw_nents);
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
+
+out_mapmr_err:
+ pr_err("rpcrdma: failed to map mr %p (%u/%u)\n",
+ frmr->fr_mr, n, mw->mw_nents);
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
out_senderr:
- dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- __frwr_queue_recovery(mw);
- return rc;
+ pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc);
+ rpcrdma_defer_mr_recovery(mw);
+ return -ENOTCONN;
}
static struct ib_send_wr *
-__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
+__frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
{
- struct rpcrdma_mw *mw = seg->rl_mw;
struct rpcrdma_frmr *f = &mw->frmr;
struct ib_send_wr *invalidate_wr;
@@ -518,16 +465,16 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
*
* Sleeps until it is safe for the host CPU to access the
* previously mapped memory regions.
+ *
+ * Caller ensures that req->rl_registered is not empty.
*/
static void
frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mr_seg *seg;
- unsigned int i, nchunks;
+ struct rpcrdma_mw *mw, *tmp;
struct rpcrdma_frmr *f;
- struct rpcrdma_mw *mw;
int rc;
dprintk("RPC: %s: req %p\n", __func__, req);
@@ -537,22 +484,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* Chain the LOCAL_INV Work Requests and post them with
* a single ib_post_send() call.
*/
+ f = NULL;
invalidate_wrs = pos = prev = NULL;
- seg = NULL;
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
-
- pos = __frwr_prepare_linv_wr(seg);
+ list_for_each_entry(mw, &req->rl_registered, mw_list) {
+ pos = __frwr_prepare_linv_wr(mw);
if (!invalidate_wrs)
invalidate_wrs = pos;
else
prev->next = pos;
prev = pos;
-
- i += seg->mr_nsegs;
+ f = &mw->frmr;
}
- f = &seg->rl_mw->frmr;
/* Strong send queue ordering guarantees that when the
* last WR in the chain completes, all WRs in the chain
@@ -577,39 +520,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* them to the free MW list.
*/
unmap:
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
- seg->rl_mw = NULL;
-
- ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
- f->fr_dir);
+ list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+ list_del_init(&mw->mw_list);
+ ib_dma_unmap_sg(ia->ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
rpcrdma_put_mw(r_xprt, mw);
-
- i += seg->mr_nsegs;
- seg->mr_nsegs = 0;
}
-
- req->rl_nchunks = 0;
return;
reset_mrs:
- pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
+ pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc);
+ rdma_disconnect(ia->ri_id);
/* Find and reset the MRs in the LOCAL_INV WRs that did not
* get posted. This is synchronous, and slow.
*/
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
+ list_for_each_entry(mw, &req->rl_registered, mw_list) {
f = &mw->frmr;
-
if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
__frwr_reset_mr(ia, mw);
bad_wr = bad_wr->next;
}
-
- i += seg->mr_nsegs;
}
goto unmap;
}
@@ -621,38 +552,17 @@ static void
frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
bool sync)
{
- struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
- unsigned int i;
- for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
+ while (!list_empty(&req->rl_registered)) {
+ mw = list_first_entry(&req->rl_registered,
+ struct rpcrdma_mw, mw_list);
+ list_del_init(&mw->mw_list);
if (sync)
- __frwr_reset_and_unmap(r_xprt, mw);
+ frwr_op_recover_mr(mw);
else
- __frwr_queue_recovery(mw);
-
- i += seg->mr_nsegs;
- seg->mr_nsegs = 0;
- seg->rl_mw = NULL;
- }
-}
-
-static void
-frwr_op_destroy(struct rpcrdma_buffer *buf)
-{
- struct rpcrdma_mw *r;
-
- /* Ensure stale MWs for "buf" are no longer in flight */
- flush_workqueue(frwr_recovery_wq);
-
- while (!list_empty(&buf->rb_all)) {
- r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
- list_del(&r->mw_all);
- __frwr_release(r);
- kfree(r);
+ rpcrdma_defer_mr_recovery(mw);
}
}
@@ -660,9 +570,10 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_map = frwr_op_map,
.ro_unmap_sync = frwr_op_unmap_sync,
.ro_unmap_safe = frwr_op_unmap_safe,
+ .ro_recover_mr = frwr_op_recover_mr,
.ro_open = frwr_op_open,
.ro_maxpages = frwr_op_maxpages,
- .ro_init = frwr_op_init,
- .ro_destroy = frwr_op_destroy,
+ .ro_init_mr = frwr_op_init_mr,
+ .ro_release_mr = frwr_op_release_mr,
.ro_displayname = "frwr",
};
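
frwr_op_unmap_sync() above chains one LOCAL_INV work request per registered MR into a single list, requests a completion only on the final request, and posts the whole chain with one verb call; send-queue ordering then guarantees that when the last one completes they all have. A minimal sketch of that chaining, with a hypothetical linv_wr type in place of struct ib_send_wr:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct linv_wr {
	unsigned int rkey;
	bool signaled;			/* request a completion for this WR */
	struct linv_wr *next;
};

static struct linv_wr *chain_linv(struct linv_wr *wrs, int n)
{
	struct linv_wr *first = NULL, *prev = NULL;
	int i;

	for (i = 0; i < n; i++) {
		wrs[i].signaled = false;
		wrs[i].next = NULL;
		if (!first)
			first = &wrs[i];
		else
			prev->next = &wrs[i];
		prev = &wrs[i];
	}
	if (prev)
		prev->signaled = true;	/* wake the waiter once, at the end */
	return first;			/* post this chain with one call */
}

int main(void)
{
	struct linv_wr wrs[3] = { { .rkey = 1 }, { .rkey = 2 }, { .rkey = 3 } };
	struct linv_wr *wr;

	for (wr = chain_linv(wrs, 3); wr; wr = wr->next)
		printf("rkey %u signaled=%d\n", wr->rkey, wr->signaled);
	return 0;
}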
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
deleted file mode 100644
index 3750596cc432..000000000000
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2015 Oracle. All rights reserved.
- * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
- */
-
-/* No-op chunk preparation. All client memory is pre-registered.
- * Sometimes referred to as ALLPHYSICAL mode.
- *
- * Physical registration is simple because all client memory is
- * pre-registered and never deregistered. This mode is good for
- * adapter bring up, but is considered not safe: the server is
- * trusted not to abuse its access to client memory not involved
- * in RDMA I/O.
- */
-
-#include "xprt_rdma.h"
-
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_TRANS
-#endif
-
-static int
-physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
- struct rpcrdma_create_data_internal *cdata)
-{
- struct ib_mr *mr;
-
- /* Obtain an rkey to use for RPC data payloads.
- */
- mr = ib_get_dma_mr(ia->ri_pd,
- IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_REMOTE_READ);
- if (IS_ERR(mr)) {
- pr_err("%s: ib_get_dma_mr for failed with %lX\n",
- __func__, PTR_ERR(mr));
- return -ENOMEM;
- }
- ia->ri_dma_mr = mr;
-
- rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
- RPCRDMA_MAX_DATA_SEGS,
- RPCRDMA_MAX_HDR_SEGS));
- return 0;
-}
-
-/* PHYSICAL memory registration conveys one page per chunk segment.
- */
-static size_t
-physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
-{
- return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- RPCRDMA_MAX_HDR_SEGS);
-}
-
-static int
-physical_op_init(struct rpcrdma_xprt *r_xprt)
-{
- return 0;
-}
-
-/* The client's physical memory is already exposed for
- * remote access via RDMA READ or RDMA WRITE.
- */
-static int
-physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
- seg->mr_rkey = ia->ri_dma_mr->rkey;
- seg->mr_base = seg->mr_dma;
- return 1;
-}
-
-/* DMA unmap all memory regions that were mapped for "req".
- */
-static void
-physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
-{
- struct ib_device *device = r_xprt->rx_ia.ri_device;
- unsigned int i;
-
- for (i = 0; req->rl_nchunks; --req->rl_nchunks)
- rpcrdma_unmap_one(device, &req->rl_segments[i++]);
-}
-
-/* Use a slow, safe mechanism to invalidate all memory regions
- * that were registered for "req".
- *
- * For physical memory registration, there is no good way to
- * fence a single MR that has been advertised to the server. The
- * client has already handed the server an R_key that cannot be
- * invalidated and is shared by all MRs on this connection.
- * Tearing down the PD might be the only safe choice, but it's
- * not clear that a freshly acquired DMA R_key would be different
- * than the one used by the PD that was just destroyed.
- * FIXME.
- */
-static void
-physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
- bool sync)
-{
- physical_op_unmap_sync(r_xprt, req);
-}
-
-static void
-physical_op_destroy(struct rpcrdma_buffer *buf)
-{
-}
-
-const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
- .ro_map = physical_op_map,
- .ro_unmap_sync = physical_op_unmap_sync,
- .ro_unmap_safe = physical_op_unmap_safe,
- .ro_open = physical_op_open,
- .ro_maxpages = physical_op_maxpages,
- .ro_init = physical_op_init,
- .ro_destroy = physical_op_destroy,
- .ro_displayname = "physical",
-};
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 35a81096e83d..a47f170b20ef 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -196,8 +196,7 @@ rpcrdma_tail_pullup(struct xdr_buf *buf)
* MR when they can.
*/
static int
-rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
- int n, int nsegs)
+rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
{
size_t page_offset;
u32 remaining;
@@ -206,7 +205,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
base = vec->iov_base;
page_offset = offset_in_page(base);
remaining = vec->iov_len;
- while (remaining && n < nsegs) {
+ while (remaining && n < RPCRDMA_MAX_SEGS) {
seg[n].mr_page = NULL;
seg[n].mr_offset = base;
seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
@@ -230,34 +229,34 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
static int
rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
- enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
+ enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg)
{
- int len, n = 0, p;
- int page_base;
+ int len, n, p, page_base;
struct page **ppages;
+ n = 0;
if (pos == 0) {
- n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs);
- if (n == nsegs)
- return -EIO;
+ n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n);
+ if (n == RPCRDMA_MAX_SEGS)
+ goto out_overflow;
}
len = xdrbuf->page_len;
ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
page_base = xdrbuf->page_base & ~PAGE_MASK;
p = 0;
- while (len && n < nsegs) {
+ while (len && n < RPCRDMA_MAX_SEGS) {
if (!ppages[p]) {
/* alloc the pagelist for receiving buffer */
ppages[p] = alloc_page(GFP_ATOMIC);
if (!ppages[p])
- return -ENOMEM;
+ return -EAGAIN;
}
seg[n].mr_page = ppages[p];
seg[n].mr_offset = (void *)(unsigned long) page_base;
seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
if (seg[n].mr_len > PAGE_SIZE)
- return -EIO;
+ goto out_overflow;
len -= seg[n].mr_len;
++n;
++p;
@@ -265,8 +264,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
}
/* Message overflows the seg array */
- if (len && n == nsegs)
- return -EIO;
+ if (len && n == RPCRDMA_MAX_SEGS)
+ goto out_overflow;
/* When encoding the read list, the tail is always sent inline */
if (type == rpcrdma_readch)
@@ -277,20 +276,24 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
* xdr pad bytes, saving the server an RDMA operation. */
if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
return n;
- n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs);
- if (n == nsegs)
- return -EIO;
+ n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
+ if (n == RPCRDMA_MAX_SEGS)
+ goto out_overflow;
}
return n;
+
+out_overflow:
+ pr_err("rpcrdma: segment array overflow\n");
+ return -EIO;
}
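
rpcrdma_convert_iovs() now caps the conversion at RPCRDMA_MAX_SEGS and reports overflow once, through out_overflow. A self-contained sketch of that bounded-fill pattern, using a hypothetical fixed-size segment array and page-sized chunks:

#include <errno.h>
#include <stdio.h>

#define MAX_SEGS	8
#define SEG_SIZE	4096u

struct seg {
	size_t offset;
	size_t len;
};

static int convert_buf(size_t buflen, struct seg *segs)
{
	size_t off = 0;
	int n = 0;

	while (buflen) {
		size_t chunk = buflen < SEG_SIZE ? buflen : SEG_SIZE;

		if (n == MAX_SEGS) {
			fprintf(stderr, "segment array overflow\n");
			return -EIO;	/* caller terminates the RPC */
		}
		segs[n].offset = off;
		segs[n].len = chunk;
		off += chunk;
		buflen -= chunk;
		n++;
	}
	return n;
}

int main(void)
{
	struct seg segs[MAX_SEGS];

	printf("%d\n", convert_buf(3 * SEG_SIZE + 100, segs));	/* 4 */
	printf("%d\n", convert_buf(100 * SEG_SIZE, segs));	/* -EIO */
	return 0;
}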
static inline __be32 *
-xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg)
+xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
{
- *iptr++ = cpu_to_be32(seg->mr_rkey);
- *iptr++ = cpu_to_be32(seg->mr_len);
- return xdr_encode_hyper(iptr, seg->mr_base);
+ *iptr++ = cpu_to_be32(mw->mw_handle);
+ *iptr++ = cpu_to_be32(mw->mw_length);
+ return xdr_encode_hyper(iptr, mw->mw_offset);
}
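
xdr_encode_rdma_segment() above emits exactly three XDR fields per segment: a 32-bit handle (the rkey), a 32-bit length, and a 64-bit offset, all in network byte order, with the 64-bit value written high word first as xdr_encode_hyper() does. A standalone sketch of that wire layout, with a hypothetical mw_stub in place of struct rpcrdma_mw:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

struct mw_stub {
	uint32_t handle;	/* mw_handle: the rkey */
	uint32_t length;	/* mw_length */
	uint64_t offset;	/* mw_offset */
};

static uint32_t *encode_rdma_segment(uint32_t *p, const struct mw_stub *mw)
{
	*p++ = htonl(mw->handle);
	*p++ = htonl(mw->length);
	*p++ = htonl((uint32_t)(mw->offset >> 32));	/* high word first */
	*p++ = htonl((uint32_t)(mw->offset & 0xffffffffu));
	return p;
}

int main(void)
{
	uint32_t buf[4];
	struct mw_stub mw = {
		.handle = 0x1234,
		.length = 8192,
		.offset = 0x0000000100000000ULL,
	};

	encode_rdma_segment(buf, &mw);
	printf("%08x %08x %08x %08x\n",
	       ntohl(buf[0]), ntohl(buf[1]), ntohl(buf[2]), ntohl(buf[3]));
	return 0;
}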
/* XDR-encode the Read list. Supports encoding a list of read
@@ -310,7 +313,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req, struct rpc_rqst *rqst,
__be32 *iptr, enum rpcrdma_chunktype rtype)
{
- struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
unsigned int pos;
int n, nsegs;
@@ -322,15 +326,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
pos = rqst->rq_snd_buf.head[0].iov_len;
if (rtype == rpcrdma_areadch)
pos = 0;
- nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg,
- RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ seg = req->rl_segments;
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false);
- if (n <= 0)
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ false, &mw);
+ if (n < 0)
return ERR_PTR(n);
+ list_add(&mw->mw_list, &req->rl_registered);
*iptr++ = xdr_one; /* item present */
@@ -338,20 +344,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
* have the same "position".
*/
*iptr++ = cpu_to_be32(pos);
- iptr = xdr_encode_rdma_segment(iptr, seg);
+ iptr = xdr_encode_rdma_segment(iptr, mw);
- dprintk("RPC: %5u %s: read segment pos %u "
- "%d@0x%016llx:0x%08x (%s)\n",
+ dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__, pos,
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, n < nsegs ? "more" : "last");
+ mw->mw_length, (unsigned long long)mw->mw_offset,
+ mw->mw_handle, n < nsegs ? "more" : "last");
r_xprt->rx_stats.read_chunk_count++;
- req->rl_nchunks++;
seg += n;
nsegs -= n;
} while (nsegs);
- req->rl_nextseg = seg;
/* Finish Read list */
*iptr++ = xdr_zero; /* Next item not present */
@@ -375,7 +378,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
struct rpc_rqst *rqst, __be32 *iptr,
enum rpcrdma_chunktype wtype)
{
- struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
int n, nsegs, nchunks;
__be32 *segcount;
@@ -384,10 +388,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
return iptr;
}
+ seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
rqst->rq_rcv_buf.head[0].iov_len,
- wtype, seg,
- RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ wtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -396,26 +400,25 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
nchunks = 0;
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
- if (n <= 0)
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ true, &mw);
+ if (n < 0)
return ERR_PTR(n);
+ list_add(&mw->mw_list, &req->rl_registered);
- iptr = xdr_encode_rdma_segment(iptr, seg);
+ iptr = xdr_encode_rdma_segment(iptr, mw);
- dprintk("RPC: %5u %s: write segment "
- "%d@0x016%llx:0x%08x (%s)\n",
+ dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__,
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, n < nsegs ? "more" : "last");
+ mw->mw_length, (unsigned long long)mw->mw_offset,
+ mw->mw_handle, n < nsegs ? "more" : "last");
r_xprt->rx_stats.write_chunk_count++;
r_xprt->rx_stats.total_rdma_request += seg->mr_len;
- req->rl_nchunks++;
nchunks++;
seg += n;
nsegs -= n;
} while (nsegs);
- req->rl_nextseg = seg;
/* Update count of segments in this Write chunk */
*segcount = cpu_to_be32(nchunks);
@@ -442,7 +445,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req, struct rpc_rqst *rqst,
__be32 *iptr, enum rpcrdma_chunktype wtype)
{
- struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
int n, nsegs, nchunks;
__be32 *segcount;
@@ -451,8 +455,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
return iptr;
}
- nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
- RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ seg = req->rl_segments;
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -461,26 +465,25 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
nchunks = 0;
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
- if (n <= 0)
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ true, &mw);
+ if (n < 0)
return ERR_PTR(n);
+ list_add(&mw->mw_list, &req->rl_registered);
- iptr = xdr_encode_rdma_segment(iptr, seg);
+ iptr = xdr_encode_rdma_segment(iptr, mw);
- dprintk("RPC: %5u %s: reply segment "
- "%d@0x%016llx:0x%08x (%s)\n",
+ dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__,
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, n < nsegs ? "more" : "last");
+ mw->mw_length, (unsigned long long)mw->mw_offset,
+ mw->mw_handle, n < nsegs ? "more" : "last");
r_xprt->rx_stats.reply_chunk_count++;
r_xprt->rx_stats.total_rdma_request += seg->mr_len;
- req->rl_nchunks++;
nchunks++;
seg += n;
nsegs -= n;
} while (nsegs);
- req->rl_nextseg = seg;
/* Update count of segments in the Reply chunk */
*segcount = cpu_to_be32(nchunks);
@@ -567,6 +570,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
+ bool ddp_allowed;
ssize_t hdrlen;
size_t rpclen;
__be32 *iptr;
@@ -583,6 +587,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
headerp->rm_type = rdma_msg;
+ /* When the ULP employs a GSS flavor that guarantees integrity
+ * or privacy, direct data placement of individual data items
+ * is not allowed.
+ */
+ ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
+ RPCAUTH_AUTH_DATATOUCH);
+
/*
* Chunks needed for results?
*
@@ -594,7 +605,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*/
if (rpcrdma_results_inline(r_xprt, rqst))
wtype = rpcrdma_noch;
- else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ)
wtype = rpcrdma_writech;
else
wtype = rpcrdma_replych;
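
The decision above reads as a small, ordered test: results that fit go inline; otherwise direct data placement (a Write chunk) is used only when the auth flavor does not touch the payload and the upper layer marked the receive buffer, with a Reply chunk as the fallback. A sketch of that selection with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

enum chunktype { NOCH, WRITECH, REPLYCH };

static enum chunktype choose_wtype(bool results_inline, bool ddp_allowed,
				   bool rcv_buf_marked_read)
{
	if (results_inline)
		return NOCH;			/* rpcrdma_noch */
	if (ddp_allowed && rcv_buf_marked_read)
		return WRITECH;			/* rpcrdma_writech */
	return REPLYCH;				/* rpcrdma_replych */
}

int main(void)
{
	printf("%d\n", choose_wtype(true, true, true));		/* NOCH */
	printf("%d\n", choose_wtype(false, true, true));	/* WRITECH */
	printf("%d\n", choose_wtype(false, false, true));	/* REPLYCH */
	return 0;
}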
@@ -617,7 +628,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
rtype = rpcrdma_noch;
rpcrdma_inline_pullup(rqst);
rpclen = rqst->rq_svec[0].iov_len;
- } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
+ } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
rpclen = rqst->rq_svec[0].iov_len;
rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
@@ -650,8 +661,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* send a Call message with a Position Zero Read chunk and a
* regular Read chunk at the same time.
*/
- req->rl_nchunks = 0;
- req->rl_nextseg = req->rl_segments;
iptr = headerp->rm_body.rm_chunks;
iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
if (IS_ERR(iptr))
@@ -690,10 +699,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
out_overflow:
pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
- /* Terminate this RPC. Chunks registered above will be
- * released by xprt_release -> xprt_rmda_free .
- */
- return -EIO;
+ iptr = ERR_PTR(-EIO);
out_unmap:
r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
@@ -705,15 +711,13 @@ out_unmap:
* RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
*/
static int
-rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp)
+rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
{
unsigned int i, total_len;
struct rpcrdma_write_chunk *cur_wchunk;
char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
i = be32_to_cpu(**iptrp);
- if (i > max)
- return -1;
cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
total_len = 0;
while (i--) {
@@ -744,45 +748,66 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
return total_len;
}
-/*
- * Scatter inline received data back into provided iov's.
+/**
+ * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
+ * @rqst: controlling RPC request
+ * @srcp: points to RPC message payload in receive buffer
+ * @copy_len: remaining length of receive buffer content
+ * @pad: Write chunk pad bytes needed (zero for pure inline)
+ *
+ * The upper layer has set the maximum number of bytes it can
+ * receive in each component of rq_rcv_buf. These values are set in
+ * the head.iov_len, page_len, tail.iov_len, and buflen fields.
+ *
+ * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
+ * many cases this function simply updates iov_base pointers in
+ * rq_rcv_buf to point directly to the received reply data, to
+ * avoid copying reply data.
+ *
+ * Returns the count of bytes which had to be memcopied.
*/
-static void
+static unsigned long
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
{
- int i, npages, curlen, olen;
+ unsigned long fixup_copy_count;
+ int i, npages, curlen;
char *destp;
struct page **ppages;
int page_base;
+ /* The head iovec is redirected to the RPC reply message
+ * in the receive buffer, to avoid a memcopy.
+ */
+ rqst->rq_rcv_buf.head[0].iov_base = srcp;
+ rqst->rq_private_buf.head[0].iov_base = srcp;
+
+ /* The contents of the receive buffer that follow
+ * head.iov_len bytes are copied into the page list.
+ */
curlen = rqst->rq_rcv_buf.head[0].iov_len;
- if (curlen > copy_len) { /* write chunk header fixup */
+ if (curlen > copy_len)
curlen = copy_len;
- rqst->rq_rcv_buf.head[0].iov_len = curlen;
- }
-
dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n",
__func__, srcp, copy_len, curlen);
-
- /* Shift pointer for first receive segment only */
- rqst->rq_rcv_buf.head[0].iov_base = srcp;
srcp += curlen;
copy_len -= curlen;
- olen = copy_len;
- i = 0;
- rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
page_base = rqst->rq_rcv_buf.page_base;
ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
page_base &= ~PAGE_MASK;
-
+ fixup_copy_count = 0;
if (copy_len && rqst->rq_rcv_buf.page_len) {
- npages = PAGE_ALIGN(page_base +
- rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
- for (; i < npages; i++) {
+ int pagelist_len;
+
+ pagelist_len = rqst->rq_rcv_buf.page_len;
+ if (pagelist_len > copy_len)
+ pagelist_len = copy_len;
+ npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
+ for (i = 0; i < npages; i++) {
curlen = PAGE_SIZE - page_base;
- if (curlen > copy_len)
- curlen = copy_len;
+ if (curlen > pagelist_len)
+ curlen = pagelist_len;
+
dprintk("RPC: %s: page %d"
" srcp 0x%p len %d curlen %d\n",
__func__, i, srcp, copy_len, curlen);
@@ -792,39 +817,32 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
kunmap_atomic(destp);
srcp += curlen;
copy_len -= curlen;
- if (copy_len == 0)
+ fixup_copy_count += curlen;
+ pagelist_len -= curlen;
+ if (!pagelist_len)
break;
page_base = 0;
}
- }
- if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
- curlen = copy_len;
- if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
- curlen = rqst->rq_rcv_buf.tail[0].iov_len;
- if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
- memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
- dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n",
- __func__, srcp, copy_len, curlen);
- rqst->rq_rcv_buf.tail[0].iov_len = curlen;
- copy_len -= curlen; ++i;
- } else
- rqst->rq_rcv_buf.tail[0].iov_len = 0;
-
- if (pad) {
- /* implicit padding on terminal chunk */
- unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
- while (pad--)
- p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
+ /* Implicit padding for the last segment in a Write
+ * chunk is inserted inline at the front of the tail
+ * iovec. The upper layer ignores the content of
+ * the pad. Simply ensure inline content in the tail
+ * that follows the Write chunk is properly aligned.
+ */
+ if (pad)
+ srcp -= pad;
}
- if (copy_len)
- dprintk("RPC: %s: %d bytes in"
- " %d extra segments (%d lost)\n",
- __func__, olen, i, copy_len);
+ /* The tail iovec is redirected to the remaining data
+ * in the receive buffer, to avoid a memcopy.
+ */
+ if (copy_len || pad) {
+ rqst->rq_rcv_buf.tail[0].iov_base = srcp;
+ rqst->rq_private_buf.tail[0].iov_base = srcp;
+ }
- /* TBD avoid a warning from call_decode() */
- rqst->rq_private_buf = rqst->rq_rcv_buf;
+ return fixup_copy_count;
}
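
The rewritten rpcrdma_inline_fixup() avoids copying wherever it can: the head and tail iovecs are simply pointed into the receive buffer, and only the page-list portion is memcpy'd, with the copied byte count returned for the fixup_copy_count statistic. A simplified userspace sketch of that strategy, with a hypothetical flat receive-buffer layout:

#include <stdio.h>
#include <string.h>

struct iov_stub { char *base; size_t len; };

struct rcvbuf_stub {
	struct iov_stub head;
	char pages[64];		/* stand-in for the page list */
	size_t page_len;
	struct iov_stub tail;
};

static size_t inline_fixup(struct rcvbuf_stub *buf, char *src, size_t len)
{
	size_t copied = 0, n;

	buf->head.base = src;			/* head: no copy, just point */
	n = len < buf->head.len ? len : buf->head.len;
	src += n;
	len -= n;

	n = len < buf->page_len ? len : buf->page_len;
	memcpy(buf->pages, src, n);		/* only the page list is copied */
	copied += n;
	src += n;
	len -= n;

	buf->tail.base = src;			/* tail: point at the remainder */
	buf->tail.len = len;
	return copied;
}

int main(void)
{
	static char reply[200] = "header....payload....trailer";
	struct rcvbuf_stub buf = { .head.len = 10, .page_len = 20 };

	printf("copied %zu bytes\n", inline_fixup(&buf, reply, sizeof(reply)));
	return 0;
}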
void
@@ -960,14 +978,13 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
(headerp->rm_body.rm_chunks[1] == xdr_zero &&
headerp->rm_body.rm_chunks[2] != xdr_zero) ||
(headerp->rm_body.rm_chunks[1] != xdr_zero &&
- req->rl_nchunks == 0))
+ list_empty(&req->rl_registered)))
goto badheader;
if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
/* count any expected write chunks in read reply */
/* start at write chunk array count */
iptr = &headerp->rm_body.rm_chunks[2];
- rdmalen = rpcrdma_count_chunks(rep,
- req->rl_nchunks, 1, &iptr);
+ rdmalen = rpcrdma_count_chunks(rep, 1, &iptr);
/* check for validity, and no reply chunk after */
if (rdmalen < 0 || *iptr++ != xdr_zero)
goto badheader;
@@ -988,8 +1005,10 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
rep->rr_len -= RPCRDMA_HDRLEN_MIN;
status = rep->rr_len;
}
- /* Fix up the rpc results for upper layer */
- rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
+
+ r_xprt->rx_stats.fixup_copy_count +=
+ rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len,
+ rdmalen);
break;
case rdma_nomsg:
@@ -997,11 +1016,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
headerp->rm_body.rm_chunks[1] != xdr_zero ||
headerp->rm_body.rm_chunks[2] != xdr_one ||
- req->rl_nchunks == 0)
+ list_empty(&req->rl_registered))
goto badheader;
iptr = (__be32 *)((unsigned char *)headerp +
RPCRDMA_HDRLEN_MIN);
- rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
+ rdmalen = rpcrdma_count_chunks(rep, 0, &iptr);
if (rdmalen < 0)
goto badheader;
r_xprt->rx_stats.total_rdma_reply += rdmalen;
@@ -1014,14 +1033,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
badheader:
default:
- dprintk("%s: invalid rpcrdma reply header (type %d):"
- " chunks[012] == %d %d %d"
- " expected chunks <= %d\n",
- __func__, be32_to_cpu(headerp->rm_type),
- headerp->rm_body.rm_chunks[0],
- headerp->rm_body.rm_chunks[1],
- headerp->rm_body.rm_chunks[2],
- req->rl_nchunks);
+ dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
+ rqst->rq_task->tk_pid, __func__,
+ be32_to_cpu(headerp->rm_type));
status = -EIO;
r_xprt->rx_stats.bad_reply_count++;
break;
@@ -1035,7 +1049,7 @@ out:
* control: waking the next RPC waits until this RPC has
* relinquished all its Send Queue entries.
*/
- if (req->rl_nchunks)
+ if (!list_empty(&req->rl_registered))
r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);
spin_lock_bh(&xprt->transport_lock);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 99d2e5b72726..81f0e879f019 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -558,7 +558,6 @@ out_sendbuf:
out_fail:
rpcrdma_buffer_put(req);
- r_xprt->rx_stats.failed_marshal_count++;
return NULL;
}
@@ -590,8 +589,19 @@ xprt_rdma_free(void *buffer)
rpcrdma_buffer_put(req);
}
-/*
+/**
+ * xprt_rdma_send_request - marshal and send an RPC request
+ * @task: RPC task with an RPC message in rq_snd_buf
+ *
+ * Return values:
+ * 0: The request has been sent
+ * ENOTCONN: Caller needs to invoke connect logic then call again
+ * ENOBUFS: Call again later to send the request
+ * EIO: A permanent error occurred. The request was not sent,
+ * and don't try it again
+ *
* send_request invokes the meat of RPC RDMA. It must do the following:
+ *
* 1. Marshal the RPC request into an RPC RDMA request, which means
* putting a header in front of data, and creating IOVs for RDMA
* from those in the request.
@@ -600,7 +610,6 @@ xprt_rdma_free(void *buffer)
* the request (rpcrdma_ep_post).
* 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP).
*/
-
static int
xprt_rdma_send_request(struct rpc_task *task)
{
@@ -610,6 +619,9 @@ xprt_rdma_send_request(struct rpc_task *task)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
int rc = 0;
+ /* On retransmit, remove any previously registered chunks */
+ r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+
rc = rpcrdma_marshal_req(rqst);
if (rc < 0)
goto failed_marshal;
@@ -630,11 +642,12 @@ xprt_rdma_send_request(struct rpc_task *task)
return 0;
failed_marshal:
- r_xprt->rx_stats.failed_marshal_count++;
dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
__func__, rc);
if (rc == -EIO)
- return -EIO;
+ r_xprt->rx_stats.failed_marshal_count++;
+ if (rc != -ENOTCONN)
+ return rc;
drop_connection:
xprt_disconnect_done(xprt);
return -ENOTCONN; /* implies disconnect */
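
Editorial note: the return-value contract documented above (0, -ENOTCONN, -ENOBUFS, -EIO) can be summarized by a small, hypothetical caller sketch. This is only an illustration of the contract, not the RPC client's actual dispatch code, and the printed actions are invented for the example.

#include <errno.h>
#include <stdio.h>

/* Minimal sketch: act on the send_request() return codes described above. */
static void handle_send_result(int rc)
{
    switch (rc) {
    case 0:
        printf("request sent; wait for the reply\n");
        break;
    case -ENOTCONN:
        printf("connect first, then call send again\n");
        break;
    case -ENOBUFS:
        printf("transient shortage; call again later\n");
        break;
    case -EIO:
        printf("permanent error; do not retry\n");
        break;
    default:
        printf("unexpected return code %d\n", rc);
        break;
    }
}

int main(void)
{
    handle_send_result(0);
    handle_send_result(-ENOTCONN);
    handle_send_result(-ENOBUFS);
    handle_send_result(-EIO);
    return 0;
}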
@@ -660,7 +673,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
xprt->stat.bad_xids,
xprt->stat.req_u,
xprt->stat.bklog_u);
- seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n",
+ seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ",
r_xprt->rx_stats.read_chunk_count,
r_xprt->rx_stats.write_chunk_count,
r_xprt->rx_stats.reply_chunk_count,
@@ -672,6 +685,10 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
r_xprt->rx_stats.failed_marshal_count,
r_xprt->rx_stats.bad_reply_count,
r_xprt->rx_stats.nomsg_call_count);
+ seq_printf(seq, "%lu %lu %lu\n",
+ r_xprt->rx_stats.mrs_recovered,
+ r_xprt->rx_stats.mrs_orphaned,
+ r_xprt->rx_stats.mrs_allocated);
}
static int
@@ -741,7 +758,6 @@ void xprt_rdma_cleanup(void)
__func__, rc);
rpcrdma_destroy_wq();
- frwr_destroy_recovery_wq();
rc = xprt_unregister_transport(&xprt_rdma_bc);
if (rc)
@@ -753,20 +769,13 @@ int xprt_rdma_init(void)
{
int rc;
- rc = frwr_alloc_recovery_wq();
- if (rc)
- return rc;
-
rc = rpcrdma_alloc_wq();
- if (rc) {
- frwr_destroy_recovery_wq();
+ if (rc)
return rc;
- }
rc = xprt_register_transport(&xprt_rdma);
if (rc) {
rpcrdma_destroy_wq();
- frwr_destroy_recovery_wq();
return rc;
}
@@ -774,7 +783,6 @@ int xprt_rdma_init(void)
if (rc) {
xprt_unregister_transport(&xprt_rdma);
rpcrdma_destroy_wq();
- frwr_destroy_recovery_wq();
return rc;
}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index b044d98a1370..536d0be3f61b 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -379,8 +379,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
struct rpcrdma_ia *ia = &xprt->rx_ia;
int rc;
- ia->ri_dma_mr = NULL;
-
ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
if (IS_ERR(ia->ri_id)) {
rc = PTR_ERR(ia->ri_id);
@@ -391,47 +389,29 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
ia->ri_pd = ib_alloc_pd(ia->ri_device);
if (IS_ERR(ia->ri_pd)) {
rc = PTR_ERR(ia->ri_pd);
- dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
- __func__, rc);
+ pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
goto out2;
}
- if (memreg == RPCRDMA_FRMR) {
- if (!(ia->ri_device->attrs.device_cap_flags &
- IB_DEVICE_MEM_MGT_EXTENSIONS) ||
- (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) {
- dprintk("RPC: %s: FRMR registration "
- "not supported by HCA\n", __func__);
- memreg = RPCRDMA_MTHCAFMR;
- }
- }
- if (memreg == RPCRDMA_MTHCAFMR) {
- if (!ia->ri_device->alloc_fmr) {
- dprintk("RPC: %s: MTHCAFMR registration "
- "not supported by HCA\n", __func__);
- rc = -EINVAL;
- goto out3;
- }
- }
-
switch (memreg) {
case RPCRDMA_FRMR:
- ia->ri_ops = &rpcrdma_frwr_memreg_ops;
- break;
- case RPCRDMA_ALLPHYSICAL:
- ia->ri_ops = &rpcrdma_physical_memreg_ops;
- break;
+ if (frwr_is_supported(ia)) {
+ ia->ri_ops = &rpcrdma_frwr_memreg_ops;
+ break;
+ }
+ /*FALLTHROUGH*/
case RPCRDMA_MTHCAFMR:
- ia->ri_ops = &rpcrdma_fmr_memreg_ops;
- break;
+ if (fmr_is_supported(ia)) {
+ ia->ri_ops = &rpcrdma_fmr_memreg_ops;
+ break;
+ }
+ /*FALLTHROUGH*/
default:
- printk(KERN_ERR "RPC: Unsupported memory "
- "registration mode: %d\n", memreg);
- rc = -ENOMEM;
+ pr_err("rpcrdma: Unsupported memory registration mode: %d\n",
+ memreg);
+ rc = -EINVAL;
goto out3;
}
- dprintk("RPC: %s: memory registration strategy is '%s'\n",
- __func__, ia->ri_ops->ro_displayname);
return 0;
@@ -585,8 +565,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
out2:
ib_free_cq(sendcq);
out1:
- if (ia->ri_dma_mr)
- ib_dereg_mr(ia->ri_dma_mr);
return rc;
}
@@ -600,8 +578,6 @@ out1:
void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
- int rc;
-
dprintk("RPC: %s: entering, connected is %d\n",
__func__, ep->rep_connected);
@@ -615,12 +591,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ib_free_cq(ep->rep_attr.recv_cq);
ib_free_cq(ep->rep_attr.send_cq);
-
- if (ia->ri_dma_mr) {
- rc = ib_dereg_mr(ia->ri_dma_mr);
- dprintk("RPC: %s: ib_dereg_mr returned %i\n",
- __func__, rc);
- }
}
/*
@@ -777,6 +747,90 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ib_drain_qp(ia->ri_id->qp);
}
+static void
+rpcrdma_mr_recovery_worker(struct work_struct *work)
+{
+ struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
+ rb_recovery_worker.work);
+ struct rpcrdma_mw *mw;
+
+ spin_lock(&buf->rb_recovery_lock);
+ while (!list_empty(&buf->rb_stale_mrs)) {
+ mw = list_first_entry(&buf->rb_stale_mrs,
+ struct rpcrdma_mw, mw_list);
+ list_del_init(&mw->mw_list);
+ spin_unlock(&buf->rb_recovery_lock);
+
+ dprintk("RPC: %s: recovering MR %p\n", __func__, mw);
+ mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw);
+
+ spin_lock(&buf->rb_recovery_lock);
+ }
+ spin_unlock(&buf->rb_recovery_lock);
+}
+
+void
+rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
+{
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+
+ spin_lock(&buf->rb_recovery_lock);
+ list_add(&mw->mw_list, &buf->rb_stale_mrs);
+ spin_unlock(&buf->rb_recovery_lock);
+
+ schedule_delayed_work(&buf->rb_recovery_worker, 0);
+}
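
Editorial note: the two functions above follow a common deferral pattern: producers park broken MRs on a spinlock-protected list and kick a worker, which drains the list while dropping the lock around each (potentially slow) recovery call. The userspace sketch below shows only that pattern, assuming pthreads; the names (stale_list, recover_one, defer_recovery) are invented and this is not the kernel code.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
    int id;
    struct item *next;
};

static pthread_mutex_t stale_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *stale_list;        /* protected by stale_lock */

/* Stands in for ro_recover_mr(); may block, so it is called unlocked. */
static void recover_one(struct item *it)
{
    printf("recovering item %d\n", it->id);
    free(it);
}

/* Producer side, analogous to rpcrdma_defer_mr_recovery(). */
static void defer_recovery(int id)
{
    struct item *it = malloc(sizeof(*it));

    if (!it)
        return;
    it->id = id;
    pthread_mutex_lock(&stale_lock);
    it->next = stale_list;
    stale_list = it;
    pthread_mutex_unlock(&stale_lock);
    /* a real implementation would schedule the worker here */
}

/* Worker side: drain the list, dropping the lock around the slow call. */
static void recovery_worker(void)
{
    pthread_mutex_lock(&stale_lock);
    while (stale_list) {
        struct item *it = stale_list;

        stale_list = it->next;
        pthread_mutex_unlock(&stale_lock);
        recover_one(it);
        pthread_mutex_lock(&stale_lock);
    }
    pthread_mutex_unlock(&stale_lock);
}

int main(void)
{
    defer_recovery(1);
    defer_recovery(2);
    recovery_worker();
    return 0;
}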
+
+static void
+rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ unsigned int count;
+ LIST_HEAD(free);
+ LIST_HEAD(all);
+
+ for (count = 0; count < 32; count++) {
+ struct rpcrdma_mw *mw;
+ int rc;
+
+ mw = kzalloc(sizeof(*mw), GFP_KERNEL);
+ if (!mw)
+ break;
+
+ rc = ia->ri_ops->ro_init_mr(ia, mw);
+ if (rc) {
+ kfree(mw);
+ break;
+ }
+
+ mw->mw_xprt = r_xprt;
+
+ list_add(&mw->mw_list, &free);
+ list_add(&mw->mw_all, &all);
+ }
+
+ spin_lock(&buf->rb_mwlock);
+ list_splice(&free, &buf->rb_mws);
+ list_splice(&all, &buf->rb_all);
+ r_xprt->rx_stats.mrs_allocated += count;
+ spin_unlock(&buf->rb_mwlock);
+
+ dprintk("RPC: %s: created %u MRs\n", __func__, count);
+}
+
+static void
+rpcrdma_mr_refresh_worker(struct work_struct *work)
+{
+ struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
+ rb_refresh_worker.work);
+ struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
+ rx_buf);
+
+ rpcrdma_create_mrs(r_xprt);
+}
+
struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
@@ -793,6 +847,7 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
spin_unlock(&buffer->rb_reqslock);
req->rl_cqe.done = rpcrdma_wc_send;
req->rl_buffer = &r_xprt->rx_buf;
+ INIT_LIST_HEAD(&req->rl_registered);
return req;
}
@@ -832,17 +887,23 @@ int
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
int i, rc;
buf->rb_max_requests = r_xprt->rx_data.max_requests;
buf->rb_bc_srv_max_requests = 0;
- spin_lock_init(&buf->rb_lock);
atomic_set(&buf->rb_credits, 1);
+ spin_lock_init(&buf->rb_mwlock);
+ spin_lock_init(&buf->rb_lock);
+ spin_lock_init(&buf->rb_recovery_lock);
+ INIT_LIST_HEAD(&buf->rb_mws);
+ INIT_LIST_HEAD(&buf->rb_all);
+ INIT_LIST_HEAD(&buf->rb_stale_mrs);
+ INIT_DELAYED_WORK(&buf->rb_refresh_worker,
+ rpcrdma_mr_refresh_worker);
+ INIT_DELAYED_WORK(&buf->rb_recovery_worker,
+ rpcrdma_mr_recovery_worker);
- rc = ia->ri_ops->ro_init(r_xprt);
- if (rc)
- goto out;
+ rpcrdma_create_mrs(r_xprt);
INIT_LIST_HEAD(&buf->rb_send_bufs);
INIT_LIST_HEAD(&buf->rb_allreqs);
@@ -862,7 +923,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
}
INIT_LIST_HEAD(&buf->rb_recv_bufs);
- for (i = 0; i < buf->rb_max_requests + 2; i++) {
+ for (i = 0; i < buf->rb_max_requests; i++) {
struct rpcrdma_rep *rep;
rep = rpcrdma_create_rep(r_xprt);
@@ -918,11 +979,39 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
kfree(req);
}
+static void
+rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
+ rx_buf);
+ struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+ struct rpcrdma_mw *mw;
+ unsigned int count;
+
+ count = 0;
+ spin_lock(&buf->rb_mwlock);
+ while (!list_empty(&buf->rb_all)) {
+ mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+ list_del(&mw->mw_all);
+
+ spin_unlock(&buf->rb_mwlock);
+ ia->ri_ops->ro_release_mr(mw);
+ count++;
+ spin_lock(&buf->rb_mwlock);
+ }
+ spin_unlock(&buf->rb_mwlock);
+ r_xprt->rx_stats.mrs_allocated = 0;
+
+ dprintk("RPC: %s: released %u MRs\n", __func__, count);
+}
+
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+ cancel_delayed_work_sync(&buf->rb_recovery_worker);
+
while (!list_empty(&buf->rb_recv_bufs)) {
struct rpcrdma_rep *rep;
@@ -944,7 +1033,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
}
spin_unlock(&buf->rb_reqslock);
- ia->ri_ops->ro_destroy(buf);
+ rpcrdma_destroy_mrs(buf);
}
struct rpcrdma_mw *
@@ -962,8 +1051,17 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
spin_unlock(&buf->rb_mwlock);
if (!mw)
- pr_err("RPC: %s: no MWs available\n", __func__);
+ goto out_nomws;
return mw;
+
+out_nomws:
+ dprintk("RPC: %s: no MWs available\n", __func__);
+ schedule_delayed_work(&buf->rb_refresh_worker, 0);
+
+ /* Allow the reply handler and refresh worker to run */
+ cond_resched();
+
+ return NULL;
}
void
@@ -978,8 +1076,6 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
/*
* Get a set of request/reply buffers.
- *
- * Reply buffer (if available) is attached to send buffer upon return.
*/
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
@@ -998,13 +1094,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
out_reqbuf:
spin_unlock(&buffers->rb_lock);
- pr_warn("RPC: %s: out of request buffers\n", __func__);
+ pr_warn("rpcrdma: out of request buffers (%p)\n", buffers);
return NULL;
out_repbuf:
+ list_add(&req->rl_free, &buffers->rb_send_bufs);
spin_unlock(&buffers->rb_lock);
- pr_warn("RPC: %s: out of reply buffers\n", __func__);
- req->rl_reply = NULL;
- return req;
+ pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers);
+ return NULL;
}
/*
@@ -1060,14 +1156,6 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
* Wrappers for internal-use kmalloc memory registration, used by buffer code.
*/
-void
-rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
-{
- dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
- seg->mr_offset,
- (unsigned long long)seg->mr_dma, seg->mr_dmalen);
-}
-
/**
* rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
* @ia: controlling rpcrdma_ia
@@ -1150,7 +1238,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
if (rep) {
rc = rpcrdma_ep_post_recv(ia, ep, rep);
if (rc)
- goto out;
+ return rc;
req->rl_reply = NULL;
}
@@ -1175,10 +1263,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
if (rc)
- dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
- rc);
-out:
- return rc;
+ goto out_postsend_err;
+ return 0;
+
+out_postsend_err:
+ pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc);
+ return -ENOTCONN;
}
/*
@@ -1203,11 +1293,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
DMA_BIDIRECTIONAL);
rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
-
if (rc)
- dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
- rc);
- return rc;
+ goto out_postrecv;
+ return 0;
+
+out_postrecv:
+ pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
+ return -ENOTCONN;
}
/**
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 95cdc66225ee..670fad57153a 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -68,7 +68,6 @@ struct rpcrdma_ia {
struct ib_device *ri_device;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
- struct ib_mr *ri_dma_mr;
struct completion ri_done;
int ri_async_rc;
unsigned int ri_max_frmr_depth;
@@ -172,23 +171,14 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
* o recv buffer (posted to provider)
* o ib_sge (also donated to provider)
* o status of reply (length, success or not)
- * o bookkeeping state to get run by tasklet (list, etc)
+ * o bookkeeping state to get run by reply handler (list, etc)
*
- * These are allocated during initialization, per-transport instance;
- * however, the tasklet execution list itself is global, as it should
- * always be pretty short.
+ * These are allocated during initialization, per-transport instance.
*
* N of these are associated with a transport instance, and stored in
* struct rpcrdma_buffer. N is the max number of outstanding requests.
*/
-#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
-
-/* data segments + head/tail for Call + head/tail for Reply */
-#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4)
-
-struct rpcrdma_buffer;
-
struct rpcrdma_rep {
struct ib_cqe rr_cqe;
unsigned int rr_len;
@@ -221,9 +211,6 @@ enum rpcrdma_frmr_state {
};
struct rpcrdma_frmr {
- struct scatterlist *fr_sg;
- int fr_nents;
- enum dma_data_direction fr_dir;
struct ib_mr *fr_mr;
struct ib_cqe fr_cqe;
enum rpcrdma_frmr_state fr_state;
@@ -235,18 +222,23 @@ struct rpcrdma_frmr {
};
struct rpcrdma_fmr {
- struct ib_fmr *fmr;
- u64 *physaddrs;
+ struct ib_fmr *fm_mr;
+ u64 *fm_physaddrs;
};
struct rpcrdma_mw {
+ struct list_head mw_list;
+ struct scatterlist *mw_sg;
+ int mw_nents;
+ enum dma_data_direction mw_dir;
union {
struct rpcrdma_fmr fmr;
struct rpcrdma_frmr frmr;
};
- struct work_struct mw_work;
struct rpcrdma_xprt *mw_xprt;
- struct list_head mw_list;
+ u32 mw_handle;
+ u32 mw_length;
+ u64 mw_offset;
struct list_head mw_all;
};
@@ -266,33 +258,30 @@ struct rpcrdma_mw {
* of iovs for send operations. The reason is that the iovs passed to
* ib_post_{send,recv} must not be modified until the work request
* completes.
- *
- * NOTES:
- * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we
- * marshal. The number needed varies depending on the iov lists that
- * are passed to us, the memory registration mode we are in, and if
- * physical addressing is used, the layout.
*/
+/* Maximum number of page-sized "segments" per chunk list to be
+ * registered or invalidated. Must handle a Reply chunk:
+ */
+enum {
+ RPCRDMA_MAX_IOV_SEGS = 3,
+ RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1,
+ RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS +
+ RPCRDMA_MAX_IOV_SEGS,
+};
+
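
Editorial note: as a worked example of the enum above, with a 4 KiB page size RPCRDMA_MAX_DATA_SEGS is (1 MiB / 4 KiB) + 1 = 257, and adding the three head/tail iovec segments gives RPCRDMA_MAX_SEGS = 260. The sketch below just reproduces that arithmetic under an assumed page size of 4096; it is not kernel code.

#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096        /* assumption for this example */
#define MAX_IOV_SEGS      3
#define MAX_DATA_SEGS     (((1 * 1024 * 1024) / EXAMPLE_PAGE_SIZE) + 1)
#define MAX_SEGS          (MAX_DATA_SEGS + MAX_IOV_SEGS)

int main(void)
{
    /* 1 MiB of payload in page-sized segments, plus one extra segment
     * and three head/tail iovec segments.
     */
    printf("data segs = %d, total segs = %d\n", MAX_DATA_SEGS, MAX_SEGS);
    return 0;
}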
struct rpcrdma_mr_seg { /* chunk descriptors */
- struct rpcrdma_mw *rl_mw; /* registered MR */
- u64 mr_base; /* registration result */
- u32 mr_rkey; /* registration result */
u32 mr_len; /* length of chunk or segment */
- int mr_nsegs; /* number of segments in chunk or 0 */
- enum dma_data_direction mr_dir; /* segment mapping direction */
- dma_addr_t mr_dma; /* segment mapping address */
- size_t mr_dmalen; /* segment mapping length */
struct page *mr_page; /* owning page, if any */
char *mr_offset; /* kva if no page, else offset */
};
#define RPCRDMA_MAX_IOVS (2)
+struct rpcrdma_buffer;
struct rpcrdma_req {
struct list_head rl_free;
unsigned int rl_niovs;
- unsigned int rl_nchunks;
unsigned int rl_connect_cookie;
struct rpc_task *rl_task;
struct rpcrdma_buffer *rl_buffer;
@@ -300,12 +289,13 @@ struct rpcrdma_req {
struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
struct rpcrdma_regbuf *rl_rdmabuf;
struct rpcrdma_regbuf *rl_sendbuf;
- struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
- struct rpcrdma_mr_seg *rl_nextseg;
struct ib_cqe rl_cqe;
struct list_head rl_all;
bool rl_backchannel;
+
+ struct list_head rl_registered; /* registered segments */
+ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
};
static inline struct rpcrdma_req *
@@ -341,6 +331,11 @@ struct rpcrdma_buffer {
struct list_head rb_allreqs;
u32 rb_bc_max_requests;
+
+ spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */
+ struct list_head rb_stale_mrs;
+ struct delayed_work rb_recovery_worker;
+ struct delayed_work rb_refresh_worker;
};
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
@@ -387,6 +382,9 @@ struct rpcrdma_stats {
unsigned long bad_reply_count;
unsigned long nomsg_call_count;
unsigned long bcall_count;
+ unsigned long mrs_recovered;
+ unsigned long mrs_orphaned;
+ unsigned long mrs_allocated;
};
/*
@@ -395,23 +393,25 @@ struct rpcrdma_stats {
struct rpcrdma_xprt;
struct rpcrdma_memreg_ops {
int (*ro_map)(struct rpcrdma_xprt *,
- struct rpcrdma_mr_seg *, int, bool);
+ struct rpcrdma_mr_seg *, int, bool,
+ struct rpcrdma_mw **);
void (*ro_unmap_sync)(struct rpcrdma_xprt *,
struct rpcrdma_req *);
void (*ro_unmap_safe)(struct rpcrdma_xprt *,
struct rpcrdma_req *, bool);
+ void (*ro_recover_mr)(struct rpcrdma_mw *);
int (*ro_open)(struct rpcrdma_ia *,
struct rpcrdma_ep *,
struct rpcrdma_create_data_internal *);
size_t (*ro_maxpages)(struct rpcrdma_xprt *);
- int (*ro_init)(struct rpcrdma_xprt *);
- void (*ro_destroy)(struct rpcrdma_buffer *);
+ int (*ro_init_mr)(struct rpcrdma_ia *,
+ struct rpcrdma_mw *);
+ void (*ro_release_mr)(struct rpcrdma_mw *);
const char *ro_displayname;
};
extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
-extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
/*
* RPCRDMA transport -- encapsulates the structures above for
@@ -446,6 +446,8 @@ extern int xprt_rdma_pad_optimize;
*/
int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
void rpcrdma_ia_close(struct rpcrdma_ia *);
+bool frwr_is_supported(struct rpcrdma_ia *);
+bool fmr_is_supported(struct rpcrdma_ia *);
/*
* Endpoint calls - xprtrdma/verbs.c
@@ -477,6 +479,8 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
+void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
+
struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
size_t, gfp_t);
void rpcrdma_free_regbuf(struct rpcrdma_ia *,
@@ -484,9 +488,6 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
-int frwr_alloc_recovery_wq(void);
-void frwr_destroy_recovery_wq(void);
-
int rpcrdma_alloc_wq(void);
void rpcrdma_destroy_wq(void);
@@ -494,45 +495,12 @@ void rpcrdma_destroy_wq(void);
* Wrappers for chunk registration, shared by read/write chunk code.
*/
-void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
-
static inline enum dma_data_direction
rpcrdma_data_dir(bool writing)
{
return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
}
-static inline void
-rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
- enum dma_data_direction direction)
-{
- seg->mr_dir = direction;
- seg->mr_dmalen = seg->mr_len;
-
- if (seg->mr_page)
- seg->mr_dma = ib_dma_map_page(device,
- seg->mr_page, offset_in_page(seg->mr_offset),
- seg->mr_dmalen, seg->mr_dir);
- else
- seg->mr_dma = ib_dma_map_single(device,
- seg->mr_offset,
- seg->mr_dmalen, seg->mr_dir);
-
- if (ib_dma_mapping_error(device, seg->mr_dma))
- rpcrdma_mapping_error(seg);
-}
-
-static inline void
-rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
-{
- if (seg->mr_page)
- ib_dma_unmap_page(device,
- seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
- else
- ib_dma_unmap_single(device,
- seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
-}
-
/*
* RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
*/
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 7e2b2fa189c3..8ede3bc52481 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -124,7 +124,7 @@ static struct ctl_table xs_tunables_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &xprt_min_resvport_limit,
- .extra2 = &xprt_max_resvport_limit
+ .extra2 = &xprt_max_resvport
},
{
.procname = "max_resvport",
@@ -132,7 +132,7 @@ static struct ctl_table xs_tunables_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &xprt_min_resvport_limit,
+ .extra1 = &xprt_min_resvport,
.extra2 = &xprt_max_resvport_limit
},
{
@@ -177,7 +177,6 @@ static struct ctl_table sunrpc_table[] = {
* increase over time if the server is down or not responding.
*/
#define XS_TCP_INIT_REEST_TO (3U * HZ)
-#define XS_TCP_MAX_REEST_TO (5U * 60 * HZ)
/*
* TCP idle timeout; client drops the transport socket if it is idle
@@ -642,6 +641,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct xdr_buf *xdr = &req->rq_snd_buf;
bool zerocopy = true;
+ bool vm_wait = false;
int status;
int sent;
@@ -677,15 +677,33 @@ static int xs_tcp_send_request(struct rpc_task *task)
return 0;
}
+ WARN_ON_ONCE(sent == 0 && status == 0);
+
+ if (status == -EAGAIN) {
+ /*
+ * Return EAGAIN if we're sure we're hitting the
+ * socket send buffer limits.
+ */
+ if (test_bit(SOCK_NOSPACE, &transport->sock->flags))
+ break;
+ /*
+ * Did we hit a memory allocation failure?
+ */
+ if (sent == 0) {
+ status = -ENOBUFS;
+ if (vm_wait)
+ break;
+ /* Retry, knowing now that we're below the
+ * socket send buffer limit
+ */
+ vm_wait = true;
+ }
+ continue;
+ }
if (status < 0)
break;
- if (sent == 0) {
- status = -EAGAIN;
- break;
- }
+ vm_wait = false;
}
- if (status == -EAGAIN && sk_stream_is_writeable(transport->inet))
- status = -ENOBUFS;
switch (status) {
case -ENOTSOCK:
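
Editorial note: the new -EAGAIN handling above separates a genuinely full socket send buffer (SOCK_NOSPACE set) from a transient memory-allocation failure (nothing was sent), retrying once in the latter case before reporting -ENOBUFS. The standalone sketch below mirrors only that decision; 'nospace' stands in for the SOCK_NOSPACE test, the helper names are invented, and no real socket is involved.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum send_action { SEND_RETRY, SEND_STOP };

/* Classify one pass of a partial send, following the logic above. */
static enum send_action classify(int sent, int err, bool nospace,
                                 bool *vm_wait, int *status)
{
    if (err == -EAGAIN) {
        if (nospace) {
            *status = -EAGAIN;      /* send buffer really is full */
            return SEND_STOP;
        }
        if (sent == 0) {
            *status = -ENOBUFS;     /* likely an allocation failure */
            if (*vm_wait)
                return SEND_STOP;
            *vm_wait = true;        /* retry exactly once */
        }
        return SEND_RETRY;
    }
    if (err < 0) {
        *status = err;
        return SEND_STOP;
    }
    *vm_wait = false;               /* made progress; keep sending */
    return SEND_RETRY;
}

int main(void)
{
    bool vm_wait = false;
    int status = 0;

    if (classify(0, -EAGAIN, false, &vm_wait, &status) == SEND_RETRY)
        printf("first pass: retry, status %d\n", status);
    if (classify(0, -EAGAIN, false, &vm_wait, &status) == SEND_STOP)
        printf("second pass: give up with %d\n", status);
    return 0;
}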
@@ -755,11 +773,19 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s
sk->sk_error_report = transport->old_error_report;
}
+static void xs_sock_reset_state_flags(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+ clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
+}
+
static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
{
smp_mb__before_atomic();
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
clear_bit(XPRT_CLOSING, &xprt->state);
+ xs_sock_reset_state_flags(xprt);
smp_mb__after_atomic();
}
@@ -962,10 +988,13 @@ static void xs_local_data_receive(struct sock_xprt *transport)
goto out;
for (;;) {
skb = skb_recv_datagram(sk, 0, 1, &err);
- if (skb == NULL)
+ if (skb != NULL) {
+ xs_local_data_read_skb(&transport->xprt, sk, skb);
+ skb_free_datagram(sk, skb);
+ continue;
+ }
+ if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
break;
- xs_local_data_read_skb(&transport->xprt, sk, skb);
- skb_free_datagram(sk, skb);
}
out:
mutex_unlock(&transport->recv_mutex);
@@ -1043,10 +1072,13 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
goto out;
for (;;) {
skb = skb_recv_datagram(sk, 0, 1, &err);
- if (skb == NULL)
+ if (skb != NULL) {
+ xs_udp_data_read_skb(&transport->xprt, sk, skb);
+ skb_free_datagram(sk, skb);
+ continue;
+ }
+ if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
break;
- xs_udp_data_read_skb(&transport->xprt, sk, skb);
- skb_free_datagram(sk, skb);
}
out:
mutex_unlock(&transport->recv_mutex);
@@ -1074,7 +1106,14 @@ static void xs_data_ready(struct sock *sk)
if (xprt != NULL) {
struct sock_xprt *transport = container_of(xprt,
struct sock_xprt, xprt);
- queue_work(rpciod_workqueue, &transport->recv_worker);
+ transport->old_data_ready(sk);
+ /* Any data means we had a useful conversation, so
+ * then we don't need to delay the next reconnect
+ */
+ if (xprt->reestablish_timeout)
+ xprt->reestablish_timeout = 0;
+ if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
+ queue_work(xprtiod_workqueue, &transport->recv_worker);
}
read_unlock_bh(&sk->sk_callback_lock);
}
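
Editorial note: xs_data_ready now queues the receive worker only when it successfully sets XPRT_SOCK_DATA_READY, and the receive loops above clear and re-test that bit before giving up, so wakeups are neither duplicated nor lost. Below is a minimal userspace sketch of the same flag protocol using C11 atomics; queue_work() and the worker body are stand-ins invented for the example.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool data_ready;

/* Stand-in for queueing the transport's recv_worker. */
static void queue_work(void)
{
    printf("worker queued\n");
}

/* Called from the socket's data_ready callback. */
static void on_data_ready(void)
{
    /* Only the false -> true transition queues the worker. */
    if (!atomic_exchange(&data_ready, true))
        queue_work();
}

/* Worker body: drain, then clear the flag and re-check before exiting. */
static void recv_worker(void)
{
    for (;;) {
        /* ... read and process everything currently available ... */
        if (!atomic_exchange(&data_ready, false))
            break;      /* no new wakeup arrived while draining */
    }
}

int main(void)
{
    on_data_ready();    /* queues the worker */
    on_data_ready();    /* flag already set: no duplicate queueing */
    recv_worker();
    return 0;
}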
@@ -1474,10 +1513,15 @@ static void xs_tcp_data_receive(struct sock_xprt *transport)
for (;;) {
lock_sock(sk);
read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
- release_sock(sk);
- if (read <= 0)
- break;
- total += read;
+ if (read <= 0) {
+ clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
+ release_sock(sk);
+ if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
+ break;
+ } else {
+ release_sock(sk);
+ total += read;
+ }
rd_desc.count = 65536;
}
out:
@@ -1493,34 +1537,6 @@ static void xs_tcp_data_receive_workfn(struct work_struct *work)
}
/**
- * xs_tcp_data_ready - "data ready" callback for TCP sockets
- * @sk: socket with data to read
- *
- */
-static void xs_tcp_data_ready(struct sock *sk)
-{
- struct sock_xprt *transport;
- struct rpc_xprt *xprt;
-
- dprintk("RPC: xs_tcp_data_ready...\n");
-
- read_lock_bh(&sk->sk_callback_lock);
- if (!(xprt = xprt_from_sock(sk)))
- goto out;
- transport = container_of(xprt, struct sock_xprt, xprt);
-
- /* Any data means we had a useful conversation, so
- * the we don't need to delay the next reconnect
- */
- if (xprt->reestablish_timeout)
- xprt->reestablish_timeout = 0;
- queue_work(rpciod_workqueue, &transport->recv_worker);
-
-out:
- read_unlock_bh(&sk->sk_callback_lock);
-}
-
-/**
* xs_tcp_state_change - callback to handle TCP socket state changes
* @sk: socket whose state has changed
*
@@ -1714,7 +1730,7 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
static unsigned short xs_get_random_port(void)
{
- unsigned short range = xprt_max_resvport - xprt_min_resvport;
+ unsigned short range = xprt_max_resvport - xprt_min_resvport + 1;
unsigned short rand = (unsigned short) prandom_u32() % range;
return rand + xprt_min_resvport;
}
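
Editorial note: the "+ 1" above makes the port range inclusive; with the old arithmetic, setting min_resvport equal to max_resvport produced a range of zero (a modulo-by-zero), and max_resvport itself could never be chosen. A tiny userspace sketch of the corrected arithmetic, using rand() in place of prandom_u32() and example bounds chosen for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Example bounds used only for this illustration. */
static unsigned short min_port = 665;
static unsigned short max_port = 1023;

/* Pick a port in [min_port, max_port], both ends included. */
static unsigned short get_random_port(void)
{
    unsigned short range = max_port - min_port + 1;

    return (unsigned short)(rand() % range) + min_port;
}

int main(void)
{
    srand((unsigned int)time(NULL));
    printf("picked reserved port %u\n", get_random_port());
    return 0;
}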
@@ -2156,6 +2172,8 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
write_unlock_bh(&sk->sk_callback_lock);
}
xs_udp_do_set_buffer_size(xprt);
+
+ xprt->stat.connect_start = jiffies;
}
static void xs_udp_setup_socket(struct work_struct *work)
@@ -2219,6 +2237,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
unsigned int keepcnt = xprt->timeout->to_retries + 1;
unsigned int opt_on = 1;
unsigned int timeo;
+ unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC;
/* TCP Keepalive options */
kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
@@ -2230,6 +2249,16 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
(char *)&keepcnt, sizeof(keepcnt));
+ /* Avoid temporary addresses; they are bad for long-lived
+ * connections such as NFS mounts.
+ * RFC4941, section 3.6 suggests that:
+ * Individual applications, which have specific
+ * knowledge about the normal duration of connections,
+ * MAY override this as appropriate.
+ */
+ kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES,
+ (char *)&addr_pref, sizeof(addr_pref));
+
/* TCP user timeout (see RFC5482) */
timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
(xprt->timeout->to_retries + 1);
@@ -2241,7 +2270,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
xs_save_old_callbacks(transport, sk);
sk->sk_user_data = xprt;
- sk->sk_data_ready = xs_tcp_data_ready;
+ sk->sk_data_ready = xs_data_ready;
sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space;
sock_set_flag(sk, SOCK_FASYNC);
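
Editorial note: for comparison, the same source-address preference can be requested from userspace via the RFC 5014 socket option, assuming the toolchain's <netinet/in.h> exposes IPV6_ADDR_PREFERENCES and IPV6_PREFER_SRC_PUBLIC. This is a hedged illustration, not part of the patch.

#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int pref = IPV6_PREFER_SRC_PUBLIC;  /* avoid RFC 4941 temporary addresses */
    int fd = socket(AF_INET6, SOCK_STREAM, 0);

    if (fd < 0) {
        perror("socket");
        return 1;
    }
    if (setsockopt(fd, IPPROTO_IPV6, IPV6_ADDR_PREFERENCES,
                   &pref, sizeof(pref)) < 0)
        perror("setsockopt(IPV6_ADDR_PREFERENCES)");
    else
        printf("source-address preference set to public\n");
    close(fd);
    return 0;
}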
@@ -2278,6 +2307,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
/* SYN_SENT! */
if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+ break;
+ case -EADDRNOTAVAIL:
+ /* Source port number is unavailable. Try a new one! */
+ transport->srcport = 0;
}
out:
return ret;
@@ -2352,6 +2385,25 @@ out:
xprt_wake_pending_tasks(xprt, status);
}
+static unsigned long xs_reconnect_delay(const struct rpc_xprt *xprt)
+{
+ unsigned long start, now = jiffies;
+
+ start = xprt->stat.connect_start + xprt->reestablish_timeout;
+ if (time_after(start, now))
+ return start - now;
+ return 0;
+}
+
+static void xs_reconnect_backoff(struct rpc_xprt *xprt)
+{
+ xprt->reestablish_timeout <<= 1;
+ if (xprt->reestablish_timeout > xprt->max_reconnect_timeout)
+ xprt->reestablish_timeout = xprt->max_reconnect_timeout;
+ if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
+ xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+}
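
Editorial note: together, the two helpers above compute how long to wait (whatever remains of the current timeout since the last connect attempt) and then double the timeout for the next try, clamped between the initial value and the per-transport maximum. The standalone sketch below shows that policy with jiffies replaced by seconds and illustrative constants; it is not the transport code itself.

#include <stdio.h>
#include <time.h>

#define INIT_REEST_TO 3UL   /* seconds; stands in for XS_TCP_INIT_REEST_TO */

struct xprt_state {
    time_t connect_start;               /* when the last connect started */
    unsigned long reestablish_timeout;
    unsigned long max_reconnect_timeout;
};

/* Delay before the next attempt: remaining part of the current timeout. */
static unsigned long reconnect_delay(const struct xprt_state *x, time_t now)
{
    time_t start = x->connect_start + (time_t)x->reestablish_timeout;

    return start > now ? (unsigned long)(start - now) : 0;
}

/* Exponential backoff, clamped to [INIT_REEST_TO, max_reconnect_timeout]. */
static void reconnect_backoff(struct xprt_state *x)
{
    x->reestablish_timeout <<= 1;
    if (x->reestablish_timeout > x->max_reconnect_timeout)
        x->reestablish_timeout = x->max_reconnect_timeout;
    if (x->reestablish_timeout < INIT_REEST_TO)
        x->reestablish_timeout = INIT_REEST_TO;
}

int main(void)
{
    struct xprt_state x = {
        .connect_start = time(NULL),
        .reestablish_timeout = INIT_REEST_TO,
        .max_reconnect_timeout = 300,   /* illustrative cap */
    };
    int i;

    for (i = 0; i < 8; i++) {
        printf("delay %lus, next timeout %lus\n",
               reconnect_delay(&x, time(NULL)), x.reestablish_timeout);
        reconnect_backoff(&x);
    }
    return 0;
}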
+
/**
* xs_connect - connect a socket to a remote endpoint
* @xprt: pointer to transport structure
@@ -2369,6 +2421,7 @@ out:
static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ unsigned long delay = 0;
WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport));
@@ -2380,19 +2433,15 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
/* Start by resetting any existing state */
xs_reset_transport(transport);
- queue_delayed_work(rpciod_workqueue,
- &transport->connect_worker,
- xprt->reestablish_timeout);
- xprt->reestablish_timeout <<= 1;
- if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
- xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
- if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO)
- xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
- } else {
+ delay = xs_reconnect_delay(xprt);
+ xs_reconnect_backoff(xprt);
+
+ } else
dprintk("RPC: xs_connect scheduled xprt %p\n", xprt);
- queue_delayed_work(rpciod_workqueue,
- &transport->connect_worker, 0);
- }
+
+ queue_delayed_work(xprtiod_workqueue,
+ &transport->connect_worker,
+ delay);
}
/**
@@ -2944,6 +2993,8 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
xprt->ops = &xs_tcp_ops;
xprt->timeout = &xs_tcp_default_timeout;
+ xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
+
INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn);
INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
@@ -3153,8 +3204,12 @@ static int param_set_uint_minmax(const char *val,
static int param_set_portnr(const char *val, const struct kernel_param *kp)
{
- return param_set_uint_minmax(val, kp,
+ if (kp->arg == &xprt_min_resvport)
+ return param_set_uint_minmax(val, kp,
RPC_MIN_RESVPORT,
+ xprt_max_resvport);
+ return param_set_uint_minmax(val, kp,
+ xprt_min_resvport,
RPC_MAX_RESVPORT);
}
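
Editorial note: the change above couples the two port parameters so that the minimum can never be raised above the current maximum and the maximum can never be dropped below the current minimum, mirroring the sysctl bounds fixed earlier in this file. A minimal sketch of that coupled validation; the constants and setter names are invented for the example.

#include <stdbool.h>
#include <stdio.h>

#define MIN_RESVPORT 1
#define MAX_RESVPORT 65535

static unsigned int min_port = 665;
static unsigned int max_port = 1023;

static bool set_in_range(unsigned int *target, unsigned int val,
                         unsigned int lo, unsigned int hi)
{
    if (val < lo || val > hi)
        return false;
    *target = val;
    return true;
}

/* min may range over [MIN_RESVPORT, current max]. */
static bool set_min_port(unsigned int val)
{
    return set_in_range(&min_port, val, MIN_RESVPORT, max_port);
}

/* max may range over [current min, MAX_RESVPORT]. */
static bool set_max_port(unsigned int val)
{
    return set_in_range(&max_port, val, min_port, MAX_RESVPORT);
}

int main(void)
{
    printf("min=2000: %s\n", set_min_port(2000) ? "ok" : "rejected");
    printf("max=2048: %s\n", set_max_port(2048) ? "ok" : "rejected");
    printf("min=2000: %s\n", set_min_port(2000) ? "ok" : "rejected");
    return 0;
}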
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index b62caa1c770c..ed97a5876ebe 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -728,12 +728,13 @@ int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg,
u32 bearer_id, u32 *prev_node)
{
struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
- struct tipc_peer *peer = mon->self;
+ struct tipc_peer *peer;
if (!mon)
return -EINVAL;
read_lock_bh(&mon->lock);
+ peer = mon->self;
do {
if (*prev_node) {
if (peer->addr == *prev_node)
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index c49b8df438cb..f9f5f3c3dab5 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2180,7 +2180,8 @@ restart:
TIPC_CONN_MSG, SHORT_H_SIZE,
0, dnode, onode, dport, oport,
TIPC_CONN_SHUTDOWN);
- tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
+ if (skb)
+ tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
}
tsk->connected = 0;
sock->state = SS_DISCONNECTING;
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index b016c011970b..ae7e14cae085 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -396,10 +396,13 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
tuncfg.encap_destroy = NULL;
setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg);
- if (enable_mcast(ub, remote))
+ err = enable_mcast(ub, remote);
+ if (err)
goto err;
return 0;
err:
+ if (ub->ubsock)
+ udp_tunnel_sock_release(ub->ubsock);
kfree(ub);
return err;
}
diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig
index 14810abedc2e..8831e7c42167 100644
--- a/net/vmw_vsock/Kconfig
+++ b/net/vmw_vsock/Kconfig
@@ -26,3 +26,23 @@ config VMWARE_VMCI_VSOCKETS
To compile this driver as a module, choose M here: the module
will be called vmw_vsock_vmci_transport. If unsure, say N.
+
+config VIRTIO_VSOCKETS
+ tristate "virtio transport for Virtual Sockets"
+ depends on VSOCKETS && VIRTIO
+ select VIRTIO_VSOCKETS_COMMON
+ help
+ This module implements a virtio transport for Virtual Sockets.
+
+ Enable this transport if your Virtual Machine host supports Virtual
+ Sockets over virtio.
+
+ To compile this driver as a module, choose M here: the module will be
+ called vmw_vsock_virtio_transport. If unsure, say N.
+
+config VIRTIO_VSOCKETS_COMMON
+ tristate
+ help
+ This option is selected by any driver which needs to access
+ the virtio_vsock. The module will be called
+ vmw_vsock_virtio_transport_common.
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
index 2ce52d70f224..bc27c70e0e59 100644
--- a/net/vmw_vsock/Makefile
+++ b/net/vmw_vsock/Makefile
@@ -1,7 +1,13 @@
obj-$(CONFIG_VSOCKETS) += vsock.o
obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o
+obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o
+obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o
vsock-y += af_vsock.o vsock_addr.o
vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \
vmci_transport_notify_qstate.o
+
+vmw_vsock_virtio_transport-y += virtio_transport.o
+
+vmw_vsock_virtio_transport_common-y += virtio_transport_common.o
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index b96ac918e0ba..17dbbe64cd73 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -344,6 +344,16 @@ static bool vsock_in_connected_table(struct vsock_sock *vsk)
return ret;
}
+void vsock_remove_sock(struct vsock_sock *vsk)
+{
+ if (vsock_in_bound_table(vsk))
+ vsock_remove_bound(vsk);
+
+ if (vsock_in_connected_table(vsk))
+ vsock_remove_connected(vsk);
+}
+EXPORT_SYMBOL_GPL(vsock_remove_sock);
+
void vsock_for_each_connected_socket(void (*fn)(struct sock *sk))
{
int i;
@@ -660,12 +670,6 @@ static void __vsock_release(struct sock *sk)
vsk = vsock_sk(sk);
pending = NULL; /* Compiler warning. */
- if (vsock_in_bound_table(vsk))
- vsock_remove_bound(vsk);
-
- if (vsock_in_connected_table(vsk))
- vsock_remove_connected(vsk);
-
transport->release(vsk);
lock_sock(sk);
@@ -1995,6 +1999,15 @@ void vsock_core_exit(void)
}
EXPORT_SYMBOL_GPL(vsock_core_exit);
+const struct vsock_transport *vsock_core_get_transport(void)
+{
+ /* vsock_register_mutex not taken since only the transport uses this
+ * function and only while registered.
+ */
+ return transport;
+}
+EXPORT_SYMBOL_GPL(vsock_core_get_transport);
+
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Socket Family");
MODULE_VERSION("1.0.1.0-k");
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
new file mode 100644
index 000000000000..936d7eee62d0
--- /dev/null
+++ b/net/vmw_vsock/virtio_transport.c
@@ -0,0 +1,620 @@
+/*
+ * virtio transport for vsock
+ *
+ * Copyright (C) 2013-2015 Red Hat, Inc.
+ * Author: Asias He <asias@redhat.com>
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * Some of the code is taken from Gerd Hoffmann <kraxel@redhat.com>'s
+ * early virtio-vsock proof-of-concept bits.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_vsock.h>
+#include <net/sock.h>
+#include <linux/mutex.h>
+#include <net/af_vsock.h>
+
+static struct workqueue_struct *virtio_vsock_workqueue;
+static struct virtio_vsock *the_virtio_vsock;
+static DEFINE_MUTEX(the_virtio_vsock_mutex); /* protects the_virtio_vsock */
+
+struct virtio_vsock {
+ struct virtio_device *vdev;
+ struct virtqueue *vqs[VSOCK_VQ_MAX];
+
+ /* Virtqueue processing is deferred to a workqueue */
+ struct work_struct tx_work;
+ struct work_struct rx_work;
+ struct work_struct event_work;
+
+ /* The following fields are protected by tx_lock. vqs[VSOCK_VQ_TX]
+ * must be accessed with tx_lock held.
+ */
+ struct mutex tx_lock;
+
+ struct work_struct send_pkt_work;
+ spinlock_t send_pkt_list_lock;
+ struct list_head send_pkt_list;
+
+ atomic_t queued_replies;
+
+ /* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX]
+ * must be accessed with rx_lock held.
+ */
+ struct mutex rx_lock;
+ int rx_buf_nr;
+ int rx_buf_max_nr;
+
+ /* The following fields are protected by event_lock.
+ * vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held.
+ */
+ struct mutex event_lock;
+ struct virtio_vsock_event event_list[8];
+
+ u32 guest_cid;
+};
+
+static struct virtio_vsock *virtio_vsock_get(void)
+{
+ return the_virtio_vsock;
+}
+
+static u32 virtio_transport_get_local_cid(void)
+{
+ struct virtio_vsock *vsock = virtio_vsock_get();
+
+ return vsock->guest_cid;
+}
+
+static void
+virtio_transport_send_pkt_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, send_pkt_work);
+ struct virtqueue *vq;
+ bool added = false;
+ bool restart_rx = false;
+
+ mutex_lock(&vsock->tx_lock);
+
+ vq = vsock->vqs[VSOCK_VQ_TX];
+
+ for (;;) {
+ struct virtio_vsock_pkt *pkt;
+ struct scatterlist hdr, buf, *sgs[2];
+ int ret, in_sg = 0, out_sg = 0;
+ bool reply;
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ if (list_empty(&vsock->send_pkt_list)) {
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+ break;
+ }
+
+ pkt = list_first_entry(&vsock->send_pkt_list,
+ struct virtio_vsock_pkt, list);
+ list_del_init(&pkt->list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ reply = pkt->reply;
+
+ sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
+ sgs[out_sg++] = &hdr;
+ if (pkt->buf) {
+ sg_init_one(&buf, pkt->buf, pkt->len);
+ sgs[out_sg++] = &buf;
+ }
+
+ ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, pkt, GFP_KERNEL);
+ /* Usually this means that there is no more space available in
+ * the vq
+ */
+ if (ret < 0) {
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_add(&pkt->list, &vsock->send_pkt_list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+ break;
+ }
+
+ if (reply) {
+ struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
+ int val;
+
+ val = atomic_dec_return(&vsock->queued_replies);
+
+ /* Do we now have resources to resume rx processing? */
+ if (val + 1 == virtqueue_get_vring_size(rx_vq))
+ restart_rx = true;
+ }
+
+ added = true;
+ }
+
+ if (added)
+ virtqueue_kick(vq);
+
+ mutex_unlock(&vsock->tx_lock);
+
+ if (restart_rx)
+ queue_work(virtio_vsock_workqueue, &vsock->rx_work);
+}
+
+static int
+virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
+{
+ struct virtio_vsock *vsock;
+ int len = pkt->len;
+
+ vsock = virtio_vsock_get();
+ if (!vsock) {
+ virtio_transport_free_pkt(pkt);
+ return -ENODEV;
+ }
+
+ if (pkt->reply)
+ atomic_inc(&vsock->queued_replies);
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_add_tail(&pkt->list, &vsock->send_pkt_list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+ return len;
+}
+
+static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
+{
+ int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+ struct virtio_vsock_pkt *pkt;
+ struct scatterlist hdr, buf, *sgs[2];
+ struct virtqueue *vq;
+ int ret;
+
+ vq = vsock->vqs[VSOCK_VQ_RX];
+
+ do {
+ pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+ if (!pkt)
+ break;
+
+ pkt->buf = kmalloc(buf_len, GFP_KERNEL);
+ if (!pkt->buf) {
+ virtio_transport_free_pkt(pkt);
+ break;
+ }
+
+ pkt->len = buf_len;
+
+ sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
+ sgs[0] = &hdr;
+
+ sg_init_one(&buf, pkt->buf, buf_len);
+ sgs[1] = &buf;
+ ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
+ if (ret) {
+ virtio_transport_free_pkt(pkt);
+ break;
+ }
+ vsock->rx_buf_nr++;
+ } while (vq->num_free);
+ if (vsock->rx_buf_nr > vsock->rx_buf_max_nr)
+ vsock->rx_buf_max_nr = vsock->rx_buf_nr;
+ virtqueue_kick(vq);
+}
+
+static void virtio_transport_tx_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, tx_work);
+ struct virtqueue *vq;
+ bool added = false;
+
+ vq = vsock->vqs[VSOCK_VQ_TX];
+ mutex_lock(&vsock->tx_lock);
+ do {
+ struct virtio_vsock_pkt *pkt;
+ unsigned int len;
+
+ virtqueue_disable_cb(vq);
+ while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) {
+ virtio_transport_free_pkt(pkt);
+ added = true;
+ }
+ } while (!virtqueue_enable_cb(vq));
+ mutex_unlock(&vsock->tx_lock);
+
+ if (added)
+ queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+}
+
+/* Is there space left for replies to rx packets? */
+static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
+{
+ struct virtqueue *vq = vsock->vqs[VSOCK_VQ_RX];
+ int val;
+
+ smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */
+ val = atomic_read(&vsock->queued_replies);
+
+ return val < virtqueue_get_vring_size(vq);
+}
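
Editorial note: queued_replies caps how many locally generated reply packets may be outstanding. RX processing pauses once the count reaches the rx ring size, and it is restarted from the send path when a reply is finally handed to the device and the count drops back below the limit (see the restart_rx handling in virtio_transport_send_pkt_work above). A small sketch of that gate with C11 atomics; the names and ring size are invented for the example.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define RING_SIZE 4     /* stands in for virtqueue_get_vring_size() */

static atomic_int queued_replies;

/* rx side: may we keep consuming packets that can generate replies? */
static bool more_replies_allowed(void)
{
    return atomic_load(&queued_replies) < RING_SIZE;
}

/* rx side: a reply was queued for transmission. */
static void reply_queued(void)
{
    atomic_fetch_add(&queued_replies, 1);
}

/* tx side: a queued reply was handed to the device; maybe resume rx. */
static void reply_sent(void)
{
    int val = atomic_fetch_sub(&queued_replies, 1) - 1;

    if (val + 1 == RING_SIZE)
        printf("rx processing can resume\n");
}

int main(void)
{
    int i;

    for (i = 0; i < RING_SIZE; i++)
        reply_queued();
    printf("rx allowed: %s\n", more_replies_allowed() ? "yes" : "no");
    reply_sent();
    printf("rx allowed: %s\n", more_replies_allowed() ? "yes" : "no");
    return 0;
}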
+
+static void virtio_transport_rx_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, rx_work);
+ struct virtqueue *vq;
+
+ vq = vsock->vqs[VSOCK_VQ_RX];
+
+ mutex_lock(&vsock->rx_lock);
+
+ do {
+ virtqueue_disable_cb(vq);
+ for (;;) {
+ struct virtio_vsock_pkt *pkt;
+ unsigned int len;
+
+ if (!virtio_transport_more_replies(vsock)) {
+ /* Stop rx until the device processes already
+ * pending replies. Leave rx virtqueue
+ * callbacks disabled.
+ */
+ goto out;
+ }
+
+ pkt = virtqueue_get_buf(vq, &len);
+ if (!pkt) {
+ break;
+ }
+
+ vsock->rx_buf_nr--;
+
+ /* Drop short/long packets */
+ if (unlikely(len < sizeof(pkt->hdr) ||
+ len > sizeof(pkt->hdr) + pkt->len)) {
+ virtio_transport_free_pkt(pkt);
+ continue;
+ }
+
+ pkt->len = len - sizeof(pkt->hdr);
+ virtio_transport_recv_pkt(pkt);
+ }
+ } while (!virtqueue_enable_cb(vq));
+
+out:
+ if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2)
+ virtio_vsock_rx_fill(vsock);
+ mutex_unlock(&vsock->rx_lock);
+}
+
+/* event_lock must be held */
+static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock,
+ struct virtio_vsock_event *event)
+{
+ struct scatterlist sg;
+ struct virtqueue *vq;
+
+ vq = vsock->vqs[VSOCK_VQ_EVENT];
+
+ sg_init_one(&sg, event, sizeof(*event));
+
+ return virtqueue_add_inbuf(vq, &sg, 1, event, GFP_KERNEL);
+}
+
+/* event_lock must be held */
+static void virtio_vsock_event_fill(struct virtio_vsock *vsock)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(vsock->event_list); i++) {
+ struct virtio_vsock_event *event = &vsock->event_list[i];
+
+ virtio_vsock_event_fill_one(vsock, event);
+ }
+
+ virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]);
+}
+
+static void virtio_vsock_reset_sock(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_state = SS_UNCONNECTED;
+ sk->sk_err = ECONNRESET;
+ sk->sk_error_report(sk);
+ release_sock(sk);
+}
+
+static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock)
+{
+ struct virtio_device *vdev = vsock->vdev;
+ u64 guest_cid;
+
+ vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid),
+ &guest_cid, sizeof(guest_cid));
+ vsock->guest_cid = le64_to_cpu(guest_cid);
+}
+
+/* event_lock must be held */
+static void virtio_vsock_event_handle(struct virtio_vsock *vsock,
+ struct virtio_vsock_event *event)
+{
+ switch (le32_to_cpu(event->id)) {
+ case VIRTIO_VSOCK_EVENT_TRANSPORT_RESET:
+ virtio_vsock_update_guest_cid(vsock);
+ vsock_for_each_connected_socket(virtio_vsock_reset_sock);
+ break;
+ }
+}
+
+static void virtio_transport_event_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, event_work);
+ struct virtqueue *vq;
+
+ vq = vsock->vqs[VSOCK_VQ_EVENT];
+
+ mutex_lock(&vsock->event_lock);
+
+ do {
+ struct virtio_vsock_event *event;
+ unsigned int len;
+
+ virtqueue_disable_cb(vq);
+ while ((event = virtqueue_get_buf(vq, &len)) != NULL) {
+ if (len == sizeof(*event))
+ virtio_vsock_event_handle(vsock, event);
+
+ virtio_vsock_event_fill_one(vsock, event);
+ }
+ } while (!virtqueue_enable_cb(vq));
+
+ virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]);
+
+ mutex_unlock(&vsock->event_lock);
+}
+
+static void virtio_vsock_event_done(struct virtqueue *vq)
+{
+ struct virtio_vsock *vsock = vq->vdev->priv;
+
+ if (!vsock)
+ return;
+ queue_work(virtio_vsock_workqueue, &vsock->event_work);
+}
+
+static void virtio_vsock_tx_done(struct virtqueue *vq)
+{
+ struct virtio_vsock *vsock = vq->vdev->priv;
+
+ if (!vsock)
+ return;
+ queue_work(virtio_vsock_workqueue, &vsock->tx_work);
+}
+
+static void virtio_vsock_rx_done(struct virtqueue *vq)
+{
+ struct virtio_vsock *vsock = vq->vdev->priv;
+
+ if (!vsock)
+ return;
+ queue_work(virtio_vsock_workqueue, &vsock->rx_work);
+}
+
+static struct virtio_transport virtio_transport = {
+ .transport = {
+ .get_local_cid = virtio_transport_get_local_cid,
+
+ .init = virtio_transport_do_socket_init,
+ .destruct = virtio_transport_destruct,
+ .release = virtio_transport_release,
+ .connect = virtio_transport_connect,
+ .shutdown = virtio_transport_shutdown,
+
+ .dgram_bind = virtio_transport_dgram_bind,
+ .dgram_dequeue = virtio_transport_dgram_dequeue,
+ .dgram_enqueue = virtio_transport_dgram_enqueue,
+ .dgram_allow = virtio_transport_dgram_allow,
+
+ .stream_dequeue = virtio_transport_stream_dequeue,
+ .stream_enqueue = virtio_transport_stream_enqueue,
+ .stream_has_data = virtio_transport_stream_has_data,
+ .stream_has_space = virtio_transport_stream_has_space,
+ .stream_rcvhiwat = virtio_transport_stream_rcvhiwat,
+ .stream_is_active = virtio_transport_stream_is_active,
+ .stream_allow = virtio_transport_stream_allow,
+
+ .notify_poll_in = virtio_transport_notify_poll_in,
+ .notify_poll_out = virtio_transport_notify_poll_out,
+ .notify_recv_init = virtio_transport_notify_recv_init,
+ .notify_recv_pre_block = virtio_transport_notify_recv_pre_block,
+ .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue,
+ .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue,
+ .notify_send_init = virtio_transport_notify_send_init,
+ .notify_send_pre_block = virtio_transport_notify_send_pre_block,
+ .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue,
+ .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
+
+ .set_buffer_size = virtio_transport_set_buffer_size,
+ .set_min_buffer_size = virtio_transport_set_min_buffer_size,
+ .set_max_buffer_size = virtio_transport_set_max_buffer_size,
+ .get_buffer_size = virtio_transport_get_buffer_size,
+ .get_min_buffer_size = virtio_transport_get_min_buffer_size,
+ .get_max_buffer_size = virtio_transport_get_max_buffer_size,
+ },
+
+ .send_pkt = virtio_transport_send_pkt,
+};
+
+static int virtio_vsock_probe(struct virtio_device *vdev)
+{
+ vq_callback_t *callbacks[] = {
+ virtio_vsock_rx_done,
+ virtio_vsock_tx_done,
+ virtio_vsock_event_done,
+ };
+ static const char * const names[] = {
+ "rx",
+ "tx",
+ "event",
+ };
+ struct virtio_vsock *vsock = NULL;
+ int ret;
+
+ ret = mutex_lock_interruptible(&the_virtio_vsock_mutex);
+ if (ret)
+ return ret;
+
+ /* Only one virtio-vsock device per guest is supported */
+ if (the_virtio_vsock) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ vsock = kzalloc(sizeof(*vsock), GFP_KERNEL);
+ if (!vsock) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ vsock->vdev = vdev;
+
+ ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX,
+ vsock->vqs, callbacks, names);
+ if (ret < 0)
+ goto out;
+
+ virtio_vsock_update_guest_cid(vsock);
+
+ ret = vsock_core_init(&virtio_transport.transport);
+ if (ret < 0)
+ goto out_vqs;
+
+ vsock->rx_buf_nr = 0;
+ vsock->rx_buf_max_nr = 0;
+ atomic_set(&vsock->queued_replies, 0);
+
+ vdev->priv = vsock;
+ the_virtio_vsock = vsock;
+ mutex_init(&vsock->tx_lock);
+ mutex_init(&vsock->rx_lock);
+ mutex_init(&vsock->event_lock);
+ spin_lock_init(&vsock->send_pkt_list_lock);
+ INIT_LIST_HEAD(&vsock->send_pkt_list);
+ INIT_WORK(&vsock->rx_work, virtio_transport_rx_work);
+ INIT_WORK(&vsock->tx_work, virtio_transport_tx_work);
+ INIT_WORK(&vsock->event_work, virtio_transport_event_work);
+ INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work);
+
+ mutex_lock(&vsock->rx_lock);
+ virtio_vsock_rx_fill(vsock);
+ mutex_unlock(&vsock->rx_lock);
+
+ mutex_lock(&vsock->event_lock);
+ virtio_vsock_event_fill(vsock);
+ mutex_unlock(&vsock->event_lock);
+
+ mutex_unlock(&the_virtio_vsock_mutex);
+ return 0;
+
+out_vqs:
+ vsock->vdev->config->del_vqs(vsock->vdev);
+out:
+ kfree(vsock);
+ mutex_unlock(&the_virtio_vsock_mutex);
+ return ret;
+}
+
+static void virtio_vsock_remove(struct virtio_device *vdev)
+{
+ struct virtio_vsock *vsock = vdev->priv;
+ struct virtio_vsock_pkt *pkt;
+
+ flush_work(&vsock->rx_work);
+ flush_work(&vsock->tx_work);
+ flush_work(&vsock->event_work);
+ flush_work(&vsock->send_pkt_work);
+
+ vdev->config->reset(vdev);
+
+ mutex_lock(&vsock->rx_lock);
+ while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX])))
+ virtio_transport_free_pkt(pkt);
+ mutex_unlock(&vsock->rx_lock);
+
+ mutex_lock(&vsock->tx_lock);
+ while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX])))
+ virtio_transport_free_pkt(pkt);
+ mutex_unlock(&vsock->tx_lock);
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ while (!list_empty(&vsock->send_pkt_list)) {
+ pkt = list_first_entry(&vsock->send_pkt_list,
+ struct virtio_vsock_pkt, list);
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ mutex_lock(&the_virtio_vsock_mutex);
+ the_virtio_vsock = NULL;
+ vsock_core_exit();
+ mutex_unlock(&the_virtio_vsock_mutex);
+
+ vdev->config->del_vqs(vdev);
+
+ kfree(vsock);
+}
+
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_VSOCK, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+static unsigned int features[] = {
+};
+
+static struct virtio_driver virtio_vsock_driver = {
+ .feature_table = features,
+ .feature_table_size = ARRAY_SIZE(features),
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .probe = virtio_vsock_probe,
+ .remove = virtio_vsock_remove,
+};
+
+static int __init virtio_vsock_init(void)
+{
+ int ret;
+
+ virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0);
+ if (!virtio_vsock_workqueue)
+ return -ENOMEM;
+ ret = register_virtio_driver(&virtio_vsock_driver);
+ if (ret)
+ destroy_workqueue(virtio_vsock_workqueue);
+ return ret;
+}
+
+static void __exit virtio_vsock_exit(void)
+{
+ unregister_virtio_driver(&virtio_vsock_driver);
+ destroy_workqueue(virtio_vsock_workqueue);
+}
+
+module_init(virtio_vsock_init);
+module_exit(virtio_vsock_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Asias He");
+MODULE_DESCRIPTION("virtio transport for vsock");
+MODULE_DEVICE_TABLE(virtio, id_table);
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
new file mode 100644
index 000000000000..a53b3a16b4f1
--- /dev/null
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -0,0 +1,992 @@
+/*
+ * common code for virtio vsock
+ *
+ * Copyright (C) 2013-2015 Red Hat, Inc.
+ * Author: Asias He <asias@redhat.com>
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_vsock.h>
+
+#include <net/sock.h>
+#include <net/af_vsock.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/vsock_virtio_transport_common.h>
+
+/* How long to wait for graceful shutdown of a connection */
+#define VSOCK_CLOSE_TIMEOUT (8 * HZ)
+
+static const struct virtio_transport *virtio_transport_get_ops(void)
+{
+ const struct vsock_transport *t = vsock_core_get_transport();
+
+ return container_of(t, struct virtio_transport, transport);
+}
+
+struct virtio_vsock_pkt *
+virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
+ size_t len,
+ u32 src_cid,
+ u32 src_port,
+ u32 dst_cid,
+ u32 dst_port)
+{
+ struct virtio_vsock_pkt *pkt;
+ int err;
+
+ pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+ if (!pkt)
+ return NULL;
+
+ pkt->hdr.type = cpu_to_le16(info->type);
+ pkt->hdr.op = cpu_to_le16(info->op);
+ pkt->hdr.src_cid = cpu_to_le64(src_cid);
+ pkt->hdr.dst_cid = cpu_to_le64(dst_cid);
+ pkt->hdr.src_port = cpu_to_le32(src_port);
+ pkt->hdr.dst_port = cpu_to_le32(dst_port);
+ pkt->hdr.flags = cpu_to_le32(info->flags);
+ pkt->len = len;
+ pkt->hdr.len = cpu_to_le32(len);
+ pkt->reply = info->reply;
+
+ if (info->msg && len > 0) {
+ pkt->buf = kmalloc(len, GFP_KERNEL);
+ if (!pkt->buf)
+ goto out_pkt;
+ err = memcpy_from_msg(pkt->buf, info->msg, len);
+ if (err)
+ goto out;
+ }
+
+ trace_virtio_transport_alloc_pkt(src_cid, src_port,
+ dst_cid, dst_port,
+ len,
+ info->type,
+ info->op,
+ info->flags);
+
+ return pkt;
+
+out:
+ kfree(pkt->buf);
+out_pkt:
+ kfree(pkt);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt);
+
+static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
+ struct virtio_vsock_pkt_info *info)
+{
+ u32 src_cid, src_port, dst_cid, dst_port;
+ struct virtio_vsock_sock *vvs;
+ struct virtio_vsock_pkt *pkt;
+ u32 pkt_len = info->pkt_len;
+
+ src_cid = vm_sockets_get_local_cid();
+ src_port = vsk->local_addr.svm_port;
+ if (!info->remote_cid) {
+ dst_cid = vsk->remote_addr.svm_cid;
+ dst_port = vsk->remote_addr.svm_port;
+ } else {
+ dst_cid = info->remote_cid;
+ dst_port = info->remote_port;
+ }
+
+ vvs = vsk->trans;
+
+ /* we can send less than pkt_len bytes */
+ if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
+ pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+
+ /* virtio_transport_get_credit might return less than pkt_len credit */
+ pkt_len = virtio_transport_get_credit(vvs, pkt_len);
+
+ /* Do not send zero length OP_RW pkt */
+ if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
+ return pkt_len;
+
+ pkt = virtio_transport_alloc_pkt(info, pkt_len,
+ src_cid, src_port,
+ dst_cid, dst_port);
+ if (!pkt) {
+ virtio_transport_put_credit(vvs, pkt_len);
+ return -ENOMEM;
+ }
+
+ virtio_transport_inc_tx_pkt(vvs, pkt);
+
+ return virtio_transport_get_ops()->send_pkt(pkt);
+}
+
+static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
+ struct virtio_vsock_pkt *pkt)
+{
+ vvs->rx_bytes += pkt->len;
+}
+
+static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
+ struct virtio_vsock_pkt *pkt)
+{
+ vvs->rx_bytes -= pkt->len;
+ vvs->fwd_cnt += pkt->len;
+}
+
+void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
+{
+ spin_lock_bh(&vvs->tx_lock);
+ pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt);
+ pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc);
+ spin_unlock_bh(&vvs->tx_lock);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);
+
+u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
+{
+ u32 ret;
+
+ spin_lock_bh(&vvs->tx_lock);
+ ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
+ if (ret > credit)
+ ret = credit;
+ vvs->tx_cnt += ret;
+ spin_unlock_bh(&vvs->tx_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_credit);
+
+void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit)
+{
+ spin_lock_bh(&vvs->tx_lock);
+ vvs->tx_cnt -= credit;
+ spin_unlock_bh(&vvs->tx_lock);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_put_credit);
+
+static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
+ int type,
+ struct virtio_vsock_hdr *hdr)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
+ .type = type,
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+
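+/* Copy queued rx packets into @msg, freeing each packet once it is fully
+ * consumed, then send a credit update so the peer learns that buffer space
+ * has been released.
+ */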
+static ssize_t
+virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ struct virtio_vsock_pkt *pkt;
+ size_t bytes, total = 0;
+ int err = -EFAULT;
+
+ spin_lock_bh(&vvs->rx_lock);
+ while (total < len && !list_empty(&vvs->rx_queue)) {
+ pkt = list_first_entry(&vvs->rx_queue,
+ struct virtio_vsock_pkt, list);
+
+ bytes = len - total;
+ if (bytes > pkt->len - pkt->off)
+ bytes = pkt->len - pkt->off;
+
+ /* sk_lock is held by caller so no one else can dequeue.
+ * Unlock rx_lock since memcpy_to_msg() may sleep.
+ */
+ spin_unlock_bh(&vvs->rx_lock);
+
+ err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
+ if (err)
+ goto out;
+
+ spin_lock_bh(&vvs->rx_lock);
+
+ total += bytes;
+ pkt->off += bytes;
+ if (pkt->off == pkt->len) {
+ virtio_transport_dec_rx_pkt(vvs, pkt);
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+ }
+ spin_unlock_bh(&vvs->rx_lock);
+
+ /* Send a credit update so the peer learns buffer space was freed */
+ virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM,
+ NULL);
+
+ return total;
+
+out:
+ if (total)
+ err = total;
+ return err;
+}
+
+ssize_t
+virtio_transport_stream_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len, int flags)
+{
+ if (flags & MSG_PEEK)
+ return -EOPNOTSUPP;
+
+ return virtio_transport_stream_do_dequeue(vsk, msg, len);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue);
+
+int
+virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len, int flags)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue);
+
+s64 virtio_transport_stream_has_data(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ s64 bytes;
+
+ spin_lock_bh(&vvs->rx_lock);
+ bytes = vvs->rx_bytes;
+ spin_unlock_bh(&vvs->rx_lock);
+
+ return bytes;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data);
+
+static s64 virtio_transport_has_space(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ s64 bytes;
+
+ bytes = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
+ if (bytes < 0)
+ bytes = 0;
+
+ return bytes;
+}
+
+s64 virtio_transport_stream_has_space(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ s64 bytes;
+
+ spin_lock_bh(&vvs->tx_lock);
+ bytes = virtio_transport_has_space(vsk);
+ spin_unlock_bh(&vvs->tx_lock);
+
+ return bytes;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space);
+
+int virtio_transport_do_socket_init(struct vsock_sock *vsk,
+ struct vsock_sock *psk)
+{
+ struct virtio_vsock_sock *vvs;
+
+ vvs = kzalloc(sizeof(*vvs), GFP_KERNEL);
+ if (!vvs)
+ return -ENOMEM;
+
+ vsk->trans = vvs;
+ vvs->vsk = vsk;
+ if (psk) {
+ struct virtio_vsock_sock *ptrans = psk->trans;
+
+ vvs->buf_size = ptrans->buf_size;
+ vvs->buf_size_min = ptrans->buf_size_min;
+ vvs->buf_size_max = ptrans->buf_size_max;
+ vvs->peer_buf_alloc = ptrans->peer_buf_alloc;
+ } else {
+ vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE;
+ vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE;
+ vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE;
+ }
+
+ vvs->buf_alloc = vvs->buf_size;
+
+ spin_lock_init(&vvs->rx_lock);
+ spin_lock_init(&vvs->tx_lock);
+ INIT_LIST_HEAD(&vvs->rx_queue);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init);
+
+u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ return vvs->buf_size;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size);
+
+u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ return vvs->buf_size_min;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size);
+
+u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ return vvs->buf_size_max;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size);
+
+void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+ val = VIRTIO_VSOCK_MAX_BUF_SIZE;
+ if (val < vvs->buf_size_min)
+ vvs->buf_size_min = val;
+ if (val > vvs->buf_size_max)
+ vvs->buf_size_max = val;
+ vvs->buf_size = val;
+ vvs->buf_alloc = val;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size);
+
+void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+ val = VIRTIO_VSOCK_MAX_BUF_SIZE;
+ if (val > vvs->buf_size)
+ vvs->buf_size = val;
+ vvs->buf_size_min = val;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size);
+
+void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+ val = VIRTIO_VSOCK_MAX_BUF_SIZE;
+ if (val < vvs->buf_size)
+ vvs->buf_size = val;
+ vvs->buf_size_max = val;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size);
+
+int
+virtio_transport_notify_poll_in(struct vsock_sock *vsk,
+ size_t target,
+ bool *data_ready_now)
+{
+ if (vsock_stream_has_data(vsk))
+ *data_ready_now = true;
+ else
+ *data_ready_now = false;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in);
+
+int
+virtio_transport_notify_poll_out(struct vsock_sock *vsk,
+ size_t target,
+ bool *space_avail_now)
+{
+ s64 free_space;
+
+ free_space = vsock_stream_has_space(vsk);
+ if (free_space > 0)
+ *space_avail_now = true;
+ else if (free_space == 0)
+ *space_avail_now = false;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out);
+
+int virtio_transport_notify_recv_init(struct vsock_sock *vsk,
+ size_t target, struct vsock_transport_recv_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init);
+
+int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk,
+ size_t target, struct vsock_transport_recv_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block);
+
+int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk,
+ size_t target, struct vsock_transport_recv_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue);
+
+int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk,
+ size_t target, ssize_t copied, bool data_read,
+ struct vsock_transport_recv_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue);
+
+int virtio_transport_notify_send_init(struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init);
+
+int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block);
+
+int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue);
+
+int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk,
+ ssize_t written, struct vsock_transport_send_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue);
+
+u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ return vvs->buf_size;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat);
+
+bool virtio_transport_stream_is_active(struct vsock_sock *vsk)
+{
+ return true;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active);
+
+bool virtio_transport_stream_allow(u32 cid, u32 port)
+{
+ return true;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_allow);
+
+int virtio_transport_dgram_bind(struct vsock_sock *vsk,
+ struct sockaddr_vm *addr)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind);
+
+bool virtio_transport_dgram_allow(u32 cid, u32 port)
+{
+ return false;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow);
+
+int virtio_transport_connect(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_REQUEST,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_connect);
+
+int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_SHUTDOWN,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ .flags = (mode & RCV_SHUTDOWN ?
+ VIRTIO_VSOCK_SHUTDOWN_RCV : 0) |
+ (mode & SEND_SHUTDOWN ?
+ VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_shutdown);
+
+int
+virtio_transport_dgram_enqueue(struct vsock_sock *vsk,
+ struct sockaddr_vm *remote_addr,
+ struct msghdr *msg,
+ size_t dgram_len)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue);
+
+ssize_t
+virtio_transport_stream_enqueue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_RW,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ .msg = msg,
+ .pkt_len = len,
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue);
+
+void virtio_transport_destruct(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ kfree(vvs);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_destruct);
+
+static int virtio_transport_reset(struct vsock_sock *vsk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_RST,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ .reply = !!pkt,
+ };
+
+ /* Send RST only if the original pkt is not a RST pkt */
+ if (pkt && le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
+ return 0;
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+
+/* Normally packets are associated with a socket. There may be no socket if an
+ * attempt was made to connect to a socket that does not exist.
+ */
+static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_RST,
+ .type = le16_to_cpu(pkt->hdr.type),
+ .reply = true,
+ };
+
+ /* Send RST only if the original pkt is not a RST pkt */
+ if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
+ return 0;
+
+ pkt = virtio_transport_alloc_pkt(&info, 0,
+ le32_to_cpu(pkt->hdr.dst_cid),
+ le32_to_cpu(pkt->hdr.dst_port),
+ le32_to_cpu(pkt->hdr.src_cid),
+ le32_to_cpu(pkt->hdr.src_port));
+ if (!pkt)
+ return -ENOMEM;
+
+ return virtio_transport_get_ops()->send_pkt(pkt);
+}
+
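+/* Used for a lingering close: wait until the peer has closed the connection
+ * (SOCK_DONE) or the linger timeout expires.
+ */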
+static void virtio_transport_wait_close(struct sock *sk, long timeout)
+{
+ if (timeout) {
+ DEFINE_WAIT(wait);
+
+ do {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ if (sk_wait_event(sk, &timeout,
+ sock_flag(sk, SOCK_DONE)))
+ break;
+ } while (!signal_pending(current) && timeout);
+
+ finish_wait(sk_sleep(sk), &wait);
+ }
+}
+
+static void virtio_transport_do_close(struct vsock_sock *vsk,
+ bool cancel_timeout)
+{
+ struct sock *sk = sk_vsock(vsk);
+
+ sock_set_flag(sk, SOCK_DONE);
+ vsk->peer_shutdown = SHUTDOWN_MASK;
+ if (vsock_stream_has_data(vsk) <= 0)
+ sk->sk_state = SS_DISCONNECTING;
+ sk->sk_state_change(sk);
+
+ if (vsk->close_work_scheduled &&
+ (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) {
+ vsk->close_work_scheduled = false;
+
+ vsock_remove_sock(vsk);
+
+ /* Release refcnt obtained when we scheduled the timeout */
+ sock_put(sk);
+ }
+}
+
+static void virtio_transport_close_timeout(struct work_struct *work)
+{
+ struct vsock_sock *vsk =
+ container_of(work, struct vsock_sock, close_work.work);
+ struct sock *sk = sk_vsock(vsk);
+
+ sock_hold(sk);
+ lock_sock(sk);
+
+ if (!sock_flag(sk, SOCK_DONE)) {
+ (void)virtio_transport_reset(vsk, NULL);
+
+ virtio_transport_do_close(vsk, false);
+ }
+
+ vsk->close_work_scheduled = false;
+
+ release_sock(sk);
+ sock_put(sk);
+}
+
+/* User context, vsk->sk is locked.
+ *
+ * Returns true if the socket can be removed right away, false if a delayed
+ * close has been scheduled; in the latter case the socket is removed once
+ * the peer replies or the close timeout fires.
+ */
+static bool virtio_transport_close(struct vsock_sock *vsk)
+{
+ struct sock *sk = &vsk->sk;
+
+ if (!(sk->sk_state == SS_CONNECTED ||
+ sk->sk_state == SS_DISCONNECTING))
+ return true;
+
+ /* Already received SHUTDOWN from peer, reply with RST */
+ if ((vsk->peer_shutdown & SHUTDOWN_MASK) == SHUTDOWN_MASK) {
+ (void)virtio_transport_reset(vsk, NULL);
+ return true;
+ }
+
+ if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK)
+ (void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK);
+
+ if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING))
+ virtio_transport_wait_close(sk, sk->sk_lingertime);
+
+ if (sock_flag(sk, SOCK_DONE))
+ return true;
+
+ sock_hold(sk);
+ INIT_DELAYED_WORK(&vsk->close_work,
+ virtio_transport_close_timeout);
+ vsk->close_work_scheduled = true;
+ schedule_delayed_work(&vsk->close_work, VSOCK_CLOSE_TIMEOUT);
+ return false;
+}
+
+void virtio_transport_release(struct vsock_sock *vsk)
+{
+ struct sock *sk = &vsk->sk;
+ bool remove_sock = true;
+
+ lock_sock(sk);
+ if (sk->sk_type == SOCK_STREAM)
+ remove_sock = virtio_transport_close(vsk);
+ release_sock(sk);
+
+ if (remove_sock)
+ vsock_remove_sock(vsk);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_release);
+
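+/* SS_CONNECTING state: an OP_RESPONSE completes the connect, while an OP_RST
+ * or an unexpected op tears the socket down.
+ */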
+static int
+virtio_transport_recv_connecting(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ int err;
+ int skerr;
+
+ switch (le16_to_cpu(pkt->hdr.op)) {
+ case VIRTIO_VSOCK_OP_RESPONSE:
+ sk->sk_state = SS_CONNECTED;
+ sk->sk_socket->state = SS_CONNECTED;
+ vsock_insert_connected(vsk);
+ sk->sk_state_change(sk);
+ break;
+ case VIRTIO_VSOCK_OP_INVALID:
+ break;
+ case VIRTIO_VSOCK_OP_RST:
+ skerr = ECONNRESET;
+ err = 0;
+ goto destroy;
+ default:
+ skerr = EPROTO;
+ err = -EINVAL;
+ goto destroy;
+ }
+ return 0;
+
+destroy:
+ virtio_transport_reset(vsk, pkt);
+ sk->sk_state = SS_UNCONNECTED;
+ sk->sk_err = skerr;
+ sk->sk_error_report(sk);
+ return err;
+}
+
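+/* SS_CONNECTED state: OP_RW queues payload for the reader, OP_CREDIT_UPDATE
+ * wakes up writers, OP_SHUTDOWN records the peer's shutdown mask and OP_RST
+ * closes the connection.
+ */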
+static int
+virtio_transport_recv_connected(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ int err = 0;
+
+ switch (le16_to_cpu(pkt->hdr.op)) {
+ case VIRTIO_VSOCK_OP_RW:
+ pkt->len = le32_to_cpu(pkt->hdr.len);
+ pkt->off = 0;
+
+ spin_lock_bh(&vvs->rx_lock);
+ virtio_transport_inc_rx_pkt(vvs, pkt);
+ list_add_tail(&pkt->list, &vvs->rx_queue);
+ spin_unlock_bh(&vvs->rx_lock);
+
+ sk->sk_data_ready(sk);
+ return err;
+ case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
+ sk->sk_write_space(sk);
+ break;
+ case VIRTIO_VSOCK_OP_SHUTDOWN:
+ if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV)
+ vsk->peer_shutdown |= RCV_SHUTDOWN;
+ if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND)
+ vsk->peer_shutdown |= SEND_SHUTDOWN;
+ if (vsk->peer_shutdown == SHUTDOWN_MASK &&
+ vsock_stream_has_data(vsk) <= 0)
+ sk->sk_state = SS_DISCONNECTING;
+ if (le32_to_cpu(pkt->hdr.flags))
+ sk->sk_state_change(sk);
+ break;
+ case VIRTIO_VSOCK_OP_RST:
+ virtio_transport_do_close(vsk, true);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ virtio_transport_free_pkt(pkt);
+ return err;
+}
+
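+/* SS_DISCONNECTING state: only the peer's OP_RST completes the close. */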
+static void
+virtio_transport_recv_disconnecting(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
+ virtio_transport_do_close(vsk, true);
+}
+
+static int
+virtio_transport_send_response(struct vsock_sock *vsk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_RESPONSE,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ .remote_cid = le32_to_cpu(pkt->hdr.src_cid),
+ .remote_port = le32_to_cpu(pkt->hdr.src_port),
+ .reply = true,
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+
+/* Handle a connection request arriving on a listening (server) socket */
+static int
+virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ struct vsock_sock *vchild;
+ struct sock *child;
+
+ if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) {
+ virtio_transport_reset(vsk, pkt);
+ return -EINVAL;
+ }
+
+ if (sk_acceptq_is_full(sk)) {
+ virtio_transport_reset(vsk, pkt);
+ return -ENOMEM;
+ }
+
+ child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
+ sk->sk_type, 0);
+ if (!child) {
+ virtio_transport_reset(vsk, pkt);
+ return -ENOMEM;
+ }
+
+ sk->sk_ack_backlog++;
+
+ lock_sock_nested(child, SINGLE_DEPTH_NESTING);
+
+ child->sk_state = SS_CONNECTED;
+
+ vchild = vsock_sk(child);
+ vsock_addr_init(&vchild->local_addr, le32_to_cpu(pkt->hdr.dst_cid),
+ le32_to_cpu(pkt->hdr.dst_port));
+ vsock_addr_init(&vchild->remote_addr, le32_to_cpu(pkt->hdr.src_cid),
+ le32_to_cpu(pkt->hdr.src_port));
+
+ vsock_insert_connected(vchild);
+ vsock_enqueue_accept(sk, child);
+ virtio_transport_send_response(vchild, pkt);
+
+ release_sock(child);
+
+ sk->sk_data_ready(sk);
+ return 0;
+}
+
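+/* Refresh our view of the peer's receive buffer from the received header and
+ * report whether there is now space to transmit.
+ */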
+static bool virtio_transport_space_update(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ bool space_available;
+
+ /* buf_alloc and fwd_cnt are always included in the hdr */
+ spin_lock_bh(&vvs->tx_lock);
+ vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc);
+ vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt);
+ space_available = virtio_transport_has_space(vsk);
+ spin_unlock_bh(&vvs->tx_lock);
+ return space_available;
+}
+
+/* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex
+ * lock.
+ */
+void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
+{
+ struct sockaddr_vm src, dst;
+ struct vsock_sock *vsk;
+ struct sock *sk;
+ bool space_available;
+
+ vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid),
+ le32_to_cpu(pkt->hdr.src_port));
+ vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid),
+ le32_to_cpu(pkt->hdr.dst_port));
+
+ trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port,
+ dst.svm_cid, dst.svm_port,
+ le32_to_cpu(pkt->hdr.len),
+ le16_to_cpu(pkt->hdr.type),
+ le16_to_cpu(pkt->hdr.op),
+ le32_to_cpu(pkt->hdr.flags),
+ le32_to_cpu(pkt->hdr.buf_alloc),
+ le32_to_cpu(pkt->hdr.fwd_cnt));
+
+ if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) {
+ (void)virtio_transport_reset_no_sock(pkt);
+ goto free_pkt;
+ }
+
+ /* The socket must be in connected or bound table
+ * otherwise send reset back
+ */
+ sk = vsock_find_connected_socket(&src, &dst);
+ if (!sk) {
+ sk = vsock_find_bound_socket(&dst);
+ if (!sk) {
+ (void)virtio_transport_reset_no_sock(pkt);
+ goto free_pkt;
+ }
+ }
+
+ vsk = vsock_sk(sk);
+
+ space_available = virtio_transport_space_update(sk, pkt);
+
+ lock_sock(sk);
+
+ /* Update CID in case it has changed after a transport reset event */
+ vsk->local_addr.svm_cid = dst.svm_cid;
+
+ if (space_available)
+ sk->sk_write_space(sk);
+
+ switch (sk->sk_state) {
+ case VSOCK_SS_LISTEN:
+ virtio_transport_recv_listen(sk, pkt);
+ virtio_transport_free_pkt(pkt);
+ break;
+ case SS_CONNECTING:
+ virtio_transport_recv_connecting(sk, pkt);
+ virtio_transport_free_pkt(pkt);
+ break;
+ case SS_CONNECTED:
+ virtio_transport_recv_connected(sk, pkt);
+ break;
+ case SS_DISCONNECTING:
+ virtio_transport_recv_disconnecting(sk, pkt);
+ virtio_transport_free_pkt(pkt);
+ break;
+ default:
+ virtio_transport_free_pkt(pkt);
+ break;
+ }
+ release_sock(sk);
+
+ /* Release refcnt obtained when we fetched this socket out of the
+ * bound or connected list.
+ */
+ sock_put(sk);
+ return;
+
+free_pkt:
+ virtio_transport_free_pkt(pkt);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt);
+
+void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
+{
+ kfree(pkt->buf);
+ kfree(pkt);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Asias He");
+MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 4120b7a538be..4be4fbbc0b50 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1644,6 +1644,8 @@ static void vmci_transport_destruct(struct vsock_sock *vsk)
static void vmci_transport_release(struct vsock_sock *vsk)
{
+ vsock_remove_sock(vsk);
+
if (!vmci_handle_is_invalid(vmci_trans(vsk)->dg_handle)) {
vmci_datagram_destroy_handle(vmci_trans(vsk)->dg_handle);
vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE;
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index bb3d64ee68aa..0f506220a3bd 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -716,7 +716,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy,
ASSERT_RTNL();
- if (!config_enabled(CONFIG_CFG80211_REG_RELAX_NO_IR) ||
+ if (!IS_ENABLED(CONFIG_CFG80211_REG_RELAX_NO_IR) ||
!(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR))
return false;