Diffstat (limited to 'drivers/infiniband/ulp')
-rw-r--r--  drivers/infiniband/ulp/Makefile                        |    1
-rw-r--r--  drivers/infiniband/ulp/ipoib/Kconfig                   |    8
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib.h                   |   22
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_cm.c                |   53
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_ethtool.c           |   21
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_fs.c                |   50
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_ib.c                |   91
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_main.c              |  195
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_multicast.c         |   42
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_netlink.c           |   13
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_verbs.c             |   11
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_vlan.c              |   12
-rw-r--r--  drivers/infiniband/ulp/iser/Kconfig                    |    2
-rw-r--r--  drivers/infiniband/ulp/iser/iscsi_iser.c               |  160
-rw-r--r--  drivers/infiniband/ulp/iser/iscsi_iser.h               |  143
-rw-r--r--  drivers/infiniband/ulp/iser/iser_initiator.c           |  188
-rw-r--r--  drivers/infiniband/ulp/iser/iser_memory.c              |  313
-rw-r--r--  drivers/infiniband/ulp/iser/iser_verbs.c               |  389
-rw-r--r--  drivers/infiniband/ulp/isert/Kconfig                   |    2
-rw-r--r--  drivers/infiniband/ulp/isert/ib_isert.c                |  498
-rw-r--r--  drivers/infiniband/ulp/isert/ib_isert.h                |   69
-rw-r--r--  drivers/infiniband/ulp/opa_vnic/Kconfig                |    6
-rw-r--r--  drivers/infiniband/ulp/opa_vnic/Makefile               |    3
-rw-r--r--  drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h       |   31
-rw-r--r--  drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c     |    6
-rw-r--r--  drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h    |    1
-rw-r--r--  drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c      |    1
-rw-r--r--  drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c        |   27
-rw-r--r--  drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c  |    2
-rw-r--r--  drivers/infiniband/ulp/rtrs/Kconfig                    |   27
-rw-r--r--  drivers/infiniband/ulp/rtrs/Makefile                   |   21
-rw-r--r--  drivers/infiniband/ulp/rtrs/README                     |  213
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c           |  198
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c           |  514
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-clt-trace.c           |   15
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-clt-trace.h           |   86
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-clt.c                 | 3190
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-clt.h                 |  251
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-log.h                 |   28
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-pri.h                 |  409
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c           |   51
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c           |  315
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-srv-trace.c           |   16
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-srv-trace.h           |   88
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-srv.c                 | 2297
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-srv.h                 |  154
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs.c                     |  654
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs.h                     |  188
-rw-r--r--  drivers/infiniband/ulp/srp/Kconfig                     |    2
-rw-r--r--  drivers/infiniband/ulp/srp/ib_srp.c                    |  797
-rw-r--r--  drivers/infiniband/ulp/srp/ib_srp.h                    |   45
-rw-r--r--  drivers/infiniband/ulp/srpt/Kconfig                    |    2
-rw-r--r--  drivers/infiniband/ulp/srpt/ib_srpt.c                  |  333
-rw-r--r--  drivers/infiniband/ulp/srpt/ib_srpt.h                  |   27
54 files changed, 10140 insertions(+), 2141 deletions(-)
diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile
index 437813c7b481..4d0004b58377 100644
--- a/drivers/infiniband/ulp/Makefile
+++ b/drivers/infiniband/ulp/Makefile
@@ -5,3 +5,4 @@ obj-$(CONFIG_INFINIBAND_SRPT) += srpt/
obj-$(CONFIG_INFINIBAND_ISER) += iser/
obj-$(CONFIG_INFINIBAND_ISERT) += isert/
obj-$(CONFIG_INFINIBAND_OPA_VNIC) += opa_vnic/
+obj-$(CONFIG_INFINIBAND_RTRS) += rtrs/
diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig
index 7af68604af77..254e31a90a66 100644
--- a/drivers/infiniband/ulp/ipoib/Kconfig
+++ b/drivers/infiniband/ulp/ipoib/Kconfig
@@ -2,7 +2,7 @@
config INFINIBAND_IPOIB
tristate "IP-over-InfiniBand"
depends on NETDEVICES && INET && (IPV6 || IPV6=n)
- ---help---
+ help
Support for the IP-over-InfiniBand protocol (IPoIB). This
transports IP packets over InfiniBand so you can use your IB
device as a fancy NIC.
@@ -13,7 +13,7 @@ config INFINIBAND_IPOIB_CM
bool "IP-over-InfiniBand Connected Mode support"
depends on INFINIBAND_IPOIB
default n
- ---help---
+ help
This option enables support for IPoIB connected mode. After
enabling this option, you need to switch to connected mode
through /sys/class/net/ibXXX/mode to actually create
@@ -28,7 +28,7 @@ config INFINIBAND_IPOIB_DEBUG
bool "IP-over-InfiniBand debugging" if EXPERT
depends on INFINIBAND_IPOIB
default y
- ---help---
+ help
This option causes debugging code to be compiled into the
IPoIB driver. The output can be turned on via the
debug_level and mcast_debug_level module parameters (which
@@ -42,7 +42,7 @@ config INFINIBAND_IPOIB_DEBUG
config INFINIBAND_IPOIB_DEBUG_DATA
bool "IP-over-InfiniBand data path debugging"
depends on INFINIBAND_IPOIB_DEBUG
- ---help---
+ help
This option compiles debugging code into the data path
of the IPoIB driver. The output can be turned on via the
data_debug_level module parameter; however, even with output
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 2aa3457a30ce..35e9c8a330e2 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -377,8 +377,12 @@ struct ipoib_dev_priv {
struct ipoib_rx_buf *rx_ring;
struct ipoib_tx_buf *tx_ring;
+ /* cyclic ring variables for managing tx_ring, for UD only */
unsigned int tx_head;
unsigned int tx_tail;
+ /* cyclic ring variables for counting overall outstanding send WRs */
+ unsigned int global_tx_head;
+ unsigned int global_tx_tail;
struct ib_sge tx_sge[MAX_SKB_FRAGS + 1];
struct ib_ud_wr tx_wr;
struct ib_wc send_wc[MAX_SEND_CQE];
@@ -407,9 +411,9 @@ struct ipoib_dev_priv {
struct dentry *path_dentry;
#endif
u64 hca_caps;
+ u64 kernel_caps;
struct ipoib_ethtool_st ethtool;
unsigned int max_send_sge;
- bool sm_fullmember_sendonly_support;
const struct net_device_ops *rn_ops;
};
@@ -451,7 +455,7 @@ struct ipoib_neigh {
struct list_head list;
struct ipoib_neigh __rcu *hnext;
struct rcu_head rcu;
- atomic_t refcnt;
+ refcount_t refcnt;
unsigned long alive;
};
@@ -461,7 +465,7 @@ struct ipoib_neigh {
void ipoib_neigh_dtor(struct ipoib_neigh *neigh);
static inline void ipoib_neigh_put(struct ipoib_neigh *neigh)
{
- if (atomic_dec_and_test(&neigh->refcnt))
+ if (refcount_dec_and_test(&neigh->refcnt))
ipoib_neigh_dtor(neigh);
}
struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr);
@@ -498,9 +502,9 @@ void ipoib_reap_ah(struct work_struct *work);
struct ipoib_path *__path_find(struct net_device *dev, void *gid);
void ipoib_mark_paths_invalid(struct net_device *dev);
void ipoib_flush_paths(struct net_device *dev);
-struct net_device *ipoib_intf_alloc(struct ib_device *hca, u8 port,
+struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port,
const char *format);
-int ipoib_intf_init(struct ib_device *hca, u8 port, const char *format,
+int ipoib_intf_init(struct ib_device *hca, u32 port, const char *format,
struct net_device *dev);
void ipoib_ib_tx_timer_func(struct timer_list *t);
void ipoib_ib_dev_flush_light(struct work_struct *work);
@@ -511,7 +515,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev);
int ipoib_ib_dev_open_default(struct net_device *dev);
int ipoib_ib_dev_open(struct net_device *dev);
-int ipoib_ib_dev_stop(struct net_device *dev);
+void ipoib_ib_dev_stop(struct net_device *dev);
void ipoib_ib_dev_up(struct net_device *dev);
void ipoib_ib_dev_down(struct net_device *dev);
int ipoib_ib_dev_stop_default(struct net_device *dev);
@@ -523,7 +527,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);
void ipoib_mcast_restart_task(struct work_struct *work);
void ipoib_mcast_start_thread(struct net_device *dev);
-int ipoib_mcast_stop_thread(struct net_device *dev);
+void ipoib_mcast_stop_thread(struct net_device *dev);
void ipoib_mcast_dev_down(struct net_device *dev);
void ipoib_mcast_dev_flush(struct net_device *dev);
@@ -674,8 +678,6 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc);
#else
-struct ipoib_cm_tx;
-
#define ipoib_max_conn_qp 0
static inline int ipoib_cm_admin_enabled(struct net_device *dev)
@@ -838,6 +840,4 @@ extern int ipoib_debug_level;
#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
-extern const char ipoib_driver_version[];
-
#endif /* _IPOIB_H */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index c59e00a0881f..b610d36295bb 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -465,7 +465,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id,
goto err_qp;
}
- psn = prandom_u32() & 0xffffff;
+ psn = get_random_u32() & 0xffffff;
ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
if (ret)
goto err_modify;
@@ -512,13 +512,13 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
return ipoib_cm_req_handler(cm_id, event);
case IB_CM_DREQ_RECEIVED:
ib_send_cm_drep(cm_id, NULL, 0);
- /* Fall through */
+ fallthrough;
case IB_CM_REJ_RECEIVED:
p = cm_id->context;
priv = ipoib_priv(p->dev);
if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
ipoib_warn(priv, "unable to move qp to error state\n");
- /* Fall through */
+ fallthrough;
default:
return 0;
}
@@ -756,7 +756,8 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
return;
}
- if ((priv->tx_head - priv->tx_tail) == ipoib_sendq_size - 1) {
+ if ((priv->global_tx_head - priv->global_tx_tail) ==
+ ipoib_sendq_size - 1) {
ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
tx->qp->qp_num);
netif_stop_queue(dev);
@@ -786,7 +787,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
} else {
netif_trans_update(dev);
++tx->tx_head;
- ++priv->tx_head;
+ ++priv->global_tx_head;
}
}
@@ -820,10 +821,11 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
netif_tx_lock(dev);
++tx->tx_tail;
- ++priv->tx_tail;
+ ++priv->global_tx_tail;
if (unlikely(netif_queue_stopped(dev) &&
- (priv->tx_head - priv->tx_tail) <= ipoib_sendq_size >> 1 &&
+ ((priv->global_tx_head - priv->global_tx_tail) <=
+ ipoib_sendq_size >> 1) &&
test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)))
netif_wake_queue(dev);
@@ -882,8 +884,8 @@ int ipoib_cm_dev_open(struct net_device *dev)
goto err_cm;
}
- ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
- 0);
+ ret = ib_cm_listen(priv->cm.id,
+ cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num));
if (ret) {
pr_warn("%s: failed to listen on ID 0x%llx\n", priv->ca->name,
IPOIB_CM_IETF_ID | priv->qp->qp_num);
@@ -1120,12 +1122,8 @@ static int ipoib_cm_modify_tx_init(struct net_device *dev,
struct ipoib_dev_priv *priv = ipoib_priv(dev);
struct ib_qp_attr qp_attr;
int qp_attr_mask, ret;
- ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
- if (ret) {
- ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret);
- return ret;
- }
+ qp_attr.pkey_index = priv->pkey_index;
qp_attr.qp_state = IB_QPS_INIT;
qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
qp_attr.port_num = priv->port;
@@ -1232,8 +1230,9 @@ timeout:
dev_kfree_skb_any(tx_req->skb);
netif_tx_lock_bh(p->dev);
++p->tx_tail;
- ++priv->tx_tail;
- if (unlikely(priv->tx_head - priv->tx_tail == ipoib_sendq_size >> 1) &&
+ ++priv->global_tx_tail;
+ if (unlikely((priv->global_tx_head - priv->global_tx_tail) <=
+ ipoib_sendq_size >> 1) &&
netif_queue_stopped(p->dev) &&
test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
netif_wake_queue(p->dev);
@@ -1504,20 +1503,20 @@ static void ipoib_cm_stale_task(struct work_struct *work)
spin_unlock_irq(&priv->lock);
}
-static ssize_t show_mode(struct device *d, struct device_attribute *attr,
+static ssize_t mode_show(struct device *d, struct device_attribute *attr,
char *buf)
{
struct net_device *dev = to_net_dev(d);
struct ipoib_dev_priv *priv = ipoib_priv(dev);
if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
- return sprintf(buf, "connected\n");
+ return sysfs_emit(buf, "connected\n");
else
- return sprintf(buf, "datagram\n");
+ return sysfs_emit(buf, "datagram\n");
}
-static ssize_t set_mode(struct device *d, struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t mode_store(struct device *d, struct device_attribute *attr,
+ const char *buf, size_t count)
{
struct net_device *dev = to_net_dev(d);
int ret;
@@ -1543,7 +1542,7 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr,
return (!ret || ret == -EBUSY) ? count : ret;
}
-static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode);
+static DEVICE_ATTR_RW(mode);
int ipoib_cm_add_mode_attr(struct net_device *dev)
{
@@ -1584,6 +1583,7 @@ int ipoib_cm_dev_init(struct net_device *dev)
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
int max_srq_sge, i;
+ u8 addr;
INIT_LIST_HEAD(&priv->cm.passive_ids);
INIT_LIST_HEAD(&priv->cm.reap_list);
@@ -1637,24 +1637,21 @@ int ipoib_cm_dev_init(struct net_device *dev)
}
}
- priv->dev->dev_addr[0] = IPOIB_FLAGS_RC;
+ addr = IPOIB_FLAGS_RC;
+ dev_addr_mod(dev, 0, &addr, 1);
return 0;
}
void ipoib_cm_dev_cleanup(struct net_device *dev)
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
- int ret;
if (!priv->cm.srq)
return;
ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
- ret = ib_destroy_srq(priv->cm.srq);
- if (ret)
- ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);
-
+ ib_destroy_srq(priv->cm.srq);
priv->cm.srq = NULL;
if (!priv->cm.srq_ring)
return;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
index 63e4f9d15fd9..8af99b18d361 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -65,17 +65,16 @@ static void ipoib_get_drvinfo(struct net_device *netdev,
ib_get_device_fw_str(priv->ca, drvinfo->fw_version);
- strlcpy(drvinfo->bus_info, dev_name(priv->ca->dev.parent),
+ strscpy(drvinfo->bus_info, dev_name(priv->ca->dev.parent),
sizeof(drvinfo->bus_info));
- strlcpy(drvinfo->version, ipoib_driver_version,
- sizeof(drvinfo->version));
-
- strlcpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver));
+ strscpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver));
}
static int ipoib_get_coalesce(struct net_device *dev,
- struct ethtool_coalesce *coal)
+ struct ethtool_coalesce *coal,
+ struct kernel_ethtool_coalesce *kernel_coal,
+ struct netlink_ext_ack *extack)
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
@@ -86,7 +85,9 @@ static int ipoib_get_coalesce(struct net_device *dev,
}
static int ipoib_set_coalesce(struct net_device *dev,
- struct ethtool_coalesce *coal)
+ struct ethtool_coalesce *coal,
+ struct kernel_ethtool_coalesce *kernel_coal,
+ struct netlink_ext_ack *extack)
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
int ret;
@@ -169,6 +170,10 @@ static inline int ib_speed_enum_to_int(int speed)
return SPEED_14000;
case IB_SPEED_EDR:
return SPEED_25000;
+ case IB_SPEED_HDR:
+ return SPEED_50000;
+ case IB_SPEED_NDR:
+ return SPEED_100000;
}
return SPEED_UNKNOWN;
@@ -213,6 +218,8 @@ static int ipoib_get_link_ksettings(struct net_device *netdev,
}
static const struct ethtool_ops ipoib_ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS |
+ ETHTOOL_COALESCE_RX_MAX_FRAMES,
.get_link_ksettings = ipoib_get_link_ksettings,
.get_drvinfo = ipoib_get_drvinfo,
.get_coalesce = ipoib_get_coalesce,
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
index 64c19f6fa931..12ba7a0fe0b5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
@@ -124,35 +124,14 @@ static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr)
return 0;
}
-static const struct seq_operations ipoib_mcg_seq_ops = {
+static const struct seq_operations ipoib_mcg_sops = {
.start = ipoib_mcg_seq_start,
.next = ipoib_mcg_seq_next,
.stop = ipoib_mcg_seq_stop,
.show = ipoib_mcg_seq_show,
};
-static int ipoib_mcg_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- int ret;
-
- ret = seq_open(file, &ipoib_mcg_seq_ops);
- if (ret)
- return ret;
-
- seq = file->private_data;
- seq->private = inode->i_private;
-
- return 0;
-}
-
-static const struct file_operations ipoib_mcg_fops = {
- .owner = THIS_MODULE,
- .open = ipoib_mcg_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release
-};
+DEFINE_SEQ_ATTRIBUTE(ipoib_mcg);
static void *ipoib_path_seq_start(struct seq_file *file, loff_t *pos)
{
@@ -229,35 +208,14 @@ static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr)
return 0;
}
-static const struct seq_operations ipoib_path_seq_ops = {
+static const struct seq_operations ipoib_path_sops = {
.start = ipoib_path_seq_start,
.next = ipoib_path_seq_next,
.stop = ipoib_path_seq_stop,
.show = ipoib_path_seq_show,
};
-static int ipoib_path_open(struct inode *inode, struct file *file)
-{
- struct seq_file *seq;
- int ret;
-
- ret = seq_open(file, &ipoib_path_seq_ops);
- if (ret)
- return ret;
-
- seq = file->private_data;
- seq->private = inode->i_private;
-
- return 0;
-}
-
-static const struct file_operations ipoib_path_fops = {
- .owner = THIS_MODULE,
- .open = ipoib_path_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release
-};
+DEFINE_SEQ_ATTRIBUTE(ipoib_path);
void ipoib_create_debug_files(struct net_device *dev)
{
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index c332b4761816..ed25061fac62 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -407,9 +407,11 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
dev_kfree_skb_any(tx_req->skb);
++priv->tx_tail;
+ ++priv->global_tx_tail;
if (unlikely(netif_queue_stopped(dev) &&
- ((priv->tx_head - priv->tx_tail) <= ipoib_sendq_size >> 1) &&
+ ((priv->global_tx_head - priv->global_tx_tail) <=
+ ipoib_sendq_size >> 1) &&
test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)))
netif_wake_queue(dev);
@@ -571,7 +573,7 @@ int ipoib_send(struct net_device *dev, struct sk_buff *skb,
unsigned int usable_sge = priv->max_send_sge - !!skb_headlen(skb);
if (skb_is_gso(skb)) {
- hlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ hlen = skb_tcp_all_headers(skb);
phead = skb->data;
if (unlikely(!skb_pull(skb, hlen))) {
ipoib_warn(priv, "linear data too small\n");
@@ -634,7 +636,8 @@ int ipoib_send(struct net_device *dev, struct sk_buff *skb,
else
priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
/* increase the tx_head after send success, but use it for queue state */
- if (priv->tx_head - priv->tx_tail == ipoib_sendq_size - 1) {
+ if ((priv->global_tx_head - priv->global_tx_tail) ==
+ ipoib_sendq_size - 1) {
ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
netif_stop_queue(dev);
}
@@ -662,17 +665,17 @@ int ipoib_send(struct net_device *dev, struct sk_buff *skb,
rc = priv->tx_head;
++priv->tx_head;
+ ++priv->global_tx_head;
}
return rc;
}
-static void __ipoib_reap_ah(struct net_device *dev)
+static void ipoib_reap_dead_ahs(struct ipoib_dev_priv *priv)
{
- struct ipoib_dev_priv *priv = ipoib_priv(dev);
struct ipoib_ah *ah, *tah;
unsigned long flags;
- netif_tx_lock_bh(dev);
+ netif_tx_lock_bh(priv->dev);
spin_lock_irqsave(&priv->lock, flags);
list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
@@ -683,37 +686,37 @@ static void __ipoib_reap_ah(struct net_device *dev)
}
spin_unlock_irqrestore(&priv->lock, flags);
- netif_tx_unlock_bh(dev);
+ netif_tx_unlock_bh(priv->dev);
}
void ipoib_reap_ah(struct work_struct *work)
{
struct ipoib_dev_priv *priv =
container_of(work, struct ipoib_dev_priv, ah_reap_task.work);
- struct net_device *dev = priv->dev;
- __ipoib_reap_ah(dev);
+ ipoib_reap_dead_ahs(priv);
if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
queue_delayed_work(priv->wq, &priv->ah_reap_task,
round_jiffies_relative(HZ));
}
-static void ipoib_flush_ah(struct net_device *dev)
+static void ipoib_start_ah_reaper(struct ipoib_dev_priv *priv)
{
- struct ipoib_dev_priv *priv = ipoib_priv(dev);
-
- cancel_delayed_work(&priv->ah_reap_task);
- flush_workqueue(priv->wq);
- ipoib_reap_ah(&priv->ah_reap_task.work);
+ clear_bit(IPOIB_STOP_REAPER, &priv->flags);
+ queue_delayed_work(priv->wq, &priv->ah_reap_task,
+ round_jiffies_relative(HZ));
}
-static void ipoib_stop_ah(struct net_device *dev)
+static void ipoib_stop_ah_reaper(struct ipoib_dev_priv *priv)
{
- struct ipoib_dev_priv *priv = ipoib_priv(dev);
-
set_bit(IPOIB_STOP_REAPER, &priv->flags);
- ipoib_flush_ah(dev);
+ cancel_delayed_work(&priv->ah_reap_task);
+ /*
+ * After ipoib_stop_ah_reaper() we always go through
+ * ipoib_reap_dead_ahs() which ensures the work is really stopped and
+ * does a final flush out of the dead_ah's list
+ */
}
static int recvs_pending(struct net_device *dev)
@@ -807,6 +810,7 @@ int ipoib_ib_dev_stop_default(struct net_device *dev)
ipoib_dma_unmap_tx(priv, tx_req);
dev_kfree_skb_any(tx_req->skb);
++priv->tx_tail;
+ ++priv->global_tx_tail;
}
for (i = 0; i < ipoib_recvq_size; ++i) {
@@ -841,18 +845,6 @@ timeout:
return 0;
}
-int ipoib_ib_dev_stop(struct net_device *dev)
-{
- struct ipoib_dev_priv *priv = ipoib_priv(dev);
-
- priv->rn_ops->ndo_stop(dev);
-
- clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
- ipoib_flush_ah(dev);
-
- return 0;
-}
-
int ipoib_ib_dev_open_default(struct net_device *dev)
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
@@ -896,10 +888,7 @@ int ipoib_ib_dev_open(struct net_device *dev)
return -1;
}
- clear_bit(IPOIB_STOP_REAPER, &priv->flags);
- queue_delayed_work(priv->wq, &priv->ah_reap_task,
- round_jiffies_relative(HZ));
-
+ ipoib_start_ah_reaper(priv);
if (priv->rn_ops->ndo_open(dev)) {
pr_warn("%s: Failed to open dev\n", dev->name);
goto dev_stop;
@@ -910,13 +899,20 @@ int ipoib_ib_dev_open(struct net_device *dev)
return 0;
dev_stop:
- set_bit(IPOIB_STOP_REAPER, &priv->flags);
- cancel_delayed_work(&priv->ah_reap_task);
- set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
- ipoib_ib_dev_stop(dev);
+ ipoib_stop_ah_reaper(priv);
return -1;
}
+void ipoib_ib_dev_stop(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = ipoib_priv(dev);
+
+ priv->rn_ops->ndo_stop(dev);
+
+ clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
+ ipoib_stop_ah_reaper(priv);
+}
+
void ipoib_pkey_dev_check_presence(struct net_device *dev)
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
@@ -1061,13 +1057,11 @@ static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
{
union ib_gid search_gid;
union ib_gid gid0;
- union ib_gid *netdev_gid;
int err;
u16 index;
- u8 port;
+ u32 port;
bool ret = false;
- netdev_gid = (union ib_gid *)(priv->dev->dev_addr + 4);
if (rdma_query_gid(priv->ca, priv->port, 0, &gid0))
return false;
@@ -1077,7 +1071,8 @@ static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
* to do it later
*/
priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix;
- netdev_gid->global.subnet_prefix = gid0.global.subnet_prefix;
+ dev_addr_mod(priv->dev, 4, (u8 *)&gid0.global.subnet_prefix,
+ sizeof(gid0.global.subnet_prefix));
search_gid.global.subnet_prefix = gid0.global.subnet_prefix;
search_gid.global.interface_id = priv->local_gid.global.interface_id;
@@ -1114,7 +1109,7 @@ static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
* if he sets the device address back to be based on GID index 0,
* he no longer wishs to control it.
*
- * If the user doesn't control the the device address,
+ * If the user doesn't control the device address,
* IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed it means
* the port GUID has changed and GID at index 0 has changed
* so we need to change priv->local_gid and priv->dev->dev_addr
@@ -1139,8 +1134,8 @@ static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) {
memcpy(&priv->local_gid, &gid0,
sizeof(priv->local_gid));
- memcpy(priv->dev->dev_addr + 4, &gid0,
- sizeof(priv->local_gid));
+ dev_addr_mod(priv->dev, 4, (u8 *)&gid0,
+ sizeof(priv->local_gid));
ret = true;
}
}
@@ -1227,7 +1222,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
ipoib_mcast_dev_flush(dev);
if (oper_up)
set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
- ipoib_flush_ah(dev);
+ ipoib_reap_dead_ahs(priv);
}
if (level >= IPOIB_FLUSH_NORMAL)
@@ -1302,7 +1297,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)
* the neighbor garbage collection is stopped and reaped.
* That should all be done now, so make a final ah flush.
*/
- ipoib_stop_ah(dev);
+ ipoib_reap_dead_ahs(priv);
clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 4a0d3a9e72e1..ac25fc80fb33 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -52,10 +52,6 @@
#include <linux/inetdevice.h>
#include <rdma/ib_cache.h>
-#define DRV_VERSION "1.0.0"
-
-const char ipoib_driver_version[] = DRV_VERSION;
-
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");
@@ -90,11 +86,11 @@ struct workqueue_struct *ipoib_workqueue;
struct ib_sa_client ipoib_sa_client;
-static void ipoib_add_one(struct ib_device *device);
+static int ipoib_add_one(struct ib_device *device);
static void ipoib_remove_one(struct ib_device *device, void *client_data);
static void ipoib_neigh_reclaim(struct rcu_head *rp);
static struct net_device *ipoib_get_net_dev_by_params(
- struct ib_device *dev, u8 port, u16 pkey,
+ struct ib_device *dev, u32 port, u16 pkey,
const union ib_gid *gid, const struct sockaddr *addr,
void *client_data);
static int ipoib_set_mac(struct net_device *dev, void *addr);
@@ -145,8 +141,6 @@ int ipoib_open(struct net_device *dev)
set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
- priv->sm_fullmember_sendonly_support = false;
-
if (ipoib_ib_dev_open(dev)) {
if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
return 0;
@@ -170,8 +164,13 @@ int ipoib_open(struct net_device *dev)
dev_change_flags(cpriv->dev, flags | IFF_UP, NULL);
}
up_read(&priv->vlan_rwsem);
- }
+ } else if (priv->parent) {
+ struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
+ if (!test_bit(IPOIB_FLAG_ADMIN_UP, &ppriv->flags))
+ ipoib_dbg(priv, "parent device %s is not up, so child device may be not functioning.\n",
+ ppriv->dev->name);
+ }
netif_start_queue(dev);
return 0;
@@ -317,7 +316,7 @@ static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr,
return false;
}
-/**
+/*
* Find the master net_device on top of the given net_device.
* @dev: base IPoIB net_device
*
@@ -346,9 +345,10 @@ struct ipoib_walk_data {
struct net_device *result;
};
-static int ipoib_upper_walk(struct net_device *upper, void *_data)
+static int ipoib_upper_walk(struct net_device *upper,
+ struct netdev_nested_priv *priv)
{
- struct ipoib_walk_data *data = _data;
+ struct ipoib_walk_data *data = (struct ipoib_walk_data *)priv->data;
int ret = 0;
if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) {
@@ -361,8 +361,9 @@ static int ipoib_upper_walk(struct net_device *upper, void *_data)
}
/**
- * Find a net_device matching the given address, which is an upper device of
- * the given net_device.
+ * ipoib_get_net_dev_match_addr - Find a net_device matching
+ * the given address, which is an upper device of the given net_device.
+ *
* @addr: IP address to look for.
* @dev: base IPoIB net_device
*
@@ -372,10 +373,12 @@ static int ipoib_upper_walk(struct net_device *upper, void *_data)
static struct net_device *ipoib_get_net_dev_match_addr(
const struct sockaddr *addr, struct net_device *dev)
{
+ struct netdev_nested_priv priv;
struct ipoib_walk_data data = {
.addr = addr,
};
+ priv.data = (void *)&data;
rcu_read_lock();
if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
dev_hold(dev);
@@ -383,7 +386,7 @@ static struct net_device *ipoib_get_net_dev_match_addr(
goto out;
}
- netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &data);
+ netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &priv);
out:
rcu_read_unlock();
return data.result;
@@ -441,7 +444,7 @@ static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
/* Returns the number of matching net_devs found (between 0 and 2). Also
* return the matching net_device in the @net_dev parameter, holding a
* reference to the net_device, if the number of matches >= 1 */
-static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
+static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u32 port,
u16 pkey_index,
const union ib_gid *gid,
const struct sockaddr *addr,
@@ -466,7 +469,7 @@ static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
}
static struct net_device *ipoib_get_net_dev_by_params(
- struct ib_device *dev, u8 port, u16 pkey,
+ struct ib_device *dev, u32 port, u16 pkey,
const union ib_gid *gid, const struct sockaddr *addr,
void *client_data)
{
@@ -483,9 +486,6 @@ static struct net_device *ipoib_get_net_dev_by_params(
if (ret)
return NULL;
- if (!dev_list)
- return NULL;
-
/* See if we can find a unique device matching the L2 parameters */
matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
gid, NULL, &net_dev);
@@ -509,7 +509,7 @@ static struct net_device *ipoib_get_net_dev_by_params(
default:
dev_warn_ratelimited(&dev->dev,
"duplicate IP address detected\n");
- /* Fall through */
+ fallthrough;
case 1:
return net_dev;
}
@@ -533,6 +533,7 @@ int ipoib_set_mode(struct net_device *dev, const char *buf)
"will cause multicast packet drops\n");
netdev_update_features(dev);
dev_set_mtu(dev, ipoib_cm_max_mtu(dev));
+ netif_set_real_num_tx_queues(dev, 1);
rtnl_unlock();
priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
@@ -544,6 +545,7 @@ int ipoib_set_mode(struct net_device *dev, const char *buf)
clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
netdev_update_features(dev);
dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
+ netif_set_real_num_tx_queues(dev, dev->num_tx_queues);
rtnl_unlock();
ipoib_flush_paths(dev);
return (!rtnl_trylock()) ? -EBUSY : 0;
@@ -740,7 +742,7 @@ void ipoib_flush_paths(struct net_device *dev)
static void path_rec_completion(int status,
struct sa_path_rec *pathrec,
- void *path_ptr)
+ int num_prs, void *path_ptr)
{
struct ipoib_path *path = path_ptr;
struct net_device *dev = path->dev;
@@ -1185,12 +1187,19 @@ unref:
static void ipoib_timeout(struct net_device *dev, unsigned int txqueue)
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
+ struct rdma_netdev *rn = netdev_priv(dev);
+ if (rn->tx_timeout) {
+ rn->tx_timeout(dev, txqueue);
+ return;
+ }
ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
jiffies_to_msecs(jiffies - dev_trans_start(dev)));
- ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
- netif_queue_stopped(dev),
- priv->tx_head, priv->tx_tail);
+ ipoib_warn(priv,
+ "queue stopped %d, tx_head %u, tx_tail %u, global_tx_head %u, global_tx_tail %u\n",
+ netif_queue_stopped(dev), priv->tx_head, priv->tx_tail,
+ priv->global_tx_head, priv->global_tx_tail);
+
/* XXX reset QP, etc. */
}
@@ -1279,7 +1288,7 @@ struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
neigh = rcu_dereference_bh(neigh->hnext)) {
if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
/* found, take one ref on behalf of the caller */
- if (!atomic_inc_not_zero(&neigh->refcnt)) {
+ if (!refcount_inc_not_zero(&neigh->refcnt)) {
/* deleted */
neigh = NULL;
goto out_unlock;
@@ -1374,7 +1383,7 @@ static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
INIT_LIST_HEAD(&neigh->list);
ipoib_cm_set(neigh, NULL);
/* one ref on behalf of the caller */
- atomic_set(&neigh->refcnt, 1);
+ refcount_set(&neigh->refcnt, 1);
return neigh;
}
@@ -1406,7 +1415,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
lockdep_is_held(&priv->lock))) {
if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
/* found, take one ref on behalf of the caller */
- if (!atomic_inc_not_zero(&neigh->refcnt)) {
+ if (!refcount_inc_not_zero(&neigh->refcnt)) {
/* deleted */
neigh = NULL;
break;
@@ -1421,7 +1430,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
goto out_unlock;
/* one ref on behalf of the hash table */
- atomic_inc(&neigh->refcnt);
+ refcount_inc(&neigh->refcnt);
neigh->alive = jiffies;
/* put in hash */
rcu_assign_pointer(neigh->hnext,
@@ -1655,8 +1664,10 @@ static void ipoib_napi_add(struct net_device *dev)
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
- netif_napi_add(dev, &priv->recv_napi, ipoib_rx_poll, IPOIB_NUM_WC);
- netif_napi_add(dev, &priv->send_napi, ipoib_tx_poll, MAX_SEND_CQE);
+ netif_napi_add_weight(dev, &priv->recv_napi, ipoib_rx_poll,
+ IPOIB_NUM_WC);
+ netif_napi_add_weight(dev, &priv->send_napi, ipoib_tx_poll,
+ MAX_SEND_CQE);
}
static void ipoib_napi_del(struct net_device *dev)
@@ -1687,6 +1698,7 @@ static void ipoib_dev_uninit_default(struct net_device *dev)
static int ipoib_dev_init_default(struct net_device *dev)
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
+ u8 addr_mod[3];
ipoib_napi_add(dev);
@@ -1705,7 +1717,7 @@ static int ipoib_dev_init_default(struct net_device *dev)
goto out_rx_ring_cleanup;
}
- /* priv->tx_head, tx_tail & tx_outstanding are already 0 */
+ /* priv->tx_head, tx_tail and global_tx_tail/head are already 0 */
if (ipoib_transport_dev_init(dev, priv->ca)) {
pr_warn("%s: ipoib_transport_dev_init failed\n",
@@ -1714,9 +1726,10 @@ static int ipoib_dev_init_default(struct net_device *dev)
}
/* after qp created set dev address */
- priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
- priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff;
- priv->dev->dev_addr[3] = (priv->qp->qp_num) & 0xff;
+ addr_mod[0] = (priv->qp->qp_num >> 16) & 0xff;
+ addr_mod[1] = (priv->qp->qp_num >> 8) & 0xff;
+ addr_mod[2] = (priv->qp->qp_num) & 0xff;
+ dev_addr_mod(priv->dev, 1, addr_mod, sizeof(addr_mod));
return 0;
@@ -1736,10 +1749,10 @@ static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr,
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
- if (!priv->rn_ops->ndo_do_ioctl)
+ if (!priv->rn_ops->ndo_eth_ioctl)
return -EOPNOTSUPP;
- return priv->rn_ops->ndo_do_ioctl(dev, ifr, cmd);
+ return priv->rn_ops->ndo_eth_ioctl(dev, ifr, cmd);
}
static int ipoib_dev_init(struct net_device *dev)
@@ -1839,11 +1852,12 @@ static void ipoib_parent_unregister_pre(struct net_device *ndev)
static void ipoib_set_dev_features(struct ipoib_dev_priv *priv)
{
priv->hca_caps = priv->ca->attrs.device_cap_flags;
+ priv->kernel_caps = priv->ca->attrs.kernel_cap_flags;
if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
- if (priv->hca_caps & IB_DEVICE_UD_TSO)
+ if (priv->kernel_caps & IBK_UD_TSO)
priv->dev->hw_features |= NETIF_F_TSO;
priv->dev->features |= priv->dev->hw_features;
@@ -1862,7 +1876,7 @@ static int ipoib_parent_init(struct net_device *ndev)
priv->port);
return result;
}
- priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+ priv->max_ib_mtu = rdma_mtu_from_attr(priv->ca, priv->port, &attr);
result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
if (result) {
@@ -1877,8 +1891,7 @@ static int ipoib_parent_init(struct net_device *ndev)
priv->ca->name, priv->port, result);
return result;
}
- memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw,
- sizeof(union ib_gid));
+ dev_addr_mod(priv->dev, 4, priv->local_gid.raw, sizeof(union ib_gid));
SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent);
priv->dev->dev_port = priv->port - 1;
@@ -1895,14 +1908,22 @@ static void ipoib_child_init(struct net_device *ndev)
priv->max_ib_mtu = ppriv->max_ib_mtu;
set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
- memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
- memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid));
+ if (memchr_inv(priv->dev->dev_addr, 0, INFINIBAND_ALEN))
+ memcpy(&priv->local_gid, priv->dev->dev_addr + 4,
+ sizeof(priv->local_gid));
+ else {
+ __dev_addr_set(priv->dev, ppriv->dev->dev_addr,
+ INFINIBAND_ALEN);
+ memcpy(&priv->local_gid, &ppriv->local_gid,
+ sizeof(priv->local_gid));
+ }
}
static int ipoib_ndo_init(struct net_device *ndev)
{
struct ipoib_dev_priv *priv = ipoib_priv(ndev);
int rc;
+ struct rdma_netdev *rn = netdev_priv(ndev);
if (priv->parent) {
ipoib_child_init(ndev);
@@ -1915,6 +1936,7 @@ static int ipoib_ndo_init(struct net_device *ndev)
/* MTU will be reset when mcast join happens */
ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
priv->mcast_mtu = priv->admin_mtu = ndev->mtu;
+ rn->mtu = priv->mcast_mtu;
ndev->max_mtu = IPOIB_CM_MTU;
ndev->neigh_priv_len = sizeof(struct ipoib_neigh);
@@ -1977,7 +1999,8 @@ static void ipoib_ndo_uninit(struct net_device *dev)
/* no more works over the priv->wq */
if (priv->wq) {
- flush_workqueue(priv->wq);
+ /* See ipoib_mcast_carrier_on_task() */
+ WARN_ON(test_bit(IPOIB_FLAG_OPER_UP, &priv->flags));
destroy_workqueue(priv->wq);
priv->wq = NULL;
}
@@ -2058,7 +2081,7 @@ static const struct net_device_ops ipoib_netdev_ops_pf = {
.ndo_set_vf_guid = ipoib_set_vf_guid,
.ndo_set_mac_address = ipoib_set_mac,
.ndo_get_stats64 = ipoib_get_stats,
- .ndo_do_ioctl = ipoib_ioctl,
+ .ndo_eth_ioctl = ipoib_ioctl,
};
static const struct net_device_ops ipoib_netdev_ops_vf = {
@@ -2073,12 +2096,20 @@ static const struct net_device_ops ipoib_netdev_ops_vf = {
.ndo_set_rx_mode = ipoib_set_mcast_list,
.ndo_get_iflink = ipoib_get_iflink,
.ndo_get_stats64 = ipoib_get_stats,
- .ndo_do_ioctl = ipoib_ioctl,
+ .ndo_eth_ioctl = ipoib_ioctl,
+};
+
+static const struct net_device_ops ipoib_netdev_default_pf = {
+ .ndo_init = ipoib_dev_init_default,
+ .ndo_uninit = ipoib_dev_uninit_default,
+ .ndo_open = ipoib_ib_dev_open_default,
+ .ndo_stop = ipoib_ib_dev_stop_default,
};
void ipoib_setup_common(struct net_device *dev)
{
dev->header_ops = &ipoib_header_ops;
+ dev->netdev_ops = &ipoib_netdev_default_pf;
ipoib_set_ethtool_ops(dev);
@@ -2128,14 +2159,7 @@ static void ipoib_build_priv(struct net_device *dev)
INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
}
-static const struct net_device_ops ipoib_netdev_default_pf = {
- .ndo_init = ipoib_dev_init_default,
- .ndo_uninit = ipoib_dev_uninit_default,
- .ndo_open = ipoib_ib_dev_open_default,
- .ndo_stop = ipoib_ib_dev_stop_default,
-};
-
-static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u8 port,
+static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u32 port,
const char *name)
{
struct net_device *dev;
@@ -2152,7 +2176,7 @@ static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u8 port,
return dev;
}
-int ipoib_intf_init(struct ib_device *hca, u8 port, const char *name,
+int ipoib_intf_init(struct ib_device *hca, u32 port, const char *name,
struct net_device *dev)
{
struct rdma_netdev *rn = netdev_priv(dev);
@@ -2172,7 +2196,6 @@ int ipoib_intf_init(struct ib_device *hca, u8 port, const char *name,
if (rc != -EOPNOTSUPP)
goto out;
- dev->netdev_ops = &ipoib_netdev_default_pf;
rn->send = ipoib_send;
rn->attach_mcast = ipoib_mcast_attach;
rn->detach_mcast = ipoib_mcast_detach;
@@ -2181,7 +2204,7 @@ int ipoib_intf_init(struct ib_device *hca, u8 port, const char *name,
priv->rn_ops = dev->netdev_ops;
- if (hca->attrs.device_cap_flags & IB_DEVICE_VIRTUAL_FUNCTION)
+ if (hca->attrs.kernel_cap_flags & IBK_VIRTUAL_FUNCTION)
dev->netdev_ops = &ipoib_netdev_ops_vf;
else
dev->netdev_ops = &ipoib_netdev_ops_pf;
@@ -2204,7 +2227,7 @@ out:
return rc;
}
-struct net_device *ipoib_intf_alloc(struct ib_device *hca, u8 port,
+struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port,
const char *name)
{
struct net_device *dev;
@@ -2249,23 +2272,24 @@ void ipoib_intf_free(struct net_device *dev)
kfree(priv);
}
-static ssize_t show_pkey(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t pkey_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
struct net_device *ndev = to_net_dev(dev);
struct ipoib_dev_priv *priv = ipoib_priv(ndev);
- return sprintf(buf, "0x%04x\n", priv->pkey);
+ return sysfs_emit(buf, "0x%04x\n", priv->pkey);
}
-static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
+static DEVICE_ATTR_RO(pkey);
-static ssize_t show_umcast(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t umcast_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
struct net_device *ndev = to_net_dev(dev);
struct ipoib_dev_priv *priv = ipoib_priv(ndev);
- return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
+ return sysfs_emit(buf, "%d\n",
+ test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
}
void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
@@ -2280,9 +2304,8 @@ void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
}
-static ssize_t set_umcast(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t umcast_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
unsigned long umcast_val = simple_strtoul(buf, NULL, 0);
@@ -2290,7 +2313,7 @@ static ssize_t set_umcast(struct device *dev,
return count;
}
-static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);
+static DEVICE_ATTR_RW(umcast);
int ipoib_add_umcast_attr(struct net_device *dev)
{
@@ -2307,7 +2330,7 @@ static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid)
memcpy(&priv->local_gid.global.interface_id,
&gid->global.interface_id,
sizeof(gid->global.interface_id));
- memcpy(netdev->dev_addr + 4, &priv->local_gid, sizeof(priv->local_gid));
+ dev_addr_mod(netdev, 4, (u8 *)&priv->local_gid, sizeof(priv->local_gid));
clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
netif_addr_unlock_bh(netdev);
@@ -2361,9 +2384,9 @@ static int ipoib_set_mac(struct net_device *dev, void *addr)
return 0;
}
-static ssize_t create_child(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t create_child_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
int pkey;
int ret;
@@ -2378,11 +2401,11 @@ static ssize_t create_child(struct device *dev,
return ret ? ret : count;
}
-static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);
+static DEVICE_ATTR_WO(create_child);
-static ssize_t delete_child(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t delete_child_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
int pkey;
int ret;
@@ -2398,7 +2421,7 @@ static ssize_t delete_child(struct device *dev,
return ret ? ret : count;
}
-static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);
+static DEVICE_ATTR_WO(delete_child);
int ipoib_add_pkey_attr(struct net_device *dev)
{
@@ -2435,7 +2458,7 @@ static ssize_t dev_id_show(struct device *dev,
"\"%s\" wants to know my dev_id. Should it look at dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more info.\n",
current->comm);
- return sprintf(buf, "%#x\n", ndev->dev_id);
+ return sysfs_emit(buf, "%#x\n", ndev->dev_id);
}
static DEVICE_ATTR_RO(dev_id);
@@ -2446,7 +2469,7 @@ static int ipoib_intercept_dev_id_attr(struct net_device *dev)
}
static struct net_device *ipoib_add_port(const char *format,
- struct ib_device *hca, u8 port)
+ struct ib_device *hca, u32 port)
{
struct rtnl_link_ops *ops = ipoib_get_link_ops();
struct rdma_netdev_alloc_params params;
@@ -2469,6 +2492,8 @@ static struct net_device *ipoib_add_port(const char *format,
/* call event handler to ensure pkey in sync */
queue_work(ipoib_workqueue, &priv->flush_heavy);
+ ndev->rtnl_link_ops = ipoib_get_link_ops();
+
result = register_netdev(ndev);
if (result) {
pr_warn("%s: couldn't register ipoib port %d; error %d\n",
@@ -2518,7 +2543,7 @@ sysfs_failed:
return ERR_PTR(-ENOMEM);
}
-static void ipoib_add_one(struct ib_device *device)
+static int ipoib_add_one(struct ib_device *device)
{
struct list_head *dev_list;
struct net_device *dev;
@@ -2528,7 +2553,7 @@ static void ipoib_add_one(struct ib_device *device)
dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL);
if (!dev_list)
- return;
+ return -ENOMEM;
INIT_LIST_HEAD(dev_list);
@@ -2545,10 +2570,11 @@ static void ipoib_add_one(struct ib_device *device)
if (!count) {
kfree(dev_list);
- return;
+ return -EOPNOTSUPP;
}
ib_set_client_data(device, &ipoib_client, dev_list);
+ return 0;
}
static void ipoib_remove_one(struct ib_device *device, void *client_data)
@@ -2556,9 +2582,6 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data)
struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv;
struct list_head *dev_list = client_data;
- if (!dev_list)
- return;
-
list_for_each_entry_safe(priv, tmp, dev_list, list) {
LIST_HEAD(head);
ipoib_parent_unregister_pre(priv->dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index b9e9562f5034..5b3154503bf4 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -135,12 +135,11 @@ static void ipoib_mcast_free(struct ipoib_mcast *mcast)
kfree(mcast);
}
-static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
- int can_sleep)
+static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev)
{
struct ipoib_mcast *mcast;
- mcast = kzalloc(sizeof(*mcast), can_sleep ? GFP_KERNEL : GFP_ATOMIC);
+ mcast = kzalloc(sizeof(*mcast), GFP_ATOMIC);
if (!mcast)
return NULL;
@@ -218,6 +217,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
struct rdma_ah_attr av;
int ret;
int set_qkey = 0;
+ int mtu;
mcast->mcmember = *mcmember;
@@ -240,13 +240,12 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
priv->broadcast->mcmember.flow_label = mcmember->flow_label;
priv->broadcast->mcmember.hop_limit = mcmember->hop_limit;
/* assume if the admin and the mcast are the same both can be changed */
+ mtu = rdma_mtu_enum_to_int(priv->ca, priv->port,
+ priv->broadcast->mcmember.mtu);
if (priv->mcast_mtu == priv->admin_mtu)
- priv->admin_mtu =
- priv->mcast_mtu =
- IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
- else
- priv->mcast_mtu =
- IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
+ priv->admin_mtu = IPOIB_UD_MTU(mtu);
+ priv->mcast_mtu = IPOIB_UD_MTU(mtu);
+ rn->mtu = priv->mcast_mtu;
priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
spin_unlock_irq(&priv->lock);
@@ -276,7 +275,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
memset(&av, 0, sizeof(av));
av.type = rdma_ah_find_type(priv->ca, priv->port);
- rdma_ah_set_dlid(&av, be16_to_cpu(mcast->mcmember.mlid)),
+ rdma_ah_set_dlid(&av, be16_to_cpu(mcast->mcmember.mlid));
rdma_ah_set_port_num(&av, priv->port);
rdma_ah_set_sl(&av, mcast->mcmember.sl);
rdma_ah_set_static_rate(&av, mcast->mcmember.rate);
@@ -335,15 +334,6 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work)
return;
}
/*
- * Check if can send sendonly MCG's with sendonly-fullmember join state.
- * It done here after the successfully join to the broadcast group,
- * because the broadcast group must always be joined first and is always
- * re-joined if the SM changes substantially.
- */
- priv->sm_fullmember_sendonly_support =
- ib_sa_sendonly_fullmem_support(&ipoib_sa_client,
- priv->ca, priv->port);
- /*
* Take rtnl_lock to avoid racing with ipoib_stop() and
* turning the carrier back on while a device is being
* removed. However, ipoib_stop() will attempt to flush
@@ -538,9 +528,7 @@ static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
* most closely emulates the behavior, from a user space
* application perspective, of Ethernet multicast operation.
*/
- if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
- priv->sm_fullmember_sendonly_support)
- /* SM supports sendonly-fullmember, otherwise fallback to full-member */
+ if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
rec.join_state = SENDONLY_FULLMEMBER_JOIN;
}
spin_unlock_irq(&priv->lock);
@@ -599,7 +587,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
if (!priv->broadcast) {
struct ipoib_mcast *broadcast;
- broadcast = ipoib_mcast_alloc(dev, 0);
+ broadcast = ipoib_mcast_alloc(dev);
if (!broadcast) {
ipoib_warn(priv, "failed to allocate broadcast group\n");
/*
@@ -681,15 +669,13 @@ void ipoib_mcast_start_thread(struct net_device *dev)
spin_unlock_irqrestore(&priv->lock, flags);
}
-int ipoib_mcast_stop_thread(struct net_device *dev)
+void ipoib_mcast_stop_thread(struct net_device *dev)
{
struct ipoib_dev_priv *priv = ipoib_priv(dev);
ipoib_dbg_mcast(priv, "stopping multicast thread\n");
cancel_delayed_work_sync(&priv->mcast_task);
-
- return 0;
}
static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
@@ -782,7 +768,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n",
mgid);
- mcast = ipoib_mcast_alloc(dev, 0);
+ mcast = ipoib_mcast_alloc(dev);
if (!mcast) {
ipoib_warn(priv, "unable to allocate memory "
"for multicast structure\n");
@@ -936,7 +922,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
ipoib_dbg_mcast(priv, "adding multicast entry for mgid %pI6\n",
mgid.raw);
- nmcast = ipoib_mcast_alloc(dev, 0);
+ nmcast = ipoib_mcast_alloc(dev);
if (!nmcast) {
ipoib_warn(priv, "unable to allocate memory for multicast structure\n");
continue;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
index 38c984d16996..ea16ba5d8da6 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
@@ -32,7 +32,6 @@
#include <linux/netdevice.h>
#include <linux/if_arp.h> /* For ARPHRD_xxx */
-#include <linux/module.h>
#include <net/rtnetlink.h>
#include "ipoib.h"
@@ -144,6 +143,16 @@ static int ipoib_new_child_link(struct net *src_net, struct net_device *dev,
return 0;
}
+static void ipoib_del_child_link(struct net_device *dev, struct list_head *head)
+{
+ struct ipoib_dev_priv *priv = ipoib_priv(dev);
+
+ if (!priv->parent)
+ return;
+
+ unregister_netdevice_queue(dev, head);
+}
+
static size_t ipoib_get_size(const struct net_device *dev)
{
return nla_total_size(2) + /* IFLA_IPOIB_PKEY */
@@ -153,11 +162,13 @@ static size_t ipoib_get_size(const struct net_device *dev)
static struct rtnl_link_ops ipoib_link_ops __read_mostly = {
.kind = "ipoib",
+ .netns_refund = true,
.maxtype = IFLA_IPOIB_MAX,
.policy = ipoib_policy,
.priv_size = sizeof(struct ipoib_dev_priv),
.setup = ipoib_setup_common,
.newlink = ipoib_new_child_link,
+ .dellink = ipoib_del_child_link,
.changelink = ipoib_changelink,
.get_size = ipoib_get_size,
.fill_info = ipoib_fill_info,
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index b69304d28f06..368e5d77416d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -158,6 +158,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
int ret, size, req_vec;
int i;
+ static atomic_t counter;
size = ipoib_recvq_size + 1;
ret = ipoib_cm_dev_init(dev);
@@ -171,8 +172,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
if (ret != -EOPNOTSUPP)
return ret;
- req_vec = (priv->port - 1) * 2;
-
+ req_vec = atomic_inc_return(&counter) * 2;
cq_attr.cqe = size;
cq_attr.comp_vector = req_vec % priv->ca->num_comp_vectors;
priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_rx_completion, NULL,
@@ -197,15 +197,18 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
init_attr.send_cq = priv->send_cq;
init_attr.recv_cq = priv->recv_cq;
- if (priv->hca_caps & IB_DEVICE_UD_TSO)
+ if (priv->kernel_caps & IBK_UD_TSO)
init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
- if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK)
+ if (priv->kernel_caps & IBK_BLOCK_MULTICAST_LOOPBACK)
init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING)
init_attr.create_flags |= IB_QP_CREATE_NETIF_QP;
+ if (priv->kernel_caps & IBK_RDMA_NETDEV_OPA)
+ init_attr.create_flags |= IB_QP_CREATE_NETDEV_USE;
+
priv->qp = ib_create_qp(priv->pd, &init_attr);
if (IS_ERR(priv->qp)) {
pr_warn("%s: failed to create QP\n", ca->name);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index 8ac8e18fbe0c..4bd161e86f8d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*/
-#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/init.h>
@@ -40,15 +39,15 @@
#include "ipoib.h"
-static ssize_t show_parent(struct device *d, struct device_attribute *attr,
+static ssize_t parent_show(struct device *d, struct device_attribute *attr,
char *buf)
{
struct net_device *dev = to_net_dev(d);
struct ipoib_dev_priv *priv = ipoib_priv(dev);
- return sprintf(buf, "%s\n", priv->parent->name);
+ return sysfs_emit(buf, "%s\n", priv->parent->name);
}
-static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL);
+static DEVICE_ATTR_RO(parent);
static bool is_child_unique(struct ipoib_dev_priv *ppriv,
struct ipoib_dev_priv *priv)
@@ -97,6 +96,7 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
{
struct net_device *ndev = priv->dev;
int result;
+ struct rdma_netdev *rn = netdev_priv(ndev);
ASSERT_RTNL();
@@ -117,6 +117,8 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
goto out_early;
}
+ rn->mtu = priv->mcast_mtu;
+
priv->parent = ppriv->dev;
priv->pkey = pkey;
priv->child_type = type;
@@ -192,6 +194,8 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
}
priv = ipoib_priv(ndev);
+ ndev->rtnl_link_ops = ipoib_get_link_ops();
+
result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD);
if (result && ndev->reg_state == NETREG_UNINITIALIZED)
diff --git a/drivers/infiniband/ulp/iser/Kconfig b/drivers/infiniband/ulp/iser/Kconfig
index 1d29dffeaff9..3016a0c9a9f0 100644
--- a/drivers/infiniband/ulp/iser/Kconfig
+++ b/drivers/infiniband/ulp/iser/Kconfig
@@ -3,7 +3,7 @@ config INFINIBAND_ISER
tristate "iSCSI Extensions for RDMA (iSER)"
depends on SCSI && INET && INFINIBAND_ADDR_TRANS
select SCSI_ISCSI_ATTRS
- ---help---
+ help
Support for the iSCSI Extensions for RDMA (iSER) Protocol
over InfiniBand. This allows you to access storage devices
that speak iSCSI over iSER over InfiniBand.
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 3690e28cc7ea..620ae5b2d80d 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -89,13 +89,20 @@ int iser_debug_level = 0;
module_param_named(debug_level, iser_debug_level, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)");
+static int iscsi_iser_set(const char *val, const struct kernel_param *kp);
+static const struct kernel_param_ops iscsi_iser_size_ops = {
+ .set = iscsi_iser_set,
+ .get = param_get_uint,
+};
+
static unsigned int iscsi_max_lun = 512;
-module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO);
-MODULE_PARM_DESC(max_lun, "Max LUNs to allow per session (default:512");
+module_param_cb(max_lun, &iscsi_iser_size_ops, &iscsi_max_lun, S_IRUGO);
+MODULE_PARM_DESC(max_lun, "Max LUNs to allow per session, should > 0 (default:512)");
unsigned int iser_max_sectors = ISER_DEF_MAX_SECTORS;
-module_param_named(max_sectors, iser_max_sectors, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(max_sectors, "Max number of sectors in a single scsi command (default:1024");
+module_param_cb(max_sectors, &iscsi_iser_size_ops, &iser_max_sectors,
+ S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(max_sectors, "Max number of sectors in a single scsi command, should > 0 (default:1024)");
bool iser_always_reg = true;
module_param_named(always_register, iser_always_reg, bool, S_IRUGO);
@@ -106,9 +113,17 @@ bool iser_pi_enable = false;
module_param_named(pi_enable, iser_pi_enable, bool, S_IRUGO);
MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)");
-int iser_pi_guard;
-module_param_named(pi_guard, iser_pi_guard, int, S_IRUGO);
-MODULE_PARM_DESC(pi_guard, "T10-PI guard_type [deprecated]");
+static int iscsi_iser_set(const char *val, const struct kernel_param *kp)
+{
+ int ret;
+ unsigned int n = 0;
+
+ ret = kstrtouint(val, 10, &n);
+ if (ret != 0 || n == 0)
+ return -EINVAL;
+
+ return param_set_uint(val, kp);
+}
/*
* iscsi_iser_recv() - Process a successful recv completion
@@ -120,9 +135,8 @@ MODULE_PARM_DESC(pi_guard, "T10-PI guard_type [deprecated]");
* Notes: In case of data length errors or iscsi PDU completion failures
* this routine will signal iscsi layer of connection failure.
*/
-void
-iscsi_iser_recv(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
- char *rx_data, int rx_data_len)
+void iscsi_iser_recv(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
+ char *rx_data, int rx_data_len)
{
int rc = 0;
int datalen;
@@ -157,8 +171,7 @@ error:
 * Notes: This routine can't fail, just assign iscsi task
* hdr and max hdr size.
*/
-static int
-iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)
+static int iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)
{
struct iscsi_iser_task *iser_task = task->dd_data;
@@ -179,31 +192,21 @@ iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)
* state mutex to avoid dereferencing the IB device which
* may have already been terminated.
*/
-int
-iser_initialize_task_headers(struct iscsi_task *task,
- struct iser_tx_desc *tx_desc)
+int iser_initialize_task_headers(struct iscsi_task *task,
+ struct iser_tx_desc *tx_desc)
{
struct iser_conn *iser_conn = task->conn->dd_data;
struct iser_device *device = iser_conn->ib_conn.device;
struct iscsi_iser_task *iser_task = task->dd_data;
u64 dma_addr;
- const bool mgmt_task = !task->sc && !in_interrupt();
- int ret = 0;
- if (unlikely(mgmt_task))
- mutex_lock(&iser_conn->state_mutex);
-
- if (unlikely(iser_conn->state != ISER_CONN_UP)) {
- ret = -ENODEV;
- goto out;
- }
+ if (unlikely(iser_conn->state != ISER_CONN_UP))
+ return -ENODEV;
dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc,
ISER_HEADERS_LEN, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(device->ib_device, dma_addr)) {
- ret = -ENOMEM;
- goto out;
- }
+ if (ib_dma_mapping_error(device->ib_device, dma_addr))
+ return -ENOMEM;
tx_desc->inv_wr.next = NULL;
tx_desc->reg_wr.wr.next = NULL;
@@ -214,11 +217,8 @@ iser_initialize_task_headers(struct iscsi_task *task,
tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey;
iser_task->iser_conn = iser_conn;
-out:
- if (unlikely(mgmt_task))
- mutex_unlock(&iser_conn->state_mutex);
- return ret;
+ return 0;
}
/**
@@ -230,8 +230,7 @@ out:
* Return: Returns zero on success or -ENOMEM when failing
* to init task headers (dma mapping error).
*/
-static int
-iscsi_iser_task_init(struct iscsi_task *task)
+static int iscsi_iser_task_init(struct iscsi_task *task)
{
struct iscsi_iser_task *iser_task = task->dd_data;
int ret;
@@ -265,8 +264,8 @@ iscsi_iser_task_init(struct iscsi_task *task)
* xmit.
*
**/
-static int
-iscsi_iser_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task)
+static int iscsi_iser_mtask_xmit(struct iscsi_conn *conn,
+ struct iscsi_task *task)
{
int error = 0;
@@ -283,9 +282,8 @@ iscsi_iser_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task)
return error;
}
-static int
-iscsi_iser_task_xmit_unsol_data(struct iscsi_conn *conn,
- struct iscsi_task *task)
+static int iscsi_iser_task_xmit_unsol_data(struct iscsi_conn *conn,
+ struct iscsi_task *task)
{
struct iscsi_r2t_info *r2t = &task->unsol_r2t;
struct iscsi_data hdr;
@@ -319,8 +317,7 @@ iscsi_iser_task_xmit_unsol_data_exit:
*
* Return: zero on success or escalates $error on failure.
*/
-static int
-iscsi_iser_task_xmit(struct iscsi_task *task)
+static int iscsi_iser_task_xmit(struct iscsi_task *task)
{
struct iscsi_conn *conn = task->conn;
struct iscsi_iser_task *iser_task = task->dd_data;
@@ -403,8 +400,7 @@ static void iscsi_iser_cleanup_task(struct iscsi_task *task)
*
* In addition the error sector is marked.
*/
-static u8
-iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector)
+static u8 iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector)
{
struct iscsi_iser_task *iser_task = task->dd_data;
enum iser_data_dir dir = iser_task->dir[ISER_DIR_IN] ?
@@ -453,11 +449,9 @@ iscsi_iser_conn_create(struct iscsi_cls_session *cls_session,
 * -EINVAL in case end-point doesn't exist anymore or iser connection
* state is not UP (teardown already started).
*/
-static int
-iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
- struct iscsi_cls_conn *cls_conn,
- uint64_t transport_eph,
- int is_leading)
+static int iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
+ struct iscsi_cls_conn *cls_conn,
+ uint64_t transport_eph, int is_leading)
{
struct iscsi_conn *conn = cls_conn->dd_data;
struct iser_conn *iser_conn;
@@ -499,6 +493,7 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
iser_conn->iscsi_conn = conn;
out:
+ iscsi_put_endpoint(ep);
mutex_unlock(&iser_conn->state_mutex);
return error;
}
@@ -511,8 +506,7 @@ out:
* from this point iscsi must call conn_stop in session/connection
* teardown so iser transport must wait for it.
*/
-static int
-iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn)
+static int iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn)
{
struct iscsi_conn *iscsi_conn;
struct iser_conn *iser_conn;
@@ -534,8 +528,7 @@ iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn)
* handle, so we call it under iser the state lock to protect against
* this kind of race.
*/
-static void
-iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
+static void iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
{
struct iscsi_conn *conn = cls_conn->dd_data;
struct iser_conn *iser_conn = conn->dd_data;
@@ -570,26 +563,31 @@ iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
*
* Removes and free iscsi host.
*/
-static void
-iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session)
+static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session)
{
struct Scsi_Host *shost = iscsi_session_to_shost(cls_session);
iscsi_session_teardown(cls_session);
- iscsi_host_remove(shost);
+ iscsi_host_remove(shost, false);
iscsi_host_free(shost);
}
-static inline unsigned int
-iser_dif_prot_caps(int prot_caps)
+static inline unsigned int iser_dif_prot_caps(int prot_caps)
{
- return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ?
- SHOST_DIF_TYPE1_PROTECTION | SHOST_DIX_TYPE0_PROTECTION |
- SHOST_DIX_TYPE1_PROTECTION : 0) |
- ((prot_caps & IB_PROT_T10DIF_TYPE_2) ?
- SHOST_DIF_TYPE2_PROTECTION | SHOST_DIX_TYPE2_PROTECTION : 0) |
- ((prot_caps & IB_PROT_T10DIF_TYPE_3) ?
- SHOST_DIF_TYPE3_PROTECTION | SHOST_DIX_TYPE3_PROTECTION : 0);
+ int ret = 0;
+
+ if (prot_caps & IB_PROT_T10DIF_TYPE_1)
+ ret |= SHOST_DIF_TYPE1_PROTECTION |
+ SHOST_DIX_TYPE0_PROTECTION |
+ SHOST_DIX_TYPE1_PROTECTION;
+ if (prot_caps & IB_PROT_T10DIF_TYPE_2)
+ ret |= SHOST_DIF_TYPE2_PROTECTION |
+ SHOST_DIX_TYPE2_PROTECTION;
+ if (prot_caps & IB_PROT_T10DIF_TYPE_3)
+ ret |= SHOST_DIF_TYPE3_PROTECTION |
+ SHOST_DIX_TYPE3_PROTECTION;
+
+ return ret;
}
/**
@@ -652,7 +650,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
SHOST_DIX_GUARD_CRC);
}
- if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG))
+ if (!(ib_dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG))
shost->virt_boundary_mask = SZ_4K - 1;
if (iscsi_host_add(shost, ib_dev->dev.parent)) {
@@ -687,15 +685,14 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
return cls_session;
remove_host:
- iscsi_host_remove(shost);
+ iscsi_host_remove(shost, false);
free_host:
iscsi_host_free(shost);
return NULL;
}
-static int
-iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn,
- enum iscsi_param param, char *buf, int buflen)
+static int iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn,
+ enum iscsi_param param, char *buf, int buflen)
{
int value;
@@ -739,14 +736,14 @@ iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn,
}
/**
- * iscsi_iser_set_param() - set class connection parameter
+ * iscsi_iser_conn_get_stats() - get iscsi connection statistics
* @cls_conn: iscsi class connection
* @stats: iscsi stats to output
*
* Output connection statistics.
*/
-static void
-iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *stats)
+static void iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn,
+ struct iscsi_stats *stats)
{
struct iscsi_conn *conn = cls_conn->dd_data;
@@ -797,9 +794,9 @@ static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep,
* Return: iscsi_endpoint created by iscsi layer or ERR_PTR(error)
* if fails.
*/
-static struct iscsi_endpoint *
-iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr,
- int non_blocking)
+static struct iscsi_endpoint *iscsi_iser_ep_connect(struct Scsi_Host *shost,
+ struct sockaddr *dst_addr,
+ int non_blocking)
{
int err;
struct iser_conn *iser_conn;
@@ -842,8 +839,7 @@ failure:
 * or more likely iser connection state transitioned to TERMINATING or
* DOWN during the wait period.
*/
-static int
-iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
+static int iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
{
struct iser_conn *iser_conn = ep->dd_data;
int rc;
@@ -878,8 +874,7 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
* and cleanup or actually call it immediately in case we didn't pass
* iscsi conn bind/start stage, thus it is safe.
*/
-static void
-iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep)
+static void iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep)
{
struct iser_conn *iser_conn = ep->dd_data;
@@ -976,6 +971,7 @@ static struct scsi_host_template iscsi_iser_sht = {
.proc_name = "iscsi_iser",
.this_id = -1,
.track_queue_depth = 1,
+ .cmd_size = sizeof(struct iscsi_cmd),
};
static struct iscsi_transport iscsi_iser_transport = {
@@ -988,6 +984,7 @@ static struct iscsi_transport iscsi_iser_transport = {
/* connection management */
.create_conn = iscsi_iser_conn_create,
.bind_conn = iscsi_iser_conn_bind,
+ .unbind_conn = iscsi_conn_unbind,
.destroy_conn = iscsi_conn_teardown,
.attr_is_visible = iser_attr_is_visible,
.set_param = iscsi_iser_set_param,
@@ -1021,11 +1018,6 @@ static int __init iser_init(void)
iser_dbg("Starting iSER datamover...\n");
- if (iscsi_max_lun < 1) {
- iser_err("Invalid max_lun value of %u\n", iscsi_max_lun);
- return -EINVAL;
- }
-
memset(&ig, 0, sizeof(struct iser_global));
ig.desc_cache = kmem_cache_create("iser_descriptors",
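Note on the max_lun/max_sectors hunks above: the patch replaces a one-time range check in iser_init() with a kernel_param_ops .set callback, so invalid values are rejected whenever the parameter is written, not only at module load. Below is a minimal, illustrative sketch of the same pattern (the parameter name my_size and its description are hypothetical, not part of the patch):

#include <linux/kernel.h>
#include <linux/moduleparam.h>

static unsigned int my_size = 512;

/* Reject non-numeric input and zero before the value is stored. */
static int my_size_set(const char *val, const struct kernel_param *kp)
{
        unsigned int n;
        int ret;

        ret = kstrtouint(val, 10, &n);
        if (ret || n == 0)
                return -EINVAL;

        return param_set_uint(val, kp);
}

static const struct kernel_param_ops my_size_ops = {
        .set = my_size_set,
        .get = param_get_uint,
};

module_param_cb(my_size, &my_size_ops, &my_size, 0444);
MODULE_PARM_DESC(my_size, "A strictly positive size (default:512)");

Writing "0" or garbage to such a parameter fails with -EINVAL, which is why the explicit max_lun check in iser_init() could be dropped.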
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 029c00163442..dee8c97ff056 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -65,7 +65,6 @@
#include <linux/in6.h>
#include <rdma/ib_verbs.h>
-#include <rdma/ib_fmr_pool.h>
#include <rdma/rdma_cm.h>
#define DRV_NAME "iser"
@@ -120,8 +119,6 @@
#define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX)
-#define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2)
-
/* the max TX (send) WR supported by the iSER QP is defined by *
* max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect *
* to have at max for SCSI command. The tx posting & completion handling code *
@@ -149,8 +146,6 @@
- ISER_MAX_RX_MISC_PDUS) / \
(1 + ISER_INFLIGHT_DATAOUTS))
-#define ISER_SIGNAL_CMD_COUNT 32
-
/* Constant PDU lengths calculations */
#define ISER_HEADERS_LEN (sizeof(struct iser_ctrl) + sizeof(struct iscsi_hdr))
@@ -208,12 +203,12 @@ struct iser_reg_resources;
*
* @sge: memory region sg element
* @rkey: memory region remote key
- * @mem_h: pointer to registration context (FMR/Fastreg)
+ * @desc: pointer to fast registration context
*/
struct iser_mem_reg {
- struct ib_sge sge;
- u32 rkey;
- void *mem_h;
+ struct ib_sge sge;
+ u32 rkey;
+ struct iser_fr_desc *desc;
};
enum iser_desc_type {
@@ -298,46 +293,6 @@ struct iser_login_desc {
struct iser_conn;
struct ib_conn;
-struct iscsi_iser_task;
-
-/**
- * struct iser_comp - iSER completion context
- *
- * @cq: completion queue
- * @active_qps: Number of active QPs attached
- * to completion context
- */
-struct iser_comp {
- struct ib_cq *cq;
- int active_qps;
-};
-
-/**
- * struct iser_reg_ops - Memory registration operations
- * per-device registration schemes
- *
- * @alloc_reg_res: Allocate registration resources
- * @free_reg_res: Free registration resources
- * @reg_mem: Register memory buffers
- * @unreg_mem: Un-register memory buffers
- * @reg_desc_get: Get a registration descriptor for pool
- * @reg_desc_put: Get a registration descriptor to pool
- */
-struct iser_reg_ops {
- int (*alloc_reg_res)(struct ib_conn *ib_conn,
- unsigned cmds_max,
- unsigned int size);
- void (*free_reg_res)(struct ib_conn *ib_conn);
- int (*reg_mem)(struct iscsi_iser_task *iser_task,
- struct iser_data_buf *mem,
- struct iser_reg_resources *rsc,
- struct iser_mem_reg *reg);
- void (*unreg_mem)(struct iscsi_iser_task *iser_task,
- enum iser_data_dir cmd_dir);
- struct iser_fr_desc * (*reg_desc_get)(struct ib_conn *ib_conn);
- void (*reg_desc_put)(struct ib_conn *ib_conn,
- struct iser_fr_desc *desc);
-};
/**
* struct iser_device - iSER device handle
@@ -348,11 +303,6 @@ struct iser_reg_ops {
* @event_handler: IB events handle routine
* @ig_list: entry in devices list
* @refcount: Reference counter, dominated by open iser connections
- * @comps_used: Number of completion contexts used, Min between online
- * cpus and device max completion vectors
- * @comps: Dinamically allocated array of completion handlers
- * @reg_ops: Registration ops
- * @remote_inv_sup: Remote invalidate is supported on this device
*/
struct iser_device {
struct ib_device *ib_device;
@@ -360,28 +310,18 @@ struct iser_device {
struct ib_event_handler event_handler;
struct list_head ig_list;
int refcount;
- int comps_used;
- struct iser_comp *comps;
- const struct iser_reg_ops *reg_ops;
- bool remote_inv_sup;
};
/**
* struct iser_reg_resources - Fast registration resources
*
* @mr: memory region
- * @fmr_pool: pool of fmrs
* @sig_mr: signature memory region
- * @page_vec: fast reg page list used by fmr pool
* @mr_valid: is mr valid indicator
*/
struct iser_reg_resources {
- union {
- struct ib_mr *mr;
- struct ib_fmr_pool *fmr_pool;
- };
+ struct ib_mr *mr;
struct ib_mr *sig_mr;
- struct iser_page_vec *page_vec;
u8 mr_valid:1;
};
@@ -391,6 +331,7 @@ struct iser_reg_resources {
* @list: entry in connection fastreg pool
* @rsc: data buffer registration resources
* @sig_protected: is region protected indicator
+ * @all_list: first and last list members
*/
struct iser_fr_desc {
struct list_head list;
@@ -403,8 +344,9 @@ struct iser_fr_desc {
* struct iser_fr_pool - connection fast registration pool
*
* @list: list of fastreg descriptors
- * @lock: protects fmr/fastreg pool
+ * @lock: protects fastreg pool
* @size: size of the pool
+ * @all_list: first and last list members
*/
struct iser_fr_pool {
struct list_head list;
@@ -418,23 +360,19 @@ struct iser_fr_pool {
*
 * @cma_id: rdma_cm connection manager handle
* @qp: Connection Queue-pair
- * @post_recv_buf_count: post receive counter
- * @sig_count: send work request signal count
- * @rx_wr: receive work request for batch posts
+ * @cq: Connection completion queue
+ * @cq_size: The number of max outstanding completions
* @device: reference to iser device
- * @comp: iser completion context
- * @fr_pool: connection fast registration poool
+ * @fr_pool: connection fast registration pool
* @pi_support: Indicate device T10-PI support
* @reg_cqe: completion handler
*/
struct ib_conn {
struct rdma_cm_id *cma_id;
struct ib_qp *qp;
- int post_recv_buf_count;
- u8 sig_count;
- struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX];
+ struct ib_cq *cq;
+ u32 cq_size;
struct iser_device *device;
- struct iser_comp *comp;
struct iser_fr_pool fr_pool;
bool pi_support;
struct ib_cqe reg_cqe;
@@ -449,8 +387,6 @@ struct ib_conn {
* @state: connection logical state
* @qp_max_recv_dtos: maximum number of data outs, corresponds
* to max number of post recvs
- * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1)
- * @min_posted_rx: (qp_max_recv_dtos >> 2)
* @max_cmds: maximum cmds allowed for this connection
* @name: connection peer portal
 * @release_work: deferred work for release job
@@ -461,7 +397,6 @@ struct ib_conn {
* (state is ISER_CONN_UP)
* @conn_list: entry in ig conn list
* @login_desc: login descriptor
- * @rx_desc_head: head of rx_descs cyclic buffer
* @rx_descs: rx buffers array (cyclic buffer)
* @num_rx_descs: number of rx descriptors
* @scsi_sg_tablesize: scsi host sg_tablesize
@@ -474,8 +409,6 @@ struct iser_conn {
struct iscsi_endpoint *ep;
enum iser_conn_state state;
unsigned qp_max_recv_dtos;
- unsigned qp_max_recv_dtos_mask;
- unsigned min_posted_rx;
u16 max_cmds;
char name[ISER_OBJECT_NAME_SIZE];
struct work_struct release_work;
@@ -485,7 +418,6 @@ struct iser_conn {
struct completion up_completion;
struct list_head conn_list;
struct iser_login_desc login_desc;
- unsigned int rx_desc_head;
struct iser_rx_desc *rx_descs;
u32 num_rx_descs;
unsigned short scsi_sg_tablesize;
@@ -518,12 +450,6 @@ struct iscsi_iser_task {
struct iser_data_buf prot[ISER_DIRS_NUM];
};
-struct iser_page_vec {
- u64 *pages;
- int npages;
- struct ib_mr fake_mr;
-};
-
/**
* struct iser_global - iSER global context
*
@@ -544,12 +470,9 @@ struct iser_global {
extern struct iser_global ig;
extern int iser_debug_level;
extern bool iser_pi_enable;
-extern int iser_pi_guard;
extern unsigned int iser_max_sectors;
extern bool iser_always_reg;
-int iser_assign_reg_ops(struct iser_device *device);
-
int iser_send_control(struct iscsi_conn *conn,
struct iscsi_task *task);
@@ -591,60 +514,40 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
struct iser_data_buf *mem,
enum iser_data_dir cmd_dir);
-int iser_reg_rdma_mem(struct iscsi_iser_task *task,
- enum iser_data_dir dir,
- bool all_imm);
-void iser_unreg_rdma_mem(struct iscsi_iser_task *task,
- enum iser_data_dir dir);
+int iser_reg_mem_fastreg(struct iscsi_iser_task *task,
+ enum iser_data_dir dir,
+ bool all_imm);
+void iser_unreg_mem_fastreg(struct iscsi_iser_task *task,
+ enum iser_data_dir dir);
int iser_connect(struct iser_conn *iser_conn,
struct sockaddr *src_addr,
struct sockaddr *dst_addr,
int non_blocking);
-void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
- enum iser_data_dir cmd_dir);
-void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
- enum iser_data_dir cmd_dir);
-
int iser_post_recvl(struct iser_conn *iser_conn);
-int iser_post_recvm(struct iser_conn *iser_conn, int count);
-int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
- bool signal);
+int iser_post_recvm(struct iser_conn *iser_conn,
+ struct iser_rx_desc *rx_desc);
+int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc);
int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
- struct iser_data_buf *data,
enum iser_data_dir iser_dir,
enum dma_data_direction dma_dir);
void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task,
- struct iser_data_buf *data,
- enum dma_data_direction dir);
+ enum iser_data_dir iser_dir,
+ enum dma_data_direction dma_dir);
int iser_initialize_task_headers(struct iscsi_task *task,
struct iser_tx_desc *tx_desc);
int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
struct iscsi_session *session);
-int iser_alloc_fmr_pool(struct ib_conn *ib_conn,
- unsigned cmds_max,
- unsigned int size);
-void iser_free_fmr_pool(struct ib_conn *ib_conn);
int iser_alloc_fastreg_pool(struct ib_conn *ib_conn,
unsigned cmds_max,
unsigned int size);
void iser_free_fastreg_pool(struct ib_conn *ib_conn);
u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
enum iser_data_dir cmd_dir, sector_t *sector);
-struct iser_fr_desc *
-iser_reg_desc_get_fr(struct ib_conn *ib_conn);
-void
-iser_reg_desc_put_fr(struct ib_conn *ib_conn,
- struct iser_fr_desc *desc);
-struct iser_fr_desc *
-iser_reg_desc_get_fmr(struct ib_conn *ib_conn);
-void
-iser_reg_desc_put_fmr(struct ib_conn *ib_conn,
- struct iser_fr_desc *desc);
static inline struct iser_conn *
to_iser_conn(struct ib_conn *ib_conn)
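The helpers kept at the end of this header, such as to_iser_conn(), depend on ib_conn being embedded inside iser_conn and recover the outer object with container_of(). A generic sketch of that idiom, with illustrative names (outer/inner) rather than the driver's:

#include <linux/kernel.h>       /* container_of() */
#include <linux/types.h>

struct inner {
        u32 cq_size;
};

struct outer {
        int state;
        struct inner ib;        /* embedded member, like ib_conn inside iser_conn */
};

/* Map a pointer to the embedded member back to its containing object. */
static inline struct outer *to_outer(struct inner *i)
{
        return container_of(i, struct outer, ib);
}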
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 4a7045bb0831..7b83f48f60c5 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -52,30 +52,17 @@ static int iser_prepare_read_cmd(struct iscsi_task *task)
struct iser_mem_reg *mem_reg;
int err;
struct iser_ctrl *hdr = &iser_task->desc.iser_header;
- struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN];
err = iser_dma_map_task_data(iser_task,
- buf_in,
ISER_DIR_IN,
DMA_FROM_DEVICE);
if (err)
return err;
- if (scsi_prot_sg_count(iser_task->sc)) {
- struct iser_data_buf *pbuf_in = &iser_task->prot[ISER_DIR_IN];
-
- err = iser_dma_map_task_data(iser_task,
- pbuf_in,
- ISER_DIR_IN,
- DMA_FROM_DEVICE);
- if (err)
- return err;
- }
-
- err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN, false);
+ err = iser_reg_mem_fastreg(iser_task, ISER_DIR_IN, false);
if (err) {
iser_err("Failed to set up Data-IN RDMA\n");
- return err;
+ goto out_err;
}
mem_reg = &iser_task->rdma_reg[ISER_DIR_IN];
@@ -88,6 +75,10 @@ static int iser_prepare_read_cmd(struct iscsi_task *task)
(unsigned long long)mem_reg->sge.addr);
return 0;
+
+out_err:
+ iser_dma_unmap_task_data(iser_task, ISER_DIR_IN, DMA_FROM_DEVICE);
+ return err;
}
/* Register user buffer memory and initialize passive rdma
@@ -95,11 +86,8 @@ static int iser_prepare_read_cmd(struct iscsi_task *task)
* task->data[ISER_DIR_OUT].data_len, Protection size
* is stored at task->prot[ISER_DIR_OUT].data_len
*/
-static int
-iser_prepare_write_cmd(struct iscsi_task *task,
- unsigned int imm_sz,
- unsigned int unsol_sz,
- unsigned int edtl)
+static int iser_prepare_write_cmd(struct iscsi_task *task, unsigned int imm_sz,
+ unsigned int unsol_sz, unsigned int edtl)
{
struct iscsi_iser_task *iser_task = task->dd_data;
struct iser_mem_reg *mem_reg;
@@ -109,28 +97,16 @@ iser_prepare_write_cmd(struct iscsi_task *task,
struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1];
err = iser_dma_map_task_data(iser_task,
- buf_out,
ISER_DIR_OUT,
DMA_TO_DEVICE);
if (err)
return err;
- if (scsi_prot_sg_count(iser_task->sc)) {
- struct iser_data_buf *pbuf_out = &iser_task->prot[ISER_DIR_OUT];
-
- err = iser_dma_map_task_data(iser_task,
- pbuf_out,
- ISER_DIR_OUT,
- DMA_TO_DEVICE);
- if (err)
- return err;
- }
-
- err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT,
- buf_out->data_len == imm_sz);
- if (err != 0) {
+ err = iser_reg_mem_fastreg(iser_task, ISER_DIR_OUT,
+ buf_out->data_len == imm_sz);
+ if (err) {
iser_err("Failed to register write cmd RDMA mem\n");
- return err;
+ goto out_err;
}
mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT];
@@ -157,11 +133,15 @@ iser_prepare_write_cmd(struct iscsi_task *task,
}
return 0;
+
+out_err:
+ iser_dma_unmap_task_data(iser_task, ISER_DIR_OUT, DMA_TO_DEVICE);
+ return err;
}
/* creates a new tx descriptor and adds header regd buffer */
-static void iser_create_send_desc(struct iser_conn *iser_conn,
- struct iser_tx_desc *tx_desc)
+static void iser_create_send_desc(struct iser_conn *iser_conn,
+ struct iser_tx_desc *tx_desc)
{
struct iser_device *device = iser_conn->ib_conn.device;
@@ -247,11 +227,9 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
struct iser_device *device = ib_conn->device;
iser_conn->qp_max_recv_dtos = session->cmds_max;
- iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */
- iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2;
- if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max,
- iser_conn->pages_per_mr))
+ if (iser_alloc_fastreg_pool(ib_conn, session->scsi_cmds_max,
+ iser_conn->pages_per_mr))
goto create_rdma_reg_res_failed;
if (iser_alloc_login_buf(iser_conn))
@@ -280,7 +258,6 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
rx_sg->lkey = device->pd->local_dma_lkey;
}
- iser_conn->rx_desc_head = 0;
return 0;
rx_desc_dma_map_failed:
@@ -293,7 +270,7 @@ rx_desc_dma_map_failed:
rx_desc_alloc_fail:
iser_free_login_buf(iser_conn);
alloc_login_buf_fail:
- device->reg_ops->free_reg_res(ib_conn);
+ iser_free_fastreg_pool(ib_conn);
create_rdma_reg_res_failed:
iser_err("failed allocating rx descriptors / data buffers\n");
return -ENOMEM;
@@ -306,8 +283,7 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn)
struct ib_conn *ib_conn = &iser_conn->ib_conn;
struct iser_device *device = ib_conn->device;
- if (device->reg_ops->free_reg_res)
- device->reg_ops->free_reg_res(ib_conn);
+ iser_free_fastreg_pool(ib_conn);
rx_desc = iser_conn->rx_descs;
for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++)
@@ -323,37 +299,35 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn)
static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req)
{
struct iser_conn *iser_conn = conn->dd_data;
- struct ib_conn *ib_conn = &iser_conn->ib_conn;
struct iscsi_session *session = conn->session;
+ int err = 0;
+ int i;
iser_dbg("req op %x flags %x\n", req->opcode, req->flags);
/* check if this is the last login - going to full feature phase */
if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE)
- return 0;
-
- /*
- * Check that there is one posted recv buffer
- * (for the last login response).
- */
- WARN_ON(ib_conn->post_recv_buf_count != 1);
+ goto out;
if (session->discovery_sess) {
iser_info("Discovery session, re-using login RX buffer\n");
- return 0;
- } else
- iser_info("Normal session, posting batch of RX %d buffers\n",
- iser_conn->min_posted_rx);
-
- /* Initial post receive buffers */
- if (iser_post_recvm(iser_conn, iser_conn->min_posted_rx))
- return -ENOMEM;
+ goto out;
+ }
- return 0;
-}
+ iser_info("Normal session, posting batch of RX %d buffers\n",
+ iser_conn->qp_max_recv_dtos - 1);
-static inline bool iser_signal_comp(u8 sig_count)
-{
- return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0);
+ /*
+ * Initial post receive buffers.
+ * There is one already posted recv buffer (for the last login
+ * response). Therefore, the first recv buffer is skipped here.
+ */
+ for (i = 1; i < iser_conn->qp_max_recv_dtos; i++) {
+ err = iser_post_recvm(iser_conn, &iser_conn->rx_descs[i]);
+ if (err)
+ goto out;
+ }
+out:
+ return err;
}
/**
@@ -361,8 +335,7 @@ static inline bool iser_signal_comp(u8 sig_count)
* @conn: link to matching iscsi connection
* @task: SCSI command task
*/
-int iser_send_command(struct iscsi_conn *conn,
- struct iscsi_task *task)
+int iser_send_command(struct iscsi_conn *conn, struct iscsi_task *task)
{
struct iser_conn *iser_conn = conn->dd_data;
struct iscsi_iser_task *iser_task = task->dd_data;
@@ -372,7 +345,6 @@ int iser_send_command(struct iscsi_conn *conn,
struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr;
struct scsi_cmnd *sc = task->sc;
struct iser_tx_desc *tx_desc = &iser_task->desc;
- u8 sig_count = ++iser_conn->ib_conn.sig_count;
edtl = ntohl(hdr->data_length);
@@ -419,8 +391,7 @@ int iser_send_command(struct iscsi_conn *conn,
iser_task->status = ISER_TASK_STATUS_STARTED;
- err = iser_post_send(&iser_conn->ib_conn, tx_desc,
- iser_signal_comp(sig_count));
+ err = iser_post_send(&iser_conn->ib_conn, tx_desc);
if (!err)
return 0;
@@ -435,8 +406,7 @@ send_command_error:
* @task: SCSI command task
* @hdr: pointer to the LLD's iSCSI message header
*/
-int iser_send_data_out(struct iscsi_conn *conn,
- struct iscsi_task *task,
+int iser_send_data_out(struct iscsi_conn *conn, struct iscsi_task *task,
struct iscsi_data *hdr)
{
struct iser_conn *iser_conn = conn->dd_data;
@@ -488,7 +458,7 @@ int iser_send_data_out(struct iscsi_conn *conn,
itt, buf_offset, data_seg_len);
- err = iser_post_send(&iser_conn->ib_conn, tx_desc, true);
+ err = iser_post_send(&iser_conn->ib_conn, tx_desc);
if (!err)
return 0;
@@ -498,8 +468,7 @@ send_data_out_error:
return err;
}
-int iser_send_control(struct iscsi_conn *conn,
- struct iscsi_task *task)
+int iser_send_control(struct iscsi_conn *conn, struct iscsi_task *task)
{
struct iser_conn *iser_conn = conn->dd_data;
struct iscsi_iser_task *iser_task = task->dd_data;
@@ -551,7 +520,7 @@ int iser_send_control(struct iscsi_conn *conn,
goto send_control_error;
}
- err = iser_post_send(&iser_conn->ib_conn, mdesc, true);
+ err = iser_post_send(&iser_conn->ib_conn, mdesc);
if (!err)
return 0;
@@ -568,6 +537,7 @@ void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc)
struct iscsi_hdr *hdr;
char *data;
int length;
+ bool full_feature_phase;
if (unlikely(wc->status != IB_WC_SUCCESS)) {
iser_err_comp(wc, "login_rsp");
@@ -581,6 +551,9 @@ void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc)
hdr = desc->rsp + sizeof(struct iser_ctrl);
data = desc->rsp + ISER_HEADERS_LEN;
length = wc->byte_len - ISER_HEADERS_LEN;
+ full_feature_phase = ((hdr->flags & ISCSI_FULL_FEATURE_PHASE) ==
+ ISCSI_FULL_FEATURE_PHASE) &&
+ (hdr->flags & ISCSI_FLAG_CMD_FINAL);
iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,
hdr->itt, length);
@@ -591,11 +564,15 @@ void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc)
desc->rsp_dma, ISER_RX_LOGIN_SIZE,
DMA_FROM_DEVICE);
- ib_conn->post_recv_buf_count--;
+ if (!full_feature_phase ||
+ iser_conn->iscsi_conn->session->discovery_sess)
+ return;
+
+ /* Post the first RX buffer that is skipped in iser_post_rx_bufs() */
+ iser_post_recvm(iser_conn, iser_conn->rx_descs);
}
-static inline int
-iser_inv_desc(struct iser_fr_desc *desc, u32 rkey)
+static inline int iser_inv_desc(struct iser_fr_desc *desc, u32 rkey)
{
if (unlikely((!desc->sig_protected && rkey != desc->rsc.mr->rkey) ||
(desc->sig_protected && rkey != desc->rsc.sig_mr->rkey))) {
@@ -608,10 +585,8 @@ iser_inv_desc(struct iser_fr_desc *desc, u32 rkey)
return 0;
}
-static int
-iser_check_remote_inv(struct iser_conn *iser_conn,
- struct ib_wc *wc,
- struct iscsi_hdr *hdr)
+static int iser_check_remote_inv(struct iser_conn *iser_conn, struct ib_wc *wc,
+ struct iscsi_hdr *hdr)
{
if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
struct iscsi_task *task;
@@ -632,13 +607,13 @@ iser_check_remote_inv(struct iser_conn *iser_conn,
struct iser_fr_desc *desc;
if (iser_task->dir[ISER_DIR_IN]) {
- desc = iser_task->rdma_reg[ISER_DIR_IN].mem_h;
+ desc = iser_task->rdma_reg[ISER_DIR_IN].desc;
if (unlikely(iser_inv_desc(desc, rkey)))
return -EINVAL;
}
if (iser_task->dir[ISER_DIR_OUT]) {
- desc = iser_task->rdma_reg[ISER_DIR_OUT].mem_h;
+ desc = iser_task->rdma_reg[ISER_DIR_OUT].desc;
if (unlikely(iser_inv_desc(desc, rkey)))
return -EINVAL;
}
@@ -658,8 +633,7 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc)
struct iser_conn *iser_conn = to_iser_conn(ib_conn);
struct iser_rx_desc *desc = iser_rx(wc->wr_cqe);
struct iscsi_hdr *hdr;
- int length;
- int outstanding, count, err;
+ int length, err;
if (unlikely(wc->status != IB_WC_SUCCESS)) {
iser_err_comp(wc, "task_rsp");
@@ -688,20 +662,9 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc)
desc->dma_addr, ISER_RX_PAYLOAD_SIZE,
DMA_FROM_DEVICE);
- /* decrementing conn->post_recv_buf_count only --after-- freeing the *
- * task eliminates the need to worry on tasks which are completed in *
- * parallel to the execution of iser_conn_term. So the code that waits *
- * for the posted rx bufs refcount to become zero handles everything */
- ib_conn->post_recv_buf_count--;
-
- outstanding = ib_conn->post_recv_buf_count;
- if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) {
- count = min(iser_conn->qp_max_recv_dtos - outstanding,
- iser_conn->min_posted_rx);
- err = iser_post_recvm(iser_conn, count);
- if (err)
- iser_err("posting %d rx bufs err %d\n", count, err);
- }
+ err = iser_post_recvm(iser_conn, desc);
+ if (err)
+ iser_err("posting rx buffer err %d\n", err);
}
void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc)
@@ -765,27 +728,16 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task)
void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
{
- int prot_count = scsi_prot_sg_count(iser_task->sc);
if (iser_task->dir[ISER_DIR_IN]) {
- iser_unreg_rdma_mem(iser_task, ISER_DIR_IN);
- iser_dma_unmap_task_data(iser_task,
- &iser_task->data[ISER_DIR_IN],
+ iser_unreg_mem_fastreg(iser_task, ISER_DIR_IN);
+ iser_dma_unmap_task_data(iser_task, ISER_DIR_IN,
DMA_FROM_DEVICE);
- if (prot_count)
- iser_dma_unmap_task_data(iser_task,
- &iser_task->prot[ISER_DIR_IN],
- DMA_FROM_DEVICE);
}
if (iser_task->dir[ISER_DIR_OUT]) {
- iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT);
- iser_dma_unmap_task_data(iser_task,
- &iser_task->data[ISER_DIR_OUT],
+ iser_unreg_mem_fastreg(iser_task, ISER_DIR_OUT);
+ iser_dma_unmap_task_data(iser_task, ISER_DIR_OUT,
DMA_TO_DEVICE);
- if (prot_count)
- iser_dma_unmap_task_data(iser_task,
- &iser_task->prot[ISER_DIR_OUT],
- DMA_TO_DEVICE);
}
}
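The initiator-side changes above drop the post_recv_buf_count/min_posted_rx bookkeeping in favour of a strict one-to-one model: every receive completion immediately reposts the descriptor that just completed, and once the final login response arrives the remaining descriptors are posted in one pass. A minimal sketch of that repost pattern, with hypothetical names (my_rx_desc, my_post_recv) standing in for the driver's structures:

#include <linux/kernel.h>
#include <linux/printk.h>
#include <rdma/ib_verbs.h>

struct my_rx_desc {
        struct ib_cqe cqe;      /* .done is expected to point at my_rx_done() */
        struct ib_sge sge;      /* pre-mapped receive buffer */
};

static int my_post_recv(struct ib_qp *qp, struct my_rx_desc *desc)
{
        struct ib_recv_wr wr = {
                .wr_cqe  = &desc->cqe,
                .sg_list = &desc->sge,
                .num_sge = 1,
        };

        return ib_post_recv(qp, &wr, NULL);
}

static void my_rx_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct my_rx_desc *desc = container_of(wc->wr_cqe,
                                               struct my_rx_desc, cqe);

        if (wc->status != IB_WC_SUCCESS)
                return;

        /* ... hand the received PDU to the upper layer ... */

        /* Give the very same buffer straight back to the receive queue. */
        if (my_post_recv(wc->qp, desc))
                pr_err("reposting rx buffer failed\n");
}

Keeping the receive queue topped up this way removes the need for a posted-buffer counter and for the batch refill logic that used it.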
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 7a8f24de3631..29ae2c6a250a 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -30,7 +30,6 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
@@ -38,63 +37,13 @@
#include <linux/scatterlist.h>
#include "iscsi_iser.h"
-static
-int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
- struct iser_data_buf *mem,
- struct iser_reg_resources *rsc,
- struct iser_mem_reg *mem_reg);
-static
-int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
- struct iser_data_buf *mem,
- struct iser_reg_resources *rsc,
- struct iser_mem_reg *mem_reg);
-
-static const struct iser_reg_ops fastreg_ops = {
- .alloc_reg_res = iser_alloc_fastreg_pool,
- .free_reg_res = iser_free_fastreg_pool,
- .reg_mem = iser_fast_reg_mr,
- .unreg_mem = iser_unreg_mem_fastreg,
- .reg_desc_get = iser_reg_desc_get_fr,
- .reg_desc_put = iser_reg_desc_put_fr,
-};
-
-static const struct iser_reg_ops fmr_ops = {
- .alloc_reg_res = iser_alloc_fmr_pool,
- .free_reg_res = iser_free_fmr_pool,
- .reg_mem = iser_fast_reg_fmr,
- .unreg_mem = iser_unreg_mem_fmr,
- .reg_desc_get = iser_reg_desc_get_fmr,
- .reg_desc_put = iser_reg_desc_put_fmr,
-};
void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc)
{
iser_err_comp(wc, "memreg");
}
-int iser_assign_reg_ops(struct iser_device *device)
-{
- struct ib_device *ib_dev = device->ib_device;
-
- /* Assign function handles - based on FMR support */
- if (ib_dev->ops.alloc_fmr && ib_dev->ops.dealloc_fmr &&
- ib_dev->ops.map_phys_fmr && ib_dev->ops.unmap_fmr) {
- iser_info("FMR supported, using FMR for registration\n");
- device->reg_ops = &fmr_ops;
- } else if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
- iser_info("FastReg supported, using FastReg for registration\n");
- device->reg_ops = &fastreg_ops;
- device->remote_inv_sup = iser_always_reg;
- } else {
- iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n");
- return -1;
- }
-
- return 0;
-}
-
-struct iser_fr_desc *
-iser_reg_desc_get_fr(struct ib_conn *ib_conn)
+static struct iser_fr_desc *iser_reg_desc_get_fr(struct ib_conn *ib_conn)
{
struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
struct iser_fr_desc *desc;
@@ -109,9 +58,8 @@ iser_reg_desc_get_fr(struct ib_conn *ib_conn)
return desc;
}
-void
-iser_reg_desc_put_fr(struct ib_conn *ib_conn,
- struct iser_fr_desc *desc)
+static void iser_reg_desc_put_fr(struct ib_conn *ib_conn,
+ struct iser_fr_desc *desc)
{
struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
unsigned long flags;
@@ -121,49 +69,11 @@ iser_reg_desc_put_fr(struct ib_conn *ib_conn,
spin_unlock_irqrestore(&fr_pool->lock, flags);
}
-struct iser_fr_desc *
-iser_reg_desc_get_fmr(struct ib_conn *ib_conn)
-{
- struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
-
- return list_first_entry(&fr_pool->list,
- struct iser_fr_desc, list);
-}
-
-void
-iser_reg_desc_put_fmr(struct ib_conn *ib_conn,
- struct iser_fr_desc *desc)
-{
-}
-
-static void iser_data_buf_dump(struct iser_data_buf *data,
- struct ib_device *ibdev)
-{
- struct scatterlist *sg;
- int i;
-
- for_each_sg(data->sg, sg, data->dma_nents, i)
- iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p "
- "off:0x%x sz:0x%x dma_len:0x%x\n",
- i, (unsigned long)sg_dma_address(sg),
- sg_page(sg), sg->offset, sg->length, sg_dma_len(sg));
-}
-
-static void iser_dump_page_vec(struct iser_page_vec *page_vec)
-{
- int i;
-
- iser_err("page vec npages %d data length %lld\n",
- page_vec->npages, page_vec->fake_mr.length);
- for (i = 0; i < page_vec->npages; i++)
- iser_err("vec[%d]: %llx\n", i, page_vec->pages[i]);
-}
-
int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
- struct iser_data_buf *data,
- enum iser_data_dir iser_dir,
- enum dma_data_direction dma_dir)
+ enum iser_data_dir iser_dir,
+ enum dma_data_direction dma_dir)
{
+ struct iser_data_buf *data = &iser_task->data[iser_dir];
struct ib_device *dev;
iser_task->dir[iser_dir] = 1;
@@ -174,22 +84,44 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
iser_err("dma_map_sg failed!!!\n");
return -EINVAL;
}
+
+ if (scsi_prot_sg_count(iser_task->sc)) {
+ struct iser_data_buf *pdata = &iser_task->prot[iser_dir];
+
+ pdata->dma_nents = ib_dma_map_sg(dev, pdata->sg, pdata->size, dma_dir);
+ if (unlikely(pdata->dma_nents == 0)) {
+ iser_err("protection dma_map_sg failed!!!\n");
+ goto out_unmap;
+ }
+ }
+
return 0;
+
+out_unmap:
+ ib_dma_unmap_sg(dev, data->sg, data->size, dma_dir);
+ return -EINVAL;
}
+
void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task,
- struct iser_data_buf *data,
- enum dma_data_direction dir)
+ enum iser_data_dir iser_dir,
+ enum dma_data_direction dma_dir)
{
+ struct iser_data_buf *data = &iser_task->data[iser_dir];
struct ib_device *dev;
dev = iser_task->iser_conn->ib_conn.device->ib_device;
- ib_dma_unmap_sg(dev, data->sg, data->size, dir);
+ ib_dma_unmap_sg(dev, data->sg, data->size, dma_dir);
+
+ if (scsi_prot_sg_count(iser_task->sc)) {
+ struct iser_data_buf *pdata = &iser_task->prot[iser_dir];
+
+ ib_dma_unmap_sg(dev, pdata->sg, pdata->size, dma_dir);
+ }
}
-static int
-iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
- struct iser_mem_reg *reg)
+static int iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
+ struct iser_mem_reg *reg)
{
struct scatterlist *sg = mem->sg;
@@ -213,100 +145,40 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
return 0;
}
-static int iser_set_page(struct ib_mr *mr, u64 addr)
-{
- struct iser_page_vec *page_vec =
- container_of(mr, struct iser_page_vec, fake_mr);
-
- page_vec->pages[page_vec->npages++] = addr;
-
- return 0;
-}
-
-static
-int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
- struct iser_data_buf *mem,
- struct iser_reg_resources *rsc,
- struct iser_mem_reg *reg)
-{
- struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
- struct iser_device *device = ib_conn->device;
- struct iser_page_vec *page_vec = rsc->page_vec;
- struct ib_fmr_pool *fmr_pool = rsc->fmr_pool;
- struct ib_pool_fmr *fmr;
- int ret, plen;
-
- page_vec->npages = 0;
- page_vec->fake_mr.page_size = SZ_4K;
- plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg,
- mem->dma_nents, NULL, iser_set_page);
- if (unlikely(plen < mem->dma_nents)) {
- iser_err("page vec too short to hold this SG\n");
- iser_data_buf_dump(mem, device->ib_device);
- iser_dump_page_vec(page_vec);
- return -EINVAL;
- }
-
- fmr = ib_fmr_pool_map_phys(fmr_pool, page_vec->pages,
- page_vec->npages, page_vec->pages[0]);
- if (IS_ERR(fmr)) {
- ret = PTR_ERR(fmr);
- iser_err("ib_fmr_pool_map_phys failed: %d\n", ret);
- return ret;
- }
-
- reg->sge.lkey = fmr->fmr->lkey;
- reg->rkey = fmr->fmr->rkey;
- reg->sge.addr = page_vec->fake_mr.iova;
- reg->sge.length = page_vec->fake_mr.length;
- reg->mem_h = fmr;
-
- iser_dbg("fmr reg: lkey=0x%x, rkey=0x%x, addr=0x%llx,"
- " length=0x%x\n", reg->sge.lkey, reg->rkey,
- reg->sge.addr, reg->sge.length);
-
- return 0;
-}
-
-/**
- * Unregister (previosuly registered using FMR) memory.
- * If memory is non-FMR does nothing.
- */
-void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
- enum iser_data_dir cmd_dir)
-{
- struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
-
- if (!reg->mem_h)
- return;
-
- iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n", reg->mem_h);
-
- ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h);
-
- reg->mem_h = NULL;
-}
-
void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
enum iser_data_dir cmd_dir)
{
- struct iser_device *device = iser_task->iser_conn->ib_conn.device;
struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
+ struct iser_fr_desc *desc;
+ struct ib_mr_status mr_status;
- if (!reg->mem_h)
+ desc = reg->desc;
+ if (!desc)
return;
- device->reg_ops->reg_desc_put(&iser_task->iser_conn->ib_conn,
- reg->mem_h);
- reg->mem_h = NULL;
+ /*
+ * The signature MR cannot be invalidated and reused without checking.
+ * libiscsi calls the check_protection transport handler only if
+ * SCSI-Response is received. And the signature MR is not checked if
+ * the task is completed for some other reason like a timeout or error
+ * handling. That's why we must check the signature MR here before
+ * putting it to the free pool.
+ */
+ if (unlikely(desc->sig_protected)) {
+ desc->sig_protected = false;
+ ib_check_mr_status(desc->rsc.sig_mr, IB_MR_CHECK_SIG_STATUS,
+ &mr_status);
+ }
+ iser_reg_desc_put_fr(&iser_task->iser_conn->ib_conn, reg->desc);
+ reg->desc = NULL;
}
-static void
-iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_domain *domain)
+static void iser_set_dif_domain(struct scsi_cmnd *sc,
+ struct ib_sig_domain *domain)
{
domain->sig_type = IB_SIG_TYPE_T10_DIF;
domain->sig.dif.pi_interval = scsi_prot_interval(sc);
- domain->sig.dif.ref_tag = t10_pi_ref_tag(sc->request);
+ domain->sig.dif.ref_tag = t10_pi_ref_tag(scsi_cmd_to_rq(sc));
/*
* At the moment we hard code those, but in the future
* we will take them from sc.
@@ -316,10 +188,10 @@ iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_domain *domain)
domain->sig.dif.ref_escape = true;
if (sc->prot_flags & SCSI_PROT_REF_INCREMENT)
domain->sig.dif.ref_remap = true;
-};
+}
-static int
-iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs)
+static int iser_set_sig_attrs(struct scsi_cmnd *sc,
+ struct ib_sig_attrs *sig_attrs)
{
switch (scsi_get_prot_op(sc)) {
case SCSI_PROT_WRITE_INSERT:
@@ -352,8 +224,7 @@ iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs)
return 0;
}
-static inline void
-iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask)
+static inline void iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask)
{
*mask = 0;
if (sc->prot_flags & SCSI_PROT_REF_CHECK)
@@ -362,11 +233,8 @@ iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask)
*mask |= IB_SIG_CHECK_GUARD;
}
-static inline void
-iser_inv_rkey(struct ib_send_wr *inv_wr,
- struct ib_mr *mr,
- struct ib_cqe *cqe,
- struct ib_send_wr *next_wr)
+static inline void iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr,
+ struct ib_cqe *cqe, struct ib_send_wr *next_wr)
{
inv_wr->opcode = IB_WR_LOCAL_INV;
inv_wr->wr_cqe = cqe;
@@ -376,12 +244,11 @@ iser_inv_rkey(struct ib_send_wr *inv_wr,
inv_wr->next = next_wr;
}
-static int
-iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
- struct iser_data_buf *mem,
- struct iser_data_buf *sig_mem,
- struct iser_reg_resources *rsc,
- struct iser_mem_reg *sig_reg)
+static int iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
+ struct iser_data_buf *mem,
+ struct iser_data_buf *sig_mem,
+ struct iser_reg_resources *rsc,
+ struct iser_mem_reg *sig_reg)
{
struct iser_tx_desc *tx_desc = &iser_task->desc;
struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe;
@@ -482,43 +349,26 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
return 0;
}
-static int
-iser_reg_data_sg(struct iscsi_iser_task *task,
- struct iser_data_buf *mem,
- struct iser_fr_desc *desc,
- bool use_dma_key,
- struct iser_mem_reg *reg)
-{
- struct iser_device *device = task->iser_conn->ib_conn.device;
-
- if (use_dma_key)
- return iser_reg_dma(device, mem, reg);
-
- return device->reg_ops->reg_mem(task, mem, &desc->rsc, reg);
-}
-
-int iser_reg_rdma_mem(struct iscsi_iser_task *task,
- enum iser_data_dir dir,
- bool all_imm)
+int iser_reg_mem_fastreg(struct iscsi_iser_task *task,
+ enum iser_data_dir dir,
+ bool all_imm)
{
struct ib_conn *ib_conn = &task->iser_conn->ib_conn;
struct iser_device *device = ib_conn->device;
struct iser_data_buf *mem = &task->data[dir];
struct iser_mem_reg *reg = &task->rdma_reg[dir];
- struct iser_fr_desc *desc = NULL;
+ struct iser_fr_desc *desc;
bool use_dma_key;
int err;
use_dma_key = mem->dma_nents == 1 && (all_imm || !iser_always_reg) &&
scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL;
+ if (use_dma_key)
+ return iser_reg_dma(device, mem, reg);
- if (!use_dma_key) {
- desc = device->reg_ops->reg_desc_get(ib_conn);
- reg->mem_h = desc;
- }
-
+ desc = iser_reg_desc_get_fr(ib_conn);
if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL) {
- err = iser_reg_data_sg(task, mem, desc, use_dma_key, reg);
+ err = iser_fast_reg_mr(task, mem, &desc->rsc, reg);
if (unlikely(err))
goto err_reg;
} else {
@@ -530,19 +380,12 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *task,
desc->sig_protected = true;
}
+ reg->desc = desc;
+
return 0;
err_reg:
- if (desc)
- device->reg_ops->reg_desc_put(ib_conn, desc);
+ iser_reg_desc_put_fr(ib_conn, desc);
return err;
}
-
-void iser_unreg_rdma_mem(struct iscsi_iser_task *task,
- enum iser_data_dir dir)
-{
- struct iser_device *device = task->iser_conn->ib_conn.device;
-
- device->reg_ops->unreg_mem(task, dir);
-}
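With the FMR code gone, iser_memory.c keeps a single registration path: take a fast-registration descriptor from the per-connection pool, register the SG list with it, and return it on unmap (after checking the signature MR status when T10-PI was used, as the new comment above explains). A small sketch of the spinlock-protected free-list pattern behind iser_reg_desc_get_fr()/iser_reg_desc_put_fr(), with hypothetical my_* names:

#include <linux/list.h>
#include <linux/spinlock.h>

struct my_desc {
        struct list_head list;
};

struct my_pool {
        struct list_head free_list;
        spinlock_t lock;
};

static struct my_desc *my_desc_get(struct my_pool *pool)
{
        struct my_desc *desc;
        unsigned long flags;

        spin_lock_irqsave(&pool->lock, flags);
        desc = list_first_entry_or_null(&pool->free_list, struct my_desc, list);
        if (desc)
                list_del(&desc->list);
        spin_unlock_irqrestore(&pool->lock, flags);

        return desc;    /* NULL here means the pool is exhausted */
}

static void my_desc_put(struct my_pool *pool, struct my_desc *desc)
{
        unsigned long flags;

        spin_lock_irqsave(&pool->lock, flags);
        list_add(&desc->list, &pool->free_list);
        spin_unlock_irqrestore(&pool->lock, flags);
}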
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 127887c6c03f..a00ca117303a 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -32,7 +32,6 @@
* SOFTWARE.
*/
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/delay.h>
@@ -68,58 +67,23 @@ static void iser_event_handler(struct ib_event_handler *handler,
static int iser_create_device_ib_res(struct iser_device *device)
{
struct ib_device *ib_dev = device->ib_device;
- int ret, i, max_cqe;
- ret = iser_assign_reg_ops(device);
- if (ret)
- return ret;
-
- device->comps_used = min_t(int, num_online_cpus(),
- ib_dev->num_comp_vectors);
-
- device->comps = kcalloc(device->comps_used, sizeof(*device->comps),
- GFP_KERNEL);
- if (!device->comps)
- goto comps_err;
-
- max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe);
-
- iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n",
- device->comps_used, dev_name(&ib_dev->dev),
- ib_dev->num_comp_vectors, max_cqe);
+ if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
+ iser_err("IB device does not support memory registrations\n");
+ return -1;
+ }
device->pd = ib_alloc_pd(ib_dev,
iser_always_reg ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY);
if (IS_ERR(device->pd))
goto pd_err;
- for (i = 0; i < device->comps_used; i++) {
- struct iser_comp *comp = &device->comps[i];
-
- comp->cq = ib_alloc_cq(ib_dev, comp, max_cqe, i,
- IB_POLL_SOFTIRQ);
- if (IS_ERR(comp->cq)) {
- comp->cq = NULL;
- goto cq_err;
- }
- }
-
INIT_IB_EVENT_HANDLER(&device->event_handler, ib_dev,
iser_event_handler);
ib_register_event_handler(&device->event_handler);
return 0;
-cq_err:
- for (i = 0; i < device->comps_used; i++) {
- struct iser_comp *comp = &device->comps[i];
-
- if (comp->cq)
- ib_free_cq(comp->cq);
- }
- ib_dealloc_pd(device->pd);
pd_err:
- kfree(device->comps);
-comps_err:
iser_err("failed to allocate an IB resource\n");
return -1;
}
@@ -130,113 +94,12 @@ comps_err:
*/
static void iser_free_device_ib_res(struct iser_device *device)
{
- int i;
-
- for (i = 0; i < device->comps_used; i++) {
- struct iser_comp *comp = &device->comps[i];
-
- ib_free_cq(comp->cq);
- comp->cq = NULL;
- }
-
ib_unregister_event_handler(&device->event_handler);
ib_dealloc_pd(device->pd);
- kfree(device->comps);
- device->comps = NULL;
device->pd = NULL;
}
-/**
- * iser_alloc_fmr_pool - Creates FMR pool and page_vector
- * @ib_conn: connection RDMA resources
- * @cmds_max: max number of SCSI commands for this connection
- * @size: max number of pages per map request
- *
- * Return: 0 on success, or errno code on failure
- */
-int iser_alloc_fmr_pool(struct ib_conn *ib_conn,
- unsigned cmds_max,
- unsigned int size)
-{
- struct iser_device *device = ib_conn->device;
- struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
- struct iser_page_vec *page_vec;
- struct iser_fr_desc *desc;
- struct ib_fmr_pool *fmr_pool;
- struct ib_fmr_pool_param params;
- int ret;
-
- INIT_LIST_HEAD(&fr_pool->list);
- spin_lock_init(&fr_pool->lock);
-
- desc = kzalloc(sizeof(*desc), GFP_KERNEL);
- if (!desc)
- return -ENOMEM;
-
- page_vec = kmalloc(sizeof(*page_vec) + (sizeof(u64) * size),
- GFP_KERNEL);
- if (!page_vec) {
- ret = -ENOMEM;
- goto err_frpl;
- }
-
- page_vec->pages = (u64 *)(page_vec + 1);
-
- params.page_shift = ilog2(SZ_4K);
- params.max_pages_per_fmr = size;
- /* make the pool size twice the max number of SCSI commands *
- * the ML is expected to queue, watermark for unmap at 50% */
- params.pool_size = cmds_max * 2;
- params.dirty_watermark = cmds_max;
- params.cache = 0;
- params.flush_function = NULL;
- params.access = (IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_REMOTE_READ);
-
- fmr_pool = ib_create_fmr_pool(device->pd, &params);
- if (IS_ERR(fmr_pool)) {
- ret = PTR_ERR(fmr_pool);
- iser_err("FMR allocation failed, err %d\n", ret);
- goto err_fmr;
- }
-
- desc->rsc.page_vec = page_vec;
- desc->rsc.fmr_pool = fmr_pool;
- list_add(&desc->list, &fr_pool->list);
-
- return 0;
-
-err_fmr:
- kfree(page_vec);
-err_frpl:
- kfree(desc);
-
- return ret;
-}
-
-/**
- * iser_free_fmr_pool - releases the FMR pool and page vec
- * @ib_conn: connection RDMA resources
- */
-void iser_free_fmr_pool(struct ib_conn *ib_conn)
-{
- struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
- struct iser_fr_desc *desc;
-
- desc = list_first_entry(&fr_pool->list,
- struct iser_fr_desc, list);
- list_del(&desc->list);
-
- iser_info("freeing conn %p fmr pool %p\n",
- ib_conn, desc->rsc.fmr_pool);
-
- ib_destroy_fmr_pool(desc->rsc.fmr_pool);
- kfree(desc->rsc.page_vec);
- kfree(desc);
-}
-
static struct iser_fr_desc *
iser_create_fastreg_desc(struct iser_device *device,
struct ib_pd *pd,
@@ -252,7 +115,7 @@ iser_create_fastreg_desc(struct iser_device *device,
if (!desc)
return ERR_PTR(-ENOMEM);
- if (ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
+ if (ib_dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
mr_type = IB_MR_TYPE_SG_GAPS;
else
mr_type = IB_MR_TYPE_MEM_REG;
@@ -376,70 +239,58 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
struct ib_device *ib_dev;
struct ib_qp_init_attr init_attr;
int ret = -ENOMEM;
- int index, min_index = 0;
+ unsigned int max_send_wr, cq_size;
BUG_ON(ib_conn->device == NULL);
device = ib_conn->device;
ib_dev = device->ib_device;
- memset(&init_attr, 0, sizeof init_attr);
+ /* +1 for drain */
+ if (ib_conn->pi_support)
+ max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
+ else
+ max_send_wr = ISER_QP_MAX_REQ_DTOS + 1;
+ max_send_wr = min_t(unsigned int, max_send_wr,
+ (unsigned int)ib_dev->attrs.max_qp_wr);
- mutex_lock(&ig.connlist_mutex);
- /* select the CQ with the minimal number of usages */
- for (index = 0; index < device->comps_used; index++) {
- if (device->comps[index].active_qps <
- device->comps[min_index].active_qps)
- min_index = index;
+ cq_size = max_send_wr + ISER_QP_MAX_RECV_DTOS;
+ ib_conn->cq = ib_cq_pool_get(ib_dev, cq_size, -1, IB_POLL_SOFTIRQ);
+ if (IS_ERR(ib_conn->cq)) {
+ ret = PTR_ERR(ib_conn->cq);
+ goto cq_err;
}
- ib_conn->comp = &device->comps[min_index];
- ib_conn->comp->active_qps++;
- mutex_unlock(&ig.connlist_mutex);
- iser_info("cq index %d used for ib_conn %p\n", min_index, ib_conn);
+ ib_conn->cq_size = cq_size;
+
+ memset(&init_attr, 0, sizeof(init_attr));
init_attr.event_handler = iser_qp_event_callback;
- init_attr.qp_context = (void *)ib_conn;
- init_attr.send_cq = ib_conn->comp->cq;
- init_attr.recv_cq = ib_conn->comp->cq;
- init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS;
+ init_attr.qp_context = (void *)ib_conn;
+ init_attr.send_cq = ib_conn->cq;
+ init_attr.recv_cq = ib_conn->cq;
+ /* +1 for drain */
+ init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS + 1;
init_attr.cap.max_send_sge = 2;
init_attr.cap.max_recv_sge = 1;
- init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
- init_attr.qp_type = IB_QPT_RC;
- if (ib_conn->pi_support) {
- init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
+ init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ init_attr.qp_type = IB_QPT_RC;
+ init_attr.cap.max_send_wr = max_send_wr;
+ if (ib_conn->pi_support)
init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
- iser_conn->max_cmds =
- ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS);
- } else {
- if (ib_dev->attrs.max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
- init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1;
- iser_conn->max_cmds =
- ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
- } else {
- init_attr.cap.max_send_wr = ib_dev->attrs.max_qp_wr;
- iser_conn->max_cmds =
- ISER_GET_MAX_XMIT_CMDS(ib_dev->attrs.max_qp_wr);
- iser_dbg("device %s supports max_send_wr %d\n",
- dev_name(&device->ib_device->dev),
- ib_dev->attrs.max_qp_wr);
- }
- }
+ iser_conn->max_cmds = ISER_GET_MAX_XMIT_CMDS(max_send_wr - 1);
ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
if (ret)
goto out_err;
ib_conn->qp = ib_conn->cma_id->qp;
- iser_info("setting conn %p cma_id %p qp %p\n",
- ib_conn, ib_conn->cma_id,
- ib_conn->cma_id->qp);
+ iser_info("setting conn %p cma_id %p qp %p max_send_wr %d\n", ib_conn,
+ ib_conn->cma_id, ib_conn->cma_id->qp, max_send_wr);
return ret;
out_err:
- mutex_lock(&ig.connlist_mutex);
- ib_conn->comp->active_qps--;
- mutex_unlock(&ig.connlist_mutex);
+ ib_cq_pool_put(ib_conn->cq, ib_conn->cq_size);
+cq_err:
iser_err("unable to alloc mem or create resource, err %d\n", ret);
return ret;
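This hunk replaces the driver-managed array of shared completion contexts with the core's CQ pool: each connection now takes a CQ sized for its own send and receive queues (including the drain entries) and returns it when the QP is destroyed. An illustrative sketch of that sizing and the pool calls, using hypothetical helper names:

#include <rdma/ib_verbs.h>

static struct ib_cq *my_get_conn_cq(struct ib_device *dev,
                                    unsigned int max_send_wr,
                                    unsigned int max_recv_wr,
                                    u32 *cq_size)
{
        /* One CQE per possible send or receive completion. */
        *cq_size = max_send_wr + max_recv_wr;

        /*
         * A negative vector hint lets the core pick the least used
         * completion vector; the return value is ERR_PTR() on failure,
         * so callers check it with IS_ERR().
         */
        return ib_cq_pool_get(dev, *cq_size, -1, IB_POLL_SOFTIRQ);
}

static void my_put_conn_cq(struct ib_cq *cq, u32 cq_size)
{
        ib_cq_pool_put(cq, cq_size);
}

Matching get/put sizes matter because the pool tracks how many CQEs of each shared CQ are currently claimed.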
@@ -462,7 +313,7 @@ struct iser_device *iser_device_find_by_ib_device(struct rdma_cm_id *cma_id)
goto inc_refcnt;
device = kzalloc(sizeof *device, GFP_KERNEL);
- if (device == NULL)
+ if (!device)
goto out;
/* assign this device to the device */
@@ -541,8 +392,7 @@ void iser_release_work(struct work_struct *work)
* so the cm_id removal is out of here. It is Safe to
* be invoked multiple times.
*/
-static void iser_free_ib_conn_res(struct iser_conn *iser_conn,
- bool destroy)
+static void iser_free_ib_conn_res(struct iser_conn *iser_conn, bool destroy)
{
struct ib_conn *ib_conn = &iser_conn->ib_conn;
struct iser_device *device = ib_conn->device;
@@ -550,11 +400,9 @@ static void iser_free_ib_conn_res(struct iser_conn *iser_conn,
iser_info("freeing conn %p cma_id %p qp %p\n",
iser_conn, ib_conn->cma_id, ib_conn->qp);
- if (ib_conn->qp != NULL) {
- mutex_lock(&ig.connlist_mutex);
- ib_conn->comp->active_qps--;
- mutex_unlock(&ig.connlist_mutex);
+ if (ib_conn->qp) {
rdma_destroy_qp(ib_conn->cma_id);
+ ib_cq_pool_put(ib_conn->cq, ib_conn->cq_size);
ib_conn->qp = NULL;
}
@@ -562,7 +410,7 @@ static void iser_free_ib_conn_res(struct iser_conn *iser_conn,
if (iser_conn->rx_descs)
iser_free_rx_descriptors(iser_conn);
- if (device != NULL) {
+ if (device) {
iser_device_try_release(device);
ib_conn->device = NULL;
}
@@ -596,7 +444,7 @@ void iser_conn_release(struct iser_conn *iser_conn)
iser_free_ib_conn_res(iser_conn, true);
mutex_unlock(&iser_conn->state_mutex);
- if (ib_conn->cma_id != NULL) {
+ if (ib_conn->cma_id) {
rdma_destroy_id(ib_conn->cma_id);
ib_conn->cma_id = NULL;
}
@@ -639,7 +487,7 @@ int iser_conn_terminate(struct iser_conn *iser_conn)
iser_conn, err);
/* block until all flush errors are consumed */
- ib_drain_sq(ib_conn->qp);
+ ib_drain_qp(ib_conn->qp);
}
return 1;
@@ -652,13 +500,12 @@ static void iser_connect_error(struct rdma_cm_id *cma_id)
{
struct iser_conn *iser_conn;
- iser_conn = (struct iser_conn *)cma_id->context;
+ iser_conn = cma_id->context;
iser_conn->state = ISER_CONN_TERMINATING;
}
-static void
-iser_calc_scsi_params(struct iser_conn *iser_conn,
- unsigned int max_sectors)
+static void iser_calc_scsi_params(struct iser_conn *iser_conn,
+ unsigned int max_sectors)
{
struct iser_device *device = iser_conn->ib_conn.device;
struct ib_device_attr *attr = &device->ib_device->attrs;
@@ -667,13 +514,12 @@ iser_calc_scsi_params(struct iser_conn *iser_conn,
u32 max_num_sg;
/*
- * FRs without SG_GAPS or FMRs can only map up to a (device) page per
- * entry, but if the first entry is misaligned we'll end up using two
- * entries (head and tail) for a single page worth data, so one
- * additional entry is required.
+ * FRs without SG_GAPS can only map up to a (device) page per entry,
+ * but if the first entry is misaligned we'll end up using two entries
+ * (head and tail) for a single page worth data, so one additional
+ * entry is required.
*/
- if ((attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) &&
- (attr->device_cap_flags & IB_DEVICE_SG_GAPS_REG))
+ if (attr->kernel_cap_flags & IBK_SG_GAPS_REG)
reserved_mr_pages = 0;
else
reserved_mr_pages = 1;
@@ -684,14 +530,8 @@ iser_calc_scsi_params(struct iser_conn *iser_conn,
max_num_sg = attr->max_fast_reg_page_list_len;
sg_tablesize = DIV_ROUND_UP(max_sectors * SECTOR_SIZE, SZ_4K);
- if (attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)
- sup_sg_tablesize =
- min_t(
- uint, ISCSI_ISER_MAX_SG_TABLESIZE,
- max_num_sg - reserved_mr_pages);
- else
- sup_sg_tablesize = ISCSI_ISER_MAX_SG_TABLESIZE;
-
+ sup_sg_tablesize = min_t(uint, ISCSI_ISER_MAX_SG_TABLESIZE,
+ max_num_sg - reserved_mr_pages);
iser_conn->scsi_sg_tablesize = min(sg_tablesize, sup_sg_tablesize);
iser_conn->pages_per_mr =
iser_conn->scsi_sg_tablesize + reserved_mr_pages;
@@ -703,11 +543,11 @@ iser_calc_scsi_params(struct iser_conn *iser_conn,
static void iser_addr_handler(struct rdma_cm_id *cma_id)
{
struct iser_device *device;
- struct iser_conn *iser_conn;
- struct ib_conn *ib_conn;
+ struct iser_conn *iser_conn;
+ struct ib_conn *ib_conn;
int ret;
- iser_conn = (struct iser_conn *)cma_id->context;
+ iser_conn = cma_id->context;
if (iser_conn->state != ISER_CONN_PENDING)
/* bailout */
return;
@@ -724,8 +564,8 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
/* connection T10-PI support */
if (iser_pi_enable) {
- if (!(device->ib_device->attrs.device_cap_flags &
- IB_DEVICE_INTEGRITY_HANDOVER)) {
+ if (!(device->ib_device->attrs.kernel_cap_flags &
+ IBK_INTEGRITY_HANDOVER)) {
iser_warn("T10-PI requested but not supported on %s, "
"continue without T10-PI\n",
dev_name(&ib_conn->device->ib_device->dev));
@@ -751,11 +591,11 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
static void iser_route_handler(struct rdma_cm_id *cma_id)
{
struct rdma_conn_param conn_param;
- int ret;
+ int ret;
struct iser_cm_hdr req_hdr;
- struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context;
+ struct iser_conn *iser_conn = cma_id->context;
struct ib_conn *ib_conn = &iser_conn->ib_conn;
- struct iser_device *device = ib_conn->device;
+ struct ib_device *ib_dev = ib_conn->device->ib_device;
if (iser_conn->state != ISER_CONN_PENDING)
/* bailout */
@@ -766,19 +606,19 @@ static void iser_route_handler(struct rdma_cm_id *cma_id)
goto failure;
memset(&conn_param, 0, sizeof conn_param);
- conn_param.responder_resources = device->ib_device->attrs.max_qp_rd_atom;
- conn_param.initiator_depth = 1;
- conn_param.retry_count = 7;
- conn_param.rnr_retry_count = 6;
+ conn_param.responder_resources = ib_dev->attrs.max_qp_rd_atom;
+ conn_param.initiator_depth = 1;
+ conn_param.retry_count = 7;
+ conn_param.rnr_retry_count = 6;
memset(&req_hdr, 0, sizeof(req_hdr));
req_hdr.flags = ISER_ZBVA_NOT_SUP;
- if (!device->remote_inv_sup)
+ if (!iser_always_reg)
req_hdr.flags |= ISER_SEND_W_INV_NOT_SUP;
conn_param.private_data = (void *)&req_hdr;
conn_param.private_data_len = sizeof(struct iser_cm_hdr);
- ret = rdma_connect(cma_id, &conn_param);
+ ret = rdma_connect_locked(cma_id, &conn_param);
if (ret) {
iser_err("failure connecting: %d\n", ret);
goto failure;
@@ -796,7 +636,7 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id,
struct ib_qp_attr attr;
struct ib_qp_init_attr init_attr;
- iser_conn = (struct iser_conn *)cma_id->context;
+ iser_conn = cma_id->context;
if (iser_conn->state != ISER_CONN_PENDING)
/* bailout */
return;
@@ -819,7 +659,7 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id,
static void iser_disconnected_handler(struct rdma_cm_id *cma_id)
{
- struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context;
+ struct iser_conn *iser_conn = cma_id->context;
if (iser_conn_terminate(iser_conn)) {
if (iser_conn->iscsi_conn)
@@ -833,7 +673,7 @@ static void iser_disconnected_handler(struct rdma_cm_id *cma_id)
static void iser_cleanup_handler(struct rdma_cm_id *cma_id,
bool destroy)
{
- struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context;
+ struct iser_conn *iser_conn = cma_id->context;
/*
* We are not guaranteed that we visited disconnected_handler
@@ -843,14 +683,15 @@ static void iser_cleanup_handler(struct rdma_cm_id *cma_id,
iser_disconnected_handler(cma_id);
iser_free_ib_conn_res(iser_conn, destroy);
complete(&iser_conn->ib_completion);
-};
+}
-static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
+static int iser_cma_handler(struct rdma_cm_id *cma_id,
+ struct rdma_cm_event *event)
{
struct iser_conn *iser_conn;
int ret = 0;
- iser_conn = (struct iser_conn *)cma_id->context;
+ iser_conn = cma_id->context;
iser_info("%s (%d): status %d conn %p id %p\n",
rdma_event_msg(event->event), event->event,
event->status, cma_id->context, cma_id);
@@ -869,7 +710,7 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
case RDMA_CM_EVENT_REJECTED:
iser_info("Connection rejected: %s\n",
rdma_reject_msg(cma_id, event->status));
- /* FALLTHROUGH */
+ fallthrough;
case RDMA_CM_EVENT_ADDR_ERROR:
case RDMA_CM_EVENT_ROUTE_ERROR:
case RDMA_CM_EVENT_CONNECT_ERROR:
@@ -915,18 +756,15 @@ void iser_conn_init(struct iser_conn *iser_conn)
INIT_LIST_HEAD(&iser_conn->conn_list);
mutex_init(&iser_conn->state_mutex);
- ib_conn->post_recv_buf_count = 0;
ib_conn->reg_cqe.done = iser_reg_comp;
}
- /**
+/*
* starts the process of connecting to the target
* sleeps until the connection is established or rejected
*/
-int iser_connect(struct iser_conn *iser_conn,
- struct sockaddr *src_addr,
- struct sockaddr *dst_addr,
- int non_blocking)
+int iser_connect(struct iser_conn *iser_conn, struct sockaddr *src_addr,
+ struct sockaddr *dst_addr, int non_blocking)
{
struct ib_conn *ib_conn = &iser_conn->ib_conn;
int err = 0;
@@ -943,8 +781,7 @@ int iser_connect(struct iser_conn *iser_conn,
iser_conn->state = ISER_CONN_PENDING;
ib_conn->cma_id = rdma_create_id(&init_net, iser_cma_handler,
- (void *)iser_conn,
- RDMA_PS_TCP, IB_QPT_RC);
+ iser_conn, RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(ib_conn->cma_id)) {
err = PTR_ERR(ib_conn->cma_id);
iser_err("rdma_create_id failed: %d\n", err);
@@ -987,7 +824,7 @@ int iser_post_recvl(struct iser_conn *iser_conn)
struct ib_conn *ib_conn = &iser_conn->ib_conn;
struct iser_login_desc *desc = &iser_conn->login_desc;
struct ib_recv_wr wr;
- int ib_ret;
+ int ret;
desc->sge.addr = desc->rsp_dma;
desc->sge.length = ISER_RX_LOGIN_SIZE;
@@ -999,46 +836,30 @@ int iser_post_recvl(struct iser_conn *iser_conn)
wr.num_sge = 1;
wr.next = NULL;
- ib_conn->post_recv_buf_count++;
- ib_ret = ib_post_recv(ib_conn->qp, &wr, NULL);
- if (ib_ret) {
- iser_err("ib_post_recv failed ret=%d\n", ib_ret);
- ib_conn->post_recv_buf_count--;
- }
+ ret = ib_post_recv(ib_conn->qp, &wr, NULL);
+ if (unlikely(ret))
+ iser_err("ib_post_recv login failed ret=%d\n", ret);
- return ib_ret;
+ return ret;
}
-int iser_post_recvm(struct iser_conn *iser_conn, int count)
+int iser_post_recvm(struct iser_conn *iser_conn, struct iser_rx_desc *rx_desc)
{
struct ib_conn *ib_conn = &iser_conn->ib_conn;
- unsigned int my_rx_head = iser_conn->rx_desc_head;
- struct iser_rx_desc *rx_desc;
- struct ib_recv_wr *wr;
- int i, ib_ret;
-
- for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) {
- rx_desc = &iser_conn->rx_descs[my_rx_head];
- rx_desc->cqe.done = iser_task_rsp;
- wr->wr_cqe = &rx_desc->cqe;
- wr->sg_list = &rx_desc->rx_sg;
- wr->num_sge = 1;
- wr->next = wr + 1;
- my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask;
- }
+ struct ib_recv_wr wr;
+ int ret;
- wr--;
- wr->next = NULL; /* mark end of work requests list */
+ rx_desc->cqe.done = iser_task_rsp;
+ wr.wr_cqe = &rx_desc->cqe;
+ wr.sg_list = &rx_desc->rx_sg;
+ wr.num_sge = 1;
+ wr.next = NULL;
- ib_conn->post_recv_buf_count += count;
- ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, NULL);
- if (unlikely(ib_ret)) {
- iser_err("ib_post_recv failed ret=%d\n", ib_ret);
- ib_conn->post_recv_buf_count -= count;
- } else
- iser_conn->rx_desc_head = my_rx_head;
+ ret = ib_post_recv(ib_conn->qp, &wr, NULL);
+ if (unlikely(ret))
+ iser_err("ib_post_recv failed ret=%d\n", ret);
- return ib_ret;
+ return ret;
}
@@ -1046,16 +867,14 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count)
* iser_post_send - Initiate a Send DTO operation
* @ib_conn: connection RDMA resources
* @tx_desc: iSER TX descriptor
- * @signal: true to send work request as SIGNALED
*
* Return: 0 on success, -1 on failure
*/
-int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
- bool signal)
+int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc)
{
struct ib_send_wr *wr = &tx_desc->send_wr;
struct ib_send_wr *first_wr;
- int ib_ret;
+ int ret;
ib_dma_sync_single_for_device(ib_conn->device->ib_device,
tx_desc->dma_addr, ISER_HEADERS_LEN,
@@ -1066,7 +885,7 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
wr->sg_list = tx_desc->tx_sg;
wr->num_sge = tx_desc->num_sge;
wr->opcode = IB_WR_SEND;
- wr->send_flags = signal ? IB_SEND_SIGNALED : 0;
+ wr->send_flags = IB_SEND_SIGNALED;
if (tx_desc->inv_wr.next)
first_wr = &tx_desc->inv_wr;
@@ -1075,19 +894,19 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
else
first_wr = wr;
- ib_ret = ib_post_send(ib_conn->qp, first_wr, NULL);
- if (unlikely(ib_ret))
+ ret = ib_post_send(ib_conn->qp, first_wr, NULL);
+ if (unlikely(ret))
iser_err("ib_post_send failed, ret:%d opcode:%d\n",
- ib_ret, wr->opcode);
+ ret, wr->opcode);
- return ib_ret;
+ return ret;
}
u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
enum iser_data_dir cmd_dir, sector_t *sector)
{
struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
- struct iser_fr_desc *desc = reg->mem_h;
+ struct iser_fr_desc *desc = reg->desc;
unsigned long sector_size = iser_task->sc->device->sector_size;
struct ib_mr_status mr_status;
int ret;
@@ -1107,7 +926,7 @@ u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
sector_t sector_off = mr_status.sig_err.sig_err_offset;
sector_div(sector_off, sector_size + 8);
- *sector = scsi_get_lba(iser_task->sc) + sector_off;
+ *sector = scsi_get_sector(iser_task->sc) + sector_off;
iser_err("PI error found type %d at sector %llx "
"expected %x vs actual %x\n",
diff --git a/drivers/infiniband/ulp/isert/Kconfig b/drivers/infiniband/ulp/isert/Kconfig
index 1a3f5ca8354c..798147adbef1 100644
--- a/drivers/infiniband/ulp/isert/Kconfig
+++ b/drivers/infiniband/ulp/isert/Kconfig
@@ -2,5 +2,5 @@
config INFINIBAND_ISERT
tristate "iSCSI Extensions for RDMA (iSER) target support"
depends on INET && INFINIBAND_ADDR_TRANS && TARGET_CORE && ISCSI_TARGET
- ---help---
+ help
Support for iSCSI Extensions for RDMA (iSER) Target on Infiniband fabrics.
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index a1a035270cab..b360a1527cd1 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -15,6 +15,7 @@
#include <linux/in.h>
#include <linux/in6.h>
#include <rdma/ib_verbs.h>
+#include <rdma/ib_cm.h>
#include <rdma/rdma_cm.h>
#include <target/target_core_base.h>
#include <target/target_core_fabric.h>
@@ -23,24 +24,30 @@
#include "ib_isert.h"
-#define ISERT_MAX_CONN 8
-#define ISER_MAX_RX_CQ_LEN (ISERT_QP_MAX_RECV_DTOS * ISERT_MAX_CONN)
-#define ISER_MAX_TX_CQ_LEN \
- ((ISERT_QP_MAX_REQ_DTOS + ISCSI_DEF_XMIT_CMDS_MAX) * ISERT_MAX_CONN)
-#define ISER_MAX_CQ_LEN (ISER_MAX_RX_CQ_LEN + ISER_MAX_TX_CQ_LEN + \
- ISERT_MAX_CONN)
-
static int isert_debug_level;
module_param_named(debug_level, isert_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:0)");
+static int isert_sg_tablesize_set(const char *val,
+ const struct kernel_param *kp);
+static const struct kernel_param_ops sg_tablesize_ops = {
+ .set = isert_sg_tablesize_set,
+ .get = param_get_int,
+};
+
+static int isert_sg_tablesize = ISCSI_ISER_MIN_SG_TABLESIZE;
+module_param_cb(sg_tablesize, &sg_tablesize_ops, &isert_sg_tablesize, 0644);
+MODULE_PARM_DESC(sg_tablesize,
+ "Number of gather/scatter entries in a single scsi command, should >= 128 (default: 128, max: 4096)");
+
static DEFINE_MUTEX(device_list_mutex);
static LIST_HEAD(device_list);
+static struct workqueue_struct *isert_login_wq;
static struct workqueue_struct *isert_comp_wq;
static struct workqueue_struct *isert_release_wq;
static int
-isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd);
+isert_put_response(struct iscsit_conn *conn, struct iscsit_cmd *cmd);
static int
isert_login_post_recv(struct isert_conn *isert_conn);
static int
@@ -53,6 +60,18 @@ static void isert_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void isert_login_send_done(struct ib_cq *cq, struct ib_wc *wc);
+static int isert_sg_tablesize_set(const char *val, const struct kernel_param *kp)
+{
+ int n = 0, ret;
+
+ ret = kstrtoint(val, 10, &n);
+ if (ret != 0 || n < ISCSI_ISER_MIN_SG_TABLESIZE ||
+ n > ISCSI_ISER_MAX_SG_TABLESIZE)
+ return -EINVAL;
+
+ return param_set_int(val, kp);
+}
+
static inline bool
isert_prot_cmd(struct isert_conn *conn, struct se_cmd *cmd)
{
@@ -60,7 +79,6 @@ isert_prot_cmd(struct isert_conn *conn, struct se_cmd *cmd)
cmd->prot_op != TARGET_PROT_NORMAL);
}
-
static void
isert_qp_event_callback(struct ib_event *e, void *context)
{
@@ -81,53 +99,34 @@ isert_qp_event_callback(struct ib_event *e, void *context)
}
}
-static struct isert_comp *
-isert_comp_get(struct isert_conn *isert_conn)
-{
- struct isert_device *device = isert_conn->device;
- struct isert_comp *comp;
- int i, min = 0;
-
- mutex_lock(&device_list_mutex);
- for (i = 0; i < device->comps_used; i++)
- if (device->comps[i].active_qps <
- device->comps[min].active_qps)
- min = i;
- comp = &device->comps[min];
- comp->active_qps++;
- mutex_unlock(&device_list_mutex);
-
- isert_info("conn %p, using comp %p min_index: %d\n",
- isert_conn, comp, min);
-
- return comp;
-}
-
-static void
-isert_comp_put(struct isert_comp *comp)
-{
- mutex_lock(&device_list_mutex);
- comp->active_qps--;
- mutex_unlock(&device_list_mutex);
-}
-
static struct ib_qp *
isert_create_qp(struct isert_conn *isert_conn,
- struct isert_comp *comp,
struct rdma_cm_id *cma_id)
{
+ u32 cq_size = ISERT_QP_MAX_REQ_DTOS + ISERT_QP_MAX_RECV_DTOS + 2;
struct isert_device *device = isert_conn->device;
+ struct ib_device *ib_dev = device->ib_device;
struct ib_qp_init_attr attr;
- int ret;
+ int ret, factor;
+
+ isert_conn->cq = ib_cq_pool_get(ib_dev, cq_size, -1, IB_POLL_WORKQUEUE);
+ if (IS_ERR(isert_conn->cq)) {
+ isert_err("Unable to allocate cq\n");
+ ret = PTR_ERR(isert_conn->cq);
+ return ERR_PTR(ret);
+ }
+ isert_conn->cq_size = cq_size;
memset(&attr, 0, sizeof(struct ib_qp_init_attr));
attr.event_handler = isert_qp_event_callback;
attr.qp_context = isert_conn;
- attr.send_cq = comp->cq;
- attr.recv_cq = comp->cq;
+ attr.send_cq = isert_conn->cq;
+ attr.recv_cq = isert_conn->cq;
attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS + 1;
attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1;
- attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX;
+ factor = rdma_rw_mr_factor(device->ib_device, cma_id->port_num,
+ isert_sg_tablesize);
+ attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX * factor;
attr.cap.max_send_sge = device->ib_device->attrs.max_send_sge;
attr.cap.max_recv_sge = 1;
attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -138,6 +137,8 @@ isert_create_qp(struct isert_conn *isert_conn,
ret = rdma_create_qp(cma_id, device->pd, &attr);
if (ret) {
isert_err("rdma_create_qp failed for cma_id %d\n", ret);
+ ib_cq_pool_put(isert_conn->cq, isert_conn->cq_size);
+
return ERR_PTR(ret);
}
@@ -145,25 +146,6 @@ isert_create_qp(struct isert_conn *isert_conn,
}
static int
-isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id)
-{
- struct isert_comp *comp;
- int ret;
-
- comp = isert_comp_get(isert_conn);
- isert_conn->qp = isert_create_qp(isert_conn, comp, cma_id);
- if (IS_ERR(isert_conn->qp)) {
- ret = PTR_ERR(isert_conn->qp);
- goto err;
- }
-
- return 0;
-err:
- isert_comp_put(comp);
- return ret;
-}
-
-static int
isert_alloc_rx_descriptors(struct isert_conn *isert_conn)
{
struct isert_device *device = isert_conn->device;
@@ -182,15 +164,15 @@ isert_alloc_rx_descriptors(struct isert_conn *isert_conn)
rx_desc = isert_conn->rx_descs;
for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) {
- dma_addr = ib_dma_map_single(ib_dev, (void *)rx_desc,
- ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+ dma_addr = ib_dma_map_single(ib_dev, rx_desc->buf,
+ ISER_RX_SIZE, DMA_FROM_DEVICE);
if (ib_dma_mapping_error(ib_dev, dma_addr))
goto dma_map_fail;
rx_desc->dma_addr = dma_addr;
rx_sg = &rx_desc->rx_sg;
- rx_sg->addr = rx_desc->dma_addr;
+ rx_sg->addr = rx_desc->dma_addr + isert_get_hdr_offset(rx_desc);
rx_sg->length = ISER_RX_PAYLOAD_SIZE;
rx_sg->lkey = device->pd->local_dma_lkey;
rx_desc->rx_cqe.done = isert_recv_done;
@@ -202,7 +184,7 @@ dma_map_fail:
rx_desc = isert_conn->rx_descs;
for (j = 0; j < i; j++, rx_desc++) {
ib_dma_unmap_single(ib_dev, rx_desc->dma_addr,
- ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+ ISER_RX_SIZE, DMA_FROM_DEVICE);
}
kfree(isert_conn->rx_descs);
isert_conn->rx_descs = NULL;
@@ -223,68 +205,13 @@ isert_free_rx_descriptors(struct isert_conn *isert_conn)
rx_desc = isert_conn->rx_descs;
for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) {
ib_dma_unmap_single(ib_dev, rx_desc->dma_addr,
- ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+ ISER_RX_SIZE, DMA_FROM_DEVICE);
}
kfree(isert_conn->rx_descs);
isert_conn->rx_descs = NULL;
}
-static void
-isert_free_comps(struct isert_device *device)
-{
- int i;
-
- for (i = 0; i < device->comps_used; i++) {
- struct isert_comp *comp = &device->comps[i];
-
- if (comp->cq)
- ib_free_cq(comp->cq);
- }
- kfree(device->comps);
-}
-
-static int
-isert_alloc_comps(struct isert_device *device)
-{
- int i, max_cqe, ret = 0;
-
- device->comps_used = min(ISERT_MAX_CQ, min_t(int, num_online_cpus(),
- device->ib_device->num_comp_vectors));
-
- isert_info("Using %d CQs, %s supports %d vectors support "
- "pi_capable %d\n",
- device->comps_used, dev_name(&device->ib_device->dev),
- device->ib_device->num_comp_vectors,
- device->pi_capable);
-
- device->comps = kcalloc(device->comps_used, sizeof(struct isert_comp),
- GFP_KERNEL);
- if (!device->comps)
- return -ENOMEM;
-
- max_cqe = min(ISER_MAX_CQ_LEN, device->ib_device->attrs.max_cqe);
-
- for (i = 0; i < device->comps_used; i++) {
- struct isert_comp *comp = &device->comps[i];
-
- comp->device = device;
- comp->cq = ib_alloc_cq(device->ib_device, comp, max_cqe, i,
- IB_POLL_WORKQUEUE);
- if (IS_ERR(comp->cq)) {
- isert_err("Unable to allocate cq\n");
- ret = PTR_ERR(comp->cq);
- comp->cq = NULL;
- goto out_cq;
- }
- }
-
- return 0;
-out_cq:
- isert_free_comps(device);
- return ret;
-}
-
static int
isert_create_device_ib_res(struct isert_device *device)
{
@@ -295,30 +222,21 @@ isert_create_device_ib_res(struct isert_device *device)
ib_dev->attrs.max_send_sge, ib_dev->attrs.max_recv_sge);
isert_dbg("devattr->max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd);
- ret = isert_alloc_comps(device);
- if (ret)
- goto out;
-
device->pd = ib_alloc_pd(ib_dev, 0);
if (IS_ERR(device->pd)) {
ret = PTR_ERR(device->pd);
isert_err("failed to allocate pd, device %p, ret=%d\n",
device, ret);
- goto out_cq;
+ return ret;
}
/* Check signature cap */
- device->pi_capable = ib_dev->attrs.device_cap_flags &
- IB_DEVICE_INTEGRITY_HANDOVER ? true : false;
+ if (ib_dev->attrs.kernel_cap_flags & IBK_INTEGRITY_HANDOVER)
+ device->pi_capable = true;
+ else
+ device->pi_capable = false;
return 0;
-
-out_cq:
- isert_free_comps(device);
-out:
- if (ret > 0)
- ret = -EINVAL;
- return ret;
}
static void
@@ -327,7 +245,6 @@ isert_free_device_ib_res(struct isert_device *device)
isert_info("device %p\n", device);
ib_dealloc_pd(device->pd);
- isert_free_comps(device);
}
static void
@@ -408,10 +325,9 @@ isert_free_login_buf(struct isert_conn *isert_conn)
ISER_RX_PAYLOAD_SIZE, DMA_TO_DEVICE);
kfree(isert_conn->login_rsp_buf);
- ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma,
- ISER_RX_PAYLOAD_SIZE,
- DMA_FROM_DEVICE);
- kfree(isert_conn->login_req_buf);
+ ib_dma_unmap_single(ib_dev, isert_conn->login_desc->dma_addr,
+ ISER_RX_SIZE, DMA_FROM_DEVICE);
+ kfree(isert_conn->login_desc);
}
static int
@@ -420,25 +336,25 @@ isert_alloc_login_buf(struct isert_conn *isert_conn,
{
int ret;
- isert_conn->login_req_buf = kzalloc(sizeof(*isert_conn->login_req_buf),
+ isert_conn->login_desc = kzalloc(sizeof(*isert_conn->login_desc),
GFP_KERNEL);
- if (!isert_conn->login_req_buf)
+ if (!isert_conn->login_desc)
return -ENOMEM;
- isert_conn->login_req_dma = ib_dma_map_single(ib_dev,
- isert_conn->login_req_buf,
- ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
- ret = ib_dma_mapping_error(ib_dev, isert_conn->login_req_dma);
+ isert_conn->login_desc->dma_addr = ib_dma_map_single(ib_dev,
+ isert_conn->login_desc->buf,
+ ISER_RX_SIZE, DMA_FROM_DEVICE);
+ ret = ib_dma_mapping_error(ib_dev, isert_conn->login_desc->dma_addr);
if (ret) {
- isert_err("login_req_dma mapping error: %d\n", ret);
- isert_conn->login_req_dma = 0;
- goto out_free_login_req_buf;
+ isert_err("login_desc dma mapping error: %d\n", ret);
+ isert_conn->login_desc->dma_addr = 0;
+ goto out_free_login_desc;
}
isert_conn->login_rsp_buf = kzalloc(ISER_RX_PAYLOAD_SIZE, GFP_KERNEL);
if (!isert_conn->login_rsp_buf) {
ret = -ENOMEM;
- goto out_unmap_login_req_buf;
+ goto out_unmap_login_desc;
}
isert_conn->login_rsp_dma = ib_dma_map_single(ib_dev,
@@ -455,11 +371,11 @@ isert_alloc_login_buf(struct isert_conn *isert_conn,
out_free_login_rsp_buf:
kfree(isert_conn->login_rsp_buf);
-out_unmap_login_req_buf:
- ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma,
- ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
-out_free_login_req_buf:
- kfree(isert_conn->login_req_buf);
+out_unmap_login_desc:
+ ib_dma_unmap_single(ib_dev, isert_conn->login_desc->dma_addr,
+ ISER_RX_SIZE, DMA_FROM_DEVICE);
+out_free_login_desc:
+ kfree(isert_conn->login_desc);
return ret;
}
@@ -489,6 +405,13 @@ isert_set_nego_params(struct isert_conn *isert_conn,
}
}
+static void
+isert_destroy_qp(struct isert_conn *isert_conn)
+{
+ ib_destroy_qp(isert_conn->qp);
+ ib_cq_pool_put(isert_conn->cq, isert_conn->cq_size);
+}
+
static int
isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
{
@@ -502,7 +425,7 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
if (!np->enabled) {
spin_unlock_bh(&np->np_thread_lock);
isert_dbg("iscsi_np is not enabled, reject connect request\n");
- return rdma_reject(cma_id, NULL, 0);
+ return rdma_reject(cma_id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED);
}
spin_unlock_bh(&np->np_thread_lock);
@@ -516,30 +439,32 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
isert_init_conn(isert_conn);
isert_conn->cm_id = cma_id;
- ret = isert_alloc_login_buf(isert_conn, cma_id->device);
- if (ret)
- goto out;
-
device = isert_device_get(cma_id);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
- goto out_rsp_dma_map;
+ goto out;
}
isert_conn->device = device;
- isert_set_nego_params(isert_conn, &event->param.conn);
-
- ret = isert_conn_setup_qp(isert_conn, cma_id);
+ ret = isert_alloc_login_buf(isert_conn, cma_id->device);
if (ret)
goto out_conn_dev;
+ isert_set_nego_params(isert_conn, &event->param.conn);
+
+ isert_conn->qp = isert_create_qp(isert_conn, cma_id);
+ if (IS_ERR(isert_conn->qp)) {
+ ret = PTR_ERR(isert_conn->qp);
+ goto out_rsp_dma_map;
+ }
+
ret = isert_login_post_recv(isert_conn);
if (ret)
- goto out_conn_dev;
+ goto out_destroy_qp;
ret = isert_rdma_accept(isert_conn);
if (ret)
- goto out_conn_dev;
+ goto out_destroy_qp;
mutex_lock(&isert_np->mutex);
list_add_tail(&isert_conn->node, &isert_np->accepted);
@@ -547,13 +472,15 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
return 0;
-out_conn_dev:
- isert_device_put(device);
+out_destroy_qp:
+ isert_destroy_qp(isert_conn);
out_rsp_dma_map:
isert_free_login_buf(isert_conn);
+out_conn_dev:
+ isert_device_put(device);
out:
kfree(isert_conn);
- rdma_reject(cma_id, NULL, 0);
+ rdma_reject(cma_id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED);
return ret;
}
@@ -571,14 +498,10 @@ isert_connect_release(struct isert_conn *isert_conn)
!isert_conn->dev_removed)
rdma_destroy_id(isert_conn->cm_id);
- if (isert_conn->qp) {
- struct isert_comp *comp = isert_conn->qp->recv_cq->cq_context;
-
- isert_comp_put(comp);
- ib_destroy_qp(isert_conn->qp);
- }
+ if (isert_conn->qp)
+ isert_destroy_qp(isert_conn);
- if (isert_conn->login_req_buf)
+ if (isert_conn->login_desc)
isert_free_login_buf(isert_conn);
isert_device_put(device);
@@ -766,8 +689,8 @@ isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
case RDMA_CM_EVENT_ESTABLISHED:
isert_connected_handler(cma_id);
break;
- case RDMA_CM_EVENT_ADDR_CHANGE: /* FALLTHRU */
- case RDMA_CM_EVENT_DISCONNECTED: /* FALLTHRU */
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ case RDMA_CM_EVENT_DISCONNECTED:
case RDMA_CM_EVENT_TIMEWAIT_EXIT: /* FALLTHRU */
ret = isert_disconnected_handler(cma_id, event->event);
break;
@@ -786,7 +709,7 @@ isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
case RDMA_CM_EVENT_REJECTED:
isert_info("Connection rejected: %s\n",
rdma_reject_msg(cma_id, event->status));
- /* fall through */
+ fallthrough;
case RDMA_CM_EVENT_UNREACHABLE:
case RDMA_CM_EVENT_CONNECT_ERROR:
ret = isert_connect_error(cma_id);
@@ -964,17 +887,18 @@ isert_login_post_recv(struct isert_conn *isert_conn)
int ret;
memset(&sge, 0, sizeof(struct ib_sge));
- sge.addr = isert_conn->login_req_dma;
+ sge.addr = isert_conn->login_desc->dma_addr +
+ isert_get_hdr_offset(isert_conn->login_desc);
sge.length = ISER_RX_PAYLOAD_SIZE;
sge.lkey = isert_conn->device->pd->local_dma_lkey;
isert_dbg("Setup sge: addr: %llx length: %d 0x%08x\n",
sge.addr, sge.length, sge.lkey);
- isert_conn->login_req_buf->rx_cqe.done = isert_login_recv_done;
+ isert_conn->login_desc->rx_cqe.done = isert_login_recv_done;
memset(&rx_wr, 0, sizeof(struct ib_recv_wr));
- rx_wr.wr_cqe = &isert_conn->login_req_buf->rx_cqe;
+ rx_wr.wr_cqe = &isert_conn->login_desc->rx_cqe;
rx_wr.sg_list = &sge;
rx_wr.num_sge = 1;
@@ -986,7 +910,7 @@ isert_login_post_recv(struct isert_conn *isert_conn)
}
static int
-isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login,
+isert_put_login_tx(struct iscsit_conn *conn, struct iscsi_login *login,
u32 length)
{
struct isert_conn *isert_conn = conn->context;
@@ -1051,9 +975,9 @@ post_send:
static void
isert_rx_login_req(struct isert_conn *isert_conn)
{
- struct iser_rx_desc *rx_desc = isert_conn->login_req_buf;
+ struct iser_rx_desc *rx_desc = isert_conn->login_desc;
int rx_buflen = isert_conn->login_req_len;
- struct iscsi_conn *conn = isert_conn->conn;
+ struct iscsit_conn *conn = isert_conn->conn;
struct iscsi_login *login = conn->conn_login;
int size;
@@ -1063,7 +987,7 @@ isert_rx_login_req(struct isert_conn *isert_conn)
if (login->first_request) {
struct iscsi_login_req *login_req =
- (struct iscsi_login_req *)&rx_desc->iscsi_header;
+ (struct iscsi_login_req *)isert_get_iscsi_hdr(rx_desc);
/*
* Setup the initial iscsi_login values from the leading
* login request PDU.
@@ -1082,36 +1006,36 @@ isert_rx_login_req(struct isert_conn *isert_conn)
login->tsih = be16_to_cpu(login_req->tsih);
}
- memcpy(&login->req[0], (void *)&rx_desc->iscsi_header, ISCSI_HDR_LEN);
+ memcpy(&login->req[0], isert_get_iscsi_hdr(rx_desc), ISCSI_HDR_LEN);
size = min(rx_buflen, MAX_KEY_VALUE_PAIRS);
isert_dbg("Using login payload size: %d, rx_buflen: %d "
"MAX_KEY_VALUE_PAIRS: %d\n", size, rx_buflen,
MAX_KEY_VALUE_PAIRS);
- memcpy(login->req_buf, &rx_desc->data[0], size);
+ memcpy(login->req_buf, isert_get_data(rx_desc), size);
if (login->first_request) {
complete(&isert_conn->login_comp);
return;
}
- schedule_delayed_work(&conn->login_work, 0);
+ queue_delayed_work(isert_login_wq, &conn->login_work, 0);
}
-static struct iscsi_cmd
-*isert_allocate_cmd(struct iscsi_conn *conn, struct iser_rx_desc *rx_desc)
+static struct iscsit_cmd
+*isert_allocate_cmd(struct iscsit_conn *conn, struct iser_rx_desc *rx_desc)
{
struct isert_conn *isert_conn = conn->context;
struct isert_cmd *isert_cmd;
- struct iscsi_cmd *cmd;
+ struct iscsit_cmd *cmd;
cmd = iscsit_allocate_cmd(conn, TASK_INTERRUPTIBLE);
if (!cmd) {
- isert_err("Unable to allocate iscsi_cmd + isert_cmd\n");
+ isert_err("Unable to allocate iscsit_cmd + isert_cmd\n");
return NULL;
}
isert_cmd = iscsit_priv_cmd(cmd);
isert_cmd->conn = isert_conn;
- isert_cmd->iscsi_cmd = cmd;
+ isert_cmd->iscsit_cmd = cmd;
isert_cmd->rx_desc = rx_desc;
return cmd;
@@ -1119,10 +1043,10 @@ static struct iscsi_cmd
static int
isert_handle_scsi_cmd(struct isert_conn *isert_conn,
- struct isert_cmd *isert_cmd, struct iscsi_cmd *cmd,
+ struct isert_cmd *isert_cmd, struct iscsit_cmd *cmd,
struct iser_rx_desc *rx_desc, unsigned char *buf)
{
- struct iscsi_conn *conn = isert_conn->conn;
+ struct iscsit_conn *conn = isert_conn->conn;
struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)buf;
int imm_data, imm_data_len, unsol_data, sg_nents, rc;
bool dump_payload = false;
@@ -1153,14 +1077,15 @@ isert_handle_scsi_cmd(struct isert_conn *isert_conn,
if (imm_data_len != data_len) {
sg_nents = max(1UL, DIV_ROUND_UP(imm_data_len, PAGE_SIZE));
sg_copy_from_buffer(cmd->se_cmd.t_data_sg, sg_nents,
- &rx_desc->data[0], imm_data_len);
+ isert_get_data(rx_desc), imm_data_len);
isert_dbg("Copy Immediate sg_nents: %u imm_data_len: %d\n",
sg_nents, imm_data_len);
} else {
sg_init_table(&isert_cmd->sg, 1);
cmd->se_cmd.t_data_sg = &isert_cmd->sg;
cmd->se_cmd.t_data_nents = 1;
- sg_set_buf(&isert_cmd->sg, &rx_desc->data[0], imm_data_len);
+ sg_set_buf(&isert_cmd->sg, isert_get_data(rx_desc),
+ imm_data_len);
isert_dbg("Transfer Immediate imm_data_len: %d\n",
imm_data_len);
}
@@ -1177,7 +1102,7 @@ isert_handle_scsi_cmd(struct isert_conn *isert_conn,
sequence_cmd:
rc = iscsit_sequence_cmd(conn, cmd, buf, hdr->cmdsn);
- if (!rc && dump_payload == false && unsol_data)
+ if (!rc && !dump_payload && unsol_data)
iscsit_set_unsolicited_dataout(cmd);
else if (dump_payload && imm_data)
target_put_sess_cmd(&cmd->se_cmd);
@@ -1190,8 +1115,8 @@ isert_handle_iscsi_dataout(struct isert_conn *isert_conn,
struct iser_rx_desc *rx_desc, unsigned char *buf)
{
struct scatterlist *sg_start;
- struct iscsi_conn *conn = isert_conn->conn;
- struct iscsi_cmd *cmd = NULL;
+ struct iscsit_conn *conn = isert_conn->conn;
+ struct iscsit_cmd *cmd = NULL;
struct iscsi_data *hdr = (struct iscsi_data *)buf;
u32 unsol_data_len = ntoh24(hdr->dlength);
int rc, sg_nents, sg_off, page_off;
@@ -1229,9 +1154,9 @@ isert_handle_iscsi_dataout(struct isert_conn *isert_conn,
}
isert_dbg("Copying DataOut: sg_start: %p, sg_off: %u "
"sg_nents: %u from %p %u\n", sg_start, sg_off,
- sg_nents, &rx_desc->data[0], unsol_data_len);
+ sg_nents, isert_get_data(rx_desc), unsol_data_len);
- sg_copy_from_buffer(sg_start, sg_nents, &rx_desc->data[0],
+ sg_copy_from_buffer(sg_start, sg_nents, isert_get_data(rx_desc),
unsol_data_len);
rc = iscsit_check_dataout_payload(cmd, hdr, false);
@@ -1242,20 +1167,15 @@ isert_handle_iscsi_dataout(struct isert_conn *isert_conn,
* multiple data-outs on the same command can arrive -
* so post the buffer before hand
*/
- rc = isert_post_recv(isert_conn, rx_desc);
- if (rc) {
- isert_err("ib_post_recv failed with %d\n", rc);
- return rc;
- }
- return 0;
+ return isert_post_recv(isert_conn, rx_desc);
}
static int
isert_handle_nop_out(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
- struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc,
+ struct iscsit_cmd *cmd, struct iser_rx_desc *rx_desc,
unsigned char *buf)
{
- struct iscsi_conn *conn = isert_conn->conn;
+ struct iscsit_conn *conn = isert_conn->conn;
struct iscsi_nopout *hdr = (struct iscsi_nopout *)buf;
int rc;
@@ -1271,10 +1191,10 @@ isert_handle_nop_out(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
static int
isert_handle_text_cmd(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
- struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc,
+ struct iscsit_cmd *cmd, struct iser_rx_desc *rx_desc,
struct iscsi_text *hdr)
{
- struct iscsi_conn *conn = isert_conn->conn;
+ struct iscsit_conn *conn = isert_conn->conn;
u32 payload_length = ntoh24(hdr->dlength);
int rc;
unsigned char *text_in = NULL;
@@ -1290,7 +1210,7 @@ isert_handle_text_cmd(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd
}
cmd->text_in_ptr = text_in;
- memcpy(cmd->text_in_ptr, &rx_desc->data[0], payload_length);
+ memcpy(cmd->text_in_ptr, isert_get_data(rx_desc), payload_length);
return iscsit_process_text_cmd(conn, cmd, hdr);
}
@@ -1300,9 +1220,9 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc,
uint32_t read_stag, uint64_t read_va,
uint32_t write_stag, uint64_t write_va)
{
- struct iscsi_hdr *hdr = &rx_desc->iscsi_header;
- struct iscsi_conn *conn = isert_conn->conn;
- struct iscsi_cmd *cmd;
+ struct iscsi_hdr *hdr = isert_get_iscsi_hdr(rx_desc);
+ struct iscsit_conn *conn = isert_conn->conn;
+ struct iscsit_cmd *cmd;
struct isert_cmd *isert_cmd;
int ret = -EINVAL;
u8 opcode = (hdr->opcode & ISCSI_OPCODE_MASK);
@@ -1398,8 +1318,8 @@ isert_recv_done(struct ib_cq *cq, struct ib_wc *wc)
struct isert_conn *isert_conn = wc->qp->qp_context;
struct ib_device *ib_dev = isert_conn->cm_id->device;
struct iser_rx_desc *rx_desc = cqe_to_rx_desc(wc->wr_cqe);
- struct iscsi_hdr *hdr = &rx_desc->iscsi_header;
- struct iser_ctrl *iser_ctrl = &rx_desc->iser_header;
+ struct iscsi_hdr *hdr = isert_get_iscsi_hdr(rx_desc);
+ struct iser_ctrl *iser_ctrl = isert_get_iser_hdr(rx_desc);
uint64_t read_va = 0, write_va = 0;
uint32_t read_stag = 0, write_stag = 0;
@@ -1413,7 +1333,7 @@ isert_recv_done(struct ib_cq *cq, struct ib_wc *wc)
rx_desc->in_use = true;
ib_dma_sync_single_for_cpu(ib_dev, rx_desc->dma_addr,
- ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+ ISER_RX_SIZE, DMA_FROM_DEVICE);
isert_dbg("DMA: 0x%llx, iSCSI opcode: 0x%02x, ITT: 0x%08x, flags: 0x%02x dlen: %d\n",
rx_desc->dma_addr, hdr->opcode, hdr->itt, hdr->flags,
@@ -1448,7 +1368,7 @@ isert_recv_done(struct ib_cq *cq, struct ib_wc *wc)
read_stag, read_va, write_stag, write_va);
ib_dma_sync_single_for_device(ib_dev, rx_desc->dma_addr,
- ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+ ISER_RX_SIZE, DMA_FROM_DEVICE);
}
static void
@@ -1462,8 +1382,8 @@ isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc)
return;
}
- ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_req_dma,
- ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+ ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_desc->dma_addr,
+ ISER_RX_SIZE, DMA_FROM_DEVICE);
isert_conn->login_req_len = wc->byte_len - ISER_HEADERS_LEN;
@@ -1478,14 +1398,14 @@ isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc)
complete(&isert_conn->login_req_comp);
mutex_unlock(&isert_conn->mutex);
- ib_dma_sync_single_for_device(ib_dev, isert_conn->login_req_dma,
- ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+ ib_dma_sync_single_for_device(ib_dev, isert_conn->login_desc->dma_addr,
+ ISER_RX_SIZE, DMA_FROM_DEVICE);
}
static void
isert_rdma_rw_ctx_destroy(struct isert_cmd *cmd, struct isert_conn *conn)
{
- struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd;
+ struct se_cmd *se_cmd = &cmd->iscsit_cmd->se_cmd;
enum dma_data_direction dir = target_reverse_dma_direction(se_cmd);
if (!cmd->rw.nr_ops)
@@ -1507,9 +1427,9 @@ isert_rdma_rw_ctx_destroy(struct isert_cmd *cmd, struct isert_conn *conn)
static void
isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err)
{
- struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+ struct iscsit_cmd *cmd = isert_cmd->iscsit_cmd;
struct isert_conn *isert_conn = isert_cmd->conn;
- struct iscsi_conn *conn = isert_conn->conn;
+ struct iscsit_conn *conn = isert_conn->conn;
struct iscsi_text_rsp *hdr;
isert_dbg("Cmd %p\n", isert_cmd);
@@ -1572,7 +1492,7 @@ isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err)
transport_generic_free_cmd(&cmd->se_cmd, 0);
break;
}
- /* fall through */
+ fallthrough;
default:
iscsit_release_cmd(cmd);
break;
@@ -1634,12 +1554,12 @@ isert_check_pi_status(struct se_cmd *se_cmd, struct ib_mr *sig_mr)
}
sec_offset_err = mr_status.sig_err.sig_err_offset;
do_div(sec_offset_err, block_size);
- se_cmd->bad_sector = sec_offset_err + se_cmd->t_task_lba;
+ se_cmd->sense_info = sec_offset_err + se_cmd->t_task_lba;
isert_err("PI error found type %d at sector 0x%llx "
"expected 0x%x vs actual 0x%x\n",
mr_status.sig_err.err_type,
- (unsigned long long)se_cmd->bad_sector,
+ (unsigned long long)se_cmd->sense_info,
mr_status.sig_err.expected,
mr_status.sig_err.actual);
ret = 1;
@@ -1656,7 +1576,7 @@ isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
struct isert_device *device = isert_conn->device;
struct iser_tx_desc *desc = cqe_to_tx_desc(wc->wr_cqe);
struct isert_cmd *isert_cmd = tx_desc_to_cmd(desc);
- struct se_cmd *cmd = &isert_cmd->iscsi_cmd->se_cmd;
+ struct se_cmd *cmd = &isert_cmd->iscsit_cmd->se_cmd;
int ret = 0;
if (unlikely(wc->status != IB_WC_SUCCESS)) {
@@ -1685,7 +1605,7 @@ isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
/*
* XXX: isert_put_response() failure is not retried.
*/
- ret = isert_put_response(isert_conn->conn, isert_cmd->iscsi_cmd);
+ ret = isert_put_response(isert_conn->conn, isert_cmd->iscsit_cmd);
if (ret)
pr_warn_ratelimited("isert_put_response() ret: %d\n", ret);
}
@@ -1698,7 +1618,7 @@ isert_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
struct isert_device *device = isert_conn->device;
struct iser_tx_desc *desc = cqe_to_tx_desc(wc->wr_cqe);
struct isert_cmd *isert_cmd = tx_desc_to_cmd(desc);
- struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+ struct iscsit_cmd *cmd = isert_cmd->iscsit_cmd;
struct se_cmd *se_cmd = &cmd->se_cmd;
int ret = 0;
@@ -1743,14 +1663,14 @@ isert_do_control_comp(struct work_struct *work)
struct isert_cmd, comp_work);
struct isert_conn *isert_conn = isert_cmd->conn;
struct ib_device *ib_dev = isert_conn->cm_id->device;
- struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+ struct iscsit_cmd *cmd = isert_cmd->iscsit_cmd;
isert_dbg("Cmd %p i_state %d\n", isert_cmd, cmd->i_state);
switch (cmd->i_state) {
case ISTATE_SEND_TASKMGTRSP:
iscsit_tmr_post_handler(cmd, cmd->conn);
- /* fall through */
+ fallthrough;
case ISTATE_SEND_REJECT:
case ISTATE_SEND_TEXTRSP:
cmd->i_state = ISTATE_SENT_STATUS;
@@ -1801,7 +1721,7 @@ isert_send_done(struct ib_cq *cq, struct ib_wc *wc)
isert_dbg("Cmd %p\n", isert_cmd);
- switch (isert_cmd->iscsi_cmd->i_state) {
+ switch (isert_cmd->iscsit_cmd->i_state) {
case ISTATE_SEND_TASKMGTRSP:
case ISTATE_SEND_LOGOUTRSP:
case ISTATE_SEND_REJECT:
@@ -1812,7 +1732,7 @@ isert_send_done(struct ib_cq *cq, struct ib_wc *wc)
queue_work(isert_comp_wq, &isert_cmd->comp_work);
return;
default:
- isert_cmd->iscsi_cmd->i_state = ISTATE_SENT_STATUS;
+ isert_cmd->iscsit_cmd->i_state = ISTATE_SENT_STATUS;
isert_completion_put(tx_desc, isert_cmd, ib_dev, false);
break;
}
@@ -1824,10 +1744,8 @@ isert_post_response(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd)
int ret;
ret = isert_post_recv(isert_conn, isert_cmd->rx_desc);
- if (ret) {
- isert_err("ib_post_recv failed with %d\n", ret);
+ if (ret)
return ret;
- }
ret = ib_post_send(isert_conn->qp, &isert_cmd->tx_desc.send_wr, NULL);
if (ret) {
@@ -1838,7 +1756,7 @@ isert_post_response(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd)
}
static int
-isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+isert_put_response(struct iscsit_conn *conn, struct iscsit_cmd *cmd)
{
struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
struct isert_conn *isert_conn = conn->context;
@@ -1889,7 +1807,7 @@ isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
}
static void
-isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+isert_aborted_task(struct iscsit_conn *conn, struct iscsit_cmd *cmd)
{
struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
struct isert_conn *isert_conn = conn->context;
@@ -1905,7 +1823,7 @@ isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
}
static enum target_prot_op
-isert_get_sup_prot_ops(struct iscsi_conn *conn)
+isert_get_sup_prot_ops(struct iscsit_conn *conn)
{
struct isert_conn *isert_conn = conn->context;
struct isert_device *device = isert_conn->device;
@@ -1925,7 +1843,7 @@ isert_get_sup_prot_ops(struct iscsi_conn *conn)
}
static int
-isert_put_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
+isert_put_nopin(struct iscsit_cmd *cmd, struct iscsit_conn *conn,
bool nopout_response)
{
struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
@@ -1945,7 +1863,7 @@ isert_put_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
}
static int
-isert_put_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
+isert_put_logout_rsp(struct iscsit_cmd *cmd, struct iscsit_conn *conn)
{
struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
struct isert_conn *isert_conn = conn->context;
@@ -1963,7 +1881,7 @@ isert_put_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
}
static int
-isert_put_tm_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
+isert_put_tm_rsp(struct iscsit_cmd *cmd, struct iscsit_conn *conn)
{
struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
struct isert_conn *isert_conn = conn->context;
@@ -1981,7 +1899,7 @@ isert_put_tm_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
}
static int
-isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
+isert_put_reject(struct iscsit_cmd *cmd, struct iscsit_conn *conn)
{
struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
struct isert_conn *isert_conn = conn->context;
@@ -2016,7 +1934,7 @@ isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
}
static int
-isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
+isert_put_text_rsp(struct iscsit_cmd *cmd, struct iscsit_conn *conn)
{
struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
struct isert_conn *isert_conn = conn->context;
@@ -2076,7 +1994,7 @@ isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_domain *domain)
if (se_cmd->prot_type == TARGET_DIF_TYPE1_PROT ||
se_cmd->prot_type == TARGET_DIF_TYPE2_PROT)
domain->sig.dif.ref_remap = true;
-};
+}
static int
isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs)
@@ -2118,7 +2036,7 @@ static int
isert_rdma_rw_ctx_post(struct isert_cmd *cmd, struct isert_conn *conn,
struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
{
- struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd;
+ struct se_cmd *se_cmd = &cmd->iscsit_cmd->se_cmd;
enum dma_data_direction dir = target_reverse_dma_direction(se_cmd);
u8 port_num = conn->cm_id->port_num;
u64 addr;
@@ -2131,7 +2049,7 @@ isert_rdma_rw_ctx_post(struct isert_cmd *cmd, struct isert_conn *conn,
if (dir == DMA_FROM_DEVICE) {
addr = cmd->write_va;
rkey = cmd->write_stag;
- offset = cmd->iscsi_cmd->write_data_done;
+ offset = cmd->iscsit_cmd->write_data_done;
} else {
addr = cmd->read_va;
rkey = cmd->read_stag;
@@ -2171,7 +2089,7 @@ rdma_ctx_post:
}
static int
-isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+isert_put_datain(struct iscsit_conn *conn, struct iscsit_cmd *cmd)
{
struct se_cmd *se_cmd = &cmd->se_cmd;
struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
@@ -2199,10 +2117,8 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
&isert_cmd->tx_desc.send_wr);
rc = isert_post_recv(isert_conn, isert_cmd->rx_desc);
- if (rc) {
- isert_err("ib_post_recv failed with %d\n", rc);
+ if (rc)
return rc;
- }
chain_wr = &isert_cmd->tx_desc.send_wr;
}
@@ -2214,7 +2130,7 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
}
static int
-isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery)
+isert_get_dataout(struct iscsit_conn *conn, struct iscsit_cmd *cmd, bool recovery)
{
struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
int ret;
@@ -2232,7 +2148,7 @@ isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery)
}
static int
-isert_immediate_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state)
+isert_immediate_queue(struct iscsit_conn *conn, struct iscsit_cmd *cmd, int state)
{
struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
int ret = 0;
@@ -2257,7 +2173,7 @@ isert_immediate_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state)
}
static int
-isert_response_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state)
+isert_response_queue(struct iscsit_conn *conn, struct iscsit_cmd *cmd, int state)
{
struct isert_conn *isert_conn = conn->context;
int ret;
@@ -2316,6 +2232,16 @@ isert_setup_id(struct isert_np *isert_np)
}
isert_dbg("id %p context %p\n", id, id->context);
+ /*
+ * Allow both IPv4 and IPv6 sockets to bind a single port
+ * at the same time.
+ */
+ ret = rdma_set_afonly(id, 1);
+ if (ret) {
+ isert_err("rdma_set_afonly() failed: %d\n", ret);
+ goto out_id;
+ }
+
ret = rdma_bind_addr(id, sa);
if (ret) {
isert_err("rdma_bind_addr() failed: %d\n", ret);
@@ -2407,7 +2333,7 @@ isert_rdma_accept(struct isert_conn *isert_conn)
}
static int
-isert_get_login_rx(struct iscsi_conn *conn, struct iscsi_login *login)
+isert_get_login_rx(struct iscsit_conn *conn, struct iscsi_login *login)
{
struct isert_conn *isert_conn = conn->context;
int ret;
@@ -2423,9 +2349,9 @@ isert_get_login_rx(struct iscsi_conn *conn, struct iscsi_login *login)
/*
* For login requests after the first PDU, isert_rx_login_req() will
- * kick schedule_delayed_work(&conn->login_work) as the packet is
- * received, which turns this callback from iscsi_target_do_login_rx()
- * into a NOP.
+ * kick queue_delayed_work(isert_login_wq, &conn->login_work) as
+ * the packet is received, which turns this callback from
+ * iscsi_target_do_login_rx() into a NOP.
*/
if (!login->first_request)
return 0;
@@ -2443,7 +2369,7 @@ isert_get_login_rx(struct iscsi_conn *conn, struct iscsi_login *login)
}
static void
-isert_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn,
+isert_set_conn_info(struct iscsi_np *np, struct iscsit_conn *conn,
struct isert_conn *isert_conn)
{
struct rdma_cm_id *cm_id = isert_conn->cm_id;
@@ -2456,7 +2382,7 @@ isert_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn,
}
static int
-isert_accept_np(struct iscsi_np *np, struct iscsi_conn *conn)
+isert_accept_np(struct iscsi_np *np, struct iscsit_conn *conn)
{
struct isert_np *isert_np = np->np_context;
struct isert_conn *isert_conn;
@@ -2472,10 +2398,10 @@ accept_wait:
spin_unlock_bh(&np->np_thread_lock);
isert_dbg("np_thread_state %d\n",
np->np_thread_state);
- /**
+ /*
* No point in stalling here when np_thread
* is in state RESET/SHUTDOWN/EXIT - bail
- **/
+ */
return -ENODEV;
}
spin_unlock_bh(&np->np_thread_lock);
@@ -2564,7 +2490,7 @@ static void isert_release_work(struct work_struct *work)
static void
isert_wait4logout(struct isert_conn *isert_conn)
{
- struct iscsi_conn *conn = isert_conn->conn;
+ struct iscsit_conn *conn = isert_conn->conn;
isert_info("conn %p\n", isert_conn);
@@ -2576,12 +2502,12 @@ isert_wait4logout(struct isert_conn *isert_conn)
}
static void
-isert_wait4cmds(struct iscsi_conn *conn)
+isert_wait4cmds(struct iscsit_conn *conn)
{
- isert_info("iscsi_conn %p\n", conn);
+ isert_info("iscsit_conn %p\n", conn);
if (conn->sess) {
- target_sess_cmd_list_set_waiting(conn->sess->se_sess);
+ target_stop_session(conn->sess->se_sess);
target_wait_for_sess_cmds(conn->sess->se_sess);
}
}
@@ -2596,9 +2522,9 @@ isert_wait4cmds(struct iscsi_conn *conn)
* before blocking on the target_wait_for_session_cmds
*/
static void
-isert_put_unsol_pending_cmds(struct iscsi_conn *conn)
+isert_put_unsol_pending_cmds(struct iscsit_conn *conn)
{
- struct iscsi_cmd *cmd, *tmp;
+ struct iscsit_cmd *cmd, *tmp;
static LIST_HEAD(drop_cmd_list);
spin_lock_bh(&conn->cmd_lock);
@@ -2621,7 +2547,7 @@ isert_put_unsol_pending_cmds(struct iscsi_conn *conn)
}
}
-static void isert_wait_conn(struct iscsi_conn *conn)
+static void isert_wait_conn(struct iscsit_conn *conn)
{
struct isert_conn *isert_conn = conn->context;
@@ -2639,7 +2565,7 @@ static void isert_wait_conn(struct iscsi_conn *conn)
queue_work(isert_release_wq, &isert_conn->release_work);
}
-static void isert_free_conn(struct iscsi_conn *conn)
+static void isert_free_conn(struct iscsit_conn *conn)
{
struct isert_conn *isert_conn = conn->context;
@@ -2647,7 +2573,7 @@ static void isert_free_conn(struct iscsi_conn *conn)
isert_put_conn(isert_conn);
}
-static void isert_get_rx_pdu(struct iscsi_conn *conn)
+static void isert_get_rx_pdu(struct iscsit_conn *conn)
{
struct completion comp;
@@ -2681,20 +2607,23 @@ static struct iscsit_transport iser_target_transport = {
static int __init isert_init(void)
{
- int ret;
+ isert_login_wq = alloc_workqueue("isert_login_wq", 0, 0);
+ if (!isert_login_wq) {
+ isert_err("Unable to allocate isert_login_wq\n");
+ return -ENOMEM;
+ }
isert_comp_wq = alloc_workqueue("isert_comp_wq",
WQ_UNBOUND | WQ_HIGHPRI, 0);
if (!isert_comp_wq) {
isert_err("Unable to allocate isert_comp_wq\n");
- return -ENOMEM;
+ goto destroy_login_wq;
}
isert_release_wq = alloc_workqueue("isert_release_wq", WQ_UNBOUND,
WQ_UNBOUND_MAX_ACTIVE);
if (!isert_release_wq) {
isert_err("Unable to allocate isert_release_wq\n");
- ret = -ENOMEM;
goto destroy_comp_wq;
}
@@ -2705,17 +2634,20 @@ static int __init isert_init(void)
destroy_comp_wq:
destroy_workqueue(isert_comp_wq);
+destroy_login_wq:
+ destroy_workqueue(isert_login_wq);
- return ret;
+ return -ENOMEM;
}
static void __exit isert_exit(void)
{
- flush_scheduled_work();
+ flush_workqueue(isert_login_wq);
destroy_workqueue(isert_release_wq);
destroy_workqueue(isert_comp_wq);
iscsit_unregister_transport(&iser_target_transport);
isert_info("iSER_TARGET[0] - Released iser_target_transport\n");
+ destroy_workqueue(isert_login_wq);
}
MODULE_DESCRIPTION("iSER-Target for mainline target infrastructure");
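
The ib_isert.c changes above drop the driver's private completion-context array in favour of a per-connection CQ taken from the core CQ pool, released again when the QP goes away. The schematic below shows only that acquire/release pairing, using the in-tree ib_cq_pool_get()/ib_cq_pool_put(), rdma_create_qp() and ib_destroy_qp() APIs; the my_conn structure, WR counts and SGE limits are placeholders, not the driver's actual values.

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>

struct my_conn {
	struct ib_cq *cq;
	u32 cq_size;
	struct ib_qp *qp;
};

/* Acquire a pooled CQ sized for this connection, then hang the QP off it. */
static int my_conn_create_qp(struct my_conn *conn, struct rdma_cm_id *cm_id,
			     struct ib_pd *pd, u32 send_wrs, u32 recv_wrs)
{
	struct ib_qp_init_attr attr = {};
	int ret;

	conn->cq_size = send_wrs + recv_wrs;
	conn->cq = ib_cq_pool_get(cm_id->device, conn->cq_size, -1,
				  IB_POLL_WORKQUEUE);
	if (IS_ERR(conn->cq))
		return PTR_ERR(conn->cq);

	attr.send_cq = conn->cq;
	attr.recv_cq = conn->cq;
	attr.cap.max_send_wr = send_wrs;
	attr.cap.max_recv_wr = recv_wrs;
	attr.cap.max_send_sge = 1;
	attr.cap.max_recv_sge = 1;
	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	attr.qp_type = IB_QPT_RC;

	ret = rdma_create_qp(cm_id, pd, &attr);
	if (ret) {
		/* on failure the CQ capacity must go back to the pool */
		ib_cq_pool_put(conn->cq, conn->cq_size);
		return ret;
	}
	conn->qp = cm_id->qp;
	return 0;
}

static void my_conn_destroy_qp(struct my_conn *conn)
{
	ib_destroy_qp(conn->qp);
	ib_cq_pool_put(conn->cq, conn->cq_size);
}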
diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h
index 3b296bac4f60..0b2dfd6e7e27 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.h
+++ b/drivers/infiniband/ulp/isert/ib_isert.h
@@ -59,11 +59,17 @@
ISERT_MAX_TX_MISC_PDUS + \
ISERT_MAX_RX_MISC_PDUS)
-#define ISER_RX_PAD_SIZE (ISCSI_DEF_MAX_RECV_SEG_LEN + 4096 - \
- (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge) + \
- sizeof(struct ib_cqe) + sizeof(bool)))
+/*
+ * RX size is default of 8k plus headers, but data needs to align to
+ * 512 boundary, so use 1024 to have the extra space for alignment.
+ */
+#define ISER_RX_SIZE (ISCSI_DEF_MAX_RECV_SEG_LEN + 1024)
+
+/* Minimum I/O size is 512KB */
+#define ISCSI_ISER_MIN_SG_TABLESIZE 128
-#define ISCSI_ISER_SG_TABLESIZE 256
+/* Maximum support is 16MB I/O size */
+#define ISCSI_ISER_MAX_SG_TABLESIZE 4096
enum isert_desc_type {
ISCSI_TX_CONTROL,
@@ -80,21 +86,41 @@ enum iser_conn_state {
};
struct iser_rx_desc {
- struct iser_ctrl iser_header;
- struct iscsi_hdr iscsi_header;
- char data[ISCSI_DEF_MAX_RECV_SEG_LEN];
+ char buf[ISER_RX_SIZE];
u64 dma_addr;
struct ib_sge rx_sg;
struct ib_cqe rx_cqe;
bool in_use;
- char pad[ISER_RX_PAD_SIZE];
-} __packed;
+};
static inline struct iser_rx_desc *cqe_to_rx_desc(struct ib_cqe *cqe)
{
return container_of(cqe, struct iser_rx_desc, rx_cqe);
}
+static void *isert_get_iser_hdr(struct iser_rx_desc *desc)
+{
+ return PTR_ALIGN(desc->buf + ISER_HEADERS_LEN, 512) - ISER_HEADERS_LEN;
+}
+
+static size_t isert_get_hdr_offset(struct iser_rx_desc *desc)
+{
+ return isert_get_iser_hdr(desc) - (void *)desc->buf;
+}
+
+static void *isert_get_iscsi_hdr(struct iser_rx_desc *desc)
+{
+ return isert_get_iser_hdr(desc) + sizeof(struct iser_ctrl);
+}
+
+static void *isert_get_data(struct iser_rx_desc *desc)
+{
+ void *data = isert_get_iser_hdr(desc) + ISER_HEADERS_LEN;
+
+ WARN_ON((uintptr_t)data & 511);
+ return data;
+}
+
struct iser_tx_desc {
struct iser_ctrl iser_header;
struct iscsi_hdr iscsi_header;
@@ -120,7 +146,7 @@ struct isert_cmd {
u64 pdu_buf_dma;
u32 pdu_buf_len;
struct isert_conn *conn;
- struct iscsi_cmd *iscsi_cmd;
+ struct iscsit_cmd *iscsit_cmd;
struct iser_tx_desc tx_desc;
struct iser_rx_desc *rx_desc;
struct rdma_rw_ctx rw;
@@ -141,20 +167,21 @@ struct isert_conn {
u32 responder_resources;
u32 initiator_depth;
bool pi_support;
- struct iser_rx_desc *login_req_buf;
+ struct iser_rx_desc *login_desc;
char *login_rsp_buf;
- u64 login_req_dma;
int login_req_len;
u64 login_rsp_dma;
struct iser_rx_desc *rx_descs;
struct ib_recv_wr rx_wr[ISERT_QP_MAX_RECV_DTOS];
- struct iscsi_conn *conn;
+ struct iscsit_conn *conn;
struct list_head node;
struct completion login_comp;
struct completion login_req_comp;
struct iser_tx_desc login_tx_desc;
struct rdma_cm_id *cm_id;
struct ib_qp *qp;
+ struct ib_cq *cq;
+ u32 cq_size;
struct isert_device *device;
struct mutex mutex;
struct kref kref;
@@ -165,22 +192,6 @@ struct isert_conn {
bool dev_removed;
};
-#define ISERT_MAX_CQ 64
-
-/**
- * struct isert_comp - iSER completion context
- *
- * @device: pointer to device handle
- * @cq: completion queue
- * @active_qps: Number of active QPs attached
- * to completion context
- */
-struct isert_comp {
- struct isert_device *device;
- struct ib_cq *cq;
- int active_qps;
-};
-
struct isert_device {
bool pi_capable;
int refcount;
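
The ib_isert.h hunks above flatten the rx descriptor into one buf[] array and recover the header/data positions with the isert_get_*() helpers, keeping the iSCSI data 512-byte aligned. A self-contained illustration of that pointer arithmetic follows; ALIGN_UP stands in for the kernel's PTR_ALIGN, and the 76-byte ISER_HEADERS_LEN and 8k+1k sizes are assumptions for the example.

/* Illustration only: why ISER_RX_SIZE leaves ~1k of slack for alignment. */
#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, a)   (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))
#define ISER_HEADERS_LEN 76	/* placeholder for iser_ctrl + iscsi_hdr */

int main(void)
{
	char buf[8192 + 1024];	/* mirrors ISER_RX_SIZE: 8k payload + 1k slack */
	uintptr_t start = (uintptr_t)buf;
	/* place the headers so they end exactly on a 512-byte boundary */
	uintptr_t iser_hdr = ALIGN_UP(start + ISER_HEADERS_LEN, 512) - ISER_HEADERS_LEN;
	uintptr_t data = iser_hdr + ISER_HEADERS_LEN;

	/* worst case: 511 bytes of alignment slack + headers + 8k data < 9k */
	printf("header offset %zu, data aligned: %d\n",
	       (size_t)(iser_hdr - start), (data & 511) == 0);
	return 0;
}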
diff --git a/drivers/infiniband/ulp/opa_vnic/Kconfig b/drivers/infiniband/ulp/opa_vnic/Kconfig
index a1f266b9c0b2..4d43d055fa8e 100644
--- a/drivers/infiniband/ulp/opa_vnic/Kconfig
+++ b/drivers/infiniband/ulp/opa_vnic/Kconfig
@@ -1,9 +1,9 @@
# SPDX-License-Identifier: GPL-2.0-only
config INFINIBAND_OPA_VNIC
- tristate "Intel OPA VNIC support"
+ tristate "Cornelis OPX VNIC support"
depends on X86_64 && INFINIBAND
- ---help---
- This is Omni-Path (OPA) Virtual Network Interface Controller (VNIC)
+ help
+ This is Omni-Path Express (OPX) Virtual Network Interface Controller (VNIC)
driver for Ethernet over Omni-Path feature. It implements the HW
independent VNIC functionality. It interfaces with Linux stack for
data path and IB MAD for the control path.
diff --git a/drivers/infiniband/ulp/opa_vnic/Makefile b/drivers/infiniband/ulp/opa_vnic/Makefile
index a8c21d140ccb..196183817cdc 100644
--- a/drivers/infiniband/ulp/opa_vnic/Makefile
+++ b/drivers/infiniband/ulp/opa_vnic/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
-# Makefile - Intel Omni-Path Virtual Network Controller driver
+# Makefile - Cornelis Omni-Path Express Virtual Network Controller driver
# Copyright(c) 2017, Intel Corporation.
+# Copyright(c) 2021, Cornelis Networks.
#
obj-$(CONFIG_INFINIBAND_OPA_VNIC) += opa_vnic.o
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h
index 4480092c68e0..012fc27c5c93 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h
@@ -118,12 +118,17 @@
* struct opa_vesw_info - OPA vnic switch information
* @fabric_id: 10-bit fabric id
* @vesw_id: 12-bit virtual ethernet switch id
+ * @rsvd0: reserved bytes
* @def_port_mask: bitmask of default ports
+ * @rsvd1: reserved bytes
* @pkey: partition key
+ * @rsvd2: reserved bytes
* @u_mcast_dlid: unknown multicast dlid
* @u_ucast_dlid: array of unknown unicast dlids
+ * @rsvd3: reserved bytes
* @rc: routing control
* @eth_mtu: Ethernet MTU
+ * @rsvd4: reserved bytes
*/
struct opa_vesw_info {
__be16 fabric_id;
@@ -150,12 +155,14 @@ struct opa_vesw_info {
* struct opa_per_veswport_info - OPA vnic per port information
* @port_num: port number
* @eth_link_status: current ethernet link state
+ * @rsvd0: reserved bytes
* @base_mac_addr: base mac address
* @config_state: configured port state
* @oper_state: operational port state
* @max_mac_tbl_ent: max number of mac table entries
* @max_smac_ent: max smac entries in mac table
* @mac_tbl_digest: mac table digest
+ * @rsvd1: reserved bytes
* @encap_slid: base slid for the port
* @pcp_to_sc_uc: sc by pcp index for unicast ethernet packets
* @pcp_to_vl_uc: vl by pcp index for unicast ethernet packets
@@ -165,8 +172,10 @@ struct opa_vesw_info {
* @non_vlan_vl_uc: vl for non-vlan unicast ethernet packets
* @non_vlan_sc_mc: sc for non-vlan multicast ethernet packets
* @non_vlan_vl_mc: vl for non-vlan multicast ethernet packets
+ * @rsvd2: reserved bytes
* @uc_macs_gen_count: generation count for unicast macs list
* @mc_macs_gen_count: generation count for multicast macs list
+ * @rsvd3: reserved bytes
*/
struct opa_per_veswport_info {
__be32 port_num;
@@ -239,7 +248,7 @@ struct opa_veswport_mactable_entry {
* @offset: mac table starting offset
* @num_entries: Number of entries to get or set
* @mac_tbl_digest: mac table digest
- * @tbl_entries[]: Array of table entries
+ * @tbl_entries: Array of table entries
*
* The EM sends down this structure in a MAD indicating
* the starting offset in the forwarding table that this
@@ -258,7 +267,7 @@ struct opa_veswport_mactable {
__be16 offset;
__be16 num_entries;
__be32 mac_tbl_digest;
- struct opa_veswport_mactable_entry tbl_entries[0];
+ struct opa_veswport_mactable_entry tbl_entries[];
} __packed;
/**
@@ -294,6 +303,7 @@ struct opa_veswport_mactable {
* @rx_512_1023: received packet length is >=512 and < 1023 bytes
* @rx_1024_1518: received packet length is >=1024 and < 1518 bytes
* @rx_1519_max: received packet length >= 1519 bytes
+ * @reserved: reserved bytes
*
* All the above are counters of corresponding conditions.
*/
@@ -347,16 +357,26 @@ struct opa_veswport_summary_counters {
* @veswport_num: virtual ethernet switch port number
* @tx_errors: transmit errors
* @rx_errors: receive errors
+ * @rsvd0: reserved bytes
* @tx_smac_filt: smac filter errors
+ * @rsvd1: reserved bytes
+ * @rsvd2: reserved bytes
+ * @rsvd3: reserved bytes
* @tx_dlid_zero: transmit packets with invalid dlid
+ * @rsvd4: reserved bytes
* @tx_logic: other transmit errors
+ * @rsvd5: reserved bytes
 * @tx_drop_state: packet transmission in non-forward port state
* @rx_bad_veswid: received packet with invalid vesw id
+ * @rsvd6: reserved bytes
* @rx_runt: received ethernet packet with length < 64 bytes
* @rx_oversize: received ethernet packet with length > MTU size
+ * @rsvd7: reserved bytes
* @rx_eth_down: received packets when interface is down
* @rx_drop_state: received packets in non-forwarding port state
* @rx_logic: other receive errors
+ * @rsvd8: reserved bytes
+ * @rsvd9: reserved bytes
*
* All the above are counters of corresponding error conditions.
*/
@@ -417,7 +437,7 @@ struct opa_veswport_trap {
} __packed;
/**
- * struct opa_vnic_iface_macs_entry - single entry in the mac list
+ * struct opa_vnic_iface_mac_entry - single entry in the mac list
* @mac_addr: MAC address
*/
struct opa_vnic_iface_mac_entry {
@@ -440,13 +460,14 @@ struct opa_veswport_iface_macs {
__be16 num_macs_in_msg;
__be16 tot_macs_in_lst;
__be16 gen_count;
- struct opa_vnic_iface_mac_entry entry[0];
+ struct opa_vnic_iface_mac_entry entry[];
} __packed;
/**
* struct opa_vnic_vema_mad - Generic VEMA MAD
* @mad_hdr: Generic MAD header
* @rmpp_hdr: RMPP header for vendor specific MADs
+ * @reserved: reserved bytes
* @oui: Unique org identifier
* @data: MAD data
*/
@@ -467,6 +488,7 @@ struct opa_vnic_vema_mad {
* @trap_num: Trap number
* @toggle_count: Notice toggle bit and count value
* @issuer_lid: Trap issuer's lid
+ * @reserved: reserved bytes
* @issuer_gid: Issuer GID (only if Report method)
* @raw_data: Trap message body
*/
@@ -487,6 +509,7 @@ struct opa_vnic_notice_attr {
* struct opa_vnic_vema_mad_trap - Generic VEMA MAD Trap
* @mad_hdr: Generic MAD header
* @rmpp_hdr: RMPP header for vendor specific MADs
+ * @reserved: reserved bytes
* @oui: Unique org identifier
* @notice: Notice structure
*/
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c
index 8ad7da989a0e..29b3d8fce3f5 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c
@@ -124,10 +124,8 @@ static struct vnic_stats vnic_gstrings_stats[] = {
static void vnic_get_drvinfo(struct net_device *netdev,
struct ethtool_drvinfo *drvinfo)
{
- strlcpy(drvinfo->driver, opa_vnic_driver_name, sizeof(drvinfo->driver));
- strlcpy(drvinfo->version, opa_vnic_driver_version,
- sizeof(drvinfo->version));
- strlcpy(drvinfo->bus_info, dev_name(netdev->dev.parent),
+ strscpy(drvinfo->driver, opa_vnic_driver_name, sizeof(drvinfo->driver));
+ strscpy(drvinfo->bus_info, dev_name(netdev->dev.parent),
sizeof(drvinfo->bus_info));
}
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h b/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h
index 6dbc08e1a6a6..dd942dd642bd 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h
@@ -292,7 +292,6 @@ struct opa_vnic_mac_tbl_node {
hlist_for_each_entry(obj, &name[bkt], member)
extern char opa_vnic_driver_name[];
-extern const char opa_vnic_driver_version[];
struct opa_vnic_adapter *opa_vnic_add_netdev(struct ib_device *ibdev,
u8 port_num, u8 vport_num);
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
index aeff68f582d3..071f35711468 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
@@ -50,7 +50,6 @@
* netdev functionality.
*/
-#include <linux/module.h>
#include <linux/if_vlan.h>
#include <linux/crc32.h>
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c
index be5befd92d16..21c6cea8b1db 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c
@@ -1,5 +1,6 @@
/*
* Copyright(c) 2017 Intel Corporation.
+ * Copyright(c) 2021 Cornelis Networks.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -46,7 +47,7 @@
*/
/*
- * This file contains OPA Virtual Network Interface Controller (VNIC)
+ * This file contains OPX Virtual Network Interface Controller (VNIC)
* Ethernet Management Agent (EMA) driver
*/
@@ -59,9 +60,7 @@
#include "opa_vnic_internal.h"
-#define DRV_VERSION "1.0"
char opa_vnic_driver_name[] = "opa_vnic";
-const char opa_vnic_driver_version[] = DRV_VERSION;
/*
* The trap service level is kept in bits 3 to 7 in the trap_sl_rsvd
@@ -115,7 +114,7 @@ struct opa_vnic_vema_port {
struct mutex lock;
};
-static void opa_vnic_vema_add_one(struct ib_device *device);
+static int opa_vnic_vema_add_one(struct ib_device *device);
static void opa_vnic_vema_rem_one(struct ib_device *device,
void *client_data);
@@ -235,7 +234,7 @@ static void vema_get_class_port_info(struct opa_vnic_vema_port *port,
port_info = (struct opa_class_port_info *)rsp_mad->data;
memcpy(port_info, &port->class_port_info, sizeof(*port_info));
- port_info->base_version = OPA_MGMT_BASE_VERSION,
+ port_info->base_version = OPA_MGMT_BASE_VERSION;
port_info->class_version = OPA_EMA_CLASS_VERSION;
/*
@@ -549,7 +548,6 @@ static void vema_get(struct opa_vnic_vema_port *port,
vema_get_mac_entries(port, recvd_mad, rsp_mad);
break;
case OPA_EM_ATTR_IFACE_UCAST_MACS:
- /* fall through */
case OPA_EM_ATTR_IFACE_MCAST_MACS:
vema_get_mac_list(port, recvd_mad, rsp_mad, attr_id);
break;
@@ -991,18 +989,18 @@ static void opa_vnic_ctrl_config_dev(struct opa_vnic_ctrl_port *cport, bool en)
*
* Allocate the vnic control port and initialize it.
*/
-static void opa_vnic_vema_add_one(struct ib_device *device)
+static int opa_vnic_vema_add_one(struct ib_device *device)
{
struct opa_vnic_ctrl_port *cport;
int rc, size = sizeof(*cport);
if (!rdma_cap_opa_vnic(device))
- return;
+ return -EOPNOTSUPP;
size += device->phys_port_cnt * sizeof(struct opa_vnic_vema_port);
cport = kzalloc(size, GFP_KERNEL);
if (!cport)
- return;
+ return -ENOMEM;
cport->num_ports = device->phys_port_cnt;
cport->ibdev = device;
@@ -1014,6 +1012,7 @@ static void opa_vnic_vema_add_one(struct ib_device *device)
ib_set_client_data(device, &opa_vnic_client, cport);
opa_vnic_ctrl_config_dev(cport, true);
+ return 0;
}
/**
@@ -1028,9 +1027,6 @@ static void opa_vnic_vema_rem_one(struct ib_device *device,
{
struct opa_vnic_ctrl_port *cport = client_data;
- if (!cport)
- return;
-
c_info("removing VNIC client\n");
opa_vnic_ctrl_config_dev(cport, false);
vema_unregister(cport);
@@ -1041,9 +1037,6 @@ static int __init opa_vnic_init(void)
{
int rc;
- pr_info("OPA Virtual Network Driver - v%s\n",
- opa_vnic_driver_version);
-
rc = ib_register_client(&opa_vnic_client);
if (rc)
pr_err("VNIC driver register failed %d\n", rc);
@@ -1059,5 +1052,5 @@ static void opa_vnic_deinit(void)
module_exit(opa_vnic_deinit);
MODULE_LICENSE("Dual BSD/GPL");
-MODULE_AUTHOR("Intel Corporation");
-MODULE_DESCRIPTION("Intel OPA Virtual Network driver");
+MODULE_AUTHOR("Cornelis Networks");
+MODULE_DESCRIPTION("Cornelis OPX Virtual Network driver");
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c
index 868b5aec1537..292c037aa239 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c
@@ -74,7 +74,7 @@ void opa_vnic_vema_report_event(struct opa_vnic_adapter *adapter, u8 event)
}
/**
- * opa_vnic_get_error_counters - get summary counters
+ * opa_vnic_get_summary_counters - get summary counters
* @adapter: vnic port adapter
* @cntrs: pointer to destination summary counters structure
*
diff --git a/drivers/infiniband/ulp/rtrs/Kconfig b/drivers/infiniband/ulp/rtrs/Kconfig
new file mode 100644
index 000000000000..9092b62e6dc8
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/Kconfig
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+config INFINIBAND_RTRS
+ tristate
+ depends on INFINIBAND_ADDR_TRANS
+
+config INFINIBAND_RTRS_CLIENT
+ tristate "RTRS client module"
+ depends on INFINIBAND_ADDR_TRANS
+ select INFINIBAND_RTRS
+ help
+ RDMA transport client module.
+
+	  The RDMA Transport (RTRS) client implements a reliable transport
+	  layer with multipathing functionality; it is intended to be the
+	  base layer for a block storage initiator over RDMA.
+
+config INFINIBAND_RTRS_SERVER
+ tristate "RTRS server module"
+ depends on INFINIBAND_ADDR_TRANS
+ select INFINIBAND_RTRS
+ help
+ RDMA transport server module.
+
+	  The RDMA Transport (RTRS) server module processes connection and IO
+	  requests received from the RTRS client module and passes the IO
+	  requests on to its user, e.g. RNBD_server.
diff --git a/drivers/infiniband/ulp/rtrs/Makefile b/drivers/infiniband/ulp/rtrs/Makefile
new file mode 100644
index 000000000000..5227e7788e1f
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/Makefile
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+CFLAGS_rtrs-clt-trace.o = -I$(src)
+
+rtrs-client-y := rtrs-clt.o \
+ rtrs-clt-stats.o \
+ rtrs-clt-sysfs.o \
+ rtrs-clt-trace.o
+
+CFLAGS_rtrs-srv-trace.o = -I$(src)
+
+rtrs-server-y := rtrs-srv.o \
+ rtrs-srv-stats.o \
+ rtrs-srv-sysfs.o \
+ rtrs-srv-trace.o
+
+rtrs-core-y := rtrs.o
+
+obj-$(CONFIG_INFINIBAND_RTRS) += rtrs-core.o
+obj-$(CONFIG_INFINIBAND_RTRS_CLIENT) += rtrs-client.o
+obj-$(CONFIG_INFINIBAND_RTRS_SERVER) += rtrs-server.o
diff --git a/drivers/infiniband/ulp/rtrs/README b/drivers/infiniband/ulp/rtrs/README
new file mode 100644
index 000000000000..5d9ea142e5dd
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/README
@@ -0,0 +1,213 @@
+****************************
+RDMA Transport (RTRS)
+****************************
+
+RTRS (RDMA Transport) is a reliable high speed transport library
+which establishes an optimal number of connections between client and
+server machines over RDMA (InfiniBand, RoCE, iWarp) transports. It is
+optimized for transferring (reading/writing) IO blocks.
+
+In its core interface it follows the BIO semantics of providing the
+possibility to either write data from an sg list to the remote side
+or to request ("read") data transfer from the remote side into a given
+sg list.
+
+RTRS provides I/O fail-over and load-balancing capabilities by using
+multipath I/O (see "add_path" and "mp_policy" configuration entries in
+Documentation/ABI/testing/sysfs-class-rtrs-client).
+
+RTRS is used by the RNBD (RDMA Network Block Device) modules.
+
+==================
+Transport protocol
+==================
+
+Overview
+--------
+An established connection between a client and a server is called an rtrs
+session. A session is associated with a set of memory chunks reserved on the
+server side for a given client for rdma transfers. A session consists of
+multiple paths, each representing a separate physical link between client
+and server; these are used for load balancing and failover. Each path
+consists of as many connections (QPs) as there are cpus on the client.
+
+When processing an incoming write or read request, the rtrs client uses memory
+chunks reserved for it on the server side. Their number, size and addresses
+need to be exchanged between client and server during the connection
+establishment phase. Apart from the memory-related information, the client
+needs to inform the server about the session name and to identify each path
+and connection individually.
+
+On an established session the client sends write or read messages to the
+server. The server uses the immediate field to tell the client which request
+is being acknowledged and to report the errno. The client uses the immediate
+field to tell the server which of the memory chunks has been accessed and at
+which offset the message can be found.
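+
+A minimal sketch (client side) of how such a completion is decoded, using the
+helpers that appear later in rtrs-clt.c (the exact bit layout of the immediate
+value lives in rtrs-pri.h):
+
+	u32 imm_type, imm_payload, msg_id;
+	int err;
+
+	rtrs_from_imm(be32_to_cpu(wc->ex.imm_data), &imm_type, &imm_payload);
+	if (imm_type == RTRS_IO_RSP_IMM || imm_type == RTRS_IO_RSP_W_INV_IMM)
+		/* the payload carries the request id and the server errno */
+		rtrs_from_io_rsp_imm(imm_payload, &msg_id, &err);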
+
+The module parameter always_invalidate addresses the security problem
+discussed at the LPC RDMA MC 2019. When always_invalidate=Y, on the server
+side we invalidate each rdma buffer before we hand it over to the RNBD server
+and then pass it to the block layer. A new rkey is generated and registered
+for the buffer after it returns from the block layer and the RNBD server.
+The new rkey is sent back to the client along with the IO result.
+This procedure is the default behaviour of the driver. The invalidation and
+registration on each IO cause a performance drop of up to 20%. A user of the
+driver may choose to load the modules with this mechanism switched off
+(always_invalidate=N), if they understand and can accept the risk of a
+malicious client being able to corrupt the memory of a server it is connected
+to. This might be a reasonable option in a scenario where all the clients and
+all the servers are located within a secure datacenter.
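+
+As a sketch of how such a switch is typically wired up in a kernel module (the
+parameter name is taken from the text above; the default and the description
+string are assumptions, the real definition lives in the rtrs server sources):
+
+	static bool always_invalidate = true;
+	module_param(always_invalidate, bool, 0444);
+	MODULE_PARM_DESC(always_invalidate,
+			 "Invalidate memory registration for contiguous buffers between IOs");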
+
+
+Connection establishment
+------------------------
+
+1. The client starts establishing the connections belonging to a path of a
+session one by one by attaching RTRS_MSG_CON_REQ messages to the rdma_connect
+requests. These include the uuid of the session and the uuid of the path to be
+established. They are used by the server to find a persisting session/path or
+to create a new one when necessary. The message also contains the protocol
+version and magic for compatibility, the total number of connections per
+session (as many as cpus on the client), the id of the current connection and
+the reconnect counter, which is used to resolve situations where the client is
+trying to reconnect a path while the server is still destroying the old one
+(an illustrative layout of this information is sketched after the diagram
+below).
+
+2. The server accepts the connection requests one by one and attaches
+RTRS_MSG_CONN_RSP messages to the rdma_accept. Apart from the magic and the
+protocol version, the messages include an error code, the queue depth
+supported by the server (the number of memory chunks which are going to be
+allocated for that session) and the maximum size of one IO. The
+RTRS_MSG_NEW_RKEY_F flag is set when always_invalidate=Y.
+
+3. After all connections of a path are established, the client sends the
+RTRS_MSG_INFO_REQ message, containing the name of the session, to the server.
+This message requests the address information from the server.
+
+4. The server replies to the session info request message with
+RTRS_MSG_INFO_RSP, which contains the addresses and keys of the RDMA buffers
+allocated for that session.
+
+5. The session becomes connected after all paths to be established are
+connected (i.e. steps 1-4 have finished for all paths requested for a session).
+
+6. Server and client periodically exchange heartbeat messages (empty rdma
+messages with an immediate field) which are used to detect a crash on the
+remote side or a network outage in the absence of IO.
+
+7. On any RDMA related error or in the case of a heartbeat timeout, the
+corresponding path is disconnected, all inflight IOs are failed over to a
+healthy path, if any, and the reconnect mechanism is triggered.
+
+CLT SRV
+*for each connection belonging to a path and for each path:
+RTRS_MSG_CON_REQ ------------------->
+ <------------------- RTRS_MSG_CON_RSP
+...
+*after all connections are established:
+RTRS_MSG_INFO_REQ ------------------->
+ <------------------- RTRS_MSG_INFO_RSP
+*heartbeat is started from both sides:
+ -------------------> [RTRS_HB_MSG_IMM]
+[RTRS_HB_MSG_ACK] <-------------------
+[RTRS_HB_MSG_IMM] <-------------------
+ -------------------> [RTRS_HB_MSG_ACK]
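+
+For illustration only, the information carried by RTRS_MSG_CON_REQ as
+described in step 1 can be pictured roughly like this (the field names below
+are paraphrased from the text above and are not the actual wire format, which
+is defined in rtrs-pri.h):
+
+	struct con_req_info {		/* illustrative only */
+		u16 magic;		/* protocol magic for compatibility */
+		u16 version;		/* protocol version */
+		u16 cid;		/* id of this connection */
+		u16 cid_num;		/* total connections for the session */
+		u16 recon_cnt;		/* reconnect counter */
+		uuid_t sess_uuid;	/* uuid of the session */
+		uuid_t path_uuid;	/* uuid of the path being established */
+	};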
+
+IO path
+-------
+
+* Write (always_invalidate=N) *
+
+1. When processing a write request the client selects one of the memory chunks
+on the server side and rdma writes the user data, the user header and the
+RTRS_MSG_RDMA_WRITE message there. Apart from the type (write), the message
+only contains the size of the user header. The client tells the server which
+chunk has been accessed and at what offset the RTRS_MSG_RDMA_WRITE can be
+found by using the IMM field.
+
+2. When confirming a write request the server sends an "empty" rdma message
+with an immediate field. The 32 bit field is used to specify the outstanding
+inflight IO and the error code.
+
+CLT SRV
+usr_data + usr_hdr + rtrs_msg_rdma_write -----------------> [RTRS_IO_REQ_IMM]
+[RTRS_IO_RSP_IMM] <----------------- (id + errno)
+
+* Write (always_invalidate=Y) *
+
+1. When processing a write request the client selects one of the memory chunks
+on the server side and rdma writes the user data, the user header and the
+RTRS_MSG_RDMA_WRITE message there. Apart from the type (write), the message
+only contains the size of the user header. The client tells the server which
+chunk has been accessed and at what offset the RTRS_MSG_RDMA_WRITE can be
+found by using the IMM field. The server first invalidates the rkey associated
+with the memory chunk; when that finishes, it passes the IO to the RNBD server
+module.
+
+2. When confirming a write request the server sends an "empty" rdma message
+with an immediate field. The 32 bit field is used to specify the outstanding
+inflight IO and the error code. The new rkey is sent back using a
+SEND_WITH_IMM WR. When the client receives the new rkey message, it validates
+the message, updates the rkey for the rbuffer and finishes the IO, then posts
+the recv buffer back for later use.
+
+CLT SRV
+usr_data + usr_hdr + rtrs_msg_rdma_write -----------------> [RTRS_IO_REQ_IMM]
+[RTRS_MSG_RKEY_RSP] <----------------- (RTRS_MSG_RKEY_RSP)
+[RTRS_IO_RSP_IMM] <----------------- (id + errno)
+
+
+* Read (always_invalidate=N)*
+
+1. When processing a read request the client selects one of the memory chunks
+on the server side and rdma writes the user header and the RTRS_MSG_RDMA_READ
+message there. This message contains the type (read), the size of the user
+header, flags (specifying whether memory invalidation is necessary) and the
+list of addresses along with keys for the data to be read into.
+
+2. When confirming a read request the server transfers the requested data
+first, attaches an invalidation message if requested and finally an "empty"
+rdma message with an immediate field. The 32 bit field is used to specify the
+outstanding inflight IO and the error code.
+
+CLT SRV
+usr_hdr + rtrs_msg_rdma_read --------------> [RTRS_IO_REQ_IMM]
+[RTRS_IO_RSP_IMM] <-------------- usr_data + (id + errno)
+or in case client requested invalidation:
+[RTRS_IO_RSP_IMM_W_INV] <-------------- usr_data + (INV) + (id + errno)
+
+* Read (always_invalidate=Y)*
+
+1. When processing a read request the client selects one of the memory chunks
+on the server side and rdma writes the user header and the RTRS_MSG_RDMA_READ
+message there. This message contains the type (read), the size of the user
+header, flags (specifying whether memory invalidation is necessary) and the
+list of addresses along with keys for the data to be read into.
+The server first invalidates the rkey associated with the memory chunk; when
+that finishes, it passes the IO to the RNBD server module.
+
+2. When confirming a read request the server transfers the requested data
+first, attaches an invalidation message if requested and finally an "empty"
+rdma message with an immediate field. The 32 bit field is used to specify the
+outstanding inflight IO and the error code. The new rkey is sent back using a
+SEND_WITH_IMM WR. When the client receives the new rkey message, it validates
+the message, updates the rkey for the rbuffer and finishes the IO, then posts
+the recv buffer back for later use.
+
+CLT SRV
+usr_hdr + rtrs_msg_rdma_read --------------> [RTRS_IO_REQ_IMM]
+[RTRS_IO_RSP_IMM] <-------------- usr_data + (id + errno)
+[RTRS_MSG_RKEY_RSP] <----------------- (RTRS_MSG_RKEY_RSP)
+or in case client requested invalidation:
+[RTRS_IO_RSP_IMM_W_INV] <-------------- usr_data + (INV) + (id + errno)
+=========================================
+Contributors List (in alphabetical order)
+=========================================
+Danil Kipnis <danil.kipnis@profitbricks.com>
+Fabian Holler <mail@fholler.de>
+Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
+Jack Wang <jinpu.wang@profitbricks.com>
+Kleber Souza <kleber.souza@profitbricks.com>
+Lutz Pogrell <lutz.pogrell@cloud.ionos.com>
+Milind Dumbare <Milind.dumbare@gmail.com>
+Roman Penyaev <roman.penyaev@profitbricks.com>
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c b/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c
new file mode 100644
index 000000000000..1e6ffafa2db3
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RDMA Transport Layer
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include "rtrs-clt.h"
+
+void rtrs_clt_update_wc_stats(struct rtrs_clt_con *con)
+{
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+ struct rtrs_clt_stats *stats = clt_path->stats;
+ struct rtrs_clt_stats_pcpu *s;
+ int cpu;
+
+ cpu = raw_smp_processor_id();
+ s = get_cpu_ptr(stats->pcpu_stats);
+ if (con->cpu != cpu) {
+ s->cpu_migr.to++;
+
+ /* Careful here, override s pointer */
+ s = per_cpu_ptr(stats->pcpu_stats, con->cpu);
+ atomic_inc(&s->cpu_migr.from);
+ }
+ put_cpu_ptr(stats->pcpu_stats);
+}
+
+void rtrs_clt_inc_failover_cnt(struct rtrs_clt_stats *stats)
+{
+ this_cpu_inc(stats->pcpu_stats->rdma.failover_cnt);
+}
+
+int rtrs_clt_stats_migration_from_cnt_to_str(struct rtrs_clt_stats *stats, char *buf)
+{
+ struct rtrs_clt_stats_pcpu *s;
+
+ size_t used;
+ int cpu;
+
+ used = 0;
+ for_each_possible_cpu(cpu) {
+ s = per_cpu_ptr(stats->pcpu_stats, cpu);
+ used += sysfs_emit_at(buf, used, "%d ",
+ atomic_read(&s->cpu_migr.from));
+ }
+
+ used += sysfs_emit_at(buf, used, "\n");
+
+ return used;
+}
+
+int rtrs_clt_stats_migration_to_cnt_to_str(struct rtrs_clt_stats *stats, char *buf)
+{
+ struct rtrs_clt_stats_pcpu *s;
+
+ size_t used;
+ int cpu;
+
+ used = 0;
+ for_each_possible_cpu(cpu) {
+ s = per_cpu_ptr(stats->pcpu_stats, cpu);
+ used += sysfs_emit_at(buf, used, "%d ", s->cpu_migr.to);
+ }
+
+ used += sysfs_emit_at(buf, used, "\n");
+
+ return used;
+}
+
+int rtrs_clt_stats_reconnects_to_str(struct rtrs_clt_stats *stats, char *buf)
+{
+ return sysfs_emit(buf, "%d %d\n", stats->reconnects.successful_cnt,
+ stats->reconnects.fail_cnt);
+}
+
+ssize_t rtrs_clt_stats_rdma_to_str(struct rtrs_clt_stats *stats, char *page)
+{
+ struct rtrs_clt_stats_rdma sum;
+ struct rtrs_clt_stats_rdma *r;
+ int cpu;
+
+ memset(&sum, 0, sizeof(sum));
+
+ for_each_possible_cpu(cpu) {
+ r = &per_cpu_ptr(stats->pcpu_stats, cpu)->rdma;
+
+ sum.dir[READ].cnt += r->dir[READ].cnt;
+ sum.dir[READ].size_total += r->dir[READ].size_total;
+ sum.dir[WRITE].cnt += r->dir[WRITE].cnt;
+ sum.dir[WRITE].size_total += r->dir[WRITE].size_total;
+ sum.failover_cnt += r->failover_cnt;
+ }
+
+ return sysfs_emit(page, "%llu %llu %llu %llu %u %llu\n",
+ sum.dir[READ].cnt, sum.dir[READ].size_total,
+ sum.dir[WRITE].cnt, sum.dir[WRITE].size_total,
+ atomic_read(&stats->inflight), sum.failover_cnt);
+}
+
+ssize_t rtrs_clt_reset_all_help(struct rtrs_clt_stats *s, char *page)
+{
+ return sysfs_emit(page, "echo 1 to reset all statistics\n");
+}
+
+int rtrs_clt_reset_rdma_stats(struct rtrs_clt_stats *stats, bool enable)
+{
+ struct rtrs_clt_stats_pcpu *s;
+ int cpu;
+
+ if (!enable)
+ return -EINVAL;
+
+ for_each_possible_cpu(cpu) {
+ s = per_cpu_ptr(stats->pcpu_stats, cpu);
+ memset(&s->rdma, 0, sizeof(s->rdma));
+ }
+
+ return 0;
+}
+
+int rtrs_clt_reset_cpu_migr_stats(struct rtrs_clt_stats *stats, bool enable)
+{
+ struct rtrs_clt_stats_pcpu *s;
+ int cpu;
+
+ if (!enable)
+ return -EINVAL;
+
+ for_each_possible_cpu(cpu) {
+ s = per_cpu_ptr(stats->pcpu_stats, cpu);
+ memset(&s->cpu_migr, 0, sizeof(s->cpu_migr));
+ }
+
+ return 0;
+}
+
+int rtrs_clt_reset_reconnects_stat(struct rtrs_clt_stats *stats, bool enable)
+{
+ if (!enable)
+ return -EINVAL;
+
+ memset(&stats->reconnects, 0, sizeof(stats->reconnects));
+
+ return 0;
+}
+
+int rtrs_clt_reset_all_stats(struct rtrs_clt_stats *s, bool enable)
+{
+ if (enable) {
+ rtrs_clt_reset_rdma_stats(s, enable);
+ rtrs_clt_reset_cpu_migr_stats(s, enable);
+ rtrs_clt_reset_reconnects_stat(s, enable);
+ atomic_set(&s->inflight, 0);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static inline void rtrs_clt_update_rdma_stats(struct rtrs_clt_stats *stats,
+ size_t size, int d)
+{
+ this_cpu_inc(stats->pcpu_stats->rdma.dir[d].cnt);
+ this_cpu_add(stats->pcpu_stats->rdma.dir[d].size_total, size);
+}
+
+void rtrs_clt_update_all_stats(struct rtrs_clt_io_req *req, int dir)
+{
+ struct rtrs_clt_con *con = req->con;
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+ struct rtrs_clt_stats *stats = clt_path->stats;
+ unsigned int len;
+
+ len = req->usr_len + req->data_len;
+ rtrs_clt_update_rdma_stats(stats, len, dir);
+ if (req->mp_policy == MP_POLICY_MIN_INFLIGHT)
+ atomic_inc(&stats->inflight);
+}
+
+int rtrs_clt_init_stats(struct rtrs_clt_stats *stats)
+{
+ stats->pcpu_stats = alloc_percpu(typeof(*stats->pcpu_stats));
+ if (!stats->pcpu_stats)
+ return -ENOMEM;
+
+ /*
+ * successful_cnt will be set to 0 after session
+ * is established for the first time
+ */
+ stats->reconnects.successful_cnt = -1;
+
+ return 0;
+}
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c
new file mode 100644
index 000000000000..d3c436ead694
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c
@@ -0,0 +1,514 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RDMA Transport Layer
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include "rtrs-pri.h"
+#include "rtrs-clt.h"
+#include "rtrs-log.h"
+
+#define MIN_MAX_RECONN_ATT -1
+#define MAX_MAX_RECONN_ATT 9999
+
+static void rtrs_clt_path_release(struct kobject *kobj)
+{
+ struct rtrs_clt_path *clt_path;
+
+ clt_path = container_of(kobj, struct rtrs_clt_path, kobj);
+
+ free_path(clt_path);
+}
+
+static struct kobj_type ktype_sess = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = rtrs_clt_path_release
+};
+
+static void rtrs_clt_path_stats_release(struct kobject *kobj)
+{
+ struct rtrs_clt_stats *stats;
+
+ stats = container_of(kobj, struct rtrs_clt_stats, kobj_stats);
+
+ free_percpu(stats->pcpu_stats);
+
+ kfree(stats);
+}
+
+static struct kobj_type ktype_stats = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = rtrs_clt_path_stats_release,
+};
+
+static ssize_t max_reconnect_attempts_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct rtrs_clt_sess *clt = container_of(dev, struct rtrs_clt_sess,
+ dev);
+
+ return sysfs_emit(page, "%d\n",
+ rtrs_clt_get_max_reconnect_attempts(clt));
+}
+
+static ssize_t max_reconnect_attempts_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ int value;
+ int ret;
+ struct rtrs_clt_sess *clt = container_of(dev, struct rtrs_clt_sess,
+ dev);
+
+ ret = kstrtoint(buf, 10, &value);
+ if (ret) {
+ rtrs_err(clt, "%s: failed to convert string '%s' to int\n",
+ attr->attr.name, buf);
+ return ret;
+ }
+ if (value > MAX_MAX_RECONN_ATT ||
+ value < MIN_MAX_RECONN_ATT) {
+ rtrs_err(clt,
+ "%s: invalid range (provided: '%s', accepted: min: %d, max: %d)\n",
+ attr->attr.name, buf, MIN_MAX_RECONN_ATT,
+ MAX_MAX_RECONN_ATT);
+ return -EINVAL;
+ }
+ rtrs_clt_set_max_reconnect_attempts(clt, value);
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(max_reconnect_attempts);
+
+static ssize_t mpath_policy_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct rtrs_clt_sess *clt;
+
+ clt = container_of(dev, struct rtrs_clt_sess, dev);
+
+ switch (clt->mp_policy) {
+ case MP_POLICY_RR:
+ return sysfs_emit(page, "round-robin (RR: %d)\n",
+ clt->mp_policy);
+ case MP_POLICY_MIN_INFLIGHT:
+ return sysfs_emit(page, "min-inflight (MI: %d)\n",
+ clt->mp_policy);
+ case MP_POLICY_MIN_LATENCY:
+ return sysfs_emit(page, "min-latency (ML: %d)\n",
+ clt->mp_policy);
+ default:
+ return sysfs_emit(page, "Unknown (%d)\n", clt->mp_policy);
+ }
+}
+
+static ssize_t mpath_policy_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rtrs_clt_sess *clt;
+ int value;
+ int ret;
+ size_t len = 0;
+
+ clt = container_of(dev, struct rtrs_clt_sess, dev);
+
+ ret = kstrtoint(buf, 10, &value);
+ if (!ret && (value == MP_POLICY_RR ||
+ value == MP_POLICY_MIN_INFLIGHT ||
+ value == MP_POLICY_MIN_LATENCY)) {
+ clt->mp_policy = value;
+ return count;
+ }
+
+ /* distinguish "mi" and "min-latency" with length */
+ len = strnlen(buf, NAME_MAX);
+ if (buf[len - 1] == '\n')
+ len--;
+
+ if (!strncasecmp(buf, "round-robin", 11) ||
+ (len == 2 && !strncasecmp(buf, "rr", 2)))
+ clt->mp_policy = MP_POLICY_RR;
+ else if (!strncasecmp(buf, "min-inflight", 12) ||
+ (len == 2 && !strncasecmp(buf, "mi", 2)))
+ clt->mp_policy = MP_POLICY_MIN_INFLIGHT;
+ else if (!strncasecmp(buf, "min-latency", 11) ||
+ (len == 2 && !strncasecmp(buf, "ml", 2)))
+ clt->mp_policy = MP_POLICY_MIN_LATENCY;
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(mpath_policy);
+
+static ssize_t add_path_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ return sysfs_emit(page,
+ "Usage: echo [<source addr>@]<destination addr> > %s\n\n*addr ::= [ ip:<ipv4|ipv6> | gid:<gid> ]\n",
+ attr->attr.name);
+}
+
+static ssize_t add_path_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct sockaddr_storage srcaddr, dstaddr;
+ struct rtrs_addr addr = {
+ .src = &srcaddr,
+ .dst = &dstaddr
+ };
+ struct rtrs_clt_sess *clt;
+ const char *nl;
+ size_t len;
+ int err;
+
+ clt = container_of(dev, struct rtrs_clt_sess, dev);
+
+ nl = strchr(buf, '\n');
+ if (nl)
+ len = nl - buf;
+ else
+ len = count;
+ err = rtrs_addr_to_sockaddr(buf, len, clt->port, &addr);
+ if (err)
+ return -EINVAL;
+
+ err = rtrs_clt_create_path_from_sysfs(clt, &addr);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(add_path);
+
+static ssize_t rtrs_clt_state_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ struct rtrs_clt_path *clt_path;
+
+ clt_path = container_of(kobj, struct rtrs_clt_path, kobj);
+ if (clt_path->state == RTRS_CLT_CONNECTED)
+ return sysfs_emit(page, "connected\n");
+
+ return sysfs_emit(page, "disconnected\n");
+}
+
+static struct kobj_attribute rtrs_clt_state_attr =
+ __ATTR(state, 0444, rtrs_clt_state_show, NULL);
+
+static ssize_t rtrs_clt_reconnect_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "Usage: echo 1 > %s\n", attr->attr.name);
+}
+
+static ssize_t rtrs_clt_reconnect_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct rtrs_clt_path *clt_path;
+ int ret;
+
+ clt_path = container_of(kobj, struct rtrs_clt_path, kobj);
+ if (!sysfs_streq(buf, "1")) {
+ rtrs_err(clt_path->clt, "%s: unknown value: '%s'\n",
+ attr->attr.name, buf);
+ return -EINVAL;
+ }
+ ret = rtrs_clt_reconnect_from_sysfs(clt_path);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static struct kobj_attribute rtrs_clt_reconnect_attr =
+ __ATTR(reconnect, 0644, rtrs_clt_reconnect_show,
+ rtrs_clt_reconnect_store);
+
+static ssize_t rtrs_clt_disconnect_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "Usage: echo 1 > %s\n", attr->attr.name);
+}
+
+static ssize_t rtrs_clt_disconnect_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct rtrs_clt_path *clt_path;
+
+ clt_path = container_of(kobj, struct rtrs_clt_path, kobj);
+ if (!sysfs_streq(buf, "1")) {
+ rtrs_err(clt_path->clt, "%s: unknown value: '%s'\n",
+ attr->attr.name, buf);
+ return -EINVAL;
+ }
+ rtrs_clt_close_conns(clt_path, true);
+
+ return count;
+}
+
+static struct kobj_attribute rtrs_clt_disconnect_attr =
+ __ATTR(disconnect, 0644, rtrs_clt_disconnect_show,
+ rtrs_clt_disconnect_store);
+
+static ssize_t rtrs_clt_remove_path_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "Usage: echo 1 > %s\n", attr->attr.name);
+}
+
+static ssize_t rtrs_clt_remove_path_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct rtrs_clt_path *clt_path;
+ int ret;
+
+ clt_path = container_of(kobj, struct rtrs_clt_path, kobj);
+ if (!sysfs_streq(buf, "1")) {
+ rtrs_err(clt_path->clt, "%s: unknown value: '%s'\n",
+ attr->attr.name, buf);
+ return -EINVAL;
+ }
+ ret = rtrs_clt_remove_path_from_sysfs(clt_path, &attr->attr);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static struct kobj_attribute rtrs_clt_remove_path_attr =
+ __ATTR(remove_path, 0644, rtrs_clt_remove_path_show,
+ rtrs_clt_remove_path_store);
+
+STAT_ATTR(struct rtrs_clt_stats, cpu_migration_from,
+ rtrs_clt_stats_migration_from_cnt_to_str,
+ rtrs_clt_reset_cpu_migr_stats);
+
+STAT_ATTR(struct rtrs_clt_stats, cpu_migration_to,
+ rtrs_clt_stats_migration_to_cnt_to_str,
+ rtrs_clt_reset_cpu_migr_stats);
+
+STAT_ATTR(struct rtrs_clt_stats, reconnects,
+ rtrs_clt_stats_reconnects_to_str,
+ rtrs_clt_reset_reconnects_stat);
+
+STAT_ATTR(struct rtrs_clt_stats, rdma,
+ rtrs_clt_stats_rdma_to_str,
+ rtrs_clt_reset_rdma_stats);
+
+STAT_ATTR(struct rtrs_clt_stats, reset_all,
+ rtrs_clt_reset_all_help,
+ rtrs_clt_reset_all_stats);
+
+static struct attribute *rtrs_clt_stats_attrs[] = {
+ &cpu_migration_from_attr.attr,
+ &cpu_migration_to_attr.attr,
+ &reconnects_attr.attr,
+ &rdma_attr.attr,
+ &reset_all_attr.attr,
+ NULL
+};
+
+static const struct attribute_group rtrs_clt_stats_attr_group = {
+ .attrs = rtrs_clt_stats_attrs,
+};
+
+static ssize_t rtrs_clt_hca_port_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *page)
+{
+ struct rtrs_clt_path *clt_path;
+
+ clt_path = container_of(kobj, typeof(*clt_path), kobj);
+
+ return sysfs_emit(page, "%u\n", clt_path->hca_port);
+}
+
+static struct kobj_attribute rtrs_clt_hca_port_attr =
+ __ATTR(hca_port, 0444, rtrs_clt_hca_port_show, NULL);
+
+static ssize_t rtrs_clt_hca_name_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *page)
+{
+ struct rtrs_clt_path *clt_path;
+
+ clt_path = container_of(kobj, struct rtrs_clt_path, kobj);
+
+ return sysfs_emit(page, "%s\n", clt_path->hca_name);
+}
+
+static struct kobj_attribute rtrs_clt_hca_name_attr =
+ __ATTR(hca_name, 0444, rtrs_clt_hca_name_show, NULL);
+
+static ssize_t rtrs_clt_cur_latency_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *page)
+{
+ struct rtrs_clt_path *clt_path;
+
+ clt_path = container_of(kobj, struct rtrs_clt_path, kobj);
+
+ return sysfs_emit(page, "%lld ns\n",
+ ktime_to_ns(clt_path->s.hb_cur_latency));
+}
+
+static struct kobj_attribute rtrs_clt_cur_latency_attr =
+ __ATTR(cur_latency, 0444, rtrs_clt_cur_latency_show, NULL);
+
+static ssize_t rtrs_clt_src_addr_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *page)
+{
+ struct rtrs_clt_path *clt_path;
+ int len;
+
+ clt_path = container_of(kobj, struct rtrs_clt_path, kobj);
+ len = sockaddr_to_str((struct sockaddr *)&clt_path->s.src_addr, page,
+ PAGE_SIZE);
+ len += sysfs_emit_at(page, len, "\n");
+ return len;
+}
+
+static struct kobj_attribute rtrs_clt_src_addr_attr =
+ __ATTR(src_addr, 0444, rtrs_clt_src_addr_show, NULL);
+
+static ssize_t rtrs_clt_dst_addr_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *page)
+{
+ struct rtrs_clt_path *clt_path;
+ int len;
+
+ clt_path = container_of(kobj, struct rtrs_clt_path, kobj);
+ len = sockaddr_to_str((struct sockaddr *)&clt_path->s.dst_addr, page,
+ PAGE_SIZE);
+ len += sysfs_emit_at(page, len, "\n");
+ return len;
+}
+
+static struct kobj_attribute rtrs_clt_dst_addr_attr =
+ __ATTR(dst_addr, 0444, rtrs_clt_dst_addr_show, NULL);
+
+static struct attribute *rtrs_clt_path_attrs[] = {
+ &rtrs_clt_hca_name_attr.attr,
+ &rtrs_clt_hca_port_attr.attr,
+ &rtrs_clt_src_addr_attr.attr,
+ &rtrs_clt_dst_addr_attr.attr,
+ &rtrs_clt_state_attr.attr,
+ &rtrs_clt_reconnect_attr.attr,
+ &rtrs_clt_disconnect_attr.attr,
+ &rtrs_clt_remove_path_attr.attr,
+ &rtrs_clt_cur_latency_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group rtrs_clt_path_attr_group = {
+ .attrs = rtrs_clt_path_attrs,
+};
+
+int rtrs_clt_create_path_files(struct rtrs_clt_path *clt_path)
+{
+ struct rtrs_clt_sess *clt = clt_path->clt;
+ char str[NAME_MAX];
+ int err;
+ struct rtrs_addr path = {
+ .src = &clt_path->s.src_addr,
+ .dst = &clt_path->s.dst_addr,
+ };
+
+ rtrs_addr_to_str(&path, str, sizeof(str));
+ err = kobject_init_and_add(&clt_path->kobj, &ktype_sess,
+ clt->kobj_paths,
+ "%s", str);
+ if (err) {
+ pr_err("kobject_init_and_add: %d\n", err);
+ kobject_put(&clt_path->kobj);
+ return err;
+ }
+ err = sysfs_create_group(&clt_path->kobj, &rtrs_clt_path_attr_group);
+ if (err) {
+ pr_err("sysfs_create_group(): %d\n", err);
+ goto put_kobj;
+ }
+ err = kobject_init_and_add(&clt_path->stats->kobj_stats, &ktype_stats,
+ &clt_path->kobj, "stats");
+ if (err) {
+ pr_err("kobject_init_and_add: %d\n", err);
+ kobject_put(&clt_path->stats->kobj_stats);
+ goto remove_group;
+ }
+
+ err = sysfs_create_group(&clt_path->stats->kobj_stats,
+ &rtrs_clt_stats_attr_group);
+ if (err) {
+ pr_err("failed to create stats sysfs group, err: %d\n", err);
+ goto put_kobj_stats;
+ }
+
+ return 0;
+
+put_kobj_stats:
+ kobject_del(&clt_path->stats->kobj_stats);
+ kobject_put(&clt_path->stats->kobj_stats);
+remove_group:
+ sysfs_remove_group(&clt_path->kobj, &rtrs_clt_path_attr_group);
+put_kobj:
+ kobject_del(&clt_path->kobj);
+ kobject_put(&clt_path->kobj);
+
+ return err;
+}
+
+void rtrs_clt_destroy_path_files(struct rtrs_clt_path *clt_path,
+ const struct attribute *sysfs_self)
+{
+ kobject_del(&clt_path->stats->kobj_stats);
+ kobject_put(&clt_path->stats->kobj_stats);
+ if (sysfs_self)
+ sysfs_remove_file_self(&clt_path->kobj, sysfs_self);
+ kobject_del(&clt_path->kobj);
+}
+
+static struct attribute *rtrs_clt_attrs[] = {
+ &dev_attr_max_reconnect_attempts.attr,
+ &dev_attr_mpath_policy.attr,
+ &dev_attr_add_path.attr,
+ NULL,
+};
+
+static const struct attribute_group rtrs_clt_attr_group = {
+ .attrs = rtrs_clt_attrs,
+};
+
+int rtrs_clt_create_sysfs_root_files(struct rtrs_clt_sess *clt)
+{
+ return sysfs_create_group(&clt->dev.kobj, &rtrs_clt_attr_group);
+}
+
+void rtrs_clt_destroy_sysfs_root(struct rtrs_clt_sess *clt)
+{
+ sysfs_remove_group(&clt->dev.kobj, &rtrs_clt_attr_group);
+
+ if (clt->kobj_paths) {
+ kobject_del(clt->kobj_paths);
+ kobject_put(clt->kobj_paths);
+ }
+}
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt-trace.c b/drivers/infiniband/ulp/rtrs/rtrs-clt-trace.c
new file mode 100644
index 000000000000..f14fa1f36ce8
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-clt-trace.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2022 1&1 IONOS SE. All rights reserved.
+ */
+#include "rtrs.h"
+#include "rtrs-clt.h"
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "rtrs-clt-trace.h"
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt-trace.h b/drivers/infiniband/ulp/rtrs/rtrs-clt-trace.h
new file mode 100644
index 000000000000..7738e2676855
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-clt-trace.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2022 1&1 IONOS SE. All rights reserved.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rtrs_clt
+
+#if !defined(_TRACE_RTRS_CLT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RTRS_CLT_H
+
+#include <linux/tracepoint.h>
+
+struct rtrs_clt_path;
+struct rtrs_clt_sess;
+
+TRACE_DEFINE_ENUM(RTRS_CLT_CONNECTING);
+TRACE_DEFINE_ENUM(RTRS_CLT_CONNECTING_ERR);
+TRACE_DEFINE_ENUM(RTRS_CLT_RECONNECTING);
+TRACE_DEFINE_ENUM(RTRS_CLT_CONNECTED);
+TRACE_DEFINE_ENUM(RTRS_CLT_CLOSING);
+TRACE_DEFINE_ENUM(RTRS_CLT_CLOSED);
+TRACE_DEFINE_ENUM(RTRS_CLT_DEAD);
+
+#define show_rtrs_clt_state(x) \
+ __print_symbolic(x, \
+ { RTRS_CLT_CONNECTING, "CONNECTING" }, \
+ { RTRS_CLT_CONNECTING_ERR, "CONNECTING_ERR" }, \
+ { RTRS_CLT_RECONNECTING, "RECONNECTING" }, \
+ { RTRS_CLT_CONNECTED, "CONNECTED" }, \
+ { RTRS_CLT_CLOSING, "CLOSING" }, \
+ { RTRS_CLT_CLOSED, "CLOSED" }, \
+ { RTRS_CLT_DEAD, "DEAD" })
+
+DECLARE_EVENT_CLASS(rtrs_clt_conn_class,
+ TP_PROTO(struct rtrs_clt_path *clt_path),
+
+ TP_ARGS(clt_path),
+
+ TP_STRUCT__entry(
+ __field(int, state)
+ __field(int, reconnect_attempts)
+ __field(int, max_reconnect_attempts)
+ __field(int, fail_cnt)
+ __field(int, success_cnt)
+ __array(char, sessname, NAME_MAX)
+ ),
+
+ TP_fast_assign(
+ struct rtrs_clt_sess *clt = clt_path->clt;
+
+ __entry->state = clt_path->state;
+ __entry->reconnect_attempts = clt_path->reconnect_attempts;
+ __entry->max_reconnect_attempts = clt->max_reconnect_attempts;
+ __entry->fail_cnt = clt_path->stats->reconnects.fail_cnt;
+ __entry->success_cnt = clt_path->stats->reconnects.successful_cnt;
+ memcpy(__entry->sessname, kobject_name(&clt_path->kobj), NAME_MAX);
+ ),
+
+ TP_printk("RTRS-CLT: sess='%s' state=%s attempts='%d' max-attempts='%d' fail='%d' success='%d'",
+ __entry->sessname,
+ show_rtrs_clt_state(__entry->state),
+ __entry->reconnect_attempts,
+ __entry->max_reconnect_attempts,
+ __entry->fail_cnt,
+ __entry->success_cnt
+ )
+);
+
+#define DEFINE_CLT_CONN_EVENT(name) \
+DEFINE_EVENT(rtrs_clt_conn_class, rtrs_##name, \
+ TP_PROTO(struct rtrs_clt_path *clt_path), \
+ TP_ARGS(clt_path))
+
+DEFINE_CLT_CONN_EVENT(clt_reconnect_work);
+DEFINE_CLT_CONN_EVENT(clt_close_conns);
+DEFINE_CLT_CONN_EVENT(rdma_error_recovery);
+
+#endif /* _TRACE_RTRS_CLT_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE rtrs-clt-trace
+#include <trace/define_trace.h>
+
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
new file mode 100644
index 000000000000..8546b8816524
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
@@ -0,0 +1,3190 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RDMA Transport Layer
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include <linux/module.h>
+#include <linux/rculist.h>
+#include <linux/random.h>
+
+#include "rtrs-clt.h"
+#include "rtrs-log.h"
+#include "rtrs-clt-trace.h"
+
+#define RTRS_CONNECT_TIMEOUT_MS 30000
+/*
+ * Wait a bit before trying to reconnect after a failure
+ * in order to give server time to finish clean up which
+ * leads to "false positives" failed reconnect attempts
+ */
+#define RTRS_RECONNECT_BACKOFF 1000
+/*
+ * Wait for additional random time between 0 and 8 seconds
+ * before starting to reconnect to avoid clients reconnecting
+ * all at once in case of a major network outage
+ */
+#define RTRS_RECONNECT_SEED 8
+
+#define FIRST_CONN 0x01
+/* limit to 128 * 4k = 512k max IO */
+#define RTRS_MAX_SEGMENTS 128
+
+MODULE_DESCRIPTION("RDMA Transport Client");
+MODULE_LICENSE("GPL");
+
+static const struct rtrs_rdma_dev_pd_ops dev_pd_ops;
+static struct rtrs_rdma_dev_pd dev_pd = {
+ .ops = &dev_pd_ops
+};
+
+static struct workqueue_struct *rtrs_wq;
+static struct class *rtrs_clt_dev_class;
+
+static inline bool rtrs_clt_is_connected(const struct rtrs_clt_sess *clt)
+{
+ struct rtrs_clt_path *clt_path;
+ bool connected = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(clt_path, &clt->paths_list, s.entry)
+ if (READ_ONCE(clt_path->state) == RTRS_CLT_CONNECTED) {
+ connected = true;
+ break;
+ }
+ rcu_read_unlock();
+
+ return connected;
+}
+
+static struct rtrs_permit *
+__rtrs_get_permit(struct rtrs_clt_sess *clt, enum rtrs_clt_con_type con_type)
+{
+ size_t max_depth = clt->queue_depth;
+ struct rtrs_permit *permit;
+ int bit;
+
+ /*
+ * Adapted from null_blk get_tag(). Callers from different cpus may
+ * grab the same bit, since find_first_zero_bit is not atomic.
+ * But then the test_and_set_bit_lock will fail for all the
+ * callers but one, so that they will loop again.
+ * This way an explicit spinlock is not required.
+ */
+ do {
+ bit = find_first_zero_bit(clt->permits_map, max_depth);
+ if (bit >= max_depth)
+ return NULL;
+ } while (test_and_set_bit_lock(bit, clt->permits_map));
+
+ permit = get_permit(clt, bit);
+ WARN_ON(permit->mem_id != bit);
+ permit->cpu_id = raw_smp_processor_id();
+ permit->con_type = con_type;
+
+ return permit;
+}
+
+static inline void __rtrs_put_permit(struct rtrs_clt_sess *clt,
+ struct rtrs_permit *permit)
+{
+ clear_bit_unlock(permit->mem_id, clt->permits_map);
+}
+
+/**
+ * rtrs_clt_get_permit() - allocates permit for future RDMA operation
+ * @clt: Current session
+ * @con_type: Type of connection to use with the permit
+ * @can_wait: Wait type
+ *
+ * Description:
+ * Allocates permit for the following RDMA operation. Permit is used
+ * to preallocate all resources and to propagate memory pressure
+ * up earlier.
+ *
+ * Context:
+ * Can sleep if @wait == RTRS_PERMIT_WAIT
+ */
+struct rtrs_permit *rtrs_clt_get_permit(struct rtrs_clt_sess *clt,
+ enum rtrs_clt_con_type con_type,
+ enum wait_type can_wait)
+{
+ struct rtrs_permit *permit;
+ DEFINE_WAIT(wait);
+
+ permit = __rtrs_get_permit(clt, con_type);
+ if (permit || !can_wait)
+ return permit;
+
+ do {
+ prepare_to_wait(&clt->permits_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ permit = __rtrs_get_permit(clt, con_type);
+ if (permit)
+ break;
+
+ io_schedule();
+ } while (1);
+
+ finish_wait(&clt->permits_wait, &wait);
+
+ return permit;
+}
+EXPORT_SYMBOL(rtrs_clt_get_permit);
+
+/**
+ * rtrs_clt_put_permit() - puts allocated permit
+ * @clt: Current session
+ * @permit: Permit to be freed
+ *
+ * Context:
+ * Does not matter
+ */
+void rtrs_clt_put_permit(struct rtrs_clt_sess *clt,
+ struct rtrs_permit *permit)
+{
+ if (WARN_ON(!test_bit(permit->mem_id, clt->permits_map)))
+ return;
+
+ __rtrs_put_permit(clt, permit);
+
+ /*
+ * rtrs_clt_get_permit() adds itself to the &clt->permits_wait list
+ * before calling schedule(). So if rtrs_clt_get_permit() is sleeping
+ * it must have added itself to &clt->permits_wait before
+ * __rtrs_put_permit() finished.
+ * Hence it is safe to guard wake_up() with a waitqueue_active() test.
+ */
+ if (waitqueue_active(&clt->permits_wait))
+ wake_up(&clt->permits_wait);
+}
+EXPORT_SYMBOL(rtrs_clt_put_permit);
+
+/**
+ * rtrs_permit_to_clt_con() - returns RDMA connection pointer by the permit
+ * @clt_path: client path pointer
+ * @permit: permit for the allocation of the RDMA buffer
+ * Note:
+ * IO connection starts from 1.
+ * 0 connection is for user messages.
+ */
+static
+struct rtrs_clt_con *rtrs_permit_to_clt_con(struct rtrs_clt_path *clt_path,
+ struct rtrs_permit *permit)
+{
+ int id = 0;
+
+ if (permit->con_type == RTRS_IO_CON)
+ id = (permit->cpu_id % (clt_path->s.irq_con_num - 1)) + 1;
+
+ return to_clt_con(clt_path->s.con[id]);
+}
+
+/**
+ * rtrs_clt_change_state() - change the session state through session state
+ * machine.
+ *
+ * @clt_path: client path to change the state of.
+ * @new_state: state to change to.
+ *
+ * returns true if sess's state is changed to new state, otherwise return false.
+ *
+ * Locks:
+ * state_wq lock must be held.
+ */
+static bool rtrs_clt_change_state(struct rtrs_clt_path *clt_path,
+ enum rtrs_clt_state new_state)
+{
+ enum rtrs_clt_state old_state;
+ bool changed = false;
+
+ lockdep_assert_held(&clt_path->state_wq.lock);
+
+ old_state = clt_path->state;
+ switch (new_state) {
+ case RTRS_CLT_CONNECTING:
+ switch (old_state) {
+ case RTRS_CLT_RECONNECTING:
+ changed = true;
+ fallthrough;
+ default:
+ break;
+ }
+ break;
+ case RTRS_CLT_RECONNECTING:
+ switch (old_state) {
+ case RTRS_CLT_CONNECTED:
+ case RTRS_CLT_CONNECTING_ERR:
+ case RTRS_CLT_CLOSED:
+ changed = true;
+ fallthrough;
+ default:
+ break;
+ }
+ break;
+ case RTRS_CLT_CONNECTED:
+ switch (old_state) {
+ case RTRS_CLT_CONNECTING:
+ changed = true;
+ fallthrough;
+ default:
+ break;
+ }
+ break;
+ case RTRS_CLT_CONNECTING_ERR:
+ switch (old_state) {
+ case RTRS_CLT_CONNECTING:
+ changed = true;
+ fallthrough;
+ default:
+ break;
+ }
+ break;
+ case RTRS_CLT_CLOSING:
+ switch (old_state) {
+ case RTRS_CLT_CONNECTING:
+ case RTRS_CLT_CONNECTING_ERR:
+ case RTRS_CLT_RECONNECTING:
+ case RTRS_CLT_CONNECTED:
+ changed = true;
+ fallthrough;
+ default:
+ break;
+ }
+ break;
+ case RTRS_CLT_CLOSED:
+ switch (old_state) {
+ case RTRS_CLT_CLOSING:
+ changed = true;
+ fallthrough;
+ default:
+ break;
+ }
+ break;
+ case RTRS_CLT_DEAD:
+ switch (old_state) {
+ case RTRS_CLT_CLOSED:
+ changed = true;
+ fallthrough;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ if (changed) {
+ clt_path->state = new_state;
+ wake_up_locked(&clt_path->state_wq);
+ }
+
+ return changed;
+}
+
+static bool rtrs_clt_change_state_from_to(struct rtrs_clt_path *clt_path,
+ enum rtrs_clt_state old_state,
+ enum rtrs_clt_state new_state)
+{
+ bool changed = false;
+
+ spin_lock_irq(&clt_path->state_wq.lock);
+ if (clt_path->state == old_state)
+ changed = rtrs_clt_change_state(clt_path, new_state);
+ spin_unlock_irq(&clt_path->state_wq.lock);
+
+ return changed;
+}
+
+static void rtrs_clt_stop_and_destroy_conns(struct rtrs_clt_path *clt_path);
+static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con)
+{
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+
+ trace_rtrs_rdma_error_recovery(clt_path);
+
+ if (rtrs_clt_change_state_from_to(clt_path,
+ RTRS_CLT_CONNECTED,
+ RTRS_CLT_RECONNECTING)) {
+ queue_work(rtrs_wq, &clt_path->err_recovery_work);
+ } else {
+ /*
+ * Error can happen just on establishing new connection,
+ * so notify waiter with error state, waiter is responsible
+ * for cleaning the rest and reconnect if needed.
+ */
+ rtrs_clt_change_state_from_to(clt_path,
+ RTRS_CLT_CONNECTING,
+ RTRS_CLT_CONNECTING_ERR);
+ }
+}
+
+static void rtrs_clt_fast_reg_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
+
+ if (wc->status != IB_WC_SUCCESS) {
+ rtrs_err(con->c.path, "Failed IB_WR_REG_MR: %s\n",
+ ib_wc_status_msg(wc->status));
+ rtrs_rdma_error_recovery(con);
+ }
+}
+
+static struct ib_cqe fast_reg_cqe = {
+ .done = rtrs_clt_fast_reg_done
+};
+
+static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno,
+ bool notify, bool can_wait);
+
+static void rtrs_clt_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rtrs_clt_io_req *req =
+ container_of(wc->wr_cqe, typeof(*req), inv_cqe);
+ struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
+
+ if (wc->status != IB_WC_SUCCESS) {
+ rtrs_err(con->c.path, "Failed IB_WR_LOCAL_INV: %s\n",
+ ib_wc_status_msg(wc->status));
+ rtrs_rdma_error_recovery(con);
+ }
+ req->need_inv = false;
+ if (req->need_inv_comp)
+ complete(&req->inv_comp);
+ else
+ /* Complete request from INV callback */
+ complete_rdma_req(req, req->inv_errno, true, false);
+}
+
+static int rtrs_inv_rkey(struct rtrs_clt_io_req *req)
+{
+ struct rtrs_clt_con *con = req->con;
+ struct ib_send_wr wr = {
+ .opcode = IB_WR_LOCAL_INV,
+ .wr_cqe = &req->inv_cqe,
+ .send_flags = IB_SEND_SIGNALED,
+ .ex.invalidate_rkey = req->mr->rkey,
+ };
+ req->inv_cqe.done = rtrs_clt_inv_rkey_done;
+
+ return ib_post_send(con->c.qp, &wr, NULL);
+}
+
+static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno,
+ bool notify, bool can_wait)
+{
+ struct rtrs_clt_con *con = req->con;
+ struct rtrs_clt_path *clt_path;
+ int err;
+
+ if (WARN_ON(!req->in_use))
+ return;
+ if (WARN_ON(!req->con))
+ return;
+ clt_path = to_clt_path(con->c.path);
+
+ if (req->sg_cnt) {
+ if (req->dir == DMA_FROM_DEVICE && req->need_inv) {
+ /*
+ * We are here to invalidate read requests
+ * ourselves. In normal scenario server should
+ * send INV for all read requests, but
+ * we are here, thus two things could happen:
+ *
+ * 1. this is failover, when errno != 0
+ * and can_wait == 1,
+ *
+ * 2. something totally bad happened and
+ * server forgot to send INV, so we
+ * should do that ourselves.
+ */
+
+ if (can_wait) {
+ req->need_inv_comp = true;
+ } else {
+ /* This should be IO path, so always notify */
+ WARN_ON(!notify);
+ /* Save errno for INV callback */
+ req->inv_errno = errno;
+ }
+
+ refcount_inc(&req->ref);
+ err = rtrs_inv_rkey(req);
+ if (err) {
+ rtrs_err(con->c.path, "Send INV WR key=%#x: %d\n",
+ req->mr->rkey, err);
+ } else if (can_wait) {
+ wait_for_completion(&req->inv_comp);
+ } else {
+ /*
+ * Something went wrong, so request will be
+ * completed from INV callback.
+ */
+ WARN_ON_ONCE(1);
+
+ return;
+ }
+ if (!refcount_dec_and_test(&req->ref))
+ return;
+ }
+ ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist,
+ req->sg_cnt, req->dir);
+ }
+ if (!refcount_dec_and_test(&req->ref))
+ return;
+ if (req->mp_policy == MP_POLICY_MIN_INFLIGHT)
+ atomic_dec(&clt_path->stats->inflight);
+
+ req->in_use = false;
+ req->con = NULL;
+
+ if (errno) {
+ rtrs_err_rl(con->c.path, "IO request failed: error=%d path=%s [%s:%u] notify=%d\n",
+ errno, kobject_name(&clt_path->kobj), clt_path->hca_name,
+ clt_path->hca_port, notify);
+ }
+
+ if (notify)
+ req->conf(req->priv, errno);
+}
+
+static int rtrs_post_send_rdma(struct rtrs_clt_con *con,
+ struct rtrs_clt_io_req *req,
+ struct rtrs_rbuf *rbuf, u32 off,
+ u32 imm, struct ib_send_wr *wr)
+{
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+ enum ib_send_flags flags;
+ struct ib_sge sge;
+
+ if (!req->sg_size) {
+ rtrs_wrn(con->c.path,
+ "Doing RDMA Write failed, no data supplied\n");
+ return -EINVAL;
+ }
+
+ /* user data and user message in the first list element */
+ sge.addr = req->iu->dma_addr;
+ sge.length = req->sg_size;
+ sge.lkey = clt_path->s.dev->ib_pd->local_dma_lkey;
+
+ /*
+ * From time to time we have to post signalled sends,
+ * or send queue will fill up and only QP reset can help.
+ */
+ flags = atomic_inc_return(&con->c.wr_cnt) % clt_path->s.signal_interval ?
+ 0 : IB_SEND_SIGNALED;
+
+ ib_dma_sync_single_for_device(clt_path->s.dev->ib_dev,
+ req->iu->dma_addr,
+ req->sg_size, DMA_TO_DEVICE);
+
+ return rtrs_iu_post_rdma_write_imm(&con->c, req->iu, &sge, 1,
+ rbuf->rkey, rbuf->addr + off,
+ imm, flags, wr, NULL);
+}
+
+static void process_io_rsp(struct rtrs_clt_path *clt_path, u32 msg_id,
+ s16 errno, bool w_inval)
+{
+ struct rtrs_clt_io_req *req;
+
+ if (WARN_ON(msg_id >= clt_path->queue_depth))
+ return;
+
+ req = &clt_path->reqs[msg_id];
+ /* Drop need_inv if server responded with send with invalidation */
+ req->need_inv &= !w_inval;
+ complete_rdma_req(req, errno, true, false);
+}
+
+static void rtrs_clt_recv_done(struct rtrs_clt_con *con, struct ib_wc *wc)
+{
+ struct rtrs_iu *iu;
+ int err;
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+
+ WARN_ON((clt_path->flags & RTRS_MSG_NEW_RKEY_F) == 0);
+ iu = container_of(wc->wr_cqe, struct rtrs_iu,
+ cqe);
+ err = rtrs_iu_post_recv(&con->c, iu);
+ if (err) {
+ rtrs_err(con->c.path, "post iu failed %d\n", err);
+ rtrs_rdma_error_recovery(con);
+ }
+}
+
+static void rtrs_clt_rkey_rsp_done(struct rtrs_clt_con *con, struct ib_wc *wc)
+{
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+ struct rtrs_msg_rkey_rsp *msg;
+ u32 imm_type, imm_payload;
+ bool w_inval = false;
+ struct rtrs_iu *iu;
+ u32 buf_id;
+ int err;
+
+ WARN_ON((clt_path->flags & RTRS_MSG_NEW_RKEY_F) == 0);
+
+ iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe);
+
+ if (wc->byte_len < sizeof(*msg)) {
+ rtrs_err(con->c.path, "rkey response is malformed: size %d\n",
+ wc->byte_len);
+ goto out;
+ }
+ ib_dma_sync_single_for_cpu(clt_path->s.dev->ib_dev, iu->dma_addr,
+ iu->size, DMA_FROM_DEVICE);
+ msg = iu->buf;
+ if (le16_to_cpu(msg->type) != RTRS_MSG_RKEY_RSP) {
+ rtrs_err(clt_path->clt,
+ "rkey response is malformed: type %d\n",
+ le16_to_cpu(msg->type));
+ goto out;
+ }
+ buf_id = le16_to_cpu(msg->buf_id);
+ if (WARN_ON(buf_id >= clt_path->queue_depth))
+ goto out;
+
+ rtrs_from_imm(be32_to_cpu(wc->ex.imm_data), &imm_type, &imm_payload);
+ if (imm_type == RTRS_IO_RSP_IMM ||
+ imm_type == RTRS_IO_RSP_W_INV_IMM) {
+ u32 msg_id;
+
+ w_inval = (imm_type == RTRS_IO_RSP_W_INV_IMM);
+ rtrs_from_io_rsp_imm(imm_payload, &msg_id, &err);
+
+ if (WARN_ON(buf_id != msg_id))
+ goto out;
+ clt_path->rbufs[buf_id].rkey = le32_to_cpu(msg->rkey);
+ process_io_rsp(clt_path, msg_id, err, w_inval);
+ }
+ ib_dma_sync_single_for_device(clt_path->s.dev->ib_dev, iu->dma_addr,
+ iu->size, DMA_FROM_DEVICE);
+ return rtrs_clt_recv_done(con, wc);
+out:
+ rtrs_rdma_error_recovery(con);
+}
+
+static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc);
+
+static struct ib_cqe io_comp_cqe = {
+ .done = rtrs_clt_rdma_done
+};
+
+/*
+ * Post x2 empty WRs: first is for this RDMA with IMM,
+ * second is for RECV with INV, which happened earlier.
+ */
+static int rtrs_post_recv_empty_x2(struct rtrs_con *con, struct ib_cqe *cqe)
+{
+ struct ib_recv_wr wr_arr[2], *wr;
+ int i;
+
+ memset(wr_arr, 0, sizeof(wr_arr));
+ for (i = 0; i < ARRAY_SIZE(wr_arr); i++) {
+ wr = &wr_arr[i];
+ wr->wr_cqe = cqe;
+ if (i)
+ /* Chain backwards */
+ wr->next = &wr_arr[i - 1];
+ }
+
+ return ib_post_recv(con->qp, wr, NULL);
+}
+
+static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+ u32 imm_type, imm_payload;
+ bool w_inval = false;
+ int err;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ if (wc->status != IB_WC_WR_FLUSH_ERR) {
+ rtrs_err(clt_path->clt, "RDMA failed: %s\n",
+ ib_wc_status_msg(wc->status));
+ rtrs_rdma_error_recovery(con);
+ }
+ return;
+ }
+ rtrs_clt_update_wc_stats(con);
+
+ switch (wc->opcode) {
+ case IB_WC_RECV_RDMA_WITH_IMM:
+ /*
+ * post_recv() RDMA write completions of IO reqs (read/write)
+ * and hb
+ */
+ if (WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done))
+ return;
+ rtrs_from_imm(be32_to_cpu(wc->ex.imm_data),
+ &imm_type, &imm_payload);
+ if (imm_type == RTRS_IO_RSP_IMM ||
+ imm_type == RTRS_IO_RSP_W_INV_IMM) {
+ u32 msg_id;
+
+ w_inval = (imm_type == RTRS_IO_RSP_W_INV_IMM);
+ rtrs_from_io_rsp_imm(imm_payload, &msg_id, &err);
+
+ process_io_rsp(clt_path, msg_id, err, w_inval);
+ } else if (imm_type == RTRS_HB_MSG_IMM) {
+ WARN_ON(con->c.cid);
+ rtrs_send_hb_ack(&clt_path->s);
+ if (clt_path->flags & RTRS_MSG_NEW_RKEY_F)
+ return rtrs_clt_recv_done(con, wc);
+ } else if (imm_type == RTRS_HB_ACK_IMM) {
+ WARN_ON(con->c.cid);
+ clt_path->s.hb_missed_cnt = 0;
+ clt_path->s.hb_cur_latency =
+ ktime_sub(ktime_get(), clt_path->s.hb_last_sent);
+ if (clt_path->flags & RTRS_MSG_NEW_RKEY_F)
+ return rtrs_clt_recv_done(con, wc);
+ } else {
+ rtrs_wrn(con->c.path, "Unknown IMM type %u\n",
+ imm_type);
+ }
+ if (w_inval)
+ /*
+ * Post x2 empty WRs: first is for this RDMA with IMM,
+ * second is for RECV with INV, which happened earlier.
+ */
+ err = rtrs_post_recv_empty_x2(&con->c, &io_comp_cqe);
+ else
+ err = rtrs_post_recv_empty(&con->c, &io_comp_cqe);
+ if (err) {
+ rtrs_err(con->c.path, "rtrs_post_recv_empty(): %d\n",
+ err);
+ rtrs_rdma_error_recovery(con);
+ }
+ break;
+ case IB_WC_RECV:
+ /*
+ * Key invalidations from server side
+ */
+ WARN_ON(!(wc->wc_flags & IB_WC_WITH_INVALIDATE ||
+ wc->wc_flags & IB_WC_WITH_IMM));
+ WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done);
+ if (clt_path->flags & RTRS_MSG_NEW_RKEY_F) {
+ if (wc->wc_flags & IB_WC_WITH_INVALIDATE)
+ return rtrs_clt_recv_done(con, wc);
+
+ return rtrs_clt_rkey_rsp_done(con, wc);
+ }
+ break;
+ case IB_WC_RDMA_WRITE:
+ /*
+ * post_send() RDMA write completions of IO reqs (read/write)
+ * and hb.
+ */
+ break;
+
+ default:
+ rtrs_wrn(clt_path->clt, "Unexpected WC type: %d\n", wc->opcode);
+ return;
+ }
+}
+
+static int post_recv_io(struct rtrs_clt_con *con, size_t q_size)
+{
+ int err, i;
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+
+ for (i = 0; i < q_size; i++) {
+ if (clt_path->flags & RTRS_MSG_NEW_RKEY_F) {
+ struct rtrs_iu *iu = &con->rsp_ius[i];
+
+ err = rtrs_iu_post_recv(&con->c, iu);
+ } else {
+ err = rtrs_post_recv_empty(&con->c, &io_comp_cqe);
+ }
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int post_recv_path(struct rtrs_clt_path *clt_path)
+{
+ size_t q_size = 0;
+ int err, cid;
+
+ for (cid = 0; cid < clt_path->s.con_num; cid++) {
+ if (cid == 0)
+ q_size = SERVICE_CON_QUEUE_DEPTH;
+ else
+ q_size = clt_path->queue_depth;
+
+ /*
+ * x2 for RDMA read responses + FR key invalidations,
+ * RDMA writes do not require any FR registrations.
+ */
+ q_size *= 2;
+
+ err = post_recv_io(to_clt_con(clt_path->s.con[cid]), q_size);
+ if (err) {
+ rtrs_err(clt_path->clt, "post_recv_io(), err: %d\n",
+ err);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+struct path_it {
+ int i;
+ struct list_head skip_list;
+ struct rtrs_clt_sess *clt;
+ struct rtrs_clt_path *(*next_path)(struct path_it *it);
+};
+
+/*
+ * rtrs_clt_get_next_path_or_null - get clt path from the list or return NULL
+ * @head: the head for the list.
+ * @clt_path: The element to take the next clt_path from.
+ *
+ * The next clt path is returned in round-robin fashion, i.e. head will be
+ * skipped, but if the list is observed as empty, NULL will be returned.
+ *
+ * This function may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+static inline struct rtrs_clt_path *
+rtrs_clt_get_next_path_or_null(struct list_head *head, struct rtrs_clt_path *clt_path)
+{
+ return list_next_or_null_rcu(head, &clt_path->s.entry, typeof(*clt_path), s.entry) ?:
+ list_next_or_null_rcu(head,
+ READ_ONCE((&clt_path->s.entry)->next),
+ typeof(*clt_path), s.entry);
+}
+
+/**
+ * get_next_path_rr() - Returns path in round-robin fashion.
+ * @it: the path pointer
+ *
+ * Related to @MP_POLICY_RR
+ *
+ * Locks:
+ * rcu_read_lock() must be hold.
+ *    rcu_read_lock() must be held.
+static struct rtrs_clt_path *get_next_path_rr(struct path_it *it)
+{
+ struct rtrs_clt_path __rcu **ppcpu_path;
+ struct rtrs_clt_path *path;
+ struct rtrs_clt_sess *clt;
+
+ clt = it->clt;
+
+ /*
+ * Here we use two RCU objects: @paths_list and @pcpu_path
+ * pointer. See rtrs_clt_remove_path_from_arr() for details
+ * how that is handled.
+ */
+
+ ppcpu_path = this_cpu_ptr(clt->pcpu_path);
+ path = rcu_dereference(*ppcpu_path);
+ if (!path)
+ path = list_first_or_null_rcu(&clt->paths_list,
+ typeof(*path), s.entry);
+ else
+ path = rtrs_clt_get_next_path_or_null(&clt->paths_list, path);
+
+ rcu_assign_pointer(*ppcpu_path, path);
+
+ return path;
+}
+
+/**
+ * get_next_path_min_inflight() - Returns path with minimal inflight count.
+ * @it: the path pointer
+ *
+ * Related to @MP_POLICY_MIN_INFLIGHT
+ *
+ * Locks:
+ * rcu_read_lock() must be hold.
+ *    rcu_read_lock() must be held.
+static struct rtrs_clt_path *get_next_path_min_inflight(struct path_it *it)
+{
+ struct rtrs_clt_path *min_path = NULL;
+ struct rtrs_clt_sess *clt = it->clt;
+ struct rtrs_clt_path *clt_path;
+ int min_inflight = INT_MAX;
+ int inflight;
+
+ list_for_each_entry_rcu(clt_path, &clt->paths_list, s.entry) {
+ if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED)
+ continue;
+
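+		/* Skip paths already picked by this iteration, see it->skip_list */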
+ if (!list_empty(raw_cpu_ptr(clt_path->mp_skip_entry)))
+ continue;
+
+ inflight = atomic_read(&clt_path->stats->inflight);
+
+ if (inflight < min_inflight) {
+ min_inflight = inflight;
+ min_path = clt_path;
+ }
+ }
+
+ /*
+ * add the path to the skip list, so that next time we can get
+ * a different one
+ */
+ if (min_path)
+ list_add(raw_cpu_ptr(min_path->mp_skip_entry), &it->skip_list);
+
+ return min_path;
+}
+
+/**
+ * get_next_path_min_latency() - Returns path with minimal latency.
+ * @it: the path pointer
+ *
+ * Return: a path with the lowest latency or NULL if all paths are tried
+ *
+ * Locks:
+ *    rcu_read_lock() must be held.
+ *
+ * Related to @MP_POLICY_MIN_LATENCY
+ *
+ * This DOES skip already-tried paths.
+ * A skip-list is used to skip a path that has already been tried but failed.
+ * It will try the minimum latency path and then the second minimum latency
+ * path and so on. Finally it will return NULL once all paths have been tried.
+ * Therefore the caller MUST check whether the returned path is NULL and
+ * trigger the IO error if it is.
+ */
+static struct rtrs_clt_path *get_next_path_min_latency(struct path_it *it)
+{
+ struct rtrs_clt_path *min_path = NULL;
+ struct rtrs_clt_sess *clt = it->clt;
+ struct rtrs_clt_path *clt_path;
+ ktime_t min_latency = KTIME_MAX;
+ ktime_t latency;
+
+ list_for_each_entry_rcu(clt_path, &clt->paths_list, s.entry) {
+ if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED)
+ continue;
+
+ if (!list_empty(raw_cpu_ptr(clt_path->mp_skip_entry)))
+ continue;
+
+ latency = clt_path->s.hb_cur_latency;
+
+ if (latency < min_latency) {
+ min_latency = latency;
+ min_path = clt_path;
+ }
+ }
+
+ /*
+ * add the path to the skip list, so that next time we can get
+ * a different one
+ */
+ if (min_path)
+ list_add(raw_cpu_ptr(min_path->mp_skip_entry), &it->skip_list);
+
+ return min_path;
+}
+
+static inline void path_it_init(struct path_it *it, struct rtrs_clt_sess *clt)
+{
+ INIT_LIST_HEAD(&it->skip_list);
+ it->clt = clt;
+ it->i = 0;
+
+ if (clt->mp_policy == MP_POLICY_RR)
+ it->next_path = get_next_path_rr;
+ else if (clt->mp_policy == MP_POLICY_MIN_INFLIGHT)
+ it->next_path = get_next_path_min_inflight;
+ else
+ it->next_path = get_next_path_min_latency;
+}
+
+static inline void path_it_deinit(struct path_it *it)
+{
+ struct list_head *skip, *tmp;
+ /*
+ * The skip_list is used only for the MIN_INFLIGHT and MIN_LATENCY policies.
+ * We need to remove paths from it, so that next IO can insert
+ * paths (->mp_skip_entry) into a skip_list again.
+ */
+ list_for_each_safe(skip, tmp, &it->skip_list)
+ list_del_init(skip);
+}
+
+/**
+ * rtrs_clt_init_req() - Initialize an rtrs_clt_io_req holding information
+ * about an inflight IO.
+ * The user buffer holding user control message (not data) is copied into
+ * the corresponding buffer of rtrs_iu (req->iu->buf), which later on will
+ * also hold the control message of rtrs.
+ * @req: an io request holding information about IO.
+ * @clt_path: client path
+ * @conf: confirmation callback function to notify upper layer.
+ * @permit: permit for allocation of RDMA remote buffer
+ * @priv: private pointer
+ * @vec: kernel vector containing control message
+ * @usr_len: length of the user message
+ * @sg: scatter list for IO data
+ * @sg_cnt: number of scatter list entries
+ * @data_len: length of the IO data
+ * @dir: direction of the IO.
+ */
+static void rtrs_clt_init_req(struct rtrs_clt_io_req *req,
+ struct rtrs_clt_path *clt_path,
+ void (*conf)(void *priv, int errno),
+ struct rtrs_permit *permit, void *priv,
+ const struct kvec *vec, size_t usr_len,
+ struct scatterlist *sg, size_t sg_cnt,
+ size_t data_len, int dir)
+{
+ struct iov_iter iter;
+ size_t len;
+
+ req->permit = permit;
+ req->in_use = true;
+ req->usr_len = usr_len;
+ req->data_len = data_len;
+ req->sglist = sg;
+ req->sg_cnt = sg_cnt;
+ req->priv = priv;
+ req->dir = dir;
+ req->con = rtrs_permit_to_clt_con(clt_path, permit);
+ req->conf = conf;
+ req->need_inv = false;
+ req->need_inv_comp = false;
+ req->inv_errno = 0;
+ refcount_set(&req->ref, 1);
+ req->mp_policy = clt_path->clt->mp_policy;
+
+ iov_iter_kvec(&iter, READ, vec, 1, usr_len);
+ len = _copy_from_iter(req->iu->buf, usr_len, &iter);
+ WARN_ON(len != usr_len);
+
+ reinit_completion(&req->inv_comp);
+}
+
+static struct rtrs_clt_io_req *
+rtrs_clt_get_req(struct rtrs_clt_path *clt_path,
+ void (*conf)(void *priv, int errno),
+ struct rtrs_permit *permit, void *priv,
+ const struct kvec *vec, size_t usr_len,
+ struct scatterlist *sg, size_t sg_cnt,
+ size_t data_len, int dir)
+{
+ struct rtrs_clt_io_req *req;
+
+ req = &clt_path->reqs[permit->mem_id];
+ rtrs_clt_init_req(req, clt_path, conf, permit, priv, vec, usr_len,
+ sg, sg_cnt, data_len, dir);
+ return req;
+}
+
+static struct rtrs_clt_io_req *
+rtrs_clt_get_copy_req(struct rtrs_clt_path *alive_path,
+ struct rtrs_clt_io_req *fail_req)
+{
+ struct rtrs_clt_io_req *req;
+ struct kvec vec = {
+ .iov_base = fail_req->iu->buf,
+ .iov_len = fail_req->usr_len
+ };
+
+ req = &alive_path->reqs[fail_req->permit->mem_id];
+ rtrs_clt_init_req(req, alive_path, fail_req->conf, fail_req->permit,
+ fail_req->priv, &vec, fail_req->usr_len,
+ fail_req->sglist, fail_req->sg_cnt,
+ fail_req->data_len, fail_req->dir);
+ return req;
+}
+
+static int rtrs_post_rdma_write_sg(struct rtrs_clt_con *con,
+ struct rtrs_clt_io_req *req,
+ struct rtrs_rbuf *rbuf, bool fr_en,
+ u32 count, u32 size, u32 imm,
+ struct ib_send_wr *wr,
+ struct ib_send_wr *tail)
+{
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+ struct ib_sge *sge = req->sge;
+ enum ib_send_flags flags;
+ struct scatterlist *sg;
+ size_t num_sge;
+ int i;
+ struct ib_send_wr *ptail = NULL;
+
+ if (fr_en) {
+ i = 0;
+ sge[i].addr = req->mr->iova;
+ sge[i].length = req->mr->length;
+ sge[i].lkey = req->mr->lkey;
+ i++;
+ num_sge = 2;
+ ptail = tail;
+ } else {
+ for_each_sg(req->sglist, sg, count, i) {
+ sge[i].addr = sg_dma_address(sg);
+ sge[i].length = sg_dma_len(sg);
+ sge[i].lkey = clt_path->s.dev->ib_pd->local_dma_lkey;
+ }
+ num_sge = 1 + count;
+ }
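+	/* The last sge carries the IU holding the user message and rtrs header */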
+ sge[i].addr = req->iu->dma_addr;
+ sge[i].length = size;
+ sge[i].lkey = clt_path->s.dev->ib_pd->local_dma_lkey;
+
+ /*
+ * From time to time we have to post signalled sends,
+ * or send queue will fill up and only QP reset can help.
+ */
+ flags = atomic_inc_return(&con->c.wr_cnt) % clt_path->s.signal_interval ?
+ 0 : IB_SEND_SIGNALED;
+
+ ib_dma_sync_single_for_device(clt_path->s.dev->ib_dev,
+ req->iu->dma_addr,
+ size, DMA_TO_DEVICE);
+
+ return rtrs_iu_post_rdma_write_imm(&con->c, req->iu, sge, num_sge,
+ rbuf->rkey, rbuf->addr, imm,
+ flags, wr, ptail);
+}
+
+static int rtrs_map_sg_fr(struct rtrs_clt_io_req *req, size_t count)
+{
+ int nr;
+
+ /* Align the MR to a 4K page size to match the block virt boundary */
+ nr = ib_map_mr_sg(req->mr, req->sglist, count, NULL, SZ_4K);
+ if (nr < 0)
+ return nr;
+ if (nr < req->sg_cnt)
+ return -EINVAL;
+ ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
+
+ return nr;
+}
+
+static int rtrs_clt_write_req(struct rtrs_clt_io_req *req)
+{
+ struct rtrs_clt_con *con = req->con;
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_clt_path *clt_path = to_clt_path(s);
+ struct rtrs_msg_rdma_write *msg;
+
+ struct rtrs_rbuf *rbuf;
+ int ret, count = 0;
+ u32 imm, buf_id;
+ struct ib_reg_wr rwr;
+ struct ib_send_wr inv_wr;
+ struct ib_send_wr *wr = NULL;
+ bool fr_en = false;
+
+ const size_t tsize = sizeof(*msg) + req->data_len + req->usr_len;
+
+ if (tsize > clt_path->chunk_size) {
+ rtrs_wrn(s, "Write request failed, size too big %zu > %d\n",
+ tsize, clt_path->chunk_size);
+ return -EMSGSIZE;
+ }
+ if (req->sg_cnt) {
+ count = ib_dma_map_sg(clt_path->s.dev->ib_dev, req->sglist,
+ req->sg_cnt, req->dir);
+ if (!count) {
+ rtrs_wrn(s, "Write request failed, map failed\n");
+ return -EINVAL;
+ }
+ }
+ /* put rtrs msg after sg and user message */
+ msg = req->iu->buf + req->usr_len;
+ msg->type = cpu_to_le16(RTRS_MSG_WRITE);
+ msg->usr_len = cpu_to_le16(req->usr_len);
+
+ /* rtrs message on server side will be after user data and message */
+ imm = req->permit->mem_off + req->data_len + req->usr_len;
+ imm = rtrs_to_io_req_imm(imm);
+ buf_id = req->permit->mem_id;
+ req->sg_size = tsize;
+ rbuf = &clt_path->rbufs[buf_id];
+
+ if (count) {
+ ret = rtrs_map_sg_fr(req, count);
+ if (ret < 0) {
+ rtrs_err_rl(s,
+ "Write request failed, failed to map fast reg. data, err: %d\n",
+ ret);
+ ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist,
+ req->sg_cnt, req->dir);
+ return ret;
+ }
+ inv_wr = (struct ib_send_wr) {
+ .opcode = IB_WR_LOCAL_INV,
+ .wr_cqe = &req->inv_cqe,
+ .send_flags = IB_SEND_SIGNALED,
+ .ex.invalidate_rkey = req->mr->rkey,
+ };
+ req->inv_cqe.done = rtrs_clt_inv_rkey_done;
+ rwr = (struct ib_reg_wr) {
+ .wr.opcode = IB_WR_REG_MR,
+ .wr.wr_cqe = &fast_reg_cqe,
+ .mr = req->mr,
+ .key = req->mr->rkey,
+ .access = (IB_ACCESS_LOCAL_WRITE),
+ };
+ wr = &rwr.wr;
+ fr_en = true;
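+		/* Extra ref for the chained local-invalidate completion (req->inv_cqe) */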
+ refcount_inc(&req->ref);
+ }
+ /*
+ * Update stats now, after request is successfully sent it is not
+ * safe anymore to touch it.
+ */
+ rtrs_clt_update_all_stats(req, WRITE);
+
+ ret = rtrs_post_rdma_write_sg(req->con, req, rbuf, fr_en, count,
+ req->usr_len + sizeof(*msg),
+ imm, wr, &inv_wr);
+ if (ret) {
+ rtrs_err_rl(s,
+ "Write request failed: error=%d path=%s [%s:%u]\n",
+ ret, kobject_name(&clt_path->kobj), clt_path->hca_name,
+ clt_path->hca_port);
+ if (req->mp_policy == MP_POLICY_MIN_INFLIGHT)
+ atomic_dec(&clt_path->stats->inflight);
+ if (req->sg_cnt)
+ ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist,
+ req->sg_cnt, req->dir);
+ }
+
+ return ret;
+}
+
+static int rtrs_clt_read_req(struct rtrs_clt_io_req *req)
+{
+ struct rtrs_clt_con *con = req->con;
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_clt_path *clt_path = to_clt_path(s);
+ struct rtrs_msg_rdma_read *msg;
+ struct rtrs_ib_dev *dev = clt_path->s.dev;
+
+ struct ib_reg_wr rwr;
+ struct ib_send_wr *wr = NULL;
+
+ int ret, count = 0;
+ u32 imm, buf_id;
+
+ const size_t tsize = sizeof(*msg) + req->data_len + req->usr_len;
+
+ if (tsize > clt_path->chunk_size) {
+ rtrs_wrn(s,
+ "Read request failed, message size is %zu, bigger than CHUNK_SIZE %d\n",
+ tsize, clt_path->chunk_size);
+ return -EMSGSIZE;
+ }
+
+ if (req->sg_cnt) {
+ count = ib_dma_map_sg(dev->ib_dev, req->sglist, req->sg_cnt,
+ req->dir);
+ if (!count) {
+ rtrs_wrn(s,
+ "Read request failed, dma map failed\n");
+ return -EINVAL;
+ }
+ }
+	/* put our message into req->buf after the user message */
+ msg = req->iu->buf + req->usr_len;
+ msg->type = cpu_to_le16(RTRS_MSG_READ);
+ msg->usr_len = cpu_to_le16(req->usr_len);
+
+ if (count) {
+ ret = rtrs_map_sg_fr(req, count);
+ if (ret < 0) {
+ rtrs_err_rl(s,
+ "Read request failed, failed to map fast reg. data, err: %d\n",
+ ret);
+ ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt,
+ req->dir);
+ return ret;
+ }
+ rwr = (struct ib_reg_wr) {
+ .wr.opcode = IB_WR_REG_MR,
+ .wr.wr_cqe = &fast_reg_cqe,
+ .mr = req->mr,
+ .key = req->mr->rkey,
+ .access = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE),
+ };
+ wr = &rwr.wr;
+
+ msg->sg_cnt = cpu_to_le16(1);
+ msg->flags = cpu_to_le16(RTRS_MSG_NEED_INVAL_F);
+
+ msg->desc[0].addr = cpu_to_le64(req->mr->iova);
+ msg->desc[0].key = cpu_to_le32(req->mr->rkey);
+ msg->desc[0].len = cpu_to_le32(req->mr->length);
+
+ /* Further invalidation is required */
+ req->need_inv = !!RTRS_MSG_NEED_INVAL_F;
+
+ } else {
+ msg->sg_cnt = 0;
+ msg->flags = 0;
+ }
+ /*
+ * rtrs message will be after the space reserved for disk data and
+ * user message
+ */
+ imm = req->permit->mem_off + req->data_len + req->usr_len;
+ imm = rtrs_to_io_req_imm(imm);
+ buf_id = req->permit->mem_id;
+
+ req->sg_size = sizeof(*msg);
+ req->sg_size += le16_to_cpu(msg->sg_cnt) * sizeof(struct rtrs_sg_desc);
+ req->sg_size += req->usr_len;
+
+ /*
+ * Update stats now, after request is successfully sent it is not
+ * safe anymore to touch it.
+ */
+ rtrs_clt_update_all_stats(req, READ);
+
+ ret = rtrs_post_send_rdma(req->con, req, &clt_path->rbufs[buf_id],
+ req->data_len, imm, wr);
+ if (ret) {
+ rtrs_err_rl(s,
+ "Read request failed: error=%d path=%s [%s:%u]\n",
+ ret, kobject_name(&clt_path->kobj), clt_path->hca_name,
+ clt_path->hca_port);
+ if (req->mp_policy == MP_POLICY_MIN_INFLIGHT)
+ atomic_dec(&clt_path->stats->inflight);
+ req->need_inv = false;
+ if (req->sg_cnt)
+ ib_dma_unmap_sg(dev->ib_dev, req->sglist,
+ req->sg_cnt, req->dir);
+ }
+
+ return ret;
+}
+
+/**
+ * rtrs_clt_failover_req() - Try to find an active path for a failed request
+ * @clt: clt context
+ * @fail_req: a failed io request.
+ */
+static int rtrs_clt_failover_req(struct rtrs_clt_sess *clt,
+ struct rtrs_clt_io_req *fail_req)
+{
+ struct rtrs_clt_path *alive_path;
+ struct rtrs_clt_io_req *req;
+ int err = -ECONNABORTED;
+ struct path_it it;
+
+ rcu_read_lock();
+ for (path_it_init(&it, clt);
+ (alive_path = it.next_path(&it)) && it.i < it.clt->paths_num;
+ it.i++) {
+ if (READ_ONCE(alive_path->state) != RTRS_CLT_CONNECTED)
+ continue;
+ req = rtrs_clt_get_copy_req(alive_path, fail_req);
+ if (req->dir == DMA_TO_DEVICE)
+ err = rtrs_clt_write_req(req);
+ else
+ err = rtrs_clt_read_req(req);
+ if (err) {
+ req->in_use = false;
+ continue;
+ }
+ /* Success path */
+ rtrs_clt_inc_failover_cnt(alive_path->stats);
+ break;
+ }
+ path_it_deinit(&it);
+ rcu_read_unlock();
+
+ return err;
+}
+
+static void fail_all_outstanding_reqs(struct rtrs_clt_path *clt_path)
+{
+ struct rtrs_clt_sess *clt = clt_path->clt;
+ struct rtrs_clt_io_req *req;
+ int i, err;
+
+ if (!clt_path->reqs)
+ return;
+ for (i = 0; i < clt_path->queue_depth; ++i) {
+ req = &clt_path->reqs[i];
+ if (!req->in_use)
+ continue;
+
+ /*
+ * Safely (without notification) complete failed request.
+		 * After completion this request is still usable and can
+		 * be failed over to another path.
+ */
+ complete_rdma_req(req, -ECONNABORTED, false, true);
+
+ err = rtrs_clt_failover_req(clt, req);
+ if (err)
+ /* Failover failed, notify anyway */
+ req->conf(req->priv, err);
+ }
+}
+
+static void free_path_reqs(struct rtrs_clt_path *clt_path)
+{
+ struct rtrs_clt_io_req *req;
+ int i;
+
+ if (!clt_path->reqs)
+ return;
+ for (i = 0; i < clt_path->queue_depth; ++i) {
+ req = &clt_path->reqs[i];
+ if (req->mr)
+ ib_dereg_mr(req->mr);
+ kfree(req->sge);
+ rtrs_iu_free(req->iu, clt_path->s.dev->ib_dev, 1);
+ }
+ kfree(clt_path->reqs);
+ clt_path->reqs = NULL;
+}
+
+static int alloc_path_reqs(struct rtrs_clt_path *clt_path)
+{
+ struct rtrs_clt_io_req *req;
+ int i, err = -ENOMEM;
+
+ clt_path->reqs = kcalloc(clt_path->queue_depth,
+ sizeof(*clt_path->reqs),
+ GFP_KERNEL);
+ if (!clt_path->reqs)
+ return -ENOMEM;
+
+ for (i = 0; i < clt_path->queue_depth; ++i) {
+ req = &clt_path->reqs[i];
+ req->iu = rtrs_iu_alloc(1, clt_path->max_hdr_size, GFP_KERNEL,
+ clt_path->s.dev->ib_dev,
+ DMA_TO_DEVICE,
+ rtrs_clt_rdma_done);
+ if (!req->iu)
+ goto out;
+
+ req->sge = kcalloc(2, sizeof(*req->sge), GFP_KERNEL);
+ if (!req->sge)
+ goto out;
+
+ req->mr = ib_alloc_mr(clt_path->s.dev->ib_pd,
+ IB_MR_TYPE_MEM_REG,
+ clt_path->max_pages_per_mr);
+ if (IS_ERR(req->mr)) {
+ err = PTR_ERR(req->mr);
+ req->mr = NULL;
+ pr_err("Failed to alloc clt_path->max_pages_per_mr %d\n",
+ clt_path->max_pages_per_mr);
+ goto out;
+ }
+
+ init_completion(&req->inv_comp);
+ }
+
+ return 0;
+
+out:
+ free_path_reqs(clt_path);
+
+ return err;
+}
+
+static int alloc_permits(struct rtrs_clt_sess *clt)
+{
+ unsigned int chunk_bits;
+ int err, i;
+
+ clt->permits_map = bitmap_zalloc(clt->queue_depth, GFP_KERNEL);
+ if (!clt->permits_map) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+ clt->permits = kcalloc(clt->queue_depth, permit_size(clt), GFP_KERNEL);
+ if (!clt->permits) {
+ err = -ENOMEM;
+ goto err_map;
+ }
+ chunk_bits = ilog2(clt->queue_depth - 1) + 1;
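+	/*
+	 * mem_off places the permit index in the upper bits of the immediate
+	 * payload; the lower bits are left for the byte offset within a chunk.
+	 */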
+ for (i = 0; i < clt->queue_depth; i++) {
+ struct rtrs_permit *permit;
+
+ permit = get_permit(clt, i);
+ permit->mem_id = i;
+ permit->mem_off = i << (MAX_IMM_PAYL_BITS - chunk_bits);
+ }
+
+ return 0;
+
+err_map:
+ bitmap_free(clt->permits_map);
+ clt->permits_map = NULL;
+out_err:
+ return err;
+}
+
+static void free_permits(struct rtrs_clt_sess *clt)
+{
+ if (clt->permits_map)
+ wait_event(clt->permits_wait,
+ bitmap_empty(clt->permits_map, clt->queue_depth));
+
+ bitmap_free(clt->permits_map);
+ clt->permits_map = NULL;
+ kfree(clt->permits);
+ clt->permits = NULL;
+}
+
+static void query_fast_reg_mode(struct rtrs_clt_path *clt_path)
+{
+ struct ib_device *ib_dev;
+ u64 max_pages_per_mr;
+ int mr_page_shift;
+
+ ib_dev = clt_path->s.dev->ib_dev;
+
+ /*
+ * Use the smallest page size supported by the HCA, down to a
+ * minimum of 4096 bytes. We're unlikely to build large sglists
+ * out of smaller entries.
+ */
+ mr_page_shift = max(12, ffs(ib_dev->attrs.page_size_cap) - 1);
+ max_pages_per_mr = ib_dev->attrs.max_mr_size;
+ do_div(max_pages_per_mr, (1ull << mr_page_shift));
+ clt_path->max_pages_per_mr =
+ min3(clt_path->max_pages_per_mr, (u32)max_pages_per_mr,
+ ib_dev->attrs.max_fast_reg_page_list_len);
+ clt_path->clt->max_segments =
+ min(clt_path->max_pages_per_mr, clt_path->clt->max_segments);
+}
+
+static bool rtrs_clt_change_state_get_old(struct rtrs_clt_path *clt_path,
+ enum rtrs_clt_state new_state,
+ enum rtrs_clt_state *old_state)
+{
+ bool changed;
+
+ spin_lock_irq(&clt_path->state_wq.lock);
+ if (old_state)
+ *old_state = clt_path->state;
+ changed = rtrs_clt_change_state(clt_path, new_state);
+ spin_unlock_irq(&clt_path->state_wq.lock);
+
+ return changed;
+}
+
+static void rtrs_clt_hb_err_handler(struct rtrs_con *c)
+{
+ struct rtrs_clt_con *con = container_of(c, typeof(*con), c);
+
+ rtrs_rdma_error_recovery(con);
+}
+
+static void rtrs_clt_init_hb(struct rtrs_clt_path *clt_path)
+{
+ rtrs_init_hb(&clt_path->s, &io_comp_cqe,
+ RTRS_HB_INTERVAL_MS,
+ RTRS_HB_MISSED_MAX,
+ rtrs_clt_hb_err_handler,
+ rtrs_wq);
+}
+
+static void rtrs_clt_reconnect_work(struct work_struct *work);
+static void rtrs_clt_close_work(struct work_struct *work);
+
+static void rtrs_clt_err_recovery_work(struct work_struct *work)
+{
+ struct rtrs_clt_path *clt_path;
+ struct rtrs_clt_sess *clt;
+ int delay_ms;
+
+ clt_path = container_of(work, struct rtrs_clt_path, err_recovery_work);
+ clt = clt_path->clt;
+ delay_ms = clt->reconnect_delay_sec * 1000;
+ rtrs_clt_stop_and_destroy_conns(clt_path);
+ queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
+ msecs_to_jiffies(delay_ms +
+ prandom_u32_max(RTRS_RECONNECT_SEED)));
+}
+
+static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt,
+ const struct rtrs_addr *path,
+ size_t con_num, u32 nr_poll_queues)
+{
+ struct rtrs_clt_path *clt_path;
+ int err = -ENOMEM;
+ int cpu;
+ size_t total_con;
+
+ clt_path = kzalloc(sizeof(*clt_path), GFP_KERNEL);
+ if (!clt_path)
+ goto err;
+
+ /*
+ * irqmode and poll
+ * +1: Extra connection for user messages
+ */
+ total_con = con_num + nr_poll_queues + 1;
+ clt_path->s.con = kcalloc(total_con, sizeof(*clt_path->s.con),
+ GFP_KERNEL);
+ if (!clt_path->s.con)
+ goto err_free_path;
+
+ clt_path->s.con_num = total_con;
+ clt_path->s.irq_con_num = con_num + 1;
+
+ clt_path->stats = kzalloc(sizeof(*clt_path->stats), GFP_KERNEL);
+ if (!clt_path->stats)
+ goto err_free_con;
+
+ mutex_init(&clt_path->init_mutex);
+ uuid_gen(&clt_path->s.uuid);
+ memcpy(&clt_path->s.dst_addr, path->dst,
+ rdma_addr_size((struct sockaddr *)path->dst));
+
+ /*
+ * rdma_resolve_addr() passes src_addr to cma_bind_addr, which
+ * checks the sa_family to be non-zero. If user passed src_addr=NULL
+ * the sess->src_addr will contain only zeros, which is then fine.
+ */
+ if (path->src)
+ memcpy(&clt_path->s.src_addr, path->src,
+ rdma_addr_size((struct sockaddr *)path->src));
+ strscpy(clt_path->s.sessname, clt->sessname,
+ sizeof(clt_path->s.sessname));
+ clt_path->clt = clt;
+ clt_path->max_pages_per_mr = RTRS_MAX_SEGMENTS;
+ init_waitqueue_head(&clt_path->state_wq);
+ clt_path->state = RTRS_CLT_CONNECTING;
+ atomic_set(&clt_path->connected_cnt, 0);
+ INIT_WORK(&clt_path->close_work, rtrs_clt_close_work);
+ INIT_WORK(&clt_path->err_recovery_work, rtrs_clt_err_recovery_work);
+ INIT_DELAYED_WORK(&clt_path->reconnect_dwork, rtrs_clt_reconnect_work);
+ rtrs_clt_init_hb(clt_path);
+
+ clt_path->mp_skip_entry = alloc_percpu(typeof(*clt_path->mp_skip_entry));
+ if (!clt_path->mp_skip_entry)
+ goto err_free_stats;
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(per_cpu_ptr(clt_path->mp_skip_entry, cpu));
+
+ err = rtrs_clt_init_stats(clt_path->stats);
+ if (err)
+ goto err_free_percpu;
+
+ return clt_path;
+
+err_free_percpu:
+ free_percpu(clt_path->mp_skip_entry);
+err_free_stats:
+ kfree(clt_path->stats);
+err_free_con:
+ kfree(clt_path->s.con);
+err_free_path:
+ kfree(clt_path);
+err:
+ return ERR_PTR(err);
+}
+
+void free_path(struct rtrs_clt_path *clt_path)
+{
+ free_percpu(clt_path->mp_skip_entry);
+ mutex_destroy(&clt_path->init_mutex);
+ kfree(clt_path->s.con);
+ kfree(clt_path->rbufs);
+ kfree(clt_path);
+}
+
+static int create_con(struct rtrs_clt_path *clt_path, unsigned int cid)
+{
+ struct rtrs_clt_con *con;
+
+ con = kzalloc(sizeof(*con), GFP_KERNEL);
+ if (!con)
+ return -ENOMEM;
+
+ /* Map first two connections to the first CPU */
+ con->cpu = (cid ? cid - 1 : 0) % nr_cpu_ids;
+ con->c.cid = cid;
+ con->c.path = &clt_path->s;
+ /* Align with srv, init as 1 */
+ atomic_set(&con->c.wr_cnt, 1);
+ mutex_init(&con->con_mutex);
+
+ clt_path->s.con[cid] = &con->c;
+
+ return 0;
+}
+
+static void destroy_con(struct rtrs_clt_con *con)
+{
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+
+ clt_path->s.con[con->c.cid] = NULL;
+ mutex_destroy(&con->con_mutex);
+ kfree(con);
+}
+
+static int create_con_cq_qp(struct rtrs_clt_con *con)
+{
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+ u32 max_send_wr, max_recv_wr, cq_num, max_send_sge, wr_limit;
+ int err, cq_vector;
+ struct rtrs_msg_rkey_rsp *rsp;
+
+ lockdep_assert_held(&con->con_mutex);
+ if (con->c.cid == 0) {
+ max_send_sge = 1;
+ /* We must be the first here */
+ if (WARN_ON(clt_path->s.dev))
+ return -EINVAL;
+
+ /*
+ * The whole session uses device from user connection.
+ * Be careful not to close user connection before ib dev
+ * is gracefully put.
+ */
+ clt_path->s.dev = rtrs_ib_dev_find_or_add(con->c.cm_id->device,
+ &dev_pd);
+ if (!clt_path->s.dev) {
+ rtrs_wrn(clt_path->clt,
+ "rtrs_ib_dev_find_get_or_add(): no memory\n");
+ return -ENOMEM;
+ }
+ clt_path->s.dev_ref = 1;
+ query_fast_reg_mode(clt_path);
+ wr_limit = clt_path->s.dev->ib_dev->attrs.max_qp_wr;
+ /*
+		 * Two (request + registration) completions for send
+ * Two for recv if always_invalidate is set on server
+ * or one for recv.
+ * + 2 for drain and heartbeat
+ * in case qp gets into error state.
+ */
+ max_send_wr =
+ min_t(int, wr_limit, SERVICE_CON_QUEUE_DEPTH * 2 + 2);
+ max_recv_wr = max_send_wr;
+ } else {
+ /*
+ * Here we assume that session members are correctly set.
+ * This is always true if user connection (cid == 0) is
+ * established first.
+ */
+ if (WARN_ON(!clt_path->s.dev))
+ return -EINVAL;
+ if (WARN_ON(!clt_path->queue_depth))
+ return -EINVAL;
+
+ wr_limit = clt_path->s.dev->ib_dev->attrs.max_qp_wr;
+ /* Shared between connections */
+ clt_path->s.dev_ref++;
+ max_send_wr = min_t(int, wr_limit,
+ /* QD * (REQ + RSP + FR REGS or INVS) + drain */
+ clt_path->queue_depth * 3 + 1);
+ max_recv_wr = min_t(int, wr_limit,
+ clt_path->queue_depth * 3 + 1);
+ max_send_sge = 2;
+ }
+ atomic_set(&con->c.sq_wr_avail, max_send_wr);
+ cq_num = max_send_wr + max_recv_wr;
+ /* alloc iu to recv new rkey reply when server reports flags set */
+ if (clt_path->flags & RTRS_MSG_NEW_RKEY_F || con->c.cid == 0) {
+ con->rsp_ius = rtrs_iu_alloc(cq_num, sizeof(*rsp),
+ GFP_KERNEL,
+ clt_path->s.dev->ib_dev,
+ DMA_FROM_DEVICE,
+ rtrs_clt_rdma_done);
+ if (!con->rsp_ius)
+ return -ENOMEM;
+ con->queue_num = cq_num;
+ }
+ cq_num = max_send_wr + max_recv_wr;
+ cq_vector = con->cpu % clt_path->s.dev->ib_dev->num_comp_vectors;
+ if (con->c.cid >= clt_path->s.irq_con_num)
+ err = rtrs_cq_qp_create(&clt_path->s, &con->c, max_send_sge,
+ cq_vector, cq_num, max_send_wr,
+ max_recv_wr, IB_POLL_DIRECT);
+ else
+ err = rtrs_cq_qp_create(&clt_path->s, &con->c, max_send_sge,
+ cq_vector, cq_num, max_send_wr,
+ max_recv_wr, IB_POLL_SOFTIRQ);
+ /*
+ * In case of error we do not bother to clean previous allocations,
+ * since destroy_con_cq_qp() must be called.
+ */
+ return err;
+}
+
+static void destroy_con_cq_qp(struct rtrs_clt_con *con)
+{
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+
+ /*
+	 * Be careful here: destroy_con_cq_qp() can be called even if
+	 * create_con_cq_qp() failed, see comments there.
+ */
+ lockdep_assert_held(&con->con_mutex);
+ rtrs_cq_qp_destroy(&con->c);
+ if (con->rsp_ius) {
+ rtrs_iu_free(con->rsp_ius, clt_path->s.dev->ib_dev,
+ con->queue_num);
+ con->rsp_ius = NULL;
+ con->queue_num = 0;
+ }
+ if (clt_path->s.dev_ref && !--clt_path->s.dev_ref) {
+ rtrs_ib_dev_put(clt_path->s.dev);
+ clt_path->s.dev = NULL;
+ }
+}
+
+static void stop_cm(struct rtrs_clt_con *con)
+{
+ rdma_disconnect(con->c.cm_id);
+ if (con->c.qp)
+ ib_drain_qp(con->c.qp);
+}
+
+static void destroy_cm(struct rtrs_clt_con *con)
+{
+ rdma_destroy_id(con->c.cm_id);
+ con->c.cm_id = NULL;
+}
+
+static int rtrs_rdma_addr_resolved(struct rtrs_clt_con *con)
+{
+ struct rtrs_path *s = con->c.path;
+ int err;
+
+ mutex_lock(&con->con_mutex);
+ err = create_con_cq_qp(con);
+ mutex_unlock(&con->con_mutex);
+ if (err) {
+ rtrs_err(s, "create_con_cq_qp(), err: %d\n", err);
+ return err;
+ }
+ err = rdma_resolve_route(con->c.cm_id, RTRS_CONNECT_TIMEOUT_MS);
+ if (err)
+ rtrs_err(s, "Resolving route failed, err: %d\n", err);
+
+ return err;
+}
+
+static int rtrs_rdma_route_resolved(struct rtrs_clt_con *con)
+{
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+ struct rtrs_clt_sess *clt = clt_path->clt;
+ struct rtrs_msg_conn_req msg;
+ struct rdma_conn_param param;
+
+ int err;
+
+ param = (struct rdma_conn_param) {
+ .retry_count = 7,
+ .rnr_retry_count = 7,
+ .private_data = &msg,
+ .private_data_len = sizeof(msg),
+ };
+
+ msg = (struct rtrs_msg_conn_req) {
+ .magic = cpu_to_le16(RTRS_MAGIC),
+ .version = cpu_to_le16(RTRS_PROTO_VER),
+ .cid = cpu_to_le16(con->c.cid),
+ .cid_num = cpu_to_le16(clt_path->s.con_num),
+ .recon_cnt = cpu_to_le16(clt_path->s.recon_cnt),
+ };
+ msg.first_conn = clt_path->for_new_clt ? FIRST_CONN : 0;
+ uuid_copy(&msg.sess_uuid, &clt_path->s.uuid);
+ uuid_copy(&msg.paths_uuid, &clt->paths_uuid);
+
+ err = rdma_connect_locked(con->c.cm_id, &param);
+ if (err)
+ rtrs_err(clt, "rdma_connect_locked(): %d\n", err);
+
+ return err;
+}
+
+static int rtrs_rdma_conn_established(struct rtrs_clt_con *con,
+ struct rdma_cm_event *ev)
+{
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+ struct rtrs_clt_sess *clt = clt_path->clt;
+ const struct rtrs_msg_conn_rsp *msg;
+ u16 version, queue_depth;
+ int errno;
+ u8 len;
+
+ msg = ev->param.conn.private_data;
+ len = ev->param.conn.private_data_len;
+ if (len < sizeof(*msg)) {
+ rtrs_err(clt, "Invalid RTRS connection response\n");
+ return -ECONNRESET;
+ }
+ if (le16_to_cpu(msg->magic) != RTRS_MAGIC) {
+ rtrs_err(clt, "Invalid RTRS magic\n");
+ return -ECONNRESET;
+ }
+ version = le16_to_cpu(msg->version);
+ if (version >> 8 != RTRS_PROTO_VER_MAJOR) {
+ rtrs_err(clt, "Unsupported major RTRS version: %d, expected %d\n",
+ version >> 8, RTRS_PROTO_VER_MAJOR);
+ return -ECONNRESET;
+ }
+ errno = le16_to_cpu(msg->errno);
+ if (errno) {
+ rtrs_err(clt, "Invalid RTRS message: errno %d\n",
+ errno);
+ return -ECONNRESET;
+ }
+ if (con->c.cid == 0) {
+ queue_depth = le16_to_cpu(msg->queue_depth);
+
+ if (clt_path->queue_depth > 0 && queue_depth != clt_path->queue_depth) {
+ rtrs_err(clt, "Error: queue depth changed\n");
+
+ /*
+ * Stop any more reconnection attempts
+ */
+ clt_path->reconnect_attempts = -1;
+ rtrs_err(clt,
+ "Disabling auto-reconnect. Trigger a manual reconnect after issue is resolved\n");
+ return -ECONNRESET;
+ }
+
+ if (!clt_path->rbufs) {
+ clt_path->rbufs = kcalloc(queue_depth,
+ sizeof(*clt_path->rbufs),
+ GFP_KERNEL);
+ if (!clt_path->rbufs)
+ return -ENOMEM;
+ }
+ clt_path->queue_depth = queue_depth;
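+		/* A send is signalled every signal_interval WRs so the send queue gets drained */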
+ clt_path->s.signal_interval = min_not_zero(queue_depth,
+ (unsigned short) SERVICE_CON_QUEUE_DEPTH);
+ clt_path->max_hdr_size = le32_to_cpu(msg->max_hdr_size);
+ clt_path->max_io_size = le32_to_cpu(msg->max_io_size);
+ clt_path->flags = le32_to_cpu(msg->flags);
+ clt_path->chunk_size = clt_path->max_io_size + clt_path->max_hdr_size;
+
+ /*
+		 * The global IO size is always a minimum.
+		 * If during a reconnection the server sends us a slightly
+		 * higher value, the client does not care and keeps using
+		 * the cached minimum.
+		 *
+		 * Since several sessions (paths) can be re-establishing
+		 * connections in parallel, take the lock.
+ */
+ mutex_lock(&clt->paths_mutex);
+ clt->queue_depth = clt_path->queue_depth;
+ clt->max_io_size = min_not_zero(clt_path->max_io_size,
+ clt->max_io_size);
+ mutex_unlock(&clt->paths_mutex);
+
+ /*
+ * Cache the hca_port and hca_name for sysfs
+ */
+ clt_path->hca_port = con->c.cm_id->port_num;
+ scnprintf(clt_path->hca_name, sizeof(clt_path->hca_name),
+			  "%s", clt_path->s.dev->ib_dev->name);
+ clt_path->s.src_addr = con->c.cm_id->route.addr.src_addr;
+ /* set for_new_clt, to allow future reconnect on any path */
+ clt_path->for_new_clt = 1;
+ }
+
+ return 0;
+}
+
+static inline void flag_success_on_conn(struct rtrs_clt_con *con)
+{
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+
+ atomic_inc(&clt_path->connected_cnt);
+ con->cm_err = 1;
+}
+
+static int rtrs_rdma_conn_rejected(struct rtrs_clt_con *con,
+ struct rdma_cm_event *ev)
+{
+ struct rtrs_path *s = con->c.path;
+ const struct rtrs_msg_conn_rsp *msg;
+ const char *rej_msg;
+ int status, errno;
+ u8 data_len;
+
+ status = ev->status;
+ rej_msg = rdma_reject_msg(con->c.cm_id, status);
+ msg = rdma_consumer_reject_data(con->c.cm_id, ev, &data_len);
+
+ if (msg && data_len >= sizeof(*msg)) {
+ errno = (int16_t)le16_to_cpu(msg->errno);
+ if (errno == -EBUSY)
+ rtrs_err(s,
+				 "Previous session still exists on the server, please reconnect later\n");
+ else
+ rtrs_err(s,
+ "Connect rejected: status %d (%s), rtrs errno %d\n",
+ status, rej_msg, errno);
+ } else {
+ rtrs_err(s,
+ "Connect rejected but with malformed message: status %d (%s)\n",
+ status, rej_msg);
+ }
+
+ return -ECONNRESET;
+}
+
+void rtrs_clt_close_conns(struct rtrs_clt_path *clt_path, bool wait)
+{
+ trace_rtrs_clt_close_conns(clt_path);
+
+ if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CLOSING, NULL))
+ queue_work(rtrs_wq, &clt_path->close_work);
+ if (wait)
+ flush_work(&clt_path->close_work);
+}
+
+static inline void flag_error_on_conn(struct rtrs_clt_con *con, int cm_err)
+{
+ if (con->cm_err == 1) {
+ struct rtrs_clt_path *clt_path;
+
+ clt_path = to_clt_path(con->c.path);
+		if (atomic_dec_and_test(&clt_path->connected_cnt))
+			wake_up(&clt_path->state_wq);
+ }
+ con->cm_err = cm_err;
+}
+
+static int rtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *ev)
+{
+ struct rtrs_clt_con *con = cm_id->context;
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_clt_path *clt_path = to_clt_path(s);
+ int cm_err = 0;
+
+ switch (ev->event) {
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ cm_err = rtrs_rdma_addr_resolved(con);
+ break;
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ cm_err = rtrs_rdma_route_resolved(con);
+ break;
+ case RDMA_CM_EVENT_ESTABLISHED:
+ cm_err = rtrs_rdma_conn_established(con, ev);
+ if (!cm_err) {
+ /*
+ * Report success and wake up. Here we abuse state_wq,
+ * i.e. wake up without state change, but we set cm_err.
+ */
+ flag_success_on_conn(con);
+ wake_up(&clt_path->state_wq);
+ return 0;
+ }
+ break;
+ case RDMA_CM_EVENT_REJECTED:
+ cm_err = rtrs_rdma_conn_rejected(con, ev);
+ break;
+ case RDMA_CM_EVENT_DISCONNECTED:
+ /* No message for disconnecting */
+ cm_err = -ECONNRESET;
+ break;
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ case RDMA_CM_EVENT_UNREACHABLE:
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+ rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n",
+ rdma_event_msg(ev->event), ev->status);
+ cm_err = -ECONNRESET;
+ break;
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n",
+ rdma_event_msg(ev->event), ev->status);
+ cm_err = -EHOSTUNREACH;
+ break;
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ /*
+ * Device removal is a special case. Queue close and return 0.
+ */
+ rtrs_clt_close_conns(clt_path, false);
+ return 0;
+ default:
+ rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %d)\n",
+ rdma_event_msg(ev->event), ev->status);
+ cm_err = -ECONNRESET;
+ break;
+ }
+
+ if (cm_err) {
+ /*
+		 * A CM error only makes sense during connection establishment;
+		 * in other cases we rely on the normal reconnect procedure.
+ */
+ flag_error_on_conn(con, cm_err);
+ rtrs_rdma_error_recovery(con);
+ }
+
+ return 0;
+}
+
+static int create_cm(struct rtrs_clt_con *con)
+{
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_clt_path *clt_path = to_clt_path(s);
+ struct rdma_cm_id *cm_id;
+ int err;
+
+ cm_id = rdma_create_id(&init_net, rtrs_clt_rdma_cm_handler, con,
+ clt_path->s.dst_addr.ss_family == AF_IB ?
+ RDMA_PS_IB : RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(cm_id)) {
+ err = PTR_ERR(cm_id);
+ rtrs_err(s, "Failed to create CM ID, err: %d\n", err);
+
+ return err;
+ }
+ con->c.cm_id = cm_id;
+ con->cm_err = 0;
+ /* allow the port to be reused */
+ err = rdma_set_reuseaddr(cm_id, 1);
+ if (err != 0) {
+ rtrs_err(s, "Set address reuse failed, err: %d\n", err);
+ goto destroy_cm;
+ }
+ err = rdma_resolve_addr(cm_id, (struct sockaddr *)&clt_path->s.src_addr,
+ (struct sockaddr *)&clt_path->s.dst_addr,
+ RTRS_CONNECT_TIMEOUT_MS);
+ if (err) {
+ rtrs_err(s, "Failed to resolve address, err: %d\n", err);
+ goto destroy_cm;
+ }
+ /*
+ * Combine connection status and session events. This is needed
+	 * for waiting on two possible cases: cm_err has something meaningful
+	 * or the session state was really changed to error by device removal.
+ */
+ err = wait_event_interruptible_timeout(
+ clt_path->state_wq,
+ con->cm_err || clt_path->state != RTRS_CLT_CONNECTING,
+ msecs_to_jiffies(RTRS_CONNECT_TIMEOUT_MS));
+ if (err == 0 || err == -ERESTARTSYS) {
+ if (err == 0)
+ err = -ETIMEDOUT;
+ /* Timedout or interrupted */
+ goto errr;
+ }
+ if (con->cm_err < 0) {
+ err = con->cm_err;
+ goto errr;
+ }
+ if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTING) {
+ /* Device removal */
+ err = -ECONNABORTED;
+ goto errr;
+ }
+
+ return 0;
+
+errr:
+ stop_cm(con);
+ mutex_lock(&con->con_mutex);
+ destroy_con_cq_qp(con);
+ mutex_unlock(&con->con_mutex);
+destroy_cm:
+ destroy_cm(con);
+
+ return err;
+}
+
+static void rtrs_clt_path_up(struct rtrs_clt_path *clt_path)
+{
+ struct rtrs_clt_sess *clt = clt_path->clt;
+ int up;
+
+ /*
+ * We can fire RECONNECTED event only when all paths were
+ * connected on rtrs_clt_open(), then each was disconnected
+ * and the first one connected again. That's why this nasty
+ * game with counter value.
+ */
+
+ mutex_lock(&clt->paths_ev_mutex);
+ up = ++clt->paths_up;
+ /*
+	 * Here it is safe to access paths_num directly since the up counter
+	 * is greater than MAX_PATHS_NUM only while rtrs_clt_open() is
+	 * in progress, thus path removals are impossible.
+ */
+ if (up > MAX_PATHS_NUM && up == MAX_PATHS_NUM + clt->paths_num)
+ clt->paths_up = clt->paths_num;
+ else if (up == 1)
+ clt->link_ev(clt->priv, RTRS_CLT_LINK_EV_RECONNECTED);
+ mutex_unlock(&clt->paths_ev_mutex);
+
+ /* Mark session as established */
+ clt_path->established = true;
+ clt_path->reconnect_attempts = 0;
+ clt_path->stats->reconnects.successful_cnt++;
+}
+
+static void rtrs_clt_path_down(struct rtrs_clt_path *clt_path)
+{
+ struct rtrs_clt_sess *clt = clt_path->clt;
+
+ if (!clt_path->established)
+ return;
+
+ clt_path->established = false;
+ mutex_lock(&clt->paths_ev_mutex);
+ WARN_ON(!clt->paths_up);
+ if (--clt->paths_up == 0)
+ clt->link_ev(clt->priv, RTRS_CLT_LINK_EV_DISCONNECTED);
+ mutex_unlock(&clt->paths_ev_mutex);
+}
+
+static void rtrs_clt_stop_and_destroy_conns(struct rtrs_clt_path *clt_path)
+{
+ struct rtrs_clt_con *con;
+ unsigned int cid;
+
+ WARN_ON(READ_ONCE(clt_path->state) == RTRS_CLT_CONNECTED);
+
+ /*
+ * Possible race with rtrs_clt_open(), when DEVICE_REMOVAL comes
+ * exactly in between. Start destroying after it finishes.
+ */
+ mutex_lock(&clt_path->init_mutex);
+ mutex_unlock(&clt_path->init_mutex);
+
+ /*
+ * All IO paths must observe !CONNECTED state before we
+ * free everything.
+ */
+ synchronize_rcu();
+
+ rtrs_stop_hb(&clt_path->s);
+
+ /*
+	 * The order is utterly crucial: first disconnect and complete all
+	 * rdma requests with an error (thus setting in_use=false for requests),
+	 * then fail outstanding requests checking in_use for each, and
+	 * eventually notify the upper layer about the session disconnection.
+ */
+
+ for (cid = 0; cid < clt_path->s.con_num; cid++) {
+ if (!clt_path->s.con[cid])
+ break;
+ con = to_clt_con(clt_path->s.con[cid]);
+ stop_cm(con);
+ }
+ fail_all_outstanding_reqs(clt_path);
+ free_path_reqs(clt_path);
+ rtrs_clt_path_down(clt_path);
+
+ /*
+ * Wait for graceful shutdown, namely when peer side invokes
+ * rdma_disconnect(). 'connected_cnt' is decremented only on
+	 * CM events, thus if the other side has crashed and hb has detected
+	 * that something is wrong, we will be stuck here for exactly the
+	 * timeout ms, since CM does not fire anything. That is fine, we are
+	 * not in a hurry.
+ */
+ wait_event_timeout(clt_path->state_wq,
+ !atomic_read(&clt_path->connected_cnt),
+ msecs_to_jiffies(RTRS_CONNECT_TIMEOUT_MS));
+
+ for (cid = 0; cid < clt_path->s.con_num; cid++) {
+ if (!clt_path->s.con[cid])
+ break;
+ con = to_clt_con(clt_path->s.con[cid]);
+ mutex_lock(&con->con_mutex);
+ destroy_con_cq_qp(con);
+ mutex_unlock(&con->con_mutex);
+ destroy_cm(con);
+ destroy_con(con);
+ }
+}
+
+static void rtrs_clt_remove_path_from_arr(struct rtrs_clt_path *clt_path)
+{
+ struct rtrs_clt_sess *clt = clt_path->clt;
+ struct rtrs_clt_path *next;
+ bool wait_for_grace = false;
+ int cpu;
+
+ mutex_lock(&clt->paths_mutex);
+ list_del_rcu(&clt_path->s.entry);
+
+ /* Make sure everybody observes path removal. */
+ synchronize_rcu();
+
+ /*
+ * At this point nobody sees @sess in the list, but still we have
+ * dangling pointer @pcpu_path which _can_ point to @sess. Since
+ * nobody can observe @sess in the list, we guarantee that IO path
+ * will not assign @sess to @pcpu_path, i.e. @pcpu_path can be equal
+ * to @sess, but can never again become @sess.
+ */
+
+ /*
+	 * Decrement the paths number only after the grace period, because
+	 * the caller of do_each_path() must first observe the list without
+	 * the path and only then the decremented paths number.
+ *
+ * Otherwise there can be the following situation:
+ * o Two paths exist and IO is coming.
+ * o One path is removed:
+ * CPU#0 CPU#1
+ * do_each_path(): rtrs_clt_remove_path_from_arr():
+ * path = get_next_path()
+ * ^^^ list_del_rcu(path)
+ * [!CONNECTED path] clt->paths_num--
+ * ^^^^^^^^^
+ * load clt->paths_num from 2 to 1
+ * ^^^^^^^^^
+ * sees 1
+ *
+ * path is observed as !CONNECTED, but do_each_path() loop
+ * ends, because expression i < clt->paths_num is false.
+ */
+ clt->paths_num--;
+
+ /*
+ * Get @next connection from current @sess which is going to be
+ * removed. If @sess is the last element, then @next is NULL.
+ */
+ rcu_read_lock();
+ next = rtrs_clt_get_next_path_or_null(&clt->paths_list, clt_path);
+ rcu_read_unlock();
+
+ /*
+ * @pcpu paths can still point to the path which is going to be
+ * removed, so change the pointer manually.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rtrs_clt_path __rcu **ppcpu_path;
+
+ ppcpu_path = per_cpu_ptr(clt->pcpu_path, cpu);
+ if (rcu_dereference_protected(*ppcpu_path,
+ lockdep_is_held(&clt->paths_mutex)) != clt_path)
+ /*
+ * synchronize_rcu() was called just after deleting
+ * entry from the list, thus IO code path cannot
+ * change pointer back to the pointer which is going
+ * to be removed, we are safe here.
+ */
+ continue;
+
+ /*
+ * We race with IO code path, which also changes pointer,
+ * thus we have to be careful not to overwrite it.
+ */
+ if (try_cmpxchg((struct rtrs_clt_path **)ppcpu_path, &clt_path,
+ next))
+ /*
+ * @ppcpu_path was successfully replaced with @next,
+			 * that means that someone could also have picked up
+			 * @sess and be dereferencing it right now, so waiting
+			 * for a grace period is required.
+ */
+ wait_for_grace = true;
+ }
+ if (wait_for_grace)
+ synchronize_rcu();
+
+ mutex_unlock(&clt->paths_mutex);
+}
+
+static void rtrs_clt_add_path_to_arr(struct rtrs_clt_path *clt_path)
+{
+ struct rtrs_clt_sess *clt = clt_path->clt;
+
+ mutex_lock(&clt->paths_mutex);
+ clt->paths_num++;
+
+ list_add_tail_rcu(&clt_path->s.entry, &clt->paths_list);
+ mutex_unlock(&clt->paths_mutex);
+}
+
+static void rtrs_clt_close_work(struct work_struct *work)
+{
+ struct rtrs_clt_path *clt_path;
+
+ clt_path = container_of(work, struct rtrs_clt_path, close_work);
+
+ cancel_work_sync(&clt_path->err_recovery_work);
+ cancel_delayed_work_sync(&clt_path->reconnect_dwork);
+ rtrs_clt_stop_and_destroy_conns(clt_path);
+ rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CLOSED, NULL);
+}
+
+static int init_conns(struct rtrs_clt_path *clt_path)
+{
+ unsigned int cid;
+ int err;
+
+ /*
+	 * On every new set of session connections increase the reconnect
+	 * counter to avoid clashes with previous sessions that are not yet
+	 * closed on the server side.
+ */
+ clt_path->s.recon_cnt++;
+
+ /* Establish all RDMA connections */
+ for (cid = 0; cid < clt_path->s.con_num; cid++) {
+ err = create_con(clt_path, cid);
+ if (err)
+ goto destroy;
+
+ err = create_cm(to_clt_con(clt_path->s.con[cid]));
+ if (err) {
+ destroy_con(to_clt_con(clt_path->s.con[cid]));
+ goto destroy;
+ }
+ }
+ err = alloc_path_reqs(clt_path);
+ if (err)
+ goto destroy;
+
+ rtrs_start_hb(&clt_path->s);
+
+ return 0;
+
+destroy:
+ while (cid--) {
+ struct rtrs_clt_con *con = to_clt_con(clt_path->s.con[cid]);
+
+ stop_cm(con);
+
+ mutex_lock(&con->con_mutex);
+ destroy_con_cq_qp(con);
+ mutex_unlock(&con->con_mutex);
+ destroy_cm(con);
+ destroy_con(con);
+ }
+ /*
+	 * If we've never taken the async path and got an error, say, doing
+	 * rdma_resolve_addr(), switch to the CONNECTING_ERR state manually
+	 * to keep reconnecting.
+ */
+ rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING_ERR, NULL);
+
+ return err;
+}
+
+static void rtrs_clt_info_req_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+ struct rtrs_iu *iu;
+
+ iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe);
+ rtrs_iu_free(iu, clt_path->s.dev->ib_dev, 1);
+
+ if (wc->status != IB_WC_SUCCESS) {
+ rtrs_err(clt_path->clt, "Path info request send failed: %s\n",
+ ib_wc_status_msg(wc->status));
+ rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING_ERR, NULL);
+ return;
+ }
+
+ rtrs_clt_update_wc_stats(con);
+}
+
+static int process_info_rsp(struct rtrs_clt_path *clt_path,
+ const struct rtrs_msg_info_rsp *msg)
+{
+ unsigned int sg_cnt, total_len;
+ int i, sgi;
+
+ sg_cnt = le16_to_cpu(msg->sg_cnt);
+ if (!sg_cnt || (clt_path->queue_depth % sg_cnt)) {
+ rtrs_err(clt_path->clt,
+ "Incorrect sg_cnt %d, is not multiple\n",
+ sg_cnt);
+ return -EINVAL;
+ }
+
+ /*
+ * Check if IB immediate data size is enough to hold the mem_id and
+ * the offset inside the memory chunk.
+ */
+ if ((ilog2(sg_cnt - 1) + 1) + (ilog2(clt_path->chunk_size - 1) + 1) >
+ MAX_IMM_PAYL_BITS) {
+ rtrs_err(clt_path->clt,
+ "RDMA immediate size (%db) not enough to encode %d buffers of size %dB\n",
+ MAX_IMM_PAYL_BITS, sg_cnt, clt_path->chunk_size);
+ return -EINVAL;
+ }
+ total_len = 0;
+ for (sgi = 0, i = 0; sgi < sg_cnt && i < clt_path->queue_depth; sgi++) {
+ const struct rtrs_sg_desc *desc = &msg->desc[sgi];
+ u32 len, rkey;
+ u64 addr;
+
+ addr = le64_to_cpu(desc->addr);
+ rkey = le32_to_cpu(desc->key);
+ len = le32_to_cpu(desc->len);
+
+ total_len += len;
+
+ if (!len || (len % clt_path->chunk_size)) {
+ rtrs_err(clt_path->clt, "Incorrect [%d].len %d\n",
+ sgi,
+ len);
+ return -EINVAL;
+ }
+ for ( ; len && i < clt_path->queue_depth; i++) {
+ clt_path->rbufs[i].addr = addr;
+ clt_path->rbufs[i].rkey = rkey;
+
+ len -= clt_path->chunk_size;
+ addr += clt_path->chunk_size;
+ }
+ }
+ /* Sanity check */
+ if (sgi != sg_cnt || i != clt_path->queue_depth) {
+ rtrs_err(clt_path->clt,
+ "Incorrect sg vector, not fully mapped\n");
+ return -EINVAL;
+ }
+ if (total_len != clt_path->chunk_size * clt_path->queue_depth) {
+ rtrs_err(clt_path->clt, "Incorrect total_len %d\n", total_len);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void rtrs_clt_info_rsp_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
+ struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
+ struct rtrs_msg_info_rsp *msg;
+ enum rtrs_clt_state state;
+ struct rtrs_iu *iu;
+ size_t rx_sz;
+ int err;
+
+ state = RTRS_CLT_CONNECTING_ERR;
+
+ WARN_ON(con->c.cid);
+ iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe);
+ if (wc->status != IB_WC_SUCCESS) {
+ rtrs_err(clt_path->clt, "Path info response recv failed: %s\n",
+ ib_wc_status_msg(wc->status));
+ goto out;
+ }
+ WARN_ON(wc->opcode != IB_WC_RECV);
+
+ if (wc->byte_len < sizeof(*msg)) {
+ rtrs_err(clt_path->clt, "Path info response is malformed: size %d\n",
+ wc->byte_len);
+ goto out;
+ }
+ ib_dma_sync_single_for_cpu(clt_path->s.dev->ib_dev, iu->dma_addr,
+ iu->size, DMA_FROM_DEVICE);
+ msg = iu->buf;
+ if (le16_to_cpu(msg->type) != RTRS_MSG_INFO_RSP) {
+ rtrs_err(clt_path->clt, "Path info response is malformed: type %d\n",
+ le16_to_cpu(msg->type));
+ goto out;
+ }
+ rx_sz = sizeof(*msg);
+ rx_sz += sizeof(msg->desc[0]) * le16_to_cpu(msg->sg_cnt);
+ if (wc->byte_len < rx_sz) {
+ rtrs_err(clt_path->clt, "Path info response is malformed: size %d\n",
+ wc->byte_len);
+ goto out;
+ }
+ err = process_info_rsp(clt_path, msg);
+ if (err)
+ goto out;
+
+ err = post_recv_path(clt_path);
+ if (err)
+ goto out;
+
+ state = RTRS_CLT_CONNECTED;
+
+out:
+ rtrs_clt_update_wc_stats(con);
+ rtrs_iu_free(iu, clt_path->s.dev->ib_dev, 1);
+ rtrs_clt_change_state_get_old(clt_path, state, NULL);
+}
+
+static int rtrs_send_path_info(struct rtrs_clt_path *clt_path)
+{
+ struct rtrs_clt_con *usr_con = to_clt_con(clt_path->s.con[0]);
+ struct rtrs_msg_info_req *msg;
+ struct rtrs_iu *tx_iu, *rx_iu;
+ size_t rx_sz;
+ int err;
+
+ rx_sz = sizeof(struct rtrs_msg_info_rsp);
+ rx_sz += sizeof(struct rtrs_sg_desc) * clt_path->queue_depth;
+
+ tx_iu = rtrs_iu_alloc(1, sizeof(struct rtrs_msg_info_req), GFP_KERNEL,
+ clt_path->s.dev->ib_dev, DMA_TO_DEVICE,
+ rtrs_clt_info_req_done);
+ rx_iu = rtrs_iu_alloc(1, rx_sz, GFP_KERNEL, clt_path->s.dev->ib_dev,
+ DMA_FROM_DEVICE, rtrs_clt_info_rsp_done);
+ if (!tx_iu || !rx_iu) {
+ err = -ENOMEM;
+ goto out;
+ }
+ /* Prepare for getting info response */
+ err = rtrs_iu_post_recv(&usr_con->c, rx_iu);
+ if (err) {
+ rtrs_err(clt_path->clt, "rtrs_iu_post_recv(), err: %d\n", err);
+ goto out;
+ }
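+	/* rx_iu is now owned by the posted recv; do not free it on the error path */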
+ rx_iu = NULL;
+
+ msg = tx_iu->buf;
+ msg->type = cpu_to_le16(RTRS_MSG_INFO_REQ);
+ memcpy(msg->pathname, clt_path->s.sessname, sizeof(msg->pathname));
+
+ ib_dma_sync_single_for_device(clt_path->s.dev->ib_dev,
+ tx_iu->dma_addr,
+ tx_iu->size, DMA_TO_DEVICE);
+
+ /* Send info request */
+ err = rtrs_iu_post_send(&usr_con->c, tx_iu, sizeof(*msg), NULL);
+ if (err) {
+ rtrs_err(clt_path->clt, "rtrs_iu_post_send(), err: %d\n", err);
+ goto out;
+ }
+ tx_iu = NULL;
+
+ /* Wait for state change */
+ wait_event_interruptible_timeout(clt_path->state_wq,
+ clt_path->state != RTRS_CLT_CONNECTING,
+ msecs_to_jiffies(
+ RTRS_CONNECT_TIMEOUT_MS));
+ if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED) {
+ if (READ_ONCE(clt_path->state) == RTRS_CLT_CONNECTING_ERR)
+ err = -ECONNRESET;
+ else
+ err = -ETIMEDOUT;
+ }
+
+out:
+ if (tx_iu)
+ rtrs_iu_free(tx_iu, clt_path->s.dev->ib_dev, 1);
+ if (rx_iu)
+ rtrs_iu_free(rx_iu, clt_path->s.dev->ib_dev, 1);
+ if (err)
+ /* If we've never taken async path because of malloc problems */
+ rtrs_clt_change_state_get_old(clt_path,
+ RTRS_CLT_CONNECTING_ERR, NULL);
+
+ return err;
+}
+
+/**
+ * init_path() - establishes all path connections and does handshake
+ * @clt_path: client path.
+ * In case of error a full close or reconnect procedure should be taken,
+ * because reconnect or close async work may have been started.
+ */
+static int init_path(struct rtrs_clt_path *clt_path)
+{
+ int err;
+ char str[NAME_MAX];
+ struct rtrs_addr path = {
+ .src = &clt_path->s.src_addr,
+ .dst = &clt_path->s.dst_addr,
+ };
+
+ rtrs_addr_to_str(&path, str, sizeof(str));
+
+ mutex_lock(&clt_path->init_mutex);
+ err = init_conns(clt_path);
+ if (err) {
+ rtrs_err(clt_path->clt,
+ "init_conns() failed: err=%d path=%s [%s:%u]\n", err,
+ str, clt_path->hca_name, clt_path->hca_port);
+ goto out;
+ }
+ err = rtrs_send_path_info(clt_path);
+ if (err) {
+ rtrs_err(clt_path->clt,
+ "rtrs_send_path_info() failed: err=%d path=%s [%s:%u]\n",
+ err, str, clt_path->hca_name, clt_path->hca_port);
+ goto out;
+ }
+ rtrs_clt_path_up(clt_path);
+out:
+ mutex_unlock(&clt_path->init_mutex);
+
+ return err;
+}
+
+static void rtrs_clt_reconnect_work(struct work_struct *work)
+{
+ struct rtrs_clt_path *clt_path;
+ struct rtrs_clt_sess *clt;
+ int err;
+
+ clt_path = container_of(to_delayed_work(work), struct rtrs_clt_path,
+ reconnect_dwork);
+ clt = clt_path->clt;
+
+ trace_rtrs_clt_reconnect_work(clt_path);
+
+ if (READ_ONCE(clt_path->state) != RTRS_CLT_RECONNECTING)
+ return;
+
+ if (clt_path->reconnect_attempts >= clt->max_reconnect_attempts) {
+ /* Close a path completely if max attempts is reached */
+ rtrs_clt_close_conns(clt_path, false);
+ return;
+ }
+ clt_path->reconnect_attempts++;
+
+ msleep(RTRS_RECONNECT_BACKOFF);
+ if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING, NULL)) {
+ err = init_path(clt_path);
+ if (err)
+ goto reconnect_again;
+ }
+
+ return;
+
+reconnect_again:
+ if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_RECONNECTING, NULL)) {
+ clt_path->stats->reconnects.fail_cnt++;
+ queue_work(rtrs_wq, &clt_path->err_recovery_work);
+ }
+}
+
+static void rtrs_clt_dev_release(struct device *dev)
+{
+ struct rtrs_clt_sess *clt = container_of(dev, struct rtrs_clt_sess,
+ dev);
+
+ mutex_destroy(&clt->paths_ev_mutex);
+ mutex_destroy(&clt->paths_mutex);
+ kfree(clt);
+}
+
+static struct rtrs_clt_sess *alloc_clt(const char *sessname, size_t paths_num,
+ u16 port, size_t pdu_sz, void *priv,
+ void (*link_ev)(void *priv,
+ enum rtrs_clt_link_ev ev),
+ unsigned int reconnect_delay_sec,
+ unsigned int max_reconnect_attempts)
+{
+ struct rtrs_clt_sess *clt;
+ int err;
+
+ if (!paths_num || paths_num > MAX_PATHS_NUM)
+ return ERR_PTR(-EINVAL);
+
+ if (strlen(sessname) >= sizeof(clt->sessname))
+ return ERR_PTR(-EINVAL);
+
+ clt = kzalloc(sizeof(*clt), GFP_KERNEL);
+ if (!clt)
+ return ERR_PTR(-ENOMEM);
+
+ clt->pcpu_path = alloc_percpu(typeof(*clt->pcpu_path));
+ if (!clt->pcpu_path) {
+ kfree(clt);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ clt->dev.class = rtrs_clt_dev_class;
+ clt->dev.release = rtrs_clt_dev_release;
+ uuid_gen(&clt->paths_uuid);
+ INIT_LIST_HEAD_RCU(&clt->paths_list);
+ clt->paths_num = paths_num;
+ clt->paths_up = MAX_PATHS_NUM;
+ clt->port = port;
+ clt->pdu_sz = pdu_sz;
+ clt->max_segments = RTRS_MAX_SEGMENTS;
+ clt->reconnect_delay_sec = reconnect_delay_sec;
+ clt->max_reconnect_attempts = max_reconnect_attempts;
+ clt->priv = priv;
+ clt->link_ev = link_ev;
+ clt->mp_policy = MP_POLICY_MIN_INFLIGHT;
+ strscpy(clt->sessname, sessname, sizeof(clt->sessname));
+ init_waitqueue_head(&clt->permits_wait);
+ mutex_init(&clt->paths_ev_mutex);
+ mutex_init(&clt->paths_mutex);
+ device_initialize(&clt->dev);
+
+ err = dev_set_name(&clt->dev, "%s", sessname);
+ if (err)
+ goto err_put;
+
+ /*
+ * Suppress user space notification until
+ * sysfs files are created
+ */
+ dev_set_uevent_suppress(&clt->dev, true);
+ err = device_add(&clt->dev);
+ if (err)
+ goto err_put;
+
+ clt->kobj_paths = kobject_create_and_add("paths", &clt->dev.kobj);
+ if (!clt->kobj_paths) {
+ err = -ENOMEM;
+ goto err_del;
+ }
+ err = rtrs_clt_create_sysfs_root_files(clt);
+ if (err) {
+ kobject_del(clt->kobj_paths);
+ kobject_put(clt->kobj_paths);
+ goto err_del;
+ }
+ dev_set_uevent_suppress(&clt->dev, false);
+ kobject_uevent(&clt->dev.kobj, KOBJ_ADD);
+
+ return clt;
+err_del:
+ device_del(&clt->dev);
+err_put:
+ free_percpu(clt->pcpu_path);
+ put_device(&clt->dev);
+ return ERR_PTR(err);
+}
+
+static void free_clt(struct rtrs_clt_sess *clt)
+{
+ free_percpu(clt->pcpu_path);
+
+ /*
+ * release callback will free clt and destroy mutexes in last put
+ */
+ device_unregister(&clt->dev);
+}
+
+/**
+ * rtrs_clt_open() - Open a path to an RTRS server
+ * @ops: holds the link event callback and the private pointer.
+ * @pathname: name of the path to an RTRS server
+ * @paths: Paths to be established defined by their src and dst addresses
+ * @paths_num: Number of elements in the @paths array
+ * @port: port to be used by the RTRS session
+ * @pdu_sz: Size of extra payload which can be accessed after permit allocation.
+ * @reconnect_delay_sec: time between reconnect tries
+ * @max_reconnect_attempts: Number of times to reconnect on error before giving
+ * up, 0 for disabled, -1 for forever
+ * @nr_poll_queues: number of polling mode connections using the IB_POLL_DIRECT flag
+ *
+ * Starts session establishment with the rtrs_server. The function can block
+ * up to ~2000ms before it returns.
+ *
+ * Return a valid pointer on success otherwise PTR_ERR.
+ */
+struct rtrs_clt_sess *rtrs_clt_open(struct rtrs_clt_ops *ops,
+ const char *pathname,
+ const struct rtrs_addr *paths,
+ size_t paths_num, u16 port,
+ size_t pdu_sz, u8 reconnect_delay_sec,
+ s16 max_reconnect_attempts, u32 nr_poll_queues)
+{
+ struct rtrs_clt_path *clt_path, *tmp;
+ struct rtrs_clt_sess *clt;
+ int err, i;
+
+ if (strchr(pathname, '/') || strchr(pathname, '.')) {
+ pr_err("pathname cannot contain / and .\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ clt = alloc_clt(pathname, paths_num, port, pdu_sz, ops->priv,
+ ops->link_ev,
+ reconnect_delay_sec,
+ max_reconnect_attempts);
+ if (IS_ERR(clt)) {
+ err = PTR_ERR(clt);
+ goto out;
+ }
+ for (i = 0; i < paths_num; i++) {
+ struct rtrs_clt_path *clt_path;
+
+ clt_path = alloc_path(clt, &paths[i], nr_cpu_ids,
+ nr_poll_queues);
+ if (IS_ERR(clt_path)) {
+ err = PTR_ERR(clt_path);
+ goto close_all_path;
+ }
+ if (!i)
+ clt_path->for_new_clt = 1;
+ list_add_tail_rcu(&clt_path->s.entry, &clt->paths_list);
+
+ err = init_path(clt_path);
+ if (err) {
+ list_del_rcu(&clt_path->s.entry);
+ rtrs_clt_close_conns(clt_path, true);
+ free_percpu(clt_path->stats->pcpu_stats);
+ kfree(clt_path->stats);
+ free_path(clt_path);
+ goto close_all_path;
+ }
+
+ err = rtrs_clt_create_path_files(clt_path);
+ if (err) {
+ list_del_rcu(&clt_path->s.entry);
+ rtrs_clt_close_conns(clt_path, true);
+ free_percpu(clt_path->stats->pcpu_stats);
+ kfree(clt_path->stats);
+ free_path(clt_path);
+ goto close_all_path;
+ }
+ }
+ err = alloc_permits(clt);
+ if (err)
+ goto close_all_path;
+
+ return clt;
+
+close_all_path:
+ list_for_each_entry_safe(clt_path, tmp, &clt->paths_list, s.entry) {
+ rtrs_clt_destroy_path_files(clt_path, NULL);
+ rtrs_clt_close_conns(clt_path, true);
+ kobject_put(&clt_path->kobj);
+ }
+ rtrs_clt_destroy_sysfs_root(clt);
+ free_clt(clt);
+
+out:
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(rtrs_clt_open);
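+
+/*
+ * Minimal usage sketch for rtrs_clt_open() (illustrative only: "my_priv",
+ * "my_link_ev", "struct my_pdu" and the resolved @paths array are all
+ * assumed to be provided by the ULP):
+ *
+ *	struct rtrs_clt_ops ops = {
+ *		.priv    = my_priv,
+ *		.link_ev = my_link_ev,
+ *	};
+ *	struct rtrs_clt_sess *clt;
+ *
+ *	clt = rtrs_clt_open(&ops, "my_session", paths, paths_num, port,
+ *			    sizeof(struct my_pdu), 5, 15, 0);
+ *	if (IS_ERR(clt))
+ *		return PTR_ERR(clt);
+ *
+ * my_link_ev() must match the link_ev prototype,
+ * void (*)(void *priv, enum rtrs_clt_link_ev ev).
+ */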
+
+/**
+ * rtrs_clt_close() - Close a session and all of its paths
+ * @clt: Session handle. Session is freed upon return.
+ */
+void rtrs_clt_close(struct rtrs_clt_sess *clt)
+{
+ struct rtrs_clt_path *clt_path, *tmp;
+
+ /* Firstly forbid sysfs access */
+ rtrs_clt_destroy_sysfs_root(clt);
+
+ /* Now it is safe to iterate over all paths without locks */
+ list_for_each_entry_safe(clt_path, tmp, &clt->paths_list, s.entry) {
+ rtrs_clt_close_conns(clt_path, true);
+ rtrs_clt_destroy_path_files(clt_path, NULL);
+ kobject_put(&clt_path->kobj);
+ }
+ free_permits(clt);
+ free_clt(clt);
+}
+EXPORT_SYMBOL(rtrs_clt_close);
+
+int rtrs_clt_reconnect_from_sysfs(struct rtrs_clt_path *clt_path)
+{
+ enum rtrs_clt_state old_state;
+ int err = -EBUSY;
+ bool changed;
+
+ changed = rtrs_clt_change_state_get_old(clt_path,
+ RTRS_CLT_RECONNECTING,
+ &old_state);
+ if (changed) {
+ clt_path->reconnect_attempts = 0;
+ rtrs_clt_stop_and_destroy_conns(clt_path);
+ queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork, 0);
+ }
+ if (changed || old_state == RTRS_CLT_RECONNECTING) {
+ /*
+ * flush_delayed_work() queues pending work for immediate
+ * execution, so do the flush if we have queued something
+ * right now or work is pending.
+ */
+ flush_delayed_work(&clt_path->reconnect_dwork);
+ err = (READ_ONCE(clt_path->state) ==
+ RTRS_CLT_CONNECTED ? 0 : -ENOTCONN);
+ }
+
+ return err;
+}
+
+int rtrs_clt_remove_path_from_sysfs(struct rtrs_clt_path *clt_path,
+ const struct attribute *sysfs_self)
+{
+ enum rtrs_clt_state old_state;
+ bool changed;
+
+ /*
+ * Continue stopping path till state was changed to DEAD or
+ * state was observed as DEAD:
+ * 1. State was changed to DEAD - we were fast and nobody
+ * invoked rtrs_clt_reconnect(), which can again start
+ * reconnecting.
+ * 2. State was observed as DEAD - we have someone in parallel
+ * removing the path.
+ */
+ do {
+ rtrs_clt_close_conns(clt_path, true);
+ changed = rtrs_clt_change_state_get_old(clt_path,
+ RTRS_CLT_DEAD,
+ &old_state);
+ } while (!changed && old_state != RTRS_CLT_DEAD);
+
+ if (changed) {
+ rtrs_clt_remove_path_from_arr(clt_path);
+ rtrs_clt_destroy_path_files(clt_path, sysfs_self);
+ kobject_put(&clt_path->kobj);
+ }
+
+ return 0;
+}
+
+void rtrs_clt_set_max_reconnect_attempts(struct rtrs_clt_sess *clt, int value)
+{
+ clt->max_reconnect_attempts = (unsigned int)value;
+}
+
+int rtrs_clt_get_max_reconnect_attempts(const struct rtrs_clt_sess *clt)
+{
+ return (int)clt->max_reconnect_attempts;
+}
+
+/**
+ * rtrs_clt_request() - Request data transfer to/from server via RDMA.
+ *
+ * @dir: READ/WRITE
+ * @ops: callback function to be called as confirmation, and the pointer.
+ * @clt: Session
+ * @permit: Preallocated permit
+ * @vec: Message that is sent to server together with the request.
+ * The sum of the lengths of all @vec elements is limited to IO_MSG_SIZE.
+ * Since the msg is copied internally it can be allocated on the stack.
+ * @nr: Number of elements in @vec.
+ * @data_len: length of data sent to/from server
+ * @sg: Pages to be sent/received to/from server.
+ * @sg_cnt: Number of elements in the @sg
+ *
+ * Return:
+ * 0: Success
+ * <0: Error
+ *
+ * On dir=READ rtrs client will request a data transfer from Server to client.
+ * The data that the server will respond with will be stored in @sg when
+ * the user receives an %RTRS_CLT_RDMA_EV_RDMA_REQUEST_WRITE_COMPL event.
+ * On dir=WRITE rtrs client will rdma write data in sg to server side.
+ */
+int rtrs_clt_request(int dir, struct rtrs_clt_req_ops *ops,
+ struct rtrs_clt_sess *clt, struct rtrs_permit *permit,
+ const struct kvec *vec, size_t nr, size_t data_len,
+ struct scatterlist *sg, unsigned int sg_cnt)
+{
+ struct rtrs_clt_io_req *req;
+ struct rtrs_clt_path *clt_path;
+
+ enum dma_data_direction dma_dir;
+ int err = -ECONNABORTED, i;
+ size_t usr_len, hdr_len;
+ struct path_it it;
+
+ /* Get kvec length */
+ for (i = 0, usr_len = 0; i < nr; i++)
+ usr_len += vec[i].iov_len;
+
+ if (dir == READ) {
+ hdr_len = sizeof(struct rtrs_msg_rdma_read) +
+ sg_cnt * sizeof(struct rtrs_sg_desc);
+ dma_dir = DMA_FROM_DEVICE;
+ } else {
+ hdr_len = sizeof(struct rtrs_msg_rdma_write);
+ dma_dir = DMA_TO_DEVICE;
+ }
+
+ rcu_read_lock();
+ for (path_it_init(&it, clt);
+ (clt_path = it.next_path(&it)) && it.i < it.clt->paths_num; it.i++) {
+ if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED)
+ continue;
+
+ if (usr_len + hdr_len > clt_path->max_hdr_size) {
+ rtrs_wrn_rl(clt_path->clt,
+ "%s request failed, user message size is %zu and header length %zu, but max size is %u\n",
+ dir == READ ? "Read" : "Write",
+ usr_len, hdr_len, clt_path->max_hdr_size);
+ err = -EMSGSIZE;
+ break;
+ }
+ req = rtrs_clt_get_req(clt_path, ops->conf_fn, permit, ops->priv,
+ vec, usr_len, sg, sg_cnt, data_len,
+ dma_dir);
+ if (dir == READ)
+ err = rtrs_clt_read_req(req);
+ else
+ err = rtrs_clt_write_req(req);
+ if (err) {
+ req->in_use = false;
+ continue;
+ }
+ /* Success path */
+ break;
+ }
+ path_it_deinit(&it);
+ rcu_read_unlock();
+
+ return err;
+}
+EXPORT_SYMBOL(rtrs_clt_request);
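+
+/*
+ * Minimal usage sketch for a WRITE request (illustrative only: "my_io",
+ * "my_conf", "my_msg", @permit, @sg and the length variables are assumed
+ * to be set up by the ULP, and the permit must have been obtained from
+ * the same session beforehand):
+ *
+ *	struct rtrs_clt_req_ops req_ops = {
+ *		.priv    = my_io,
+ *		.conf_fn = my_conf,
+ *	};
+ *	struct kvec vec = {
+ *		.iov_base = &my_msg,
+ *		.iov_len  = sizeof(my_msg),
+ *	};
+ *
+ *	err = rtrs_clt_request(WRITE, &req_ops, clt, permit, &vec, 1,
+ *			       data_len, sg, sg_cnt);
+ *
+ * my_conf() is called as void my_conf(void *priv, int errno) once the
+ * request completes.
+ */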
+
+int rtrs_clt_rdma_cq_direct(struct rtrs_clt_sess *clt, unsigned int index)
+{
+ /* If no path, return -1 for block layer not to try again */
+ int cnt = -1;
+ struct rtrs_con *con;
+ struct rtrs_clt_path *clt_path;
+ struct path_it it;
+
+ rcu_read_lock();
+ for (path_it_init(&it, clt);
+ (clt_path = it.next_path(&it)) && it.i < it.clt->paths_num; it.i++) {
+ if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTED)
+ continue;
+
+ con = clt_path->s.con[index + 1];
+ cnt = ib_process_cq_direct(con->cq, -1);
+ if (cnt)
+ break;
+ }
+ path_it_deinit(&it);
+ rcu_read_unlock();
+
+ return cnt;
+}
+EXPORT_SYMBOL(rtrs_clt_rdma_cq_direct);
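+
+/*
+ * A block-layer ULP would typically call rtrs_clt_rdma_cq_direct() from
+ * its poll callback with the hardware queue index, e.g. (sketch, names
+ * hypothetical):
+ *
+ *	return rtrs_clt_rdma_cq_direct(my_sess->clt, hctx->queue_num);
+ */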
+
+/**
+ * rtrs_clt_query() - queries RTRS session attributes
+ * @clt: session pointer
+ * @attr: query results for session attributes.
+ * Returns:
+ * 0 on success
+ * -ECOMM no connection to the server
+ */
+int rtrs_clt_query(struct rtrs_clt_sess *clt, struct rtrs_attrs *attr)
+{
+ if (!rtrs_clt_is_connected(clt))
+ return -ECOMM;
+
+ attr->queue_depth = clt->queue_depth;
+ attr->max_segments = clt->max_segments;
+ /* Cap max_io_size to min of remote buffer size and the fr pages */
+ attr->max_io_size = min_t(int, clt->max_io_size,
+ clt->max_segments * SZ_4K);
+
+ return 0;
+}
+EXPORT_SYMBOL(rtrs_clt_query);
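+
+/*
+ * For example (sketch, "my_max_io" hypothetical), a ULP can cap its own
+ * IO size from the negotiated attributes right after a successful
+ * rtrs_clt_open():
+ *
+ *	struct rtrs_attrs attrs;
+ *
+ *	err = rtrs_clt_query(clt, &attrs);
+ *	if (!err)
+ *		max_io = min_t(u32, my_max_io, attrs.max_io_size);
+ */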
+
+int rtrs_clt_create_path_from_sysfs(struct rtrs_clt_sess *clt,
+ struct rtrs_addr *addr)
+{
+ struct rtrs_clt_path *clt_path;
+ int err;
+
+ clt_path = alloc_path(clt, addr, nr_cpu_ids, 0);
+ if (IS_ERR(clt_path))
+ return PTR_ERR(clt_path);
+
+ mutex_lock(&clt->paths_mutex);
+ if (clt->paths_num == 0) {
+ /*
+ * When all the paths are removed for a session,
+ * the addition of the first path is like a new session for
+ * the storage server
+ */
+ clt_path->for_new_clt = 1;
+ }
+
+ mutex_unlock(&clt->paths_mutex);
+
+	/*
+	 * It is totally safe to add a path in CONNECTING state: incoming
+	 * IO will never grab it. Also it is very important to add the
+	 * path before init, since init fires the LINK_CONNECTED event.
+	 */
+ rtrs_clt_add_path_to_arr(clt_path);
+
+ err = init_path(clt_path);
+ if (err)
+ goto close_path;
+
+ err = rtrs_clt_create_path_files(clt_path);
+ if (err)
+ goto close_path;
+
+ return 0;
+
+close_path:
+ rtrs_clt_remove_path_from_arr(clt_path);
+ rtrs_clt_close_conns(clt_path, true);
+ free_percpu(clt_path->stats->pcpu_stats);
+ kfree(clt_path->stats);
+ free_path(clt_path);
+
+ return err;
+}
+
+static int rtrs_clt_ib_dev_init(struct rtrs_ib_dev *dev)
+{
+ if (!(dev->ib_dev->attrs.device_cap_flags &
+ IB_DEVICE_MEM_MGT_EXTENSIONS)) {
+ pr_err("Memory registrations not supported.\n");
+ return -ENOTSUPP;
+ }
+
+ return 0;
+}
+
+static const struct rtrs_rdma_dev_pd_ops dev_pd_ops = {
+ .init = rtrs_clt_ib_dev_init
+};
+
+static int __init rtrs_client_init(void)
+{
+ rtrs_rdma_dev_pd_init(0, &dev_pd);
+
+ rtrs_clt_dev_class = class_create(THIS_MODULE, "rtrs-client");
+ if (IS_ERR(rtrs_clt_dev_class)) {
+ pr_err("Failed to create rtrs-client dev class\n");
+ return PTR_ERR(rtrs_clt_dev_class);
+ }
+ rtrs_wq = alloc_workqueue("rtrs_client_wq", 0, 0);
+ if (!rtrs_wq) {
+ class_destroy(rtrs_clt_dev_class);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __exit rtrs_client_exit(void)
+{
+ destroy_workqueue(rtrs_wq);
+ class_destroy(rtrs_clt_dev_class);
+ rtrs_rdma_dev_pd_deinit(&dev_pd);
+}
+
+module_init(rtrs_client_init);
+module_exit(rtrs_client_exit);
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.h b/drivers/infiniband/ulp/rtrs/rtrs-clt.h
new file mode 100644
index 000000000000..f848c0392d98
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.h
@@ -0,0 +1,251 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * RDMA Transport Layer
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+
+#ifndef RTRS_CLT_H
+#define RTRS_CLT_H
+
+#include <linux/device.h>
+#include "rtrs-pri.h"
+
+/**
+ * enum rtrs_clt_state - Client states.
+ */
+enum rtrs_clt_state {
+ RTRS_CLT_CONNECTING,
+ RTRS_CLT_CONNECTING_ERR,
+ RTRS_CLT_RECONNECTING,
+ RTRS_CLT_CONNECTED,
+ RTRS_CLT_CLOSING,
+ RTRS_CLT_CLOSED,
+ RTRS_CLT_DEAD,
+};
+
+enum rtrs_mp_policy {
+ MP_POLICY_RR,
+ MP_POLICY_MIN_INFLIGHT,
+ MP_POLICY_MIN_LATENCY,
+};
+
+/* see Documentation/ABI/testing/sysfs-class-rtrs-client for details */
+struct rtrs_clt_stats_reconnects {
+ int successful_cnt;
+ int fail_cnt;
+};
+
+/* see Documentation/ABI/testing/sysfs-class-rtrs-client for details */
+struct rtrs_clt_stats_cpu_migr {
+ atomic_t from;
+ int to;
+};
+
+/* stats for read and write operations.
+ * see Documentation/ABI/testing/sysfs-class-rtrs-client for details
+ */
+struct rtrs_clt_stats_rdma {
+ struct {
+ u64 cnt;
+ u64 size_total;
+ } dir[2];
+
+ u64 failover_cnt;
+};
+
+struct rtrs_clt_stats_pcpu {
+ struct rtrs_clt_stats_cpu_migr cpu_migr;
+ struct rtrs_clt_stats_rdma rdma;
+};
+
+struct rtrs_clt_stats {
+ struct kobject kobj_stats;
+ struct rtrs_clt_stats_pcpu __percpu *pcpu_stats;
+ struct rtrs_clt_stats_reconnects reconnects;
+ atomic_t inflight;
+};
+
+struct rtrs_clt_con {
+ struct rtrs_con c;
+ struct rtrs_iu *rsp_ius;
+ u32 queue_num;
+ unsigned int cpu;
+ struct mutex con_mutex;
+ int cm_err;
+};
+
+/**
+ * struct rtrs_permit - permits the memory allocation for a future RDMA
+ * operation. Combine with irq pinning to keep IO on the same CPU.
+ */
+struct rtrs_permit {
+ enum rtrs_clt_con_type con_type;
+ unsigned int cpu_id;
+ unsigned int mem_id;
+ unsigned int mem_off;
+};
+
+/**
+ * struct rtrs_clt_io_req - describes one inflight IO request
+ */
+struct rtrs_clt_io_req {
+ struct list_head list;
+ struct rtrs_iu *iu;
+ struct scatterlist *sglist; /* list holding user data */
+ unsigned int sg_cnt;
+ unsigned int sg_size;
+ unsigned int data_len;
+ unsigned int usr_len;
+ void *priv;
+ bool in_use;
+ enum rtrs_mp_policy mp_policy;
+ struct rtrs_clt_con *con;
+ struct rtrs_sg_desc *desc;
+ struct ib_sge *sge;
+ struct rtrs_permit *permit;
+ enum dma_data_direction dir;
+ void (*conf)(void *priv, int errno);
+ unsigned long start_jiffies;
+
+ struct ib_mr *mr;
+ struct ib_cqe inv_cqe;
+ struct completion inv_comp;
+ int inv_errno;
+ bool need_inv_comp;
+ bool need_inv;
+ refcount_t ref;
+};
+
+struct rtrs_rbuf {
+ u64 addr;
+ u32 rkey;
+};
+
+struct rtrs_clt_path {
+ struct rtrs_path s;
+ struct rtrs_clt_sess *clt;
+ wait_queue_head_t state_wq;
+ enum rtrs_clt_state state;
+ atomic_t connected_cnt;
+ struct mutex init_mutex;
+ struct rtrs_clt_io_req *reqs;
+ struct delayed_work reconnect_dwork;
+ struct work_struct close_work;
+ struct work_struct err_recovery_work;
+ unsigned int reconnect_attempts;
+ bool established;
+ struct rtrs_rbuf *rbufs;
+ size_t max_io_size;
+ u32 max_hdr_size;
+ u32 chunk_size;
+ size_t queue_depth;
+ u32 max_pages_per_mr;
+ u32 flags;
+ struct kobject kobj;
+ u8 for_new_clt;
+ struct rtrs_clt_stats *stats;
+ /* cache hca_port and hca_name to display in sysfs */
+ u8 hca_port;
+ char hca_name[IB_DEVICE_NAME_MAX];
+ struct list_head __percpu
+ *mp_skip_entry;
+};
+
+struct rtrs_clt_sess {
+ struct list_head paths_list; /* rcu protected list */
+ size_t paths_num;
+ struct rtrs_clt_path
+ __rcu * __percpu *pcpu_path;
+ uuid_t paths_uuid;
+ int paths_up;
+ struct mutex paths_mutex;
+ struct mutex paths_ev_mutex;
+ char sessname[NAME_MAX];
+ u16 port;
+ unsigned int max_reconnect_attempts;
+ unsigned int reconnect_delay_sec;
+ unsigned int max_segments;
+ void *permits;
+ unsigned long *permits_map;
+ size_t queue_depth;
+ size_t max_io_size;
+ wait_queue_head_t permits_wait;
+ size_t pdu_sz;
+ void *priv;
+ void (*link_ev)(void *priv,
+ enum rtrs_clt_link_ev ev);
+ struct device dev;
+ struct kobject *kobj_paths;
+ enum rtrs_mp_policy mp_policy;
+};
+
+static inline struct rtrs_clt_con *to_clt_con(struct rtrs_con *c)
+{
+ return container_of(c, struct rtrs_clt_con, c);
+}
+
+static inline struct rtrs_clt_path *to_clt_path(struct rtrs_path *s)
+{
+ return container_of(s, struct rtrs_clt_path, s);
+}
+
+static inline int permit_size(struct rtrs_clt_sess *clt)
+{
+ return sizeof(struct rtrs_permit) + clt->pdu_sz;
+}
+
+static inline struct rtrs_permit *get_permit(struct rtrs_clt_sess *clt,
+ int idx)
+{
+ return (struct rtrs_permit *)(clt->permits + permit_size(clt) * idx);
+}
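+
+/*
+ * Layout note (illustrative): each slot in clt->permits is a struct
+ * rtrs_permit immediately followed by pdu_sz bytes of ULP payload, so
+ * the PDU area of permit @idx starts at
+ *
+ *	(void *)get_permit(clt, idx) + sizeof(struct rtrs_permit);
+ */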
+
+int rtrs_clt_reconnect_from_sysfs(struct rtrs_clt_path *path);
+void rtrs_clt_close_conns(struct rtrs_clt_path *clt_path, bool wait);
+int rtrs_clt_create_path_from_sysfs(struct rtrs_clt_sess *clt,
+ struct rtrs_addr *addr);
+int rtrs_clt_remove_path_from_sysfs(struct rtrs_clt_path *path,
+ const struct attribute *sysfs_self);
+
+void rtrs_clt_set_max_reconnect_attempts(struct rtrs_clt_sess *clt, int value);
+int rtrs_clt_get_max_reconnect_attempts(const struct rtrs_clt_sess *clt);
+void free_path(struct rtrs_clt_path *clt_path);
+
+/* rtrs-clt-stats.c */
+
+int rtrs_clt_init_stats(struct rtrs_clt_stats *stats);
+
+void rtrs_clt_inc_failover_cnt(struct rtrs_clt_stats *s);
+
+void rtrs_clt_update_wc_stats(struct rtrs_clt_con *con);
+void rtrs_clt_update_all_stats(struct rtrs_clt_io_req *req, int dir);
+
+int rtrs_clt_reset_rdma_lat_distr_stats(struct rtrs_clt_stats *stats,
+ bool enable);
+ssize_t rtrs_clt_stats_rdma_lat_distr_to_str(struct rtrs_clt_stats *stats,
+ char *page);
+int rtrs_clt_reset_cpu_migr_stats(struct rtrs_clt_stats *stats, bool enable);
+int rtrs_clt_stats_migration_from_cnt_to_str(struct rtrs_clt_stats *stats, char *buf);
+int rtrs_clt_stats_migration_to_cnt_to_str(struct rtrs_clt_stats *stats, char *buf);
+int rtrs_clt_reset_reconnects_stat(struct rtrs_clt_stats *stats, bool enable);
+int rtrs_clt_stats_reconnects_to_str(struct rtrs_clt_stats *stats, char *buf);
+int rtrs_clt_reset_rdma_stats(struct rtrs_clt_stats *stats, bool enable);
+ssize_t rtrs_clt_stats_rdma_to_str(struct rtrs_clt_stats *stats,
+ char *page);
+int rtrs_clt_reset_all_stats(struct rtrs_clt_stats *stats, bool enable);
+ssize_t rtrs_clt_reset_all_help(struct rtrs_clt_stats *stats,
+ char *page);
+
+/* rtrs-clt-sysfs.c */
+
+int rtrs_clt_create_sysfs_root_files(struct rtrs_clt_sess *clt);
+void rtrs_clt_destroy_sysfs_root(struct rtrs_clt_sess *clt);
+
+int rtrs_clt_create_path_files(struct rtrs_clt_path *clt_path);
+void rtrs_clt_destroy_path_files(struct rtrs_clt_path *clt_path,
+ const struct attribute *sysfs_self);
+
+#endif /* RTRS_CLT_H */
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-log.h b/drivers/infiniband/ulp/rtrs/rtrs-log.h
new file mode 100644
index 000000000000..53c785b992f2
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-log.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * RDMA Transport Layer
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+#ifndef RTRS_LOG_H
+#define RTRS_LOG_H
+
+#define rtrs_log(fn, obj, fmt, ...) \
+ fn("<%s>: " fmt, obj->sessname, ##__VA_ARGS__)
+
+#define rtrs_err(obj, fmt, ...) \
+ rtrs_log(pr_err, obj, fmt, ##__VA_ARGS__)
+#define rtrs_err_rl(obj, fmt, ...) \
+ rtrs_log(pr_err_ratelimited, obj, fmt, ##__VA_ARGS__)
+#define rtrs_wrn(obj, fmt, ...) \
+ rtrs_log(pr_warn, obj, fmt, ##__VA_ARGS__)
+#define rtrs_wrn_rl(obj, fmt, ...) \
+ rtrs_log(pr_warn_ratelimited, obj, fmt, ##__VA_ARGS__)
+#define rtrs_info(obj, fmt, ...) \
+ rtrs_log(pr_info, obj, fmt, ##__VA_ARGS__)
+#define rtrs_info_rl(obj, fmt, ...) \
+ rtrs_log(pr_info_ratelimited, obj, fmt, ##__VA_ARGS__)
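+
+/*
+ * The @obj argument only needs a ->sessname member; for example
+ *
+ *	rtrs_err(clt_path->clt, "rtrs_iu_post_recv(), err: %d\n", err);
+ *
+ * expands to
+ *
+ *	pr_err("<%s>: rtrs_iu_post_recv(), err: %d\n",
+ *	       clt_path->clt->sessname, err);
+ */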
+
+#endif /* RTRS_LOG_H */
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-pri.h b/drivers/infiniband/ulp/rtrs/rtrs-pri.h
new file mode 100644
index 000000000000..a2420eecaf5a
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-pri.h
@@ -0,0 +1,409 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * RDMA Transport Layer
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+
+#ifndef RTRS_PRI_H
+#define RTRS_PRI_H
+
+#include <linux/uuid.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib.h>
+
+#include "rtrs.h"
+
+#define RTRS_PROTO_VER_MAJOR 2
+#define RTRS_PROTO_VER_MINOR 0
+
+#define RTRS_PROTO_VER_STRING __stringify(RTRS_PROTO_VER_MAJOR) "." \
+ __stringify(RTRS_PROTO_VER_MINOR)
+
+/*
+ * Max IB immediate data size is 2^28 (MAX_IMM_PAYL_BITS)
+ * and the minimum chunk size is 4096 (2^12), so the immediate
+ * payload can address at most 2^28 / 2^12 = 2^16 chunks.
+ * The maximum sess_queue_depth is therefore 65535 (2^16 - 1),
+ * since queue_depth in rtrs_msg_conn_rsp is defined as le16.
+ * The practical max value of sess_queue_depth is somewhere
+ * between 1 and 65535 and depends on the system.
+ */
+#define MAX_SESS_QUEUE_DEPTH 65535
+
+enum rtrs_imm_const {
+ MAX_IMM_TYPE_BITS = 4,
+ MAX_IMM_TYPE_MASK = ((1 << MAX_IMM_TYPE_BITS) - 1),
+ MAX_IMM_PAYL_BITS = 28,
+ MAX_IMM_PAYL_MASK = ((1 << MAX_IMM_PAYL_BITS) - 1),
+};
+
+enum rtrs_imm_type {
+ RTRS_IO_REQ_IMM = 0, /* client to server */
+ RTRS_IO_RSP_IMM = 1, /* server to client */
+ RTRS_IO_RSP_W_INV_IMM = 2, /* server to client */
+
+ RTRS_HB_MSG_IMM = 8, /* HB: HeartBeat */
+ RTRS_HB_ACK_IMM = 9,
+
+ RTRS_LAST_IMM,
+};
+
+enum {
+ SERVICE_CON_QUEUE_DEPTH = 512,
+
+ MAX_PATHS_NUM = 128,
+
+ MIN_CHUNK_SIZE = 8192,
+
+ RTRS_HB_INTERVAL_MS = 5000,
+ RTRS_HB_MISSED_MAX = 5,
+
+ RTRS_MAGIC = 0x1BBD,
+ RTRS_PROTO_VER = (RTRS_PROTO_VER_MAJOR << 8) | RTRS_PROTO_VER_MINOR,
+};
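+
+/*
+ * With the current protocol version 2.0 above, RTRS_PROTO_VER packs to
+ * (2 << 8) | 0 = 0x0200 and RTRS_PROTO_VER_STRING is "2.0".
+ */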
+
+struct rtrs_ib_dev;
+
+struct rtrs_rdma_dev_pd_ops {
+ struct rtrs_ib_dev *(*alloc)(void);
+ void (*free)(struct rtrs_ib_dev *dev);
+ int (*init)(struct rtrs_ib_dev *dev);
+ void (*deinit)(struct rtrs_ib_dev *dev);
+};
+
+struct rtrs_rdma_dev_pd {
+ struct mutex mutex;
+ struct list_head list;
+ enum ib_pd_flags pd_flags;
+ const struct rtrs_rdma_dev_pd_ops *ops;
+};
+
+struct rtrs_ib_dev {
+ struct ib_device *ib_dev;
+ struct ib_pd *ib_pd;
+ struct kref ref;
+ struct list_head entry;
+ struct rtrs_rdma_dev_pd *pool;
+};
+
+struct rtrs_con {
+ struct rtrs_path *path;
+ struct ib_qp *qp;
+ struct ib_cq *cq;
+ struct rdma_cm_id *cm_id;
+ unsigned int cid;
+ int nr_cqe;
+ atomic_t wr_cnt;
+ atomic_t sq_wr_avail;
+};
+
+struct rtrs_path {
+ struct list_head entry;
+ struct sockaddr_storage dst_addr;
+ struct sockaddr_storage src_addr;
+ char sessname[NAME_MAX];
+ uuid_t uuid;
+ struct rtrs_con **con;
+ unsigned int con_num;
+ unsigned int irq_con_num;
+ unsigned int recon_cnt;
+ unsigned int signal_interval;
+ struct rtrs_ib_dev *dev;
+ int dev_ref;
+ struct ib_cqe *hb_cqe;
+ void (*hb_err_handler)(struct rtrs_con *con);
+ struct workqueue_struct *hb_wq;
+ struct delayed_work hb_dwork;
+ unsigned int hb_interval_ms;
+ unsigned int hb_missed_cnt;
+ unsigned int hb_missed_max;
+ ktime_t hb_last_sent;
+ ktime_t hb_cur_latency;
+};
+
+/* rtrs information unit */
+struct rtrs_iu {
+ struct ib_cqe cqe;
+ dma_addr_t dma_addr;
+ void *buf;
+ size_t size;
+ enum dma_data_direction direction;
+};
+
+/**
+ * enum rtrs_msg_types - RTRS message types, see also rtrs/README
+ * @RTRS_MSG_INFO_REQ: Client additional info request to the server
+ * @RTRS_MSG_INFO_RSP: Server additional info response to the client
+ * @RTRS_MSG_WRITE: Client writes data via RDMA to the server
+ * @RTRS_MSG_READ: Client requests data transfer from server
+ * @RTRS_MSG_RKEY_RSP: Server refreshed rkey for rbuf
+ */
+enum rtrs_msg_types {
+ RTRS_MSG_INFO_REQ,
+ RTRS_MSG_INFO_RSP,
+ RTRS_MSG_WRITE,
+ RTRS_MSG_READ,
+ RTRS_MSG_RKEY_RSP,
+};
+
+/**
+ * enum rtrs_msg_flags - RTRS message flags.
+ * @RTRS_MSG_NEED_INVAL_F: Send invalidation in response.
+ * @RTRS_MSG_NEW_RKEY_F: Send refreshed rkey in response.
+ */
+enum rtrs_msg_flags {
+ RTRS_MSG_NEED_INVAL_F = 1 << 0,
+ RTRS_MSG_NEW_RKEY_F = 1 << 1,
+};
+
+/**
+ * struct rtrs_sg_desc - RDMA-Buffer entry description
+ * @addr: Address of RDMA destination buffer
+ * @key: Authorization rkey to write to the buffer
+ * @len: Size of the buffer
+ */
+struct rtrs_sg_desc {
+ __le64 addr;
+ __le32 key;
+ __le32 len;
+};
+
+/**
+ * struct rtrs_msg_conn_req - Client connection request to the server
+ * @magic: RTRS magic
+ * @version: RTRS protocol version
+ * @cid: Current connection id
+ * @cid_num: Number of connections per session
+ * @recon_cnt: Reconnections counter
+ * @sess_uuid: UUID of a session (path)
+ * @paths_uuid: UUID of a group of sessions (paths)
+ *
+ * NOTE: max size 56 bytes, see man rdma_connect().
+ */
+struct rtrs_msg_conn_req {
+ /* Is set to 0 by cma.c in case of AF_IB, do not touch that.
+ * see https://www.spinics.net/lists/linux-rdma/msg22397.html
+ */
+ u8 __cma_version;
+	/* On the sender side this should be set to 0, otherwise
+	 * cma_save_ip_info() extracts garbage and will fail.
+	 */
+ u8 __ip_version;
+ __le16 magic;
+ __le16 version;
+ __le16 cid;
+ __le16 cid_num;
+ __le16 recon_cnt;
+ uuid_t sess_uuid;
+ uuid_t paths_uuid;
+ u8 first_conn : 1;
+ u8 reserved_bits : 7;
+ u8 reserved[11];
+};
+
+/**
+ * struct rtrs_msg_conn_rsp - Server connection response to the client
+ * @magic: RTRS magic
+ * @version: RTRS protocol version
+ * @errno: If rdma_accept() then 0, if rdma_reject() indicates error
+ * @queue_depth: max inflight messages (queue-depth) in this session
+ * @max_io_size: max io size server supports
+ * @max_hdr_size: max msg header size server supports
+ *
+ * NOTE: size is 56 bytes, max possible is 136 bytes, see man rdma_accept().
+ */
+struct rtrs_msg_conn_rsp {
+ __le16 magic;
+ __le16 version;
+ __le16 errno;
+ __le16 queue_depth;
+ __le32 max_io_size;
+ __le32 max_hdr_size;
+ __le32 flags;
+ u8 reserved[36];
+};
+
+/**
+ * struct rtrs_msg_info_req
+ * @type: @RTRS_MSG_INFO_REQ
+ * @pathname: Path name chosen by client
+ */
+struct rtrs_msg_info_req {
+ __le16 type;
+ u8 pathname[NAME_MAX];
+ u8 reserved[15];
+};
+
+/**
+ * struct rtrs_msg_info_rsp
+ * @type: @RTRS_MSG_INFO_RSP
+ * @sg_cnt: Number of @desc entries
+ * @desc: RDMA buffers where the client can write to server
+ */
+struct rtrs_msg_info_rsp {
+ __le16 type;
+ __le16 sg_cnt;
+ u8 reserved[4];
+ struct rtrs_sg_desc desc[];
+};
+
+/**
+ * struct rtrs_msg_rkey_rsp
+ * @type: @RTRS_MSG_RKEY_RSP
+ * @buf_id: RDMA buf_id of the new rkey
+ * @rkey: new remote key for RDMA buffers id from server
+ */
+struct rtrs_msg_rkey_rsp {
+ __le16 type;
+ __le16 buf_id;
+ __le32 rkey;
+};
+
+/**
+ * struct rtrs_msg_rdma_read - RDMA data transfer request from client
+ * @type: always @RTRS_MSG_READ
+ * @usr_len: length of user payload
+ * @sg_cnt: number of @desc entries
+ * @desc: RDMA buffers where the server can write the result to
+ */
+struct rtrs_msg_rdma_read {
+ __le16 type;
+ __le16 usr_len;
+ __le16 flags;
+ __le16 sg_cnt;
+ struct rtrs_sg_desc desc[];
+};
+
+/**
+ * struct rtrs_msg_rdma_write - Message transferred to server with RDMA-Write
+ * @type: always @RTRS_MSG_WRITE
+ * @usr_len: length of user payload
+ */
+struct rtrs_msg_rdma_write {
+ __le16 type;
+ __le16 usr_len;
+};
+
+/**
+ * struct rtrs_msg_rdma_hdr - header for read or write request
+ * @type: @RTRS_MSG_WRITE | @RTRS_MSG_READ
+ */
+struct rtrs_msg_rdma_hdr {
+ __le16 type;
+};
+
+/* rtrs.c */
+
+struct rtrs_iu *rtrs_iu_alloc(u32 queue_num, size_t size, gfp_t t,
+ struct ib_device *dev, enum dma_data_direction,
+ void (*done)(struct ib_cq *cq, struct ib_wc *wc));
+void rtrs_iu_free(struct rtrs_iu *iu, struct ib_device *dev, u32 queue_num);
+int rtrs_iu_post_recv(struct rtrs_con *con, struct rtrs_iu *iu);
+int rtrs_iu_post_send(struct rtrs_con *con, struct rtrs_iu *iu, size_t size,
+ struct ib_send_wr *head);
+int rtrs_iu_post_rdma_write_imm(struct rtrs_con *con, struct rtrs_iu *iu,
+ struct ib_sge *sge, unsigned int num_sge,
+ u32 rkey, u64 rdma_addr, u32 imm_data,
+ enum ib_send_flags flags,
+ struct ib_send_wr *head,
+ struct ib_send_wr *tail);
+
+int rtrs_post_recv_empty(struct rtrs_con *con, struct ib_cqe *cqe);
+
+int rtrs_cq_qp_create(struct rtrs_path *path, struct rtrs_con *con,
+ u32 max_send_sge, int cq_vector, int nr_cqe,
+ u32 max_send_wr, u32 max_recv_wr,
+ enum ib_poll_context poll_ctx);
+void rtrs_cq_qp_destroy(struct rtrs_con *con);
+
+void rtrs_init_hb(struct rtrs_path *path, struct ib_cqe *cqe,
+ unsigned int interval_ms, unsigned int missed_max,
+ void (*err_handler)(struct rtrs_con *con),
+ struct workqueue_struct *wq);
+void rtrs_start_hb(struct rtrs_path *path);
+void rtrs_stop_hb(struct rtrs_path *path);
+void rtrs_send_hb_ack(struct rtrs_path *path);
+
+void rtrs_rdma_dev_pd_init(enum ib_pd_flags pd_flags,
+ struct rtrs_rdma_dev_pd *pool);
+void rtrs_rdma_dev_pd_deinit(struct rtrs_rdma_dev_pd *pool);
+
+struct rtrs_ib_dev *rtrs_ib_dev_find_or_add(struct ib_device *ib_dev,
+ struct rtrs_rdma_dev_pd *pool);
+int rtrs_ib_dev_put(struct rtrs_ib_dev *dev);
+
+static inline u32 rtrs_to_imm(u32 type, u32 payload)
+{
+ BUILD_BUG_ON(MAX_IMM_PAYL_BITS + MAX_IMM_TYPE_BITS != 32);
+ BUILD_BUG_ON(RTRS_LAST_IMM > (1<<MAX_IMM_TYPE_BITS));
+ return ((type & MAX_IMM_TYPE_MASK) << MAX_IMM_PAYL_BITS) |
+ (payload & MAX_IMM_PAYL_MASK);
+}
+
+static inline void rtrs_from_imm(u32 imm, u32 *type, u32 *payload)
+{
+ *payload = imm & MAX_IMM_PAYL_MASK;
+ *type = imm >> MAX_IMM_PAYL_BITS;
+}
+
+static inline u32 rtrs_to_io_req_imm(u32 addr)
+{
+ return rtrs_to_imm(RTRS_IO_REQ_IMM, addr);
+}
+
+static inline u32 rtrs_to_io_rsp_imm(u32 msg_id, int errno, bool w_inval)
+{
+ enum rtrs_imm_type type;
+ u32 payload;
+
+ /* 9 bits for errno, 19 bits for msg_id */
+ payload = (abs(errno) & 0x1ff) << 19 | (msg_id & 0x7ffff);
+ type = w_inval ? RTRS_IO_RSP_W_INV_IMM : RTRS_IO_RSP_IMM;
+
+ return rtrs_to_imm(type, payload);
+}
+
+static inline void rtrs_from_io_rsp_imm(u32 payload, u32 *msg_id, int *errno)
+{
+ /* 9 bits for errno, 19 bits for msg_id */
+ *msg_id = payload & 0x7ffff;
+ *errno = -(int)((payload >> 19) & 0x1ff);
+}
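+
+/*
+ * Worked example: for msg_id 5 and errno -EIO (-5) without invalidation,
+ * rtrs_to_io_rsp_imm() builds the payload (5 << 19) | 5 = 0x280005 and
+ * rtrs_to_imm(RTRS_IO_RSP_IMM, 0x280005) yields 0x10280005; decoding via
+ * rtrs_from_imm() and rtrs_from_io_rsp_imm() recovers type RTRS_IO_RSP_IMM,
+ * msg_id 5 and errno -5 again.
+ */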
+
+#define STAT_STORE_FUNC(type, set_value, reset) \
+static ssize_t set_value##_store(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ const char *buf, size_t count) \
+{ \
+ int ret = -EINVAL; \
+ type *stats = container_of(kobj, type, kobj_stats); \
+ \
+ if (sysfs_streq(buf, "1")) \
+ ret = reset(stats, true); \
+ else if (sysfs_streq(buf, "0")) \
+ ret = reset(stats, false); \
+ if (ret) \
+ return ret; \
+ \
+ return count; \
+}
+
+#define STAT_SHOW_FUNC(type, get_value, print) \
+static ssize_t get_value##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ char *page) \
+{ \
+ type *stats = container_of(kobj, type, kobj_stats); \
+ \
+ return print(stats, page); \
+}
+
+#define STAT_ATTR(type, stat, print, reset) \
+STAT_STORE_FUNC(type, stat, reset) \
+STAT_SHOW_FUNC(type, stat, print) \
+static struct kobj_attribute stat##_attr = __ATTR_RW(stat)
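+
+/*
+ * For example, the rtrs-srv-sysfs.c invocation
+ *
+ *	STAT_ATTR(struct rtrs_srv_stats, rdma,
+ *		  rtrs_srv_stats_rdma_to_str, rtrs_srv_reset_rdma_stats);
+ *
+ * defines rdma_show()/rdma_store() wrappers around the print/reset helpers
+ * and a read-write "rdma" sysfs attribute named rdma_attr.
+ */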
+
+#endif /* RTRS_PRI_H */
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c
new file mode 100644
index 000000000000..2aff1213a19d
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RDMA Transport Layer
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include "rtrs-srv.h"
+
+int rtrs_srv_reset_rdma_stats(struct rtrs_srv_stats *stats, bool enable)
+{
+ if (enable) {
+ int cpu;
+ struct rtrs_srv_stats_rdma_stats *r;
+
+ for_each_possible_cpu(cpu) {
+ r = per_cpu_ptr(stats->rdma_stats, cpu);
+ memset(r, 0, sizeof(*r));
+ }
+
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+ssize_t rtrs_srv_stats_rdma_to_str(struct rtrs_srv_stats *stats, char *page)
+{
+ int cpu;
+ struct rtrs_srv_stats_rdma_stats sum;
+ struct rtrs_srv_stats_rdma_stats *r;
+
+ memset(&sum, 0, sizeof(sum));
+
+ for_each_possible_cpu(cpu) {
+ r = per_cpu_ptr(stats->rdma_stats, cpu);
+
+ sum.dir[READ].cnt += r->dir[READ].cnt;
+ sum.dir[READ].size_total += r->dir[READ].size_total;
+ sum.dir[WRITE].cnt += r->dir[WRITE].cnt;
+ sum.dir[WRITE].size_total += r->dir[WRITE].size_total;
+ }
+
+ return sysfs_emit(page, "%llu %llu %llu %llu\n",
+ sum.dir[READ].cnt, sum.dir[READ].size_total,
+ sum.dir[WRITE].cnt, sum.dir[WRITE].size_total);
+}
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c
new file mode 100644
index 000000000000..2a3c9ac64a42
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RDMA Transport Layer
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include "rtrs-pri.h"
+#include "rtrs-srv.h"
+#include "rtrs-log.h"
+
+static void rtrs_srv_release(struct kobject *kobj)
+{
+ struct rtrs_srv_path *srv_path;
+
+ srv_path = container_of(kobj, struct rtrs_srv_path, kobj);
+ kfree(srv_path);
+}
+
+static struct kobj_type ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = rtrs_srv_release,
+};
+
+static ssize_t rtrs_srv_disconnect_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "Usage: echo 1 > %s\n", attr->attr.name);
+}
+
+static ssize_t rtrs_srv_disconnect_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct rtrs_srv_path *srv_path;
+ struct rtrs_path *s;
+ char str[MAXHOSTNAMELEN];
+
+ srv_path = container_of(kobj, struct rtrs_srv_path, kobj);
+ s = &srv_path->s;
+ if (!sysfs_streq(buf, "1")) {
+ rtrs_err(s, "%s: invalid value: '%s'\n",
+ attr->attr.name, buf);
+ return -EINVAL;
+ }
+
+ sockaddr_to_str((struct sockaddr *)&srv_path->s.dst_addr, str,
+ sizeof(str));
+
+ rtrs_info(s, "disconnect for path %s requested\n", str);
+ /* first remove sysfs itself to avoid deadlock */
+ sysfs_remove_file_self(&srv_path->kobj, &attr->attr);
+ close_path(srv_path);
+
+ return count;
+}
+
+static struct kobj_attribute rtrs_srv_disconnect_attr =
+ __ATTR(disconnect, 0644,
+ rtrs_srv_disconnect_show, rtrs_srv_disconnect_store);
+
+static ssize_t rtrs_srv_hca_port_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *page)
+{
+ struct rtrs_srv_path *srv_path;
+ struct rtrs_con *usr_con;
+
+ srv_path = container_of(kobj, typeof(*srv_path), kobj);
+ usr_con = srv_path->s.con[0];
+
+ return sysfs_emit(page, "%u\n", usr_con->cm_id->port_num);
+}
+
+static struct kobj_attribute rtrs_srv_hca_port_attr =
+ __ATTR(hca_port, 0444, rtrs_srv_hca_port_show, NULL);
+
+static ssize_t rtrs_srv_hca_name_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *page)
+{
+ struct rtrs_srv_path *srv_path;
+
+ srv_path = container_of(kobj, struct rtrs_srv_path, kobj);
+
+ return sysfs_emit(page, "%s\n", srv_path->s.dev->ib_dev->name);
+}
+
+static struct kobj_attribute rtrs_srv_hca_name_attr =
+ __ATTR(hca_name, 0444, rtrs_srv_hca_name_show, NULL);
+
+static ssize_t rtrs_srv_src_addr_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *page)
+{
+ struct rtrs_srv_path *srv_path;
+ int cnt;
+
+ srv_path = container_of(kobj, struct rtrs_srv_path, kobj);
+ cnt = sockaddr_to_str((struct sockaddr *)&srv_path->s.dst_addr,
+ page, PAGE_SIZE);
+ return cnt + sysfs_emit_at(page, cnt, "\n");
+}
+
+static struct kobj_attribute rtrs_srv_src_addr_attr =
+ __ATTR(src_addr, 0444, rtrs_srv_src_addr_show, NULL);
+
+static ssize_t rtrs_srv_dst_addr_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *page)
+{
+ struct rtrs_srv_path *srv_path;
+ int len;
+
+ srv_path = container_of(kobj, struct rtrs_srv_path, kobj);
+ len = sockaddr_to_str((struct sockaddr *)&srv_path->s.src_addr, page,
+ PAGE_SIZE);
+ len += sysfs_emit_at(page, len, "\n");
+ return len;
+}
+
+static struct kobj_attribute rtrs_srv_dst_addr_attr =
+ __ATTR(dst_addr, 0444, rtrs_srv_dst_addr_show, NULL);
+
+static struct attribute *rtrs_srv_path_attrs[] = {
+ &rtrs_srv_hca_name_attr.attr,
+ &rtrs_srv_hca_port_attr.attr,
+ &rtrs_srv_src_addr_attr.attr,
+ &rtrs_srv_dst_addr_attr.attr,
+ &rtrs_srv_disconnect_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group rtrs_srv_path_attr_group = {
+ .attrs = rtrs_srv_path_attrs,
+};
+
+STAT_ATTR(struct rtrs_srv_stats, rdma,
+ rtrs_srv_stats_rdma_to_str,
+ rtrs_srv_reset_rdma_stats);
+
+static struct attribute *rtrs_srv_stats_attrs[] = {
+ &rdma_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group rtrs_srv_stats_attr_group = {
+ .attrs = rtrs_srv_stats_attrs,
+};
+
+static int rtrs_srv_create_once_sysfs_root_folders(struct rtrs_srv_path *srv_path)
+{
+ struct rtrs_srv_sess *srv = srv_path->srv;
+ int err = 0;
+
+ mutex_lock(&srv->paths_mutex);
+ if (srv->dev_ref++) {
+ /*
+ * Device needs to be registered only on the first session
+ */
+ goto unlock;
+ }
+ srv->dev.class = rtrs_dev_class;
+ err = dev_set_name(&srv->dev, "%s", srv_path->s.sessname);
+ if (err)
+ goto unlock;
+
+ /*
+ * Suppress user space notification until
+ * sysfs files are created
+ */
+ dev_set_uevent_suppress(&srv->dev, true);
+ err = device_add(&srv->dev);
+ if (err) {
+ pr_err("device_add(): %d\n", err);
+ put_device(&srv->dev);
+ goto unlock;
+ }
+ srv->kobj_paths = kobject_create_and_add("paths", &srv->dev.kobj);
+ if (!srv->kobj_paths) {
+ err = -ENOMEM;
+ pr_err("kobject_create_and_add(): %d\n", err);
+ device_del(&srv->dev);
+ put_device(&srv->dev);
+ goto unlock;
+ }
+ dev_set_uevent_suppress(&srv->dev, false);
+ kobject_uevent(&srv->dev.kobj, KOBJ_ADD);
+unlock:
+ mutex_unlock(&srv->paths_mutex);
+
+ return err;
+}
+
+static void
+rtrs_srv_destroy_once_sysfs_root_folders(struct rtrs_srv_path *srv_path)
+{
+ struct rtrs_srv_sess *srv = srv_path->srv;
+
+ mutex_lock(&srv->paths_mutex);
+ if (!--srv->dev_ref) {
+ kobject_del(srv->kobj_paths);
+ kobject_put(srv->kobj_paths);
+ mutex_unlock(&srv->paths_mutex);
+ device_del(&srv->dev);
+ put_device(&srv->dev);
+ } else {
+ put_device(&srv->dev);
+ mutex_unlock(&srv->paths_mutex);
+ }
+}
+
+static void rtrs_srv_path_stats_release(struct kobject *kobj)
+{
+ struct rtrs_srv_stats *stats;
+
+ stats = container_of(kobj, struct rtrs_srv_stats, kobj_stats);
+
+ free_percpu(stats->rdma_stats);
+
+ kfree(stats);
+}
+
+static struct kobj_type ktype_stats = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = rtrs_srv_path_stats_release,
+};
+
+static int rtrs_srv_create_stats_files(struct rtrs_srv_path *srv_path)
+{
+ int err;
+ struct rtrs_path *s = &srv_path->s;
+
+ err = kobject_init_and_add(&srv_path->stats->kobj_stats, &ktype_stats,
+ &srv_path->kobj, "stats");
+ if (err) {
+ rtrs_err(s, "kobject_init_and_add(): %d\n", err);
+ kobject_put(&srv_path->stats->kobj_stats);
+ return err;
+ }
+ err = sysfs_create_group(&srv_path->stats->kobj_stats,
+ &rtrs_srv_stats_attr_group);
+ if (err) {
+ rtrs_err(s, "sysfs_create_group(): %d\n", err);
+ goto err;
+ }
+
+ return 0;
+
+err:
+ kobject_del(&srv_path->stats->kobj_stats);
+ kobject_put(&srv_path->stats->kobj_stats);
+
+ return err;
+}
+
+int rtrs_srv_create_path_files(struct rtrs_srv_path *srv_path)
+{
+ struct rtrs_srv_sess *srv = srv_path->srv;
+ struct rtrs_path *s = &srv_path->s;
+ char str[NAME_MAX];
+ int err;
+ struct rtrs_addr path = {
+ .src = &srv_path->s.dst_addr,
+ .dst = &srv_path->s.src_addr,
+ };
+
+ rtrs_addr_to_str(&path, str, sizeof(str));
+ err = rtrs_srv_create_once_sysfs_root_folders(srv_path);
+ if (err)
+ return err;
+
+ err = kobject_init_and_add(&srv_path->kobj, &ktype, srv->kobj_paths,
+ "%s", str);
+ if (err) {
+ rtrs_err(s, "kobject_init_and_add(): %d\n", err);
+ goto destroy_root;
+ }
+ err = sysfs_create_group(&srv_path->kobj, &rtrs_srv_path_attr_group);
+ if (err) {
+ rtrs_err(s, "sysfs_create_group(): %d\n", err);
+ goto put_kobj;
+ }
+ err = rtrs_srv_create_stats_files(srv_path);
+ if (err)
+ goto remove_group;
+
+ return 0;
+
+remove_group:
+ sysfs_remove_group(&srv_path->kobj, &rtrs_srv_path_attr_group);
+put_kobj:
+ kobject_del(&srv_path->kobj);
+destroy_root:
+ kobject_put(&srv_path->kobj);
+ rtrs_srv_destroy_once_sysfs_root_folders(srv_path);
+
+ return err;
+}
+
+void rtrs_srv_destroy_path_files(struct rtrs_srv_path *srv_path)
+{
+ if (srv_path->kobj.state_in_sysfs) {
+ kobject_del(&srv_path->stats->kobj_stats);
+ kobject_put(&srv_path->stats->kobj_stats);
+ sysfs_remove_group(&srv_path->kobj, &rtrs_srv_path_attr_group);
+ kobject_put(&srv_path->kobj);
+
+ rtrs_srv_destroy_once_sysfs_root_folders(srv_path);
+ }
+}
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-trace.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-trace.c
new file mode 100644
index 000000000000..29ca59ceb0dd
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-trace.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2022 1&1 IONOS SE. All rights reserved.
+ */
+#include "rtrs.h"
+#include "rtrs-pri.h"
+#include "rtrs-srv.h"
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "rtrs-srv-trace.h"
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-trace.h b/drivers/infiniband/ulp/rtrs/rtrs-srv-trace.h
new file mode 100644
index 000000000000..587d3e033081
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-trace.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2022 1&1 IONOS SE. All rights reserved.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rtrs_srv
+
+#if !defined(_TRACE_RTRS_SRV_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RTRS_SRV_H
+
+#include <linux/tracepoint.h>
+
+struct rtrs_srv_op;
+struct rtrs_srv_con;
+struct rtrs_srv_path;
+
+TRACE_DEFINE_ENUM(RTRS_SRV_CONNECTING);
+TRACE_DEFINE_ENUM(RTRS_SRV_CONNECTED);
+TRACE_DEFINE_ENUM(RTRS_SRV_CLOSING);
+TRACE_DEFINE_ENUM(RTRS_SRV_CLOSED);
+
+#define show_rtrs_srv_state(x) \
+ __print_symbolic(x, \
+ { RTRS_SRV_CONNECTING, "CONNECTING" }, \
+ { RTRS_SRV_CONNECTED, "CONNECTED" }, \
+ { RTRS_SRV_CLOSING, "CLOSING" }, \
+ { RTRS_SRV_CLOSED, "CLOSED" })
+
+TRACE_EVENT(send_io_resp_imm,
+ TP_PROTO(struct rtrs_srv_op *id,
+ bool need_inval,
+ bool always_invalidate,
+ int errno),
+
+ TP_ARGS(id, need_inval, always_invalidate, errno),
+
+ TP_STRUCT__entry(
+ __field(u8, dir)
+ __field(bool, need_inval)
+ __field(bool, always_invalidate)
+ __field(u32, msg_id)
+ __field(int, wr_cnt)
+ __field(u32, signal_interval)
+ __field(int, state)
+ __field(int, errno)
+ __array(char, sessname, NAME_MAX)
+ ),
+
+ TP_fast_assign(
+ struct rtrs_srv_con *con = id->con;
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_srv_path *srv_path = to_srv_path(s);
+
+ __entry->dir = id->dir;
+ __entry->state = srv_path->state;
+ __entry->errno = errno;
+ __entry->need_inval = need_inval;
+ __entry->always_invalidate = always_invalidate;
+ __entry->msg_id = id->msg_id;
+ __entry->wr_cnt = atomic_read(&con->c.wr_cnt);
+ __entry->signal_interval = s->signal_interval;
+ memcpy(__entry->sessname, kobject_name(&srv_path->kobj), NAME_MAX);
+ ),
+
+ TP_printk("sess='%s' state='%s' dir=%s err='%d' inval='%d' glob-inval='%d' msgid='%u' wrcnt='%d' sig-interval='%u'",
+ __entry->sessname,
+ show_rtrs_srv_state(__entry->state),
+ __print_symbolic(__entry->dir,
+ { READ, "READ" },
+ { WRITE, "WRITE" }),
+ __entry->errno,
+ __entry->need_inval,
+ __entry->always_invalidate,
+ __entry->msg_id,
+ __entry->wr_cnt,
+ __entry->signal_interval
+ )
+);
+
+#endif /* _TRACE_RTRS_SRV_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE rtrs-srv-trace
+#include <trace/define_trace.h>
+
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
new file mode 100644
index 000000000000..22d7ba05e9fe
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
@@ -0,0 +1,2297 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RDMA Transport Layer
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include <linux/module.h>
+
+#include "rtrs-srv.h"
+#include "rtrs-log.h"
+#include <rdma/ib_cm.h>
+#include <rdma/ib_verbs.h>
+#include "rtrs-srv-trace.h"
+
+MODULE_DESCRIPTION("RDMA Transport Server");
+MODULE_LICENSE("GPL");
+
+/* Must be power of 2, see mask from mr->page_size in ib_sg_to_pages() */
+#define DEFAULT_MAX_CHUNK_SIZE (128 << 10)
+#define DEFAULT_SESS_QUEUE_DEPTH 512
+#define MAX_HDR_SIZE PAGE_SIZE
+
+static struct rtrs_rdma_dev_pd dev_pd;
+struct class *rtrs_dev_class;
+static struct rtrs_srv_ib_ctx ib_ctx;
+
+static int __read_mostly max_chunk_size = DEFAULT_MAX_CHUNK_SIZE;
+static int __read_mostly sess_queue_depth = DEFAULT_SESS_QUEUE_DEPTH;
+
+static bool always_invalidate = true;
+module_param(always_invalidate, bool, 0444);
+MODULE_PARM_DESC(always_invalidate,
+ "Invalidate memory registration for contiguous memory regions before accessing.");
+
+module_param_named(max_chunk_size, max_chunk_size, int, 0444);
+MODULE_PARM_DESC(max_chunk_size,
+ "Max size for each IO request, when change the unit is in byte (default: "
+ __stringify(DEFAULT_MAX_CHUNK_SIZE) "KB)");
+
+module_param_named(sess_queue_depth, sess_queue_depth, int, 0444);
+MODULE_PARM_DESC(sess_queue_depth,
+ "Number of buffers for pending I/O requests to allocate per session. Maximum: "
+ __stringify(MAX_SESS_QUEUE_DEPTH) " (default: "
+ __stringify(DEFAULT_SESS_QUEUE_DEPTH) ")");
+
+static cpumask_t cq_affinity_mask = { CPU_BITS_ALL };
+
+static struct workqueue_struct *rtrs_wq;
+
+static inline struct rtrs_srv_con *to_srv_con(struct rtrs_con *c)
+{
+ return container_of(c, struct rtrs_srv_con, c);
+}
+
+static bool rtrs_srv_change_state(struct rtrs_srv_path *srv_path,
+ enum rtrs_srv_state new_state)
+{
+ enum rtrs_srv_state old_state;
+ bool changed = false;
+
+ spin_lock_irq(&srv_path->state_lock);
+ old_state = srv_path->state;
+ switch (new_state) {
+ case RTRS_SRV_CONNECTED:
+ if (old_state == RTRS_SRV_CONNECTING)
+ changed = true;
+ break;
+ case RTRS_SRV_CLOSING:
+ if (old_state == RTRS_SRV_CONNECTING ||
+ old_state == RTRS_SRV_CONNECTED)
+ changed = true;
+ break;
+ case RTRS_SRV_CLOSED:
+ if (old_state == RTRS_SRV_CLOSING)
+ changed = true;
+ break;
+ default:
+ break;
+ }
+ if (changed)
+ srv_path->state = new_state;
+ spin_unlock_irq(&srv_path->state_lock);
+
+ return changed;
+}
+
+static void free_id(struct rtrs_srv_op *id)
+{
+ if (!id)
+ return;
+ kfree(id);
+}
+
+static void rtrs_srv_free_ops_ids(struct rtrs_srv_path *srv_path)
+{
+ struct rtrs_srv_sess *srv = srv_path->srv;
+ int i;
+
+ if (srv_path->ops_ids) {
+ for (i = 0; i < srv->queue_depth; i++)
+ free_id(srv_path->ops_ids[i]);
+ kfree(srv_path->ops_ids);
+ srv_path->ops_ids = NULL;
+ }
+}
+
+static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc);
+
+static struct ib_cqe io_comp_cqe = {
+ .done = rtrs_srv_rdma_done
+};
+
+static inline void rtrs_srv_inflight_ref_release(struct percpu_ref *ref)
+{
+ struct rtrs_srv_path *srv_path = container_of(ref,
+ struct rtrs_srv_path,
+ ids_inflight_ref);
+
+ percpu_ref_exit(&srv_path->ids_inflight_ref);
+ complete(&srv_path->complete_done);
+}
+
+static int rtrs_srv_alloc_ops_ids(struct rtrs_srv_path *srv_path)
+{
+ struct rtrs_srv_sess *srv = srv_path->srv;
+ struct rtrs_srv_op *id;
+ int i, ret;
+
+ srv_path->ops_ids = kcalloc(srv->queue_depth,
+ sizeof(*srv_path->ops_ids),
+ GFP_KERNEL);
+ if (!srv_path->ops_ids)
+ goto err;
+
+ for (i = 0; i < srv->queue_depth; ++i) {
+ id = kzalloc(sizeof(*id), GFP_KERNEL);
+ if (!id)
+ goto err;
+
+ srv_path->ops_ids[i] = id;
+ }
+
+ ret = percpu_ref_init(&srv_path->ids_inflight_ref,
+ rtrs_srv_inflight_ref_release, 0, GFP_KERNEL);
+ if (ret) {
+ pr_err("Percpu reference init failed\n");
+ goto err;
+ }
+ init_completion(&srv_path->complete_done);
+
+ return 0;
+
+err:
+ rtrs_srv_free_ops_ids(srv_path);
+ return -ENOMEM;
+}
+
+static inline void rtrs_srv_get_ops_ids(struct rtrs_srv_path *srv_path)
+{
+ percpu_ref_get(&srv_path->ids_inflight_ref);
+}
+
+static inline void rtrs_srv_put_ops_ids(struct rtrs_srv_path *srv_path)
+{
+ percpu_ref_put(&srv_path->ids_inflight_ref);
+}
+
+static void rtrs_srv_reg_mr_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rtrs_srv_con *con = to_srv_con(wc->qp->qp_context);
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_srv_path *srv_path = to_srv_path(s);
+
+ if (wc->status != IB_WC_SUCCESS) {
+ rtrs_err(s, "REG MR failed: %s\n",
+ ib_wc_status_msg(wc->status));
+ close_path(srv_path);
+ return;
+ }
+}
+
+static struct ib_cqe local_reg_cqe = {
+ .done = rtrs_srv_reg_mr_done
+};
+
+static int rdma_write_sg(struct rtrs_srv_op *id)
+{
+ struct rtrs_path *s = id->con->c.path;
+ struct rtrs_srv_path *srv_path = to_srv_path(s);
+ dma_addr_t dma_addr = srv_path->dma_addr[id->msg_id];
+ struct rtrs_srv_mr *srv_mr;
+ struct ib_send_wr inv_wr;
+ struct ib_rdma_wr imm_wr;
+ struct ib_rdma_wr *wr = NULL;
+ enum ib_send_flags flags;
+ size_t sg_cnt;
+ int err, offset;
+ bool need_inval;
+ u32 rkey = 0;
+ struct ib_reg_wr rwr;
+ struct ib_sge *plist;
+ struct ib_sge list;
+
+ sg_cnt = le16_to_cpu(id->rd_msg->sg_cnt);
+ need_inval = le16_to_cpu(id->rd_msg->flags) & RTRS_MSG_NEED_INVAL_F;
+ if (sg_cnt != 1)
+ return -EINVAL;
+
+ offset = 0;
+
+ wr = &id->tx_wr;
+ plist = &id->tx_sg;
+ plist->addr = dma_addr + offset;
+ plist->length = le32_to_cpu(id->rd_msg->desc[0].len);
+
+ /* WR will fail with length error
+ * if this is 0
+ */
+ if (plist->length == 0) {
+ rtrs_err(s, "Invalid RDMA-Write sg list length 0\n");
+ return -EINVAL;
+ }
+
+ plist->lkey = srv_path->s.dev->ib_pd->local_dma_lkey;
+ offset += plist->length;
+
+ wr->wr.sg_list = plist;
+ wr->wr.num_sge = 1;
+ wr->remote_addr = le64_to_cpu(id->rd_msg->desc[0].addr);
+ wr->rkey = le32_to_cpu(id->rd_msg->desc[0].key);
+ if (rkey == 0)
+ rkey = wr->rkey;
+ else
+ /* Only one key is actually used */
+ WARN_ON_ONCE(rkey != wr->rkey);
+
+ wr->wr.opcode = IB_WR_RDMA_WRITE;
+ wr->wr.wr_cqe = &io_comp_cqe;
+ wr->wr.ex.imm_data = 0;
+ wr->wr.send_flags = 0;
+
+ if (need_inval && always_invalidate) {
+ wr->wr.next = &rwr.wr;
+ rwr.wr.next = &inv_wr;
+ inv_wr.next = &imm_wr.wr;
+ } else if (always_invalidate) {
+ wr->wr.next = &rwr.wr;
+ rwr.wr.next = &imm_wr.wr;
+ } else if (need_inval) {
+ wr->wr.next = &inv_wr;
+ inv_wr.next = &imm_wr.wr;
+ } else {
+ wr->wr.next = &imm_wr.wr;
+ }
+ /*
+ * From time to time we have to post signaled sends,
+ * or send queue will fill up and only QP reset can help.
+ */
+ flags = (atomic_inc_return(&id->con->c.wr_cnt) % s->signal_interval) ?
+ 0 : IB_SEND_SIGNALED;
+
+ if (need_inval) {
+ inv_wr.sg_list = NULL;
+ inv_wr.num_sge = 0;
+ inv_wr.opcode = IB_WR_SEND_WITH_INV;
+ inv_wr.wr_cqe = &io_comp_cqe;
+ inv_wr.send_flags = 0;
+ inv_wr.ex.invalidate_rkey = rkey;
+ }
+
+ imm_wr.wr.next = NULL;
+ if (always_invalidate) {
+ struct rtrs_msg_rkey_rsp *msg;
+
+ srv_mr = &srv_path->mrs[id->msg_id];
+ rwr.wr.opcode = IB_WR_REG_MR;
+ rwr.wr.wr_cqe = &local_reg_cqe;
+ rwr.wr.num_sge = 0;
+ rwr.mr = srv_mr->mr;
+ rwr.wr.send_flags = 0;
+ rwr.key = srv_mr->mr->rkey;
+ rwr.access = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE);
+ msg = srv_mr->iu->buf;
+ msg->buf_id = cpu_to_le16(id->msg_id);
+ msg->type = cpu_to_le16(RTRS_MSG_RKEY_RSP);
+ msg->rkey = cpu_to_le32(srv_mr->mr->rkey);
+
+ list.addr = srv_mr->iu->dma_addr;
+ list.length = sizeof(*msg);
+ list.lkey = srv_path->s.dev->ib_pd->local_dma_lkey;
+ imm_wr.wr.sg_list = &list;
+ imm_wr.wr.num_sge = 1;
+ imm_wr.wr.opcode = IB_WR_SEND_WITH_IMM;
+ ib_dma_sync_single_for_device(srv_path->s.dev->ib_dev,
+ srv_mr->iu->dma_addr,
+ srv_mr->iu->size, DMA_TO_DEVICE);
+ } else {
+ imm_wr.wr.sg_list = NULL;
+ imm_wr.wr.num_sge = 0;
+ imm_wr.wr.opcode = IB_WR_RDMA_WRITE_WITH_IMM;
+ }
+ imm_wr.wr.send_flags = flags;
+ imm_wr.wr.ex.imm_data = cpu_to_be32(rtrs_to_io_rsp_imm(id->msg_id,
+ 0, need_inval));
+
+ imm_wr.wr.wr_cqe = &io_comp_cqe;
+ ib_dma_sync_single_for_device(srv_path->s.dev->ib_dev, dma_addr,
+ offset, DMA_BIDIRECTIONAL);
+
+ err = ib_post_send(id->con->c.qp, &id->tx_wr.wr, NULL);
+ if (err)
+ rtrs_err(s,
+ "Posting RDMA-Write-Request to QP failed, err: %d\n",
+ err);
+
+ return err;
+}
+
+/**
+ * send_io_resp_imm() - respond to client with empty IMM on failed READ/WRITE
+ * requests or on successful WRITE request.
+ * @con: the connection to send back result
+ * @id: the id associated with the IO
+ * @errno: the error number of the IO.
+ *
+ * Return 0 on success, errno otherwise.
+ */
+static int send_io_resp_imm(struct rtrs_srv_con *con, struct rtrs_srv_op *id,
+ int errno)
+{
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_srv_path *srv_path = to_srv_path(s);
+ struct ib_send_wr inv_wr, *wr = NULL;
+ struct ib_rdma_wr imm_wr;
+ struct ib_reg_wr rwr;
+ struct rtrs_srv_mr *srv_mr;
+ bool need_inval = false;
+ enum ib_send_flags flags;
+ u32 imm;
+ int err;
+
+ if (id->dir == READ) {
+ struct rtrs_msg_rdma_read *rd_msg = id->rd_msg;
+ size_t sg_cnt;
+
+ need_inval = le16_to_cpu(rd_msg->flags) &
+ RTRS_MSG_NEED_INVAL_F;
+ sg_cnt = le16_to_cpu(rd_msg->sg_cnt);
+
+ if (need_inval) {
+ if (sg_cnt) {
+ inv_wr.wr_cqe = &io_comp_cqe;
+ inv_wr.sg_list = NULL;
+ inv_wr.num_sge = 0;
+ inv_wr.opcode = IB_WR_SEND_WITH_INV;
+ inv_wr.send_flags = 0;
+ /* Only one key is actually used */
+ inv_wr.ex.invalidate_rkey =
+ le32_to_cpu(rd_msg->desc[0].key);
+ } else {
+ WARN_ON_ONCE(1);
+ need_inval = false;
+ }
+ }
+ }
+
+ trace_send_io_resp_imm(id, need_inval, always_invalidate, errno);
+
+ if (need_inval && always_invalidate) {
+ wr = &inv_wr;
+ inv_wr.next = &rwr.wr;
+ rwr.wr.next = &imm_wr.wr;
+ } else if (always_invalidate) {
+ wr = &rwr.wr;
+ rwr.wr.next = &imm_wr.wr;
+ } else if (need_inval) {
+ wr = &inv_wr;
+ inv_wr.next = &imm_wr.wr;
+ } else {
+ wr = &imm_wr.wr;
+ }
+ /*
+ * From time to time we have to post signalled sends,
+ * or send queue will fill up and only QP reset can help.
+ */
+ flags = (atomic_inc_return(&con->c.wr_cnt) % s->signal_interval) ?
+ 0 : IB_SEND_SIGNALED;
+ imm = rtrs_to_io_rsp_imm(id->msg_id, errno, need_inval);
+ imm_wr.wr.next = NULL;
+ if (always_invalidate) {
+ struct ib_sge list;
+ struct rtrs_msg_rkey_rsp *msg;
+
+ srv_mr = &srv_path->mrs[id->msg_id];
+ rwr.wr.next = &imm_wr.wr;
+ rwr.wr.opcode = IB_WR_REG_MR;
+ rwr.wr.wr_cqe = &local_reg_cqe;
+ rwr.wr.num_sge = 0;
+ rwr.wr.send_flags = 0;
+ rwr.mr = srv_mr->mr;
+ rwr.key = srv_mr->mr->rkey;
+ rwr.access = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE);
+ msg = srv_mr->iu->buf;
+ msg->buf_id = cpu_to_le16(id->msg_id);
+ msg->type = cpu_to_le16(RTRS_MSG_RKEY_RSP);
+ msg->rkey = cpu_to_le32(srv_mr->mr->rkey);
+
+ list.addr = srv_mr->iu->dma_addr;
+ list.length = sizeof(*msg);
+ list.lkey = srv_path->s.dev->ib_pd->local_dma_lkey;
+ imm_wr.wr.sg_list = &list;
+ imm_wr.wr.num_sge = 1;
+ imm_wr.wr.opcode = IB_WR_SEND_WITH_IMM;
+ ib_dma_sync_single_for_device(srv_path->s.dev->ib_dev,
+ srv_mr->iu->dma_addr,
+ srv_mr->iu->size, DMA_TO_DEVICE);
+ } else {
+ imm_wr.wr.sg_list = NULL;
+ imm_wr.wr.num_sge = 0;
+ imm_wr.wr.opcode = IB_WR_RDMA_WRITE_WITH_IMM;
+ }
+ imm_wr.wr.send_flags = flags;
+ imm_wr.wr.wr_cqe = &io_comp_cqe;
+
+ imm_wr.wr.ex.imm_data = cpu_to_be32(imm);
+
+ err = ib_post_send(id->con->c.qp, wr, NULL);
+ if (err)
+ rtrs_err_rl(s, "Posting RDMA-Reply to QP failed, err: %d\n",
+ err);
+
+ return err;
+}
+
+void close_path(struct rtrs_srv_path *srv_path)
+{
+ if (rtrs_srv_change_state(srv_path, RTRS_SRV_CLOSING))
+ queue_work(rtrs_wq, &srv_path->close_work);
+ WARN_ON(srv_path->state != RTRS_SRV_CLOSING);
+}
+
+static inline const char *rtrs_srv_state_str(enum rtrs_srv_state state)
+{
+ switch (state) {
+ case RTRS_SRV_CONNECTING:
+ return "RTRS_SRV_CONNECTING";
+ case RTRS_SRV_CONNECTED:
+ return "RTRS_SRV_CONNECTED";
+ case RTRS_SRV_CLOSING:
+ return "RTRS_SRV_CLOSING";
+ case RTRS_SRV_CLOSED:
+ return "RTRS_SRV_CLOSED";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+/**
+ * rtrs_srv_resp_rdma() - Finish an RDMA request
+ *
+ * @id: Internal RTRS operation identifier
+ * @status: Response code sent to the other side for this operation.
+ * 0 = success, < 0 error
+ * Context: any
+ *
+ * Finish an RDMA operation. A message is sent to the client and the
+ * corresponding memory areas will be released.
+ *
+ * Return: true if the response was processed (or dropped because the path
+ * is disconnected), false if the send queue was full and the response has
+ * been queued for a later retry.
+ */
+bool rtrs_srv_resp_rdma(struct rtrs_srv_op *id, int status)
+{
+ struct rtrs_srv_path *srv_path;
+ struct rtrs_srv_con *con;
+ struct rtrs_path *s;
+ int err;
+
+ if (WARN_ON(!id))
+ return true;
+
+ con = id->con;
+ s = con->c.path;
+ srv_path = to_srv_path(s);
+
+ id->status = status;
+
+ if (srv_path->state != RTRS_SRV_CONNECTED) {
+ rtrs_err_rl(s,
+ "Sending I/O response failed, server path %s is disconnected, path state %s\n",
+ kobject_name(&srv_path->kobj),
+ rtrs_srv_state_str(srv_path->state));
+ goto out;
+ }
+ if (always_invalidate) {
+ struct rtrs_srv_mr *mr = &srv_path->mrs[id->msg_id];
+
+ ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey));
+ }
+ if (atomic_sub_return(1, &con->c.sq_wr_avail) < 0) {
+ rtrs_err(s, "IB send queue full: srv_path=%s cid=%d\n",
+ kobject_name(&srv_path->kobj),
+ con->c.cid);
+ atomic_add(1, &con->c.sq_wr_avail);
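+ /*
+ * No send WR is available: park the response on the wait list;
+ * rtrs_rdma_process_wr_wait_list() retries it once send queue
+ * space is freed by completions.
+ */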
+ spin_lock(&con->rsp_wr_wait_lock);
+ list_add_tail(&id->wait_list, &con->rsp_wr_wait_list);
+ spin_unlock(&con->rsp_wr_wait_lock);
+ return false;
+ }
+
+ if (status || id->dir == WRITE || !id->rd_msg->sg_cnt)
+ err = send_io_resp_imm(con, id, status);
+ else
+ err = rdma_write_sg(id);
+
+ if (err) {
+ rtrs_err_rl(s, "IO response failed: %d: srv_path=%s\n", err,
+ kobject_name(&srv_path->kobj));
+ close_path(srv_path);
+ }
+out:
+ rtrs_srv_put_ops_ids(srv_path);
+ return true;
+}
+EXPORT_SYMBOL(rtrs_srv_resp_rdma);
+
+/**
+ * rtrs_srv_set_sess_priv() - Set private pointer in rtrs_srv.
+ * @srv: Session pointer
+ * @priv: The private pointer that is associated with the session.
+ */
+void rtrs_srv_set_sess_priv(struct rtrs_srv_sess *srv, void *priv)
+{
+ srv->priv = priv;
+}
+EXPORT_SYMBOL(rtrs_srv_set_sess_priv);
+
+static void unmap_cont_bufs(struct rtrs_srv_path *srv_path)
+{
+ int i;
+
+ for (i = 0; i < srv_path->mrs_num; i++) {
+ struct rtrs_srv_mr *srv_mr;
+
+ srv_mr = &srv_path->mrs[i];
+ rtrs_iu_free(srv_mr->iu, srv_path->s.dev->ib_dev, 1);
+ ib_dereg_mr(srv_mr->mr);
+ ib_dma_unmap_sg(srv_path->s.dev->ib_dev, srv_mr->sgt.sgl,
+ srv_mr->sgt.nents, DMA_BIDIRECTIONAL);
+ sg_free_table(&srv_mr->sgt);
+ }
+ kfree(srv_path->mrs);
+}
+
+static int map_cont_bufs(struct rtrs_srv_path *srv_path)
+{
+ struct rtrs_srv_sess *srv = srv_path->srv;
+ struct rtrs_path *ss = &srv_path->s;
+ int i, mri, err, mrs_num;
+ unsigned int chunk_bits;
+ int chunks_per_mr = 1;
+
+ /*
+ * Here we map queue_depth chunks to MRs. First we have to
+ * figure out how many chunks we can map per MR.
+ */
+ if (always_invalidate) {
+ /*
+ * In order to invalidate each chunk of memory separately, we
+ * need one memory region per chunk.
+ */
+ mrs_num = srv->queue_depth;
+ } else {
+ chunks_per_mr =
+ srv_path->s.dev->ib_dev->attrs.max_fast_reg_page_list_len;
+ mrs_num = DIV_ROUND_UP(srv->queue_depth, chunks_per_mr);
+ chunks_per_mr = DIV_ROUND_UP(srv->queue_depth, mrs_num);
+ }
+
+ srv_path->mrs = kcalloc(mrs_num, sizeof(*srv_path->mrs), GFP_KERNEL);
+ if (!srv_path->mrs)
+ return -ENOMEM;
+
+ srv_path->mrs_num = mrs_num;
+
+ for (mri = 0; mri < mrs_num; mri++) {
+ struct rtrs_srv_mr *srv_mr = &srv_path->mrs[mri];
+ struct sg_table *sgt = &srv_mr->sgt;
+ struct scatterlist *s;
+ struct ib_mr *mr;
+ int nr, nr_sgt, chunks;
+
+ chunks = chunks_per_mr * mri;
+ if (!always_invalidate)
+ chunks_per_mr = min_t(int, chunks_per_mr,
+ srv->queue_depth - chunks);
+
+ err = sg_alloc_table(sgt, chunks_per_mr, GFP_KERNEL);
+ if (err)
+ goto err;
+
+ for_each_sg(sgt->sgl, s, chunks_per_mr, i)
+ sg_set_page(s, srv->chunks[chunks + i],
+ max_chunk_size, 0);
+
+ nr_sgt = ib_dma_map_sg(srv_path->s.dev->ib_dev, sgt->sgl,
+ sgt->nents, DMA_BIDIRECTIONAL);
+ if (!nr_sgt) {
+ err = -EINVAL;
+ goto free_sg;
+ }
+ mr = ib_alloc_mr(srv_path->s.dev->ib_pd, IB_MR_TYPE_MEM_REG,
+ nr_sgt);
+ if (IS_ERR(mr)) {
+ err = PTR_ERR(mr);
+ goto unmap_sg;
+ }
+ nr = ib_map_mr_sg(mr, sgt->sgl, nr_sgt,
+ NULL, max_chunk_size);
+ if (nr < 0 || nr < sgt->nents) {
+ err = nr < 0 ? nr : -EINVAL;
+ goto dereg_mr;
+ }
+
+ if (always_invalidate) {
+ srv_mr->iu = rtrs_iu_alloc(1,
+ sizeof(struct rtrs_msg_rkey_rsp),
+ GFP_KERNEL, srv_path->s.dev->ib_dev,
+ DMA_TO_DEVICE, rtrs_srv_rdma_done);
+ if (!srv_mr->iu) {
+ err = -ENOMEM;
+ rtrs_err(ss, "rtrs_iu_alloc(), err: %d\n", err);
+ goto dereg_mr;
+ }
+ }
+ /* Eventually dma addr for each chunk can be cached */
+ for_each_sg(sgt->sgl, s, nr_sgt, i)
+ srv_path->dma_addr[chunks + i] = sg_dma_address(s);
+
+ ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
+ srv_mr->mr = mr;
+
+ continue;
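+ /*
+ * Error unwinding: roll back all MRs initialised so far, including
+ * the partially set up current one depending on the label reached.
+ */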
+err:
+ while (mri--) {
+ srv_mr = &srv_path->mrs[mri];
+ sgt = &srv_mr->sgt;
+ mr = srv_mr->mr;
+ rtrs_iu_free(srv_mr->iu, srv_path->s.dev->ib_dev, 1);
+dereg_mr:
+ ib_dereg_mr(mr);
+unmap_sg:
+ ib_dma_unmap_sg(srv_path->s.dev->ib_dev, sgt->sgl,
+ sgt->nents, DMA_BIDIRECTIONAL);
+free_sg:
+ sg_free_table(sgt);
+ }
+ kfree(srv_path->mrs);
+
+ return err;
+ }
+
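+ /*
+ * The immediate payload is split into the chunk id (upper chunk_bits
+ * bits) and the byte offset within a chunk (lower mem_bits bits);
+ * rtrs_srv_rdma_done() does the decoding.
+ */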
+ chunk_bits = ilog2(srv->queue_depth - 1) + 1;
+ srv_path->mem_bits = (MAX_IMM_PAYL_BITS - chunk_bits);
+
+ return 0;
+}
+
+static void rtrs_srv_hb_err_handler(struct rtrs_con *c)
+{
+ close_path(to_srv_path(c->path));
+}
+
+static void rtrs_srv_init_hb(struct rtrs_srv_path *srv_path)
+{
+ rtrs_init_hb(&srv_path->s, &io_comp_cqe,
+ RTRS_HB_INTERVAL_MS,
+ RTRS_HB_MISSED_MAX,
+ rtrs_srv_hb_err_handler,
+ rtrs_wq);
+}
+
+static void rtrs_srv_start_hb(struct rtrs_srv_path *srv_path)
+{
+ rtrs_start_hb(&srv_path->s);
+}
+
+static void rtrs_srv_stop_hb(struct rtrs_srv_path *srv_path)
+{
+ rtrs_stop_hb(&srv_path->s);
+}
+
+static void rtrs_srv_info_rsp_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rtrs_srv_con *con = to_srv_con(wc->qp->qp_context);
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_srv_path *srv_path = to_srv_path(s);
+ struct rtrs_iu *iu;
+
+ iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe);
+ rtrs_iu_free(iu, srv_path->s.dev->ib_dev, 1);
+
+ if (wc->status != IB_WC_SUCCESS) {
+ rtrs_err(s, "Sess info response send failed: %s\n",
+ ib_wc_status_msg(wc->status));
+ close_path(srv_path);
+ return;
+ }
+ WARN_ON(wc->opcode != IB_WC_SEND);
+}
+
+static void rtrs_srv_path_up(struct rtrs_srv_path *srv_path)
+{
+ struct rtrs_srv_sess *srv = srv_path->srv;
+ struct rtrs_srv_ctx *ctx = srv->ctx;
+ int up;
+
+ mutex_lock(&srv->paths_ev_mutex);
+ up = ++srv->paths_up;
+ if (up == 1)
+ ctx->ops.link_ev(srv, RTRS_SRV_LINK_EV_CONNECTED, NULL);
+ mutex_unlock(&srv->paths_ev_mutex);
+
+ /* Mark session as established */
+ srv_path->established = true;
+}
+
+static void rtrs_srv_path_down(struct rtrs_srv_path *srv_path)
+{
+ struct rtrs_srv_sess *srv = srv_path->srv;
+ struct rtrs_srv_ctx *ctx = srv->ctx;
+
+ if (!srv_path->established)
+ return;
+
+ srv_path->established = false;
+ mutex_lock(&srv->paths_ev_mutex);
+ WARN_ON(!srv->paths_up);
+ if (--srv->paths_up == 0)
+ ctx->ops.link_ev(srv, RTRS_SRV_LINK_EV_DISCONNECTED, srv->priv);
+ mutex_unlock(&srv->paths_ev_mutex);
+}
+
+static bool exist_pathname(struct rtrs_srv_ctx *ctx,
+ const char *pathname, const uuid_t *path_uuid)
+{
+ struct rtrs_srv_sess *srv;
+ struct rtrs_srv_path *srv_path;
+ bool found = false;
+
+ mutex_lock(&ctx->srv_mutex);
+ list_for_each_entry(srv, &ctx->srv_list, ctx_list) {
+ mutex_lock(&srv->paths_mutex);
+
+ /*
+ * Skip the client's own session (same uuid); only a pathname
+ * already used by a different session counts as a duplicate.
+ */
+ if (uuid_equal(&srv->paths_uuid, path_uuid)) {
+ mutex_unlock(&srv->paths_mutex);
+ continue;
+ }
+
+ list_for_each_entry(srv_path, &srv->paths_list, s.entry) {
+ if (strlen(srv_path->s.sessname) == strlen(pathname) &&
+ !strcmp(srv_path->s.sessname, pathname)) {
+ found = true;
+ break;
+ }
+ }
+ mutex_unlock(&srv->paths_mutex);
+ if (found)
+ break;
+ }
+ mutex_unlock(&ctx->srv_mutex);
+ return found;
+}
+
+static int post_recv_path(struct rtrs_srv_path *srv_path);
+static int rtrs_rdma_do_reject(struct rdma_cm_id *cm_id, int errno);
+
+static int process_info_req(struct rtrs_srv_con *con,
+ struct rtrs_msg_info_req *msg)
+{
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_srv_path *srv_path = to_srv_path(s);
+ struct ib_send_wr *reg_wr = NULL;
+ struct rtrs_msg_info_rsp *rsp;
+ struct rtrs_iu *tx_iu;
+ struct ib_reg_wr *rwr;
+ int mri, err;
+ size_t tx_sz;
+
+ err = post_recv_path(srv_path);
+ if (err) {
+ rtrs_err(s, "post_recv_path(), err: %d\n", err);
+ return err;
+ }
+
+ if (strchr(msg->pathname, '/') || strchr(msg->pathname, '.')) {
+ rtrs_err(s, "pathname cannot contain / and .\n");
+ return -EINVAL;
+ }
+
+ if (exist_pathname(srv_path->srv->ctx,
+ msg->pathname, &srv_path->srv->paths_uuid)) {
+ rtrs_err(s, "pathname is duplicated: %s\n", msg->pathname);
+ return -EPERM;
+ }
+ strscpy(srv_path->s.sessname, msg->pathname,
+ sizeof(srv_path->s.sessname));
+
+ rwr = kcalloc(srv_path->mrs_num, sizeof(*rwr), GFP_KERNEL);
+ if (!rwr)
+ return -ENOMEM;
+
+ tx_sz = sizeof(*rsp);
+ tx_sz += sizeof(rsp->desc[0]) * srv_path->mrs_num;
+ tx_iu = rtrs_iu_alloc(1, tx_sz, GFP_KERNEL, srv_path->s.dev->ib_dev,
+ DMA_TO_DEVICE, rtrs_srv_info_rsp_done);
+ if (!tx_iu) {
+ err = -ENOMEM;
+ goto rwr_free;
+ }
+
+ rsp = tx_iu->buf;
+ rsp->type = cpu_to_le16(RTRS_MSG_INFO_RSP);
+ rsp->sg_cnt = cpu_to_le16(srv_path->mrs_num);
+
+ for (mri = 0; mri < srv_path->mrs_num; mri++) {
+ struct ib_mr *mr = srv_path->mrs[mri].mr;
+
+ rsp->desc[mri].addr = cpu_to_le64(mr->iova);
+ rsp->desc[mri].key = cpu_to_le32(mr->rkey);
+ rsp->desc[mri].len = cpu_to_le32(mr->length);
+
+ /*
+ * Fill in reg MR request and chain them *backwards*
+ */
+ rwr[mri].wr.next = mri ? &rwr[mri - 1].wr : NULL;
+ rwr[mri].wr.opcode = IB_WR_REG_MR;
+ rwr[mri].wr.wr_cqe = &local_reg_cqe;
+ rwr[mri].wr.num_sge = 0;
+ rwr[mri].wr.send_flags = 0;
+ rwr[mri].mr = mr;
+ rwr[mri].key = mr->rkey;
+ rwr[mri].access = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE);
+ reg_wr = &rwr[mri].wr;
+ }
+
+ err = rtrs_srv_create_path_files(srv_path);
+ if (err)
+ goto iu_free;
+ kobject_get(&srv_path->kobj);
+ get_device(&srv_path->srv->dev);
+ rtrs_srv_change_state(srv_path, RTRS_SRV_CONNECTED);
+ rtrs_srv_start_hb(srv_path);
+
+ /*
+ * We do not account for the number of established connections at the
+ * moment; we rely on the client, which sends the info request only
+ * after all connections have been successfully established. Thus,
+ * simply notify the listener with the proper event if we are the
+ * first path.
+ */
+ rtrs_srv_path_up(srv_path);
+
+ ib_dma_sync_single_for_device(srv_path->s.dev->ib_dev,
+ tx_iu->dma_addr,
+ tx_iu->size, DMA_TO_DEVICE);
+
+ /* Send info response */
+ err = rtrs_iu_post_send(&con->c, tx_iu, tx_sz, reg_wr);
+ if (err) {
+ rtrs_err(s, "rtrs_iu_post_send(), err: %d\n", err);
+iu_free:
+ rtrs_iu_free(tx_iu, srv_path->s.dev->ib_dev, 1);
+ }
+rwr_free:
+ kfree(rwr);
+
+ return err;
+}
+
+static void rtrs_srv_info_req_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rtrs_srv_con *con = to_srv_con(wc->qp->qp_context);
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_srv_path *srv_path = to_srv_path(s);
+ struct rtrs_msg_info_req *msg;
+ struct rtrs_iu *iu;
+ int err;
+
+ WARN_ON(con->c.cid);
+
+ iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe);
+ if (wc->status != IB_WC_SUCCESS) {
+ rtrs_err(s, "Sess info request receive failed: %s\n",
+ ib_wc_status_msg(wc->status));
+ goto close;
+ }
+ WARN_ON(wc->opcode != IB_WC_RECV);
+
+ if (wc->byte_len < sizeof(*msg)) {
+ rtrs_err(s, "Sess info request is malformed: size %d\n",
+ wc->byte_len);
+ goto close;
+ }
+ ib_dma_sync_single_for_cpu(srv_path->s.dev->ib_dev, iu->dma_addr,
+ iu->size, DMA_FROM_DEVICE);
+ msg = iu->buf;
+ if (le16_to_cpu(msg->type) != RTRS_MSG_INFO_REQ) {
+ rtrs_err(s, "Sess info request is malformed: type %d\n",
+ le16_to_cpu(msg->type));
+ goto close;
+ }
+ err = process_info_req(con, msg);
+ if (err)
+ goto close;
+
+out:
+ rtrs_iu_free(iu, srv_path->s.dev->ib_dev, 1);
+ return;
+close:
+ close_path(srv_path);
+ goto out;
+}
+
+static int post_recv_info_req(struct rtrs_srv_con *con)
+{
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_srv_path *srv_path = to_srv_path(s);
+ struct rtrs_iu *rx_iu;
+ int err;
+
+ rx_iu = rtrs_iu_alloc(1, sizeof(struct rtrs_msg_info_req),
+ GFP_KERNEL, srv_path->s.dev->ib_dev,
+ DMA_FROM_DEVICE, rtrs_srv_info_req_done);
+ if (!rx_iu)
+ return -ENOMEM;
+ /* Prepare for getting info response */
+ err = rtrs_iu_post_recv(&con->c, rx_iu);
+ if (err) {
+ rtrs_err(s, "rtrs_iu_post_recv(), err: %d\n", err);
+ rtrs_iu_free(rx_iu, srv_path->s.dev->ib_dev, 1);
+ return err;
+ }
+
+ return 0;
+}
+
+static int post_recv_io(struct rtrs_srv_con *con, size_t q_size)
+{
+ int i, err;
+
+ for (i = 0; i < q_size; i++) {
+ err = rtrs_post_recv_empty(&con->c, &io_comp_cqe);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int post_recv_path(struct rtrs_srv_path *srv_path)
+{
+ struct rtrs_srv_sess *srv = srv_path->srv;
+ struct rtrs_path *s = &srv_path->s;
+ size_t q_size;
+ int err, cid;
+
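+ /*
+ * Connection 0 is the service connection (info and heartbeat
+ * messages) and uses its own small queue depth.
+ */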
+ for (cid = 0; cid < srv_path->s.con_num; cid++) {
+ if (cid == 0)
+ q_size = SERVICE_CON_QUEUE_DEPTH;
+ else
+ q_size = srv->queue_depth;
+
+ err = post_recv_io(to_srv_con(srv_path->s.con[cid]), q_size);
+ if (err) {
+ rtrs_err(s, "post_recv_io(), err: %d\n", err);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static void process_read(struct rtrs_srv_con *con,
+ struct rtrs_msg_rdma_read *msg,
+ u32 buf_id, u32 off)
+{
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_srv_path *srv_path = to_srv_path(s);
+ struct rtrs_srv_sess *srv = srv_path->srv;
+ struct rtrs_srv_ctx *ctx = srv->ctx;
+ struct rtrs_srv_op *id;
+
+ size_t usr_len, data_len;
+ void *data;
+ int ret;
+
+ if (srv_path->state != RTRS_SRV_CONNECTED) {
+ rtrs_err_rl(s,
+ "Processing read request failed, session is disconnected, sess state %s\n",
+ rtrs_srv_state_str(srv_path->state));
+ return;
+ }
+ if (msg->sg_cnt != 1 && msg->sg_cnt != 0) {
+ rtrs_err_rl(s,
+ "Processing read request failed, invalid message\n");
+ return;
+ }
+ rtrs_srv_get_ops_ids(srv_path);
+ rtrs_srv_update_rdma_stats(srv_path->stats, off, READ);
+ id = srv_path->ops_ids[buf_id];
+ id->con = con;
+ id->dir = READ;
+ id->msg_id = buf_id;
+ id->rd_msg = msg;
+ usr_len = le16_to_cpu(msg->usr_len);
+ data_len = off - usr_len;
+ data = page_address(srv->chunks[buf_id]);
+ ret = ctx->ops.rdma_ev(srv->priv, id, data, data_len,
+ data + data_len, usr_len);
+
+ if (ret) {
+ rtrs_err_rl(s,
+ "Processing read request failed, user module cb reported for msg_id %d, err: %d\n",
+ buf_id, ret);
+ goto send_err_msg;
+ }
+
+ return;
+
+send_err_msg:
+ ret = send_io_resp_imm(con, id, ret);
+ if (ret < 0) {
+ rtrs_err_rl(s,
+ "Sending err msg for failed RDMA-Write-Req failed, msg_id %d, err: %d\n",
+ buf_id, ret);
+ close_path(srv_path);
+ }
+ rtrs_srv_put_ops_ids(srv_path);
+}
+
+static void process_write(struct rtrs_srv_con *con,
+ struct rtrs_msg_rdma_write *req,
+ u32 buf_id, u32 off)
+{
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_srv_path *srv_path = to_srv_path(s);
+ struct rtrs_srv_sess *srv = srv_path->srv;
+ struct rtrs_srv_ctx *ctx = srv->ctx;
+ struct rtrs_srv_op *id;
+
+ size_t data_len, usr_len;
+ void *data;
+ int ret;
+
+ if (srv_path->state != RTRS_SRV_CONNECTED) {
+ rtrs_err_rl(s,
+ "Processing write request failed, session is disconnected, sess state %s\n",
+ rtrs_srv_state_str(srv_path->state));
+ return;
+ }
+ rtrs_srv_get_ops_ids(srv_path);
+ rtrs_srv_update_rdma_stats(srv_path->stats, off, WRITE);
+ id = srv_path->ops_ids[buf_id];
+ id->con = con;
+ id->dir = WRITE;
+ id->msg_id = buf_id;
+
+ usr_len = le16_to_cpu(req->usr_len);
+ data_len = off - usr_len;
+ data = page_address(srv->chunks[buf_id]);
+ ret = ctx->ops.rdma_ev(srv->priv, id, data, data_len,
+ data + data_len, usr_len);
+ if (ret) {
+ rtrs_err_rl(s,
+ "Processing write request failed, user module callback reports err: %d\n",
+ ret);
+ goto send_err_msg;
+ }
+
+ return;
+
+send_err_msg:
+ ret = send_io_resp_imm(con, id, ret);
+ if (ret < 0) {
+ rtrs_err_rl(s,
+ "Processing write request failed, sending I/O response failed, msg_id %d, err: %d\n",
+ buf_id, ret);
+ close_path(srv_path);
+ }
+ rtrs_srv_put_ops_ids(srv_path);
+}
+
+static void process_io_req(struct rtrs_srv_con *con, void *msg,
+ u32 id, u32 off)
+{
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_srv_path *srv_path = to_srv_path(s);
+ struct rtrs_msg_rdma_hdr *hdr;
+ unsigned int type;
+
+ ib_dma_sync_single_for_cpu(srv_path->s.dev->ib_dev,
+ srv_path->dma_addr[id],
+ max_chunk_size, DMA_BIDIRECTIONAL);
+ hdr = msg;
+ type = le16_to_cpu(hdr->type);
+
+ switch (type) {
+ case RTRS_MSG_WRITE:
+ process_write(con, msg, id, off);
+ break;
+ case RTRS_MSG_READ:
+ process_read(con, msg, id, off);
+ break;
+ default:
+ rtrs_err(s,
+ "Processing I/O request failed, unknown message type received: 0x%02x\n",
+ type);
+ goto err;
+ }
+
+ return;
+
+err:
+ close_path(srv_path);
+}
+
+static void rtrs_srv_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rtrs_srv_mr *mr =
+ container_of(wc->wr_cqe, typeof(*mr), inv_cqe);
+ struct rtrs_srv_con *con = to_srv_con(wc->qp->qp_context);
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_srv_path *srv_path = to_srv_path(s);
+ struct rtrs_srv_sess *srv = srv_path->srv;
+ u32 msg_id, off;
+ void *data;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ rtrs_err(s, "Failed IB_WR_LOCAL_INV: %s\n",
+ ib_wc_status_msg(wc->status));
+ close_path(srv_path);
+ }
+ msg_id = mr->msg_id;
+ off = mr->msg_off;
+ data = page_address(srv->chunks[msg_id]) + off;
+ process_io_req(con, data, msg_id, off);
+}
+
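+/*
+ * Invalidate the rkey of the per-chunk MR before the request is
+ * processed; the completion handler rtrs_srv_inv_rkey_done() then
+ * continues with process_io_req().
+ */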
+static int rtrs_srv_inv_rkey(struct rtrs_srv_con *con,
+ struct rtrs_srv_mr *mr)
+{
+ struct ib_send_wr wr = {
+ .opcode = IB_WR_LOCAL_INV,
+ .wr_cqe = &mr->inv_cqe,
+ .send_flags = IB_SEND_SIGNALED,
+ .ex.invalidate_rkey = mr->mr->rkey,
+ };
+ mr->inv_cqe.done = rtrs_srv_inv_rkey_done;
+
+ return ib_post_send(con->c.qp, &wr, NULL);
+}
+
+static void rtrs_rdma_process_wr_wait_list(struct rtrs_srv_con *con)
+{
+ spin_lock(&con->rsp_wr_wait_lock);
+ while (!list_empty(&con->rsp_wr_wait_list)) {
+ struct rtrs_srv_op *id;
+ int ret;
+
+ id = list_entry(con->rsp_wr_wait_list.next,
+ struct rtrs_srv_op, wait_list);
+ list_del(&id->wait_list);
+
+ spin_unlock(&con->rsp_wr_wait_lock);
+ ret = rtrs_srv_resp_rdma(id, id->status);
+ spin_lock(&con->rsp_wr_wait_lock);
+
+ if (!ret) {
+ list_add(&id->wait_list, &con->rsp_wr_wait_list);
+ break;
+ }
+ }
+ spin_unlock(&con->rsp_wr_wait_lock);
+}
+
+static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rtrs_srv_con *con = to_srv_con(wc->qp->qp_context);
+ struct rtrs_path *s = con->c.path;
+ struct rtrs_srv_path *srv_path = to_srv_path(s);
+ struct rtrs_srv_sess *srv = srv_path->srv;
+ u32 imm_type, imm_payload;
+ int err;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ if (wc->status != IB_WC_WR_FLUSH_ERR) {
+ rtrs_err(s,
+ "%s (wr_cqe: %p, type: %d, vendor_err: 0x%x, len: %u)\n",
+ ib_wc_status_msg(wc->status), wc->wr_cqe,
+ wc->opcode, wc->vendor_err, wc->byte_len);
+ close_path(srv_path);
+ }
+ return;
+ }
+
+ switch (wc->opcode) {
+ case IB_WC_RECV_RDMA_WITH_IMM:
+ /*
+ * post_recv() RDMA write completions of IO reqs (read/write)
+ * and hb
+ */
+ if (WARN_ON(wc->wr_cqe != &io_comp_cqe))
+ return;
+ err = rtrs_post_recv_empty(&con->c, &io_comp_cqe);
+ if (err) {
+ rtrs_err(s, "rtrs_post_recv(), err: %d\n", err);
+ close_path(srv_path);
+ break;
+ }
+ rtrs_from_imm(be32_to_cpu(wc->ex.imm_data),
+ &imm_type, &imm_payload);
+ if (imm_type == RTRS_IO_REQ_IMM) {
+ u32 msg_id, off;
+ void *data;
+
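+ /*
+ * The immediate payload carries the chunk id in the upper bits
+ * and the offset within the chunk in the lower mem_bits bits
+ * (the encoding mirrors map_cont_bufs()).
+ */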
+ msg_id = imm_payload >> srv_path->mem_bits;
+ off = imm_payload & ((1 << srv_path->mem_bits) - 1);
+ if (msg_id >= srv->queue_depth || off >= max_chunk_size) {
+ rtrs_err(s, "Wrong msg_id %u, off %u\n",
+ msg_id, off);
+ close_path(srv_path);
+ return;
+ }
+ if (always_invalidate) {
+ struct rtrs_srv_mr *mr = &srv_path->mrs[msg_id];
+
+ mr->msg_off = off;
+ mr->msg_id = msg_id;
+ err = rtrs_srv_inv_rkey(con, mr);
+ if (err) {
+ rtrs_err(s, "rtrs_post_recv(), err: %d\n",
+ err);
+ close_path(srv_path);
+ break;
+ }
+ } else {
+ data = page_address(srv->chunks[msg_id]) + off;
+ process_io_req(con, data, msg_id, off);
+ }
+ } else if (imm_type == RTRS_HB_MSG_IMM) {
+ WARN_ON(con->c.cid);
+ rtrs_send_hb_ack(&srv_path->s);
+ } else if (imm_type == RTRS_HB_ACK_IMM) {
+ WARN_ON(con->c.cid);
+ srv_path->s.hb_missed_cnt = 0;
+ } else {
+ rtrs_wrn(s, "Unknown IMM type %u\n", imm_type);
+ }
+ break;
+ case IB_WC_RDMA_WRITE:
+ case IB_WC_SEND:
+ /*
+ * post_send() RDMA write completions of IO reqs (read/write)
+ * and hb.
+ */
+ atomic_add(s->signal_interval, &con->c.sq_wr_avail);
+
+ if (!list_empty_careful(&con->rsp_wr_wait_list))
+ rtrs_rdma_process_wr_wait_list(con);
+
+ break;
+ default:
+ rtrs_wrn(s, "Unexpected WC type: %d\n", wc->opcode);
+ return;
+ }
+}
+
+/**
+ * rtrs_srv_get_path_name() - Get the name of a connected path.
+ * @srv: Session
+ * @pathname: Pathname buffer
+ * @len: Length of pathname buffer
+ */
+int rtrs_srv_get_path_name(struct rtrs_srv_sess *srv, char *pathname,
+ size_t len)
+{
+ struct rtrs_srv_path *srv_path;
+ int err = -ENOTCONN;
+
+ mutex_lock(&srv->paths_mutex);
+ list_for_each_entry(srv_path, &srv->paths_list, s.entry) {
+ if (srv_path->state != RTRS_SRV_CONNECTED)
+ continue;
+ strscpy(pathname, srv_path->s.sessname,
+ min_t(size_t, sizeof(srv_path->s.sessname), len));
+ err = 0;
+ break;
+ }
+ mutex_unlock(&srv->paths_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL(rtrs_srv_get_path_name);
+
+/**
+ * rtrs_srv_get_queue_depth() - Get rtrs_srv qdepth.
+ * @srv: Session
+ */
+int rtrs_srv_get_queue_depth(struct rtrs_srv_sess *srv)
+{
+ return srv->queue_depth;
+}
+EXPORT_SYMBOL(rtrs_srv_get_queue_depth);
+
+static int find_next_bit_ring(struct rtrs_srv_path *srv_path)
+{
+ struct ib_device *ib_dev = srv_path->s.dev->ib_dev;
+ int v;
+
+ v = cpumask_next(srv_path->cur_cq_vector, &cq_affinity_mask);
+ if (v >= nr_cpu_ids || v >= ib_dev->num_comp_vectors)
+ v = cpumask_first(&cq_affinity_mask);
+ return v;
+}
+
+static int rtrs_srv_get_next_cq_vector(struct rtrs_srv_path *srv_path)
+{
+ srv_path->cur_cq_vector = find_next_bit_ring(srv_path);
+
+ return srv_path->cur_cq_vector;
+}
+
+static void rtrs_srv_dev_release(struct device *dev)
+{
+ struct rtrs_srv_sess *srv = container_of(dev, struct rtrs_srv_sess,
+ dev);
+
+ kfree(srv);
+}
+
+static void free_srv(struct rtrs_srv_sess *srv)
+{
+ int i;
+
+ WARN_ON(refcount_read(&srv->refcount));
+ for (i = 0; i < srv->queue_depth; i++)
+ __free_pages(srv->chunks[i], get_order(max_chunk_size));
+ kfree(srv->chunks);
+ mutex_destroy(&srv->paths_mutex);
+ mutex_destroy(&srv->paths_ev_mutex);
+ /* last put to release the srv structure */
+ put_device(&srv->dev);
+}
+
+static struct rtrs_srv_sess *get_or_create_srv(struct rtrs_srv_ctx *ctx,
+ const uuid_t *paths_uuid,
+ bool first_conn)
+{
+ struct rtrs_srv_sess *srv;
+ int i;
+
+ mutex_lock(&ctx->srv_mutex);
+ list_for_each_entry(srv, &ctx->srv_list, ctx_list) {
+ if (uuid_equal(&srv->paths_uuid, paths_uuid) &&
+ refcount_inc_not_zero(&srv->refcount)) {
+ mutex_unlock(&ctx->srv_mutex);
+ return srv;
+ }
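+ /*
+ * Build the WR chain: optional send-with-invalidate of the client
+ * rkey, optional re-registration of the server MR (when
+ * always_invalidate), and finally the response carrying the
+ * immediate data.
+ */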
+ }
+ mutex_unlock(&ctx->srv_mutex);
+ /*
+ * If this request is not the first connection request from the
+ * client for this session then fail and return error.
+ */
+ if (!first_conn) {
+ pr_err_ratelimited("Error: Not the first connection request for this session\n");
+ return ERR_PTR(-ENXIO);
+ }
+
+ /* need to allocate a new srv */
+ srv = kzalloc(sizeof(*srv), GFP_KERNEL);
+ if (!srv)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&srv->paths_list);
+ mutex_init(&srv->paths_mutex);
+ mutex_init(&srv->paths_ev_mutex);
+ uuid_copy(&srv->paths_uuid, paths_uuid);
+ srv->queue_depth = sess_queue_depth;
+ srv->ctx = ctx;
+ device_initialize(&srv->dev);
+ srv->dev.release = rtrs_srv_dev_release;
+
+ srv->chunks = kcalloc(srv->queue_depth, sizeof(*srv->chunks),
+ GFP_KERNEL);
+ if (!srv->chunks)
+ goto err_free_srv;
+
+ for (i = 0; i < srv->queue_depth; i++) {
+ srv->chunks[i] = alloc_pages(GFP_KERNEL,
+ get_order(max_chunk_size));
+ if (!srv->chunks[i])
+ goto err_free_chunks;
+ }
+ refcount_set(&srv->refcount, 1);
+ mutex_lock(&ctx->srv_mutex);
+ list_add(&srv->ctx_list, &ctx->srv_list);
+ mutex_unlock(&ctx->srv_mutex);
+
+ return srv;
+
+err_free_chunks:
+ while (i--)
+ __free_pages(srv->chunks[i], get_order(max_chunk_size));
+ kfree(srv->chunks);
+
+err_free_srv:
+ kfree(srv);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void put_srv(struct rtrs_srv_sess *srv)
+{
+ if (refcount_dec_and_test(&srv->refcount)) {
+ struct rtrs_srv_ctx *ctx = srv->ctx;
+
+ WARN_ON(srv->dev.kobj.state_in_sysfs);
+
+ mutex_lock(&ctx->srv_mutex);
+ list_del(&srv->ctx_list);
+ mutex_unlock(&ctx->srv_mutex);
+ free_srv(srv);
+ }
+}
+
+static void __add_path_to_srv(struct rtrs_srv_sess *srv,
+ struct rtrs_srv_path *srv_path)
+{
+ list_add_tail(&srv_path->s.entry, &srv->paths_list);
+ srv->paths_num++;
+ WARN_ON(srv->paths_num >= MAX_PATHS_NUM);
+}
+
+static void del_path_from_srv(struct rtrs_srv_path *srv_path)
+{
+ struct rtrs_srv_sess *srv = srv_path->srv;
+
+ if (WARN_ON(!srv))
+ return;
+
+ mutex_lock(&srv->paths_mutex);
+ list_del(&srv_path->s.entry);
+ WARN_ON(!srv->paths_num);
+ srv->paths_num--;
+ mutex_unlock(&srv->paths_mutex);
+}
+
+/*
+ * Return 0 if the addresses match (callers treat 0 as equal), non-zero
+ * otherwise, or -ENOENT for an unsupported address family.
+ */
+static int sockaddr_cmp(const struct sockaddr *a, const struct sockaddr *b)
+{
+ switch (a->sa_family) {
+ case AF_IB:
+ return memcmp(&((struct sockaddr_ib *)a)->sib_addr,
+ &((struct sockaddr_ib *)b)->sib_addr,
+ sizeof(struct ib_addr)) &&
+ (b->sa_family == AF_IB);
+ case AF_INET:
+ return memcmp(&((struct sockaddr_in *)a)->sin_addr,
+ &((struct sockaddr_in *)b)->sin_addr,
+ sizeof(struct in_addr)) &&
+ (b->sa_family == AF_INET);
+ case AF_INET6:
+ return memcmp(&((struct sockaddr_in6 *)a)->sin6_addr,
+ &((struct sockaddr_in6 *)b)->sin6_addr,
+ sizeof(struct in6_addr)) &&
+ (b->sa_family == AF_INET6);
+ default:
+ return -ENOENT;
+ }
+}
+
+static bool __is_path_w_addr_exists(struct rtrs_srv_sess *srv,
+ struct rdma_addr *addr)
+{
+ struct rtrs_srv_path *srv_path;
+
+ list_for_each_entry(srv_path, &srv->paths_list, s.entry)
+ if (!sockaddr_cmp((struct sockaddr *)&srv_path->s.dst_addr,
+ (struct sockaddr *)&addr->dst_addr) &&
+ !sockaddr_cmp((struct sockaddr *)&srv_path->s.src_addr,
+ (struct sockaddr *)&addr->src_addr))
+ return true;
+
+ return false;
+}
+
+static void free_path(struct rtrs_srv_path *srv_path)
+{
+ if (srv_path->kobj.state_in_sysfs) {
+ kobject_del(&srv_path->kobj);
+ kobject_put(&srv_path->kobj);
+ } else {
+ free_percpu(srv_path->stats->rdma_stats);
+ kfree(srv_path->stats);
+ kfree(srv_path);
+ }
+}
+
+static void rtrs_srv_close_work(struct work_struct *work)
+{
+ struct rtrs_srv_path *srv_path;
+ struct rtrs_srv_con *con;
+ int i;
+
+ srv_path = container_of(work, typeof(*srv_path), close_work);
+
+ rtrs_srv_destroy_path_files(srv_path);
+ rtrs_srv_stop_hb(srv_path);
+
+ for (i = 0; i < srv_path->s.con_num; i++) {
+ if (!srv_path->s.con[i])
+ continue;
+ con = to_srv_con(srv_path->s.con[i]);
+ rdma_disconnect(con->c.cm_id);
+ ib_drain_qp(con->c.qp);
+ }
+
+ /*
+ * Degrade ref count to the usual model with a single shared
+ * atomic_t counter
+ */
+ percpu_ref_kill(&srv_path->ids_inflight_ref);
+
+ /* Wait for all inflight requests to complete */
+ wait_for_completion(&srv_path->complete_done);
+
+ /* Notify upper layer if we are the last path */
+ rtrs_srv_path_down(srv_path);
+
+ unmap_cont_bufs(srv_path);
+ rtrs_srv_free_ops_ids(srv_path);
+
+ for (i = 0; i < srv_path->s.con_num; i++) {
+ if (!srv_path->s.con[i])
+ continue;
+ con = to_srv_con(srv_path->s.con[i]);
+ rtrs_cq_qp_destroy(&con->c);
+ rdma_destroy_id(con->c.cm_id);
+ kfree(con);
+ }
+ rtrs_ib_dev_put(srv_path->s.dev);
+
+ del_path_from_srv(srv_path);
+ put_srv(srv_path->srv);
+ srv_path->srv = NULL;
+ rtrs_srv_change_state(srv_path, RTRS_SRV_CLOSED);
+
+ kfree(srv_path->dma_addr);
+ kfree(srv_path->s.con);
+ free_path(srv_path);
+}
+
+static int rtrs_rdma_do_accept(struct rtrs_srv_path *srv_path,
+ struct rdma_cm_id *cm_id)
+{
+ struct rtrs_srv_sess *srv = srv_path->srv;
+ struct rtrs_msg_conn_rsp msg;
+ struct rdma_conn_param param;
+ int err;
+
+ param = (struct rdma_conn_param) {
+ .rnr_retry_count = 7,
+ .private_data = &msg,
+ .private_data_len = sizeof(msg),
+ };
+
+ msg = (struct rtrs_msg_conn_rsp) {
+ .magic = cpu_to_le16(RTRS_MAGIC),
+ .version = cpu_to_le16(RTRS_PROTO_VER),
+ .queue_depth = cpu_to_le16(srv->queue_depth),
+ .max_io_size = cpu_to_le32(max_chunk_size - MAX_HDR_SIZE),
+ .max_hdr_size = cpu_to_le32(MAX_HDR_SIZE),
+ };
+
+ if (always_invalidate)
+ msg.flags = cpu_to_le32(RTRS_MSG_NEW_RKEY_F);
+
+ err = rdma_accept(cm_id, &param);
+ if (err)
+ pr_err("rdma_accept(), err: %d\n", err);
+
+ return err;
+}
+
+static int rtrs_rdma_do_reject(struct rdma_cm_id *cm_id, int errno)
+{
+ struct rtrs_msg_conn_rsp msg;
+ int err;
+
+ msg = (struct rtrs_msg_conn_rsp) {
+ .magic = cpu_to_le16(RTRS_MAGIC),
+ .version = cpu_to_le16(RTRS_PROTO_VER),
+ .errno = cpu_to_le16(errno),
+ };
+
+ err = rdma_reject(cm_id, &msg, sizeof(msg), IB_CM_REJ_CONSUMER_DEFINED);
+ if (err)
+ pr_err("rdma_reject(), err: %d\n", err);
+
+ /* Bounce errno back */
+ return errno;
+}
+
+static struct rtrs_srv_path *
+__find_path(struct rtrs_srv_sess *srv, const uuid_t *sess_uuid)
+{
+ struct rtrs_srv_path *srv_path;
+
+ list_for_each_entry(srv_path, &srv->paths_list, s.entry) {
+ if (uuid_equal(&srv_path->s.uuid, sess_uuid))
+ return srv_path;
+ }
+
+ return NULL;
+}
+
+static int create_con(struct rtrs_srv_path *srv_path,
+ struct rdma_cm_id *cm_id,
+ unsigned int cid)
+{
+ struct rtrs_srv_sess *srv = srv_path->srv;
+ struct rtrs_path *s = &srv_path->s;
+ struct rtrs_srv_con *con;
+
+ u32 cq_num, max_send_wr, max_recv_wr, wr_limit;
+ int err, cq_vector;
+
+ con = kzalloc(sizeof(*con), GFP_KERNEL);
+ if (!con) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ spin_lock_init(&con->rsp_wr_wait_lock);
+ INIT_LIST_HEAD(&con->rsp_wr_wait_list);
+ con->c.cm_id = cm_id;
+ con->c.path = &srv_path->s;
+ con->c.cid = cid;
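+ /*
+ * wr_cnt drives periodic signalled sends: every signal_interval-th
+ * WR is posted with IB_SEND_SIGNALED so the send queue is drained.
+ */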
+ atomic_set(&con->c.wr_cnt, 1);
+ wr_limit = srv_path->s.dev->ib_dev->attrs.max_qp_wr;
+
+ if (con->c.cid == 0) {
+ /*
+ * All receive and all send (each requiring invalidate)
+ * + 2 for drain and heartbeat
+ */
+ max_send_wr = min_t(int, wr_limit,
+ SERVICE_CON_QUEUE_DEPTH * 2 + 2);
+ max_recv_wr = max_send_wr;
+ s->signal_interval = min_not_zero(srv->queue_depth,
+ (size_t)SERVICE_CON_QUEUE_DEPTH);
+ } else {
+ /* when always_invalidate is enabled, we need linv+rinv+mr+imm */
+ if (always_invalidate)
+ max_send_wr =
+ min_t(int, wr_limit,
+ srv->queue_depth * (1 + 4) + 1);
+ else
+ max_send_wr =
+ min_t(int, wr_limit,
+ srv->queue_depth * (1 + 2) + 1);
+
+ max_recv_wr = srv->queue_depth + 1;
+ /*
+ * Worst case: all receive requests and all write requests are
+ * posted, and each read request additionally needs an invalidate
+ * request plus the drain WR; without enough send WRs the QP gets
+ * into the error state.
+ */
+ }
+ cq_num = max_send_wr + max_recv_wr;
+ atomic_set(&con->c.sq_wr_avail, max_send_wr);
+ cq_vector = rtrs_srv_get_next_cq_vector(srv_path);
+
+ /* TODO: SOFTIRQ can be faster, but be careful with softirq context */
+ err = rtrs_cq_qp_create(&srv_path->s, &con->c, 1, cq_vector, cq_num,
+ max_send_wr, max_recv_wr,
+ IB_POLL_WORKQUEUE);
+ if (err) {
+ rtrs_err(s, "rtrs_cq_qp_create(), err: %d\n", err);
+ goto free_con;
+ }
+ if (con->c.cid == 0) {
+ err = post_recv_info_req(con);
+ if (err)
+ goto free_cqqp;
+ }
+ WARN_ON(srv_path->s.con[cid]);
+ srv_path->s.con[cid] = &con->c;
+
+ /*
+ * Change context from server to current connection. The other
+ * way is to use cm_id->qp->qp_context, which does not work on OFED.
+ */
+ cm_id->context = &con->c;
+
+ return 0;
+
+free_cqqp:
+ rtrs_cq_qp_destroy(&con->c);
+free_con:
+ kfree(con);
+
+err:
+ return err;
+}
+
+static struct rtrs_srv_path *__alloc_path(struct rtrs_srv_sess *srv,
+ struct rdma_cm_id *cm_id,
+ unsigned int con_num,
+ unsigned int recon_cnt,
+ const uuid_t *uuid)
+{
+ struct rtrs_srv_path *srv_path;
+ int err = -ENOMEM;
+ char str[NAME_MAX];
+ struct rtrs_addr path;
+
+ if (srv->paths_num >= MAX_PATHS_NUM) {
+ err = -ECONNRESET;
+ goto err;
+ }
+ if (__is_path_w_addr_exists(srv, &cm_id->route.addr)) {
+ err = -EEXIST;
+ pr_err("Path with same addr exists\n");
+ goto err;
+ }
+ srv_path = kzalloc(sizeof(*srv_path), GFP_KERNEL);
+ if (!srv_path)
+ goto err;
+
+ srv_path->stats = kzalloc(sizeof(*srv_path->stats), GFP_KERNEL);
+ if (!srv_path->stats)
+ goto err_free_sess;
+
+ srv_path->stats->rdma_stats = alloc_percpu(struct rtrs_srv_stats_rdma_stats);
+ if (!srv_path->stats->rdma_stats)
+ goto err_free_stats;
+
+ srv_path->stats->srv_path = srv_path;
+
+ srv_path->dma_addr = kcalloc(srv->queue_depth,
+ sizeof(*srv_path->dma_addr),
+ GFP_KERNEL);
+ if (!srv_path->dma_addr)
+ goto err_free_percpu;
+
+ srv_path->s.con = kcalloc(con_num, sizeof(*srv_path->s.con),
+ GFP_KERNEL);
+ if (!srv_path->s.con)
+ goto err_free_dma_addr;
+
+ srv_path->state = RTRS_SRV_CONNECTING;
+ srv_path->srv = srv;
+ srv_path->cur_cq_vector = -1;
+ srv_path->s.dst_addr = cm_id->route.addr.dst_addr;
+ srv_path->s.src_addr = cm_id->route.addr.src_addr;
+
+ /* temporary until receiving session-name from client */
+ path.src = &srv_path->s.src_addr;
+ path.dst = &srv_path->s.dst_addr;
+ rtrs_addr_to_str(&path, str, sizeof(str));
+ strscpy(srv_path->s.sessname, str, sizeof(srv_path->s.sessname));
+
+ srv_path->s.con_num = con_num;
+ srv_path->s.irq_con_num = con_num;
+ srv_path->s.recon_cnt = recon_cnt;
+ uuid_copy(&srv_path->s.uuid, uuid);
+ spin_lock_init(&srv_path->state_lock);
+ INIT_WORK(&srv_path->close_work, rtrs_srv_close_work);
+ rtrs_srv_init_hb(srv_path);
+
+ srv_path->s.dev = rtrs_ib_dev_find_or_add(cm_id->device, &dev_pd);
+ if (!srv_path->s.dev) {
+ err = -ENOMEM;
+ goto err_free_con;
+ }
+ err = map_cont_bufs(srv_path);
+ if (err)
+ goto err_put_dev;
+
+ err = rtrs_srv_alloc_ops_ids(srv_path);
+ if (err)
+ goto err_unmap_bufs;
+
+ __add_path_to_srv(srv, srv_path);
+
+ return srv_path;
+
+err_unmap_bufs:
+ unmap_cont_bufs(srv_path);
+err_put_dev:
+ rtrs_ib_dev_put(srv_path->s.dev);
+err_free_con:
+ kfree(srv_path->s.con);
+err_free_dma_addr:
+ kfree(srv_path->dma_addr);
+err_free_percpu:
+ free_percpu(srv_path->stats->rdma_stats);
+err_free_stats:
+ kfree(srv_path->stats);
+err_free_sess:
+ kfree(srv_path);
+err:
+ return ERR_PTR(err);
+}
+
+static int rtrs_rdma_connect(struct rdma_cm_id *cm_id,
+ const struct rtrs_msg_conn_req *msg,
+ size_t len)
+{
+ struct rtrs_srv_ctx *ctx = cm_id->context;
+ struct rtrs_srv_path *srv_path;
+ struct rtrs_srv_sess *srv;
+
+ u16 version, con_num, cid;
+ u16 recon_cnt;
+ int err = -ECONNRESET;
+
+ if (len < sizeof(*msg)) {
+ pr_err("Invalid RTRS connection request\n");
+ goto reject_w_err;
+ }
+ if (le16_to_cpu(msg->magic) != RTRS_MAGIC) {
+ pr_err("Invalid RTRS magic\n");
+ goto reject_w_err;
+ }
+ version = le16_to_cpu(msg->version);
+ if (version >> 8 != RTRS_PROTO_VER_MAJOR) {
+ pr_err("Unsupported major RTRS version: %d, expected %d\n",
+ version >> 8, RTRS_PROTO_VER_MAJOR);
+ goto reject_w_err;
+ }
+ con_num = le16_to_cpu(msg->cid_num);
+ if (con_num > 4096) {
+ /* Sanity check */
+ pr_err("Too many connections requested: %d\n", con_num);
+ goto reject_w_err;
+ }
+ cid = le16_to_cpu(msg->cid);
+ if (cid >= con_num) {
+ /* Sanity check */
+ pr_err("Incorrect cid: %d >= %d\n", cid, con_num);
+ goto reject_w_err;
+ }
+ recon_cnt = le16_to_cpu(msg->recon_cnt);
+ srv = get_or_create_srv(ctx, &msg->paths_uuid, msg->first_conn);
+ if (IS_ERR(srv)) {
+ err = PTR_ERR(srv);
+ pr_err("get_or_create_srv(), error %d\n", err);
+ goto reject_w_err;
+ }
+ mutex_lock(&srv->paths_mutex);
+ srv_path = __find_path(srv, &msg->sess_uuid);
+ if (srv_path) {
+ struct rtrs_path *s = &srv_path->s;
+
+ /* Session already holds a reference */
+ put_srv(srv);
+
+ if (srv_path->state != RTRS_SRV_CONNECTING) {
+ rtrs_err(s, "Session in wrong state: %s\n",
+ rtrs_srv_state_str(srv_path->state));
+ mutex_unlock(&srv->paths_mutex);
+ goto reject_w_err;
+ }
+ /*
+ * Sanity checks
+ */
+ if (con_num != s->con_num || cid >= s->con_num) {
+ rtrs_err(s, "Incorrect request: %d, %d\n",
+ cid, con_num);
+ mutex_unlock(&srv->paths_mutex);
+ goto reject_w_err;
+ }
+ if (s->con[cid]) {
+ rtrs_err(s, "Connection already exists: %d\n",
+ cid);
+ mutex_unlock(&srv->paths_mutex);
+ goto reject_w_err;
+ }
+ } else {
+ srv_path = __alloc_path(srv, cm_id, con_num, recon_cnt,
+ &msg->sess_uuid);
+ if (IS_ERR(srv_path)) {
+ mutex_unlock(&srv->paths_mutex);
+ put_srv(srv);
+ err = PTR_ERR(srv_path);
+ pr_err("RTRS server session allocation failed: %d\n", err);
+ goto reject_w_err;
+ }
+ }
+ err = create_con(srv_path, cm_id, cid);
+ if (err) {
+ rtrs_err((&srv_path->s), "create_con(), error %d\n", err);
+ rtrs_rdma_do_reject(cm_id, err);
+ /*
+ * Since the session has other connections we follow the normal
+ * way through the workqueue, but still return an error to tell
+ * cma.c to call rdma_destroy_id() for the current connection.
+ */
+ goto close_and_return_err;
+ }
+ err = rtrs_rdma_do_accept(srv_path, cm_id);
+ if (err) {
+ rtrs_err((&srv_path->s), "rtrs_rdma_do_accept(), error %d\n", err);
+ rtrs_rdma_do_reject(cm_id, err);
+ /*
+ * Since the current connection was successfully added to the
+ * session, we follow the normal way through the workqueue to
+ * close the session, and return 0 to tell cma.c that we call
+ * rdma_destroy_id() ourselves.
+ */
+ err = 0;
+ goto close_and_return_err;
+ }
+ mutex_unlock(&srv->paths_mutex);
+
+ return 0;
+
+reject_w_err:
+ return rtrs_rdma_do_reject(cm_id, err);
+
+close_and_return_err:
+ mutex_unlock(&srv->paths_mutex);
+ close_path(srv_path);
+
+ return err;
+}
+
+static int rtrs_srv_rdma_cm_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *ev)
+{
+ struct rtrs_srv_path *srv_path = NULL;
+ struct rtrs_path *s = NULL;
+
+ if (ev->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
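+ /*
+ * For anything but a new connect request, cm_id->context was
+ * repointed to the rtrs_con in create_con().
+ */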
+ struct rtrs_con *c = cm_id->context;
+
+ s = c->path;
+ srv_path = to_srv_path(s);
+ }
+
+ switch (ev->event) {
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ /*
+ * In case of error cma.c will destroy cm_id,
+ * see cma_process_remove()
+ */
+ return rtrs_rdma_connect(cm_id, ev->param.conn.private_data,
+ ev->param.conn.private_data_len);
+ case RDMA_CM_EVENT_ESTABLISHED:
+ /* Nothing here */
+ break;
+ case RDMA_CM_EVENT_REJECTED:
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ case RDMA_CM_EVENT_UNREACHABLE:
+ rtrs_err(s, "CM error (CM event: %s, err: %d)\n",
+ rdma_event_msg(ev->event), ev->status);
+ fallthrough;
+ case RDMA_CM_EVENT_DISCONNECTED:
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ close_path(srv_path);
+ break;
+ default:
+ pr_err("Ignoring unexpected CM event %s, err %d\n",
+ rdma_event_msg(ev->event), ev->status);
+ break;
+ }
+
+ return 0;
+}
+
+static struct rdma_cm_id *rtrs_srv_cm_init(struct rtrs_srv_ctx *ctx,
+ struct sockaddr *addr,
+ enum rdma_ucm_port_space ps)
+{
+ struct rdma_cm_id *cm_id;
+ int ret;
+
+ cm_id = rdma_create_id(&init_net, rtrs_srv_rdma_cm_handler,
+ ctx, ps, IB_QPT_RC);
+ if (IS_ERR(cm_id)) {
+ ret = PTR_ERR(cm_id);
+ pr_err("Creating id for RDMA connection failed, err: %d\n",
+ ret);
+ goto err_out;
+ }
+ ret = rdma_bind_addr(cm_id, addr);
+ if (ret) {
+ pr_err("Binding RDMA address failed, err: %d\n", ret);
+ goto err_cm;
+ }
+ ret = rdma_listen(cm_id, 64);
+ if (ret) {
+ pr_err("Listening on RDMA connection failed, err: %d\n",
+ ret);
+ goto err_cm;
+ }
+
+ return cm_id;
+
+err_cm:
+ rdma_destroy_id(cm_id);
+err_out:
+
+ return ERR_PTR(ret);
+}
+
+static int rtrs_srv_rdma_init(struct rtrs_srv_ctx *ctx, u16 port)
+{
+ struct sockaddr_in6 sin = {
+ .sin6_family = AF_INET6,
+ .sin6_addr = IN6ADDR_ANY_INIT,
+ .sin6_port = htons(port),
+ };
+ struct sockaddr_ib sib = {
+ .sib_family = AF_IB,
+ .sib_sid = cpu_to_be64(RDMA_IB_IP_PS_IB | port),
+ .sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL),
+ .sib_pkey = cpu_to_be16(0xffff),
+ };
+ struct rdma_cm_id *cm_ip, *cm_ib;
+ int ret;
+
+ /*
+ * We accept both IPoIB and IB connections, so we need to keep
+ * two cm id's, one for each socket type and port space.
+ * If the cm initialization of one of the id's fails, we abort
+ * everything.
+ */
+ cm_ip = rtrs_srv_cm_init(ctx, (struct sockaddr *)&sin, RDMA_PS_TCP);
+ if (IS_ERR(cm_ip))
+ return PTR_ERR(cm_ip);
+
+ cm_ib = rtrs_srv_cm_init(ctx, (struct sockaddr *)&sib, RDMA_PS_IB);
+ if (IS_ERR(cm_ib)) {
+ ret = PTR_ERR(cm_ib);
+ goto free_cm_ip;
+ }
+
+ ctx->cm_id_ip = cm_ip;
+ ctx->cm_id_ib = cm_ib;
+
+ return 0;
+
+free_cm_ip:
+ rdma_destroy_id(cm_ip);
+
+ return ret;
+}
+
+static struct rtrs_srv_ctx *alloc_srv_ctx(struct rtrs_srv_ops *ops)
+{
+ struct rtrs_srv_ctx *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ ctx->ops = *ops;
+ mutex_init(&ctx->srv_mutex);
+ INIT_LIST_HEAD(&ctx->srv_list);
+
+ return ctx;
+}
+
+static void free_srv_ctx(struct rtrs_srv_ctx *ctx)
+{
+ WARN_ON(!list_empty(&ctx->srv_list));
+ mutex_destroy(&ctx->srv_mutex);
+ kfree(ctx);
+}
+
+static int rtrs_srv_add_one(struct ib_device *device)
+{
+ struct rtrs_srv_ctx *ctx;
+ int ret = 0;
+
+ mutex_lock(&ib_ctx.ib_dev_mutex);
+ if (ib_ctx.ib_dev_count)
+ goto out;
+
+ /*
+ * Since our CM IDs are NOT bound to any ib device we will create them
+ * only once
+ */
+ ctx = ib_ctx.srv_ctx;
+ ret = rtrs_srv_rdma_init(ctx, ib_ctx.port);
+ if (ret) {
+ /*
+ * We errored out here.
+ * According to the ib code, if we encounter an error here then the
+ * error code is ignored, and no more calls to our ops are made.
+ */
+ pr_err("Failed to initialize RDMA connection");
+ goto err_out;
+ }
+
+out:
+ /*
+ * Keep track of the number of ib devices added
+ */
+ ib_ctx.ib_dev_count++;
+
+err_out:
+ mutex_unlock(&ib_ctx.ib_dev_mutex);
+ return ret;
+}
+
+static void rtrs_srv_remove_one(struct ib_device *device, void *client_data)
+{
+ struct rtrs_srv_ctx *ctx;
+
+ mutex_lock(&ib_ctx.ib_dev_mutex);
+ ib_ctx.ib_dev_count--;
+
+ if (ib_ctx.ib_dev_count)
+ goto out;
+
+ /*
+ * Since our CM IDs are NOT bound to any ib device we will remove them
+ * only once, when the last device is removed
+ */
+ ctx = ib_ctx.srv_ctx;
+ rdma_destroy_id(ctx->cm_id_ip);
+ rdma_destroy_id(ctx->cm_id_ib);
+
+out:
+ mutex_unlock(&ib_ctx.ib_dev_mutex);
+}
+
+static struct ib_client rtrs_srv_client = {
+ .name = "rtrs_server",
+ .add = rtrs_srv_add_one,
+ .remove = rtrs_srv_remove_one
+};
+
+/**
+ * rtrs_srv_open() - open RTRS server context
+ * @ops: callback functions
+ * @port: port to listen on
+ *
+ * Creates server context with specified callbacks.
+ *
+ * Return: a valid pointer on success, otherwise an ERR_PTR()-encoded error.
+ */
+struct rtrs_srv_ctx *rtrs_srv_open(struct rtrs_srv_ops *ops, u16 port)
+{
+ struct rtrs_srv_ctx *ctx;
+ int err;
+
+ ctx = alloc_srv_ctx(ops);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&ib_ctx.ib_dev_mutex);
+ ib_ctx.srv_ctx = ctx;
+ ib_ctx.port = port;
+
+ err = ib_register_client(&rtrs_srv_client);
+ if (err) {
+ free_srv_ctx(ctx);
+ return ERR_PTR(err);
+ }
+
+ return ctx;
+}
+EXPORT_SYMBOL(rtrs_srv_open);
+
+static void close_paths(struct rtrs_srv_sess *srv)
+{
+ struct rtrs_srv_path *srv_path;
+
+ mutex_lock(&srv->paths_mutex);
+ list_for_each_entry(srv_path, &srv->paths_list, s.entry)
+ close_path(srv_path);
+ mutex_unlock(&srv->paths_mutex);
+}
+
+static void close_ctx(struct rtrs_srv_ctx *ctx)
+{
+ struct rtrs_srv_sess *srv;
+
+ mutex_lock(&ctx->srv_mutex);
+ list_for_each_entry(srv, &ctx->srv_list, ctx_list)
+ close_paths(srv);
+ mutex_unlock(&ctx->srv_mutex);
+ flush_workqueue(rtrs_wq);
+}
+
+/**
+ * rtrs_srv_close() - close RTRS server context
+ * @ctx: pointer to server context
+ *
+ * Closes RTRS server context with all client sessions.
+ */
+void rtrs_srv_close(struct rtrs_srv_ctx *ctx)
+{
+ ib_unregister_client(&rtrs_srv_client);
+ mutex_destroy(&ib_ctx.ib_dev_mutex);
+ close_ctx(ctx);
+ free_srv_ctx(ctx);
+}
+EXPORT_SYMBOL(rtrs_srv_close);
+
+static int check_module_params(void)
+{
+ if (sess_queue_depth < 1 || sess_queue_depth > MAX_SESS_QUEUE_DEPTH) {
+ pr_err("Invalid sess_queue_depth value %d, has to be >= %d, <= %d.\n",
+ sess_queue_depth, 1, MAX_SESS_QUEUE_DEPTH);
+ return -EINVAL;
+ }
+ if (max_chunk_size < MIN_CHUNK_SIZE || !is_power_of_2(max_chunk_size)) {
+ pr_err("Invalid max_chunk_size value %d, has to be >= %d and should be power of two.\n",
+ max_chunk_size, MIN_CHUNK_SIZE);
+ return -EINVAL;
+ }
+
+ /*
+ * Check if IB immediate data size is enough to hold the mem_id and the
+ * offset inside the memory chunk
+ */
+ if ((ilog2(sess_queue_depth - 1) + 1) +
+ (ilog2(max_chunk_size - 1) + 1) > MAX_IMM_PAYL_BITS) {
+ pr_err("RDMA immediate size (%db) not enough to encode %d buffers of size %dB. Reduce 'sess_queue_depth' or 'max_chunk_size' parameters.\n",
+ MAX_IMM_PAYL_BITS, sess_queue_depth, max_chunk_size);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __init rtrs_server_init(void)
+{
+ int err;
+
+ pr_info("Loading module %s, proto %s: (max_chunk_size: %d (pure IO %ld, headers %ld) , sess_queue_depth: %d, always_invalidate: %d)\n",
+ KBUILD_MODNAME, RTRS_PROTO_VER_STRING,
+ max_chunk_size, max_chunk_size - MAX_HDR_SIZE, MAX_HDR_SIZE,
+ sess_queue_depth, always_invalidate);
+
+ rtrs_rdma_dev_pd_init(0, &dev_pd);
+
+ err = check_module_params();
+ if (err) {
+ pr_err("Failed to load module, invalid module parameters, err: %d\n",
+ err);
+ return err;
+ }
+ rtrs_dev_class = class_create(THIS_MODULE, "rtrs-server");
+ if (IS_ERR(rtrs_dev_class)) {
+ err = PTR_ERR(rtrs_dev_class);
+ goto out_err;
+ }
+ rtrs_wq = alloc_workqueue("rtrs_server_wq", 0, 0);
+ if (!rtrs_wq) {
+ err = -ENOMEM;
+ goto out_dev_class;
+ }
+
+ return 0;
+
+out_dev_class:
+ class_destroy(rtrs_dev_class);
+out_err:
+ return err;
+}
+
+static void __exit rtrs_server_exit(void)
+{
+ destroy_workqueue(rtrs_wq);
+ class_destroy(rtrs_dev_class);
+ rtrs_rdma_dev_pd_deinit(&dev_pd);
+}
+
+module_init(rtrs_server_init);
+module_exit(rtrs_server_exit);
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.h b/drivers/infiniband/ulp/rtrs/rtrs-srv.h
new file mode 100644
index 000000000000..2f8a638e36fa
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.h
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * RDMA Transport Layer
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+
+#ifndef RTRS_SRV_H
+#define RTRS_SRV_H
+
+#include <linux/device.h>
+#include <linux/refcount.h>
+#include <linux/percpu.h>
+#include "rtrs-pri.h"
+
+/*
+ * enum rtrs_srv_state - Server states.
+ */
+enum rtrs_srv_state {
+ RTRS_SRV_CONNECTING,
+ RTRS_SRV_CONNECTED,
+ RTRS_SRV_CLOSING,
+ RTRS_SRV_CLOSED,
+};
+
+/* stats for Read and write operation.
+ * see Documentation/ABI/testing/sysfs-class-rtrs-server for details
+ */
+struct rtrs_srv_stats_rdma_stats {
+ struct {
+ u64 cnt;
+ u64 size_total;
+ } dir[2];
+};
+
+struct rtrs_srv_stats {
+ struct kobject kobj_stats;
+ struct rtrs_srv_stats_rdma_stats __percpu *rdma_stats;
+ struct rtrs_srv_path *srv_path;
+};
+
+struct rtrs_srv_con {
+ struct rtrs_con c;
+ struct list_head rsp_wr_wait_list;
+ spinlock_t rsp_wr_wait_lock;
+};
+
+/* IO context in rtrs_srv, each io has one */
+struct rtrs_srv_op {
+ struct rtrs_srv_con *con;
+ u32 msg_id;
+ u8 dir;
+ struct rtrs_msg_rdma_read *rd_msg;
+ struct ib_rdma_wr tx_wr;
+ struct ib_sge tx_sg;
+ struct list_head wait_list;
+ int status;
+};
+
+/*
+ * Server-side memory region context. When always_invalidate=Y we need
+ * queue_depth memory regions so that each chunk can be invalidated
+ * separately.
+ */
+struct rtrs_srv_mr {
+ struct ib_mr *mr;
+ struct sg_table sgt;
+ struct ib_cqe inv_cqe; /* only for always_invalidate=true */
+ u32 msg_id; /* only for always_invalidate=true */
+ u32 msg_off; /* only for always_invalidate=true */
+ struct rtrs_iu *iu; /* send buffer for new rkey msg */
+};
+
+struct rtrs_srv_path {
+ struct rtrs_path s;
+ struct rtrs_srv_sess *srv;
+ struct work_struct close_work;
+ enum rtrs_srv_state state;
+ spinlock_t state_lock;
+ int cur_cq_vector;
+ struct rtrs_srv_op **ops_ids;
+ struct percpu_ref ids_inflight_ref;
+ struct completion complete_done;
+ struct rtrs_srv_mr *mrs;
+ unsigned int mrs_num;
+ dma_addr_t *dma_addr;
+ bool established;
+ unsigned int mem_bits;
+ struct kobject kobj;
+ struct rtrs_srv_stats *stats;
+};
+
+static inline struct rtrs_srv_path *to_srv_path(struct rtrs_path *s)
+{
+ return container_of(s, struct rtrs_srv_path, s);
+}
+
+struct rtrs_srv_sess {
+ struct list_head paths_list;
+ int paths_up;
+ struct mutex paths_ev_mutex;
+ size_t paths_num;
+ struct mutex paths_mutex;
+ uuid_t paths_uuid;
+ refcount_t refcount;
+ struct rtrs_srv_ctx *ctx;
+ struct list_head ctx_list;
+ void *priv;
+ size_t queue_depth;
+ struct page **chunks;
+ struct device dev;
+ unsigned int dev_ref;
+ struct kobject *kobj_paths;
+};
+
+struct rtrs_srv_ctx {
+ struct rtrs_srv_ops ops;
+ struct rdma_cm_id *cm_id_ip;
+ struct rdma_cm_id *cm_id_ib;
+ struct mutex srv_mutex;
+ struct list_head srv_list;
+};
+
+struct rtrs_srv_ib_ctx {
+ struct rtrs_srv_ctx *srv_ctx;
+ u16 port;
+ struct mutex ib_dev_mutex;
+ int ib_dev_count;
+};
+
+extern struct class *rtrs_dev_class;
+
+void close_path(struct rtrs_srv_path *srv_path);
+
+static inline void rtrs_srv_update_rdma_stats(struct rtrs_srv_stats *s,
+ size_t size, int d)
+{
+ this_cpu_inc(s->rdma_stats->dir[d].cnt);
+ this_cpu_add(s->rdma_stats->dir[d].size_total, size);
+}
+
+/* functions which are implemented in rtrs-srv-stats.c */
+int rtrs_srv_reset_rdma_stats(struct rtrs_srv_stats *stats, bool enable);
+ssize_t rtrs_srv_stats_rdma_to_str(struct rtrs_srv_stats *stats, char *page);
+int rtrs_srv_reset_all_stats(struct rtrs_srv_stats *stats, bool enable);
+ssize_t rtrs_srv_reset_all_help(struct rtrs_srv_stats *stats,
+ char *page, size_t len);
+
+/* functions which are implemented in rtrs-srv-sysfs.c */
+int rtrs_srv_create_path_files(struct rtrs_srv_path *srv_path);
+void rtrs_srv_destroy_path_files(struct rtrs_srv_path *srv_path);
+
+#endif /* RTRS_SRV_H */
diff --git a/drivers/infiniband/ulp/rtrs/rtrs.c b/drivers/infiniband/ulp/rtrs/rtrs.c
new file mode 100644
index 000000000000..ed324b47d93a
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs.c
@@ -0,0 +1,654 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RDMA Transport Layer
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include <linux/module.h>
+#include <linux/inet.h>
+
+#include "rtrs-pri.h"
+#include "rtrs-log.h"
+
+MODULE_DESCRIPTION("RDMA Transport Core");
+MODULE_LICENSE("GPL");
+
+struct rtrs_iu *rtrs_iu_alloc(u32 iu_num, size_t size, gfp_t gfp_mask,
+ struct ib_device *dma_dev,
+ enum dma_data_direction dir,
+ void (*done)(struct ib_cq *cq, struct ib_wc *wc))
+{
+ struct rtrs_iu *ius, *iu;
+ int i;
+
+ ius = kcalloc(iu_num, sizeof(*ius), gfp_mask);
+ if (!ius)
+ return NULL;
+ for (i = 0; i < iu_num; i++) {
+ iu = &ius[i];
+ iu->direction = dir;
+ iu->buf = kzalloc(size, gfp_mask);
+ if (!iu->buf)
+ goto err;
+
+ iu->dma_addr = ib_dma_map_single(dma_dev, iu->buf, size, dir);
+ if (ib_dma_mapping_error(dma_dev, iu->dma_addr))
+ goto err;
+
+ iu->cqe.done = done;
+ iu->size = size;
+ }
+ return ius;
+err:
+ rtrs_iu_free(ius, dma_dev, i);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(rtrs_iu_alloc);
+
+void rtrs_iu_free(struct rtrs_iu *ius, struct ib_device *ibdev, u32 queue_num)
+{
+ struct rtrs_iu *iu;
+ int i;
+
+ if (!ius)
+ return;
+
+ for (i = 0; i < queue_num; i++) {
+ iu = &ius[i];
+ ib_dma_unmap_single(ibdev, iu->dma_addr, iu->size, iu->direction);
+ kfree(iu->buf);
+ }
+ kfree(ius);
+}
+EXPORT_SYMBOL_GPL(rtrs_iu_free);
+
+int rtrs_iu_post_recv(struct rtrs_con *con, struct rtrs_iu *iu)
+{
+ struct rtrs_path *path = con->path;
+ struct ib_recv_wr wr;
+ struct ib_sge list;
+
+ list.addr = iu->dma_addr;
+ list.length = iu->size;
+ list.lkey = path->dev->ib_pd->local_dma_lkey;
+
+ if (list.length == 0) {
+ rtrs_wrn(con->path,
+ "Posting receive work request failed, sg list is empty\n");
+ return -EINVAL;
+ }
+ wr = (struct ib_recv_wr) {
+ .wr_cqe = &iu->cqe,
+ .sg_list = &list,
+ .num_sge = 1,
+ };
+
+ return ib_post_recv(con->qp, &wr, NULL);
+}
+EXPORT_SYMBOL_GPL(rtrs_iu_post_recv);
+
+int rtrs_post_recv_empty(struct rtrs_con *con, struct ib_cqe *cqe)
+{
+ struct ib_recv_wr wr;
+
+ wr = (struct ib_recv_wr) {
+ .wr_cqe = cqe,
+ };
+
+ return ib_post_recv(con->qp, &wr, NULL);
+}
+EXPORT_SYMBOL_GPL(rtrs_post_recv_empty);
+
+static int rtrs_post_send(struct ib_qp *qp, struct ib_send_wr *head,
+ struct ib_send_wr *wr, struct ib_send_wr *tail)
+{
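+ /* Append wr to the end of the head chain, then link tail after wr. */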
+ if (head) {
+ struct ib_send_wr *next = head;
+
+ while (next->next)
+ next = next->next;
+ next->next = wr;
+ } else {
+ head = wr;
+ }
+
+ if (tail)
+ wr->next = tail;
+
+ return ib_post_send(qp, head, NULL);
+}
+
+int rtrs_iu_post_send(struct rtrs_con *con, struct rtrs_iu *iu, size_t size,
+ struct ib_send_wr *head)
+{
+ struct rtrs_path *path = con->path;
+ struct ib_send_wr wr;
+ struct ib_sge list;
+
+ if (WARN_ON(size == 0))
+ return -EINVAL;
+
+ list.addr = iu->dma_addr;
+ list.length = size;
+ list.lkey = path->dev->ib_pd->local_dma_lkey;
+
+ wr = (struct ib_send_wr) {
+ .wr_cqe = &iu->cqe,
+ .sg_list = &list,
+ .num_sge = 1,
+ .opcode = IB_WR_SEND,
+ .send_flags = IB_SEND_SIGNALED,
+ };
+
+ return rtrs_post_send(con->qp, head, &wr, NULL);
+}
+EXPORT_SYMBOL_GPL(rtrs_iu_post_send);
+
+int rtrs_iu_post_rdma_write_imm(struct rtrs_con *con, struct rtrs_iu *iu,
+ struct ib_sge *sge, unsigned int num_sge,
+ u32 rkey, u64 rdma_addr, u32 imm_data,
+ enum ib_send_flags flags,
+ struct ib_send_wr *head,
+ struct ib_send_wr *tail)
+{
+ struct ib_rdma_wr wr;
+ int i;
+
+ wr = (struct ib_rdma_wr) {
+ .wr.wr_cqe = &iu->cqe,
+ .wr.sg_list = sge,
+ .wr.num_sge = num_sge,
+ .rkey = rkey,
+ .remote_addr = rdma_addr,
+ .wr.opcode = IB_WR_RDMA_WRITE_WITH_IMM,
+ .wr.ex.imm_data = cpu_to_be32(imm_data),
+ .wr.send_flags = flags,
+ };
+
+ /*
+ * If one of the sges has 0 size, the operation will fail with a
+ * length error
+ */
+ for (i = 0; i < num_sge; i++)
+ if (WARN_ONCE(sge[i].length == 0, "sg %d is zero length\n", i))
+ return -EINVAL;
+
+ return rtrs_post_send(con->qp, head, &wr.wr, tail);
+}
+EXPORT_SYMBOL_GPL(rtrs_iu_post_rdma_write_imm);
+
+static int rtrs_post_rdma_write_imm_empty(struct rtrs_con *con,
+ struct ib_cqe *cqe,
+ u32 imm_data,
+ struct ib_send_wr *head)
+{
+ struct ib_rdma_wr wr;
+ struct rtrs_path *path = con->path;
+ enum ib_send_flags sflags;
+
+ atomic_dec_if_positive(&con->sq_wr_avail);
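+ /* Post a signalled send every signal_interval WRs to drain the SQ. */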
+ sflags = (atomic_inc_return(&con->wr_cnt) % path->signal_interval) ?
+ 0 : IB_SEND_SIGNALED;
+
+ wr = (struct ib_rdma_wr) {
+ .wr.wr_cqe = cqe,
+ .wr.send_flags = sflags,
+ .wr.opcode = IB_WR_RDMA_WRITE_WITH_IMM,
+ .wr.ex.imm_data = cpu_to_be32(imm_data),
+ };
+
+ return rtrs_post_send(con->qp, head, &wr.wr, NULL);
+}
+
+static void qp_event_handler(struct ib_event *ev, void *ctx)
+{
+ struct rtrs_con *con = ctx;
+
+ switch (ev->event) {
+ case IB_EVENT_COMM_EST:
+ rtrs_info(con->path, "QP event %s (%d) received\n",
+ ib_event_msg(ev->event), ev->event);
+ rdma_notify(con->cm_id, IB_EVENT_COMM_EST);
+ break;
+ default:
+ rtrs_info(con->path, "Unhandled QP event %s (%d) received\n",
+ ib_event_msg(ev->event), ev->event);
+ break;
+ }
+}
+
+static bool is_pollqueue(struct rtrs_con *con)
+{
+ return con->cid >= con->path->irq_con_num;
+}
+
+static int create_cq(struct rtrs_con *con, int cq_vector, int nr_cqe,
+ enum ib_poll_context poll_ctx)
+{
+ struct rdma_cm_id *cm_id = con->cm_id;
+ struct ib_cq *cq;
+
+ if (is_pollqueue(con))
+ cq = ib_alloc_cq(cm_id->device, con, nr_cqe, cq_vector,
+ poll_ctx);
+ else
+ cq = ib_cq_pool_get(cm_id->device, nr_cqe, cq_vector, poll_ctx);
+
+ if (IS_ERR(cq)) {
+ rtrs_err(con->path, "Creating completion queue failed, errno: %ld\n",
+ PTR_ERR(cq));
+ return PTR_ERR(cq);
+ }
+ con->cq = cq;
+ con->nr_cqe = nr_cqe;
+
+ return 0;
+}
+
+static int create_qp(struct rtrs_con *con, struct ib_pd *pd,
+ u32 max_send_wr, u32 max_recv_wr, u32 max_sge)
+{
+ struct ib_qp_init_attr init_attr = {NULL};
+ struct rdma_cm_id *cm_id = con->cm_id;
+ int ret;
+
+ init_attr.cap.max_send_wr = max_send_wr;
+ init_attr.cap.max_recv_wr = max_recv_wr;
+ init_attr.cap.max_recv_sge = 1;
+ init_attr.event_handler = qp_event_handler;
+ init_attr.qp_context = con;
+ init_attr.cap.max_send_sge = max_sge;
+
+ init_attr.qp_type = IB_QPT_RC;
+ init_attr.send_cq = con->cq;
+ init_attr.recv_cq = con->cq;
+ init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+
+ ret = rdma_create_qp(cm_id, pd, &init_attr);
+ if (ret) {
+ rtrs_err(con->path, "Creating QP failed, err: %d\n", ret);
+ return ret;
+ }
+ con->qp = cm_id->qp;
+
+ return ret;
+}
+
+static void destroy_cq(struct rtrs_con *con)
+{
+ if (con->cq) {
+ if (is_pollqueue(con))
+ ib_free_cq(con->cq);
+ else
+ ib_cq_pool_put(con->cq, con->nr_cqe);
+ }
+ con->cq = NULL;
+}
+
+int rtrs_cq_qp_create(struct rtrs_path *path, struct rtrs_con *con,
+ u32 max_send_sge, int cq_vector, int nr_cqe,
+ u32 max_send_wr, u32 max_recv_wr,
+ enum ib_poll_context poll_ctx)
+{
+ int err;
+
+ err = create_cq(con, cq_vector, nr_cqe, poll_ctx);
+ if (err)
+ return err;
+
+ err = create_qp(con, path->dev->ib_pd, max_send_wr, max_recv_wr,
+ max_send_sge);
+ if (err) {
+ destroy_cq(con);
+ return err;
+ }
+ con->path = path;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rtrs_cq_qp_create);
+
+void rtrs_cq_qp_destroy(struct rtrs_con *con)
+{
+ if (con->qp) {
+ rdma_destroy_qp(con->cm_id);
+ con->qp = NULL;
+ }
+ destroy_cq(con);
+}
+EXPORT_SYMBOL_GPL(rtrs_cq_qp_destroy);
+
+static void schedule_hb(struct rtrs_path *path)
+{
+ queue_delayed_work(path->hb_wq, &path->hb_dwork,
+ msecs_to_jiffies(path->hb_interval_ms));
+}
+
+void rtrs_send_hb_ack(struct rtrs_path *path)
+{
+ struct rtrs_con *usr_con = path->con[0];
+ u32 imm;
+ int err;
+
+ imm = rtrs_to_imm(RTRS_HB_ACK_IMM, 0);
+ err = rtrs_post_rdma_write_imm_empty(usr_con, path->hb_cqe, imm,
+ NULL);
+ if (err) {
+ rtrs_err(path, "send HB ACK failed, errno: %d\n", err);
+ path->hb_err_handler(usr_con);
+ return;
+ }
+}
+EXPORT_SYMBOL_GPL(rtrs_send_hb_ack);
+
+static void hb_work(struct work_struct *work)
+{
+ struct rtrs_con *usr_con;
+ struct rtrs_path *path;
+ u32 imm;
+ int err;
+
+ path = container_of(to_delayed_work(work), typeof(*path), hb_dwork);
+ usr_con = path->con[0];
+
+ if (path->hb_missed_cnt > path->hb_missed_max) {
+ rtrs_err(path, "HB missed max reached.\n");
+ path->hb_err_handler(usr_con);
+ return;
+ }
+ if (path->hb_missed_cnt++) {
+ /* Reschedule work without sending hb */
+ schedule_hb(path);
+ return;
+ }
+
+ path->hb_last_sent = ktime_get();
+
+ imm = rtrs_to_imm(RTRS_HB_MSG_IMM, 0);
+ err = rtrs_post_rdma_write_imm_empty(usr_con, path->hb_cqe, imm,
+ NULL);
+ if (err) {
+ rtrs_err(path, "HB send failed, errno: %d\n", err);
+ path->hb_err_handler(usr_con);
+ return;
+ }
+
+ schedule_hb(path);
+}
+
+void rtrs_init_hb(struct rtrs_path *path, struct ib_cqe *cqe,
+ unsigned int interval_ms, unsigned int missed_max,
+ void (*err_handler)(struct rtrs_con *con),
+ struct workqueue_struct *wq)
+{
+ path->hb_cqe = cqe;
+ path->hb_interval_ms = interval_ms;
+ path->hb_err_handler = err_handler;
+ path->hb_wq = wq;
+ path->hb_missed_max = missed_max;
+ path->hb_missed_cnt = 0;
+ INIT_DELAYED_WORK(&path->hb_dwork, hb_work);
+}
+EXPORT_SYMBOL_GPL(rtrs_init_hb);
+
+void rtrs_start_hb(struct rtrs_path *path)
+{
+ schedule_hb(path);
+}
+EXPORT_SYMBOL_GPL(rtrs_start_hb);
+
+void rtrs_stop_hb(struct rtrs_path *path)
+{
+ cancel_delayed_work_sync(&path->hb_dwork);
+ path->hb_missed_cnt = 0;
+}
+EXPORT_SYMBOL_GPL(rtrs_stop_hb);
+
+static int rtrs_str_gid_to_sockaddr(const char *addr, size_t len,
+ short port, struct sockaddr_storage *dst)
+{
+ struct sockaddr_ib *dst_ib = (struct sockaddr_ib *)dst;
+ int ret;
+
+ /*
+ * We can use some of the IPv6 functions since GID is a valid
+ * IPv6 address format
+ */
+ ret = in6_pton(addr, len, dst_ib->sib_addr.sib_raw, '\0', NULL);
+ if (ret == 0)
+ return -EINVAL;
+
+ dst_ib->sib_family = AF_IB;
+ /*
+ * Use the same TCP server port number as the IB service ID
+ * on the IB port space range
+ */
+ dst_ib->sib_sid = cpu_to_be64(RDMA_IB_IP_PS_IB | port);
+ dst_ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
+ dst_ib->sib_pkey = cpu_to_be16(0xffff);
+
+ return 0;
+}
+
+/**
+ * rtrs_str_to_sockaddr() - Convert rtrs address string to sockaddr
+ * @addr: String representation of an addr (IPv4, IPv6 or IB GID):
+ * - "ip:192.168.1.1"
+ * - "ip:fe80::200:5aee:feaa:20a2"
+ * - "gid:fe80::200:5aee:feaa:20a2"
+ * @len: String address length
+ * @port: Destination port
+ * @dst: Destination sockaddr structure
+ *
+ * Returns 0 if conversion successful. Non-zero on error.
+ */
+static int rtrs_str_to_sockaddr(const char *addr, size_t len,
+ u16 port, struct sockaddr_storage *dst)
+{
+ if (strncmp(addr, "gid:", 4) == 0) {
+ return rtrs_str_gid_to_sockaddr(addr + 4, len - 4, port, dst);
+ } else if (strncmp(addr, "ip:", 3) == 0) {
+ char port_str[8];
+ char *cpy;
+ int err;
+
+ snprintf(port_str, sizeof(port_str), "%u", port);
+ cpy = kstrndup(addr + 3, len - 3, GFP_KERNEL);
+ err = cpy ? inet_pton_with_scope(&init_net, AF_UNSPEC,
+ cpy, port_str, dst) : -ENOMEM;
+ kfree(cpy);
+
+ return err;
+ }
+ return -EPROTONOSUPPORT;
+}
+
+/**
+ * sockaddr_to_str() - convert sockaddr to a string.
+ * @addr: the sockaddr structure to be converted.
+ * @buf: string containing socket addr.
+ * @len: string length.
+ *
+ * The return value is the number of characters written into buf not
+ * including the trailing '\0'. If len is 0, the function returns 0.
+ */
+int sockaddr_to_str(const struct sockaddr *addr, char *buf, size_t len)
+{
+ switch (addr->sa_family) {
+ case AF_IB:
+ return scnprintf(buf, len, "gid:%pI6",
+ &((struct sockaddr_ib *)addr)->sib_addr.sib_raw);
+ case AF_INET:
+ return scnprintf(buf, len, "ip:%pI4",
+ &((struct sockaddr_in *)addr)->sin_addr);
+ case AF_INET6:
+ return scnprintf(buf, len, "ip:%pI6c",
+ &((struct sockaddr_in6 *)addr)->sin6_addr);
+ }
+ return scnprintf(buf, len, "<invalid address family>");
+}
+EXPORT_SYMBOL(sockaddr_to_str);
+
+/**
+ * rtrs_addr_to_str() - convert rtrs_addr to a string "src@dst"
+ * @addr: the rtrs_addr structure to be converted
+ * @buf: string containing source and destination addr of a path
+ * separated by '@', e.g.
+ * "ip:1.1.1.1@ip:1.1.1.2".
+ * @len: string length
+ *
+ * The return value is the number of characters written into buf not
+ * including the trailing '\0'.
+ */
+int rtrs_addr_to_str(const struct rtrs_addr *addr, char *buf, size_t len)
+{
+ int cnt;
+
+ cnt = sockaddr_to_str((struct sockaddr *)addr->src,
+ buf, len);
+ cnt += scnprintf(buf + cnt, len - cnt, "@");
+ sockaddr_to_str((struct sockaddr *)addr->dst,
+ buf + cnt, len - cnt);
+ return cnt;
+}
+EXPORT_SYMBOL(rtrs_addr_to_str);
+
+/**
+ * rtrs_addr_to_sockaddr() - convert path string "src,dst" or "src@dst"
+ * to sockaddresses
+ * @str: string containing source and destination addr of a path
+ * separated by ',' or '@', e.g. "ip:1.1.1.1,ip:1.1.1.2" or
+ * "ip:1.1.1.1@ip:1.1.1.2". If str contains only one address, it is
+ * considered to be the destination.
+ * @len: string length
+ * @port: Destination port number.
+ * @addr: rtrs_addr whose @src and @dst storages receive the parsed
+ * addresses; @src is set to NULL if str has no source address.
+ *
+ * Returns zero if conversion successful. Non-zero otherwise.
+ */
+int rtrs_addr_to_sockaddr(const char *str, size_t len, u16 port,
+ struct rtrs_addr *addr)
+{
+ const char *d;
+
+ d = strchr(str, ',');
+ if (!d)
+ d = strchr(str, '@');
+ if (d) {
+ if (rtrs_str_to_sockaddr(str, d - str, 0, addr->src))
+ return -EINVAL;
+ d += 1;
+ len -= d - str;
+ str = d;
+
+ } else {
+ addr->src = NULL;
+ }
+ return rtrs_str_to_sockaddr(str, len, port, addr->dst);
+}
+EXPORT_SYMBOL(rtrs_addr_to_sockaddr);
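To illustrate the address helpers above, here is a minimal, hypothetical usage sketch (not taken from this patch): the my_* function, the literal addresses and the port 1234 are placeholders, and a kernel-module context with rtrs.h on the include path is assumed.

#include <linux/string.h>
#include <linux/printk.h>
#include "rtrs.h"

/* Hypothetical caller: parse "src@dst" and log it back in canonical form. */
static int my_parse_path(void)
{
	static const char path[] = "ip:192.168.1.1@ip:192.168.1.2";
	struct sockaddr_storage srcaddr, dstaddr;
	struct rtrs_addr addr = { .src = &srcaddr, .dst = &dstaddr };
	char str[64];
	int err;

	/* 1234 is a placeholder destination port */
	err = rtrs_addr_to_sockaddr(path, strlen(path), 1234, &addr);
	if (err)
		return err;

	/* rtrs_addr_to_str() is the inverse, handy for logging */
	rtrs_addr_to_str(&addr, str, sizeof(str));
	pr_info("parsed path: %s\n", str);
	return 0;
}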
+
+void rtrs_rdma_dev_pd_init(enum ib_pd_flags pd_flags,
+ struct rtrs_rdma_dev_pd *pool)
+{
+ WARN_ON(pool->ops && (!pool->ops->alloc ^ !pool->ops->free));
+ INIT_LIST_HEAD(&pool->list);
+ mutex_init(&pool->mutex);
+ pool->pd_flags = pd_flags;
+}
+EXPORT_SYMBOL(rtrs_rdma_dev_pd_init);
+
+void rtrs_rdma_dev_pd_deinit(struct rtrs_rdma_dev_pd *pool)
+{
+ mutex_destroy(&pool->mutex);
+ WARN_ON(!list_empty(&pool->list));
+}
+EXPORT_SYMBOL(rtrs_rdma_dev_pd_deinit);
+
+static void dev_free(struct kref *ref)
+{
+ struct rtrs_rdma_dev_pd *pool;
+ struct rtrs_ib_dev *dev;
+
+ dev = container_of(ref, typeof(*dev), ref);
+ pool = dev->pool;
+
+ mutex_lock(&pool->mutex);
+ list_del(&dev->entry);
+ mutex_unlock(&pool->mutex);
+
+ if (pool->ops && pool->ops->deinit)
+ pool->ops->deinit(dev);
+
+ ib_dealloc_pd(dev->ib_pd);
+
+ if (pool->ops && pool->ops->free)
+ pool->ops->free(dev);
+ else
+ kfree(dev);
+}
+
+int rtrs_ib_dev_put(struct rtrs_ib_dev *dev)
+{
+ return kref_put(&dev->ref, dev_free);
+}
+EXPORT_SYMBOL(rtrs_ib_dev_put);
+
+static int rtrs_ib_dev_get(struct rtrs_ib_dev *dev)
+{
+ return kref_get_unless_zero(&dev->ref);
+}
+
+struct rtrs_ib_dev *
+rtrs_ib_dev_find_or_add(struct ib_device *ib_dev,
+ struct rtrs_rdma_dev_pd *pool)
+{
+ struct rtrs_ib_dev *dev;
+
+ mutex_lock(&pool->mutex);
+ list_for_each_entry(dev, &pool->list, entry) {
+ if (dev->ib_dev->node_guid == ib_dev->node_guid &&
+ rtrs_ib_dev_get(dev))
+ goto out_unlock;
+ }
+ mutex_unlock(&pool->mutex);
+ if (pool->ops && pool->ops->alloc)
+ dev = pool->ops->alloc();
+ else
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (IS_ERR_OR_NULL(dev))
+ goto out_err;
+
+ kref_init(&dev->ref);
+ dev->pool = pool;
+ dev->ib_dev = ib_dev;
+ dev->ib_pd = ib_alloc_pd(ib_dev, pool->pd_flags);
+ if (IS_ERR(dev->ib_pd))
+ goto out_free_dev;
+
+ if (pool->ops && pool->ops->init && pool->ops->init(dev))
+ goto out_free_pd;
+
+ mutex_lock(&pool->mutex);
+ list_add(&dev->entry, &pool->list);
+out_unlock:
+ mutex_unlock(&pool->mutex);
+ return dev;
+
+out_free_pd:
+ ib_dealloc_pd(dev->ib_pd);
+out_free_dev:
+ if (pool->ops && pool->ops->free)
+ pool->ops->free(dev);
+ else
+ kfree(dev);
+out_err:
+ return NULL;
+}
+EXPORT_SYMBOL(rtrs_ib_dev_find_or_add);
diff --git a/drivers/infiniband/ulp/rtrs/rtrs.h b/drivers/infiniband/ulp/rtrs/rtrs.h
new file mode 100644
index 000000000000..b48b53a7c143
--- /dev/null
+++ b/drivers/infiniband/ulp/rtrs/rtrs.h
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * RDMA Transport Layer
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+#ifndef RTRS_H
+#define RTRS_H
+
+#include <linux/socket.h>
+#include <linux/scatterlist.h>
+
+struct rtrs_permit;
+struct rtrs_clt_sess;
+struct rtrs_srv_ctx;
+struct rtrs_srv_sess;
+struct rtrs_srv_op;
+
+/*
+ * RDMA transport (RTRS) client API
+ */
+
+/**
+ * enum rtrs_clt_link_ev - Events about connectivity state of a client
+ * @RTRS_CLT_LINK_EV_RECONNECTED: Client was reconnected.
+ * @RTRS_CLT_LINK_EV_DISCONNECTED: Client was disconnected.
+ */
+enum rtrs_clt_link_ev {
+ RTRS_CLT_LINK_EV_RECONNECTED,
+ RTRS_CLT_LINK_EV_DISCONNECTED,
+};
+
+/**
+ * struct rtrs_addr - source and destination address of a path to be established
+ */
+struct rtrs_addr {
+ struct sockaddr_storage *src;
+ struct sockaddr_storage *dst;
+};
+
+/**
+ * struct rtrs_clt_ops - holds the link event callback and private pointer.
+ * @priv: User supplied private data.
+ * @link_ev: Event notification callback function for connection state changes
+ * @priv: User supplied data that was passed to rtrs_clt_open()
+ * @ev: Occurred event
+ */
+struct rtrs_clt_ops {
+ void *priv;
+ void (*link_ev)(void *priv, enum rtrs_clt_link_ev ev);
+};
+
+struct rtrs_clt_sess *rtrs_clt_open(struct rtrs_clt_ops *ops,
+ const char *pathname,
+ const struct rtrs_addr *paths,
+ size_t path_cnt, u16 port,
+ size_t pdu_sz, u8 reconnect_delay_sec,
+ s16 max_reconnect_attempts, u32 nr_poll_queues);
+
+void rtrs_clt_close(struct rtrs_clt_sess *clt);
+
+enum wait_type {
+ RTRS_PERMIT_NOWAIT = 0,
+ RTRS_PERMIT_WAIT = 1
+};
+
+/**
+ * enum rtrs_clt_con_type - type of IB connection to use with a given
+ * rtrs_permit
+ * @RTRS_ADMIN_CON: use the connection reserved for "service" messages
+ * @RTRS_IO_CON: use a connection reserved for IO
+ */
+enum rtrs_clt_con_type {
+ RTRS_ADMIN_CON,
+ RTRS_IO_CON
+};
+
+struct rtrs_permit *rtrs_clt_get_permit(struct rtrs_clt_sess *sess,
+ enum rtrs_clt_con_type con_type,
+ enum wait_type wait);
+
+void rtrs_clt_put_permit(struct rtrs_clt_sess *sess,
+ struct rtrs_permit *permit);
+
+/**
+ * struct rtrs_clt_req_ops - holds the request confirmation callback
+ * and a private pointer.
+ * @priv: User supplied private data.
+ * @conf_fn: callback function called to confirm the request
+ *	@priv: User provided data, passed back with the corresponding
+ *		confirmation.
+ *	@errno: error number.
+ */
+struct rtrs_clt_req_ops {
+ void *priv;
+ void (*conf_fn)(void *priv, int errno);
+};
+
+int rtrs_clt_request(int dir, struct rtrs_clt_req_ops *ops,
+ struct rtrs_clt_sess *sess, struct rtrs_permit *permit,
+ const struct kvec *vec, size_t nr, size_t len,
+ struct scatterlist *sg, unsigned int sg_cnt);
+int rtrs_clt_rdma_cq_direct(struct rtrs_clt_sess *clt, unsigned int index);
+
+/**
+ * struct rtrs_attrs - RTRS session attributes
+ */
+struct rtrs_attrs {
+ u32 queue_depth;
+ u32 max_io_size;
+ u32 max_segments;
+};
+
+int rtrs_clt_query(struct rtrs_clt_sess *sess, struct rtrs_attrs *attr);
+
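To make the client API above concrete, a hedged sketch of how a ULP might open a session and issue a write follows; all my_* names, the session name, buffer sizes and the port are hypothetical placeholders (not taken from this patch), and error handling is abbreviated.

#include <linux/err.h>
#include <linux/fs.h>		/* READ/WRITE direction constants */
#include <linux/scatterlist.h>
#include <linux/uio.h>
#include "rtrs.h"

static void my_link_ev(void *priv, enum rtrs_clt_link_ev ev)
{
	/* react to RTRS_CLT_LINK_EV_RECONNECTED / _DISCONNECTED */
}

static void my_io_conf(void *priv, int errno)
{
	/* the server confirmed (errno == 0) or failed the request */
}

static int my_write(struct rtrs_addr *paths, size_t path_cnt,
		    struct scatterlist *sg, unsigned int sg_cnt, size_t len)
{
	struct rtrs_clt_ops clt_ops = { .link_ev = my_link_ev };
	struct rtrs_clt_req_ops req_ops = { .conf_fn = my_io_conf };
	u8 usr_msg[16] = { };	/* small user message, delivered to rdma_ev() */
	struct kvec vec = { .iov_base = usr_msg, .iov_len = sizeof(usr_msg) };
	struct rtrs_clt_sess *sess;
	struct rtrs_permit *permit;
	int err;

	sess = rtrs_clt_open(&clt_ops, "my_session", paths, path_cnt,
			     1234 /* port */, 0 /* pdu_sz */,
			     5 /* reconnect_delay_sec */,
			     3 /* max_reconnect_attempts */,
			     0 /* nr_poll_queues */);
	if (IS_ERR(sess))
		return PTR_ERR(sess);

	/* RTRS_PERMIT_WAIT blocks until a permit becomes available */
	permit = rtrs_clt_get_permit(sess, RTRS_IO_CON, RTRS_PERMIT_WAIT);
	err = rtrs_clt_request(WRITE, &req_ops, sess, permit,
			       &vec, 1, len, sg, sg_cnt);
	if (err)
		rtrs_clt_put_permit(sess, permit);
	/*
	 * On success the permit is put once my_io_conf() has run, and
	 * rtrs_clt_close(sess) is called after all requests completed.
	 */
	return err;
}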
+/*
+ * Here goes RTRS server API
+ */
+
+/**
+ * enum rtrs_srv_link_ev - Server link events
+ * @RTRS_SRV_LINK_EV_CONNECTED: Connection from client established
+ * @RTRS_SRV_LINK_EV_DISCONNECTED: Connection was disconnected, all
+ * connection RTRS resources were freed.
+ */
+enum rtrs_srv_link_ev {
+ RTRS_SRV_LINK_EV_CONNECTED,
+ RTRS_SRV_LINK_EV_DISCONNECTED,
+};
+
+struct rtrs_srv_ops {
+ /**
+ * rdma_ev(): Event notification for RDMA operations
+ * If the callback returns a value != 0, an error
+ * message for the data transfer will be sent to
+ * the client.
+ *
+ * @priv: Private data set by rtrs_srv_set_sess_priv()
+ * @id: internal RTRS operation id
+ * @data: Pointer to (bidirectional) rdma memory area:
+ * - in case of %RTRS_SRV_RDMA_EV_RECV contains
+ * data sent by the client
+ * - in case of %RTRS_SRV_RDMA_EV_WRITE_REQ points
+ * to the memory area where the response is to be
+ * written
+ * @datalen: Size of the memory area in @data
+ * @usr: The extra user message sent by the client (%vec)
+ * @usrlen: Size of the user message
+ */
+ int (*rdma_ev)(void *priv,
+ struct rtrs_srv_op *id,
+ void *data, size_t datalen, const void *usr,
+ size_t usrlen);
+ /**
+ * link_ev(): Events about connectivity state changes
+ * If the callback returns a value != 0 for the
+ * %RTRS_SRV_LINK_EV_CONNECTED event, the corresponding
+ * session will be destroyed.
+ * @sess: Session
+ * @ev: event
+ * @priv: Private data from user if previously set with
+ * rtrs_srv_set_sess_priv()
+ */
+ int (*link_ev)(struct rtrs_srv_sess *sess, enum rtrs_srv_link_ev ev,
+ void *priv);
+};
+
+struct rtrs_srv_ctx *rtrs_srv_open(struct rtrs_srv_ops *ops, u16 port);
+
+void rtrs_srv_close(struct rtrs_srv_ctx *ctx);
+
+bool rtrs_srv_resp_rdma(struct rtrs_srv_op *id, int errno);
+
+void rtrs_srv_set_sess_priv(struct rtrs_srv_sess *sess, void *priv);
+
+int rtrs_srv_get_path_name(struct rtrs_srv_sess *sess, char *pathname,
+ size_t len);
+
+int rtrs_srv_get_queue_depth(struct rtrs_srv_sess *sess);
+
+int rtrs_addr_to_sockaddr(const char *str, size_t len, u16 port,
+ struct rtrs_addr *addr);
+
+int sockaddr_to_str(const struct sockaddr *addr, char *buf, size_t len);
+int rtrs_addr_to_str(const struct rtrs_addr *addr, char *buf, size_t len);
+#endif
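The server side mirrors this: a context is opened with a set of callbacks, per-session state is attached from link_ev(), and each rdma_ev() is answered with rtrs_srv_resp_rdma(). Below is a hedged sketch with hypothetical my_* names and placeholder port and per-session state (not taken from this patch); real users normally defer request processing to a work item.

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/err.h>
#include "rtrs.h"

struct my_sess_state {
	int nr_reqs;	/* hypothetical per-session bookkeeping */
};

static int my_rdma_ev(void *priv, struct rtrs_srv_op *id,
		      void *data, size_t datalen,
		      const void *usr, size_t usrlen)
{
	/*
	 * Process the request described by @usr and @data, then ask RTRS
	 * to complete the transfer; a non-zero errno is reported to the
	 * client.
	 */
	rtrs_srv_resp_rdma(id, 0);
	return 0;
}

static int my_link_ev(struct rtrs_srv_sess *sess, enum rtrs_srv_link_ev ev,
		      void *priv)
{
	struct my_sess_state *state;

	switch (ev) {
	case RTRS_SRV_LINK_EV_CONNECTED:
		state = kzalloc(sizeof(*state), GFP_KERNEL);
		if (!state)
			return -ENOMEM;	/* non-zero: session is destroyed */
		rtrs_srv_set_sess_priv(sess, state);
		return 0;
	case RTRS_SRV_LINK_EV_DISCONNECTED:
		kfree(priv);	/* state set above comes back as @priv */
		return 0;
	}
	return 0;
}

static struct rtrs_srv_ops my_srv_ops = {
	.rdma_ev = my_rdma_ev,
	.link_ev = my_link_ev,
};

static struct rtrs_srv_ctx *my_srv_ctx;

static int __init my_srv_init(void)
{
	my_srv_ctx = rtrs_srv_open(&my_srv_ops, 1234 /* port */);
	return PTR_ERR_OR_ZERO(my_srv_ctx);
}

static void __exit my_srv_exit(void)
{
	rtrs_srv_close(my_srv_ctx);
}

module_init(my_srv_init);
module_exit(my_srv_exit);
MODULE_LICENSE("GPL");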
diff --git a/drivers/infiniband/ulp/srp/Kconfig b/drivers/infiniband/ulp/srp/Kconfig
index 6f5e7b3a3864..67cd63d1399c 100644
--- a/drivers/infiniband/ulp/srp/Kconfig
+++ b/drivers/infiniband/ulp/srp/Kconfig
@@ -3,7 +3,7 @@ config INFINIBAND_SRP
tristate "InfiniBand SCSI RDMA Protocol"
depends on SCSI && INFINIBAND_ADDR_TRANS
select SCSI_SRP_ATTRS
- ---help---
+ help
Support for the SCSI RDMA Protocol over InfiniBand. This
allows you to access storage devices that speak SRP over
InfiniBand.
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index cd1181c39ed2..1075c2ac8fe2 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -71,7 +71,6 @@ static unsigned int srp_sg_tablesize;
static unsigned int cmd_sg_entries;
static unsigned int indirect_sg_entries;
static bool allow_ext_sg;
-static bool prefer_fr = true;
static bool register_always = true;
static bool never_register;
static int topspin_workarounds = 1;
@@ -95,10 +94,6 @@ module_param(topspin_workarounds, int, 0444);
MODULE_PARM_DESC(topspin_workarounds,
"Enable workarounds for Topspin/Cisco SRP target bugs if != 0");
-module_param(prefer_fr, bool, 0444);
-MODULE_PARM_DESC(prefer_fr,
-"Whether to use fast registration if both FMR and fast registration are supported");
-
module_param(register_always, bool, 0444);
MODULE_PARM_DESC(register_always,
"Use memory registration even for contiguous memory regions");
@@ -146,7 +141,7 @@ module_param(ch_count, uint, 0444);
MODULE_PARM_DESC(ch_count,
"Number of RDMA channels to use for communication with an SRP target. Using more than one channel improves performance if the HCA supports multiple completion vectors. The default value is the minimum of four times the number of online CPU sockets and the number of completion vectors supported by the HCA.");
-static void srp_add_one(struct ib_device *device);
+static int srp_add_one(struct ib_device *device);
static void srp_remove_one(struct ib_device *device, void *client_data);
static void srp_rename_dev(struct ib_device *device, void *client_data);
static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc);
@@ -174,9 +169,9 @@ static int srp_tmo_get(char *buffer, const struct kernel_param *kp)
int tmo = *(int *)kp->arg;
if (tmo >= 0)
- return sprintf(buffer, "%d\n", tmo);
+ return sysfs_emit(buffer, "%d\n", tmo);
else
- return sprintf(buffer, "off\n");
+ return sysfs_emit(buffer, "off\n");
}
static int srp_tmo_set(const char *val, const struct kernel_param *kp)
@@ -388,24 +383,6 @@ static int srp_new_cm_id(struct srp_rdma_ch *ch)
srp_new_ib_cm_id(ch);
}
-static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target)
-{
- struct srp_device *dev = target->srp_host->srp_dev;
- struct ib_fmr_pool_param fmr_param;
-
- memset(&fmr_param, 0, sizeof(fmr_param));
- fmr_param.pool_size = target->mr_pool_size;
- fmr_param.dirty_watermark = fmr_param.pool_size / 4;
- fmr_param.cache = 1;
- fmr_param.max_pages_per_fmr = dev->max_pages_per_mr;
- fmr_param.page_shift = ilog2(dev->mr_page_size);
- fmr_param.access = (IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_REMOTE_READ);
-
- return ib_create_fmr_pool(dev->pd, &fmr_param);
-}
-
/**
* srp_destroy_fr_pool() - free the resources owned by a pool
* @pool: Fast registration pool to be destroyed.
@@ -453,7 +430,7 @@ static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device,
spin_lock_init(&pool->lock);
INIT_LIST_HEAD(&pool->free_list);
- if (device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
+ if (device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
mr_type = IB_MR_TYPE_SG_GAPS;
else
mr_type = IB_MR_TYPE_MEM_REG;
@@ -556,7 +533,6 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
struct ib_qp_init_attr *init_attr;
struct ib_cq *recv_cq, *send_cq;
struct ib_qp *qp;
- struct ib_fmr_pool *fmr_pool = NULL;
struct srp_fr_pool *fr_pool = NULL;
const int m = 1 + dev->use_fast_reg * target->mr_per_cmd * 2;
int ret;
@@ -619,14 +595,6 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
"FR pool allocation failed (%d)\n", ret);
goto err_qp;
}
- } else if (dev->use_fmr) {
- fmr_pool = srp_alloc_fmr_pool(target);
- if (IS_ERR(fmr_pool)) {
- ret = PTR_ERR(fmr_pool);
- shost_printk(KERN_WARNING, target->scsi_host, PFX
- "FMR pool allocation failed (%d)\n", ret);
- goto err_qp;
- }
}
if (ch->qp)
@@ -644,10 +612,6 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
if (ch->fr_pool)
srp_destroy_fr_pool(ch->fr_pool);
ch->fr_pool = fr_pool;
- } else if (dev->use_fmr) {
- if (ch->fmr_pool)
- ib_destroy_fmr_pool(ch->fmr_pool);
- ch->fmr_pool = fmr_pool;
}
kfree(init_attr);
@@ -702,9 +666,6 @@ static void srp_free_ch_ib(struct srp_target_port *target,
if (dev->use_fast_reg) {
if (ch->fr_pool)
srp_destroy_fr_pool(ch->fr_pool);
- } else if (dev->use_fmr) {
- if (ch->fmr_pool)
- ib_destroy_fmr_pool(ch->fmr_pool);
}
srp_destroy_qp(ch);
@@ -738,7 +699,7 @@ static void srp_free_ch_ib(struct srp_target_port *target,
static void srp_path_rec_completion(int status,
struct sa_path_rec *pathrec,
- void *ch_ptr)
+ int num_paths, void *ch_ptr)
{
struct srp_rdma_ch *ch = ch_ptr;
struct srp_target_port *target = ch->target;
@@ -1004,80 +965,52 @@ static void srp_disconnect_target(struct srp_target_port *target)
}
}
-static void srp_free_req_data(struct srp_target_port *target,
- struct srp_rdma_ch *ch)
+static int srp_exit_cmd_priv(struct Scsi_Host *shost, struct scsi_cmnd *cmd)
{
+ struct srp_target_port *target = host_to_target(shost);
struct srp_device *dev = target->srp_host->srp_dev;
struct ib_device *ibdev = dev->dev;
- struct srp_request *req;
- int i;
-
- if (!ch->req_ring)
- return;
+ struct srp_request *req = scsi_cmd_priv(cmd);
- for (i = 0; i < target->req_ring_size; ++i) {
- req = &ch->req_ring[i];
- if (dev->use_fast_reg) {
- kfree(req->fr_list);
- } else {
- kfree(req->fmr_list);
- kfree(req->map_page);
- }
- if (req->indirect_dma_addr) {
- ib_dma_unmap_single(ibdev, req->indirect_dma_addr,
- target->indirect_size,
- DMA_TO_DEVICE);
- }
- kfree(req->indirect_desc);
+ kfree(req->fr_list);
+ if (req->indirect_dma_addr) {
+ ib_dma_unmap_single(ibdev, req->indirect_dma_addr,
+ target->indirect_size,
+ DMA_TO_DEVICE);
}
+ kfree(req->indirect_desc);
- kfree(ch->req_ring);
- ch->req_ring = NULL;
+ return 0;
}
-static int srp_alloc_req_data(struct srp_rdma_ch *ch)
+static int srp_init_cmd_priv(struct Scsi_Host *shost, struct scsi_cmnd *cmd)
{
- struct srp_target_port *target = ch->target;
+ struct srp_target_port *target = host_to_target(shost);
struct srp_device *srp_dev = target->srp_host->srp_dev;
struct ib_device *ibdev = srp_dev->dev;
- struct srp_request *req;
- void *mr_list;
+ struct srp_request *req = scsi_cmd_priv(cmd);
dma_addr_t dma_addr;
- int i, ret = -ENOMEM;
-
- ch->req_ring = kcalloc(target->req_ring_size, sizeof(*ch->req_ring),
- GFP_KERNEL);
- if (!ch->req_ring)
- goto out;
+ int ret = -ENOMEM;
- for (i = 0; i < target->req_ring_size; ++i) {
- req = &ch->req_ring[i];
- mr_list = kmalloc_array(target->mr_per_cmd, sizeof(void *),
+ if (srp_dev->use_fast_reg) {
+ req->fr_list = kmalloc_array(target->mr_per_cmd, sizeof(void *),
GFP_KERNEL);
- if (!mr_list)
- goto out;
- if (srp_dev->use_fast_reg) {
- req->fr_list = mr_list;
- } else {
- req->fmr_list = mr_list;
- req->map_page = kmalloc_array(srp_dev->max_pages_per_mr,
- sizeof(void *),
- GFP_KERNEL);
- if (!req->map_page)
- goto out;
- }
- req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL);
- if (!req->indirect_desc)
- goto out;
-
- dma_addr = ib_dma_map_single(ibdev, req->indirect_desc,
- target->indirect_size,
- DMA_TO_DEVICE);
- if (ib_dma_mapping_error(ibdev, dma_addr))
+ if (!req->fr_list)
goto out;
+ }
+ req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL);
+ if (!req->indirect_desc)
+ goto out;
- req->indirect_dma_addr = dma_addr;
+ dma_addr = ib_dma_map_single(ibdev, req->indirect_desc,
+ target->indirect_size,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(ibdev, dma_addr)) {
+ srp_exit_cmd_priv(shost, cmd);
+ goto out;
}
+
+ req->indirect_dma_addr = dma_addr;
ret = 0;
out:
@@ -1093,10 +1026,17 @@ out:
*/
static void srp_del_scsi_host_attr(struct Scsi_Host *shost)
{
- struct device_attribute **attr;
+ const struct attribute_group **g;
+ struct attribute **attr;
- for (attr = shost->hostt->shost_attrs; attr && *attr; ++attr)
- device_remove_file(&shost->shost_dev, *attr);
+ for (g = shost->hostt->shost_groups; *g; ++g) {
+ for (attr = (*g)->attrs; *attr; ++attr) {
+ struct device_attribute *dev_attr =
+ container_of(*attr, typeof(*dev_attr), attr);
+
+ device_remove_file(&shost->shost_dev, dev_attr);
+ }
+ }
}
static void srp_remove_target(struct srp_target_port *target)
@@ -1119,10 +1059,6 @@ static void srp_remove_target(struct srp_target_port *target)
}
cancel_work_sync(&target->tl_err_work);
srp_rport_put(target->rport);
- for (i = 0; i < target->ch_count; i++) {
- ch = &target->ch[i];
- srp_free_req_data(target, ch);
- }
kfree(target->ch);
target->ch = NULL;
@@ -1272,11 +1208,6 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd,
if (req->nmdesc)
srp_fr_pool_put(ch->fr_pool, req->fr_list,
req->nmdesc);
- } else if (dev->use_fmr) {
- struct ib_pool_fmr **pfmr;
-
- for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++)
- ib_fmr_pool_unmap(*pfmr);
}
ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd),
@@ -1342,26 +1273,35 @@ static void srp_finish_req(struct srp_rdma_ch *ch, struct srp_request *req,
if (scmnd) {
srp_free_req(ch, req, scmnd, 0);
scmnd->result = result;
- scmnd->scsi_done(scmnd);
+ scsi_done(scmnd);
}
}
-static void srp_terminate_io(struct srp_rport *rport)
+struct srp_terminate_context {
+ struct srp_target_port *srp_target;
+ int scsi_result;
+};
+
+static bool srp_terminate_cmd(struct scsi_cmnd *scmnd, void *context_ptr)
{
- struct srp_target_port *target = rport->lld_data;
- struct srp_rdma_ch *ch;
- int i, j;
+ struct srp_terminate_context *context = context_ptr;
+ struct srp_target_port *target = context->srp_target;
+ u32 tag = blk_mq_unique_tag(scsi_cmd_to_rq(scmnd));
+ struct srp_rdma_ch *ch = &target->ch[blk_mq_unique_tag_to_hwq(tag)];
+ struct srp_request *req = scsi_cmd_priv(scmnd);
- for (i = 0; i < target->ch_count; i++) {
- ch = &target->ch[i];
+ srp_finish_req(ch, req, NULL, context->scsi_result);
- for (j = 0; j < target->req_ring_size; ++j) {
- struct srp_request *req = &ch->req_ring[j];
+ return true;
+}
- srp_finish_req(ch, req, NULL,
- DID_TRANSPORT_FAILFAST << 16);
- }
- }
+static void srp_terminate_io(struct srp_rport *rport)
+{
+ struct srp_target_port *target = rport->lld_data;
+ struct srp_terminate_context context = { .srp_target = target,
+ .scsi_result = DID_TRANSPORT_FAILFAST << 16 };
+
+ scsi_host_busy_iter(target->scsi_host, srp_terminate_cmd, &context);
}
/* Calculate maximum initiator to target information unit length. */
@@ -1417,13 +1357,12 @@ static int srp_rport_reconnect(struct srp_rport *rport)
ch = &target->ch[i];
ret += srp_new_cm_id(ch);
}
- for (i = 0; i < target->ch_count; i++) {
- ch = &target->ch[i];
- for (j = 0; j < target->req_ring_size; ++j) {
- struct srp_request *req = &ch->req_ring[j];
+ {
+ struct srp_terminate_context context = {
+ .srp_target = target, .scsi_result = DID_RESET << 16};
- srp_finish_req(ch, req, NULL, DID_RESET << 16);
- }
+ scsi_host_busy_iter(target->scsi_host, srp_terminate_cmd,
+ &context);
}
for (i = 0; i < target->ch_count; i++) {
ch = &target->ch[i];
@@ -1472,50 +1411,6 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr,
state->ndesc++;
}
-static int srp_map_finish_fmr(struct srp_map_state *state,
- struct srp_rdma_ch *ch)
-{
- struct srp_target_port *target = ch->target;
- struct srp_device *dev = target->srp_host->srp_dev;
- struct ib_pool_fmr *fmr;
- u64 io_addr = 0;
-
- if (state->fmr.next >= state->fmr.end) {
- shost_printk(KERN_ERR, ch->target->scsi_host,
- PFX "Out of MRs (mr_per_cmd = %d)\n",
- ch->target->mr_per_cmd);
- return -ENOMEM;
- }
-
- WARN_ON_ONCE(!dev->use_fmr);
-
- if (state->npages == 0)
- return 0;
-
- if (state->npages == 1 && target->global_rkey) {
- srp_map_desc(state, state->base_dma_addr, state->dma_len,
- target->global_rkey);
- goto reset_state;
- }
-
- fmr = ib_fmr_pool_map_phys(ch->fmr_pool, state->pages,
- state->npages, io_addr);
- if (IS_ERR(fmr))
- return PTR_ERR(fmr);
-
- *state->fmr.next++ = fmr;
- state->nmdesc++;
-
- srp_map_desc(state, state->base_dma_addr & ~dev->mr_page_mask,
- state->dma_len, fmr->fmr->rkey);
-
-reset_state:
- state->npages = 0;
- state->dma_len = 0;
-
- return 0;
-}
-
static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc)
{
srp_handle_qp_err(cq, wc, "FAST REG");
@@ -1606,74 +1501,6 @@ static int srp_map_finish_fr(struct srp_map_state *state,
return n;
}
-static int srp_map_sg_entry(struct srp_map_state *state,
- struct srp_rdma_ch *ch,
- struct scatterlist *sg)
-{
- struct srp_target_port *target = ch->target;
- struct srp_device *dev = target->srp_host->srp_dev;
- dma_addr_t dma_addr = sg_dma_address(sg);
- unsigned int dma_len = sg_dma_len(sg);
- unsigned int len = 0;
- int ret;
-
- WARN_ON_ONCE(!dma_len);
-
- while (dma_len) {
- unsigned offset = dma_addr & ~dev->mr_page_mask;
-
- if (state->npages == dev->max_pages_per_mr ||
- (state->npages > 0 && offset != 0)) {
- ret = srp_map_finish_fmr(state, ch);
- if (ret)
- return ret;
- }
-
- len = min_t(unsigned int, dma_len, dev->mr_page_size - offset);
-
- if (!state->npages)
- state->base_dma_addr = dma_addr;
- state->pages[state->npages++] = dma_addr & dev->mr_page_mask;
- state->dma_len += len;
- dma_addr += len;
- dma_len -= len;
- }
-
- /*
- * If the end of the MR is not on a page boundary then we need to
- * close it out and start a new one -- we can only merge at page
- * boundaries.
- */
- ret = 0;
- if ((dma_addr & ~dev->mr_page_mask) != 0)
- ret = srp_map_finish_fmr(state, ch);
- return ret;
-}
-
-static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch,
- struct srp_request *req, struct scatterlist *scat,
- int count)
-{
- struct scatterlist *sg;
- int i, ret;
-
- state->pages = req->map_page;
- state->fmr.next = req->fmr_list;
- state->fmr.end = req->fmr_list + ch->target->mr_per_cmd;
-
- for_each_sg(scat, sg, count, i) {
- ret = srp_map_sg_entry(state, ch, sg);
- if (ret)
- return ret;
- }
-
- ret = srp_map_finish_fmr(state, ch);
- if (ret)
- return ret;
-
- return 0;
-}
-
static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch,
struct srp_request *req, struct scatterlist *scat,
int count)
@@ -1733,7 +1560,6 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
struct srp_device *dev = target->srp_host->srp_dev;
struct srp_map_state state;
struct srp_direct_buf idb_desc;
- u64 idb_pages[1];
struct scatterlist idb_sg[1];
int ret;
@@ -1756,14 +1582,6 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
if (ret < 0)
return ret;
WARN_ON_ONCE(ret < 1);
- } else if (dev->use_fmr) {
- state.pages = idb_pages;
- state.pages[0] = (req->indirect_dma_addr &
- dev->mr_page_mask);
- state.npages = 1;
- ret = srp_map_finish_fmr(&state, ch);
- if (ret < 0)
- return ret;
} else {
return -EINVAL;
}
@@ -1787,9 +1605,6 @@ static void srp_check_mapping(struct srp_map_state *state,
if (dev->use_fast_reg)
for (i = 0, pfr = req->fr_list; i < state->nmdesc; i++, pfr++)
mr_len += (*pfr)->mr->length;
- else if (dev->use_fmr)
- for (i = 0; i < state->nmdesc; i++)
- mr_len += be32_to_cpu(req->indirect_desc[i].len);
if (desc_len != scsi_bufflen(req->scmnd) ||
mr_len > scsi_bufflen(req->scmnd))
pr_err("Inconsistent: scsi len %d <> desc len %lld <> mr len %lld; ndesc %d; nmdesc = %d\n",
@@ -1904,8 +1719,6 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
state.desc = req->indirect_desc;
if (dev->use_fast_reg)
ret = srp_map_sg_fr(&state, ch, req, scat, count);
- else if (dev->use_fmr)
- ret = srp_map_sg_fmr(&state, ch, req, scat, count);
else
ret = srp_map_sg_dma(&state, ch, req, scat, count);
req->nmdesc = state.nmdesc;
@@ -2145,11 +1958,9 @@ static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp)
spin_unlock_irqrestore(&ch->lock, flags);
} else {
scmnd = scsi_host_find_tag(target->scsi_host, rsp->tag);
- if (scmnd && scmnd->host_scribble) {
- req = (void *)scmnd->host_scribble;
+ if (scmnd) {
+ req = scsi_cmd_priv(scmnd);
scmnd = srp_claim_req(ch, req, NULL, scmnd);
- } else {
- scmnd = NULL;
}
if (!scmnd) {
shost_printk(KERN_ERR, target->scsi_host,
@@ -2183,8 +1994,7 @@ static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp)
srp_free_req(ch, req, scmnd,
be32_to_cpu(rsp->req_lim_delta));
- scmnd->host_scribble = NULL;
- scmnd->scsi_done(scmnd);
+ scsi_done(scmnd);
}
}
@@ -2349,28 +2159,24 @@ static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc,
static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
{
+ struct request *rq = scsi_cmd_to_rq(scmnd);
struct srp_target_port *target = host_to_target(shost);
struct srp_rdma_ch *ch;
- struct srp_request *req;
+ struct srp_request *req = scsi_cmd_priv(scmnd);
struct srp_iu *iu;
struct srp_cmd *cmd;
struct ib_device *dev;
unsigned long flags;
u32 tag;
- u16 idx;
int len, ret;
scmnd->result = srp_chkready(target->rport);
if (unlikely(scmnd->result))
goto err;
- WARN_ON_ONCE(scmnd->request->tag < 0);
- tag = blk_mq_unique_tag(scmnd->request);
+ WARN_ON_ONCE(rq->tag < 0);
+ tag = blk_mq_unique_tag(rq);
ch = &target->ch[blk_mq_unique_tag_to_hwq(tag)];
- idx = blk_mq_unique_tag_to_tag(tag);
- WARN_ONCE(idx >= target->req_ring_size, "%s: tag %#x: idx %d >= %d\n",
- dev_name(&shost->shost_gendev), tag, idx,
- target->req_ring_size);
spin_lock_irqsave(&ch->lock, flags);
iu = __srp_get_tx_iu(ch, SRP_IU_CMD);
@@ -2379,13 +2185,10 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
if (!iu)
goto err;
- req = &ch->req_ring[idx];
dev = target->srp_host->srp_dev->dev;
ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_it_iu_len,
DMA_TO_DEVICE);
- scmnd->host_scribble = (void *) req;
-
cmd = iu->buf;
memset(cmd, 0, sizeof *cmd);
@@ -2414,7 +2217,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
* to reduce queue depth temporarily.
*/
scmnd->result = len == -ENOMEM ?
- DID_OK << 16 | QUEUE_FULL << 1 : DID_ERROR << 16;
+ DID_OK << 16 | SAM_STAT_TASK_SET_FULL : DID_ERROR << 16;
goto err_iu;
}
@@ -2443,7 +2246,7 @@ err_iu:
err:
if (scmnd->result) {
- scmnd->scsi_done(scmnd);
+ scsi_done(scmnd);
ret = 0;
} else {
ret = SCSI_MLQUEUE_HOST_BUSY;
@@ -2986,7 +2789,7 @@ static int srp_send_tsk_mgmt(struct srp_rdma_ch *ch, u64 req_tag, u64 lun,
static int srp_abort(struct scsi_cmnd *scmnd)
{
struct srp_target_port *target = host_to_target(scmnd->device->host);
- struct srp_request *req = (struct srp_request *) scmnd->host_scribble;
+ struct srp_request *req = scsi_cmd_priv(scmnd);
u32 tag;
u16 ch_idx;
struct srp_rdma_ch *ch;
@@ -2994,9 +2797,7 @@ static int srp_abort(struct scsi_cmnd *scmnd)
shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n");
- if (!req)
- return SUCCESS;
- tag = blk_mq_unique_tag(scmnd->request);
+ tag = blk_mq_unique_tag(scsi_cmd_to_rq(scmnd));
ch_idx = blk_mq_unique_tag_to_hwq(tag);
if (WARN_ON_ONCE(ch_idx >= target->ch_count))
return SUCCESS;
@@ -3015,7 +2816,7 @@ static int srp_abort(struct scsi_cmnd *scmnd)
if (ret == SUCCESS) {
srp_free_req(ch, req, scmnd, 0);
scmnd->result = DID_ABORT << 16;
- scmnd->scsi_done(scmnd);
+ scsi_done(scmnd);
}
return ret;
@@ -3073,52 +2874,63 @@ static int srp_slave_configure(struct scsi_device *sdev)
return 0;
}
-static ssize_t show_id_ext(struct device *dev, struct device_attribute *attr,
+static ssize_t id_ext_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
- return sprintf(buf, "0x%016llx\n", be64_to_cpu(target->id_ext));
+ return sysfs_emit(buf, "0x%016llx\n", be64_to_cpu(target->id_ext));
}
-static ssize_t show_ioc_guid(struct device *dev, struct device_attribute *attr,
+static DEVICE_ATTR_RO(id_ext);
+
+static ssize_t ioc_guid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
- return sprintf(buf, "0x%016llx\n", be64_to_cpu(target->ioc_guid));
+ return sysfs_emit(buf, "0x%016llx\n", be64_to_cpu(target->ioc_guid));
}
-static ssize_t show_service_id(struct device *dev,
+static DEVICE_ATTR_RO(ioc_guid);
+
+static ssize_t service_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
if (target->using_rdma_cm)
return -ENOENT;
- return sprintf(buf, "0x%016llx\n",
- be64_to_cpu(target->ib_cm.service_id));
+ return sysfs_emit(buf, "0x%016llx\n",
+ be64_to_cpu(target->ib_cm.service_id));
}
-static ssize_t show_pkey(struct device *dev, struct device_attribute *attr,
+static DEVICE_ATTR_RO(service_id);
+
+static ssize_t pkey_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
if (target->using_rdma_cm)
return -ENOENT;
- return sprintf(buf, "0x%04x\n", be16_to_cpu(target->ib_cm.pkey));
+
+ return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(target->ib_cm.pkey));
}
-static ssize_t show_sgid(struct device *dev, struct device_attribute *attr,
+static DEVICE_ATTR_RO(pkey);
+
+static ssize_t sgid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
- return sprintf(buf, "%pI6\n", target->sgid.raw);
+ return sysfs_emit(buf, "%pI6\n", target->sgid.raw);
}
-static ssize_t show_dgid(struct device *dev, struct device_attribute *attr,
+static DEVICE_ATTR_RO(sgid);
+
+static ssize_t dgid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
@@ -3126,21 +2938,27 @@ static ssize_t show_dgid(struct device *dev, struct device_attribute *attr,
if (target->using_rdma_cm)
return -ENOENT;
- return sprintf(buf, "%pI6\n", ch->ib_cm.path.dgid.raw);
+
+ return sysfs_emit(buf, "%pI6\n", ch->ib_cm.path.dgid.raw);
}
-static ssize_t show_orig_dgid(struct device *dev,
- struct device_attribute *attr, char *buf)
+static DEVICE_ATTR_RO(dgid);
+
+static ssize_t orig_dgid_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
if (target->using_rdma_cm)
return -ENOENT;
- return sprintf(buf, "%pI6\n", target->ib_cm.orig_dgid.raw);
+
+ return sysfs_emit(buf, "%pI6\n", target->ib_cm.orig_dgid.raw);
}
-static ssize_t show_req_lim(struct device *dev,
- struct device_attribute *attr, char *buf)
+static DEVICE_ATTR_RO(orig_dgid);
+
+static ssize_t req_lim_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
struct srp_rdma_ch *ch;
@@ -3150,111 +2968,115 @@ static ssize_t show_req_lim(struct device *dev,
ch = &target->ch[i];
req_lim = min(req_lim, ch->req_lim);
}
- return sprintf(buf, "%d\n", req_lim);
+
+ return sysfs_emit(buf, "%d\n", req_lim);
}
-static ssize_t show_zero_req_lim(struct device *dev,
+static DEVICE_ATTR_RO(req_lim);
+
+static ssize_t zero_req_lim_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
- return sprintf(buf, "%d\n", target->zero_req_lim);
+ return sysfs_emit(buf, "%d\n", target->zero_req_lim);
}
-static ssize_t show_local_ib_port(struct device *dev,
+static DEVICE_ATTR_RO(zero_req_lim);
+
+static ssize_t local_ib_port_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
- return sprintf(buf, "%d\n", target->srp_host->port);
+ return sysfs_emit(buf, "%u\n", target->srp_host->port);
}
-static ssize_t show_local_ib_device(struct device *dev,
+static DEVICE_ATTR_RO(local_ib_port);
+
+static ssize_t local_ib_device_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
- return sprintf(buf, "%s\n",
- dev_name(&target->srp_host->srp_dev->dev->dev));
+ return sysfs_emit(buf, "%s\n",
+ dev_name(&target->srp_host->srp_dev->dev->dev));
}
-static ssize_t show_ch_count(struct device *dev, struct device_attribute *attr,
+static DEVICE_ATTR_RO(local_ib_device);
+
+static ssize_t ch_count_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
- return sprintf(buf, "%d\n", target->ch_count);
+ return sysfs_emit(buf, "%d\n", target->ch_count);
}
-static ssize_t show_comp_vector(struct device *dev,
+static DEVICE_ATTR_RO(ch_count);
+
+static ssize_t comp_vector_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
- return sprintf(buf, "%d\n", target->comp_vector);
+ return sysfs_emit(buf, "%d\n", target->comp_vector);
}
-static ssize_t show_tl_retry_count(struct device *dev,
+static DEVICE_ATTR_RO(comp_vector);
+
+static ssize_t tl_retry_count_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
- return sprintf(buf, "%d\n", target->tl_retry_count);
+ return sysfs_emit(buf, "%d\n", target->tl_retry_count);
}
-static ssize_t show_cmd_sg_entries(struct device *dev,
+static DEVICE_ATTR_RO(tl_retry_count);
+
+static ssize_t cmd_sg_entries_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
- return sprintf(buf, "%u\n", target->cmd_sg_cnt);
+ return sysfs_emit(buf, "%u\n", target->cmd_sg_cnt);
}
-static ssize_t show_allow_ext_sg(struct device *dev,
+static DEVICE_ATTR_RO(cmd_sg_entries);
+
+static ssize_t allow_ext_sg_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct srp_target_port *target = host_to_target(class_to_shost(dev));
- return sprintf(buf, "%s\n", target->allow_ext_sg ? "true" : "false");
+ return sysfs_emit(buf, "%s\n", target->allow_ext_sg ? "true" : "false");
}
-static DEVICE_ATTR(id_ext, S_IRUGO, show_id_ext, NULL);
-static DEVICE_ATTR(ioc_guid, S_IRUGO, show_ioc_guid, NULL);
-static DEVICE_ATTR(service_id, S_IRUGO, show_service_id, NULL);
-static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
-static DEVICE_ATTR(sgid, S_IRUGO, show_sgid, NULL);
-static DEVICE_ATTR(dgid, S_IRUGO, show_dgid, NULL);
-static DEVICE_ATTR(orig_dgid, S_IRUGO, show_orig_dgid, NULL);
-static DEVICE_ATTR(req_lim, S_IRUGO, show_req_lim, NULL);
-static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL);
-static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL);
-static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL);
-static DEVICE_ATTR(ch_count, S_IRUGO, show_ch_count, NULL);
-static DEVICE_ATTR(comp_vector, S_IRUGO, show_comp_vector, NULL);
-static DEVICE_ATTR(tl_retry_count, S_IRUGO, show_tl_retry_count, NULL);
-static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL);
-static DEVICE_ATTR(allow_ext_sg, S_IRUGO, show_allow_ext_sg, NULL);
-
-static struct device_attribute *srp_host_attrs[] = {
- &dev_attr_id_ext,
- &dev_attr_ioc_guid,
- &dev_attr_service_id,
- &dev_attr_pkey,
- &dev_attr_sgid,
- &dev_attr_dgid,
- &dev_attr_orig_dgid,
- &dev_attr_req_lim,
- &dev_attr_zero_req_lim,
- &dev_attr_local_ib_port,
- &dev_attr_local_ib_device,
- &dev_attr_ch_count,
- &dev_attr_comp_vector,
- &dev_attr_tl_retry_count,
- &dev_attr_cmd_sg_entries,
- &dev_attr_allow_ext_sg,
+static DEVICE_ATTR_RO(allow_ext_sg);
+
+static struct attribute *srp_host_attrs[] = {
+ &dev_attr_id_ext.attr,
+ &dev_attr_ioc_guid.attr,
+ &dev_attr_service_id.attr,
+ &dev_attr_pkey.attr,
+ &dev_attr_sgid.attr,
+ &dev_attr_dgid.attr,
+ &dev_attr_orig_dgid.attr,
+ &dev_attr_req_lim.attr,
+ &dev_attr_zero_req_lim.attr,
+ &dev_attr_local_ib_port.attr,
+ &dev_attr_local_ib_device.attr,
+ &dev_attr_ch_count.attr,
+ &dev_attr_comp_vector.attr,
+ &dev_attr_tl_retry_count.attr,
+ &dev_attr_cmd_sg_entries.attr,
+ &dev_attr_allow_ext_sg.attr,
NULL
};
+ATTRIBUTE_GROUPS(srp_host);
+
static struct scsi_host_template srp_template = {
.module = THIS_MODULE,
.name = "InfiniBand SRP initiator",
@@ -3262,6 +3084,8 @@ static struct scsi_host_template srp_template = {
.target_alloc = srp_target_alloc,
.slave_configure = srp_slave_configure,
.info = srp_target_info,
+ .init_cmd_priv = srp_init_cmd_priv,
+ .exit_cmd_priv = srp_exit_cmd_priv,
.queuecommand = srp_queuecommand,
.change_queue_depth = srp_change_queue_depth,
.eh_timed_out = srp_timed_out,
@@ -3273,8 +3097,9 @@ static struct scsi_host_template srp_template = {
.can_queue = SRP_DEFAULT_CMD_SQ_SIZE,
.this_id = -1,
.cmd_per_lun = SRP_DEFAULT_CMD_SQ_SIZE,
- .shost_attrs = srp_host_attrs,
+ .shost_groups = srp_host_groups,
.track_queue_depth = 1,
+ .cmd_size = sizeof(struct srp_request),
};
static int srp_sdev_count(struct Scsi_Host *host)
@@ -3352,11 +3177,16 @@ static void srp_release_dev(struct device *dev)
struct srp_host *host =
container_of(dev, struct srp_host, dev);
- complete(&host->released);
+ kfree(host);
}
+static struct attribute *srp_class_attrs[];
+
+ATTRIBUTE_GROUPS(srp_class);
+
static struct class srp_class = {
.name = "infiniband_srp",
+ .dev_groups = srp_class_groups,
.dev_release = srp_release_dev
};
@@ -3424,6 +3254,7 @@ enum {
SRP_OPT_IP_DEST = 1 << 16,
SRP_OPT_TARGET_CAN_QUEUE= 1 << 17,
SRP_OPT_MAX_IT_IU_SIZE = 1 << 18,
+ SRP_OPT_CH_COUNT = 1 << 19,
};
static unsigned int srp_opt_mandatory[] = {
@@ -3457,6 +3288,7 @@ static const match_table_t srp_opt_tokens = {
{ SRP_OPT_IP_SRC, "src=%s" },
{ SRP_OPT_IP_DEST, "dest=%s" },
{ SRP_OPT_MAX_IT_IU_SIZE, "max_it_iu_size=%d" },
+ { SRP_OPT_CH_COUNT, "ch_count=%u", },
{ SRP_OPT_ERR, NULL }
};
@@ -3758,6 +3590,14 @@ static int srp_parse_options(struct net *net, const char *buf,
target->max_it_iu_size = token;
break;
+ case SRP_OPT_CH_COUNT:
+ if (match_int(args, &token) || token < 1) {
+ pr_warn("bad channel count %s\n", p);
+ goto out;
+ }
+ target->ch_count = token;
+ break;
+
default:
pr_warn("unknown parameter or missing value '%s' in target creation request\n",
p);
@@ -3785,9 +3625,9 @@ out:
return ret;
}
-static ssize_t srp_create_target(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t add_target_store(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
{
struct srp_host *host =
container_of(dev, struct srp_host, dev);
@@ -3796,7 +3636,7 @@ static ssize_t srp_create_target(struct device *dev,
struct srp_rdma_ch *ch;
struct srp_device *srp_dev = host->srp_dev;
struct ib_device *ibdev = srp_dev->dev;
- int ret, node_idx, node, cpu, i;
+ int ret, i, ch_idx;
unsigned int max_sectors_per_mr, mr_per_cmd = 0;
bool multich = false;
uint32_t max_iu_len;
@@ -3813,7 +3653,7 @@ static ssize_t srp_create_target(struct device *dev,
target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb;
target_host->max_segment_size = ib_dma_max_seg_size(ibdev);
- if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG))
+ if (!(ibdev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG))
target_host->virt_boundary_mask = ~srp_dev->mr_page_mask;
target = host_to_target(target_host);
@@ -3844,8 +3684,6 @@ static ssize_t srp_create_target(struct device *dev,
if (ret)
goto out;
- target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE;
-
if (!srp_conn_unique(target->srp_host, target)) {
if (target->using_rdma_cm) {
shost_printk(KERN_INFO, target->scsi_host,
@@ -3864,26 +3702,26 @@ static ssize_t srp_create_target(struct device *dev,
goto out;
}
- if (!srp_dev->has_fmr && !srp_dev->has_fr && !target->allow_ext_sg &&
+ if (!srp_dev->has_fr && !target->allow_ext_sg &&
target->cmd_sg_cnt < target->sg_tablesize) {
pr_warn("No MR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n");
target->sg_tablesize = target->cmd_sg_cnt;
}
- if (srp_dev->use_fast_reg || srp_dev->use_fmr) {
- bool gaps_reg = (ibdev->attrs.device_cap_flags &
- IB_DEVICE_SG_GAPS_REG);
+ if (srp_dev->use_fast_reg) {
+ bool gaps_reg = ibdev->attrs.kernel_cap_flags &
+ IBK_SG_GAPS_REG;
max_sectors_per_mr = srp_dev->max_pages_per_mr <<
(ilog2(srp_dev->mr_page_size) - 9);
if (!gaps_reg) {
/*
- * FR and FMR can only map one HCA page per entry. If
- * the start address is not aligned on a HCA page
- * boundary two entries will be used for the head and
- * the tail although these two entries combined
- * contain at most one HCA page of data. Hence the "+
- * 1" in the calculation below.
+ * FR can only map one HCA page per entry. If the start
+ * address is not aligned on a HCA page boundary two
+ * entries will be used for the head and the tail
+ * although these two entries combined contain at most
+ * one HCA page of data. Hence the "+ 1" in the
+ * calculation below.
*
* The indirect data buffer descriptor is contiguous
* so the memory for that buffer will only be
@@ -3921,79 +3759,56 @@ static ssize_t srp_create_target(struct device *dev,
goto out;
ret = -ENOMEM;
- target->ch_count = max_t(unsigned, num_online_nodes(),
- min(ch_count ? :
- min(4 * num_online_nodes(),
- ibdev->num_comp_vectors),
- num_online_cpus()));
+ if (target->ch_count == 0) {
+ target->ch_count =
+ min(ch_count ?:
+ max(4 * num_online_nodes(),
+ ibdev->num_comp_vectors),
+ num_online_cpus());
+ }
+
target->ch = kcalloc(target->ch_count, sizeof(*target->ch),
GFP_KERNEL);
if (!target->ch)
goto out;
- node_idx = 0;
- for_each_online_node(node) {
- const int ch_start = (node_idx * target->ch_count /
- num_online_nodes());
- const int ch_end = ((node_idx + 1) * target->ch_count /
- num_online_nodes());
- const int cv_start = node_idx * ibdev->num_comp_vectors /
- num_online_nodes();
- const int cv_end = (node_idx + 1) * ibdev->num_comp_vectors /
- num_online_nodes();
- int cpu_idx = 0;
-
- for_each_online_cpu(cpu) {
- if (cpu_to_node(cpu) != node)
- continue;
- if (ch_start + cpu_idx >= ch_end)
- continue;
- ch = &target->ch[ch_start + cpu_idx];
- ch->target = target;
- ch->comp_vector = cv_start == cv_end ? cv_start :
- cv_start + cpu_idx % (cv_end - cv_start);
- spin_lock_init(&ch->lock);
- INIT_LIST_HEAD(&ch->free_tx);
- ret = srp_new_cm_id(ch);
- if (ret)
- goto err_disconnect;
+ for (ch_idx = 0; ch_idx < target->ch_count; ++ch_idx) {
+ ch = &target->ch[ch_idx];
+ ch->target = target;
+ ch->comp_vector = ch_idx % ibdev->num_comp_vectors;
+ spin_lock_init(&ch->lock);
+ INIT_LIST_HEAD(&ch->free_tx);
+ ret = srp_new_cm_id(ch);
+ if (ret)
+ goto err_disconnect;
- ret = srp_create_ch_ib(ch);
- if (ret)
- goto err_disconnect;
+ ret = srp_create_ch_ib(ch);
+ if (ret)
+ goto err_disconnect;
- ret = srp_alloc_req_data(ch);
- if (ret)
- goto err_disconnect;
+ ret = srp_connect_ch(ch, max_iu_len, multich);
+ if (ret) {
+ char dst[64];
- ret = srp_connect_ch(ch, max_iu_len, multich);
- if (ret) {
- char dst[64];
-
- if (target->using_rdma_cm)
- snprintf(dst, sizeof(dst), "%pIS",
- &target->rdma_cm.dst);
- else
- snprintf(dst, sizeof(dst), "%pI6",
- target->ib_cm.orig_dgid.raw);
- shost_printk(KERN_ERR, target->scsi_host,
- PFX "Connection %d/%d to %s failed\n",
- ch_start + cpu_idx,
- target->ch_count, dst);
- if (node_idx == 0 && cpu_idx == 0) {
- goto free_ch;
- } else {
- srp_free_ch_ib(target, ch);
- srp_free_req_data(target, ch);
- target->ch_count = ch - target->ch;
- goto connected;
- }
+ if (target->using_rdma_cm)
+ snprintf(dst, sizeof(dst), "%pIS",
+ &target->rdma_cm.dst);
+ else
+ snprintf(dst, sizeof(dst), "%pI6",
+ target->ib_cm.orig_dgid.raw);
+ shost_printk(KERN_ERR, target->scsi_host,
+ PFX "Connection %d/%d to %s failed\n",
+ ch_idx,
+ target->ch_count, dst);
+ if (ch_idx == 0) {
+ goto free_ch;
+ } else {
+ srp_free_ch_ib(target, ch);
+ target->ch_count = ch - target->ch;
+ goto connected;
}
-
- multich = true;
- cpu_idx++;
}
- node_idx++;
+ multich = true;
}
connected:
@@ -4049,36 +3864,42 @@ free_ch:
for (i = 0; i < target->ch_count; i++) {
ch = &target->ch[i];
srp_free_ch_ib(target, ch);
- srp_free_req_data(target, ch);
}
kfree(target->ch);
goto out;
}
-static DEVICE_ATTR(add_target, S_IWUSR, NULL, srp_create_target);
+static DEVICE_ATTR_WO(add_target);
-static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct srp_host *host = container_of(dev, struct srp_host, dev);
- return sprintf(buf, "%s\n", dev_name(&host->srp_dev->dev->dev));
+ return sysfs_emit(buf, "%s\n", dev_name(&host->srp_dev->dev->dev));
}
-static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+static DEVICE_ATTR_RO(ibdev);
-static ssize_t show_port(struct device *dev, struct device_attribute *attr,
+static ssize_t port_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct srp_host *host = container_of(dev, struct srp_host, dev);
- return sprintf(buf, "%d\n", host->port);
+ return sysfs_emit(buf, "%u\n", host->port);
}
-static DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+static DEVICE_ATTR_RO(port);
+
+static struct attribute *srp_class_attrs[] = {
+ &dev_attr_add_target.attr,
+ &dev_attr_ibdev.attr,
+ &dev_attr_port.attr,
+ NULL
+};
-static struct srp_host *srp_add_port(struct srp_device *device, u8 port)
+static struct srp_host *srp_add_port(struct srp_device *device, u32 port)
{
struct srp_host *host;
@@ -4088,33 +3909,24 @@ static struct srp_host *srp_add_port(struct srp_device *device, u8 port)
INIT_LIST_HEAD(&host->target_list);
spin_lock_init(&host->target_lock);
- init_completion(&host->released);
mutex_init(&host->add_target_mutex);
host->srp_dev = device;
host->port = port;
+ device_initialize(&host->dev);
host->dev.class = &srp_class;
host->dev.parent = device->dev->dev.parent;
- dev_set_name(&host->dev, "srp-%s-%d", dev_name(&device->dev->dev),
- port);
-
- if (device_register(&host->dev))
- goto free_host;
- if (device_create_file(&host->dev, &dev_attr_add_target))
- goto err_class;
- if (device_create_file(&host->dev, &dev_attr_ibdev))
- goto err_class;
- if (device_create_file(&host->dev, &dev_attr_port))
- goto err_class;
+ if (dev_set_name(&host->dev, "srp-%s-%u", dev_name(&device->dev->dev),
+ port))
+ goto put_host;
+ if (device_add(&host->dev))
+ goto put_host;
return host;
-err_class:
- device_unregister(&host->dev);
-
-free_host:
- kfree(host);
-
+put_host:
+ device_del(&host->dev);
+ put_device(&host->dev);
return NULL;
}
@@ -4126,25 +3938,25 @@ static void srp_rename_dev(struct ib_device *device, void *client_data)
list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) {
char name[IB_DEVICE_NAME_MAX + 8];
- snprintf(name, sizeof(name), "srp-%s-%d",
+ snprintf(name, sizeof(name), "srp-%s-%u",
dev_name(&device->dev), host->port);
device_rename(&host->dev, name);
}
}
-static void srp_add_one(struct ib_device *device)
+static int srp_add_one(struct ib_device *device)
{
struct srp_device *srp_dev;
struct ib_device_attr *attr = &device->attrs;
struct srp_host *host;
int mr_page_shift;
- unsigned int p;
+ u32 p;
u64 max_pages_per_mr;
unsigned int flags = 0;
srp_dev = kzalloc(sizeof(*srp_dev), GFP_KERNEL);
if (!srp_dev)
- return;
+ return -ENOMEM;
/*
* Use the smallest page size supported by the HCA, down to a
@@ -4162,23 +3974,15 @@ static void srp_add_one(struct ib_device *device)
srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR,
max_pages_per_mr);
- srp_dev->has_fmr = (device->ops.alloc_fmr &&
- device->ops.dealloc_fmr &&
- device->ops.map_phys_fmr &&
- device->ops.unmap_fmr);
srp_dev->has_fr = (attr->device_cap_flags &
IB_DEVICE_MEM_MGT_EXTENSIONS);
- if (!never_register && !srp_dev->has_fmr && !srp_dev->has_fr) {
- dev_warn(&device->dev, "neither FMR nor FR is supported\n");
- } else if (!never_register &&
- attr->max_mr_size >= 2 * srp_dev->mr_page_size) {
- srp_dev->use_fast_reg = (srp_dev->has_fr &&
- (!srp_dev->has_fmr || prefer_fr));
- srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr;
- }
+ if (!never_register && !srp_dev->has_fr)
+ dev_warn(&device->dev, "FR is not supported\n");
+ else if (!never_register &&
+ attr->max_mr_size >= 2 * srp_dev->mr_page_size)
+ srp_dev->use_fast_reg = srp_dev->has_fr;
- if (never_register || !register_always ||
- (!srp_dev->has_fmr && !srp_dev->has_fr))
+ if (never_register || !register_always || !srp_dev->has_fr)
flags |= IB_PD_UNSAFE_GLOBAL_RKEY;
if (srp_dev->use_fast_reg) {
@@ -4197,8 +4001,12 @@ static void srp_add_one(struct ib_device *device)
srp_dev->dev = device;
srp_dev->pd = ib_alloc_pd(device, flags);
- if (IS_ERR(srp_dev->pd))
- goto free_dev;
+ if (IS_ERR(srp_dev->pd)) {
+ int ret = PTR_ERR(srp_dev->pd);
+
+ kfree(srp_dev);
+ return ret;
+ }
if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
srp_dev->global_rkey = srp_dev->pd->unsafe_global_rkey;
@@ -4212,10 +4020,7 @@ static void srp_add_one(struct ib_device *device)
}
ib_set_client_data(device, &srp_client, srp_dev);
- return;
-
-free_dev:
- kfree(srp_dev);
+ return 0;
}
static void srp_remove_one(struct ib_device *device, void *client_data)
@@ -4225,16 +4030,13 @@ static void srp_remove_one(struct ib_device *device, void *client_data)
struct srp_target_port *target;
srp_dev = client_data;
- if (!srp_dev)
- return;
list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) {
- device_unregister(&host->dev);
/*
- * Wait for the sysfs entry to go away, so that no new
- * target ports can be created.
+ * Remove the add_target sysfs entry so that no new target ports
+ * can be created.
*/
- wait_for_completion(&host->released);
+ device_del(&host->dev);
/*
* Remove all target ports.
@@ -4245,12 +4047,14 @@ static void srp_remove_one(struct ib_device *device, void *client_data)
spin_unlock(&host->target_lock);
/*
- * Wait for tl_err and target port removal tasks.
+ * srp_queue_remove_work() queues a call to
+ * srp_remove_target(). The latter function cancels
+ * target->tl_err_work so waiting for the remove works to
+ * finish is sufficient.
*/
- flush_workqueue(system_long_wq);
flush_workqueue(srp_remove_wq);
- kfree(host);
+ put_device(&host->dev);
}
ib_dealloc_pd(srp_dev->pd);
@@ -4273,10 +4077,13 @@ static int __init srp_init_module(void)
{
int ret;
+ BUILD_BUG_ON(sizeof(struct srp_aer_req) != 36);
+ BUILD_BUG_ON(sizeof(struct srp_cmd) != 48);
BUILD_BUG_ON(sizeof(struct srp_imm_buf) != 4);
+ BUILD_BUG_ON(sizeof(struct srp_indirect_buf) != 20);
BUILD_BUG_ON(sizeof(struct srp_login_req) != 64);
BUILD_BUG_ON(sizeof(struct srp_login_req_rdma) != 56);
- BUILD_BUG_ON(sizeof(struct srp_cmd) != 48);
+ BUILD_BUG_ON(sizeof(struct srp_rsp) != 36);
if (srp_sg_tablesize) {
pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n");
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 5359ece561ca..00b0068fda20 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -44,7 +44,6 @@
#include <rdma/ib_verbs.h>
#include <rdma/ib_sa.h>
#include <rdma/ib_cm.h>
-#include <rdma/ib_fmr_pool.h>
#include <rdma/rdma_cm.h>
enum {
@@ -93,10 +92,12 @@ enum srp_iu_type {
};
/*
+ * RDMA adapter in the initiator system.
+ *
+ * @dev_list: List of RDMA ports associated with this RDMA adapter (srp_host).
* @mr_page_mask: HCA memory registration page mask.
* @mr_page_size: HCA memory registration page size.
- * @mr_max_size: Maximum size in bytes of a single FMR / FR registration
- * request.
+ * @mr_max_size: Maximum size in bytes of a single FR registration request.
*/
struct srp_device {
struct list_head dev_list;
@@ -107,19 +108,22 @@ struct srp_device {
int mr_page_size;
int mr_max_size;
int max_pages_per_mr;
- bool has_fmr;
bool has_fr;
- bool use_fmr;
bool use_fast_reg;
};
+/*
+ * One port of an RDMA adapter in the initiator system.
+ *
+ * @target_list: List of connected target ports (struct srp_target_port).
+ * @target_lock: Protects @target_list.
+ */
struct srp_host {
struct srp_device *srp_dev;
- u8 port;
+ u32 port;
struct device dev;
struct list_head target_list;
spinlock_t target_lock;
- struct completion released;
struct list_head list;
struct mutex add_target_mutex;
};
@@ -127,11 +131,7 @@ struct srp_host {
struct srp_request {
struct scsi_cmnd *scmnd;
struct srp_iu *cmd;
- union {
- struct ib_pool_fmr **fmr_list;
- struct srp_fr_desc **fr_list;
- };
- u64 *map_page;
+ struct srp_fr_desc **fr_list;
struct srp_direct_buf *indirect_desc;
dma_addr_t indirect_dma_addr;
short nmdesc;
@@ -155,10 +155,7 @@ struct srp_rdma_ch {
struct ib_cq *send_cq;
struct ib_cq *recv_cq;
struct ib_qp *qp;
- union {
- struct ib_fmr_pool *fmr_pool;
- struct srp_fr_pool *fr_pool;
- };
+ struct srp_fr_pool *fr_pool;
uint32_t max_it_iu_len;
uint32_t max_ti_iu_len;
u8 max_imm_sge;
@@ -185,7 +182,6 @@ struct srp_rdma_ch {
struct srp_iu **tx_ring;
struct srp_iu **rx_ring;
- struct srp_request *req_ring;
int comp_vector;
u64 tsk_mgmt_tag;
@@ -195,7 +191,7 @@ struct srp_rdma_ch {
};
/**
- * struct srp_target_port
+ * struct srp_target_port - RDMA port in the SRP target system
* @comp_vector: Completion vector used by the first RDMA channel created for
* this target port.
*/
@@ -231,7 +227,6 @@ struct srp_target_port {
int mr_pool_size;
int mr_per_cmd;
int queue_size;
- int req_ring_size;
int comp_vector;
int tl_retry_count;
@@ -309,7 +304,7 @@ struct srp_fr_pool {
int max_page_list_len;
spinlock_t lock;
struct list_head free_list;
- struct srp_fr_desc desc[0];
+ struct srp_fr_desc desc[];
};
/**
@@ -319,20 +314,16 @@ struct srp_fr_pool {
* @pages: Array with DMA addresses of pages being considered for
* memory registration.
* @base_dma_addr: DMA address of the first page that has not yet been mapped.
- * @dma_len: Number of bytes that will be registered with the next
- * FMR or FR memory registration call.
+ * @dma_len: Number of bytes that will be registered with the next FR
+ * memory registration call.
* @total_len: Total number of bytes in the sg-list being mapped.
* @npages: Number of page addresses in the pages[] array.
- * @nmdesc: Number of FMR or FR memory descriptors used for mapping.
+ * @nmdesc: Number of FR memory descriptors used for mapping.
* @ndesc: Number of SRP buffer descriptors that have been filled in.
*/
struct srp_map_state {
union {
struct {
- struct ib_pool_fmr **next;
- struct ib_pool_fmr **end;
- } fmr;
- struct {
struct srp_fr_desc **next;
struct srp_fr_desc **end;
} fr;
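The srp_fr_pool change above converts the zero-length desc[0] array into a C99 flexible array member. A brief illustrative sketch of how such a structure is declared and allocated follows; the pool fields and the alloc helper are simplified stand-ins (the real srp_fr_pool allocation site is not shown in this excerpt), and struct_size() is the same overflow-checked helper that the srpt_add_one() hunk further down uses for struct srpt_device.

#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/types.h>

struct example_desc {
	u32 rkey;
};

struct example_pool {
	int size;
	struct example_desc desc[];	/* flexible array member; must be last */
};

static struct example_pool *example_pool_alloc(int pool_size)
{
	struct example_pool *pool;

	/*
	 * struct_size(pool, desc, pool_size) evaluates to
	 * sizeof(*pool) + pool_size * sizeof(pool->desc[0])
	 * with integer-overflow checking.
	 */
	pool = kzalloc(struct_size(pool, desc, pool_size), GFP_KERNEL);
	if (!pool)
		return NULL;
	pool->size = pool_size;
	return pool;
}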
diff --git a/drivers/infiniband/ulp/srpt/Kconfig b/drivers/infiniband/ulp/srpt/Kconfig
index ce7567cea9f6..4b5d9b792cfa 100644
--- a/drivers/infiniband/ulp/srpt/Kconfig
+++ b/drivers/infiniband/ulp/srpt/Kconfig
@@ -2,7 +2,7 @@
config INFINIBAND_SRPT
tristate "InfiniBand SCSI RDMA Protocol target support"
depends on INFINIBAND && INFINIBAND_ADDR_TRANS && TARGET_CORE
- ---help---
+ help
Support for the SCSI RDMA Protocol (SRP) Target driver. The
SRP protocol is a protocol that allows an initiator to access
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 98552749d71c..3c3fae738c3e 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -81,7 +81,7 @@ MODULE_PARM_DESC(srpt_srq_size,
static int srpt_get_u64_x(char *buffer, const struct kernel_param *kp)
{
- return sprintf(buffer, "0x%016llx", *(u64 *)kp->arg);
+ return sprintf(buffer, "0x%016llx\n", *(u64 *)kp->arg);
}
module_param_call(srpt_service_guid, NULL, srpt_get_u64_x, &srpt_service_guid,
0444);
@@ -135,14 +135,11 @@ static bool srpt_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state new)
static void srpt_event_handler(struct ib_event_handler *handler,
struct ib_event *event)
{
- struct srpt_device *sdev;
+ struct srpt_device *sdev =
+ container_of(handler, struct srpt_device, event_handler);
struct srpt_port *sport;
u8 port_num;
- sdev = ib_get_client_data(event->device, &srpt_client);
- if (!sdev || sdev->device != event->device)
- return;
-
pr_debug("ASYNC event= %d on device= %s\n", event->event,
dev_name(&sdev->device->dev));
@@ -217,8 +214,9 @@ static const char *get_ch_state_name(enum rdma_ch_state s)
*/
static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch)
{
- pr_debug("QP event %d on ch=%p sess_name=%s state=%d\n",
- event->event, ch, ch->sess_name, ch->state);
+ pr_debug("QP event %d on ch=%p sess_name=%s-%d state=%s\n",
+ event->event, ch, ch->sess_name, ch->qp->qp_num,
+ get_ch_state_name(ch->state));
switch (event->event) {
case IB_EVENT_COMM_EST:
@@ -567,12 +565,9 @@ static int srpt_refresh_port(struct srpt_port *sport)
if (ret)
return ret;
- sport->port_guid_id.wwn.priv = sport;
- srpt_format_guid(sport->port_guid_id.name,
- sizeof(sport->port_guid_id.name),
+ srpt_format_guid(sport->guid_name, ARRAY_SIZE(sport->guid_name),
&sport->gid.global.interface_id);
- sport->port_gid_id.wwn.priv = sport;
- snprintf(sport->port_gid_id.name, sizeof(sport->port_gid_id.name),
+ snprintf(sport->gid_name, ARRAY_SIZE(sport->gid_name),
"0x%016llx%016llx",
be64_to_cpu(sport->gid.global.subnet_prefix),
be64_to_cpu(sport->gid.global.interface_id));
@@ -610,6 +605,11 @@ static int srpt_refresh_port(struct srpt_port *sport)
dev_name(&sport->sdev->device->dev), sport->port,
PTR_ERR(sport->mad_agent));
sport->mad_agent = NULL;
+ memset(&port_modify, 0, sizeof(port_modify));
+ port_modify.clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP;
+ ib_modify_port(sport->sdev->device, sport->port, 0,
+ &port_modify);
+
}
}
@@ -619,10 +619,11 @@ static int srpt_refresh_port(struct srpt_port *sport)
/**
* srpt_unregister_mad_agent - unregister MAD callback functions
* @sdev: SRPT HCA pointer.
+ * @port_cnt: number of ports with registered MAD
*
* Note: It is safe to call this function more than once for the same device.
*/
-static void srpt_unregister_mad_agent(struct srpt_device *sdev)
+static void srpt_unregister_mad_agent(struct srpt_device *sdev, int port_cnt)
{
struct ib_port_modify port_modify = {
.clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP,
@@ -630,12 +631,11 @@ static void srpt_unregister_mad_agent(struct srpt_device *sdev)
struct srpt_port *sport;
int i;
- for (i = 1; i <= sdev->device->phys_port_cnt; i++) {
+ for (i = 1; i <= port_cnt; i++) {
sport = &sdev->port[i - 1];
WARN_ON(sport->port != i);
- if (ib_modify_port(sdev->device, i, 0, &port_modify) < 0)
- pr_err("disabling MAD processing failed.\n");
if (sport->mad_agent) {
+ ib_modify_port(sdev->device, i, 0, &port_modify);
ib_unregister_mad_agent(sport->mad_agent);
sport->mad_agent = NULL;
}
@@ -867,7 +867,7 @@ static int srpt_zerolength_write(struct srpt_rdma_ch *ch)
static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct srpt_rdma_ch *ch = cq->cq_context;
+ struct srpt_rdma_ch *ch = wc->qp->qp_context;
pr_debug("%s-%d wc->status %d\n", ch->sess_name, ch->qp->qp_num,
wc->status);
@@ -1320,7 +1320,7 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx)
*/
static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct srpt_rdma_ch *ch = cq->cq_context;
+ struct srpt_rdma_ch *ch = wc->qp->qp_context;
struct srpt_send_ioctx *ioctx =
container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe);
@@ -1421,7 +1421,7 @@ static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch,
srp_rsp->flags |= SRP_RSP_FLAG_SNSVALID;
srp_rsp->sense_data_len = cpu_to_be32(sense_data_len);
- memcpy(srp_rsp + 1, sense_data, sense_data_len);
+ memcpy(srp_rsp->data, sense_data, sense_data_len);
}
return sizeof(*srp_rsp) + sense_data_len;
@@ -1525,16 +1525,20 @@ static void srpt_handle_cmd(struct srpt_rdma_ch *ch,
goto busy;
}
- rc = target_submit_cmd_map_sgls(cmd, ch->sess, srp_cmd->cdb,
- &send_ioctx->sense_data[0],
- scsilun_to_int(&srp_cmd->lun), data_len,
- TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF,
- sg, sg_cnt, NULL, 0, NULL, 0);
+ rc = target_init_cmd(cmd, ch->sess, &send_ioctx->sense_data[0],
+ scsilun_to_int(&srp_cmd->lun), data_len,
+ TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF);
if (rc != 0) {
pr_debug("target_submit_cmd() returned %d for tag %#llx\n", rc,
srp_cmd->tag);
goto busy;
}
+
+ if (target_submit_prep(cmd, srp_cmd->cdb, sg, sg_cnt, NULL, 0, NULL, 0,
+ GFP_KERNEL))
+ return;
+
+ target_submit(cmd);
return;
busy:
@@ -1681,7 +1685,7 @@ push:
static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct srpt_rdma_ch *ch = cq->cq_context;
+ struct srpt_rdma_ch *ch = wc->qp->qp_context;
struct srpt_recv_ioctx *ioctx =
container_of(wc->wr_cqe, struct srpt_recv_ioctx, ioctx.cqe);
@@ -1742,7 +1746,7 @@ static void srpt_process_wait_list(struct srpt_rdma_ch *ch)
*/
static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct srpt_rdma_ch *ch = cq->cq_context;
+ struct srpt_rdma_ch *ch = wc->qp->qp_context;
struct srpt_send_ioctx *ioctx =
container_of(wc->wr_cqe, struct srpt_send_ioctx, ioctx.cqe);
enum srpt_command_state state;
@@ -1789,7 +1793,7 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
goto out;
retry:
- ch->cq = ib_alloc_cq_any(sdev->device, ch, ch->rq_size + sq_size,
+ ch->cq = ib_cq_pool_get(sdev->device, ch->rq_size + sq_size, -1,
IB_POLL_WORKQUEUE);
if (IS_ERR(ch->cq)) {
ret = PTR_ERR(ch->cq);
@@ -1797,6 +1801,7 @@ retry:
ch->rq_size + sq_size, ret);
goto out;
}
+ ch->cq_size = ch->rq_size + sq_size;
qp_init->qp_context = (void *)ch;
qp_init->event_handler
@@ -1814,18 +1819,13 @@ retry:
*/
qp_init->cap.max_send_wr = min(sq_size / 2, attrs->max_qp_wr);
qp_init->cap.max_rdma_ctxs = sq_size / 2;
- qp_init->cap.max_send_sge = min(attrs->max_send_sge,
- SRPT_MAX_SG_PER_WQE);
- qp_init->cap.max_recv_sge = min(attrs->max_recv_sge,
- SRPT_MAX_SG_PER_WQE);
+ qp_init->cap.max_send_sge = attrs->max_send_sge;
+ qp_init->cap.max_recv_sge = 1;
qp_init->port_num = ch->sport->port;
- if (sdev->use_srq) {
+ if (sdev->use_srq)
qp_init->srq = sdev->srq;
- } else {
+ else
qp_init->cap.max_recv_wr = ch->rq_size;
- qp_init->cap.max_recv_sge = min(attrs->max_recv_sge,
- SRPT_MAX_SG_PER_WQE);
- }
if (ch->using_rdma_cm) {
ret = rdma_create_qp(ch->rdma_cm.cm_id, sdev->pd, qp_init);
@@ -1846,7 +1846,7 @@ retry:
if (retry) {
pr_debug("failed to create queue pair with sq_size = %d (%d) - retrying\n",
sq_size, ret);
- ib_free_cq(ch->cq);
+ ib_cq_pool_put(ch->cq, ch->cq_size);
sq_size = max(sq_size / 2, MIN_SRPT_SQ_SIZE);
goto retry;
} else {
@@ -1872,14 +1872,14 @@ out:
err_destroy_cq:
ch->qp = NULL;
- ib_free_cq(ch->cq);
+ ib_cq_pool_put(ch->cq, ch->cq_size);
goto out;
}
static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch)
{
ib_destroy_qp(ch->qp);
- ib_free_cq(ch->cq);
+ ib_cq_pool_put(ch->cq, ch->cq_size);
}
/**
@@ -1984,8 +1984,8 @@ static void __srpt_close_all_ch(struct srpt_port *sport)
list_for_each_entry(nexus, &sport->nexus_list, entry) {
list_for_each_entry(ch, &nexus->ch_list, list) {
if (srpt_disconnect_ch(ch) >= 0)
- pr_info("Closing channel %s because target %s_%d has been disabled\n",
- ch->sess_name,
+ pr_info("Closing channel %s-%d because target %s_%d has been disabled\n",
+ ch->sess_name, ch->qp->qp_num,
dev_name(&sport->sdev->device->dev),
sport->port);
srpt_close_ch(ch);
@@ -2086,7 +2086,7 @@ static void srpt_release_channel_work(struct work_struct *w)
se_sess = ch->sess;
BUG_ON(!se_sess);
- target_sess_cmd_list_set_waiting(se_sess);
+ target_stop_session(se_sess);
target_wait_for_sess_cmds(se_sess);
target_remove_session(se_sess);
@@ -2159,9 +2159,6 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev,
WARN_ON_ONCE(irqs_disabled());
- if (WARN_ON(!sdev || !req))
- return -EINVAL;
-
it_iu_len = be32_to_cpu(req->req_it_iu_len);
pr_info("Received SRP_LOGIN_REQ with i_port_id %pI6, t_port_id %pI6 and it_iu_len %d on port %d (guid=%pI6); pkey %#04x\n",
@@ -2221,13 +2218,13 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev,
ch->zw_cqe.done = srpt_zerolength_write_done;
INIT_WORK(&ch->release_work, srpt_release_channel_work);
ch->sport = sport;
- if (ib_cm_id) {
- ch->ib_cm.cm_id = ib_cm_id;
- ib_cm_id->context = ch;
- } else {
+ if (rdma_cm_id) {
ch->using_rdma_cm = true;
ch->rdma_cm.cm_id = rdma_cm_id;
rdma_cm_id->context = ch;
+ } else {
+ ch->ib_cm.cm_id = ib_cm_id;
+ ib_cm_id->context = ch;
}
/*
* ch->rq_size should be at least as large as the initiator queue
@@ -2303,7 +2300,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev,
goto free_recv_ring;
}
- strlcpy(ch->sess_name, src_addr, sizeof(ch->sess_name));
+ strscpy(ch->sess_name, src_addr, sizeof(ch->sess_name));
snprintf(i_port_id, sizeof(i_port_id), "0x%016llx%016llx",
be64_to_cpu(*(__be64 *)nexus->i_port_id),
be64_to_cpu(*(__be64 *)(nexus->i_port_id + 8)));
@@ -2314,31 +2311,35 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev,
tag_num = ch->rq_size;
tag_size = 1; /* ib_srpt does not use se_sess->sess_cmd_map */
- mutex_lock(&sport->port_guid_id.mutex);
- list_for_each_entry(stpg, &sport->port_guid_id.tpg_list, entry) {
- if (!IS_ERR_OR_NULL(ch->sess))
- break;
- ch->sess = target_setup_session(&stpg->tpg, tag_num,
+ if (sport->guid_id) {
+ mutex_lock(&sport->guid_id->mutex);
+ list_for_each_entry(stpg, &sport->guid_id->tpg_list, entry) {
+ if (!IS_ERR_OR_NULL(ch->sess))
+ break;
+ ch->sess = target_setup_session(&stpg->tpg, tag_num,
tag_size, TARGET_PROT_NORMAL,
ch->sess_name, ch, NULL);
+ }
+ mutex_unlock(&sport->guid_id->mutex);
}
- mutex_unlock(&sport->port_guid_id.mutex);
- mutex_lock(&sport->port_gid_id.mutex);
- list_for_each_entry(stpg, &sport->port_gid_id.tpg_list, entry) {
- if (!IS_ERR_OR_NULL(ch->sess))
- break;
- ch->sess = target_setup_session(&stpg->tpg, tag_num,
+ if (sport->gid_id) {
+ mutex_lock(&sport->gid_id->mutex);
+ list_for_each_entry(stpg, &sport->gid_id->tpg_list, entry) {
+ if (!IS_ERR_OR_NULL(ch->sess))
+ break;
+ ch->sess = target_setup_session(&stpg->tpg, tag_num,
tag_size, TARGET_PROT_NORMAL, i_port_id,
ch, NULL);
- if (!IS_ERR_OR_NULL(ch->sess))
- break;
- /* Retry without leading "0x" */
- ch->sess = target_setup_session(&stpg->tpg, tag_num,
+ if (!IS_ERR_OR_NULL(ch->sess))
+ break;
+ /* Retry without leading "0x" */
+ ch->sess = target_setup_session(&stpg->tpg, tag_num,
tag_size, TARGET_PROT_NORMAL,
i_port_id + 2, ch, NULL);
+ }
+ mutex_unlock(&sport->gid_id->mutex);
}
- mutex_unlock(&sport->port_gid_id.mutex);
if (IS_ERR_OR_NULL(ch->sess)) {
WARN_ON_ONCE(ch->sess == NULL);
@@ -2382,6 +2383,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev,
pr_info("rejected SRP_LOGIN_REQ because target %s_%d is not enabled\n",
dev_name(&sdev->device->dev), port_num);
mutex_unlock(&sport->mutex);
+ ret = -EINVAL;
goto reject;
}
@@ -2496,7 +2498,8 @@ reject:
SRP_BUF_FORMAT_INDIRECT);
if (rdma_cm_id)
- rdma_reject(rdma_cm_id, rej, sizeof(*rej));
+ rdma_reject(rdma_cm_id, rej, sizeof(*rej),
+ IB_CM_REJ_CONSUMER_DEFINED);
else
ib_send_cm_rej(ib_cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0,
rej, sizeof(*rej));
@@ -2856,7 +2859,6 @@ static void srpt_queue_response(struct se_cmd *cmd)
&ch->sq_wr_avail) < 0)) {
pr_warn("%s: IB send queue full (needed %d)\n",
__func__, ioctx->n_rdma);
- ret = -ENOMEM;
goto out;
}
@@ -2982,7 +2984,12 @@ static int srpt_release_sport(struct srpt_port *sport)
return 0;
}
-static struct se_wwn *__srpt_lookup_wwn(const char *name)
+struct port_and_port_id {
+ struct srpt_port *sport;
+ struct srpt_port_id **port_id;
+};
+
+static struct port_and_port_id __srpt_lookup_port(const char *name)
{
struct ib_device *dev;
struct srpt_device *sdev;
@@ -2997,25 +3004,38 @@ static struct se_wwn *__srpt_lookup_wwn(const char *name)
for (i = 0; i < dev->phys_port_cnt; i++) {
sport = &sdev->port[i];
- if (strcmp(sport->port_guid_id.name, name) == 0)
- return &sport->port_guid_id.wwn;
- if (strcmp(sport->port_gid_id.name, name) == 0)
- return &sport->port_gid_id.wwn;
+ if (strcmp(sport->guid_name, name) == 0) {
+ kref_get(&sdev->refcnt);
+ return (struct port_and_port_id){
+ sport, &sport->guid_id};
+ }
+ if (strcmp(sport->gid_name, name) == 0) {
+ kref_get(&sdev->refcnt);
+ return (struct port_and_port_id){
+ sport, &sport->gid_id};
+ }
}
}
- return NULL;
+ return (struct port_and_port_id){};
}
-static struct se_wwn *srpt_lookup_wwn(const char *name)
+/**
+ * srpt_lookup_port() - Look up an RDMA port by name
+ * @name: ASCII port name
+ *
+ * Increments the RDMA port reference count if an RDMA port pointer is returned.
+ * The caller must drop that reference count by calling srpt_port_put_ref().
+ */
+static struct port_and_port_id srpt_lookup_port(const char *name)
{
- struct se_wwn *wwn;
+ struct port_and_port_id papi;
spin_lock(&srpt_dev_lock);
- wwn = __srpt_lookup_wwn(name);
+ papi = __srpt_lookup_port(name);
spin_unlock(&srpt_dev_lock);
- return wwn;
+ return papi;
}
static void srpt_free_srq(struct srpt_device *sdev)
@@ -3100,29 +3120,45 @@ static int srpt_use_srq(struct srpt_device *sdev, bool use_srq)
return ret;
}
+static void srpt_free_sdev(struct kref *refcnt)
+{
+ struct srpt_device *sdev = container_of(refcnt, typeof(*sdev), refcnt);
+
+ kfree(sdev);
+}
+
+static void srpt_sdev_put(struct srpt_device *sdev)
+{
+ kref_put(&sdev->refcnt, srpt_free_sdev);
+}
+
/**
* srpt_add_one - InfiniBand device addition callback function
* @device: Describes a HCA.
*/
-static void srpt_add_one(struct ib_device *device)
+static int srpt_add_one(struct ib_device *device)
{
struct srpt_device *sdev;
struct srpt_port *sport;
- int i, ret;
+ int ret;
+ u32 i;
pr_debug("device = %p\n", device);
sdev = kzalloc(struct_size(sdev, port, device->phys_port_cnt),
GFP_KERNEL);
if (!sdev)
- goto err;
+ return -ENOMEM;
+ kref_init(&sdev->refcnt);
sdev->device = device;
mutex_init(&sdev->sdev_mutex);
sdev->pd = ib_alloc_pd(device, 0);
- if (IS_ERR(sdev->pd))
+ if (IS_ERR(sdev->pd)) {
+ ret = PTR_ERR(sdev->pd);
goto free_dev;
+ }
sdev->lkey = sdev->pd->local_dma_lkey;
@@ -3138,6 +3174,7 @@ static void srpt_add_one(struct ib_device *device)
if (IS_ERR(sdev->cm_id)) {
pr_info("ib_create_cm_id() failed: %ld\n",
PTR_ERR(sdev->cm_id));
+ ret = PTR_ERR(sdev->cm_id);
sdev->cm_id = NULL;
if (!rdma_cm_id)
goto err_ring;
@@ -3154,7 +3191,7 @@ static void srpt_add_one(struct ib_device *device)
* if this HCA is gone bad and replaced by different HCA
*/
ret = sdev->cm_id ?
- ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0) :
+ ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid)) :
0;
if (ret < 0) {
pr_err("ib_cm_listen() failed: %d (cm_id state = %d)\n", ret,
@@ -3177,15 +3214,13 @@ static void srpt_add_one(struct ib_device *device)
sport->port_attrib.srp_sq_size = DEF_SRPT_SQ_SIZE;
sport->port_attrib.use_srq = false;
INIT_WORK(&sport->work, srpt_refresh_port_work);
- mutex_init(&sport->port_guid_id.mutex);
- INIT_LIST_HEAD(&sport->port_guid_id.tpg_list);
- mutex_init(&sport->port_gid_id.mutex);
- INIT_LIST_HEAD(&sport->port_gid_id.tpg_list);
- if (srpt_refresh_port(sport)) {
+ ret = srpt_refresh_port(sport);
+ if (ret) {
pr_err("MAD registration failed for %s-%d.\n",
dev_name(&sdev->device->dev), i);
- goto err_event;
+ i--;
+ goto err_port;
}
}
@@ -3193,12 +3228,12 @@ static void srpt_add_one(struct ib_device *device)
list_add_tail(&sdev->list, &srpt_dev_list);
spin_unlock(&srpt_dev_lock);
-out:
ib_set_client_data(device, &srpt_client, sdev);
pr_debug("added %s.\n", dev_name(&device->dev));
- return;
+ return 0;
-err_event:
+err_port:
+ srpt_unregister_mad_agent(sdev, i);
ib_unregister_event_handler(&sdev->event_handler);
err_cm:
if (sdev->cm_id)
@@ -3207,11 +3242,9 @@ err_ring:
srpt_free_srq(sdev);
ib_dealloc_pd(sdev->pd);
free_dev:
- kfree(sdev);
-err:
- sdev = NULL;
+ srpt_sdev_put(sdev);
pr_info("%s(%s) failed.\n", __func__, dev_name(&device->dev));
- goto out;
+ return ret;
}
/**
@@ -3224,13 +3257,7 @@ static void srpt_remove_one(struct ib_device *device, void *client_data)
struct srpt_device *sdev = client_data;
int i;
- if (!sdev) {
- pr_info("%s(%s): nothing to do.\n", __func__,
- dev_name(&device->dev));
- return;
- }
-
- srpt_unregister_mad_agent(sdev);
+ srpt_unregister_mad_agent(sdev, sdev->device->phys_port_cnt);
ib_unregister_event_handler(&sdev->event_handler);
@@ -3259,7 +3286,7 @@ static void srpt_remove_one(struct ib_device *device, void *client_data)
ib_dealloc_pd(sdev->pd);
- kfree(sdev);
+ srpt_sdev_put(sdev);
}
static struct ib_client srpt_client = {
@@ -3287,10 +3314,10 @@ static struct srpt_port_id *srpt_wwn_to_sport_id(struct se_wwn *wwn)
{
struct srpt_port *sport = wwn->priv;
- if (wwn == &sport->port_guid_id.wwn)
- return &sport->port_guid_id;
- if (wwn == &sport->port_gid_id.wwn)
- return &sport->port_gid_id;
+ if (sport->guid_id && &sport->guid_id->wwn == wwn)
+ return sport->guid_id;
+ if (sport->gid_id && &sport->gid_id->wwn == wwn)
+ return sport->gid_id;
WARN_ON_ONCE(true);
return NULL;
}
@@ -3454,7 +3481,7 @@ static ssize_t srpt_tpg_attrib_srp_max_rdma_size_show(struct config_item *item,
struct se_portal_group *se_tpg = attrib_to_tpg(item);
struct srpt_port *sport = srpt_tpg_to_sport(se_tpg);
- return sprintf(page, "%u\n", sport->port_attrib.srp_max_rdma_size);
+ return sysfs_emit(page, "%u\n", sport->port_attrib.srp_max_rdma_size);
}
static ssize_t srpt_tpg_attrib_srp_max_rdma_size_store(struct config_item *item,
@@ -3491,7 +3518,7 @@ static ssize_t srpt_tpg_attrib_srp_max_rsp_size_show(struct config_item *item,
struct se_portal_group *se_tpg = attrib_to_tpg(item);
struct srpt_port *sport = srpt_tpg_to_sport(se_tpg);
- return sprintf(page, "%u\n", sport->port_attrib.srp_max_rsp_size);
+ return sysfs_emit(page, "%u\n", sport->port_attrib.srp_max_rsp_size);
}
static ssize_t srpt_tpg_attrib_srp_max_rsp_size_store(struct config_item *item,
@@ -3528,7 +3555,7 @@ static ssize_t srpt_tpg_attrib_srp_sq_size_show(struct config_item *item,
struct se_portal_group *se_tpg = attrib_to_tpg(item);
struct srpt_port *sport = srpt_tpg_to_sport(se_tpg);
- return sprintf(page, "%u\n", sport->port_attrib.srp_sq_size);
+ return sysfs_emit(page, "%u\n", sport->port_attrib.srp_sq_size);
}
static ssize_t srpt_tpg_attrib_srp_sq_size_store(struct config_item *item,
@@ -3565,7 +3592,7 @@ static ssize_t srpt_tpg_attrib_use_srq_show(struct config_item *item,
struct se_portal_group *se_tpg = attrib_to_tpg(item);
struct srpt_port *sport = srpt_tpg_to_sport(se_tpg);
- return sprintf(page, "%d\n", sport->port_attrib.use_srq);
+ return sysfs_emit(page, "%d\n", sport->port_attrib.use_srq);
}
static ssize_t srpt_tpg_attrib_use_srq_store(struct config_item *item,
@@ -3655,7 +3682,7 @@ out:
static ssize_t srpt_rdma_cm_port_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", rdma_cm_port);
+ return sysfs_emit(page, "%d\n", rdma_cm_port);
}
static ssize_t srpt_rdma_cm_port_store(struct config_item *item,
@@ -3706,47 +3733,17 @@ static struct configfs_attribute *srpt_da_attrs[] = {
NULL,
};
-static ssize_t srpt_tpg_enable_show(struct config_item *item, char *page)
-{
- struct se_portal_group *se_tpg = to_tpg(item);
- struct srpt_port *sport = srpt_tpg_to_sport(se_tpg);
-
- return snprintf(page, PAGE_SIZE, "%d\n", sport->enabled);
-}
-
-static ssize_t srpt_tpg_enable_store(struct config_item *item,
- const char *page, size_t count)
+static int srpt_enable_tpg(struct se_portal_group *se_tpg, bool enable)
{
- struct se_portal_group *se_tpg = to_tpg(item);
struct srpt_port *sport = srpt_tpg_to_sport(se_tpg);
- unsigned long tmp;
- int ret;
-
- ret = kstrtoul(page, 0, &tmp);
- if (ret < 0) {
- pr_err("Unable to extract srpt_tpg_store_enable\n");
- return -EINVAL;
- }
-
- if ((tmp != 0) && (tmp != 1)) {
- pr_err("Illegal value for srpt_tpg_store_enable: %lu\n", tmp);
- return -EINVAL;
- }
mutex_lock(&sport->mutex);
- srpt_set_enabled(sport, tmp);
+ srpt_set_enabled(sport, enable);
mutex_unlock(&sport->mutex);
- return count;
+ return 0;
}
-CONFIGFS_ATTR(srpt_tpg_, enable);
-
-static struct configfs_attribute *srpt_tpg_attrs[] = {
- &srpt_tpg_attr_enable,
- NULL,
-};
-
/**
* srpt_make_tpg - configfs callback invoked for mkdir /sys/kernel/config/target/$driver/$port/$tpg
* @wwn: Corresponds to $driver/$port.
@@ -3805,7 +3802,31 @@ static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf,
struct config_group *group,
const char *name)
{
- return srpt_lookup_wwn(name) ? : ERR_PTR(-EINVAL);
+ struct port_and_port_id papi = srpt_lookup_port(name);
+ struct srpt_port *sport = papi.sport;
+ struct srpt_port_id *port_id;
+
+ if (!papi.port_id)
+ return ERR_PTR(-EINVAL);
+ if (*papi.port_id) {
+ /* Attempt to create a directory that already exists. */
+ WARN_ON_ONCE(true);
+ return &(*papi.port_id)->wwn;
+ }
+ port_id = kzalloc(sizeof(*port_id), GFP_KERNEL);
+ if (!port_id) {
+ srpt_sdev_put(sport->sdev);
+ return ERR_PTR(-ENOMEM);
+ }
+ mutex_init(&port_id->mutex);
+ INIT_LIST_HEAD(&port_id->tpg_list);
+ port_id->wwn.priv = sport;
+ memcpy(port_id->name, port_id == sport->guid_id ? sport->guid_name :
+ sport->gid_name, ARRAY_SIZE(port_id->name));
+
+ *papi.port_id = port_id;
+
+ return &port_id->wwn;
}
/**
@@ -3814,11 +3835,23 @@ static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf,
*/
static void srpt_drop_tport(struct se_wwn *wwn)
{
+ struct srpt_port_id *port_id = container_of(wwn, typeof(*port_id), wwn);
+ struct srpt_port *sport = wwn->priv;
+
+ if (sport->guid_id == port_id)
+ sport->guid_id = NULL;
+ else if (sport->gid_id == port_id)
+ sport->gid_id = NULL;
+ else
+ WARN_ON_ONCE(true);
+
+ srpt_sdev_put(sport->sdev);
+ kfree(port_id);
}
static ssize_t srpt_wwn_version_show(struct config_item *item, char *buf)
{
- return scnprintf(buf, PAGE_SIZE, "\n");
+ return sysfs_emit(buf, "\n");
}
CONFIGFS_ATTR_RO(srpt_wwn_, version);
@@ -3857,12 +3890,12 @@ static const struct target_core_fabric_ops srpt_template = {
.fabric_make_wwn = srpt_make_tport,
.fabric_drop_wwn = srpt_drop_tport,
.fabric_make_tpg = srpt_make_tpg,
+ .fabric_enable_tpg = srpt_enable_tpg,
.fabric_drop_tpg = srpt_drop_tpg,
.fabric_init_nodeacl = srpt_init_nodeacl,
.tfc_discovery_attrs = srpt_da_attrs,
.tfc_wwn_attrs = srpt_wwn_attrs,
- .tfc_tpg_base_attrs = srpt_tpg_attrs,
.tfc_tpg_attrib_attrs = srpt_tpg_attrib_attrs,
};
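The ib_srpt.c changes above add reference counting to struct srpt_device: kref_init() in srpt_add_one(), kref_get() in __srpt_lookup_port() before a port pointer is handed to configfs, and srpt_sdev_put() in srpt_drop_tport() and srpt_remove_one(), so the per-HCA state cannot be freed while a configfs WWN still refers to one of its ports. A minimal sketch of the same kref pattern, for reference only and not part of the patch:

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct example_dev {
	struct kref refcnt;
	/* ... per-HCA state ... */
};

static void example_dev_release(struct kref *refcnt)
{
	struct example_dev *edev = container_of(refcnt, typeof(*edev), refcnt);

	kfree(edev);
}

static struct example_dev *example_dev_alloc(void)
{
	struct example_dev *edev = kzalloc(sizeof(*edev), GFP_KERNEL);

	if (edev)
		kref_init(&edev->refcnt);	/* reference count starts at 1 */
	return edev;
}

static void example_dev_get(struct example_dev *edev)
{
	kref_get(&edev->refcnt);	/* e.g. before returning edev from a lookup */
}

static void example_dev_put(struct example_dev *edev)
{
	kref_put(&edev->refcnt, example_dev_release);	/* frees on the last put */
}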
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h
index 2e1a69840857..4c46b301eea1 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.h
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -105,11 +105,6 @@ enum {
SRP_CMD_ACA = 0x4,
SRPT_DEF_SG_TABLESIZE = 128,
- /*
- * An experimentally determined value that avoids that QP creation
- * fails due to "swiotlb buffer is full" on systems using the swiotlb.
- */
- SRPT_MAX_SG_PER_WQE = 16,
MIN_SRPT_SQ_SIZE = 16,
DEF_SRPT_SQ_SIZE = 4096,
@@ -261,6 +256,7 @@ enum rdma_ch_state {
* @rdma_cm: See below.
* @rdma_cm.cm_id: RDMA CM ID associated with the channel.
* @cq: IB completion queue for this channel.
+ * @cq_size: Number of CQEs in @cq.
* @zw_cqe: Zero-length write CQE.
* @rcu: RCU head.
* @kref: kref for this channel.
@@ -305,6 +301,7 @@ struct srpt_rdma_ch {
} rdma_cm;
};
struct ib_cq *cq;
+ u32 cq_size;
struct ib_cqe zw_cqe;
struct rcu_head rcu;
struct kref kref;
@@ -350,7 +347,7 @@ struct srpt_nexus {
};
/**
- * struct srpt_port_attib - attributes for SRPT port
+ * struct srpt_port_attrib - attributes for SRPT port
* @srp_max_rdma_size: Maximum size of SRP RDMA transfers for new connections.
* @srp_max_rsp_size: Maximum size of SRP response messages in bytes.
* @srp_sq_size: Shared receive queue (SRQ) size.
@@ -379,7 +376,7 @@ struct srpt_tpg {
};
/**
- * struct srpt_port_id - information about an RDMA port name
+ * struct srpt_port_id - LIO RDMA port information
* @mutex: Protects @tpg_list changes.
* @tpg_list: TPGs associated with the RDMA port name.
* @wwn: WWN associated with the RDMA port name.
@@ -396,7 +393,7 @@ struct srpt_port_id {
};
/**
- * struct srpt_port - information associated by SRPT with a single IB port
+ * struct srpt_port - SRPT RDMA port information
* @sdev: backpointer to the HCA information.
* @mad_agent: per-port management datagram processing information.
* @enabled: Whether or not this target port is enabled.
@@ -405,8 +402,10 @@ struct srpt_port_id {
* @lid: cached value of the port's lid.
* @gid: cached value of the port's gid.
* @work: work structure for refreshing the aforementioned cached values.
- * @port_guid_id: target port GUID
- * @port_gid_id: target port GID
+ * @guid_name: port name in GUID format.
+ * @guid_id: LIO target port information for the port name in GUID format.
+ * @gid_name: port name in GID format.
+ * @gid_id: LIO target port information for the port name in GID format.
* @port_attrib: Port attributes that can be accessed through configfs.
* @refcount: Number of objects associated with this port.
* @freed_channels: Completion that will be signaled once @refcount becomes 0.
@@ -422,8 +421,10 @@ struct srpt_port {
u32 lid;
union ib_gid gid;
struct work_struct work;
- struct srpt_port_id port_guid_id;
- struct srpt_port_id port_gid_id;
+ char guid_name[64];
+ struct srpt_port_id *guid_id;
+ char gid_name[64];
+ struct srpt_port_id *gid_id;
struct srpt_port_attrib port_attrib;
atomic_t refcount;
struct completion *freed_channels;
@@ -433,6 +434,7 @@ struct srpt_port {
/**
* struct srpt_device - information associated by SRPT with a single HCA
+ * @refcnt: Reference count for this device.
* @device: Backpointer to the struct ib_device managed by the IB core.
* @pd: IB protection domain.
* @lkey: L_Key (local key) with write access to all local memory.
@@ -448,6 +450,7 @@ struct srpt_port {
* @port: Information about the ports owned by this HCA.
*/
struct srpt_device {
+ struct kref refcnt;
struct ib_device *device;
struct ib_pd *pd;
u32 lkey;