aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/ulp/ipoib
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-04-22 11:50:05 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2015-04-22 11:50:05 -0700
commit7c034dfd58bbc056280262887acf5b7a98944d0a (patch)
tree49dc3b77590f9929069db17922c823bb679c1456 /drivers/infiniband/ulp/ipoib
parentMerge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client (diff)
parentMerge branches 'cve-fixup', 'ipoib', 'iser', 'misc-4.1', 'or-mlx4' and 'srp' into for-4.1 (diff)
downloadlinux-dev-7c034dfd58bbc056280262887acf5b7a98944d0a.tar.xz
linux-dev-7c034dfd58bbc056280262887acf5b7a98944d0a.zip
Merge tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband
Pull InfiniBand/RDMA updates from Roland Dreier: - IPoIB fixes from Doug Ledford and Erez Shitrit - iSER updates from Sagi Grimberg - mlx4 GUID handling changes from Yishai Hadas - other misc fixes * tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband: (51 commits) mlx5: wrong page mask if CONFIG_ARCH_DMA_ADDR_T_64BIT enabled for 32Bit architectures IB/iser: Rewrite bounce buffer code path IB/iser: Bump version to 1.6 IB/iser: Remove code duplication for a single DMA entry IB/iser: Pass struct iser_mem_reg to iser_fast_reg_mr and iser_reg_sig_mr IB/iser: Modify struct iser_mem_reg members IB/iser: Make fastreg pool cache friendly IB/iser: Move PI context alloc/free to routines IB/iser: Move fastreg descriptor pool get/put to helper functions IB/iser: Merge build page-vec into register page-vec IB/iser: Get rid of struct iser_rdma_regd IB/iser: Remove redundant assignments in iser_reg_page_vec IB/iser: Move memory reg/dereg routines to iser_memory.c IB/iser: Don't pass ib_device to fall_to_bounce_buff routine IB/iser: Remove a redundant struct iser_data_buf IB/iser: Remove redundant cmd_data_len calculation IB/iser: Fix wrong calculation of protection buffer length IB/iser: Handle fastreg/local_inv completion errors IB/iser: Fix unload during ep_poll wrong dereference ib_srpt: convert printk's to pr_* functions ...
Diffstat (limited to 'drivers/infiniband/ulp/ipoib')
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib.h31
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_cm.c18
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ib.c195
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c73
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_multicast.c520
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_verbs.c44
6 files changed, 492 insertions, 389 deletions
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index d7562beb5423..bd94b0a6e9e5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -87,7 +87,6 @@ enum {
IPOIB_FLAG_ADMIN_UP = 2,
IPOIB_PKEY_ASSIGNED = 3,
IPOIB_FLAG_SUBINTERFACE = 5,
- IPOIB_MCAST_RUN = 6,
IPOIB_STOP_REAPER = 7,
IPOIB_FLAG_ADMIN_CM = 9,
IPOIB_FLAG_UMCAST = 10,
@@ -98,9 +97,15 @@ enum {
IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */
IPOIB_MCAST_FLAG_SENDONLY = 1,
- IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */
+ /*
+ * For IPOIB_MCAST_FLAG_BUSY
+ * When set, in flight join and mcast->mc is unreliable
+ * When clear and mcast->mc IS_ERR_OR_NULL, need to restart or
+ * haven't started yet
+ * When clear and mcast->mc is valid pointer, join was successful
+ */
+ IPOIB_MCAST_FLAG_BUSY = 2,
IPOIB_MCAST_FLAG_ATTACHED = 3,
- IPOIB_MCAST_JOIN_STARTED = 4,
MAX_SEND_CQE = 16,
IPOIB_CM_COPYBREAK = 256,
@@ -148,6 +153,7 @@ struct ipoib_mcast {
unsigned long created;
unsigned long backoff;
+ unsigned long delay_until;
unsigned long flags;
unsigned char logcount;
@@ -292,6 +298,11 @@ struct ipoib_neigh_table {
struct completion deleted;
};
+struct ipoib_qp_state_validate {
+ struct work_struct work;
+ struct ipoib_dev_priv *priv;
+};
+
/*
* Device private locking: network stack tx_lock protects members used
* in TX fast path, lock protects everything else. lock nests inside
@@ -317,6 +328,7 @@ struct ipoib_dev_priv {
struct list_head multicast_list;
struct rb_root multicast_tree;
+ struct workqueue_struct *wq;
struct delayed_work mcast_task;
struct work_struct carrier_on_task;
struct work_struct flush_light;
@@ -426,11 +438,6 @@ struct ipoib_neigh {
#define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN)
#define IPOIB_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES)
-static inline int ipoib_ud_need_sg(unsigned int ib_mtu)
-{
- return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;
-}
-
void ipoib_neigh_dtor(struct ipoib_neigh *neigh);
static inline void ipoib_neigh_put(struct ipoib_neigh *neigh)
{
@@ -477,10 +484,10 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work);
void ipoib_pkey_event(struct work_struct *work);
void ipoib_ib_dev_cleanup(struct net_device *dev);
-int ipoib_ib_dev_open(struct net_device *dev, int flush);
+int ipoib_ib_dev_open(struct net_device *dev);
int ipoib_ib_dev_up(struct net_device *dev);
-int ipoib_ib_dev_down(struct net_device *dev, int flush);
-int ipoib_ib_dev_stop(struct net_device *dev, int flush);
+int ipoib_ib_dev_down(struct net_device *dev);
+int ipoib_ib_dev_stop(struct net_device *dev);
void ipoib_pkey_dev_check_presence(struct net_device *dev);
int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
@@ -492,7 +499,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);
void ipoib_mcast_restart_task(struct work_struct *work);
int ipoib_mcast_start_thread(struct net_device *dev);
-int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
+int ipoib_mcast_stop_thread(struct net_device *dev);
void ipoib_mcast_dev_down(struct net_device *dev);
void ipoib_mcast_dev_flush(struct net_device *dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 933efcea0d03..56959adb6c7d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -474,7 +474,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
}
spin_lock_irq(&priv->lock);
- queue_delayed_work(ipoib_workqueue,
+ queue_delayed_work(priv->wq,
&priv->cm.stale_task, IPOIB_CM_RX_DELAY);
/* Add this entry to passive ids list head, but do not re-add it
* if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
@@ -576,7 +576,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
spin_lock_irqsave(&priv->lock, flags);
list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
ipoib_cm_start_rx_drain(priv);
- queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+ queue_work(priv->wq, &priv->cm.rx_reap_task);
spin_unlock_irqrestore(&priv->lock, flags);
} else
ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
@@ -603,7 +603,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
spin_lock_irqsave(&priv->lock, flags);
list_move(&p->list, &priv->cm.rx_reap_list);
spin_unlock_irqrestore(&priv->lock, flags);
- queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+ queue_work(priv->wq, &priv->cm.rx_reap_task);
}
return;
}
@@ -827,7 +827,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
list_move(&tx->list, &priv->cm.reap_list);
- queue_work(ipoib_workqueue, &priv->cm.reap_task);
+ queue_work(priv->wq, &priv->cm.reap_task);
}
clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
@@ -1255,7 +1255,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
list_move(&tx->list, &priv->cm.reap_list);
- queue_work(ipoib_workqueue, &priv->cm.reap_task);
+ queue_work(priv->wq, &priv->cm.reap_task);
}
spin_unlock_irqrestore(&priv->lock, flags);
@@ -1284,7 +1284,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path
tx->dev = dev;
list_add(&tx->list, &priv->cm.start_list);
set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
- queue_work(ipoib_workqueue, &priv->cm.start_task);
+ queue_work(priv->wq, &priv->cm.start_task);
return tx;
}
@@ -1295,7 +1295,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
spin_lock_irqsave(&priv->lock, flags);
list_move(&tx->list, &priv->cm.reap_list);
- queue_work(ipoib_workqueue, &priv->cm.reap_task);
+ queue_work(priv->wq, &priv->cm.reap_task);
ipoib_dbg(priv, "Reap connection for gid %pI6\n",
tx->neigh->daddr + 4);
tx->neigh = NULL;
@@ -1417,7 +1417,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
skb_queue_tail(&priv->cm.skb_queue, skb);
if (e)
- queue_work(ipoib_workqueue, &priv->cm.skb_task);
+ queue_work(priv->wq, &priv->cm.skb_task);
}
static void ipoib_cm_rx_reap(struct work_struct *work)
@@ -1450,7 +1450,7 @@ static void ipoib_cm_stale_task(struct work_struct *work)
}
if (!list_empty(&priv->cm.passive_ids))
- queue_delayed_work(ipoib_workqueue,
+ queue_delayed_work(priv->wq,
&priv->cm.stale_task, IPOIB_CM_RX_DELAY);
spin_unlock_irq(&priv->lock);
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 72626c348174..63b92cbb29ad 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -94,39 +94,9 @@ void ipoib_free_ah(struct kref *kref)
static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv,
u64 mapping[IPOIB_UD_RX_SG])
{
- if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
- ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE,
- DMA_FROM_DEVICE);
- ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE,
- DMA_FROM_DEVICE);
- } else
- ib_dma_unmap_single(priv->ca, mapping[0],
- IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
- DMA_FROM_DEVICE);
-}
-
-static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv,
- struct sk_buff *skb,
- unsigned int length)
-{
- if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
- skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
- unsigned int size;
- /*
- * There is only two buffers needed for max_payload = 4K,
- * first buf size is IPOIB_UD_HEAD_SIZE
- */
- skb->tail += IPOIB_UD_HEAD_SIZE;
- skb->len += length;
-
- size = length - IPOIB_UD_HEAD_SIZE;
-
- skb_frag_size_set(frag, size);
- skb->data_len += size;
- skb->truesize += PAGE_SIZE;
- } else
- skb_put(skb, length);
-
+ ib_dma_unmap_single(priv->ca, mapping[0],
+ IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
+ DMA_FROM_DEVICE);
}
static int ipoib_ib_post_receive(struct net_device *dev, int id)
@@ -156,18 +126,11 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct sk_buff *skb;
int buf_size;
- int tailroom;
u64 *mapping;
- if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
- buf_size = IPOIB_UD_HEAD_SIZE;
- tailroom = 128; /* reserve some tailroom for IP/TCP headers */
- } else {
- buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
- tailroom = 0;
- }
+ buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
- skb = dev_alloc_skb(buf_size + tailroom + 4);
+ skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN);
if (unlikely(!skb))
return NULL;
@@ -184,23 +147,8 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
goto error;
- if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
- struct page *page = alloc_page(GFP_ATOMIC);
- if (!page)
- goto partial_error;
- skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
- mapping[1] =
- ib_dma_map_page(priv->ca, page,
- 0, PAGE_SIZE, DMA_FROM_DEVICE);
- if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1])))
- goto partial_error;
- }
-
priv->rx_ring[id].skb = skb;
return skb;
-
-partial_error:
- ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE);
error:
dev_kfree_skb_any(skb);
return NULL;
@@ -278,7 +226,8 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
wc->byte_len, wc->slid);
ipoib_ud_dma_unmap_rx(priv, mapping);
- ipoib_ud_skb_put_frags(priv, skb, wc->byte_len);
+
+ skb_put(skb, wc->byte_len);
/* First byte of dgid signals multicast when 0xff */
dgid = &((struct ib_grh *)skb->data)->dgid;
@@ -296,6 +245,8 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
skb_reset_mac_header(skb);
skb_pull(skb, IPOIB_ENCAP_LEN);
+ skb->truesize = SKB_TRUESIZE(skb->len);
+
++dev->stats.rx_packets;
dev->stats.rx_bytes += skb->len;
@@ -376,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
}
}
+/*
+ * As the result of a completion error the QP Can be transferred to SQE states.
+ * The function checks if the (send)QP is in SQE state and
+ * moves it back to RTS state, that in order to have it functional again.
+ */
+static void ipoib_qp_state_validate_work(struct work_struct *work)
+{
+ struct ipoib_qp_state_validate *qp_work =
+ container_of(work, struct ipoib_qp_state_validate, work);
+
+ struct ipoib_dev_priv *priv = qp_work->priv;
+ struct ib_qp_attr qp_attr;
+ struct ib_qp_init_attr query_init_attr;
+ int ret;
+
+ ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
+ if (ret) {
+ ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
+ __func__, ret);
+ goto free_res;
+ }
+ pr_info("%s: QP: 0x%x is in state: %d\n",
+ __func__, priv->qp->qp_num, qp_attr.qp_state);
+
+ /* currently support only in SQE->RTS transition*/
+ if (qp_attr.qp_state == IB_QPS_SQE) {
+ qp_attr.qp_state = IB_QPS_RTS;
+
+ ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
+ if (ret) {
+ pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
+ ret, priv->qp->qp_num);
+ goto free_res;
+ }
+ pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n",
+ __func__, priv->qp->qp_num);
+ } else {
+ pr_warn("QP (%d) will stay in state: %d\n",
+ priv->qp->qp_num, qp_attr.qp_state);
+ }
+
+free_res:
+ kfree(qp_work);
+}
+
static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -407,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
netif_wake_queue(dev);
if (wc->status != IB_WC_SUCCESS &&
- wc->status != IB_WC_WR_FLUSH_ERR)
+ wc->status != IB_WC_WR_FLUSH_ERR) {
+ struct ipoib_qp_state_validate *qp_work;
ipoib_warn(priv, "failed send event "
"(status=%d, wrid=%d vend_err %x)\n",
wc->status, wr_id, wc->vendor_err);
+ qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
+ if (!qp_work) {
+ ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n",
+ __func__, priv->qp->qp_num);
+ return;
+ }
+
+ INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
+ qp_work->priv = priv;
+ queue_work(priv->wq, &qp_work->work);
+ }
}
static int poll_tx(struct ipoib_dev_priv *priv)
@@ -655,16 +663,33 @@ void ipoib_reap_ah(struct work_struct *work)
__ipoib_reap_ah(dev);
if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
- queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
+ queue_delayed_work(priv->wq, &priv->ah_reap_task,
round_jiffies_relative(HZ));
}
+static void ipoib_flush_ah(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ cancel_delayed_work(&priv->ah_reap_task);
+ flush_workqueue(priv->wq);
+ ipoib_reap_ah(&priv->ah_reap_task.work);
+}
+
+static void ipoib_stop_ah(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ set_bit(IPOIB_STOP_REAPER, &priv->flags);
+ ipoib_flush_ah(dev);
+}
+
static void ipoib_ib_tx_timer_func(unsigned long ctx)
{
drain_tx_cq((struct net_device *)ctx);
}
-int ipoib_ib_dev_open(struct net_device *dev, int flush)
+int ipoib_ib_dev_open(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
int ret;
@@ -696,7 +721,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush)
}
clear_bit(IPOIB_STOP_REAPER, &priv->flags);
- queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
+ queue_delayed_work(priv->wq, &priv->ah_reap_task,
round_jiffies_relative(HZ));
if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
@@ -706,7 +731,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush)
dev_stop:
if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
napi_enable(&priv->napi);
- ipoib_ib_dev_stop(dev, flush);
+ ipoib_ib_dev_stop(dev);
return -1;
}
@@ -738,7 +763,7 @@ int ipoib_ib_dev_up(struct net_device *dev)
return ipoib_mcast_start_thread(dev);
}
-int ipoib_ib_dev_down(struct net_device *dev, int flush)
+int ipoib_ib_dev_down(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -747,7 +772,7 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush)
clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
netif_carrier_off(dev);
- ipoib_mcast_stop_thread(dev, flush);
+ ipoib_mcast_stop_thread(dev);
ipoib_mcast_dev_flush(dev);
ipoib_flush_paths(dev);
@@ -807,7 +832,7 @@ void ipoib_drain_cq(struct net_device *dev)
local_bh_enable();
}
-int ipoib_ib_dev_stop(struct net_device *dev, int flush)
+int ipoib_ib_dev_stop(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ib_qp_attr qp_attr;
@@ -877,24 +902,7 @@ timeout:
if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
ipoib_warn(priv, "Failed to modify QP to RESET state\n");
- /* Wait for all AHs to be reaped */
- set_bit(IPOIB_STOP_REAPER, &priv->flags);
- cancel_delayed_work(&priv->ah_reap_task);
- if (flush)
- flush_workqueue(ipoib_workqueue);
-
- begin = jiffies;
-
- while (!list_empty(&priv->dead_ahs)) {
- __ipoib_reap_ah(dev);
-
- if (time_after(jiffies, begin + HZ)) {
- ipoib_warn(priv, "timing out; will leak address handles\n");
- break;
- }
-
- msleep(1);
- }
+ ipoib_flush_ah(dev);
ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);
@@ -918,7 +926,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
(unsigned long) dev);
if (dev->flags & IFF_UP) {
- if (ipoib_ib_dev_open(dev, 1)) {
+ if (ipoib_ib_dev_open(dev)) {
ipoib_transport_dev_cleanup(dev);
return -ENODEV;
}
@@ -1037,15 +1045,16 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
if (level == IPOIB_FLUSH_LIGHT) {
ipoib_mark_paths_invalid(dev);
ipoib_mcast_dev_flush(dev);
+ ipoib_flush_ah(dev);
}
if (level >= IPOIB_FLUSH_NORMAL)
- ipoib_ib_dev_down(dev, 0);
+ ipoib_ib_dev_down(dev);
if (level == IPOIB_FLUSH_HEAVY) {
if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
- ipoib_ib_dev_stop(dev, 0);
- if (ipoib_ib_dev_open(dev, 0) != 0)
+ ipoib_ib_dev_stop(dev);
+ if (ipoib_ib_dev_open(dev) != 0)
return;
if (netif_queue_stopped(dev))
netif_start_queue(dev);
@@ -1097,9 +1106,17 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)
*/
ipoib_flush_paths(dev);
- ipoib_mcast_stop_thread(dev, 1);
+ ipoib_mcast_stop_thread(dev);
ipoib_mcast_dev_flush(dev);
+ /*
+ * All of our ah references aren't free until after
+ * ipoib_mcast_dev_flush(), ipoib_flush_paths, and
+ * the neighbor garbage collection is stopped and reaped.
+ * That should all be done now, so make a final ah flush.
+ */
+ ipoib_stop_ah(dev);
+
ipoib_transport_dev_cleanup(dev);
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 915ad04a827e..9e1b203d756d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -108,7 +108,7 @@ int ipoib_open(struct net_device *dev)
set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
- if (ipoib_ib_dev_open(dev, 1)) {
+ if (ipoib_ib_dev_open(dev)) {
if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
return 0;
goto err_disable;
@@ -139,7 +139,7 @@ int ipoib_open(struct net_device *dev)
return 0;
err_stop:
- ipoib_ib_dev_stop(dev, 1);
+ ipoib_ib_dev_stop(dev);
err_disable:
clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
@@ -157,8 +157,8 @@ static int ipoib_stop(struct net_device *dev)
netif_stop_queue(dev);
- ipoib_ib_dev_down(dev, 1);
- ipoib_ib_dev_stop(dev, 0);
+ ipoib_ib_dev_down(dev);
+ ipoib_ib_dev_stop(dev);
if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
struct ipoib_dev_priv *cpriv;
@@ -640,8 +640,10 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
if (!path->query && path_rec_start(dev, path))
goto err_path;
-
- __skb_queue_tail(&neigh->queue, skb);
+ if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
+ __skb_queue_tail(&neigh->queue, skb);
+ else
+ goto err_drop;
}
spin_unlock_irqrestore(&priv->lock, flags);
@@ -676,7 +678,12 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
new_path = 1;
}
if (path) {
- __skb_queue_tail(&path->queue, skb);
+ if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ __skb_queue_tail(&path->queue, skb);
+ } else {
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ }
if (!path->query && path_rec_start(dev, path)) {
spin_unlock_irqrestore(&priv->lock, flags);
@@ -839,7 +846,7 @@ static void ipoib_set_mcast_list(struct net_device *dev)
return;
}
- queue_work(ipoib_workqueue, &priv->restart_task);
+ queue_work(priv->wq, &priv->restart_task);
}
static int ipoib_get_iflink(const struct net_device *dev)
@@ -966,7 +973,7 @@ static void ipoib_reap_neigh(struct work_struct *work)
__ipoib_reap_neigh(priv);
if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
- queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
+ queue_delayed_work(priv->wq, &priv->neigh_reap_task,
arp_tbl.gc_interval);
}
@@ -1145,7 +1152,7 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
/* start garbage collection */
clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
- queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
+ queue_delayed_work(priv->wq, &priv->neigh_reap_task,
arp_tbl.gc_interval);
return 0;
@@ -1274,15 +1281,13 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
- if (ipoib_neigh_hash_init(priv) < 0)
- goto out;
/* Allocate RX/TX "rings" to hold queued skbs */
priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
GFP_KERNEL);
if (!priv->rx_ring) {
printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
ca->name, ipoib_recvq_size);
- goto out_neigh_hash_cleanup;
+ goto out;
}
priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
@@ -1297,16 +1302,24 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
if (ipoib_ib_dev_init(dev, ca, port))
goto out_tx_ring_cleanup;
+ /*
+ * Must be after ipoib_ib_dev_init so we can allocate a per
+ * device wq there and use it here
+ */
+ if (ipoib_neigh_hash_init(priv) < 0)
+ goto out_dev_uninit;
+
return 0;
+out_dev_uninit:
+ ipoib_ib_dev_cleanup(dev);
+
out_tx_ring_cleanup:
vfree(priv->tx_ring);
out_rx_ring_cleanup:
kfree(priv->rx_ring);
-out_neigh_hash_cleanup:
- ipoib_neigh_hash_uninit(dev);
out:
return -ENOMEM;
}
@@ -1329,6 +1342,12 @@ void ipoib_dev_cleanup(struct net_device *dev)
}
unregister_netdevice_many(&head);
+ /*
+ * Must be before ipoib_ib_dev_cleanup or we delete an in use
+ * work queue
+ */
+ ipoib_neigh_hash_uninit(dev);
+
ipoib_ib_dev_cleanup(dev);
kfree(priv->rx_ring);
@@ -1336,8 +1355,6 @@ void ipoib_dev_cleanup(struct net_device *dev)
priv->rx_ring = NULL;
priv->tx_ring = NULL;
-
- ipoib_neigh_hash_uninit(dev);
}
static const struct header_ops ipoib_header_ops = {
@@ -1646,10 +1663,11 @@ sysfs_failed:
register_failed:
ib_unregister_event_handler(&priv->event_handler);
+ flush_workqueue(ipoib_workqueue);
/* Stop GC if started before flush */
set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
cancel_delayed_work(&priv->neigh_reap_task);
- flush_workqueue(ipoib_workqueue);
+ flush_workqueue(priv->wq);
event_failed:
ipoib_dev_cleanup(priv->dev);
@@ -1712,6 +1730,7 @@ static void ipoib_remove_one(struct ib_device *device)
list_for_each_entry_safe(priv, tmp, dev_list, list) {
ib_unregister_event_handler(&priv->event_handler);
+ flush_workqueue(ipoib_workqueue);
rtnl_lock();
dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
@@ -1720,7 +1739,7 @@ static void ipoib_remove_one(struct ib_device *device)
/* Stop GC */
set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
cancel_delayed_work(&priv->neigh_reap_task);
- flush_workqueue(ipoib_workqueue);
+ flush_workqueue(priv->wq);
unregister_netdev(priv->dev);
free_netdev(priv->dev);
@@ -1755,14 +1774,16 @@ static int __init ipoib_init_module(void)
return ret;
/*
- * We create our own workqueue mainly because we want to be
- * able to flush it when devices are being removed. We can't
- * use schedule_work()/flush_scheduled_work() because both
- * unregister_netdev() and linkwatch_event take the rtnl lock,
- * so flush_scheduled_work() can deadlock during device
- * removal.
+ * We create a global workqueue here that is used for all flush
+ * operations. However, if you attempt to flush a workqueue
+ * from a task on that same workqueue, it deadlocks the system.
+ * We want to be able to flush the tasks associated with a
+ * specific net device, so we also create a workqueue for each
+ * netdevice. We queue up the tasks for that device only on
+ * its private workqueue, and we only queue up flush events
+ * on our global flush workqueue. This avoids the deadlocks.
*/
- ipoib_workqueue = create_singlethread_workqueue("ipoib");
+ ipoib_workqueue = create_singlethread_workqueue("ipoib_flush");
if (!ipoib_workqueue) {
ret = -ENOMEM;
goto err_fs;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index ffb83b5f7e80..0d23e0568deb 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -55,8 +55,6 @@ MODULE_PARM_DESC(mcast_debug_level,
"Enable multicast debug tracing if > 0");
#endif
-static DEFINE_MUTEX(mcast_mutex);
-
struct ipoib_mcast_iter {
struct net_device *dev;
union ib_gid mgid;
@@ -66,6 +64,48 @@ struct ipoib_mcast_iter {
unsigned int send_only;
};
+/*
+ * This should be called with the priv->lock held
+ */
+static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv,
+ struct ipoib_mcast *mcast,
+ bool delay)
+{
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
+ return;
+
+ /*
+ * We will be scheduling *something*, so cancel whatever is
+ * currently scheduled first
+ */
+ cancel_delayed_work(&priv->mcast_task);
+ if (mcast && delay) {
+ /*
+ * We had a failure and want to schedule a retry later
+ */
+ mcast->backoff *= 2;
+ if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+ mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+ mcast->delay_until = jiffies + (mcast->backoff * HZ);
+ /*
+ * Mark this mcast for its delay, but restart the
+ * task immediately. The join task will make sure to
+ * clear out all entries without delays, and then
+ * schedule itself to run again when the earliest
+ * delay expires
+ */
+ queue_delayed_work(priv->wq, &priv->mcast_task, 0);
+ } else if (delay) {
+ /*
+ * Special case of retrying after a failure to
+ * allocate the broadcast multicast group, wait
+ * 1 second and try again
+ */
+ queue_delayed_work(priv->wq, &priv->mcast_task, HZ);
+ } else
+ queue_delayed_work(priv->wq, &priv->mcast_task, 0);
+}
+
static void ipoib_mcast_free(struct ipoib_mcast *mcast)
{
struct net_device *dev = mcast->dev;
@@ -103,6 +143,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
mcast->dev = dev;
mcast->created = jiffies;
+ mcast->delay_until = jiffies;
mcast->backoff = 1;
INIT_LIST_HEAD(&mcast->list);
@@ -185,17 +226,27 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
spin_unlock_irq(&priv->lock);
return -EAGAIN;
}
- priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
+ /*update priv member according to the new mcast*/
+ priv->broadcast->mcmember.qkey = mcmember->qkey;
+ priv->broadcast->mcmember.mtu = mcmember->mtu;
+ priv->broadcast->mcmember.traffic_class = mcmember->traffic_class;
+ priv->broadcast->mcmember.rate = mcmember->rate;
+ priv->broadcast->mcmember.sl = mcmember->sl;
+ priv->broadcast->mcmember.flow_label = mcmember->flow_label;
+ priv->broadcast->mcmember.hop_limit = mcmember->hop_limit;
+ /* assume if the admin and the mcast are the same both can be changed */
+ if (priv->mcast_mtu == priv->admin_mtu)
+ priv->admin_mtu =
+ priv->mcast_mtu =
+ IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
+ else
+ priv->mcast_mtu =
+ IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
+
priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
spin_unlock_irq(&priv->lock);
priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
set_qkey = 1;
-
- if (!ipoib_cm_admin_enabled(dev)) {
- rtnl_lock();
- dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu));
- rtnl_unlock();
- }
}
if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
@@ -270,107 +321,35 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
return 0;
}
-static int
-ipoib_mcast_sendonly_join_complete(int status,
- struct ib_sa_multicast *multicast)
-{
- struct ipoib_mcast *mcast = multicast->context;
- struct net_device *dev = mcast->dev;
-
- /* We trap for port events ourselves. */
- if (status == -ENETRESET)
- return 0;
-
- if (!status)
- status = ipoib_mcast_join_finish(mcast, &multicast->rec);
-
- if (status) {
- if (mcast->logcount++ < 20)
- ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n",
- mcast->mcmember.mgid.raw, status);
-
- /* Flush out any queued packets */
- netif_tx_lock_bh(dev);
- while (!skb_queue_empty(&mcast->pkt_queue)) {
- ++dev->stats.tx_dropped;
- dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
- }
- netif_tx_unlock_bh(dev);
-
- /* Clear the busy flag so we try again */
- status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY,
- &mcast->flags);
- }
- return status;
-}
-
-static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
-{
- struct net_device *dev = mcast->dev;
- struct ipoib_dev_priv *priv = netdev_priv(dev);
- struct ib_sa_mcmember_rec rec = {
-#if 0 /* Some SMs don't support send-only yet */
- .join_state = 4
-#else
- .join_state = 1
-#endif
- };
- int ret = 0;
-
- if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
- ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n");
- return -ENODEV;
- }
-
- if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
- ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
- return -EBUSY;
- }
-
- rec.mgid = mcast->mcmember.mgid;
- rec.port_gid = priv->local_gid;
- rec.pkey = cpu_to_be16(priv->pkey);
-
- mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
- priv->port, &rec,
- IB_SA_MCMEMBER_REC_MGID |
- IB_SA_MCMEMBER_REC_PORT_GID |
- IB_SA_MCMEMBER_REC_PKEY |
- IB_SA_MCMEMBER_REC_JOIN_STATE,
- GFP_ATOMIC,
- ipoib_mcast_sendonly_join_complete,
- mcast);
- if (IS_ERR(mcast->mc)) {
- ret = PTR_ERR(mcast->mc);
- clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
- ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n",
- ret);
- } else {
- ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n",
- mcast->mcmember.mgid.raw);
- }
-
- return ret;
-}
-
void ipoib_mcast_carrier_on_task(struct work_struct *work)
{
struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
carrier_on_task);
struct ib_port_attr attr;
- /*
- * Take rtnl_lock to avoid racing with ipoib_stop() and
- * turning the carrier back on while a device is being
- * removed.
- */
if (ib_query_port(priv->ca, priv->port, &attr) ||
attr.state != IB_PORT_ACTIVE) {
ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
return;
}
- rtnl_lock();
+ /*
+ * Take rtnl_lock to avoid racing with ipoib_stop() and
+ * turning the carrier back on while a device is being
+ * removed. However, ipoib_stop() will attempt to flush
+ * the workqueue while holding the rtnl lock, so loop
+ * on trylock until either we get the lock or we see
+ * FLAG_OPER_UP go away as that signals that we are bailing
+ * and can safely ignore the carrier on work.
+ */
+ while (!rtnl_trylock()) {
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
+ return;
+ else
+ msleep(20);
+ }
+ if (!ipoib_cm_admin_enabled(priv->dev))
+ dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu));
netif_carrier_on(priv->dev);
rtnl_unlock();
}
@@ -382,7 +361,9 @@ static int ipoib_mcast_join_complete(int status,
struct net_device *dev = mcast->dev;
struct ipoib_dev_priv *priv = netdev_priv(dev);
- ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n",
+ ipoib_dbg_mcast(priv, "%sjoin completion for %pI6 (status %d)\n",
+ test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ?
+ "sendonly " : "",
mcast->mcmember.mgid.raw, status);
/* We trap for port events ourselves. */
@@ -396,49 +377,74 @@ static int ipoib_mcast_join_complete(int status,
if (!status) {
mcast->backoff = 1;
- mutex_lock(&mcast_mutex);
- if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
- queue_delayed_work(ipoib_workqueue,
- &priv->mcast_task, 0);
- mutex_unlock(&mcast_mutex);
+ mcast->delay_until = jiffies;
/*
- * Defer carrier on work to ipoib_workqueue to avoid a
- * deadlock on rtnl_lock here.
+ * Defer carrier on work to priv->wq to avoid a
+ * deadlock on rtnl_lock here. Requeue our multicast
+ * work too, which will end up happening right after
+ * our carrier on task work and will allow us to
+ * send out all of the non-broadcast joins
*/
- if (mcast == priv->broadcast)
- queue_work(ipoib_workqueue, &priv->carrier_on_task);
-
- status = 0;
- goto out;
- }
+ if (mcast == priv->broadcast) {
+ spin_lock_irq(&priv->lock);
+ queue_work(priv->wq, &priv->carrier_on_task);
+ __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
+ goto out_locked;
+ }
+ } else {
+ if (mcast->logcount++ < 20) {
+ if (status == -ETIMEDOUT || status == -EAGAIN) {
+ ipoib_dbg_mcast(priv, "%smulticast join failed for %pI6, status %d\n",
+ test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "",
+ mcast->mcmember.mgid.raw, status);
+ } else {
+ ipoib_warn(priv, "%smulticast join failed for %pI6, status %d\n",
+ test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "",
+ mcast->mcmember.mgid.raw, status);
+ }
+ }
- if (mcast->logcount++ < 20) {
- if (status == -ETIMEDOUT || status == -EAGAIN) {
- ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n",
- mcast->mcmember.mgid.raw, status);
+ if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
+ mcast->backoff >= 2) {
+ /*
+ * We only retry sendonly joins once before we drop
+ * the packet and quit trying to deal with the
+ * group. However, we leave the group in the
+ * mcast list as an unjoined group. If we want to
+ * try joining again, we simply queue up a packet
+ * and restart the join thread. The empty queue
+ * is why the join thread ignores this group.
+ */
+ mcast->backoff = 1;
+ netif_tx_lock_bh(dev);
+ while (!skb_queue_empty(&mcast->pkt_queue)) {
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
+ }
+ netif_tx_unlock_bh(dev);
} else {
- ipoib_warn(priv, "multicast join failed for %pI6, status %d\n",
- mcast->mcmember.mgid.raw, status);
+ spin_lock_irq(&priv->lock);
+ /* Requeue this join task with a backoff delay */
+ __ipoib_mcast_schedule_join_thread(priv, mcast, 1);
+ goto out_locked;
}
}
-
- mcast->backoff *= 2;
- if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
- mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
-
- /* Clear the busy flag so we try again */
- status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-
- mutex_lock(&mcast_mutex);
+out:
spin_lock_irq(&priv->lock);
- if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
- queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
- mcast->backoff * HZ);
+out_locked:
+ /*
+ * Make sure to set mcast->mc before we clear the busy flag to avoid
+ * racing with code that checks for BUSY before checking mcast->mc
+ */
+ if (status)
+ mcast->mc = NULL;
+ else
+ mcast->mc = multicast;
+ clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
spin_unlock_irq(&priv->lock);
- mutex_unlock(&mcast_mutex);
-out:
complete(&mcast->done);
+
return status;
}
@@ -446,6 +452,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
int create)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_sa_multicast *multicast;
struct ib_sa_mcmember_rec rec = {
.join_state = 1
};
@@ -487,29 +494,18 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
rec.hop_limit = priv->broadcast->mcmember.hop_limit;
}
- set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
- init_completion(&mcast->done);
- set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags);
-
- mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
+ multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
&rec, comp_mask, GFP_KERNEL,
ipoib_mcast_join_complete, mcast);
- if (IS_ERR(mcast->mc)) {
+ if (IS_ERR(multicast)) {
+ ret = PTR_ERR(multicast);
+ ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
+ spin_lock_irq(&priv->lock);
+ /* Requeue this join task with a backoff delay */
+ __ipoib_mcast_schedule_join_thread(priv, mcast, 1);
clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+ spin_unlock_irq(&priv->lock);
complete(&mcast->done);
- ret = PTR_ERR(mcast->mc);
- ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
-
- mcast->backoff *= 2;
- if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
- mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
-
- mutex_lock(&mcast_mutex);
- if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
- queue_delayed_work(ipoib_workqueue,
- &priv->mcast_task,
- mcast->backoff * HZ);
- mutex_unlock(&mcast_mutex);
}
}
@@ -519,8 +515,11 @@ void ipoib_mcast_join_task(struct work_struct *work)
container_of(work, struct ipoib_dev_priv, mcast_task.work);
struct net_device *dev = priv->dev;
struct ib_port_attr port_attr;
+ unsigned long delay_until = 0;
+ struct ipoib_mcast *mcast = NULL;
+ int create = 1;
- if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
return;
if (ib_query_port(priv->ca, priv->port, &port_attr) ||
@@ -536,93 +535,118 @@ void ipoib_mcast_join_task(struct work_struct *work)
else
memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+ spin_lock_irq(&priv->lock);
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
+ goto out;
+
if (!priv->broadcast) {
struct ipoib_mcast *broadcast;
- if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
- return;
-
- broadcast = ipoib_mcast_alloc(dev, 1);
+ broadcast = ipoib_mcast_alloc(dev, 0);
if (!broadcast) {
ipoib_warn(priv, "failed to allocate broadcast group\n");
- mutex_lock(&mcast_mutex);
- if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
- queue_delayed_work(ipoib_workqueue,
- &priv->mcast_task, HZ);
- mutex_unlock(&mcast_mutex);
- return;
+ /*
+ * Restart us after a 1 second delay to retry
+ * creating our broadcast group and attaching to
+ * it. Until this succeeds, this ipoib dev is
+ * completely stalled (multicast wise).
+ */
+ __ipoib_mcast_schedule_join_thread(priv, NULL, 1);
+ goto out;
}
- spin_lock_irq(&priv->lock);
memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
sizeof (union ib_gid));
priv->broadcast = broadcast;
__ipoib_mcast_add(dev, priv->broadcast);
- spin_unlock_irq(&priv->lock);
}
if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
- if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
- ipoib_mcast_join(dev, priv->broadcast, 0);
- return;
- }
-
- while (1) {
- struct ipoib_mcast *mcast = NULL;
-
- spin_lock_irq(&priv->lock);
- list_for_each_entry(mcast, &priv->multicast_list, list) {
- if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)
- && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)
- && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
- /* Found the next unjoined group */
- break;
+ if (IS_ERR_OR_NULL(priv->broadcast->mc) &&
+ !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) {
+ mcast = priv->broadcast;
+ create = 0;
+ if (mcast->backoff > 1 &&
+ time_before(jiffies, mcast->delay_until)) {
+ delay_until = mcast->delay_until;
+ mcast = NULL;
}
}
- spin_unlock_irq(&priv->lock);
+ goto out;
+ }
- if (&mcast->list == &priv->multicast_list) {
- /* All done */
- break;
+ /*
+ * We'll never get here until the broadcast group is both allocated
+ * and attached
+ */
+ list_for_each_entry(mcast, &priv->multicast_list, list) {
+ if (IS_ERR_OR_NULL(mcast->mc) &&
+ !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) &&
+ (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ||
+ !skb_queue_empty(&mcast->pkt_queue))) {
+ if (mcast->backoff == 1 ||
+ time_after_eq(jiffies, mcast->delay_until)) {
+ /* Found the next unjoined group */
+ init_completion(&mcast->done);
+ set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+ if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+ create = 0;
+ else
+ create = 1;
+ spin_unlock_irq(&priv->lock);
+ ipoib_mcast_join(dev, mcast, create);
+ spin_lock_irq(&priv->lock);
+ } else if (!delay_until ||
+ time_before(mcast->delay_until, delay_until))
+ delay_until = mcast->delay_until;
}
-
- ipoib_mcast_join(dev, mcast, 1);
- return;
}
- ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
+ mcast = NULL;
+ ipoib_dbg_mcast(priv, "successfully started all multicast joins\n");
- clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+out:
+ if (delay_until) {
+ cancel_delayed_work(&priv->mcast_task);
+ queue_delayed_work(priv->wq, &priv->mcast_task,
+ delay_until - jiffies);
+ }
+ if (mcast) {
+ init_completion(&mcast->done);
+ set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+ }
+ spin_unlock_irq(&priv->lock);
+ if (mcast)
+ ipoib_mcast_join(dev, mcast, create);
}
int ipoib_mcast_start_thread(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
+ unsigned long flags;
ipoib_dbg_mcast(priv, "starting multicast thread\n");
- mutex_lock(&mcast_mutex);
- if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
- queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
- mutex_unlock(&mcast_mutex);
+ spin_lock_irqsave(&priv->lock, flags);
+ __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
+ spin_unlock_irqrestore(&priv->lock, flags);
return 0;
}
-int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
+int ipoib_mcast_stop_thread(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
+ unsigned long flags;
ipoib_dbg_mcast(priv, "stopping multicast thread\n");
- mutex_lock(&mcast_mutex);
- clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+ spin_lock_irqsave(&priv->lock, flags);
cancel_delayed_work(&priv->mcast_task);
- mutex_unlock(&mcast_mutex);
+ spin_unlock_irqrestore(&priv->lock, flags);
- if (flush)
- flush_workqueue(ipoib_workqueue);
+ flush_workqueue(priv->wq);
return 0;
}
@@ -633,6 +657,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
int ret = 0;
if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+ ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n");
+
+ if (!IS_ERR_OR_NULL(mcast->mc))
ib_sa_free_multicast(mcast->mc);
if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
@@ -644,7 +671,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
be16_to_cpu(mcast->mcmember.mlid));
if (ret)
ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
- }
+ } else if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+ ipoib_dbg(priv, "leaving with no mcmember but not a "
+ "SENDONLY join\n");
return 0;
}
@@ -667,49 +696,37 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
}
mcast = __ipoib_mcast_find(dev, mgid);
- if (!mcast) {
- /* Let's create a new send only group now */
- ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n",
- mgid);
-
- mcast = ipoib_mcast_alloc(dev, 0);
+ if (!mcast || !mcast->ah) {
if (!mcast) {
- ipoib_warn(priv, "unable to allocate memory for "
- "multicast structure\n");
- ++dev->stats.tx_dropped;
- dev_kfree_skb_any(skb);
- goto out;
- }
-
- set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags);
- memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
- __ipoib_mcast_add(dev, mcast);
- list_add_tail(&mcast->list, &priv->multicast_list);
- }
+ /* Let's create a new send only group now */
+ ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n",
+ mgid);
+
+ mcast = ipoib_mcast_alloc(dev, 0);
+ if (!mcast) {
+ ipoib_warn(priv, "unable to allocate memory "
+ "for multicast structure\n");
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ goto unlock;
+ }
- if (!mcast->ah) {
+ set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags);
+ memcpy(mcast->mcmember.mgid.raw, mgid,
+ sizeof (union ib_gid));
+ __ipoib_mcast_add(dev, mcast);
+ list_add_tail(&mcast->list, &priv->multicast_list);
+ }
if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
skb_queue_tail(&mcast->pkt_queue, skb);
else {
++dev->stats.tx_dropped;
dev_kfree_skb_any(skb);
}
-
- if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
- ipoib_dbg_mcast(priv, "no address vector, "
- "but multicast join already started\n");
- else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
- ipoib_mcast_sendonly_join(mcast);
-
- /*
- * If lookup completes between here and out:, don't
- * want to send packet twice.
- */
- mcast = NULL;
- }
-
-out:
- if (mcast && mcast->ah) {
+ if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
+ __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
+ }
+ } else {
struct ipoib_neigh *neigh;
spin_unlock_irqrestore(&priv->lock, flags);
@@ -759,9 +776,12 @@ void ipoib_mcast_dev_flush(struct net_device *dev)
spin_unlock_irqrestore(&priv->lock, flags);
- /* seperate between the wait to the leave*/
+ /*
+ * make sure the in-flight joins have finished before we attempt
+ * to leave
+ */
list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
- if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags))
+ if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
wait_for_completion(&mcast->done);
list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
@@ -792,9 +812,14 @@ void ipoib_mcast_restart_task(struct work_struct *work)
unsigned long flags;
struct ib_sa_mcmember_rec rec;
- ipoib_dbg_mcast(priv, "restarting multicast task\n");
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
+ /*
+ * shortcut...on shutdown flush is called next, just
+ * let it do all the work
+ */
+ return;
- ipoib_mcast_stop_thread(dev, 0);
+ ipoib_dbg_mcast(priv, "restarting multicast task\n");
local_irq_save(flags);
netif_addr_lock(dev);
@@ -880,14 +905,27 @@ void ipoib_mcast_restart_task(struct work_struct *work)
netif_addr_unlock(dev);
local_irq_restore(flags);
- /* We have to cancel outside of the spinlock */
+ /*
+ * make sure the in-flight joins have finished before we attempt
+ * to leave
+ */
+ list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
+ if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+ wait_for_completion(&mcast->done);
+
list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
ipoib_mcast_leave(mcast->dev, mcast);
ipoib_mcast_free(mcast);
}
- if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
- ipoib_mcast_start_thread(dev);
+ /*
+ * Double check that we are still up
+ */
+ if (test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
+ spin_lock_irqsave(&priv->lock, flags);
+ __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ }
}
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index c56d5d44c53b..e5cc43074196 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -157,6 +157,16 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
goto out_free_pd;
}
+ /*
+ * the various IPoIB tasks assume they will never race against
+ * themselves, so always use a single thread workqueue
+ */
+ priv->wq = create_singlethread_workqueue("ipoib_wq");
+ if (!priv->wq) {
+ printk(KERN_WARNING "ipoib: failed to allocate device WQ\n");
+ goto out_free_mr;
+ }
+
size = ipoib_recvq_size + 1;
ret = ipoib_cm_dev_init(dev);
if (!ret) {
@@ -165,12 +175,13 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */
else
size += ipoib_recvq_size * ipoib_max_conn_qp;
- }
+ } else
+ goto out_free_wq;
priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
if (IS_ERR(priv->recv_cq)) {
printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name);
- goto out_free_mr;
+ goto out_cm_dev_cleanup;
}
priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL,
@@ -216,15 +227,10 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
priv->tx_wr.send_flags = IB_SEND_SIGNALED;
priv->rx_sge[0].lkey = priv->mr->lkey;
- if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
- priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE;
- priv->rx_sge[1].length = PAGE_SIZE;
- priv->rx_sge[1].lkey = priv->mr->lkey;
- priv->rx_wr.num_sge = IPOIB_UD_RX_SG;
- } else {
- priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
- priv->rx_wr.num_sge = 1;
- }
+
+ priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+ priv->rx_wr.num_sge = 1;
+
priv->rx_wr.next = NULL;
priv->rx_wr.sg_list = priv->rx_sge;
@@ -236,12 +242,19 @@ out_free_send_cq:
out_free_recv_cq:
ib_destroy_cq(priv->recv_cq);
+out_cm_dev_cleanup:
+ ipoib_cm_dev_cleanup(dev);
+
+out_free_wq:
+ destroy_workqueue(priv->wq);
+ priv->wq = NULL;
+
out_free_mr:
ib_dereg_mr(priv->mr);
- ipoib_cm_dev_cleanup(dev);
out_free_pd:
ib_dealloc_pd(priv->pd);
+
return -ENODEV;
}
@@ -265,11 +278,18 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
ipoib_cm_dev_cleanup(dev);
+ if (priv->wq) {
+ flush_workqueue(priv->wq);
+ destroy_workqueue(priv->wq);
+ priv->wq = NULL;
+ }
+
if (ib_dereg_mr(priv->mr))
ipoib_warn(priv, "ib_dereg_mr failed\n");
if (ib_dealloc_pd(priv->pd))
ipoib_warn(priv, "ib_dealloc_pd failed\n");
+
}
void ipoib_event(struct ib_event_handler *handler,