aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/core/cache.c
diff options
context:
space:
mode:
authorParav Pandit <parav@mellanox.com>2018-06-05 08:40:15 +0300
committerJason Gunthorpe <jgg@mellanox.com>2018-06-18 11:09:05 -0600
commitb150c3862d21a4a9ce0f26d8067b9dcd41e2050c (patch)
treeeb5a9408a5aed358b372d00423452ee5ff8e75c8 /drivers/infiniband/core/cache.c
parentIB/core: Store default GID property per-table instead of per-entry (diff)
downloadlinux-dev-b150c3862d21a4a9ce0f26d8067b9dcd41e2050c.tar.xz
linux-dev-b150c3862d21a4a9ce0f26d8067b9dcd41e2050c.zip
IB/core: Introduce GID entry reference counts
In order to be able to expose pointers to the ib_gid_attrs in the GID table we need to make it so the value of the pointer cannot be changed. Thus each GID table entry gets a unique piece of kref'd memory that is written only during initialization and remains constant for its lifetime. This eventually will allow the struct ib_gid_attrs to be returned without copy from many of query the APIs, but it also provides a way to track when all users of a HW table index go away. For roce we no longer allow an in-use HW table index to be re-used for a new an different entry. When a GID table entry needs to be removed it is hidden from the find API, but remains as a valid HW index and all ib_gid_attr points remain valid. The HW index is not relased until all users put the kref. Later patches will broadly replace the use of the sgid_index integer with the kref'd structure. Ultimately this will prevent security problems where the OS changes the properties of a HW GID table entry while an active user object is still using the entry. Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Diffstat (limited to 'drivers/infiniband/core/cache.c')
-rw-r--r--drivers/infiniband/core/cache.c323
1 files changed, 220 insertions, 103 deletions
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index d4751f94a93a..09d83c69ec65 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -66,15 +66,24 @@ enum gid_attr_find_mask {
GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3,
};
-enum gid_table_entry_props {
- GID_TABLE_ENTRY_INVALID = 1UL << 0,
+enum gid_table_entry_state {
+ GID_TABLE_ENTRY_INVALID = 1,
+ GID_TABLE_ENTRY_VALID = 2,
+ /*
+ * Indicates that entry is pending to be removed, there may
+ * be active users of this GID entry.
+ * When last user of the GID entry releases reference to it,
+ * GID entry is detached from the table.
+ */
+ GID_TABLE_ENTRY_PENDING_DEL = 3,
};
struct ib_gid_table_entry {
- unsigned long props;
- union ib_gid gid;
- struct ib_gid_attr attr;
- void *context;
+ struct kref kref;
+ struct work_struct del_work;
+ struct ib_gid_attr attr;
+ void *context;
+ enum gid_table_entry_state state;
};
struct ib_gid_table {
@@ -90,15 +99,16 @@ struct ib_gid_table {
*
**/
/* Any writer to data_vec must hold this lock and the write side of
- * rwlock. readers must hold only rwlock. All writers must be in a
+ * rwlock. Readers must hold only rwlock. All writers must be in a
* sleepable context.
*/
struct mutex lock;
- /* rwlock protects data_vec[ix]->props. */
+ /* rwlock protects data_vec[ix]->state and entry pointer.
+ */
rwlock_t rwlock;
+ struct ib_gid_table_entry **data_vec;
/* bit field, each bit indicates the index of default GID */
u32 default_gid_indices;
- struct ib_gid_table_entry *data_vec;
};
static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port)
@@ -178,26 +188,113 @@ static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port)
return device->cache.ports[port - rdma_start_port(device)].gid;
}
-static void del_roce_gid(struct ib_device *device, u8 port_num,
- struct ib_gid_table *table, int ix)
+static bool is_gid_entry_free(const struct ib_gid_table_entry *entry)
+{
+ return !entry;
+}
+
+static bool is_gid_entry_valid(const struct ib_gid_table_entry *entry)
+{
+ return entry && entry->state == GID_TABLE_ENTRY_VALID;
+}
+
+static void schedule_free_gid(struct kref *kref)
+{
+ struct ib_gid_table_entry *entry =
+ container_of(kref, struct ib_gid_table_entry, kref);
+
+ queue_work(ib_wq, &entry->del_work);
+}
+
+static void free_gid_entry(struct ib_gid_table_entry *entry)
{
+ struct ib_device *device = entry->attr.device;
+ u8 port_num = entry->attr.port_num;
+ struct ib_gid_table *table = rdma_gid_table(device, port_num);
+
pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
- device->name, port_num, ix,
- table->data_vec[ix].gid.raw);
+ device->name, port_num, entry->attr.index,
+ entry->attr.gid.raw);
+
+ mutex_lock(&table->lock);
+ if (rdma_cap_roce_gid_table(device, port_num) &&
+ entry->state != GID_TABLE_ENTRY_INVALID)
+ device->del_gid(&entry->attr, &entry->context);
+ write_lock_irq(&table->rwlock);
- if (rdma_cap_roce_gid_table(device, port_num))
- device->del_gid(&table->data_vec[ix].attr,
- &table->data_vec[ix].context);
- dev_put(table->data_vec[ix].attr.ndev);
+ /*
+ * The only way to avoid overwriting NULL in table is
+ * by comparing if it is same entry in table or not!
+ * If new entry in table is added by the time we free here,
+ * don't overwrite the table entry.
+ */
+ if (entry == table->data_vec[entry->attr.index])
+ table->data_vec[entry->attr.index] = NULL;
+ /* Now this index is ready to be allocated */
+ write_unlock_irq(&table->rwlock);
+ mutex_unlock(&table->lock);
+
+ if (entry->attr.ndev)
+ dev_put(entry->attr.ndev);
+ kfree(entry);
+}
+
+/**
+ * free_gid_work - Release reference to the GID entry
+ * @work: Work structure to refer to GID entry which needs to be
+ * deleted.
+ *
+ * free_gid_work() frees the entry from the HCA's hardware table
+ * if provider supports it. It releases reference to netdevice.
+ */
+static void free_gid_work(struct work_struct *work)
+{
+ struct ib_gid_table_entry *entry =
+ container_of(work, struct ib_gid_table_entry, del_work);
+ free_gid_entry(entry);
}
-static int add_roce_gid(struct ib_gid_table *table,
- const union ib_gid *gid,
- const struct ib_gid_attr *attr)
+static struct ib_gid_table_entry *
+alloc_gid_entry(const struct ib_gid_attr *attr)
{
struct ib_gid_table_entry *entry;
- int ix = attr->index;
- int ret = 0;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return NULL;
+ kref_init(&entry->kref);
+ memcpy(&entry->attr, attr, sizeof(*attr));
+ if (entry->attr.ndev)
+ dev_hold(entry->attr.ndev);
+ INIT_WORK(&entry->del_work, free_gid_work);
+ entry->state = GID_TABLE_ENTRY_INVALID;
+ return entry;
+}
+
+static void store_gid_entry(struct ib_gid_table *table,
+ struct ib_gid_table_entry *entry)
+{
+ entry->state = GID_TABLE_ENTRY_VALID;
+
+ pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
+ entry->attr.device->name, entry->attr.port_num,
+ entry->attr.index, entry->attr.gid.raw);
+
+ lockdep_assert_held(&table->lock);
+ write_lock_irq(&table->rwlock);
+ table->data_vec[entry->attr.index] = entry;
+ write_unlock_irq(&table->rwlock);
+}
+
+static void put_gid_entry(struct ib_gid_table_entry *entry)
+{
+ kref_put(&entry->kref, schedule_free_gid);
+}
+
+static int add_roce_gid(struct ib_gid_table_entry *entry)
+{
+ const struct ib_gid_attr *attr = &entry->attr;
+ int ret;
if (!attr->ndev) {
pr_err("%s NULL netdev device=%s port=%d index=%d\n",
@@ -205,38 +302,22 @@ static int add_roce_gid(struct ib_gid_table *table,
attr->index);
return -EINVAL;
}
-
- entry = &table->data_vec[ix];
- if ((entry->props & GID_TABLE_ENTRY_INVALID) == 0) {
- WARN(1, "GID table corruption device=%s port=%d index=%d\n",
- attr->device->name, attr->port_num,
- attr->index);
- return -EINVAL;
- }
-
if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) {
- ret = attr->device->add_gid(gid, attr, &entry->context);
+ ret = attr->device->add_gid(&attr->gid, attr, &entry->context);
if (ret) {
pr_err("%s GID add failed device=%s port=%d index=%d\n",
__func__, attr->device->name, attr->port_num,
attr->index);
- goto add_err;
+ return ret;
}
}
- dev_hold(attr->ndev);
-
-add_err:
- if (!ret)
- pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
- attr->device->name, attr->port_num, ix, gid->raw);
- return ret;
+ return 0;
}
/**
* add_modify_gid - Add or modify GID table entry
*
* @table: GID table in which GID to be added or modified
- * @gid: GID content
* @attr: Attributes of the GID
*
* Returns 0 on success or appropriate error code. It accepts zero
@@ -244,34 +325,42 @@ add_err:
* GID. However such zero GIDs are not added to the cache.
*/
static int add_modify_gid(struct ib_gid_table *table,
- const union ib_gid *gid,
const struct ib_gid_attr *attr)
{
- int ret;
+ struct ib_gid_table_entry *entry;
+ int ret = 0;
+
+ /*
+ * Invalidate any old entry in the table to make it safe to write to
+ * this index.
+ */
+ if (is_gid_entry_valid(table->data_vec[attr->index]))
+ put_gid_entry(table->data_vec[attr->index]);
+
+ /*
+ * Some HCA's report multiple GID entries with only one valid GID, and
+ * leave other unused entries as the zero GID. Convert zero GIDs to
+ * empty table entries instead of storing them.
+ */
+ if (rdma_is_zero_gid(&attr->gid))
+ return 0;
+
+ entry = alloc_gid_entry(attr);
+ if (!entry)
+ return -ENOMEM;
if (rdma_protocol_roce(attr->device, attr->port_num)) {
- ret = add_roce_gid(table, gid, attr);
+ ret = add_roce_gid(entry);
if (ret)
- return ret;
- } else {
- /*
- * Some HCA's report multiple GID entries with only one
- * valid GID, but remaining as zero GID.
- * So ignore such behavior for IB link layer and don't
- * fail the call, but don't add such entry to GID cache.
- */
- if (rdma_is_zero_gid(gid))
- return 0;
+ goto done;
}
- lockdep_assert_held(&table->lock);
- memcpy(&table->data_vec[attr->index].gid, gid, sizeof(*gid));
- memcpy(&table->data_vec[attr->index].attr, attr, sizeof(*attr));
-
- write_lock_irq(&table->rwlock);
- table->data_vec[attr->index].props &= ~GID_TABLE_ENTRY_INVALID;
- write_unlock_irq(&table->rwlock);
+ store_gid_entry(table, entry);
return 0;
+
+done:
+ put_gid_entry(entry);
+ return ret;
}
/**
@@ -286,16 +375,25 @@ static int add_modify_gid(struct ib_gid_table *table,
static void del_gid(struct ib_device *ib_dev, u8 port,
struct ib_gid_table *table, int ix)
{
+ struct ib_gid_table_entry *entry;
+
lockdep_assert_held(&table->lock);
+
+ pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
+ ib_dev->name, port, ix,
+ table->data_vec[ix]->attr.gid.raw);
+
write_lock_irq(&table->rwlock);
- table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID;
+ entry = table->data_vec[ix];
+ entry->state = GID_TABLE_ENTRY_PENDING_DEL;
+ /*
+ * For non RoCE protocol, GID entry slot is ready to use.
+ */
+ if (!rdma_protocol_roce(ib_dev, port))
+ table->data_vec[ix] = NULL;
write_unlock_irq(&table->rwlock);
- if (rdma_protocol_roce(ib_dev, port))
- del_roce_gid(ib_dev, port, table, ix);
- memset(&table->data_vec[ix].gid, 0, sizeof(table->data_vec[ix].gid));
- memset(&table->data_vec[ix].attr, 0, sizeof(table->data_vec[ix].attr));
- table->data_vec[ix].context = NULL;
+ put_gid_entry(entry);
}
/* rwlock should be read locked, or lock should be held */
@@ -308,8 +406,8 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
int empty = pempty ? -1 : 0;
while (i < table->sz && (found < 0 || empty < 0)) {
- struct ib_gid_table_entry *data = &table->data_vec[i];
- struct ib_gid_attr *attr = &data->attr;
+ struct ib_gid_table_entry *data = table->data_vec[i];
+ struct ib_gid_attr *attr;
int curr_index = i;
i++;
@@ -320,9 +418,9 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
* so lookup free slot only if requested.
*/
if (pempty && empty < 0) {
- if (data->props & GID_TABLE_ENTRY_INVALID &&
- (default_gid ==
- is_gid_index_default(table, curr_index))) {
+ if (is_gid_entry_free(data) &&
+ default_gid ==
+ is_gid_index_default(table, curr_index)) {
/*
* Found an invalid (free) entry; allocate it.
* If default GID is requested, then our
@@ -337,22 +435,23 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
/*
* Additionally find_gid() is used to find valid entry during
- * lookup operation, where validity needs to be checked. So
- * find the empty entry first to continue to search for a free
- * slot and ignore its INVALID flag.
+ * lookup operation; so ignore the entries which are marked as
+ * pending for removal and the entries which are marked as
+ * invalid.
*/
- if (data->props & GID_TABLE_ENTRY_INVALID)
+ if (!is_gid_entry_valid(data))
continue;
if (found >= 0)
continue;
+ attr = &data->attr;
if (mask & GID_ATTR_FIND_MASK_GID_TYPE &&
attr->gid_type != val->gid_type)
continue;
if (mask & GID_ATTR_FIND_MASK_GID &&
- memcmp(gid, &data->gid, sizeof(*gid)))
+ memcmp(gid, &data->attr.gid, sizeof(*gid)))
continue;
if (mask & GID_ATTR_FIND_MASK_NETDEV &&
@@ -409,7 +508,8 @@ static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
attr->device = ib_dev;
attr->index = empty;
attr->port_num = port;
- ret = add_modify_gid(table, gid, attr);
+ attr->gid = *gid;
+ ret = add_modify_gid(table, attr);
if (!ret)
dispatch_gid_change_event(ib_dev, port);
@@ -505,7 +605,8 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
mutex_lock(&table->lock);
for (ix = 0; ix < table->sz; ix++) {
- if (table->data_vec[ix].attr.ndev == ndev) {
+ if (is_gid_entry_valid(table->data_vec[ix]) &&
+ table->data_vec[ix]->attr.ndev == ndev) {
del_gid(ib_dev, port, table, ix);
deleted = true;
}
@@ -529,12 +630,13 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
if (index < 0 || index >= table->sz)
return -EINVAL;
- if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID)
+ if (!is_gid_entry_valid(table->data_vec[index]))
return -EINVAL;
- memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
+ memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid));
if (attr) {
- memcpy(attr, &table->data_vec[index].attr, sizeof(*attr));
+ memcpy(attr, &table->data_vec[index]->attr,
+ sizeof(*attr));
if (attr->ndev)
dev_hold(attr->ndev);
}
@@ -681,13 +783,14 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
for (i = 0; i < table->sz; i++) {
struct ib_gid_attr attr;
- if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
+ if (!is_gid_entry_valid(table->data_vec[i]))
continue;
- if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
+ if (memcmp(gid, &table->data_vec[i]->attr.gid,
+ sizeof(*gid)))
continue;
- memcpy(&attr, &table->data_vec[i].attr, sizeof(attr));
+ memcpy(&attr, &table->data_vec[i]->attr, sizeof(attr));
if (filter(gid, &attr, context)) {
found = true;
@@ -705,9 +808,7 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
static struct ib_gid_table *alloc_gid_table(int sz)
{
- struct ib_gid_table *table =
- kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
- int i;
+ struct ib_gid_table *table = kzalloc(sizeof(*table), GFP_KERNEL);
if (!table)
return NULL;
@@ -720,12 +821,6 @@ static struct ib_gid_table *alloc_gid_table(int sz)
table->sz = sz;
rwlock_init(&table->rwlock);
-
- /* Mark all entries as invalid so that allocator can allocate
- * one of the invalid (free) entry.
- */
- for (i = 0; i < sz; i++)
- table->data_vec[i].props |= GID_TABLE_ENTRY_INVALID;
return table;
err_free_table:
@@ -733,12 +828,30 @@ err_free_table:
return NULL;
}
-static void release_gid_table(struct ib_gid_table *table)
+static void release_gid_table(struct ib_device *device, u8 port,
+ struct ib_gid_table *table)
{
- if (table) {
- kfree(table->data_vec);
- kfree(table);
+ bool leak = false;
+ int i;
+
+ if (!table)
+ return;
+
+ for (i = 0; i < table->sz; i++) {
+ if (is_gid_entry_free(table->data_vec[i]))
+ continue;
+ if (kref_read(&table->data_vec[i]->kref) > 1) {
+ pr_err("GID entry ref leak for %s (index %d) ref=%d\n",
+ device->name, i,
+ kref_read(&table->data_vec[i]->kref));
+ leak = true;
+ }
}
+ if (leak)
+ return;
+
+ kfree(table->data_vec);
+ kfree(table);
}
static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
@@ -752,7 +865,7 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
mutex_lock(&table->lock);
for (i = 0; i < table->sz; ++i) {
- if (!rdma_is_zero_gid(&table->data_vec[i].gid)) {
+ if (is_gid_entry_valid(table->data_vec[i])) {
del_gid(ib_dev, port, table, i);
deleted = true;
}
@@ -821,7 +934,7 @@ static void gid_table_release_one(struct ib_device *ib_dev)
for (port = 0; port < ib_dev->phys_port_cnt; port++) {
table = ib_dev->cache.ports[port].gid;
- release_gid_table(table);
+ release_gid_table(ib_dev, port, table);
ib_dev->cache.ports[port].gid = NULL;
}
}
@@ -1100,7 +1213,6 @@ static int config_non_roce_gid_cache(struct ib_device *device,
{
struct ib_gid_attr gid_attr = {};
struct ib_gid_table *table;
- union ib_gid gid;
int ret = 0;
int i;
@@ -1112,14 +1224,14 @@ static int config_non_roce_gid_cache(struct ib_device *device,
for (i = 0; i < gid_tbl_len; ++i) {
if (!device->query_gid)
continue;
- ret = device->query_gid(device, port, i, &gid);
+ ret = device->query_gid(device, port, i, &gid_attr.gid);
if (ret) {
pr_warn("query_gid failed (%d) for %s (index %d)\n",
ret, device->name, i);
goto err;
}
gid_attr.index = i;
- add_modify_gid(table, &gid, &gid_attr);
+ add_modify_gid(table, &gid_attr);
}
err:
mutex_unlock(&table->lock);
@@ -1302,4 +1414,9 @@ void ib_cache_cleanup_one(struct ib_device *device)
ib_unregister_event_handler(&device->cache.event_handler);
flush_workqueue(ib_wq);
gid_table_cleanup_one(device);
+
+ /*
+ * Flush the wq second time for any pending GID delete work.
+ */
+ flush_workqueue(ib_wq);
}