diff options
Diffstat (limited to 'drivers/infiniband/core')
24 files changed, 650 insertions, 522 deletions
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 9b76a8fcdd24..6d7ec371e7b2 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -183,7 +183,7 @@ static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, GFP_KERNEL); + rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, GFP_KERNEL); /* Make the request retry, so when we get the response from userspace * we will have something. @@ -352,7 +352,7 @@ static bool has_gateway(const struct dst_entry *dst, sa_family_t family) if (family == AF_INET) { rt = container_of(dst, struct rtable, dst); - return rt->rt_gw_family == AF_INET; + return rt->rt_uses_gateway; } rt6 = container_of(dst, struct rt6_info, dst); diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 18e476b3ced0..00fb3eacda19 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -810,6 +810,7 @@ static void release_gid_table(struct ib_device *device, if (leak) return; + mutex_destroy(&table->lock); kfree(table->data_vec); kfree(table); } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 19f1730a4f24..0e3cf3461999 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3046,7 +3046,7 @@ static void addr_handler(int status, struct sockaddr *src_addr, if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", status); - } else { + } else if (status) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status); } @@ -4724,10 +4724,14 @@ static int __init cma_init(void) if (ret) goto err; - cma_configfs_init(); + ret = cma_configfs_init(); + if (ret) + goto err_ib; return 0; +err_ib: + ib_unregister_client(&cma_client); err: unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index 3ec2c415bb70..8b0b5ae22e4c 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -342,12 +342,18 @@ static struct configfs_subsystem cma_subsys = { int __init cma_configfs_init(void) { + int ret; + config_group_init(&cma_subsys.su_group); mutex_init(&cma_subsys.su_mutex); - return configfs_register_subsystem(&cma_subsys); + ret = configfs_register_subsystem(&cma_subsys); + if (ret) + mutex_destroy(&cma_subsys.su_mutex); + return ret; } void __exit cma_configfs_exit(void) { configfs_unregister_subsystem(&cma_subsys); + mutex_destroy(&cma_subsys.su_mutex); } diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 888d89ce81df..3a8b0911c3bc 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -36,6 +36,8 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/cgroup_rdma.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> #include <rdma/ib_verbs.h> #include <rdma/opa_addr.h> @@ -54,8 +56,26 @@ struct pkey_index_qp_list { struct list_head qp_list; }; +/** + * struct rdma_dev_net - rdma net namespace metadata for a net + * @nl_sock: Pointer to netlink socket + * @net: Pointer to owner net namespace + * @id: xarray id to identify the net namespace. + */ +struct rdma_dev_net { + struct sock *nl_sock; + possible_net_t net; + u32 id; +}; + extern const struct attribute_group ib_dev_attr_group; extern bool ib_devices_shared_netns; +extern unsigned int rdma_dev_net_id; + +static inline struct rdma_dev_net *rdma_net_to_dev_net(struct net *net) +{ + return net_generic(net, rdma_dev_net_id); +} int ib_device_register_sysfs(struct ib_device *device); void ib_device_unregister_sysfs(struct ib_device *device); @@ -179,7 +199,6 @@ void ib_mad_cleanup(void); int ib_sa_init(void); void ib_sa_cleanup(void); -int rdma_nl_init(void); void rdma_nl_exit(void); int ib_nl_handle_resolve_resp(struct sk_buff *skb, @@ -302,7 +321,9 @@ static inline struct ib_qp *_ib_create_qp(struct ib_device *dev, struct ib_udata *udata, struct ib_uobject *uobj) { + enum ib_qp_type qp_type = attr->qp_type; struct ib_qp *qp; + bool is_xrc; if (!dev->ops.create_qp) return ERR_PTR(-EOPNOTSUPP); @@ -320,7 +341,8 @@ static inline struct ib_qp *_ib_create_qp(struct ib_device *dev, * and more importantly they are created internaly by driver, * see mlx5 create_dev_resources() as an example. */ - if (attr->qp_type < IB_QPT_XRC_INI) { + is_xrc = qp_type == IB_QPT_XRC_INI || qp_type == IB_QPT_XRC_TGT; + if ((qp_type < IB_QPT_MAX && !is_xrc) || qp_type == IB_QPT_DRIVER) { qp->res.type = RDMA_RESTRACK_QP; if (uobj) rdma_restrack_uadd(&qp->res); @@ -362,4 +384,7 @@ void ib_port_unregister_module_stat(struct kobject *kobj); int ib_device_set_netns_put(struct sk_buff *skb, struct ib_device *dev, u32 ns_fd); + +int rdma_nl_net_init(struct rdma_dev_net *rnet); +void rdma_nl_net_exit(struct rdma_dev_net *rnet); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 01faef7bc061..680ad27f497d 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -38,6 +38,9 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, int ret; port_counter = &dev->port_data[port].port_counter; + if (!port_counter->hstats) + return -EOPNOTSUPP; + mutex_lock(&port_counter->lock); if (on) { ret = __counter_set_mode(&port_counter->mode, @@ -146,13 +149,11 @@ static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, struct auto_mode_param *param = &counter->mode.param; bool match = true; - if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res)) + if (!rdma_is_visible_in_pid_ns(&qp->res)) return false; - /* Ensure that counter belong to right PID */ - if (!rdma_is_kernel_res(&counter->res) && - !rdma_is_kernel_res(&qp->res) && - (task_pid_vnr(counter->res.task) != current->pid)) + /* Ensure that counter belongs to the right PID */ + if (task_pid_nr(counter->res.task) != task_pid_nr(qp->res.task)) return false; if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE) @@ -393,6 +394,9 @@ u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index) u64 sum; port_counter = &dev->port_data[port].port_counter; + if (!port_counter->hstats) + return 0; + sum = get_running_counters_hwstat_sum(dev, port, index); sum += port_counter->hstats->value[index]; @@ -418,7 +422,7 @@ static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num) return qp; err: - rdma_restrack_put(&qp->res); + rdma_restrack_put(res); return NULL; } @@ -506,6 +510,9 @@ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port, if (!rdma_is_port_valid(dev, port)) return -EINVAL; + if (!dev->port_data[port].port_counter.hstats) + return -EOPNOTSUPP; + qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; @@ -592,9 +599,9 @@ int rdma_counter_get_mode(struct ib_device *dev, u8 port, void rdma_counter_init(struct ib_device *dev) { struct rdma_port_counter *port_counter; - u32 port; + u32 port, i; - if (!dev->ops.alloc_hw_stats || !dev->port_data) + if (!dev->port_data) return; rdma_for_each_port(dev, port) { @@ -602,6 +609,9 @@ void rdma_counter_init(struct ib_device *dev) port_counter->mode.mode = RDMA_COUNTER_MODE_NONE; mutex_init(&port_counter->lock); + if (!dev->ops.alloc_hw_stats) + continue; + port_counter->hstats = dev->ops.alloc_hw_stats(dev, port); if (!port_counter->hstats) goto fail; @@ -610,13 +620,12 @@ void rdma_counter_init(struct ib_device *dev) return; fail: - rdma_for_each_port(dev, port) { + for (i = port; i >= rdma_start_port(dev); i--) { port_counter = &dev->port_data[port].port_counter; kfree(port_counter->hstats); port_counter->hstats = NULL; + mutex_destroy(&port_counter->lock); } - - return; } void rdma_counter_release(struct ib_device *dev) @@ -624,11 +633,9 @@ void rdma_counter_release(struct ib_device *dev) struct rdma_port_counter *port_counter; u32 port; - if (!dev->ops.alloc_hw_stats) - return; - rdma_for_each_port(dev, port) { port_counter = &dev->port_data[port].port_counter; kfree(port_counter->hstats); + mutex_destroy(&port_counter->lock); } } diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index 7c599878ccf7..bbfded6d5d3d 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -253,6 +253,34 @@ out_free_cq: EXPORT_SYMBOL(__ib_alloc_cq_user); /** + * __ib_alloc_cq_any - allocate a completion queue + * @dev: device to allocate the CQ for + * @private: driver private data, accessible from cq->cq_context + * @nr_cqe: number of CQEs to allocate + * @poll_ctx: context to poll the CQ from + * @caller: module owner name + * + * Attempt to spread ULP Completion Queues over each device's interrupt + * vectors. A simple best-effort mechanism is used. + */ +struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private, + int nr_cqe, enum ib_poll_context poll_ctx, + const char *caller) +{ + static atomic_t counter; + int comp_vector = 0; + + if (dev->num_comp_vectors > 1) + comp_vector = + atomic_inc_return(&counter) % + min_t(int, dev->num_comp_vectors, num_online_cpus()); + + return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx, + caller, NULL); +} +EXPORT_SYMBOL(__ib_alloc_cq_any); + +/** * ib_free_cq_user - free a completion queue * @cq: completion queue to free. * @udata: User data or NULL for kernel object diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 9773145dee09..99c4a55545cf 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -39,7 +39,6 @@ #include <linux/init.h> #include <linux/netdevice.h> #include <net/net_namespace.h> -#include <net/netns/generic.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/hashtable.h> @@ -94,28 +93,24 @@ static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); static DECLARE_RWSEM(devices_rwsem); #define DEVICE_REGISTERED XA_MARK_1 -static LIST_HEAD(client_list); +static u32 highest_client_id; #define CLIENT_REGISTERED XA_MARK_1 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); static DECLARE_RWSEM(clients_rwsem); +static void ib_client_put(struct ib_client *client) +{ + if (refcount_dec_and_test(&client->uses)) + complete(&client->uses_zero); +} + /* * If client_data is registered then the corresponding client must also still * be registered. */ #define CLIENT_DATA_REGISTERED XA_MARK_1 -/** - * struct rdma_dev_net - rdma net namespace metadata for a net - * @net: Pointer to owner net namespace - * @id: xarray id to identify the net namespace. - */ -struct rdma_dev_net { - possible_net_t net; - u32 id; -}; - -static unsigned int rdma_dev_net_id; +unsigned int rdma_dev_net_id; /* * A list of net namespaces is maintained in an xarray. This is necessary @@ -508,6 +503,9 @@ static void ib_device_release(struct device *device) rcu_head); } + mutex_destroy(&dev->unregistration_lock); + mutex_destroy(&dev->compat_devs_mutex); + xa_destroy(&dev->compat_devs); xa_destroy(&dev->client_data); kfree_rcu(dev, rcu_head); @@ -661,6 +659,14 @@ static int add_client_context(struct ib_device *device, down_write(&device->client_data_rwsem); /* + * So long as the client is registered hold both the client and device + * unregistration locks. + */ + if (!refcount_inc_not_zero(&client->uses)) + goto out_unlock; + refcount_inc(&device->refcount); + + /* * Another caller to add_client_context got here first and has already * completely initialized context. */ @@ -683,6 +689,9 @@ static int add_client_context(struct ib_device *device, return 0; out: + ib_device_put(device); + ib_client_put(client); +out_unlock: up_write(&device->client_data_rwsem); return ret; } @@ -702,7 +711,7 @@ static void remove_client_context(struct ib_device *device, client_data = xa_load(&device->client_data, client_id); xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); client = xa_load(&clients, client_id); - downgrade_write(&device->client_data_rwsem); + up_write(&device->client_data_rwsem); /* * Notice we cannot be holding any exclusive locks when calling the @@ -712,17 +721,13 @@ static void remove_client_context(struct ib_device *device, * * For this reason clients and drivers should not call the * unregistration functions will holdling any locks. - * - * It tempting to drop the client_data_rwsem too, but this is required - * to ensure that unregister_client does not return until all clients - * are completely unregistered, which is required to avoid module - * unloading races. */ if (client->remove) client->remove(device, client_data); xa_erase(&device->client_data, client_id); - up_read(&device->client_data_rwsem); + ib_device_put(device); + ib_client_put(client); } static int alloc_port_data(struct ib_device *device) @@ -1047,7 +1052,7 @@ int rdma_compatdev_set(u8 enable) static void rdma_dev_exit_net(struct net *net) { - struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); struct ib_device *dev; unsigned long index; int ret; @@ -1081,25 +1086,32 @@ static void rdma_dev_exit_net(struct net *net) } up_read(&devices_rwsem); + rdma_nl_net_exit(rnet); xa_erase(&rdma_nets, rnet->id); } static __net_init int rdma_dev_init_net(struct net *net) { - struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); unsigned long index; struct ib_device *dev; int ret; + write_pnet(&rnet->net, net); + + ret = rdma_nl_net_init(rnet); + if (ret) + return ret; + /* No need to create any compat devices in default init_net. */ if (net_eq(net, &init_net)) return 0; - write_pnet(&rnet->net, net); - ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); - if (ret) + if (ret) { + rdma_nl_net_exit(rnet); return ret; + } down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { @@ -1224,7 +1236,7 @@ static int setup_device(struct ib_device *device) static void disable_device(struct ib_device *device) { - struct ib_client *client; + u32 cid; WARN_ON(!refcount_read(&device->refcount)); @@ -1232,10 +1244,19 @@ static void disable_device(struct ib_device *device) xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); up_write(&devices_rwsem); + /* + * Remove clients in LIFO order, see assign_client_id. This could be + * more efficient if xarray learns to reverse iterate. Since no new + * clients can be added to this ib_device past this point we only need + * the maximum possible client_id value here. + */ down_read(&clients_rwsem); - list_for_each_entry_reverse(client, &client_list, list) - remove_client_context(device, client->client_id); + cid = highest_client_id; up_read(&clients_rwsem); + while (cid) { + cid--; + remove_client_context(device, cid); + } /* Pairs with refcount_set in enable_device */ ib_device_put(device); @@ -1662,30 +1683,31 @@ static int assign_client_id(struct ib_client *client) /* * The add/remove callbacks must be called in FIFO/LIFO order. To * achieve this we assign client_ids so they are sorted in - * registration order, and retain a linked list we can reverse iterate - * to get the LIFO order. The extra linked list can go away if xarray - * learns to reverse iterate. + * registration order. */ - if (list_empty(&client_list)) { - client->client_id = 0; - } else { - struct ib_client *last; - - last = list_last_entry(&client_list, struct ib_client, list); - client->client_id = last->client_id + 1; - } + client->client_id = highest_client_id; ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); if (ret) goto out; + highest_client_id++; xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); - list_add_tail(&client->list, &client_list); out: up_write(&clients_rwsem); return ret; } +static void remove_client_id(struct ib_client *client) +{ + down_write(&clients_rwsem); + xa_erase(&clients, client->client_id); + for (; highest_client_id; highest_client_id--) + if (xa_load(&clients, highest_client_id - 1)) + break; + up_write(&clients_rwsem); +} + /** * ib_register_client - Register an IB client * @client:Client to register @@ -1705,6 +1727,8 @@ int ib_register_client(struct ib_client *client) unsigned long index; int ret; + refcount_set(&client->uses, 1); + init_completion(&client->uses_zero); ret = assign_client_id(client); if (ret) return ret; @@ -1740,21 +1764,30 @@ void ib_unregister_client(struct ib_client *client) unsigned long index; down_write(&clients_rwsem); + ib_client_put(client); xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); up_write(&clients_rwsem); - /* - * Every device still known must be serialized to make sure we are - * done with the client callbacks before we return. - */ - down_read(&devices_rwsem); - xa_for_each (&devices, index, device) + + /* We do not want to have locks while calling client->remove() */ + rcu_read_lock(); + xa_for_each (&devices, index, device) { + if (!ib_device_try_get(device)) + continue; + rcu_read_unlock(); + remove_client_context(device, client->client_id); - up_read(&devices_rwsem); - down_write(&clients_rwsem); - list_del(&client->list); - xa_erase(&clients, client->client_id); - up_write(&clients_rwsem); + ib_device_put(device); + rcu_read_lock(); + } + rcu_read_unlock(); + + /* + * remove_client_context() is not a fence, it can return even though a + * removal is ongoing. Wait until all removals are completed. + */ + wait_for_completion(&client->uses_zero); + remove_client_id(client); } EXPORT_SYMBOL(ib_unregister_client); @@ -1940,31 +1973,64 @@ void ib_dispatch_event(struct ib_event *event) } EXPORT_SYMBOL(ib_dispatch_event); -/** - * ib_query_port - Query IB port attributes - * @device:Device to query - * @port_num:Port number to query - * @port_attr:Port attributes - * - * ib_query_port() returns the attributes of a port through the - * @port_attr pointer. - */ -int ib_query_port(struct ib_device *device, - u8 port_num, - struct ib_port_attr *port_attr) +static int iw_query_port(struct ib_device *device, + u8 port_num, + struct ib_port_attr *port_attr) { - union ib_gid gid; + struct in_device *inetdev; + struct net_device *netdev; int err; - if (!rdma_is_port_valid(device, port_num)) - return -EINVAL; + memset(port_attr, 0, sizeof(*port_attr)); + + netdev = ib_device_get_netdev(device, port_num); + if (!netdev) + return -ENODEV; + + dev_put(netdev); + + port_attr->max_mtu = IB_MTU_4096; + port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); + + if (!netif_carrier_ok(netdev)) { + port_attr->state = IB_PORT_DOWN; + port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; + } else { + inetdev = in_dev_get(netdev); + + if (inetdev && inetdev->ifa_list) { + port_attr->state = IB_PORT_ACTIVE; + port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + in_dev_put(inetdev); + } else { + port_attr->state = IB_PORT_INIT; + port_attr->phys_state = + IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; + } + } + + err = device->ops.query_port(device, port_num, port_attr); + if (err) + return err; + + return 0; +} + +static int __ib_query_port(struct ib_device *device, + u8 port_num, + struct ib_port_attr *port_attr) +{ + union ib_gid gid = {}; + int err; memset(port_attr, 0, sizeof(*port_attr)); + err = device->ops.query_port(device, port_num, port_attr); if (err || port_attr->subnet_prefix) return err; - if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) + if (rdma_port_get_link_layer(device, port_num) != + IB_LINK_LAYER_INFINIBAND) return 0; err = device->ops.query_gid(device, port_num, 0, &gid); @@ -1974,6 +2040,28 @@ int ib_query_port(struct ib_device *device, port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix); return 0; } + +/** + * ib_query_port - Query IB port attributes + * @device:Device to query + * @port_num:Port number to query + * @port_attr:Port attributes + * + * ib_query_port() returns the attributes of a port through the + * @port_attr pointer. + */ +int ib_query_port(struct ib_device *device, + u8 port_num, + struct ib_port_attr *port_attr) +{ + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + if (rdma_protocol_iwarp(device, port_num)) + return iw_query_port(device, port_num, port_attr); + else + return __ib_query_port(device, port_num, port_attr); +} EXPORT_SYMBOL(ib_query_port); static void add_ndev_hash(struct ib_port_data *pdata) @@ -2528,6 +2616,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_stats); SET_DEVICE_OP(dev_ops, init_port); + SET_DEVICE_OP(dev_ops, invalidate_range); SET_DEVICE_OP(dev_ops, iw_accept); SET_DEVICE_OP(dev_ops, iw_add_ref); SET_DEVICE_OP(dev_ops, iw_connect); @@ -2626,12 +2715,6 @@ static int __init ib_core_init(void) goto err_comp_unbound; } - ret = rdma_nl_init(); - if (ret) { - pr_warn("Couldn't init IB netlink interface: err %d\n", ret); - goto err_sysfs; - } - ret = addr_init(); if (ret) { pr_warn("Could't init IB address resolution\n"); @@ -2677,8 +2760,6 @@ err_mad: err_addr: addr_cleanup(); err_ibnl: - rdma_nl_exit(); -err_sysfs: class_unregister(&ib_class); err_comp_unbound: destroy_workqueue(ib_comp_unbound_wq); diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index 7d841b689a1e..e08aec427027 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -148,13 +148,6 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool) hlist_del_init(&fmr->cache_node); fmr->remap_count = 0; list_add_tail(&fmr->fmr->list, &fmr_list); - -#ifdef DEBUG - if (fmr->ref_count !=0) { - pr_warn(PFX "Unmapping FMR 0x%08x with ref count %d\n", - fmr, fmr->ref_count); - } -#endif } list_splice_init(&pool->dirty_list, &unmap_list); @@ -496,12 +489,6 @@ void ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) } } -#ifdef DEBUG - if (fmr->ref_count < 0) - pr_warn(PFX "FMR %p has ref count %d < 0\n", - fmr, fmr->ref_count); -#endif - spin_unlock_irqrestore(&pool->pool_lock, flags); } EXPORT_SYMBOL(ib_fmr_pool_unmap); diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 2452b0ddcf0d..46686990a827 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -112,7 +112,7 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); - ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL); + ret = rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNAVAILABLE; @@ -124,8 +124,7 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) return ret; pid_query_error: pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); - if (skb) - dev_kfree_skb(skb); + dev_kfree_skb(skb); if (nlmsg_request) iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; @@ -202,7 +201,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; - ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); + ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; @@ -214,8 +213,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) add_mapping_error: pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); add_mapping_error_nowarn: - if (skb) - dev_kfree_skb(skb); + dev_kfree_skb(skb); if (nlmsg_request) iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; @@ -297,7 +295,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; - ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); + ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ err_str = "Unable to send a nlmsg"; @@ -308,8 +306,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) query_mapping_error: pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); query_mapping_error_nowarn: - if (skb) - dev_kfree_skb(skb); + dev_kfree_skb(skb); if (nlmsg_request) iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; @@ -364,7 +361,7 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) nlmsg_end(skb, nlh); - ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); + ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index 41929bb83739..13495b43dbc1 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -645,7 +645,7 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) nlmsg_end(skb, nlh); - ret = rdma_nl_unicast(skb, iwpm_pid); + ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; @@ -655,8 +655,7 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) return 0; mapinfo_num_error: pr_info("%s: %s\n", __func__, err_str); - if (skb) - dev_kfree_skb(skb); + dev_kfree_skb(skb); return ret; } @@ -674,7 +673,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; - ret = rdma_nl_unicast(skb, iwpm_pid); + ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) pr_warn("%s Unable to send a nlmsg\n", __func__); return ret; @@ -778,8 +777,7 @@ send_mapping_info_unlock: send_mapping_info_exit: if (ret) { pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret); - if (skb) - dev_kfree_skb(skb); + dev_kfree_skb(skb); return ret; } send_nlmsg_done(skb, nl_client, iwpm_pid); @@ -824,7 +822,7 @@ int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version) goto hello_num_error; nlmsg_end(skb, nlh); - ret = rdma_nl_unicast(skb, iwpm_pid); + ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; @@ -834,7 +832,6 @@ int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version) return 0; hello_num_error: pr_info("%s: %s\n", __func__, err_str); - if (skb) - dev_kfree_skb(skb); + dev_kfree_skb(skb); return ret; } diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index cc99479b2c09..9947d16edef2 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -3224,18 +3224,18 @@ static int ib_mad_port_open(struct ib_device *device, if (has_smi) cq_size *= 2; + port_priv->pd = ib_alloc_pd(device, 0); + if (IS_ERR(port_priv->pd)) { + dev_err(&device->dev, "Couldn't create ib_mad PD\n"); + ret = PTR_ERR(port_priv->pd); + goto error3; + } + port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, IB_POLL_UNBOUND_WORKQUEUE); if (IS_ERR(port_priv->cq)) { dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); - goto error3; - } - - port_priv->pd = ib_alloc_pd(device, 0); - if (IS_ERR(port_priv->pd)) { - dev_err(&device->dev, "Couldn't create ib_mad PD\n"); - ret = PTR_ERR(port_priv->pd); goto error4; } @@ -3278,11 +3278,11 @@ error8: error7: destroy_mad_qp(&port_priv->qp_info[0]); error6: - ib_dealloc_pd(port_priv->pd); -error4: ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); +error4: + ib_dealloc_pd(port_priv->pd); error3: kfree(port_priv); @@ -3312,8 +3312,8 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) destroy_workqueue(port_priv->wq); destroy_mad_qp(&port_priv->qp_info[1]); destroy_mad_qp(&port_priv->qp_info[0]); - ib_dealloc_pd(port_priv->pd); ib_free_cq(port_priv->cq); + ib_dealloc_pd(port_priv->pd); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); /* XXX: Handle deallocation of MAD registration tables */ diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index eecfc0b377c9..81dbd5f41bed 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -36,20 +36,22 @@ #include <linux/export.h> #include <net/netlink.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> #include <net/sock.h> #include <rdma/rdma_netlink.h> #include <linux/module.h> #include "core_priv.h" static DEFINE_MUTEX(rdma_nl_mutex); -static struct sock *nls; static struct { const struct rdma_nl_cbs *cb_table; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; bool rdma_nl_chk_listeners(unsigned int group) { - return netlink_has_listeners(nls, group); + struct rdma_dev_net *rnet = rdma_net_to_dev_net(&init_net); + + return netlink_has_listeners(rnet->nl_sock, group); } EXPORT_SYMBOL(rdma_nl_chk_listeners); @@ -73,13 +75,21 @@ static bool is_nl_msg_valid(unsigned int type, unsigned int op) return (op < max_num_ops[type]) ? true : false; } -static bool is_nl_valid(unsigned int type, unsigned int op) +static bool +is_nl_valid(const struct sk_buff *skb, unsigned int type, unsigned int op) { const struct rdma_nl_cbs *cb_table; if (!is_nl_msg_valid(type, op)) return false; + /* + * Currently only NLDEV client is supporting netlink commands in + * non init_net net namespace. + */ + if (sock_net(skb->sk) != &init_net && type != RDMA_NL_NLDEV) + return false; + if (!rdma_nl_types[type].cb_table) { mutex_unlock(&rdma_nl_mutex); request_module("rdma-netlink-subsys-%d", type); @@ -161,7 +171,7 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, unsigned int op = RDMA_NL_GET_OP(type); const struct rdma_nl_cbs *cb_table; - if (!is_nl_valid(index, op)) + if (!is_nl_valid(skb, index, op)) return -EINVAL; cb_table = rdma_nl_types[index].cb_table; @@ -185,7 +195,7 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, .dump = cb_table[op].dump, }; if (c.dump) - return netlink_dump_start(nls, skb, nlh, &c); + return netlink_dump_start(skb->sk, skb, nlh, &c); return -EINVAL; } @@ -258,52 +268,65 @@ static void rdma_nl_rcv(struct sk_buff *skb) mutex_unlock(&rdma_nl_mutex); } -int rdma_nl_unicast(struct sk_buff *skb, u32 pid) +int rdma_nl_unicast(struct net *net, struct sk_buff *skb, u32 pid) { + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); int err; - err = netlink_unicast(nls, skb, pid, MSG_DONTWAIT); + err = netlink_unicast(rnet->nl_sock, skb, pid, MSG_DONTWAIT); return (err < 0) ? err : 0; } EXPORT_SYMBOL(rdma_nl_unicast); -int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid) +int rdma_nl_unicast_wait(struct net *net, struct sk_buff *skb, __u32 pid) { + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); int err; - err = netlink_unicast(nls, skb, pid, 0); + err = netlink_unicast(rnet->nl_sock, skb, pid, 0); return (err < 0) ? err : 0; } EXPORT_SYMBOL(rdma_nl_unicast_wait); -int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags) +int rdma_nl_multicast(struct net *net, struct sk_buff *skb, + unsigned int group, gfp_t flags) { - return nlmsg_multicast(nls, skb, 0, group, flags); + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); + + return nlmsg_multicast(rnet->nl_sock, skb, 0, group, flags); } EXPORT_SYMBOL(rdma_nl_multicast); -int __init rdma_nl_init(void) +void rdma_nl_exit(void) +{ + int idx; + + for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) + WARN(rdma_nl_types[idx].cb_table, + "Netlink client %d wasn't released prior to unloading %s\n", + idx, KBUILD_MODNAME); +} + +int rdma_nl_net_init(struct rdma_dev_net *rnet) { + struct net *net = read_pnet(&rnet->net); struct netlink_kernel_cfg cfg = { .input = rdma_nl_rcv, }; + struct sock *nls; - nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg); + nls = netlink_kernel_create(net, NETLINK_RDMA, &cfg); if (!nls) return -ENOMEM; nls->sk_sndtimeo = 10 * HZ; + rnet->nl_sock = nls; return 0; } -void rdma_nl_exit(void) +void rdma_nl_net_exit(struct rdma_dev_net *rnet) { - int idx; - - for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) - rdma_nl_unregister(idx); - - netlink_kernel_release(nls); + netlink_kernel_release(rnet->nl_sock); } MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_RDMA); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 783e465e7c41..7a7474000100 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -382,8 +382,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) for (i = 0; i < RDMA_RESTRACK_MAX; i++) { if (!names[i]) continue; - curr = rdma_restrack_count(device, i, - task_active_pid_ns(current)); + curr = rdma_restrack_count(device, i); ret = fill_res_info_entry(msg, names[i], curr); if (ret) goto err; @@ -832,7 +831,7 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); @@ -972,7 +971,7 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); @@ -1074,7 +1073,7 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); @@ -1251,7 +1250,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); @@ -1596,7 +1595,7 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, put_device(data.cdev); if (ibdev) ib_device_put(ibdev); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); out_data: put_device(data.cdev); @@ -1636,7 +1635,7 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } nlmsg_end(msg, nlh); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); } static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1734,7 +1733,7 @@ static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_fill: rdma_counter_unbind_qpn(device, port, qpn, cntn); @@ -1802,7 +1801,7 @@ static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_fill: rdma_counter_bind_qpn(device, port, qpn, cntn); @@ -1893,7 +1892,7 @@ static int stat_get_doit_default_counter(struct sk_buff *skb, mutex_unlock(&stats->lock); nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_table: nla_nest_cancel(msg, table_attr); @@ -1952,16 +1951,20 @@ static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, if (fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || - nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode)) + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode)) { + ret = -EMSGSIZE; goto err_msg; + } if ((mode == RDMA_COUNTER_MODE_AUTO) && - nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) { + ret = -EMSGSIZE; goto err_msg; + } nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_msg: nlmsg_free(msg); diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index bddff426ee0f..a07665f7ef8c 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -107,10 +107,8 @@ void rdma_restrack_clean(struct ib_device *dev) * rdma_restrack_count() - the current usage of specific object * @dev: IB device * @type: actual type of object to operate - * @ns: PID namespace */ -int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, - struct pid_namespace *ns) +int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type) { struct rdma_restrack_root *rt = &dev->res[type]; struct rdma_restrack_entry *e; @@ -119,10 +117,9 @@ int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, xa_lock(&rt->xa); xas_for_each(&xas, e, U32_MAX) { - if (ns == &init_pid_ns || - (!rdma_is_kernel_res(e) && - ns == task_active_pid_ns(e->task))) - cnt++; + if (!rdma_is_visible_in_pid_ns(e)) + continue; + cnt++; } xa_unlock(&rt->xa); return cnt; @@ -360,5 +357,7 @@ bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res) */ if (rdma_is_kernel_res(res)) return task_active_pid_ns(current) == &init_pid_ns; - return task_active_pid_ns(current) == task_active_pid_ns(res->task); + + /* PID 0 means that resource is not found in current namespace */ + return task_pid_vnr(res->task); } diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index dce06108c8c3..5337393d4dfe 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -583,8 +583,10 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, break; } - /* P2PDMA contexts do not need to be unmapped */ - if (!is_pci_p2pdma_page(sg_page(sg))) + if (is_pci_p2pdma_page(sg_page(sg))) + pci_p2pdma_unmap_sg(qp->pd->device->dma_device, sg, + sg_cnt, dir); + else ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); } EXPORT_SYMBOL(rdma_rw_ctx_destroy); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 7d8071c7e564..17fc2936c077 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -860,7 +860,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - return rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); + return rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, gfp_mask); } static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index b477295a96c2..7a50cedcef1f 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -289,6 +289,24 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, ib_width_enum_to_int(attr.active_width), speed); } +static const char *phys_state_to_str(enum ib_port_phys_state phys_state) +{ + static const char * phys_state_str[] = { + "<unknown>", + "Sleep", + "Polling", + "Disabled", + "PortConfigurationTraining", + "LinkUp", + "LinkErrorRecovery", + "Phy Test", + }; + + if (phys_state < ARRAY_SIZE(phys_state_str)) + return phys_state_str[phys_state]; + return "<unknown>"; +} + static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, char *buf) { @@ -300,16 +318,8 @@ static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, if (ret) return ret; - switch (attr.phys_state) { - case 1: return sprintf(buf, "1: Sleep\n"); - case 2: return sprintf(buf, "2: Polling\n"); - case 3: return sprintf(buf, "3: Disabled\n"); - case 4: return sprintf(buf, "4: PortConfigurationTraining\n"); - case 5: return sprintf(buf, "5: LinkUp\n"); - case 6: return sprintf(buf, "6: LinkErrorRecovery\n"); - case 7: return sprintf(buf, "7: Phy Test\n"); - default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state); - } + return sprintf(buf, "%d: %s\n", attr.phys_state, + phys_state_to_str(attr.phys_state)); } static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused, diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 08da840ed7ee..24244a2f68cc 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -54,10 +54,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { page = sg_page_iter_page(&sg_iter); - if (umem->writable && dirty) - put_user_pages_dirty_lock(&page, 1); - else - put_user_page(page); + put_user_pages_dirty_lock(&page, 1, umem->writable && dirty); } sg_free_table(&umem->sg_head); @@ -184,9 +181,6 @@ EXPORT_SYMBOL(ib_umem_find_best_pgsz); /** * ib_umem_get - Pin and DMA map userspace memory. * - * If access flags indicate ODP memory, avoid pinning. Instead, stores - * the mm for future page fault handling in conjunction with MMU notifiers. - * * @udata: userspace context to pin memory for * @addr: userspace virtual address to start at * @size: length of region to pin @@ -231,36 +225,19 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, if (!can_do_mlock()) return ERR_PTR(-EPERM); - if (access & IB_ACCESS_ON_DEMAND) { - umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); - umem->is_odp = 1; - } else { - umem = kzalloc(sizeof(*umem), GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); - } + if (access & IB_ACCESS_ON_DEMAND) + return ERR_PTR(-EOPNOTSUPP); - umem->context = context; + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + umem->ibdev = context->device; umem->length = size; umem->address = addr; umem->writable = ib_access_writable(access); umem->owning_mm = mm = current->mm; mmgrab(mm); - if (access & IB_ACCESS_ON_DEMAND) { - if (WARN_ON_ONCE(!context->invalidate_range)) { - ret = -EINVAL; - goto umem_kfree; - } - - ret = ib_umem_odp_get(to_ib_umem_odp(umem), access); - if (ret) - goto umem_kfree; - return umem; - } - page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) { ret = -ENOMEM; @@ -346,15 +323,6 @@ umem_kfree: } EXPORT_SYMBOL(ib_umem_get); -static void __ib_umem_release_tail(struct ib_umem *umem) -{ - mmdrop(umem->owning_mm); - if (umem->is_odp) - kfree(to_ib_umem_odp(umem)); - else - kfree(umem); -} - /** * ib_umem_release - release memory pinned with ib_umem_get * @umem: umem struct to release @@ -363,30 +331,22 @@ void ib_umem_release(struct ib_umem *umem) { if (!umem) return; + if (umem->is_odp) + return ib_umem_odp_release(to_ib_umem_odp(umem)); - if (umem->is_odp) { - ib_umem_odp_release(to_ib_umem_odp(umem)); - __ib_umem_release_tail(umem); - return; - } - - __ib_umem_release(umem->context->device, umem, 1); + __ib_umem_release(umem->ibdev, umem, 1); atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); - __ib_umem_release_tail(umem); + mmdrop(umem->owning_mm); + kfree(umem); } EXPORT_SYMBOL(ib_umem_release); int ib_umem_page_count(struct ib_umem *umem) { - int i; - int n; + int i, n = 0; struct scatterlist *sg; - if (umem->is_odp) - return ib_umem_num_pages(umem); - - n = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) n += sg_dma_len(sg) >> PAGE_SHIFT; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 2a75c6f8d827..f67a30fda1ed 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -39,44 +39,14 @@ #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/hugetlb.h> -#include <linux/interval_tree_generic.h> +#include <linux/interval_tree.h> #include <linux/pagemap.h> #include <rdma/ib_verbs.h> #include <rdma/ib_umem.h> #include <rdma/ib_umem_odp.h> -/* - * The ib_umem list keeps track of memory regions for which the HW - * device request to receive notification when the related memory - * mapping is changed. - * - * ib_umem_lock protects the list. - */ - -static u64 node_start(struct umem_odp_node *n) -{ - struct ib_umem_odp *umem_odp = - container_of(n, struct ib_umem_odp, interval_tree); - - return ib_umem_start(umem_odp); -} - -/* Note that the representation of the intervals in the interval tree - * considers the ending point as contained in the interval, while the - * function ib_umem_end returns the first address which is not contained - * in the umem. - */ -static u64 node_last(struct umem_odp_node *n) -{ - struct ib_umem_odp *umem_odp = - container_of(n, struct ib_umem_odp, interval_tree); - - return ib_umem_end(umem_odp) - 1; -} - -INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, - node_start, node_last, static, rbt_ib_umem) +#include "uverbs.h" static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) { @@ -104,35 +74,34 @@ static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) mutex_unlock(&umem_odp->umem_mutex); } -static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, - u64 start, u64 end, void *cookie) -{ - /* - * Increase the number of notifiers running, to - * prevent any further fault handling on this MR. - */ - ib_umem_notifier_start_account(umem_odp); - umem_odp->dying = 1; - /* Make sure that the fact the umem is dying is out before we release - * all pending page faults. */ - smp_wmb(); - complete_all(&umem_odp->notifier_completion); - umem_odp->umem.context->invalidate_range( - umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); - return 0; -} - static void ib_umem_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); + struct rb_node *node; down_read(&per_mm->umem_rwsem); - if (per_mm->active) - rbt_ib_umem_for_each_in_range( - &per_mm->umem_tree, 0, ULLONG_MAX, - ib_umem_notifier_release_trampoline, true, NULL); + if (!per_mm->mn.users) + goto out; + + for (node = rb_first_cached(&per_mm->umem_tree); node; + node = rb_next(node)) { + struct ib_umem_odp *umem_odp = + rb_entry(node, struct ib_umem_odp, interval_tree.rb); + + /* + * Increase the number of notifiers running, to prevent any + * further fault handling on this MR. + */ + ib_umem_notifier_start_account(umem_odp); + complete_all(&umem_odp->notifier_completion); + umem_odp->umem.ibdev->ops.invalidate_range( + umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); + } + +out: up_read(&per_mm->umem_rwsem); } @@ -140,7 +109,7 @@ static int invalidate_range_start_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->umem.context->invalidate_range(item, start, end); + item->umem.ibdev->ops.invalidate_range(item, start, end); return 0; } @@ -156,10 +125,10 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; - if (!per_mm->active) { + if (!per_mm->mn.users) { up_read(&per_mm->umem_rwsem); /* - * At this point active is permanently set and visible to this + * At this point users is permanently zero and visible to this * CPU without a lock, that fact is relied on to skip the unlock * in range_end. */ @@ -189,7 +158,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); - if (unlikely(!per_mm->active)) + if (unlikely(!per_mm->mn.users)) return; rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, @@ -198,212 +167,250 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, up_read(&per_mm->umem_rwsem); } -static const struct mmu_notifier_ops ib_umem_notifiers = { - .release = ib_umem_notifier_release, - .invalidate_range_start = ib_umem_notifier_invalidate_range_start, - .invalidate_range_end = ib_umem_notifier_invalidate_range_end, -}; - -static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) -{ - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - - down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) - rbt_ib_umem_insert(&umem_odp->interval_tree, - &per_mm->umem_tree); - up_write(&per_mm->umem_rwsem); -} - -static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) -{ - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - - down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) - rbt_ib_umem_remove(&umem_odp->interval_tree, - &per_mm->umem_tree); - complete_all(&umem_odp->notifier_completion); - - up_write(&per_mm->umem_rwsem); -} - -static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, - struct mm_struct *mm) +static struct mmu_notifier *ib_umem_alloc_notifier(struct mm_struct *mm) { struct ib_ucontext_per_mm *per_mm; - int ret; per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); if (!per_mm) return ERR_PTR(-ENOMEM); - per_mm->context = ctx; - per_mm->mm = mm; per_mm->umem_tree = RB_ROOT_CACHED; init_rwsem(&per_mm->umem_rwsem); - per_mm->active = true; + WARN_ON(mm != current->mm); rcu_read_lock(); per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); rcu_read_unlock(); + return &per_mm->mn; +} - WARN_ON(mm != current->mm); - - per_mm->mn.ops = &ib_umem_notifiers; - ret = mmu_notifier_register(&per_mm->mn, per_mm->mm); - if (ret) { - dev_err(&ctx->device->dev, - "Failed to register mmu_notifier %d\n", ret); - goto out_pid; - } +static void ib_umem_free_notifier(struct mmu_notifier *mn) +{ + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); - list_add(&per_mm->ucontext_list, &ctx->per_mm_list); - return per_mm; + WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); -out_pid: put_pid(per_mm->tgid); kfree(per_mm); - return ERR_PTR(ret); } -static int get_per_mm(struct ib_umem_odp *umem_odp) +static const struct mmu_notifier_ops ib_umem_notifiers = { + .release = ib_umem_notifier_release, + .invalidate_range_start = ib_umem_notifier_invalidate_range_start, + .invalidate_range_end = ib_umem_notifier_invalidate_range_end, + .alloc_notifier = ib_umem_alloc_notifier, + .free_notifier = ib_umem_free_notifier, +}; + +static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp) { - struct ib_ucontext *ctx = umem_odp->umem.context; struct ib_ucontext_per_mm *per_mm; + struct mmu_notifier *mn; + int ret; - /* - * Generally speaking we expect only one or two per_mm in this list, - * so no reason to optimize this search today. - */ - mutex_lock(&ctx->per_mm_list_lock); - list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { - if (per_mm->mm == umem_odp->umem.owning_mm) - goto found; + umem_odp->umem.is_odp = 1; + if (!umem_odp->is_implicit_odp) { + size_t page_size = 1UL << umem_odp->page_shift; + size_t pages; + + umem_odp->interval_tree.start = + ALIGN_DOWN(umem_odp->umem.address, page_size); + if (check_add_overflow(umem_odp->umem.address, + (unsigned long)umem_odp->umem.length, + &umem_odp->interval_tree.last)) + return -EOVERFLOW; + umem_odp->interval_tree.last = + ALIGN(umem_odp->interval_tree.last, page_size); + if (unlikely(umem_odp->interval_tree.last < page_size)) + return -EOVERFLOW; + + pages = (umem_odp->interval_tree.last - + umem_odp->interval_tree.start) >> + umem_odp->page_shift; + if (!pages) + return -EINVAL; + + /* + * Note that the representation of the intervals in the + * interval tree considers the ending point as contained in + * the interval. + */ + umem_odp->interval_tree.last--; + + umem_odp->page_list = kvcalloc( + pages, sizeof(*umem_odp->page_list), GFP_KERNEL); + if (!umem_odp->page_list) + return -ENOMEM; + + umem_odp->dma_list = kvcalloc( + pages, sizeof(*umem_odp->dma_list), GFP_KERNEL); + if (!umem_odp->dma_list) { + ret = -ENOMEM; + goto out_page_list; + } } - per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm); - if (IS_ERR(per_mm)) { - mutex_unlock(&ctx->per_mm_list_lock); - return PTR_ERR(per_mm); + mn = mmu_notifier_get(&ib_umem_notifiers, umem_odp->umem.owning_mm); + if (IS_ERR(mn)) { + ret = PTR_ERR(mn); + goto out_dma_list; } + umem_odp->per_mm = per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); -found: - umem_odp->per_mm = per_mm; - per_mm->odp_mrs_count++; - mutex_unlock(&ctx->per_mm_list_lock); + mutex_init(&umem_odp->umem_mutex); + init_completion(&umem_odp->notifier_completion); + + if (!umem_odp->is_implicit_odp) { + down_write(&per_mm->umem_rwsem); + interval_tree_insert(&umem_odp->interval_tree, + &per_mm->umem_tree); + up_write(&per_mm->umem_rwsem); + } + mmgrab(umem_odp->umem.owning_mm); return 0; -} -static void free_per_mm(struct rcu_head *rcu) -{ - kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); +out_dma_list: + kvfree(umem_odp->dma_list); +out_page_list: + kvfree(umem_odp->page_list); + return ret; } -static void put_per_mm(struct ib_umem_odp *umem_odp) +/** + * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem + * + * Implicit ODP umems do not have a VA range and do not have any page lists. + * They exist only to hold the per_mm reference to help the driver create + * children umems. + * + * @udata: udata from the syscall being used to create the umem + * @access: ib_reg_mr access flags + */ +struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, + int access) { - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - struct ib_ucontext *ctx = umem_odp->umem.context; - bool need_free; - - mutex_lock(&ctx->per_mm_list_lock); - umem_odp->per_mm = NULL; - per_mm->odp_mrs_count--; - need_free = per_mm->odp_mrs_count == 0; - if (need_free) - list_del(&per_mm->ucontext_list); - mutex_unlock(&ctx->per_mm_list_lock); - - if (!need_free) - return; + struct ib_ucontext *context = + container_of(udata, struct uverbs_attr_bundle, driver_udata) + ->context; + struct ib_umem *umem; + struct ib_umem_odp *umem_odp; + int ret; - /* - * NOTE! mmu_notifier_unregister() can happen between a start/end - * callback, resulting in an start/end, and thus an unbalanced - * lock. This doesn't really matter to us since we are about to kfree - * the memory that holds the lock, however LOCKDEP doesn't like this. - */ - down_write(&per_mm->umem_rwsem); - per_mm->active = false; - up_write(&per_mm->umem_rwsem); + if (access & IB_ACCESS_HUGETLB) + return ERR_PTR(-EINVAL); - WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); - mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm); - put_pid(per_mm->tgid); - mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); + if (!context) + return ERR_PTR(-EIO); + if (WARN_ON_ONCE(!context->device->ops.invalidate_range)) + return ERR_PTR(-EINVAL); + + umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL); + if (!umem_odp) + return ERR_PTR(-ENOMEM); + umem = &umem_odp->umem; + umem->ibdev = context->device; + umem->writable = ib_access_writable(access); + umem->owning_mm = current->mm; + umem_odp->is_implicit_odp = 1; + umem_odp->page_shift = PAGE_SHIFT; + + ret = ib_init_umem_odp(umem_odp); + if (ret) { + kfree(umem_odp); + return ERR_PTR(ret); + } + return umem_odp; } +EXPORT_SYMBOL(ib_umem_odp_alloc_implicit); -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, - unsigned long addr, size_t size) +/** + * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit + * parent ODP umem + * + * @root: The parent umem enclosing the child. This must be allocated using + * ib_alloc_implicit_odp_umem() + * @addr: The starting userspace VA + * @size: The length of the userspace VA + */ +struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root, + unsigned long addr, size_t size) { - struct ib_ucontext_per_mm *per_mm = root->per_mm; - struct ib_ucontext *ctx = per_mm->context; + /* + * Caller must ensure that root cannot be freed during the call to + * ib_alloc_odp_umem. + */ struct ib_umem_odp *odp_data; struct ib_umem *umem; - int pages = size >> PAGE_SHIFT; int ret; + if (WARN_ON(!root->is_implicit_odp)) + return ERR_PTR(-EINVAL); + odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); if (!odp_data) return ERR_PTR(-ENOMEM); umem = &odp_data->umem; - umem->context = ctx; + umem->ibdev = root->umem.ibdev; umem->length = size; umem->address = addr; - odp_data->page_shift = PAGE_SHIFT; umem->writable = root->umem.writable; - umem->is_odp = 1; - odp_data->per_mm = per_mm; - umem->owning_mm = per_mm->mm; - mmgrab(umem->owning_mm); - - mutex_init(&odp_data->umem_mutex); - init_completion(&odp_data->notifier_completion); - - odp_data->page_list = - vzalloc(array_size(pages, sizeof(*odp_data->page_list))); - if (!odp_data->page_list) { - ret = -ENOMEM; - goto out_odp_data; - } + umem->owning_mm = root->umem.owning_mm; + odp_data->page_shift = PAGE_SHIFT; - odp_data->dma_list = - vzalloc(array_size(pages, sizeof(*odp_data->dma_list))); - if (!odp_data->dma_list) { - ret = -ENOMEM; - goto out_page_list; + ret = ib_init_umem_odp(odp_data); + if (ret) { + kfree(odp_data); + return ERR_PTR(ret); } - - /* - * Caller must ensure that the umem_odp that the per_mm came from - * cannot be freed during the call to ib_alloc_odp_umem. - */ - mutex_lock(&ctx->per_mm_list_lock); - per_mm->odp_mrs_count++; - mutex_unlock(&ctx->per_mm_list_lock); - add_umem_to_per_mm(odp_data); - return odp_data; - -out_page_list: - vfree(odp_data->page_list); -out_odp_data: - mmdrop(umem->owning_mm); - kfree(odp_data); - return ERR_PTR(ret); } -EXPORT_SYMBOL(ib_alloc_odp_umem); +EXPORT_SYMBOL(ib_umem_odp_alloc_child); -int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) +/** + * ib_umem_odp_get - Create a umem_odp for a userspace va + * + * @udata: userspace context to pin memory for + * @addr: userspace virtual address to start at + * @size: length of region to pin + * @access: IB_ACCESS_xxx flags for memory being pinned + * + * The driver should use when the access flags indicate ODP memory. It avoids + * pinning, instead, stores the mm for future page fault handling in + * conjunction with MMU notifiers. + */ +struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, + size_t size, int access) { - struct ib_umem *umem = &umem_odp->umem; - /* - * NOTE: This must called in a process context where umem->owning_mm - * == current->mm - */ - struct mm_struct *mm = umem->owning_mm; - int ret_val; + struct ib_umem_odp *umem_odp; + struct ib_ucontext *context; + struct mm_struct *mm; + int ret; + + if (!udata) + return ERR_PTR(-EIO); + + context = container_of(udata, struct uverbs_attr_bundle, driver_udata) + ->context; + if (!context) + return ERR_PTR(-EIO); + + if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) || + WARN_ON_ONCE(!context->device->ops.invalidate_range)) + return ERR_PTR(-EINVAL); + + umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); + if (!umem_odp) + return ERR_PTR(-ENOMEM); + + umem_odp->umem.ibdev = context->device; + umem_odp->umem.length = size; + umem_odp->umem.address = addr; + umem_odp->umem.writable = ib_access_writable(access); + umem_odp->umem.owning_mm = mm = current->mm; umem_odp->page_shift = PAGE_SHIFT; if (access & IB_ACCESS_HUGETLB) { @@ -414,63 +421,63 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) vma = find_vma(mm, ib_umem_start(umem_odp)); if (!vma || !is_vm_hugetlb_page(vma)) { up_read(&mm->mmap_sem); - return -EINVAL; + ret = -EINVAL; + goto err_free; } h = hstate_vma(vma); umem_odp->page_shift = huge_page_shift(h); up_read(&mm->mmap_sem); } - mutex_init(&umem_odp->umem_mutex); - - init_completion(&umem_odp->notifier_completion); - - if (ib_umem_odp_num_pages(umem_odp)) { - umem_odp->page_list = - vzalloc(array_size(sizeof(*umem_odp->page_list), - ib_umem_odp_num_pages(umem_odp))); - if (!umem_odp->page_list) - return -ENOMEM; - - umem_odp->dma_list = - vzalloc(array_size(sizeof(*umem_odp->dma_list), - ib_umem_odp_num_pages(umem_odp))); - if (!umem_odp->dma_list) { - ret_val = -ENOMEM; - goto out_page_list; - } - } - - ret_val = get_per_mm(umem_odp); - if (ret_val) - goto out_dma_list; - add_umem_to_per_mm(umem_odp); - - return 0; + ret = ib_init_umem_odp(umem_odp); + if (ret) + goto err_free; + return umem_odp; -out_dma_list: - vfree(umem_odp->dma_list); -out_page_list: - vfree(umem_odp->page_list); - return ret_val; +err_free: + kfree(umem_odp); + return ERR_PTR(ret); } +EXPORT_SYMBOL(ib_umem_odp_get); void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + /* * Ensure that no more pages are mapped in the umem. * * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. */ - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); + if (!umem_odp->is_implicit_odp) { + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); + kvfree(umem_odp->dma_list); + kvfree(umem_odp->page_list); + } - remove_umem_from_per_mm(umem_odp); - put_per_mm(umem_odp); - vfree(umem_odp->dma_list); - vfree(umem_odp->page_list); + down_write(&per_mm->umem_rwsem); + if (!umem_odp->is_implicit_odp) { + interval_tree_remove(&umem_odp->interval_tree, + &per_mm->umem_tree); + complete_all(&umem_odp->notifier_completion); + } + /* + * NOTE! mmu_notifier_unregister() can happen between a start/end + * callback, resulting in a missing end, and thus an unbalanced + * lock. This doesn't really matter to us since we are about to kfree + * the memory that holds the lock, however LOCKDEP doesn't like this. + * Thus we call the mmu_notifier_put under the rwsem and test the + * internal users count to reliably see if we are past this point. + */ + mmu_notifier_put(&per_mm->mn); + up_write(&per_mm->umem_rwsem); + + mmdrop(umem_odp->umem.owning_mm); + kfree(umem_odp); } +EXPORT_SYMBOL(ib_umem_odp_release); /* * Map for DMA and insert a single page into the on-demand paging page tables. @@ -497,8 +504,7 @@ static int ib_umem_odp_map_dma_single_page( u64 access_mask, unsigned long current_seq) { - struct ib_ucontext *context = umem_odp->umem.context; - struct ib_device *dev = context->device; + struct ib_device *dev = umem_odp->umem.ibdev; dma_addr_t dma_addr; int remove_existing_mapping = 0; int ret = 0; @@ -538,7 +544,7 @@ out: if (remove_existing_mapping) { ib_umem_notifier_start_account(umem_odp); - context->invalidate_range( + dev->ops.invalidate_range( umem_odp, ib_umem_start(umem_odp) + (page_index << umem_odp->page_shift), @@ -711,7 +717,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, { int idx; u64 addr; - struct ib_device *dev = umem_odp->umem.context->device; + struct ib_device *dev = umem_odp->umem.ibdev; virt = max_t(u64, virt, ib_umem_start(umem_odp)); bound = min_t(u64, bound, ib_umem_end(umem_odp)); @@ -765,35 +771,21 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, void *cookie) { int ret_val = 0; - struct umem_odp_node *node, *next; + struct interval_tree_node *node, *next; struct ib_umem_odp *umem; if (unlikely(start == last)) return ret_val; - for (node = rbt_ib_umem_iter_first(root, start, last - 1); + for (node = interval_tree_iter_first(root, start, last - 1); node; node = next) { /* TODO move the blockable decision up to the callback */ if (!blockable) return -EAGAIN; - next = rbt_ib_umem_iter_next(node, start, last - 1); + next = interval_tree_iter_next(node, start, last - 1); umem = container_of(node, struct ib_umem_odp, interval_tree); ret_val = cb(umem, start, last, cookie) || ret_val; } return ret_val; } -EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range); - -struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, - u64 addr, u64 length) -{ - struct umem_odp_node *node; - - node = rbt_ib_umem_iter_first(root, addr, addr + length - 1); - if (node) - return container_of(node, struct ib_umem_odp, interval_tree); - return NULL; - -} -EXPORT_SYMBOL(rbt_ib_umem_lookup); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 9f8a48016b41..d1407fa378e8 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -49,6 +49,7 @@ #include <linux/sched.h> #include <linux/semaphore.h> #include <linux/slab.h> +#include <linux/nospec.h> #include <linux/uaccess.h> @@ -884,11 +885,14 @@ static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) if (get_user(id, arg)) return -EFAULT; + if (id >= IB_UMAD_MAX_AGENTS) + return -EINVAL; mutex_lock(&file->port->file_mutex); mutex_lock(&file->mutex); - if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) { + id = array_index_nospec(id, IB_UMAD_MAX_AGENTS); + if (!__get_agent(file, id)) { ret = -EINVAL; goto out; } @@ -1038,7 +1042,7 @@ static int ib_umad_close(struct inode *inode, struct file *filp) ib_unregister_mad_agent(file->agent[i]); mutex_unlock(&file->port->file_mutex); - + mutex_destroy(&file->mutex); kfree(file); return 0; } diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 7ddd0e5bc6b3..14a80fd9f464 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -252,9 +252,6 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) ucontext->closing = false; ucontext->cleanup_retryable = false; - mutex_init(&ucontext->per_mm_list_lock); - INIT_LIST_HEAD(&ucontext->per_mm_list); - ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) goto err_free; @@ -275,8 +272,6 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata); if (ret) goto err_file; - if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) - ucontext->invalidate_range = NULL; rdma_restrack_uadd(&ucontext->res); @@ -3484,7 +3479,8 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, err_copy: ib_destroy_srq_user(srq, uverbs_get_cleared_udata(attrs)); - + /* It was released in ib_destroy_srq_user */ + srq = NULL; err_free: kfree(srq); err_put: diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 11c13c1381cf..db98111b47f4 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -120,6 +120,8 @@ static void ib_uverbs_release_dev(struct device *device) uverbs_destroy_api(dev->uapi); cleanup_srcu_struct(&dev->disassociate_srcu); + mutex_destroy(&dev->lists_mutex); + mutex_destroy(&dev->xrcd_tree_mutex); kfree(dev); } @@ -212,6 +214,8 @@ void ib_uverbs_release_file(struct kref *ref) if (file->disassociate_page) __free_pages(file->disassociate_page, 0); + mutex_destroy(&file->umap_lock); + mutex_destroy(&file->ucontext_lock); kfree(file); } @@ -1487,6 +1491,7 @@ static void __exit ib_uverbs_cleanup(void) IB_UVERBS_NUM_FIXED_MINOR); unregister_chrdev_region(dynamic_uverbs_dev, IB_UVERBS_NUM_DYNAMIC_MINOR); + mmu_notifier_synchronize(); } module_init(ib_uverbs_init); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 92349bf37589..f974b6854224 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2259,6 +2259,7 @@ int ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata) if (ret) return ret; } + mutex_destroy(&xrcd->tgt_qp_mutex); return xrcd->device->ops.dealloc_xrcd(xrcd, udata); } |