aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/hw/mlx5/main.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/hw/mlx5/main.c')
-rw-r--r--drivers/infiniband/hw/mlx5/main.c447
1 files changed, 420 insertions, 27 deletions
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index b0ec175cc6ba..ec737e2287fe 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -40,6 +40,8 @@
#include <linux/io-mapping.h>
#include <linux/sched.h>
#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
#include <linux/mlx5/vport.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_umem.h>
@@ -66,12 +68,14 @@ static char mlx5_version[] =
DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
+enum {
+ MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
+};
+
static enum rdma_link_layer
-mlx5_ib_port_link_layer(struct ib_device *device)
+mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
{
- struct mlx5_ib_dev *dev = to_mdev(device);
-
- switch (MLX5_CAP_GEN(dev->mdev, port_type)) {
+ switch (port_type_cap) {
case MLX5_CAP_PORT_TYPE_IB:
return IB_LINK_LAYER_INFINIBAND;
case MLX5_CAP_PORT_TYPE_ETH:
@@ -81,6 +85,202 @@ mlx5_ib_port_link_layer(struct ib_device *device)
}
}
+static enum rdma_link_layer
+mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
+{
+ struct mlx5_ib_dev *dev = to_mdev(device);
+ int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
+
+ return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
+}
+
+static int mlx5_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+ struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev,
+ roce.nb);
+
+ if ((event != NETDEV_UNREGISTER) && (event != NETDEV_REGISTER))
+ return NOTIFY_DONE;
+
+ write_lock(&ibdev->roce.netdev_lock);
+ if (ndev->dev.parent == &ibdev->mdev->pdev->dev)
+ ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ? NULL : ndev;
+ write_unlock(&ibdev->roce.netdev_lock);
+
+ return NOTIFY_DONE;
+}
+
+static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
+ u8 port_num)
+{
+ struct mlx5_ib_dev *ibdev = to_mdev(device);
+ struct net_device *ndev;
+
+ /* Ensure ndev does not disappear before we invoke dev_hold()
+ */
+ read_lock(&ibdev->roce.netdev_lock);
+ ndev = ibdev->roce.netdev;
+ if (ndev)
+ dev_hold(ndev);
+ read_unlock(&ibdev->roce.netdev_lock);
+
+ return ndev;
+}
+
+static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
+ struct ib_port_attr *props)
+{
+ struct mlx5_ib_dev *dev = to_mdev(device);
+ struct net_device *ndev;
+ enum ib_mtu ndev_ib_mtu;
+ u16 qkey_viol_cntr;
+
+ memset(props, 0, sizeof(*props));
+
+ props->port_cap_flags |= IB_PORT_CM_SUP;
+ props->port_cap_flags |= IB_PORT_IP_BASED_GIDS;
+
+ props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev,
+ roce_address_table_size);
+ props->max_mtu = IB_MTU_4096;
+ props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
+ props->pkey_tbl_len = 1;
+ props->state = IB_PORT_DOWN;
+ props->phys_state = 3;
+
+ mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr);
+ props->qkey_viol_cntr = qkey_viol_cntr;
+
+ ndev = mlx5_ib_get_netdev(device, port_num);
+ if (!ndev)
+ return 0;
+
+ if (netif_running(ndev) && netif_carrier_ok(ndev)) {
+ props->state = IB_PORT_ACTIVE;
+ props->phys_state = 5;
+ }
+
+ ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
+
+ dev_put(ndev);
+
+ props->active_mtu = min(props->max_mtu, ndev_ib_mtu);
+
+ props->active_width = IB_WIDTH_4X; /* TODO */
+ props->active_speed = IB_SPEED_QDR; /* TODO */
+
+ return 0;
+}
+
+static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ void *mlx5_addr)
+{
+#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v)
+ char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
+ source_l3_address);
+ void *mlx5_addr_mac = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
+ source_mac_47_32);
+
+ if (!gid)
+ return;
+
+ ether_addr_copy(mlx5_addr_mac, attr->ndev->dev_addr);
+
+ if (is_vlan_dev(attr->ndev)) {
+ MLX5_SET_RA(mlx5_addr, vlan_valid, 1);
+ MLX5_SET_RA(mlx5_addr, vlan_id, vlan_dev_vlan_id(attr->ndev));
+ }
+
+ switch (attr->gid_type) {
+ case IB_GID_TYPE_IB:
+ MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1);
+ break;
+ case IB_GID_TYPE_ROCE_UDP_ENCAP:
+ MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2);
+ break;
+
+ default:
+ WARN_ON(true);
+ }
+
+ if (attr->gid_type != IB_GID_TYPE_IB) {
+ if (ipv6_addr_v4mapped((void *)gid))
+ MLX5_SET_RA(mlx5_addr, roce_l3_type,
+ MLX5_ROCE_L3_TYPE_IPV4);
+ else
+ MLX5_SET_RA(mlx5_addr, roce_l3_type,
+ MLX5_ROCE_L3_TYPE_IPV6);
+ }
+
+ if ((attr->gid_type == IB_GID_TYPE_IB) ||
+ !ipv6_addr_v4mapped((void *)gid))
+ memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid));
+ else
+ memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4);
+}
+
+static int set_roce_addr(struct ib_device *device, u8 port_num,
+ unsigned int index,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(device);
+ u32 in[MLX5_ST_SZ_DW(set_roce_address_in)];
+ u32 out[MLX5_ST_SZ_DW(set_roce_address_out)];
+ void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address);
+ enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num);
+
+ if (ll != IB_LINK_LAYER_ETHERNET)
+ return -EINVAL;
+
+ memset(in, 0, sizeof(in));
+
+ ib_gid_to_mlx5_roce_addr(gid, attr, in_addr);
+
+ MLX5_SET(set_roce_address_in, in, roce_address_index, index);
+ MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS);
+
+ memset(out, 0, sizeof(out));
+ return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num,
+ unsigned int index, const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ __always_unused void **context)
+{
+ return set_roce_addr(device, port_num, index, gid, attr);
+}
+
+static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num,
+ unsigned int index, __always_unused void **context)
+{
+ return set_roce_addr(device, port_num, index, NULL, NULL);
+}
+
+__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
+ int index)
+{
+ struct ib_gid_attr attr;
+ union ib_gid gid;
+
+ if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr))
+ return 0;
+
+ if (!attr.ndev)
+ return 0;
+
+ dev_put(attr.ndev);
+
+ if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
+ return 0;
+
+ return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
+}
+
static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
{
return !dev->mdev->issi;
@@ -97,13 +297,35 @@ static int mlx5_get_vport_access_method(struct ib_device *ibdev)
if (mlx5_use_mad_ifc(to_mdev(ibdev)))
return MLX5_VPORT_ACCESS_METHOD_MAD;
- if (mlx5_ib_port_link_layer(ibdev) ==
+ if (mlx5_ib_port_link_layer(ibdev, 1) ==
IB_LINK_LAYER_ETHERNET)
return MLX5_VPORT_ACCESS_METHOD_NIC;
return MLX5_VPORT_ACCESS_METHOD_HCA;
}
+static void get_atomic_caps(struct mlx5_ib_dev *dev,
+ struct ib_device_attr *props)
+{
+ u8 tmp;
+ u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
+ u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
+ u8 atomic_req_8B_endianness_mode =
+ MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode);
+
+ /* Check if HW supports 8 bytes standard atomic operations and capable
+ * of host endianness respond
+ */
+ tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
+ if (((atomic_operations & tmp) == tmp) &&
+ (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
+ (atomic_req_8B_endianness_mode)) {
+ props->atomic_cap = IB_ATOMIC_HCA;
+ } else {
+ props->atomic_cap = IB_ATOMIC_NONE;
+ }
+}
+
static int mlx5_query_system_image_guid(struct ib_device *ibdev,
__be64 *sys_image_guid)
{
@@ -119,13 +341,21 @@ static int mlx5_query_system_image_guid(struct ib_device *ibdev,
case MLX5_VPORT_ACCESS_METHOD_HCA:
err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
- if (!err)
- *sys_image_guid = cpu_to_be64(tmp);
- return err;
+ break;
+
+ case MLX5_VPORT_ACCESS_METHOD_NIC:
+ err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
+ break;
default:
return -EINVAL;
}
+
+ if (!err)
+ *sys_image_guid = cpu_to_be64(tmp);
+
+ return err;
+
}
static int mlx5_query_max_pkeys(struct ib_device *ibdev,
@@ -179,13 +409,20 @@ static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
case MLX5_VPORT_ACCESS_METHOD_HCA:
err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
- if (!err)
- *node_guid = cpu_to_be64(tmp);
- return err;
+ break;
+
+ case MLX5_VPORT_ACCESS_METHOD_NIC:
+ err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
+ break;
default:
return -EINVAL;
}
+
+ if (!err)
+ *node_guid = cpu_to_be64(tmp);
+
+ return err;
}
struct mlx5_reg_node_desc {
@@ -263,6 +500,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
if (MLX5_CAP_GEN(mdev, block_lb_mc))
props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
+ if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
+ (MLX5_CAP_ETH(dev->mdev, csum_cap)))
+ props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
+
props->vendor_part_id = mdev->pdev->device;
props->hw_ver = mdev->pdev->revision;
@@ -278,7 +519,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
props->max_sge = min(max_rq_sg, max_sq_sg);
props->max_sge_rd = props->max_sge;
props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
- props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_eq_sz)) - 1;
+ props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
props->max_pd = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
props->max_qp_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
@@ -289,13 +530,15 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp;
props->max_srq_sge = max_rq_sg - 1;
props->max_fast_reg_page_list_len = (unsigned int)-1;
- props->atomic_cap = IB_ATOMIC_NONE;
+ get_atomic_caps(dev, props);
props->masked_atomic_cap = IB_ATOMIC_NONE;
props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
props->max_mcast_grp;
props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
+ props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
+ props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (MLX5_CAP_GEN(mdev, pg))
@@ -303,6 +546,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
props->odp_caps = dev->odp_caps;
#endif
+ if (MLX5_CAP_GEN(mdev, cd))
+ props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
+
return 0;
}
@@ -483,6 +729,9 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
case MLX5_VPORT_ACCESS_METHOD_HCA:
return mlx5_query_hca_port(ibdev, port, props);
+ case MLX5_VPORT_ACCESS_METHOD_NIC:
+ return mlx5_query_port_roce(ibdev, port, props);
+
default:
return -EINVAL;
}
@@ -583,8 +832,8 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
struct ib_udata *udata)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
- struct mlx5_ib_alloc_ucontext_req_v2 req;
- struct mlx5_ib_alloc_ucontext_resp resp;
+ struct mlx5_ib_alloc_ucontext_req_v2 req = {};
+ struct mlx5_ib_alloc_ucontext_resp resp = {};
struct mlx5_ib_ucontext *context;
struct mlx5_uuar_info *uuari;
struct mlx5_uar *uars;
@@ -599,20 +848,22 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
if (!dev->ib_active)
return ERR_PTR(-EAGAIN);
- memset(&req, 0, sizeof(req));
+ if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr))
+ return ERR_PTR(-EINVAL);
+
reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
ver = 0;
- else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2))
+ else if (reqlen >= sizeof(struct mlx5_ib_alloc_ucontext_req_v2))
ver = 2;
else
return ERR_PTR(-EINVAL);
- err = ib_copy_from_udata(&req, udata, reqlen);
+ err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req)));
if (err)
return ERR_PTR(err);
- if (req.flags || req.reserved)
+ if (req.flags)
return ERR_PTR(-EINVAL);
if (req.total_num_uuars > MLX5_MAX_UUARS)
@@ -621,6 +872,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
if (req.total_num_uuars == 0)
return ERR_PTR(-EINVAL);
+ if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (reqlen > sizeof(req) &&
+ !ib_is_udata_cleared(udata, sizeof(req),
+ reqlen - sizeof(req)))
+ return ERR_PTR(-EOPNOTSUPP);
+
req.total_num_uuars = ALIGN(req.total_num_uuars,
MLX5_NON_FP_BF_REGS_PER_PAGE);
if (req.num_low_latency_uuars > req.total_num_uuars - 1)
@@ -636,6 +895,11 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
+ resp.cqe_version = min_t(__u8,
+ (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
+ req.max_cqe_version);
+ resp.response_length = min(offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length), udata->outlen);
context = kzalloc(sizeof(*context), GFP_KERNEL);
if (!context)
@@ -681,22 +945,49 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
#endif
+ if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) {
+ err = mlx5_core_alloc_transport_domain(dev->mdev,
+ &context->tdn);
+ if (err)
+ goto out_uars;
+ }
+
INIT_LIST_HEAD(&context->db_page_list);
mutex_init(&context->db_page_mutex);
resp.tot_uuars = req.total_num_uuars;
resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
- err = ib_copy_to_udata(udata, &resp,
- sizeof(resp) - sizeof(resp.reserved));
+
+ if (field_avail(typeof(resp), cqe_version, udata->outlen))
+ resp.response_length += sizeof(resp.cqe_version);
+
+ if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
+ resp.comp_mask |=
+ MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
+ resp.hca_core_clock_offset =
+ offsetof(struct mlx5_init_seg, internal_timer_h) %
+ PAGE_SIZE;
+ resp.response_length += sizeof(resp.hca_core_clock_offset) +
+ sizeof(resp.reserved2) +
+ sizeof(resp.reserved3);
+ }
+
+ err = ib_copy_to_udata(udata, &resp, resp.response_length);
if (err)
- goto out_uars;
+ goto out_td;
uuari->ver = ver;
uuari->num_low_latency_uuars = req.num_low_latency_uuars;
uuari->uars = uars;
uuari->num_uars = num_uars;
+ context->cqe_version = resp.cqe_version;
+
return &context->ibucontext;
+out_td:
+ if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
+ mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
+
out_uars:
for (i--; i >= 0; i--)
mlx5_cmd_free_uar(dev->mdev, uars[i].index);
@@ -721,6 +1012,9 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
struct mlx5_uuar_info *uuari = &context->uuari;
int i;
+ if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
+ mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
+
for (i = 0; i < uuari->num_uars; i++) {
if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
@@ -790,6 +1084,30 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
return -ENOSYS;
+ case MLX5_IB_MMAP_CORE_CLOCK:
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+ return -EPERM;
+
+ /* Don't expose to user-space information it shouldn't have */
+ if (PAGE_SIZE > 4096)
+ return -EOPNOTSUPP;
+
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ pfn = (dev->mdev->iseg_base +
+ offsetof(struct mlx5_init_seg, internal_timer_h)) >>
+ PAGE_SHIFT;
+ if (io_remap_pfn_range(vma, vma->vm_start, pfn,
+ PAGE_SIZE, vma->vm_page_prot))
+ return -EAGAIN;
+
+ mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n",
+ vma->vm_start,
+ (unsigned long long)pfn << PAGE_SHIFT);
+ break;
+
default:
return -EINVAL;
}
@@ -1758,6 +2076,32 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr)
mlx5_ib_dealloc_pd(devr->p0);
}
+static u32 get_core_cap_flags(struct ib_device *ibdev)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibdev);
+ enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
+ u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
+ u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
+ u32 ret = 0;
+
+ if (ll == IB_LINK_LAYER_INFINIBAND)
+ return RDMA_CORE_PORT_IBA_IB;
+
+ if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
+ return 0;
+
+ if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
+ return 0;
+
+ if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
+ ret |= RDMA_CORE_PORT_IBA_ROCE;
+
+ if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
+ ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+
+ return ret;
+}
+
static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
struct ib_port_immutable *immutable)
{
@@ -1770,20 +2114,50 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
immutable->pkey_tbl_len = attr.pkey_tbl_len;
immutable->gid_tbl_len = attr.gid_tbl_len;
- immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
+ immutable->core_cap_flags = get_core_cap_flags(ibdev);
immutable->max_mad_size = IB_MGMT_MAD_SIZE;
return 0;
}
+static int mlx5_enable_roce(struct mlx5_ib_dev *dev)
+{
+ int err;
+
+ dev->roce.nb.notifier_call = mlx5_netdev_event;
+ err = register_netdevice_notifier(&dev->roce.nb);
+ if (err)
+ return err;
+
+ err = mlx5_nic_vport_enable_roce(dev->mdev);
+ if (err)
+ goto err_unregister_netdevice_notifier;
+
+ return 0;
+
+err_unregister_netdevice_notifier:
+ unregister_netdevice_notifier(&dev->roce.nb);
+ return err;
+}
+
+static void mlx5_disable_roce(struct mlx5_ib_dev *dev)
+{
+ mlx5_nic_vport_disable_roce(dev->mdev);
+ unregister_netdevice_notifier(&dev->roce.nb);
+}
+
static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
{
struct mlx5_ib_dev *dev;
+ enum rdma_link_layer ll;
+ int port_type_cap;
int err;
int i;
- /* don't create IB instance over Eth ports, no RoCE yet! */
- if (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH)
+ port_type_cap = MLX5_CAP_GEN(mdev, port_type);
+ ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
+
+ if ((ll == IB_LINK_LAYER_ETHERNET) && !MLX5_CAP_GEN(mdev, roce))
return NULL;
printk_once(KERN_INFO "%s", mlx5_version);
@@ -1794,6 +2168,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
dev->mdev = mdev;
+ rwlock_init(&dev->roce.netdev_lock);
err = get_port_caps(dev);
if (err)
goto err_dealloc;
@@ -1843,7 +2218,12 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
dev->ib_dev.query_device = mlx5_ib_query_device;
dev->ib_dev.query_port = mlx5_ib_query_port;
+ dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer;
+ if (ll == IB_LINK_LAYER_ETHERNET)
+ dev->ib_dev.get_netdev = mlx5_ib_get_netdev;
dev->ib_dev.query_gid = mlx5_ib_query_gid;
+ dev->ib_dev.add_gid = mlx5_ib_add_gid;
+ dev->ib_dev.del_gid = mlx5_ib_del_gid;
dev->ib_dev.query_pkey = mlx5_ib_query_pkey;
dev->ib_dev.modify_device = mlx5_ib_modify_device;
dev->ib_dev.modify_port = mlx5_ib_modify_port;
@@ -1893,7 +2273,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
}
- if (mlx5_ib_port_link_layer(&dev->ib_dev) ==
+ if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
IB_LINK_LAYER_ETHERNET) {
dev->ib_dev.create_flow = mlx5_ib_create_flow;
dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
@@ -1908,9 +2288,15 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
mutex_init(&dev->flow_db.lock);
mutex_init(&dev->cap_mask_mutex);
+ if (ll == IB_LINK_LAYER_ETHERNET) {
+ err = mlx5_enable_roce(dev);
+ if (err)
+ goto err_dealloc;
+ }
+
err = create_dev_resources(&dev->devr);
if (err)
- goto err_dealloc;
+ goto err_disable_roce;
err = mlx5_ib_odp_init_one(dev);
if (err)
@@ -1947,6 +2333,10 @@ err_odp:
err_rsrc:
destroy_dev_resources(&dev->devr);
+err_disable_roce:
+ if (ll == IB_LINK_LAYER_ETHERNET)
+ mlx5_disable_roce(dev);
+
err_dealloc:
ib_dealloc_device((struct ib_device *)dev);
@@ -1956,11 +2346,14 @@ err_dealloc:
static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
{
struct mlx5_ib_dev *dev = context;
+ enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
ib_unregister_device(&dev->ib_dev);
destroy_umrc_res(dev);
mlx5_ib_odp_remove_one(dev);
destroy_dev_resources(&dev->devr);
+ if (ll == IB_LINK_LAYER_ETHERNET)
+ mlx5_disable_roce(dev);
ib_dealloc_device(&dev->ib_dev);
}