aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/core/device.c (follow)
AgeCommit message (Collapse)AuthorFilesLines
2018-07-30RDMA/core: Remove {create,destroy}_ah from mandatory verbsKamal Heib1-2/+0
{create,destroy}_ah aren't mandatory verbs, because not all providers are implementing them. Signed-off-by: Kamal Heib <kamalheib1@gmail.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-06-18IB: Replace ib_query_gid/ib_get_cached_gid with rdma_query_gidParav Pandit1-20/+1
If the gid_attr argument is NULL then the functions behave identically to rdma_query_gid. ib_query_gid just calls ib_get_cached_gid, so everything can be consolidated to one function. Now that all callers either use rdma_query_gid() or ib_get_cached_gid(), ib_query_gid() API is removed. Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-06-12treewide: kzalloc() -> kcalloc()Kees Cook1-2/+2
The kzalloc() function has a 2-factor argument form, kcalloc(). This patch replaces cases of: kzalloc(a * b, gfp) with: kcalloc(a * b, gfp) as well as handling cases of: kzalloc(a * b * c, gfp) with: kzalloc(array3_size(a, b, c), gfp) as it's slightly less ugly than: kzalloc_array(array_size(a, b), c, gfp) This does, however, attempt to ignore constant size factors like: kzalloc(4 * 1024, gfp) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ type TYPE; expression THING, E; @@ ( kzalloc( - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | kzalloc( - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression COUNT; typedef u8; typedef __u8; @@ ( kzalloc( - sizeof(u8) * (COUNT) + COUNT , ...) | kzalloc( - sizeof(__u8) * (COUNT) + COUNT , ...) | kzalloc( - sizeof(char) * (COUNT) + COUNT , ...) | kzalloc( - sizeof(unsigned char) * (COUNT) + COUNT , ...) | kzalloc( - sizeof(u8) * COUNT + COUNT , ...) | kzalloc( - sizeof(__u8) * COUNT + COUNT , ...) | kzalloc( - sizeof(char) * COUNT + COUNT , ...) | kzalloc( - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( - kzalloc + kcalloc ( - sizeof(TYPE) * (COUNT_ID) + COUNT_ID, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(TYPE) * COUNT_ID + COUNT_ID, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(TYPE) * (COUNT_CONST) + COUNT_CONST, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(TYPE) * COUNT_CONST + COUNT_CONST, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * (COUNT_ID) + COUNT_ID, sizeof(THING) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * COUNT_ID + COUNT_ID, sizeof(THING) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * (COUNT_CONST) + COUNT_CONST, sizeof(THING) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * COUNT_CONST + COUNT_CONST, sizeof(THING) , ...) ) // 2-factor product, only identifiers. @@ identifier SIZE, COUNT; @@ - kzalloc + kcalloc ( - SIZE * COUNT + COUNT, SIZE , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( kzalloc( - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kzalloc( - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kzalloc( - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kzalloc( - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kzalloc( - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kzalloc( - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kzalloc( - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kzalloc( - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( kzalloc( - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kzalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kzalloc( - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kzalloc( - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kzalloc( - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | kzalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ identifier STRIDE, SIZE, COUNT; @@ ( kzalloc( - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products, // when they're not all constants... @@ expression E1, E2, E3; constant C1, C2, C3; @@ ( kzalloc(C1 * C2 * C3, ...) | kzalloc( - (E1) * E2 * E3 + array3_size(E1, E2, E3) , ...) | kzalloc( - (E1) * (E2) * E3 + array3_size(E1, E2, E3) , ...) | kzalloc( - (E1) * (E2) * (E3) + array3_size(E1, E2, E3) , ...) | kzalloc( - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants, // keeping sizeof() as the second factor argument. @@ expression THING, E1, E2; type TYPE; constant C1, C2, C3; @@ ( kzalloc(sizeof(THING) * C2, ...) | kzalloc(sizeof(TYPE) * C2, ...) | kzalloc(C1 * C2 * C3, ...) | kzalloc(C1 * C2, ...) | - kzalloc + kcalloc ( - sizeof(TYPE) * (E2) + E2, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(TYPE) * E2 + E2, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * (E2) + E2, sizeof(THING) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * E2 + E2, sizeof(THING) , ...) | - kzalloc + kcalloc ( - (E1) * E2 + E1, E2 , ...) | - kzalloc + kcalloc ( - (E1) * (E2) + E1, E2 , ...) | - kzalloc + kcalloc ( - E1 * E2 + E1, E2 , ...) ) Signed-off-by: Kees Cook <keescook@chromium.org>
2018-05-29RDMA/core: Remove indirection through ib_cache_setup()Jason Gunthorpe1-2/+2
This once might have made sense when cache.c was in a different module from device.c, but today it just obfuscation. Get rid of the wrappers and call roge_gid_mgmt_init()/cleanup() directly. Signed-off-by: Jason Gunthorpe <jgg@mellanox.com> Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
2018-04-06Merge tag 'for-linus-unmerged' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdmaLinus Torvalds1-13/+5
Pull rdma updates from Jason Gunthorpe: "Doug and I are at a conference next week so if another PR is sent I expect it to only be bug fixes. Parav noted yesterday that there are some fringe case behavior changes in his work that he would like to fix, and I see that Intel has a number of rc looking patches for HFI1 they posted yesterday. Parav is again the biggest contributor by patch count with his ongoing work to enable container support in the RDMA stack, followed by Leon doing syzkaller inspired cleanups, though most of the actual fixing went to RC. There is one uncomfortable series here fixing the user ABI to actually work as intended in 32 bit mode. There are lots of notes in the commit messages, but the basic summary is we don't think there is an actual 32 bit kernel user of drivers/infiniband for several good reasons. However we are seeing people want to use a 32 bit user space with 64 bit kernel, which didn't completely work today. So in fixing it we required a 32 bit rxe user to upgrade their userspace. rxe users are still already quite rare and we think a 32 bit one is non-existing. - Fix RDMA uapi headers to actually compile in userspace and be more complete - Three shared with netdev pull requests from Mellanox: * 7 patches, mostly to net with 1 IB related one at the back). This series addresses an IRQ performance issue (patch 1), cleanups related to the fix for the IRQ performance problem (patches 2-6), and then extends the fragmented completion queue support that already exists in the net side of the driver to the ib side of the driver (patch 7). * Mostly IB, with 5 patches to net that are needed to support the remaining 10 patches to the IB subsystem. This series extends the current 'representor' framework when the mlx5 driver is in switchdev mode from being a netdev only construct to being a netdev/IB dev construct. The IB dev is limited to raw Eth queue pairs only, but by having an IB dev of this type attached to the representor for a switchdev port, it enables DPDK to work on the switchdev device. * All net related, but needed as infrastructure for the rdma driver - Updates for the hns, i40iw, bnxt_re, cxgb3, cxgb4, hns drivers - SRP performance updates - IB uverbs write path cleanup patch series from Leon - Add RDMA_CM support to ib_srpt. This is disabled by default. Users need to set the port for ib_srpt to listen on in configfs in order for it to be enabled (/sys/kernel/config/target/srpt/discovery_auth/rdma_cm_port) - TSO and Scatter FCS support in mlx4 - Refactor of modify_qp routine to resolve problems seen while working on new code that is forthcoming - More refactoring and updates of RDMA CM for containers support from Parav - mlx5 'fine grained packet pacing', 'ipsec offload' and 'device memory' user API features - Infrastructure updates for the new IOCTL interface, based on increased usage - ABI compatibility bug fixes to fully support 32 bit userspace on 64 bit kernel as was originally intended. See the commit messages for extensive details - Syzkaller bugs and code cleanups motivated by them" * tag 'for-linus-unmerged' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (199 commits) IB/rxe: Fix for oops in rxe_register_device on ppc64le arch IB/mlx5: Device memory mr registration support net/mlx5: Mkey creation command adjustments IB/mlx5: Device memory support in mlx5_ib net/mlx5: Query device memory capabilities IB/uverbs: Add device memory registration ioctl support IB/uverbs: Add alloc/free dm uverbs ioctl support IB/uverbs: Add device memory capabilities reporting IB/uverbs: Expose device memory capabilities to user RDMA/qedr: Fix wmb usage in qedr IB/rxe: Removed GID add/del dummy routines RDMA/qedr: Zero stack memory before copying to user space IB/mlx5: Add ability to hash by IPSEC_SPI when creating a TIR IB/mlx5: Add information for querying IPsec capabilities IB/mlx5: Add IPsec support for egress and ingress {net,IB}/mlx5: Add ipsec helper IB/mlx5: Add modify_flow_action_esp verb IB/mlx5: Add implementation for create and destroy action_xfrm IB/uverbs: Introduce ESP steering match filter IB/uverbs: Add modify ESP flow_action ...
2018-04-03IB/core: Simplify ib_query_gid to always refer to cacheParav Pandit1-12/+3
Currently following inconsistencies exist. 1. ib_query_gid() returns GID from the software cache for a RoCE port and returns GID from the HCA for an IB port. This is incorrect because software GID cache is maintained regardless of HCA port type. 2. GID is queries from the HCA via ib_query_gid and updated in the software cache for IB link layer. Both of them might not be in sync. ULPs such as SRP initiator, SRP target, IPoIB driver have historically used ib_query_gid() API to query the GID. However CM used cached version during CM processing, When software cache was introduced, this inconsitency remained. In order to simplify, improve readability and avoid link layer specific above inconsistencies, this patch brings following changes. 1. ib_query_gid() always refers to the cache layer regardless of link layer. 2. cache module who reads the GID entry from HCA and builds the cache, directly invokes the HCA provider verb's query_gid() callback function. 3. ib_query_port() is being called in early stage where GID cache is not yet build while reading port immutable property. Therefore it needs to read the default GID from the HCA for IB link layer to publish the subnet prefix. Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-04-03RDMA/providers: Simplify query_gid callback of RoCE providersParav Pandit1-1/+3
ib_query_gid() fetches the GID from the software cache maintained in ib_core for RoCE ports. Therefore, simplify the provider drivers for RoCE to treat query_gid() callback as never called for RoCE, and only require non-RoCE devices to implement it. Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-03-27IB/core: Search GID only for IB link layerParav Pandit1-1/+1
Even though API is only used by IPoIB driver, its incorrect to refer RoCE GID table property to search for GID. Look for only IB link layer to search for the GID. Fixes: dbb12562f7c2 ("IB/{core, ipoib}: Simplify ib_find_gid to search only for IB link layer") Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-03-22IB/core: Refer to RoCE port property instead of GID table propertyParav Pandit1-1/+1
ib_query_gid() in commit [1] refers to RoCE GID table capability of the HCA using rdma_cap_roce_gid_table(). ib_core maintains the GID table cache regardless of the HCA provider drivers capability to maintain RoCE GID table. Therefore, whether to return a GID table entry from the software cache or from HCA should be done based on whether the port is RoCE or not. [1] commit 03db3a2d81e6 ("IB/core: Add RoCE GID table management") Reviewed-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-03-21RDMA/restrack: Move restrack_clean to be symmetrical to restrack_initLeon Romanovsky1-2/+1
The fact that resource tracking commit 02d8883f520e ("RDMA/restrack: Add general infrastructure to track RDMA resources") was added immediately after commit 16c1975f1032 ("IB/mlx5: Create profile infrastructure to add and remove stages") caused us to miss the fact that PD and CQ are created after ib_register_device, but released after ib_unregister_device() and not before as it is expected from normal flow. Fix introduced in commit 42cea83f9524 ("IB/mlx5: Fix cleanup order on unload") revealed this fact, so this patch is needed to avoid from restrack warnings It fixes resource tracking warnings during shutdown. [ 43.473906] CPU: 5 PID: 3016 Comm: modprobe Not tainted 4.16.0-rc5-for-linust-perf-2018-03-19_07-01-58-14 #1 [ 43.473907] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu2 04/01/2014 [ 43.473919] RIP: 0010:rdma_restrack_clean+0x25/0x30 [ib_core] [ 43.473921] RSP: 0018:ffffc9000267be48 EFLAGS: 00010282 [ 43.473924] RAX: 0000000000000000 RBX: ffff88033c690070 RCX: 0000000180080006 [ 43.473925] RDX: ffff88035ce922e0 RSI: ffffea000cf1a200 RDI: ffff88033c6907c8 [ 43.473926] RBP: ffff88033c690070 R08: ffff88033c689000 R09: 0000000180080006 [ 43.473927] R10: 000000003c68a001 R11: ffff88033c689000 R12: ffff88033c690000 [ 43.473929] R13: ffff88033c69005c R14: 0000000000000000 R15: 0000000000000000 [ 43.473932] FS: 00007f5928359740(0000) GS:ffff88036c540000(0000) knlGS:0000000000000000 [ 43.473933] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 43.473935] CR2: 00007ffffc760cc8 CR3: 000000035620c000 CR4: 00000000000006e0 [ 43.473940] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 43.473941] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 43.473942] Call Trace: [ 43.473969] ib_unregister_device+0xf5/0x190 [ib_core] [ 43.474000] __mlx5_ib_remove+0x2e/0x40 [mlx5_ib] [ 43.474098] mlx5_remove_device+0xf5/0x120 [mlx5_core] [ 43.474132] mlx5_unregister_interface+0x37/0x90 [mlx5_core] [ 43.474142] mlx5_ib_cleanup+0xc/0x16a [mlx5_ib] [ 43.474152] SyS_delete_module+0x159/0x260 [ 43.474159] do_syscall_64+0x61/0x110 [ 43.474165] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 43.474168] RIP: 0033:0x7f59278466b7 [ 43.474170] RSP: 002b:00007ffffc763e38 EFLAGS: 00000202 ORIG_RAX: 00000000000000b0 [ 43.474172] RAX: ffffffffffffffda RBX: 000000000130d590 RCX: 00007f59278466b7 [ 43.474173] RDX: 0000000000000000 RSI: 0000000000000800 RDI: 000000000130d5f8 [ 43.474175] RBP: 0000000000000000 R08: 00007f5927b0b060 R09: 00007f59278b6a40 [ 43.474176] R10: 00007ffffc763bc0 R11: 0000000000000202 R12: 0000000000000000 [ 43.474177] R13: 0000000000000001 R14: 000000000130d5f8 R15: 0000000000000000 [ 43.474179] Code: 84 00 00 00 00 00 0f 1f 44 00 00 48 83 c7 28 31 c0 eb 0c 48 83 c0 08 48 3d 00 08 00 00 74 0f 48 8d 14 07 48 8b 12 48 85 d2 74 e8 <0f> 0b c3 f3 c3 66 0f 1f 44 00 00 0f 1f 44 00 00 53 48 8b 47 28 [ 43.474221] ---[ end trace e89771e2250ffc23 ]--- Fixes: 42cea83f9524 ("IB/mlx5: Fix cleanup order on unload") Reviewed-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-03-15IB/{core, ipoib}: Simplify ib_find_gid() for unused ndevParav Pandit1-2/+1
ib_find_gid() is only used by IPoIB driver. For IB link layer, GID table entries are not based on netdevice. Netdevice parameter is unused here. Therefore, it is removed. Reviewed-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-02-28IB/core: Fix missing RDMA cgroups release in case of failure to register deviceParav Pandit1-2/+4
During IB device registration process, if query_device() fails or if ib_core fails to registers sysfs entries, rdma cgroup cleanup is skipped. Cc: <stable@vger.kernel.org> # v4.2+ Fixes: 4be3a4fa51f4 ("IB/core: Fix kernel crash during fail to initialize device") Reviewed-by: Daniel Jurgens <danielj@mellanox.com> Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-29RDMA/restrack: Add general infrastructure to track RDMA resourcesLeon Romanovsky1-0/+4
The RDMA subsystem has very strict set of objects to work with, but it completely lacks tracking facilities and has no visibility of resource utilization. The following patch adds such infrastructure to keep track of RDMA resources to help with debugging of user space applications. The primary user of this infrastructure is RDMA nldev netlink (following patches), to be exposed to userspace via rdmatool, but it is not limited too that. At this stage, the main three objects (PD, CQ and QP) are added, and more will be added later. Reviewed-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Reviewed-by: Steve Wise <swise@opengridcomputing.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-08Merge branch 'bart-srpt-for-next' into k.o/wip/dl-for-nextDoug Ledford1-1/+17
Merging in 12 patch series from Bart that required changes in the current for-rc branch in order to apply cleanly. Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-01-03IB/core: Fix two kernel warnings triggered by rxe registrationBart Van Assche1-6/+14
Eliminate the WARN_ONs that create following two warnings when registering an rxe device: WARNING: CPU: 2 PID: 1005 at drivers/infiniband/core/device.c:449 ib_register_device+0x591/0x640 [ib_core] CPU: 2 PID: 1005 Comm: run_tests Not tainted 4.15.0-rc4-dbg+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 RIP: 0010:ib_register_device+0x591/0x640 [ib_core] Call Trace: rxe_register_device+0x3c6/0x470 [rdma_rxe] rxe_add+0x543/0x5e0 [rdma_rxe] rxe_net_add+0x37/0xb0 [rdma_rxe] rxe_param_set_add+0x5a/0x120 [rdma_rxe] param_attr_store+0x5e/0xc0 module_attr_store+0x19/0x30 sysfs_kf_write+0x3d/0x50 kernfs_fop_write+0x116/0x1a0 __vfs_write+0x23/0x120 vfs_write+0xbe/0x1b0 SyS_write+0x44/0xa0 entry_SYSCALL_64_fastpath+0x23/0x9a WARNING: CPU: 2 PID: 1005 at drivers/infiniband/core/sysfs.c:1279 ib_device_register_sysfs+0x11d/0x160 [ib_core] CPU: 2 PID: 1005 Comm: run_tests Tainted: G W 4.15.0-rc4-dbg+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 RIP: 0010:ib_device_register_sysfs+0x11d/0x160 [ib_core] Call Trace: ib_register_device+0x3f7/0x640 [ib_core] rxe_register_device+0x3c6/0x470 [rdma_rxe] rxe_add+0x543/0x5e0 [rdma_rxe] rxe_net_add+0x37/0xb0 [rdma_rxe] rxe_param_set_add+0x5a/0x120 [rdma_rxe] param_attr_store+0x5e/0xc0 module_attr_store+0x19/0x30 sysfs_kf_write+0x3d/0x50 kernfs_fop_write+0x116/0x1a0 __vfs_write+0x23/0x120 vfs_write+0xbe/0x1b0 SyS_write+0x44/0xa0 entry_SYSCALL_64_fastpath+0x23/0x9a The code should accept either a parent pointer or a fully specified DMA specification without producing warnings. Fixes: 99db9494035f ("IB/core: Remove ib_device.dma_device") Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com> Cc: Leon Romanovsky <leon@kernel.org> Cc: stable@vger.kernel.org # v4.11 Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-02RDMA/netlink: Fix locking around __ib_get_device_by_indexLeon Romanovsky1-1/+17
Holding locks is mandatory when calling __ib_device_get_by_index, otherwise there are races during the list iteration with device removal. Since the locks are static to device.c, __ib_device_get_by_index can never be called correctly by any user out side the file. Make the function static and provide a safe function that gets the correct locks and returns a kref'd pointer. Fix all callers. Fixes: e5c9469efcb1 ("RDMA/netlink: Add nldev device doit implementation") Fixes: c3f66f7b0052 ("RDMA/netlink: Implement nldev port doit callback") Fixes: 7d02f605f0dc ("RDMA/netlink: Add nldev port dumpit implementation") Reviewed-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-02RDMA/core: Replace open-coded variant of put_deviceLeon Romanovsky1-1/+1
There is an existing function to decrease reference counter of the device, let's use it. Reviewed-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2017-12-27Merge branch 'from-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.gitJason Gunthorpe1-2/+2
Patches for 4.16 that are dependent on patches sent to 4.15-rc. These are small clean ups for the vmw_pvrdma and i40iw drivers. * 'from-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git: RDMA/vmw_pvrdma: Remove usage of BIT() from UAPI header RDMA/vmw_pvrdma: Use refcount_t instead of atomic_t RDMA/vmw_pvrdma: Use more specific sizeof in kcalloc RDMA/vmw_pvrdma: Clarify QP and CQ is_kernel logic RDMA/vmw_pvrdma: Add UAR SRQ macros in ABI header file i40iw: Change accelerated flag to bool
2017-12-18IB/{core, ipoib}: Simplify ib_find_gid to search only for IB link layerParav Pandit1-13/+3
Currently there are no users of ib_find_gid for RoCE transport. It is only used by IPoIB. Therefore its simplified to ignore RoCE ports and GID type check which was previously done for every port. Signed-off-by: Parav Pandit <parav@mellanox.com> Reviewed-by: Eli Cohen <eli@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2017-12-07RDMA/netlink: Fix general protection faultLeon Romanovsky1-1/+1
The RDMA netlink core code checks validity of messages by ensuring that type and operand are in range. It works well for almost all clients except NLDEV, which has cb_table less than number of operands. Request to access such operand will trigger the following kernel panic. This patch updates all places where cb_table is declared for the consistency, but only NLDEV is actually need it. general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN Modules linked in: CPU: 0 PID: 522 Comm: syz-executor6 Not tainted 4.13.0+ #4 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 task: ffff8800657799c0 task.stack: ffff8800695d000 RIP: 0010:rdma_nl_rcv_msg+0x13a/0x4c0 RSP: 0018:ffff8800695d7838 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: 1ffff1000d2baf0b RCX: 00000000704ff4d7 RDX: 0000000000000000 RSI: ffffffff81ddb03c RDI: 00000003827fa6bc RBP: ffff8800695d7900 R08: ffffffff82ec0578 R09: 0000000000000000 R10: ffff8800695d7900 R11: 0000000000000001 R12: 000000000000001c R13: ffff880069d31e00 R14: 00000000ffffffff R15: ffff880069d357c0 FS: 00007fee6acb8700(0000) GS:ffff88006ca00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000201a9000 CR3: 0000000059766000 CR4: 00000000000006b0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? rdma_nl_multicast+0x80/0x80 rdma_nl_rcv+0x36b/0x4d0 ? ibnl_put_attr+0xc0/0xc0 netlink_unicast+0x4bd/0x6d0 ? netlink_sendskb+0x50/0x50 ? drop_futex_key_refs.isra.4+0x68/0xb0 netlink_sendmsg+0x9ab/0xbd0 ? nlmsg_notify+0x140/0x140 ? wake_up_q+0xa1/0xf0 ? drop_futex_key_refs.isra.4+0x68/0xb0 sock_sendmsg+0x88/0xd0 sock_write_iter+0x228/0x3c0 ? sock_sendmsg+0xd0/0xd0 ? do_futex+0x3e5/0xb20 ? iov_iter_init+0xaf/0x1d0 __vfs_write+0x46e/0x640 ? sched_clock_cpu+0x1b/0x190 ? __vfs_read+0x620/0x620 ? __fget+0x23a/0x390 ? rw_verify_area+0xca/0x290 vfs_write+0x192/0x490 SyS_write+0xde/0x1c0 ? SyS_read+0x1c0/0x1c0 ? trace_hardirqs_on_thunk+0x1a/0x1c entry_SYSCALL_64_fastpath+0x18/0xad RIP: 0033:0x7fee6a74a219 RSP: 002b:00007fee6acb7d58 EFLAGS: 00000212 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000638000 RCX: 00007fee6a74a219 RDX: 0000000000000078 RSI: 0000000020141000 RDI: 0000000000000006 RBP: 0000000000000046 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000212 R12: ffff8800695d7f98 R13: 0000000020141000 R14: 0000000000000006 R15: 00000000ffffffff Code: d6 48 b8 00 00 00 00 00 fc ff df 66 41 81 e4 ff 03 44 8d 72 ff 4a 8d 3c b5 c0 a6 7f 82 44 89 b5 4c ff ff ff 48 89 f9 48 c1 e9 03 <0f> b6 0c 01 48 89 f8 83 e0 07 83 c0 03 38 c8 7c 08 84 c9 0f 85 RIP: rdma_nl_rcv_msg+0x13a/0x4c0 RSP: ffff8800695d7838 ---[ end trace ba085d123959c8ec ]--- Kernel panic - not syncing: Fatal exception Cc: syzkaller <syzkaller@googlegroups.com> Fixes: b4c598a67ea1 ("RDMA/netlink: Implement nldev device dumpit calback") Reviewed-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-12-01IB/core: Init subsys if compiled to vmlinuz-coreDmitry Monakhov1-1/+1
Once infiniband is compiled as a core component its subsystem must be enabled before device initialization. Otherwise there is a NULL pointer dereference during mlx4_core init, calltrace: ->device_add if (dev->class) { deref dev->class->p =>NULLPTR #Config CONFIG_NET_DEVLINK=y CONFIG_MAY_USE_DEVLINK=y CONFIG_MLX4_EN=y Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Reviewed-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2017-08-24Merge branch 'mellanox' into k.o/for-nextDoug Ledford1-6/+2
Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-08-24IB: Avoid ib_modify_port() failure for RoCE devicesSelvin Xavier1-4/+7
IB CM calls ib_modify_port() irrespective of link layer. If the failure is returned, the mad agent gets unregistered for those devices. Recently, modify_port() hook was removed from some of the low level drivers as it was always returning success. This breaks rdma connection establishment over those devices. For ethernet devices, Qkey violation and port capabilities are not applicable. So returning success for RoCE when modify_port hook is is not implemented. Cc: Leon Romanovsky <leon@kernel.org> Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com> Reviewed-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-08-24RDMA/(core, ulp): Convert register/unregister event handler to be voidLeon Romanovsky1-6/+2
The functions ib_register_event_handler() and ib_unregister_event_handler() always returned success and they can't fail. Let's convert those functions to be void, remove redundant checks and cleanup tons of goto statements. Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-08-22rdma: Autoload netlink client modulesJason Gunthorpe1-0/+2
If a message comes in and we do not have the client in the table, then try to load the module supplying that client using MODULE_ALIAS to find it. This duplicates the scheme seen in other netlink muxes (eg nfnetlink). Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com> Reviewed-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-08-18Merge branch 'k.o/for-4.13-rc' into k.o/for-nextDoug Ledford1-2/+3
Merging our (hopefully) final -rc pull branch into our for-next branch because some of our pending patches won't apply cleanly without having the -rc patches in our tree. Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-08-18Merge branch 'misc' into k.o/for-nextDoug Ledford1-2/+2
Conflicts: drivers/infiniband/core/iwcm.c - The rdma_netlink patches in HEAD and the iwarp cm workqueue fix (don't use WQ_MEM_RECLAIM, we aren't safe for that context) touched the same code. Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-08-18RDMA/core: make ib_device.add method optionalSagi Grimberg1-2/+2
ib_clients can indeed fill .add to NULL, but then they will not see any device removal notifications. The reason is that that ib_register_client and ib_register_device checked existence of .add before adding the creating a corresponding client_data and adding it to the list. Simple condition reverse fixes the issue. Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Reviewed-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-08-16IB/core: Protect sysfs entry on ib_unregister_deviceShiraz Saleem1-2/+3
ib_unregister_device is not protecting removal of sysfs entries. A call to ib_register_device in that window can result in duplicate sysfs entry warning. Move mutex_unlock to after ib_device_unregister_sysfs to protect against sysfs entry creation. This issue is exposed during driver load/unload stress test. WARNING: CPU: 5 PID: 4445 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x5f/0x70 sysfs: cannot create duplicate filename '/class/infiniband/i40iw0' Hardware name: Gigabyte Technology Co., Ltd. To be filled by O.E.M./Q87M-D2H BIOS F7 01/17/2014 Workqueue: i40e i40e_service_task [i40e] Call Trace: dump_stack+0x67/0x98 __warn+0xcc/0xf0 warn_slowpath_fmt+0x4a/0x50 ? kernfs_path_from_node+0x4b/0x60 sysfs_warn_dup+0x5f/0x70 sysfs_do_create_link_sd.isra.2+0xb7/0xc0 sysfs_create_link+0x20/0x40 device_add+0x28c/0x600 ib_device_register_sysfs+0x58/0x170 [ib_core] ib_register_device+0x325/0x570 [ib_core] ? i40iw_register_rdma_device+0x1f4/0x400 [i40iw] ? kmem_cache_alloc_trace+0x143/0x330 ? __raw_spin_lock_init+0x2d/0x50 i40iw_register_rdma_device+0x2dc/0x400 [i40iw] i40iw_open+0x10a6/0x1950 [i40iw] ? i40iw_open+0xeab/0x1950 [i40iw] ? i40iw_make_cm_node+0x9c0/0x9c0 [i40iw] i40e_client_subtask+0xa4/0x110 [i40e] i40e_service_task+0xc2d/0x1320 [i40e] process_one_work+0x203/0x710 ? process_one_work+0x16f/0x710 worker_thread+0x126/0x4a0 ? trace_hardirqs_on+0xd/0x10 kthread+0x112/0x150 ? process_one_work+0x710/0x710 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x2e/0x40 ---[ end trace fd11b69e21ea7653 ]--- Couldn't register device i40iw0 with driver model Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com> Signed-off-by: Sindhu Devale <sindhu.devale@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-08-10RDMA: Simplify get firmware interfaceLeon Romanovsky1-2/+2
There is a need to forward FW version to user space application through RDMA netlink. In order to make it safe, there is need to declare nla_policy and limit the size of FW string. The new define IB_FW_VERSION_NAME_MAX will limit the size of FW version string. That define was chosen to be equal to ETHTOOL_FWVERS_LEN, because many drivers anyway are limited by that value indirectly. The introduction of this define allows us to remove the string size from get_fw_str function signature. Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
2017-08-10RDMA/netlink: Add nldev initialization flowsLeon Romanovsky1-0/+2
Add nldev init and exit flows to the RDMA/core. Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Reviewed-by: Steve Wise <swise@opengridcomputing.com>
2017-08-10RDMA/netlink: Convert LS to doit callbackLeon Romanovsky1-3/+3
RDMA_NL_LS protocol is actually does not dump anything, but sets data and it should be handled by doit callback. This patch actually converts RDMA_NL_LS to doit callback, while preserving IWCM and RDMA_CM flows through netlink_dump_start(). Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Reviewed-by: Steve Wise <swise@opengridcomputing.com>
2017-08-10RDMA/core: Add and expose static device indexLeon Romanovsky1-1/+36
This patch adds static device index in similar fashion to already available in netdev world (struct net->ifindex). In downstream patches, the RDMA nelink will use this idx-to-ib_device conversion, so as part of this commit, we are exposing the translation function to be visible for IB/core users. Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
2017-08-10RDMA/core: Add iterator over ib_devicesLeon Romanovsky1-0/+25
The coming nldev needs to iterate over all IB devices in the system and in order to not expose the ib_devices list outside the devices.c, it is necessary to provide function iterator. Current version is written explicitly for nldev callback to avoid over-engineering at this stage, but it can be easily extended for other types. Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Reviewed-by: Steve Wise <swise@opengridcomputing.com>
2017-08-10RDMA/netlink: Rename netlink callback structLeon Romanovsky1-1/+1
The RDMA netlink client infrastructure was removed and made obsolete. The old infrastructure defined struct ibnl_client_cbs. Now that all uses of this have been updated to the new infrastructure, rename the struct to be compliant with the current stack naming standards: struct rdma_nl_cbs. Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Reviewed-by: Steve Wise <swise@opengridcomputing.com>
2017-08-10RDMA/netlink: Add flag to consolidate common handlingLeon Romanovsky1-3/+9
Add ability to provide flags to control RDMA netlink callbacks and convert addr.c and sa_query.c to be first users of such infrastructure. It allows to move their CAP_NET_ADMIN checks into netlink core. Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Reviewed-by: Steve Wise <swise@opengridcomputing.com>
2017-08-10RDMA/netlink: Remove netlink clients infrastructureLeon Romanovsky1-33/+12
RDMA netlink has a complicated infrastructure for dynamically registering and de-registering netlink clients to the NETLINK_RDMA group. The complicated portion of this code is not widely used because 2 of the 3 current clients are statically compiled together with netlink.c. The infrastructure, therefore, is deemed overkill. Refactor the code to eliminate the dynamically added clients. Now all clients are pre-registered in a client array at compile time, and at run time they merely check-in with the infrastructure to pass their callback table for inclusion in the pre-sized client array. This also allows for future cleanups and removal of unneeded code in the iwcm* netlink handler. Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Reviewed-by: Chien Tin Tung <chien.tin.tung@intel.com>
2017-07-07IB/core: Fix static analysis warning in ib_policy_change_taskDaniel Jurgens1-1/+2
ib_get_cached_subnet_prefix can technically fail, but the only way it could is not possible based on the loop conditions. Check the return value before using the variable sp to resolve a static analysis warning. -v1: - Fix check to !ret. Paul Moore Fixes: 8f408ab64be6 ("selinux lsm IB/core: Implement LSM notification system") Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: Paul Moore <paul@paul-moore.com> Signed-off-by: James Morris <james.l.morris@oracle.com>
2017-05-23selinux lsm IB/core: Implement LSM notification systemDaniel Jurgens1-0/+53
Add a generic notificaiton mechanism in the LSM. Interested consumers can register a callback with the LSM and security modules can produce events. Because access to Infiniband QPs are enforced in the setup phase of a connection security should be enforced again if the policy changes. Register infiniband devices for policy change notification and check all QPs on that device when the notification is received. Add a call to the notification mechanism from SELinux when the AVC cache changes or setenforce is cleared. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Acked-by: James Morris <james.l.morris@oracle.com> Acked-by: Doug Ledford <dledford@redhat.com> Signed-off-by: Paul Moore <paul@paul-moore.com>
2017-05-23IB/core: Enforce PKey security on QPsDaniel Jurgens1-0/+33
Add new LSM hooks to allocate and free security contexts and check for permission to access a PKey. Allocate and free a security context when creating and destroying a QP. This context is used for controlling access to PKeys. When a request is made to modify a QP that changes the port, PKey index, or alternate path, check that the QP has permission for the PKey in the PKey table index on the subnet prefix of the port. If the QP is shared make sure all handles to the QP also have access. Store which port and PKey index a QP is using. After the reset to init transition the user can modify the port, PKey index and alternate path independently. So port and PKey settings changes can be a merge of the previous settings and the new ones. In order to maintain access control if there are PKey table or subnet prefix change keep a list of all QPs are using each PKey index on each port. If a change occurs all QPs using that device and port must have access enforced for the new cache settings. These changes add a transaction to the QP modify process. Association with the old port and PKey index must be maintained if the modify fails, and must be removed if it succeeds. Association with the new port and PKey index must be established prior to the modify and removed if the modify fails. 1. When a QP is modified to a particular Port, PKey index or alternate path insert that QP into the appropriate lists. 2. Check permission to access the new settings. 3. If step 2 grants access attempt to modify the QP. 4a. If steps 2 and 3 succeed remove any prior associations. 4b. If ether fails remove the new setting associations. If a PKey table or subnet prefix changes walk the list of QPs and check that they have permission. If not send the QP to the error state and raise a fatal error event. If it's a shared QP make sure all the QPs that share the real_qp have permission as well. If the QP that owns a security structure is denied access the security structure is marked as such and the QP is added to an error_list. Once the moving the QP to error is complete the security structure mark is cleared. Maintaining the lists correctly turns QP destroy into a transaction. The hardware driver for the device frees the ib_qp structure, so while the destroy is in progress the ib_qp pointer in the ib_qp_security struct is undefined. When the destroy process begins the ib_qp_security structure is marked as destroying. This prevents any action from being taken on the QP pointer. After the QP is destroyed successfully it could still listed on an error_list wait for it to be processed by that flow before cleaning up the structure. If the destroy fails the QPs port and PKey settings are reinserted into the appropriate lists, the destroying flag is cleared, and access control is enforced, in case there were any cache changes during the destroy flow. To keep the security changes isolated a new file is used to hold security related functionality. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Acked-by: Doug Ledford <dledford@redhat.com> [PM: merge fixup in ib_verbs.h and uverbs_cmd.c] Signed-off-by: Paul Moore <paul@paul-moore.com>
2017-04-21IB/core: Fix kernel crash during fail to initialize deviceParav Pandit1-11/+22
This patch fixes the kernel crash that occurs during ib_dealloc_device() called due to provider driver fails with an error after ib_alloc_device() and before it can register using ib_register_device(). This crashed seen in tha lab as below which can occur with any IB device which fails to perform its device initialization before invoking ib_register_device(). This patch avoids touching cache and port immutable structures if device is not yet initialized. It also releases related memory when cache and port immutable data structure initialization fails during register_device() state. [81416.561946] BUG: unable to handle kernel NULL pointer dereference at (null) [81416.570340] IP: ib_cache_release_one+0x29/0x80 [ib_core] [81416.576222] PGD 78da66067 [81416.576223] PUD 7f2d7c067 [81416.579484] PMD 0 [81416.582720] [81416.587242] Oops: 0000 [#1] SMP [81416.722395] task: ffff8807887515c0 task.stack: ffffc900062c0000 [81416.729148] RIP: 0010:ib_cache_release_one+0x29/0x80 [ib_core] [81416.735793] RSP: 0018:ffffc900062c3a90 EFLAGS: 00010202 [81416.741823] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000 [81416.749785] RDX: 0000000000000000 RSI: 0000000000000282 RDI: ffff880859fec000 [81416.757757] RBP: ffffc900062c3aa0 R08: ffff8808536e5ac0 R09: ffff880859fec5b0 [81416.765708] R10: 00000000536e5c01 R11: ffff8808536e5ac0 R12: ffff880859fec000 [81416.773672] R13: 0000000000000000 R14: ffff8808536e5ac0 R15: ffff88084ebc0060 [81416.781621] FS: 00007fd879fab740(0000) GS:ffff88085fac0000(0000) knlGS:0000000000000000 [81416.790522] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [81416.797094] CR2: 0000000000000000 CR3: 00000007eb215000 CR4: 00000000003406e0 [81416.805051] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [81416.812997] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [81416.820950] Call Trace: [81416.824226] ib_device_release+0x1e/0x40 [ib_core] [81416.829858] device_release+0x32/0xa0 [81416.834370] kobject_cleanup+0x63/0x170 [81416.839058] kobject_put+0x25/0x50 [81416.843319] ib_dealloc_device+0x25/0x40 [ib_core] [81416.848986] mlx5_ib_add+0x163/0x1990 [mlx5_ib] [81416.854414] mlx5_add_device+0x5a/0x160 [mlx5_core] [81416.860191] mlx5_register_interface+0x8d/0xc0 [mlx5_core] [81416.866587] ? 0xffffffffa09e9000 [81416.870816] mlx5_ib_init+0x15/0x17 [mlx5_ib] [81416.876094] do_one_initcall+0x51/0x1b0 [81416.880861] ? __vunmap+0x85/0xd0 [81416.885113] ? kmem_cache_alloc_trace+0x14b/0x1b0 [81416.890768] ? vfree+0x2e/0x70 [81416.894762] do_init_module+0x60/0x1fa [81416.899441] load_module+0x15f6/0x1af0 [81416.904114] ? __symbol_put+0x60/0x60 [81416.908709] ? ima_post_read_file+0x3d/0x80 [81416.913828] ? security_kernel_post_read_file+0x6b/0x80 [81416.920006] SYSC_finit_module+0xa6/0xf0 [81416.924888] SyS_finit_module+0xe/0x10 [81416.929568] entry_SYSCALL_64_fastpath+0x1a/0xa9 [81416.935089] RIP: 0033:0x7fd879494949 [81416.939543] RSP: 002b:00007ffdbc1b4e58 EFLAGS: 00000202 ORIG_RAX: 0000000000000139 [81416.947982] RAX: ffffffffffffffda RBX: 0000000001b66f00 RCX: 00007fd879494949 [81416.955965] RDX: 0000000000000000 RSI: 000000000041a13c RDI: 0000000000000003 [81416.963926] RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000001b652a0 [81416.971861] R10: 0000000000000003 R11: 0000000000000202 R12: 00007ffdbc1b3e70 [81416.979763] R13: 00007ffdbc1b3e50 R14: 0000000000000005 R15: 0000000000000000 [81417.008005] RIP: ib_cache_release_one+0x29/0x80 [ib_core] RSP: ffffc900062c3a90 [81417.016045] CR2: 0000000000000000 Fixes: 55aeed0654 ("IB/core: Make ib_alloc_device init the kobject") Fixes: 7738613e7c ("IB/core: Add per port immutable struct to ib_device") Cc: <stable@vger.kernel.org> # v4.2+ Reviewed-by: Daniel Jurgens <danielj@mellanox.com> Signed-off-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-03-24IB/device: Convert ib-comp-wq to be CPU-boundSagi Grimberg1-2/+1
This workqueue is used by our storage target mode ULPs via the new CQ API. Recent observations when working with very high-end flash storage devices reveal that UNBOUND workqueue threads can migrate between cpu cores and even numa nodes (although some numa locality is accounted for). While this attribute can be useful in some workloads, it does not fit in very nicely with the normal run-to-completion model we usually use in our target-mode ULPs and the block-mq irq<->cpu affinity facilities. The whole block-mq concept is that the completion will land on the same cpu where the submission was performed. The fact that our submitter thread is migrating cpus can break this locality. We assume that as a target mode ULP, we will serve multiple initiators/clients and we can spread the load enough without having to use unbound kworkers. Also, while we're at it, expose this workqueue via sysfs which is harmless and can be useful for debug. Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>-- Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-03-24IB/core: Restore I/O MMU, s390 and powerpc supportBart Van Assche1-6/+20
Avoid that the following error message is reported on the console while loading an RDMA driver with I/O MMU support enabled: DMAR: Allocating domain for mlx5_0 failed Ensure that DMA mapping operations that use to_pci_dev() to access to struct pci_dev see the correct PCI device. E.g. the s390 and powerpc DMA mapping operations use to_pci_dev() even with I/O MMU support disabled. This patch preserves the following changes of the DMA mapping updates patch series: - Introduction of dma_virt_ops. - Removal of ib_device.dma_ops. - Removal of struct ib_dma_mapping_ops. - Removal of an if-statement from each ib_dma_*() operation. - IB HW drivers no longer set dma_device directly. Reported-by: Sebastian Ott <sebott@linux.vnet.ibm.com> Reported-by: Parav Pandit <parav@mellanox.com> Fixes: commit 99db9494035f ("IB/core: Remove ib_device.dma_device") Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com> Reviewed-by: parav@mellanox.com Tested-by: parav@mellanox.com Reviewed-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-02-27Merge branch 'for-4.11' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroupLinus Torvalds1-0/+10
Pull cgroup updates from Tejun Heo: "Several noteworthy changes. - Parav's rdma controller is finally merged. It is very straight forward and can limit the abosolute numbers of common rdma constructs used by different cgroups. - kernel/cgroup.c got too chubby and disorganized. Created kernel/cgroup/ subdirectory and moved all cgroup related files under kernel/ there and reorganized the core code. This hurts for backporting patches but was long overdue. - cgroup v2 process listing reimplemented so that it no longer depends on allocating a buffer large enough to cache the entire result to sort and uniq the output. v2 has always mangled the sort order to ensure that users don't depend on the sorted output, so this shouldn't surprise anybody. This makes the pid listing functions use the same iterators that are used internally, which have to have the same iterating capabilities anyway. - perf cgroup filtering now works automatically on cgroup v2. This patch was posted a long time ago but somehow fell through the cracks. - misc fixes asnd documentation updates" * 'for-4.11' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (27 commits) kernfs: fix locking around kernfs_ops->release() callback cgroup: drop the matching uid requirement on migration for cgroup v2 cgroup, perf_event: make perf_event controller work on cgroup2 hierarchy cgroup: misc cleanups cgroup: call subsys->*attach() only for subsystems which are actually affected by migration cgroup: track migration context in cgroup_mgctx cgroup: cosmetic update to cgroup_taskset_add() rdmacg: Fixed uninitialized current resource usage cgroup: Add missing cgroup-v2 PID controller documentation. rdmacg: Added documentation for rdmacg IB/core: added support to use rdma cgroup controller rdmacg: Added rdma cgroup controller cgroup: fix a comment typo cgroup: fix RCU related sparse warnings cgroup: move namespace code to kernel/cgroup/namespace.c cgroup: rename functions for consistency cgroup: move v1 mount functions to kernel/cgroup/cgroup-v1.c cgroup: separate out cgroup1_kf_syscall_ops cgroup: refactor mount path and clearly distinguish v1 and v2 paths cgroup: move cgroup v1 specific code to kernel/cgroup/cgroup-v1.c ...
2017-02-25Merge tag 'for-next-dma_ops' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdmaLinus Torvalds1-0/+9
Pull rdma DMA mapping updates from Doug Ledford: "Drop IB DMA mapping code and use core DMA code instead. Bart Van Assche noted that the ib DMA mapping code was significantly similar enough to the core DMA mapping code that with a few changes it was possible to remove the IB DMA mapping code entirely and switch the RDMA stack to use the core DMA mapping code. This resulted in a nice set of cleanups, but touched the entire tree and has been kept separate for that reason." * tag 'for-next-dma_ops' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma: (37 commits) IB/rxe, IB/rdmavt: Use dma_virt_ops instead of duplicating it IB/core: Remove ib_device.dma_device nvme-rdma: Switch from dma_device to dev.parent RDS: net: Switch from dma_device to dev.parent IB/srpt: Modify a debug statement IB/srp: Switch from dma_device to dev.parent IB/iser: Switch from dma_device to dev.parent IB/IPoIB: Switch from dma_device to dev.parent IB/rxe: Switch from dma_device to dev.parent IB/vmw_pvrdma: Switch from dma_device to dev.parent IB/usnic: Switch from dma_device to dev.parent IB/qib: Switch from dma_device to dev.parent IB/qedr: Switch from dma_device to dev.parent IB/ocrdma: Switch from dma_device to dev.parent IB/nes: Remove a superfluous assignment statement IB/mthca: Switch from dma_device to dev.parent IB/mlx5: Switch from dma_device to dev.parent IB/mlx4: Switch from dma_device to dev.parent IB/i40iw: Remove a superfluous assignment statement IB/hns: Switch from dma_device to dev.parent ...
2017-01-27IB/core: Add inline function to validate portYuval Shaia1-2/+2
Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com> Reviewed-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-01-24IB/core: Remove ib_device.dma_deviceBart Van Assche1-8/+9
Add code in ib_register_device() for copying the DMA masks. Use &ib_device.dev in DMA mapping operations instead of dma_device. Remove ib_device.dma_device because due to this and previous patches it is no longer used. Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-01-24IB/core: Initialize ib_device.dev.parent earlierBart Van Assche1-0/+8
Move the ib_device.dev.parent initialization code from ib_device_register_sysfs() to ib_register_device(). Additionally, allow HBA drivers to set ib_device.dev.parent without setting ib_device.dma_device. This is the first step towards removing ib_device.dma_device. Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2017-01-10IB/core: added support to use rdma cgroup controllerParav Pandit1-0/+10
Added support APIs for IB core to register/unregister every IB/RDMA device with rdma cgroup for tracking rdma resources. IB core registers with rdma cgroup controller. Added support APIs for uverbs layer to make use of rdma controller. Added uverbs layer to perform resource charge/uncharge functionality. Added support during query_device uverb operation to ensure it returns resource limits by honoring rdma cgroup configured limits. Signed-off-by: Parav Pandit <pandit.parav@gmail.com> Signed-off-by: Tejun Heo <tj@kernel.org>
2016-12-03IB/core: Remove debug prints after allocation failureLeon Romanovsky1-4/+1
The prints after [k|v][m|z|c]alloc() functions are not needed, because in case of failure, allocator will print their internal error prints anyway. Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Doug Ledford <dledford@redhat.com>