aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/hw/hfi1 (follow)
AgeCommit message (Collapse)AuthorFilesLines
2018-07-09net: allow ndo_select_queue to pass netdevAlexander Duyck1-1/+1
This patch makes it so that instead of passing a void pointer as the accel_priv we instead pass a net_device pointer as sb_dev. Making this change allows us to pass the subordinate device through to the fallback function eventually so that we can keep the actual code in the ndo_select_queue call as focused on possible on the exception cases. Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com> Tested-by: Andrew Bowers <andrewx.bowers@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2018-06-12treewide: Use array_size() in kvzalloc_node()Kees Cook1-1/+2
The kvzalloc_node() function has no 2-factor argument form, so multiplication factors need to be wrapped in array_size(). This patch replaces cases of: kvzalloc_node(a * b, gfp, node) with: kvzalloc_node(array_size(a, b), gfp, node) as well as handling cases of: kvzalloc_node(a * b * c, gfp, node) with: kvzalloc_node(array3_size(a, b, c), gfp, node) This does, however, attempt to ignore constant size factors like: kvzalloc_node(4 * 1024, gfp, node) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ type TYPE; expression THING, E; @@ ( kvzalloc_node( - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | kvzalloc_node( - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression COUNT; typedef u8; typedef __u8; @@ ( kvzalloc_node( - sizeof(u8) * (COUNT) + COUNT , ...) | kvzalloc_node( - sizeof(__u8) * (COUNT) + COUNT , ...) | kvzalloc_node( - sizeof(char) * (COUNT) + COUNT , ...) | kvzalloc_node( - sizeof(unsigned char) * (COUNT) + COUNT , ...) | kvzalloc_node( - sizeof(u8) * COUNT + COUNT , ...) | kvzalloc_node( - sizeof(__u8) * COUNT + COUNT , ...) | kvzalloc_node( - sizeof(char) * COUNT + COUNT , ...) | kvzalloc_node( - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( kvzalloc_node( - sizeof(TYPE) * (COUNT_ID) + array_size(COUNT_ID, sizeof(TYPE)) , ...) | kvzalloc_node( - sizeof(TYPE) * COUNT_ID + array_size(COUNT_ID, sizeof(TYPE)) , ...) | kvzalloc_node( - sizeof(TYPE) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | kvzalloc_node( - sizeof(TYPE) * COUNT_CONST + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | kvzalloc_node( - sizeof(THING) * (COUNT_ID) + array_size(COUNT_ID, sizeof(THING)) , ...) | kvzalloc_node( - sizeof(THING) * COUNT_ID + array_size(COUNT_ID, sizeof(THING)) , ...) | kvzalloc_node( - sizeof(THING) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(THING)) , ...) | kvzalloc_node( - sizeof(THING) * COUNT_CONST + array_size(COUNT_CONST, sizeof(THING)) , ...) ) // 2-factor product, only identifiers. @@ identifier SIZE, COUNT; @@ kvzalloc_node( - SIZE * COUNT + array_size(COUNT, SIZE) , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( kvzalloc_node( - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kvzalloc_node( - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kvzalloc_node( - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kvzalloc_node( - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kvzalloc_node( - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kvzalloc_node( - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kvzalloc_node( - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kvzalloc_node( - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( kvzalloc_node( - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kvzalloc_node( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kvzalloc_node( - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kvzalloc_node( - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kvzalloc_node( - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | kvzalloc_node( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ identifier STRIDE, SIZE, COUNT; @@ ( kvzalloc_node( - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kvzalloc_node( - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kvzalloc_node( - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kvzalloc_node( - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kvzalloc_node( - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kvzalloc_node( - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kvzalloc_node( - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kvzalloc_node( - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products // when they're not all constants... @@ expression E1, E2, E3; constant C1, C2, C3; @@ ( kvzalloc_node(C1 * C2 * C3, ...) | kvzalloc_node( - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants. @@ expression E1, E2; constant C1, C2; @@ ( kvzalloc_node(C1 * C2, ...) | kvzalloc_node( - E1 * E2 + array_size(E1, E2) , ...) ) Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-07Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdmaLinus Torvalds31-788/+1560
Pull rdma updates from Jason Gunthorpe: "This has been a quiet cycle for RDMA, the big bulk is the usual smallish driver updates and bug fixes. About four new uAPI related things. Not as much Szykaller patches this time, the bugs it finds are getting harder to fix. Summary: - More work cleaning up the RDMA CM code - Usual driver bug fixes and cleanups for qedr, qib, hfi1, hns, i40iw, iw_cxgb4, mlx5, rxe - Driver specific resource tracking and reporting via netlink - Continued work for name space support from Parav - MPLS support for the verbs flow steering uAPI - A few tricky IPoIB fixes improving robustness - HFI1 driver support for the '16B' management packet format - Some auditing to not print kernel pointers via %llx or similar - Mark the entire 'UCM' user-space interface as BROKEN with the intent to remove it entirely. The user space side of this was long ago replaced with RDMA-CM and syzkaller is finding bugs in the residual UCM interface nobody wishes to fix because nobody uses it. - Purge more bogus BUG_ON's from Leon - 'flow counters' verbs uAPI - T10 fixups for iser/isert, these are Acked by Martin but going through the RDMA tree due to dependencies" * tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (138 commits) RDMA/mlx5: Update SPDX tags to show proper license RDMA/restrack: Change SPDX tag to properly reflect license IB/hfi1: Fix comment on default hdr entry size IB/hfi1: Rename exp_lock to exp_mutex IB/hfi1: Add bypass register defines and replace blind constants IB/hfi1: Remove unused variable IB/hfi1: Ensure VL index is within bounds IB/hfi1: Fix user context tail allocation for DMA_RTAIL IB/hns: Use zeroing memory allocator instead of allocator/memset infiniband: fix a possible use-after-free bug iw_cxgb4: add INFINIBAND_ADDR_TRANS dependency IB/isert: use T10-PI check mask definitions from core layer IB/iser: use T10-PI check mask definitions from core layer RDMA/core: introduce check masks for T10-PI offload IB/isert: fix T10-pi check mask setting IB/mlx5: Add counters read support IB/mlx5: Add flow counters read support IB/mlx5: Add flow counters binding support IB/mlx5: Add counters create and destroy support IB/uverbs: Add support for flow counters ...
2018-06-07Merge tag 'pci-v4.18-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pciLinus Torvalds1-16/+8
Pull PCI updates from Bjorn Helgaas: - unify AER decoding for native and ACPI CPER sources (Alexandru Gagniuc) - add TLP header info to AER tracepoint (Thomas Tai) - add generic pcie_wait_for_link() interface (Oza Pawandeep) - handle AER ERR_FATAL by removing and re-enumerating devices, as Downstream Port Containment does (Oza Pawandeep) - factor out common code between AER and DPC recovery (Oza Pawandeep) - stop triggering DPC for ERR_NONFATAL errors (Oza Pawandeep) - share ERR_FATAL recovery path between AER and DPC (Oza Pawandeep) - disable ASPM L1.2 substate if we don't have LTR (Bjorn Helgaas) - respect platform ownership of LTR (Bjorn Helgaas) - clear interrupt status in top half to avoid interrupt storm (Oza Pawandeep) - neaten pci=earlydump output (Andy Shevchenko) - avoid errors when extended config space inaccessible (Gilles Buloz) - prevent sysfs disable of device while driver attached (Christoph Hellwig) - use core interface to report PCIe link properties in bnx2x, bnxt_en, cxgb4, ixgbe (Bjorn Helgaas) - remove unused pcie_get_minimum_link() (Bjorn Helgaas) - fix use-before-set error in ibmphp (Dan Carpenter) - fix pciehp timeouts caused by Command Completed errata (Bjorn Helgaas) - fix refcounting in pnv_php hotplug (Julia Lawall) - clear pciehp Presence Detect and Data Link Layer Status Changed on resume so we don't miss hotplug events (Mika Westerberg) - only request pciehp control if we support it, so platform can use ACPI hotplug otherwise (Mika Westerberg) - convert SHPC to be builtin only (Mika Westerberg) - request SHPC control via _OSC if we support it (Mika Westerberg) - simplify SHPC handoff from firmware (Mika Westerberg) - fix an SHPC quirk that mistakenly included *all* AMD bridges as well as devices from any vendor with device ID 0x7458 (Bjorn Helgaas) - assign a bus number even to non-native hotplug bridges to leave space for acpiphp additions, to fix a common Thunderbolt xHCI hot-add failure (Mika Westerberg) - keep acpiphp from scanning native hotplug bridges, to fix common Thunderbolt hot-add failures (Mika Westerberg) - improve "partially hidden behind bridge" messages from core (Mika Westerberg) - add macros for PCIe Link Control 2 register (Frederick Lawler) - replace IB/hfi1 custom macros with PCI core versions (Frederick Lawler) - remove dead microblaze and xtensa code (Bjorn Helgaas) - use dev_printk() when possible in xtensa and mips (Bjorn Helgaas) - remove unused pcie_port_acpi_setup() and portdrv_acpi.c (Bjorn Helgaas) - add managed interface to get PCI host bridge resources from OF (Jan Kiszka) - add support for unbinding generic PCI host controller (Jan Kiszka) - fix memory leaks when unbinding generic PCI host controller (Jan Kiszka) - request legacy VGA framebuffer only for VGA devices to avoid false device conflicts (Bjorn Helgaas) - turn on PCI_COMMAND_IO & PCI_COMMAND_MEMORY in pci_enable_device() like everybody else, not in pcibios_fixup_bus() (Bjorn Helgaas) - add generic enable function for simple SR-IOV hardware (Alexander Duyck) - use generic SR-IOV enable for ena, nvme (Alexander Duyck) - add ACS quirk for Intel 7th & 8th Gen mobile (Alex Williamson) - add ACS quirk for Intel 300 series (Mika Westerberg) - enable register clock for Armada 7K/8K (Gregory CLEMENT) - reduce Keystone "link already up" log level (Fabio Estevam) - move private DT functions to drivers/pci/ (Rob Herring) - factor out dwc CONFIG_PCI Kconfig dependencies (Rob Herring) - add DesignWare support to the endpoint test driver (Gustavo Pimentel) - add DesignWare support for endpoint mode (Gustavo Pimentel) - use devm_ioremap_resource() instead of devm_ioremap() in dra7xx and artpec6 (Gustavo Pimentel) - fix Qualcomm bitwise NOT issue (Dan Carpenter) - add Qualcomm runtime PM support (Srinivas Kandagatla) - fix DesignWare enumeration below bridges (Koen Vandeputte) - use usleep() instead of mdelay() in endpoint test (Jia-Ju Bai) - add configfs entries for pci_epf_driver device IDs (Kishon Vijay Abraham I) - clean up pci_endpoint_test driver (Gustavo Pimentel) - update Layerscape maintainer email addresses (Minghuan Lian) - add COMPILE_TEST to improve build test coverage (Rob Herring) - fix Hyper-V bus registration failure caused by domain/serial number confusion (Sridhar Pitchai) - improve Hyper-V refcounting and coding style (Stephen Hemminger) - avoid potential Hyper-V hang waiting for a response that will never come (Dexuan Cui) - implement Mediatek chained IRQ handling (Honghui Zhang) - fix vendor ID & class type for Mediatek MT7622 (Honghui Zhang) - add Mobiveil PCIe host controller driver (Subrahmanya Lingappa) - add Mobiveil MSI support (Subrahmanya Lingappa) - clean up clocks, MSI, IRQ mappings in R-Car probe failure paths (Marek Vasut) - poll more frequently (5us vs 5ms) while waiting for R-Car data link active (Marek Vasut) - use generic OF parsing interface in R-Car (Vladimir Zapolskiy) - add R-Car V3H (R8A77980) "compatible" string (Sergei Shtylyov) - add R-Car gen3 PHY support (Sergei Shtylyov) - improve R-Car PHYRDY polling (Sergei Shtylyov) - clean up R-Car macros (Marek Vasut) - use runtime PM for R-Car controller clock (Dien Pham) - update arm64 defconfig for Rockchip (Shawn Lin) - refactor Rockchip code to facilitate both root port and endpoint mode (Shawn Lin) - add Rockchip endpoint mode driver (Shawn Lin) - support VMD "membar shadow" feature (Jon Derrick) - support VMD bus number offsets (Jon Derrick) - add VMD "no AER source ID" quirk for more device IDs (Jon Derrick) - remove unnecessary host controller CONFIG_PCIEPORTBUS Kconfig selections (Bjorn Helgaas) - clean up quirks.c organization and whitespace (Bjorn Helgaas) * tag 'pci-v4.18-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci: (144 commits) PCI/AER: Replace struct pcie_device with pci_dev PCI/AER: Remove unused parameters PCI: qcom: Include gpio/consumer.h PCI: Improve "partially hidden behind bridge" log message PCI: Improve pci_scan_bridge() and pci_scan_bridge_extend() doc PCI: Move resource distribution for single bridge outside loop PCI: Account for all bridges on bus when distributing bus numbers ACPI / hotplug / PCI: Drop unnecessary parentheses ACPI / hotplug / PCI: Mark stale PCI devices disconnected ACPI / hotplug / PCI: Don't scan bridges managed by native hotplug PCI: hotplug: Add hotplug_is_native() PCI: shpchp: Add shpchp_is_native() PCI: shpchp: Fix AMD POGO identification PCI: mobiveil: Add MSI support PCI: mobiveil: Add Mobiveil PCIe Host Bridge IP driver PCI/AER: Decode Error Source Requester ID PCI/AER: Remove aer_recover_work_func() forward declaration PCI/DPC: Use the generic pcie_do_fatal_recovery() path PCI/AER: Pass service type to pcie_do_fatal_recovery() PCI/DPC: Disable ERR_NONFATAL handling by DPC ...
2018-06-04IB/hfi1: Fix comment on default hdr entry sizeMike Marciniszyn1-2/+2
The comment for the default header queue entry size is incorrect. Correct the comment and fix the resulting S_IRUGO warning that shows up in the widened patch context. Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-06-04IB/hfi1: Rename exp_lock to exp_mutexKaike Wan3-8/+8
The mutex exp_lock in struct hfi1_ctxtdata is used to protect all Expected TID data of a user context. This patch renames it to exp_mutex to better reflect its identity and prepare for upcoming patches. Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Harish Chegondi <harish.chegondi@intel.com> Signed-off-by: Kaike Wan <kaike.wan@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-06-04IB/hfi1: Add bypass register defines and replace blind constantsMike Marciniszyn2-1/+9
These registers were not added in the 16B work. Add them and replace blind constants with the correct defines. Fixes: 72c07e2b671e ("IB/hfi1: Add support to receive 16B bypass packets") Reviewed-by: Don Hiatt <don.hiatt@intel.com> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-06-04IB/hfi1: Remove unused variableKaike Wan1-3/+1
The variable extended_psn was not used any more. Signed-off-by: Kaike Wan <kaike.wan@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-06-04IB/hfi1: Ensure VL index is within boundsKaike Wan1-9/+3
Improve the safety of the code and ensure the array cannot be indexed out of bounds when picking the CPU for a given SDMA engine. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Kaike Wan <kaike.wan@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-06-04IB/hfi1: Fix user context tail allocation for DMA_RTAILMike Marciniszyn3-10/+9
The following code fails to allocate a buffer for the tail address that the hardware DMAs into when the user context DMA_RTAIL is set. if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent( &dd->pcidev->dev, PAGE_SIZE, &dma_hdrqtail, gfp_flags); if (!rcd->rcvhdrtail_kvaddr) goto bail_free; rcd->rcvhdrqtailaddr_dma = dma_hdrqtail; } So the rcvhdrtail_kvaddr would then be NULL. The mmap logic fails to check for a NULL rcvhdrtail_kvaddr. The fix is to test for both user and kernel DMA_TAIL options during the allocation as well as testing for a NULL rcvhdrtail_kvaddr during the mmap processing. Additionally, all downstream testing of the capmask for DMA_RTAIL have been eliminated in favor of testing rcvhdrtail_kvaddr. Cc: <stable@vger.kernel.org> # 4.9.x Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-05-28Merge branch 'mr_fix' into git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma for-nextJason Gunthorpe1-0/+4
Update mlx4 to support user MR creation against read-only memory, previously it required the memory to be writable. Based on rdma for-rc due to dependencies. * mr_fix: (2 commits) IB/mlx4: Mark user MR as writable if actual virtual memory is writable IB/core: Make testing MR flags for writability a static inline function
2018-05-24IB/{rdmavt,hfi1}: Change hrtimer add to use pinned versionMike Marciniszyn1-1/+1
Given we are dealing with nano-second level timers, when the timer pops, ensure it happens on the CPU which caused the timer to be set in the first place. This avoids excessive jitter from the desired expiration time by avoiding the cost of switching our context to another CPU that is cache cold for this given timer. Reviewed-by: Kaike Wan <kaike.wan@intel.com> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-24IB/hfi1: Set port number for errorinfo MAD responseMichael J. Ruhl1-0/+1
For errorinfo MAD requests, the response has a 0 port number left over from a memset. Instead we should always set the port number in the response. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-24IB/hfi1: Cleanup of exp_rcvMike Marciniszyn4-25/+56
The knowledge of the internal workings of the expect receive is too distributed. Fix by: - right size several rcd fields associated with expect receive - making an init entrance to init all the lists - consolidate all the allocations into an array anchored in the rcd Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Reviewed-by: Kaike Wan <kaike.wan@intel.com> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-24IB/hfi1: Add 16B Management Packet trace supportDon Hiatt2-70/+130
Add trace support for 16B Management Packets. Reviewed-by: Ira Weiny <ira.weiny@intel.com> Signed-off-by: Don Hiatt <don.hiatt@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-24IB/hfi1: Add support for 16B Management PacketsDon Hiatt4-33/+110
16B Management Packets (L4=0x08) replace the BTH and DETH of normal MAD packet packets with a header containing the the source and destination queue pair numbers; fields that were originally retrieved from the BTH/DETH are now populated from this header as well as from the 16B LRH (e.g. pkey). 16B Management Packets are used as an optimized management format on 16B fabrics. These management packets have an opcode of IB_OPCODE_UD_SEND_ONLY, a fixed 3Byte pad, and a header length of 24Bytes. The decision as to when we send a management packet is based upon either the source or destination queue pair number being 0 or 1. Reviewed-by: Ira Weiny <ira.weiny@intel.com> Signed-off-by: Don Hiatt <don.hiatt@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-24IB/hfi1: Define 16B Management PacketsDon Hiatt2-0/+8
Add 16B Management Packet definition. This optimized packet format replaces the ib_other_headers and BTH with a source and destination QP number. To support these packets we introduce struct opa_16b_mgmt into the struct hfi1_16b_header. This packet format is only used for MAD packets using the IB_OPCODE_UD_SEND_ONLY opcode on QP0/1. The original 16B implementation failed to use 16B management packets so now we add their definition. Reviewed-by: Ira Weiny <ira.weiny@intel.com> Signed-off-by: Don Hiatt <don.hiatt@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-15RDMA/hfi1: Fix build error with debugfs disabledDoug Ledford1-15/+0
A recent patch set to rework the usage of debugfs and to add fault injection capabilities via debugfs files to the hfi1 driver introduced a build error that only shows up when debugfs is fully disabled. The patchset mistakenly defines some empty stub functions in two different headers when debugfs is disabled. Remove the set that shouldn't have been there to resolve the issue. Fixes: a74d5307caba ("IB/hfi1: Rework fault injection machinery") Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-09IB/{hfi1, qib, rdmavt}: Move logic to allocate receive WQE into rdmavtBrian Welty5-162/+10
Moving receive-side WQE allocation logic into rdmavt will allow further code reuse between qib and hfi1 drivers. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Brian Welty <brian.welty@intel.com> Signed-off-by: Harish Chegondi <harish.chegondi@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-09IB/{hfi1, rdmavt, qib}: Implement CQ completion vector supportSebastian Sanchez8-16/+444
Currently the driver doesn't support completion vectors. These are used to indicate which sets of CQs should be grouped together into the same vector. A vector is a CQ processing thread that runs on a specific CPU. If an application has several CQs bound to different completion vectors, and each completion vector runs on different CPUs, then the completion queue workload is balanced. This helps scale as more nodes are used. Implement CQ completion vector support using a global workqueue where a CQ entry is queued to the CPU corresponding to the CQ's completion vector. Since the workqueue is global, it's guaranteed to always be there when queueing CQ entries; Therefore, the RCU locking for cq->rdi->worker in the hot path is superfluous. Each completion vector is assigned to a different CPU. The number of completion vectors available is computed by taking the number of online, physical CPUs from the local NUMA node and subtracting the CPUs used for kernel receive queues and the general interrupt. Special use cases: * If there are no CPUs left for completion vectors, the same CPU for the general interrupt is used; Therefore, there would only be one completion vector available. * For multi-HFI systems, the number of completion vectors available for each device is the total number of completion vectors in the local NUMA node divided by the number of devices in the same NUMA node. If there's a division remainder, the first device to get initialized gets an extra completion vector. Upon a CQ creation, an invalid completion vector could be specified. Handle it as follows: * If the completion vector is less than 0, set it to 0. * Set the completion vector to the result of the passed completion vector moded with the number of device completion vectors available. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-09IB/hfi1: Create common functions for affinity CPU mask operationsSebastian Sanchez1-23/+60
CPU masks are used to keep track of affinity assignments for IRQs and processes. Operations performed on these affinity CPU masks are duplicated throughout the code. Create common functions for affinity CPU mask operations to remove duplicate code. Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-09IB/Hfi1: Read CCE Revision register to verify the device is responsiveKamenee Arumugam2-7/+8
When Hfi1 device is unresponsive, reading the RcvArrayCnt register will return all 1's. This value is then used to remap chip's RcvArray. The incorrect all ones value used in remapping RcvArray will cause warn on as shown by trace below: [<ffffffff81685eac>] dump_stack+0x19/0x1b [<ffffffff81085820>] warn_slowpath_common+0x70/0xb0 [<ffffffff810858bc>] warn_slowpath_fmt+0x5c/0x80 [<ffffffff81065c29>] __ioremap_caller+0x279/0x320 [<ffffffff8142873c>] ? _dev_info+0x6c/0x90 [<ffffffffa021d155>] ? hfi1_pcie_ddinit+0x1d5/0x330 [hfi1] [<ffffffff81065d62>] ioremap_wc+0x32/0x40 [<ffffffffa021d155>] hfi1_pcie_ddinit+0x1d5/0x330 [hfi1] [<ffffffffa0204851>] hfi1_init_dd+0x1d1/0x2440 [hfi1] [<ffffffff813503dc>] ? pci_write_config_word+0x1c/0x20 Read CCE revision register first to verify that WFR device is responsive. If the read return "all ones", bail out from init and fail the driver load. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Kamenee Arumugam <kamenee.arumugam@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-09IB/hfi1: Rework fault injection machineryMitko Haralanov10-358/+577
The packet fault injection code present in the HFI1 driver had some issues which not only fragment the code but also created user confusion. Furthermore, it suffered from the following issues: 1. The fault_packet method only worked for received packets. This meant that the only fault injection mode available for sent packets is fault_opcode, which did not allow for random packet drops on all egressing packets. 2. The mask available for the fault_opcode mode did not really work due to the fact that the opcode values are not bits in a bitmask but rather sequential integer values. Creating a opcode/mask pair that would successfully capture a set of packets was nearly impossible. 3. The code was fragmented and used too many debugfs entries to operate and control. This was confusing to users. 4. It did not allow filtering fault injection on a per direction basis - egress vs. ingress. In order to improve or fix the above issues, the following changes have been made: 1. The fault injection methods have been combined into a single fault injection facility. As such, the fault injection has been plugged into both the send and receive code paths. Regardless of method used the fault injection will operate on both egress and ingress packets. 2. The type of fault injection - by packet or by opcode - is now controlled by changing the boolean value of the file "opcode_mode". When the value is set to True, fault injection is done by opcode. Otherwise, by packet. 2. The masking ability has been removed in favor of a bitmap that holds opcodes of interest (one bit per opcode, a total of 256 bits). This works in tandem with the "opcode_mode" value. When the value of "opcode_mode" is False, this bitmap is ignored. When the value is True, the bitmap lists all opcodes to be considered for fault injection. By default, the bitmap is empty. When the user wants to filter by opcode, the user sets the corresponding bit in the bitmap by echo'ing the bit position into the 'opcodes' file. This gets around the issue that the set of opcodes does not lend itself to effective masks and allow for extremely fine-grained filtering by opcode. 4. fault_packet and fault_opcode methods have been combined. Hence, there is only one debugfs directory controlling the entire operation of the fault injection machinery. This reduces the number of debugfs entries and provides a more unified user experience. 5. A new control files - "direction" - is provided to allow the user to control the direction of packets, which are subject to fault injection. 6. A new control file - "skip_usec" - is added that would allow the user to specify a "timeout" during which no fault injection will occur. In addition, the following bug fixes have been applied: 1. The fault injection code has been split into its own header and source files. This was done to better organize the code and support conditional compilation without littering the code with #ifdef's. 2. The method by which the TX PIO packets were being marked for drop conflicted with the way send contexts were being setup. As a result, the send context was repeatedly being reset. 3. The fault injection only makes sense when the user can control it through the debugfs entries. However, a kernel configuration can enable fault injection but keep fault injection debugfs entries disabled. Therefore, it makes sense that the HFI fault injection code depends on both. 4. Error suppression did not take into account the method by which PIO packets were being dropped. Therefore, even with error suppression turned on, errors would still be displayed to the screen. A larger enough packet drop percentage would case the kernel to crash because the driver would be stuck printing errors. Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Reviewed-by: Don Hiatt <don.hiatt@intel.com> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-09IB/{hfi1, qib}: Add handling of kernel restartAlex Estrin2-0/+14
A warm restart will fail to unload the driver, leaving link state potentially flapping up to the point the BIOS resets the adapter. Correct the issue by hooking the shutdown pci method, which will bring port down. Cc: <stable@vger.kernel.org> # 4.9.x Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Alex Estrin <alex.estrin@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-09IB/hfi1: Reorder incorrect send context disableMichael J. Ruhl2-11/+35
User send context integrity bits are cleared before the context is disabled. If the send context is still processing data, any packets that need those integrity bits will cause an error and halt the send context. During the disable handling, the driver waits for the context to drain. If the context is halted, the driver will eventually timeout because the context won't drain and then incorrectly bounce the link. Reorder the bit clearing and the context disable. Examine the software state and send context status as well as the egress status to determine if a send context is in the halted state. Promote the check macros to static functions for consistency with the new check and to follow kernel style. Remove an unused define that refers to the egress timeout. Cc: <stable@vger.kernel.org> # 4.9.x Reviewed-by: Mitko Haralanov <mitko.haralanov@intel.com> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-09IB/hfi1: Return correct value for device stateMichael J. Ruhl1-2/+2
The driver_pstate() function is used to map internal driver state information to externally defined states. The VERIFY_CAP and GOING_UP states are config/training states, but the mapping routing returns the POLLING value. Update the return values for VERIFY_CAP and GOING_UP to return the correct value: TRAINING. Reviewed-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-09IB/hfi1: Fix fault injection init/exit issuesMike Marciniszyn1-2/+6
There are config dependent code paths that expose panics in unload paths both in this file and in debugfs_remove_recursive() because CONFIG_FAULT_INJECTION and CONFIG_FAULT_INJECTION_DEBUG_FS can be set independently. Having CONFIG_FAULT_INJECTION set and CONFIG_FAULT_INJECTION_DEBUG_FS reset causes fault_create_debugfs_attr() to return an error. The debugfs.c routines tolerate failures, but the module unload panics dereferencing a NULL in the two exit routines. If that is fixed, the dir passed to debugfs_remove_recursive comes from a memory location that was freed and potentially reused causing a segfault or corrupting memory. Here is an example of the NULL deref panic: [66866.286829] BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 [66866.295602] IP: hfi1_dbg_ibdev_exit+0x2a/0x80 [hfi1] [66866.301138] PGD 858496067 P4D 858496067 PUD 8433a7067 PMD 0 [66866.307452] Oops: 0000 [#1] SMP [66866.310953] Modules linked in: hfi1(-) rdmavt rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm iw_cm ib_cm ib_core rpcsec_gss_krb5 nfsv4 dns_resolver nfsv3 nfs fscache sb_edac x86_pkg_temp_thermal intel_powerclamp vfat fat coretemp kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel iTCO_wdt iTCO_vendor_support crypto_simd mei_me glue_helper cryptd mxm_wmi ipmi_si pcspkr lpc_ich sg mei ioatdma ipmi_devintf i2c_i801 mfd_core shpchp ipmi_msghandler wmi acpi_power_meter acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables ext4 mbcache jbd2 sd_mod mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt igb fb_sys_fops ttm ahci ptp crc32c_intel libahci pps_core drm dca libata i2c_algo_bit i2c_core [last unloaded: opa_vnic] [66866.385551] CPU: 8 PID: 7470 Comm: rmmod Not tainted 4.14.0-mam-tid-rdma #2 [66866.393317] Hardware name: Intel Corporation S2600WT2/S2600WT2, BIOS SE5C610.86B.01.01.0018.C4.072020161249 07/20/2016 [66866.405252] task: ffff88084f28c380 task.stack: ffffc90008454000 [66866.411866] RIP: 0010:hfi1_dbg_ibdev_exit+0x2a/0x80 [hfi1] [66866.417984] RSP: 0018:ffffc90008457da0 EFLAGS: 00010202 [66866.423812] RAX: 0000000000000000 RBX: ffff880857de0000 RCX: 0000000180040001 [66866.431773] RDX: 0000000180040002 RSI: ffffea0021088200 RDI: 0000000040000000 [66866.439734] RBP: ffffc90008457da8 R08: ffff88084220e000 R09: 0000000180040001 [66866.447696] R10: 000000004220e001 R11: ffff88084220e000 R12: ffff88085a31c000 [66866.455657] R13: ffffffffa07c9820 R14: ffffffffa07c9890 R15: ffff881059d78100 [66866.463618] FS: 00007f6876047740(0000) GS:ffff88085f800000(0000) knlGS:0000000000000000 [66866.472644] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [66866.479053] CR2: 0000000000000088 CR3: 0000000856357006 CR4: 00000000001606e0 [66866.487013] Call Trace: [66866.489747] remove_one+0x1f/0x220 [hfi1] [66866.494221] pci_device_remove+0x39/0xc0 [66866.498596] device_release_driver_internal+0x141/0x210 [66866.504424] driver_detach+0x3f/0x80 [66866.508409] bus_remove_driver+0x55/0xd0 [66866.512784] driver_unregister+0x2c/0x50 [66866.517164] pci_unregister_driver+0x2a/0xa0 [66866.521934] hfi1_mod_cleanup+0x10/0xaa2 [hfi1] [66866.526988] SyS_delete_module+0x171/0x250 [66866.531558] do_syscall_64+0x67/0x1b0 [66866.535644] entry_SYSCALL64_slow_path+0x25/0x25 [66866.540792] RIP: 0033:0x7f6875525c27 [66866.544777] RSP: 002b:00007ffd48528e78 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 [66866.553224] RAX: ffffffffffffffda RBX: 0000000001cc01d0 RCX: 00007f6875525c27 [66866.561185] RDX: 00007f6875596000 RSI: 0000000000000800 RDI: 0000000001cc0238 [66866.569146] RBP: 0000000000000000 R08: 00007f68757e9060 R09: 00007f6875596000 [66866.577120] R10: 00007ffd48528c00 R11: 0000000000000206 R12: 00007ffd48529db4 [66866.585080] R13: 0000000000000000 R14: 0000000001cc01d0 R15: 0000000001cc0010 [66866.593040] Code: 90 0f 1f 44 00 00 48 83 3d a3 8b 03 00 00 55 48 89 e5 53 48 89 fb 74 4e 48 8d bf 18 0c 00 00 e8 9d f2 ff ff 48 8b 83 20 0c 00 00 <48> 8b b8 88 00 00 00 e8 2a 21 b3 e0 48 8b bb 20 0c 00 00 e8 0e [66866.614127] RIP: hfi1_dbg_ibdev_exit+0x2a/0x80 [hfi1] RSP: ffffc90008457da0 [66866.621885] CR2: 0000000000000088 [66866.625618] ---[ end trace c4817425783fb092 ]--- Fix by insuring that upon failure from fault_create_debugfs_attr() the parent pointer for the routines is always set to NULL and guards added in the exit routines to insure that debugfs_remove_recursive() is not called when when the parent pointer is NULL. Fixes: 0181ce31b260 ("IB/hfi1: Add receive fault injection feature") Cc: <stable@vger.kernel.org> # 4.14.x Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-09IB/hfi1: Complete check for locally terminated smpAlex Estrin1-16/+20
For lid routed packets 'hop_cnt' is zero, therefore current test is incomplete. Fix it by using local mad check for both lid routed and direct routed MADs. Reviewed-by: Mike Mariciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Alex Estrin <alex.estrin@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-09IB/hfi1: Return actual error value from program_rcvarray()Michael J. Ruhl1-1/+0
A failure of program_rcvarray() is treated inconsistently by the calling function. In one case the error is returned, in a second case, the error is overwritten with EFAULT. In both cases the code path is doing the same thing, allocating memory for groups, so it should be consistent. Make the error path consistent and return the error generated by program_rcvarray(). Reviewed-by: Harish Chegondi <harish.chegondi@intel.com> Fixes: 7e7a436ecb6e ("staging/hfi1: Add TID entry program function body") Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-09IB/hfi1: Prevent LNI hang when LCB can't obtain lanesSebastian Sanchez3-20/+53
When the LCB isn't able to get any lanes operational on the first transition into mission mode, the link transfer active never happens and the LNI stays in the polling state indefinitely. Reset LCB upon receiving an 8051 interrupt for LCB to try to obtain lanes with firmware version 1.25.0 or later. Also, update the LCB reset value in other parts of the code with a macro defined to make the code more maintainable and rename functions with the link_width label to link_mode to reflect the fact that those functions set and read link related data not just the link width. Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-09Merge branch 'k.o/for-rc' into k.o/wip/dl-for-nextDoug Ledford9-41/+100
Several items of conflict have arisen between the RDMA stack's for-rc branch and upcoming for-next work: 9fd4350ba895 ("IB/rxe: avoid double kfree_skb") directly conflicts with 2e47350789eb ("IB/rxe: optimize the function duplicate_request") Patches already submitted by Intel for the hfi1 driver will fail to apply cleanly without this merge Other people on the mailing list have notified that their upcoming patches also fail to apply cleanly without this merge Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-09IB/hfi1: Use after free race condition in send context error pathMichael J. Ruhl1-0/+4
A pio send egress error can occur when the PSM library attempts to to send a bad packet. That issue is still being investigated. The pio error interrupt handler then attempts to progress the recovery of the errored pio send context. Code inspection reveals that the handling lacks the necessary locking if that recovery interleaves with a PSM close of the "context" object contains the pio send context. The lack of the locking can cause the recovery to access the already freed pio send context object and incorrectly deduce that the pio send context is actually a kernel pio send context as shown by the NULL deref stack below: [<ffffffff8143d78c>] _dev_info+0x6c/0x90 [<ffffffffc0613230>] sc_restart+0x70/0x1f0 [hfi1] [<ffffffff816ab124>] ? __schedule+0x424/0x9b0 [<ffffffffc06133c5>] sc_halted+0x15/0x20 [hfi1] [<ffffffff810aa3ba>] process_one_work+0x17a/0x440 [<ffffffff810ab086>] worker_thread+0x126/0x3c0 [<ffffffff810aaf60>] ? manage_workers.isra.24+0x2a0/0x2a0 [<ffffffff810b252f>] kthread+0xcf/0xe0 [<ffffffff810b2460>] ? insert_kthread_work+0x40/0x40 [<ffffffff816b8798>] ret_from_fork+0x58/0x90 [<ffffffff810b2460>] ? insert_kthread_work+0x40/0x40 This is the best case scenario and other scenarios can corrupt the already freed memory. Fix by adding the necessary locking in the pio send context error handler. Cc: <stable@vger.kernel.org> # 4.9.x Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-03IB/hfi1: Fix memory leak in exception path in get_irq_affinity()Sebastian Sanchez1-6/+5
When IRQ affinity is set and the interrupt type is unknown, a cpu mask allocated within the function is never freed. Fix this memory leak by allocating memory within the scope where it is used. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-03IB/{hfi1, rdmavt}: Fix memory leak in hfi1_alloc_devdata() upon failureSebastian Sanchez3-10/+30
When allocating device data, if there's an allocation failure, the already allocated memory won't be freed such as per-cpu counters. Fix memory leaks in exception path by creating a common reentrant clean up function hfi1_clean_devdata() to be used at driver unload time and device data allocation failure. To accomplish this, free_platform_config() and clean_up_i2c() are changed to be reentrant to remove dependencies when they are called in different order. This helps avoid NULL pointer dereferences introduced by this patch if those two functions weren't reentrant. In addition, set dd->int_counter, dd->rcv_limit, dd->send_schedule and dd->tx_opstats to NULL after they're freed in hfi1_clean_devdata(), so that hfi1_clean_devdata() is fully reentrant. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-03IB/hfi1: Fix NULL pointer dereference when invalid num_vls is usedSebastian Sanchez2-3/+2
When an invalid num_vls is used as a module parameter, the code execution follows an exception path where the macro dd_dev_err() expects dd->pcidev->dev not to be NULL in hfi1_init_dd(). This causes a NULL pointer dereference. Fix hfi1_init_dd() by initializing dd->pcidev and dd->pcidev->dev earlier in the code. If a dd exists, then dd->pcidev and dd->pcidev->dev always exists. BUG: unable to handle kernel NULL pointer dereference at 00000000000000f0 IP: __dev_printk+0x15/0x90 Workqueue: events work_for_cpu_fn RIP: 0010:__dev_printk+0x15/0x90 Call Trace: dev_err+0x6c/0x90 ? hfi1_init_pportdata+0x38d/0x3f0 [hfi1] hfi1_init_dd+0xdd/0x2530 [hfi1] ? pci_conf1_read+0xb2/0xf0 ? pci_read_config_word.part.9+0x64/0x80 ? pci_conf1_write+0xb0/0xf0 ? pcie_capability_clear_and_set_word+0x57/0x80 init_one+0x141/0x490 [hfi1] local_pci_probe+0x3f/0xa0 work_for_cpu_fn+0x10/0x20 process_one_work+0x152/0x350 worker_thread+0x1cf/0x3e0 kthread+0xf5/0x130 ? max_active_store+0x80/0x80 ? kthread_bind+0x10/0x10 ? do_syscall_64+0x6e/0x1a0 ? SyS_exit_group+0x10/0x10 ret_from_fork+0x35/0x40 Cc: <stable@vger.kernel.org> # 4.9.x Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-03IB/hfi1: Fix loss of BECN with AHGMike Marciniszyn1-10/+40
AHG may be armed to use the stored header, which by design is limited to edits in the PSN/A 32 bit word (bth2). When the code is trying to send a BECN, the use of the stored header will lose the BECN bit. Fix by avoiding AHG when getting ready to send a BECN. This is accomplished by always claiming the packet is not a middle packet which is an AHG precursor. BECNs are not a normal case and this should not hurt AHG optimizations. Cc: <stable@vger.kernel.org> # 4.14.x Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-03IB/hfi1 Use correct type for num_user_contextMichael J. Ruhl1-2/+2
The module parameter num_user_context is defined as 'int' and defaults to -1. The module_param_named() says that it is uint. Correct module_param_named() type information and update the modinfo text to reflect the default value. Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-05-03IB/hfi1: Fix handling of FECN marked multicast packetMike Marciniszyn3-10/+21
The code for handling a marked UD packet unconditionally returns the dlid in the header of the FECN marked packet. This is not correct for multicast packets where the DLID is in the multicast range. The subsequent attempt to send the CNP with the multicast lid will cause the chip to halt the ack send context because the source lid doesn't match the chip programming. The send context will be halted and flush any other pending packets in the pio ring causing the CNP to not be sent. A part of investigating the fix, it was determined that the 16B work broke the FECN routine badly with inconsistent use of 16 bit and 32 bits types for lids and pkeys. Since the port's source lid was correctly 32 bits the type mixmatches need to be dealt with at the same time as fixing the CNP header issue. Fix these issues by: - Using the ports lid for as the SLID for responding to FECN marked UD packets - Insure pkey is always 16 bit in this and subordinate routines - Insure lids are 32 bits in this and subordinate routines Cc: <stable@vger.kernel.org> # 4.14.x Fixes: 88733e3b8450 ("IB/hfi1: Add 16B UD support") Reviewed-by: Don Hiatt <don.hiatt@intel.com> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-04-27IB/hfi1: Replace custom hfi1 macros with PCIe macrosFrederick Lawler1-16/+8
IB/hfi1 contains custom macros for PCIe link configuration. Remove the custom macros in favor of the PCIe link macros. No functional change intended. Signed-off-by: Frederick Lawler <fred@fredlawl.com> [bhelgaas: use "GT" instead of "GB"] Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
2018-04-27infiniband: hw: hfi1: Change return type to vm_fault_tSouptick Joarder1-2/+2
Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Reference id -> 1c8f422059ae ("mm: change return type to vm_fault_t") Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-04-06Merge tag 'for-linus-unmerged' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdmaLinus Torvalds5-5/+4
Pull rdma updates from Jason Gunthorpe: "Doug and I are at a conference next week so if another PR is sent I expect it to only be bug fixes. Parav noted yesterday that there are some fringe case behavior changes in his work that he would like to fix, and I see that Intel has a number of rc looking patches for HFI1 they posted yesterday. Parav is again the biggest contributor by patch count with his ongoing work to enable container support in the RDMA stack, followed by Leon doing syzkaller inspired cleanups, though most of the actual fixing went to RC. There is one uncomfortable series here fixing the user ABI to actually work as intended in 32 bit mode. There are lots of notes in the commit messages, but the basic summary is we don't think there is an actual 32 bit kernel user of drivers/infiniband for several good reasons. However we are seeing people want to use a 32 bit user space with 64 bit kernel, which didn't completely work today. So in fixing it we required a 32 bit rxe user to upgrade their userspace. rxe users are still already quite rare and we think a 32 bit one is non-existing. - Fix RDMA uapi headers to actually compile in userspace and be more complete - Three shared with netdev pull requests from Mellanox: * 7 patches, mostly to net with 1 IB related one at the back). This series addresses an IRQ performance issue (patch 1), cleanups related to the fix for the IRQ performance problem (patches 2-6), and then extends the fragmented completion queue support that already exists in the net side of the driver to the ib side of the driver (patch 7). * Mostly IB, with 5 patches to net that are needed to support the remaining 10 patches to the IB subsystem. This series extends the current 'representor' framework when the mlx5 driver is in switchdev mode from being a netdev only construct to being a netdev/IB dev construct. The IB dev is limited to raw Eth queue pairs only, but by having an IB dev of this type attached to the representor for a switchdev port, it enables DPDK to work on the switchdev device. * All net related, but needed as infrastructure for the rdma driver - Updates for the hns, i40iw, bnxt_re, cxgb3, cxgb4, hns drivers - SRP performance updates - IB uverbs write path cleanup patch series from Leon - Add RDMA_CM support to ib_srpt. This is disabled by default. Users need to set the port for ib_srpt to listen on in configfs in order for it to be enabled (/sys/kernel/config/target/srpt/discovery_auth/rdma_cm_port) - TSO and Scatter FCS support in mlx4 - Refactor of modify_qp routine to resolve problems seen while working on new code that is forthcoming - More refactoring and updates of RDMA CM for containers support from Parav - mlx5 'fine grained packet pacing', 'ipsec offload' and 'device memory' user API features - Infrastructure updates for the new IOCTL interface, based on increased usage - ABI compatibility bug fixes to fully support 32 bit userspace on 64 bit kernel as was originally intended. See the commit messages for extensive details - Syzkaller bugs and code cleanups motivated by them" * tag 'for-linus-unmerged' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (199 commits) IB/rxe: Fix for oops in rxe_register_device on ppc64le arch IB/mlx5: Device memory mr registration support net/mlx5: Mkey creation command adjustments IB/mlx5: Device memory support in mlx5_ib net/mlx5: Query device memory capabilities IB/uverbs: Add device memory registration ioctl support IB/uverbs: Add alloc/free dm uverbs ioctl support IB/uverbs: Add device memory capabilities reporting IB/uverbs: Expose device memory capabilities to user RDMA/qedr: Fix wmb usage in qedr IB/rxe: Removed GID add/del dummy routines RDMA/qedr: Zero stack memory before copying to user space IB/mlx5: Add ability to hash by IPSEC_SPI when creating a TIR IB/mlx5: Add information for querying IPsec capabilities IB/mlx5: Add IPsec support for egress and ingress {net,IB}/mlx5: Add ipsec helper IB/mlx5: Add modify_flow_action_esp verb IB/mlx5: Add implementation for create and destroy action_xfrm IB/uverbs: Introduce ESP steering match filter IB/uverbs: Add modify ESP flow_action ...
2018-03-29RDMA: Use u64_to_user_ptr everywhereJason Gunthorpe1-2/+2
This is already used in many places, get the rest of them too, only to make the code a bit clearer & simpler. Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-03-28treewide: remove large struct-pass-by-value from tracepoint argumentsAlexei Starovoitov2-7/+7
- fix trace_hfi1_ctxt_info() to pass large struct by reference instead of by value - convert 'type array[]' tracepoint arguments into 'type *array', since compiler will warn that sizeof('type array[]') == sizeof('type *array') and later should be used instead The CAST_TO_U64 macro in the later patch will enforce that tracepoint arguments can only be integers, pointers, or less than 8 byte structures. Larger structures should be passed by reference. Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-03-19IB/uverbs: Extend uverbs_ioctl header with driver_idMatan Barak1-1/+1
Extending uverbs_ioctl header with driver_id and another reserved field. driver_id should be used in order to identify the driver. Since every driver could have its own parsing tree, this is necessary for strace support. Downstream patches take off the EXPERIMENTAL flag from the ioctl() IB support and thus we add some reserved fields for future usage. Reviewed-by: Yishai Hadas <yishaih@mellanox.com> Signed-off-by: Matan Barak <matanb@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-03-13IB: remove duplicate header filesZhu Yanjun1-1/+0
In hfi.h, the header file opa_addr.h is included twice. In vt.h, the header file mmap.h is included twice. Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com> Acked-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-03-13IB/hfi1: Fix a kernel-doc warningBart Van Assche1-1/+0
Avoid that building with W=1 causes the following warning to appear: drivers/infiniband/hw/hfi1/qp.c:484: warning: Cannot understand * on line 484 - I thought it was a doc line Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com> Cc: Mike Marciniszyn <mike.marciniszyn@intel.com> Cc: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
2018-03-06IB/hfi1: Add a missing rcu_read_unlock()Bart Van Assche1-0/+1
This patch avoids that sparse reports the following: drivers/infiniband/hw/hfi1/driver.c:251:13: warning: context imbalance in 'rcv_hdrerr' - different lock contexts for basic block Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com> Cc: Mike Marciniszyn <mike.marciniszyn@intel.com> Cc: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-02-11vfs: do bulk POLL* -> EPOLL* replacementLinus Torvalds1-4/+4
This is the mindless scripted replacement of kernel use of POLL* variables as described by Al, done by this script: for V in IN OUT PRI ERR RDNORM RDBAND WRNORM WRBAND HUP RDHUP NVAL MSG; do L=`git grep -l -w POLL$V | grep -v '^t' | grep -v /um/ | grep -v '^sa' | grep -v '/poll.h$'|grep -v '^D'` for f in $L; do sed -i "-es/^\([^\"]*\)\(\<POLL$V\>\)/\\1E\\2/" $f; done done with de-mangling cleanups yet to come. NOTE! On almost all architectures, the EPOLL* constants have the same values as the POLL* constants do. But they keyword here is "almost". For various bad reasons they aren't the same, and epoll() doesn't actually work quite correctly in some cases due to this on Sparc et al. The next patch from Al will sort out the final differences, and we should be all done. Scripted-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-02-06Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdmaLinus Torvalds26-234/+469
Pull more rdma updates from Doug Ledford: "Items of note: - two patches fix a regression in the 4.15 kernel. The 4.14 kernel worked fine with NVMe over Fabrics and mlx5 adapters. That broke in 4.15. The fix is here. - one of the patches (the endian notation patch from Lijun) looks like a lot of lines of change, but it's mostly mechanical in nature. It amounts to the biggest chunk of change in it (it's about 2/3rds of the overall pull request). Summary: - Clean up some function signatures in rxe for clarity - Tidy the RDMA netlink header to remove unimplemented constants - bnxt_re driver fixes, one is a regression this window. - Minor hns driver fixes - Various fixes from Dan Carpenter and his tool - Fix IRQ cleanup race in HFI1 - HF1 performance optimizations and a fix to report counters in the right units - Fix for an IPoIB startup sequence race with the external manager - Oops fix for the new kabi path - Endian cleanups for hns - Fix for mlx5 related to the new automatic affinity support" * tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (38 commits) net/mlx5: increase async EQ to avoid EQ overrun mlx5: fix mlx5_get_vector_affinity to start from completion vector 0 RDMA/hns: Fix the endian problem for hns IB/uverbs: Use the standard kConfig format for experimental IB: Update references to libibverbs IB/hfi1: Add 16B rcvhdr trace support IB/hfi1: Convert kzalloc_node and kcalloc to use kcalloc_node IB/core: Avoid a potential OOPs for an unused optional parameter IB/core: Map iWarp AH type to undefined in rdma_ah_find_type IB/ipoib: Fix for potential no-carrier state IB/hfi1: Show fault stats in both TX and RX directions IB/hfi1: Remove blind constants from 16B update IB/hfi1: Convert PortXmitWait/PortVLXmitWait counters to flit times IB/hfi1: Do not override given pcie_pset value IB/hfi1: Optimize process_receive_ib() IB/hfi1: Remove unnecessary fecn and becn fields IB/hfi1: Look up ibport using a pointer in receive path IB/hfi1: Optimize packet type comparison using 9B and bypass code paths IB/hfi1: Compute BTH only for RDMA_WRITE_LAST/SEND_LAST packet IB/hfi1: Remove dependence on qp->s_hdrwords ...
2018-02-01IB/hfi1: Add 16B rcvhdr trace supportDon Hiatt2-4/+6
Add trace_hfi1_rcvhdr support for bypass packets. While here, remove the etype argument as it is available in struct hfi1_packet. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Don Hiatt <don.hiatt@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>