From b03e7495a862b028294f59fc87286d6d78ee7fa1 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Wed, 20 Jul 2011 15:20:54 -0500 Subject: PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sent by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performance of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. 
In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) (limited to 'drivers/pci/pci.c') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 08a95b369d85..466fad6e6ee2 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -77,6 +77,8 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE; unsigned long pci_hotplug_io_size = DEFAULT_HOTPLUG_IO_SIZE; unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE; +enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PERFORMANCE; + /* * The default CLS is used if arch didn't set CLS explicitly and not * all pci devices agree on the same value. Arch can override either @@ -3222,6 +3224,67 @@ out: } EXPORT_SYMBOL(pcie_set_readrq); +/** + * pcie_get_mps - get PCI Express maximum payload size + * @dev: PCI device to query + * + * Returns maximum payload size in bytes + * or appropriate error value. 
+ */ +int pcie_get_mps(struct pci_dev *dev) +{ + int ret, cap; + u16 ctl; + + cap = pci_pcie_cap(dev); + if (!cap) + return -EINVAL; + + ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl); + if (!ret) + ret = 128 << ((ctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5); + + return ret; +} + +/** + * pcie_set_mps - set PCI Express maximum payload size + * @dev: PCI device to query + * @rq: maximum payload size in bytes + * valid values are 128, 256, 512, 1024, 2048, 4096 + * + * If possible sets maximum payload size + */ +int pcie_set_mps(struct pci_dev *dev, int mps) +{ + int cap, err = -EINVAL; + u16 ctl, v; + + if (mps < 128 || mps > 4096 || !is_power_of_2(mps)) + goto out; + + v = ffs(mps) - 8; + if (v > dev->pcie_mpss) + goto out; + v <<= 5; + + cap = pci_pcie_cap(dev); + if (!cap) + goto out; + + err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl); + if (err) + goto out; + + if ((ctl & PCI_EXP_DEVCTL_PAYLOAD) != v) { + ctl &= ~PCI_EXP_DEVCTL_PAYLOAD; + ctl |= v; + err = pci_write_config_word(dev, cap + PCI_EXP_DEVCTL, ctl); + } +out: + return err; +} + /** * pci_select_bars - Make BAR mask from the type of resource * @dev: the PCI device for which BAR mask is made @@ -3505,6 +3568,10 @@ static int __init pci_setup(char *str) pci_hotplug_io_size = memparse(str + 9, &str); } else if (!strncmp(str, "hpmemsize=", 10)) { pci_hotplug_mem_size = memparse(str + 10, &str); + } else if (!strncmp(str, "pcie_bus_safe", 13)) { + pcie_bus_config = PCIE_BUS_SAFE; + } else if (!strncmp(str, "pcie_bus_perf", 13)) { + pcie_bus_config = PCIE_BUS_PERFORMANCE; } else { printk(KERN_ERR "PCI: Unknown option `%s'\n", str); -- cgit v1.2.3-59-g8ed1b From 47c08f3107270e5a439bc0106a308f7c48c9621d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 20 Aug 2011 11:49:43 -0700 Subject: pci: fix new kernel-doc warning in pci.c Fix new kernel-doc warning in pci.c: Warning(drivers/pci/pci.c:3259): No description found for parameter 'mps' Warning(drivers/pci/pci.c:3259): Excess function 
parameter 'rq' description in 'pcie_set_mps' Signed-off-by: Randy Dunlap Cc: Jesse Barnes Signed-off-by: Linus Torvalds --- drivers/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/pci/pci.c') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 466fad6e6ee2..0ce67423a0a3 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3250,7 +3250,7 @@ int pcie_get_mps(struct pci_dev *dev) /** * pcie_set_mps - set PCI Express maximum payload size * @dev: PCI device to query - * @rq: maximum payload size in bytes + * @mps: maximum payload size in bytes * valid values are 128, 256, 512, 1024, 2048, 4096 * * If possible sets maximum payload size -- cgit v1.2.3-59-g8ed1b From ed2888e906b56769b4ffabb9c577190438aa68b8 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Thu, 8 Sep 2011 16:41:18 -0500 Subject: PCI: Remove MRRS modification from MPS setting code Modifying the Maximum Read Request Size to 0 (value of 128Bytes) has massive negative ramifications on some devices. Without knowing which devices have this issue, do not modify from the default value when walking the PCI-E bus in pcie_bus_safe mode. Also, make pcie_bus_safe the default procedure. Tested-by: Sven Schnelle Tested-by: Simon Kirby Tested-by: Stephen M. 
Cameron Reported-and-tested-by: Eric Dumazet Reported-and-tested-by: Niels Ole Salscheider References: https://bugzilla.kernel.org/show_bug.cgi?id=42162 Signed-off-by: Jon Mason Acked-by: Jesse Barnes Signed-off-by: Linus Torvalds --- drivers/pci/pci.c | 2 +- drivers/pci/probe.c | 41 ++++++++++++++++++++++------------------- 2 files changed, 23 insertions(+), 20 deletions(-) (limited to 'drivers/pci/pci.c') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 0ce67423a0a3..4e84fd4a4312 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -77,7 +77,7 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE; unsigned long pci_hotplug_io_size = DEFAULT_HOTPLUG_IO_SIZE; unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE; -enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PERFORMANCE; +enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_SAFE; /* * The default CLS is used if arch didn't set CLS explicitly and not diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 0820fc1544e8..b1187ff31d89 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1396,34 +1396,37 @@ static void pcie_write_mps(struct pci_dev *dev, int mps) static void pcie_write_mrrs(struct pci_dev *dev, int mps) { - int rc, mrrs; + int rc, mrrs, dev_mpss; - if (pcie_bus_config == PCIE_BUS_PERFORMANCE) { - int dev_mpss = 128 << dev->pcie_mpss; + /* In the "safe" case, do not configure the MRRS. There appear to be + * issues with setting MRRS to 0 on a number of devices. + */ - /* For Max performance, the MRRS must be set to the largest - * supported value. However, it cannot be configured larger - * than the MPS the device or the bus can support. This assumes - * that the largest MRRS available on the device cannot be - * smaller than the device MPSS. - */ - mrrs = mps < dev_mpss ? 
mps : dev_mpss; - } else - /* In the "safe" case, configure the MRRS for fairness on the - * bus by making all devices have the same size - */ - mrrs = mps; + if (pcie_bus_config != PCIE_BUS_PERFORMANCE) + return; + + dev_mpss = 128 << dev->pcie_mpss; + /* For Max performance, the MRRS must be set to the largest supported + * value. However, it cannot be configured larger than the MPS the + * device or the bus can support. This assumes that the largest MRRS + * available on the device cannot be smaller than the device MPSS. + */ + mrrs = min(mps, dev_mpss); /* MRRS is a R/W register. Invalid values can be written, but a - * subsiquent read will verify if the value is acceptable or not. + * subsequent read will verify if the value is acceptable or not. * If the MRRS value provided is not acceptable (e.g., too large), * shrink the value until it is acceptable to the HW. */ while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) { + dev_warn(&dev->dev, "Attempting to modify the PCI-E MRRS value" + " to %d. If any issues are encountered, please try " + "running with pci=pcie_bus_safe\n", mrrs); rc = pcie_set_readrq(dev, mrrs); if (rc) - dev_err(&dev->dev, "Failed attempting to set the MRRS\n"); + dev_err(&dev->dev, + "Failed attempting to set the MRRS\n"); mrrs /= 2; } @@ -1436,13 +1439,13 @@ static int pcie_bus_configure_set(struct pci_dev *dev, void *data) if (!pci_is_pcie(dev)) return 0; - dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n", + dev_dbg(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n", pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev)); pcie_write_mps(dev, mps); pcie_write_mrrs(dev, mps); - dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n", + dev_dbg(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n", pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev)); return 0; -- cgit v1.2.3-59-g8ed1b