aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/drivers/pci/pci.c
diff options
context:
space:
mode:
author: Bjorn Helgaas <bhelgaas@google.com> 2019-11-28 08:54:35 -0600
committer: Bjorn Helgaas <bhelgaas@google.com> 2019-11-28 08:54:35 -0600
commit: 7cfe16393c3c9fed45545b234b852e1154c7cc5b (patch)
tree: 417dc942f05a9e1669ecdc107c5ff8abcff94527 /drivers/pci/pci.c
parent: Merge branch 'pci/msi' (diff)
parent: PCI/PM: Move pci_dev_wait() definition earlier (diff)
download: wireguard-linux-7cfe16393c3c9fed45545b234b852e1154c7cc5b.tar.xz
download: wireguard-linux-7cfe16393c3c9fed45545b234b852e1154c7cc5b.zip
Merge branch 'pci/pm'

  - Always return devices to D0 when thawing to fix hibernation with
    drivers like mlx4 that used legacy power management (previously we
    only did it for drivers with new power management ops) (Dexuan Cui)

  - Clear PCIe PME Status even for legacy power management (Bjorn Helgaas)

  - Fix PCI PM documentation errors (Bjorn Helgaas)

  - Use dev_printk() for more power management messages (Bjorn Helgaas)

  - Apply D2 delay as milliseconds, not microseconds (Bjorn Helgaas)

  - Convert xen-platform from legacy to generic power management (Bjorn
    Helgaas)

  - Remove unused .resume_early() and .suspend_late() legacy power
    management hooks (Bjorn Helgaas)

  - Rearrange power management code for clarity (Rafael J. Wysocki)

  - Decode power states more clearly ("4" or "D4" really refers to
    "D3cold") (Bjorn Helgaas)

  - Notice when reading PM Control register returns an error (~0) instead
    of interpreting it as being in D3hot (Bjorn Helgaas)

  - Add missing link delays required by the PCIe spec (Mika Westerberg)

* pci/pm:
  PCI/PM: Move pci_dev_wait() definition earlier
  PCI/PM: Add missing link delays required by the PCIe spec
  PCI/PM: Add pcie_wait_for_link_delay()
  PCI/PM: Return error when changing power state from D3cold
  PCI/PM: Decode D3cold power state correctly
  PCI/PM: Fold __pci_complete_power_transition() into its caller
  PCI/PM: Avoid exporting __pci_complete_power_transition()
  PCI/PM: Fold __pci_start_power_transition() into its caller
  PCI/PM: Use pci_power_up() in pci_set_power_state()
  PCI/PM: Move power state update away from pci_power_up()
  PCI/PM: Remove unused pci_driver.suspend_late() hook
  PCI/PM: Remove unused pci_driver.resume_early() hook
  xen-platform: Convert to generic power management
  PCI/PM: Simplify pci_set_power_state()
  PCI/PM: Expand PM reset messages to mention D3hot (not just D3)
  PCI/PM: Apply D2 delay as milliseconds, not microseconds
  PCI/PM: Use pci_WARN() to include device information
  PCI/PM: Use PCI dev_printk() wrappers for consistency
  PCI/PM: Wrap long lines in documentation
  PCI/PM: Note that PME can be generated from D0
  PCI/PM: Make power management op coding style consistent
  PCI/PM: Run resume fixups before disabling wakeup events
  PCI/PM: Clear PCIe PME Status even for legacy power management
  PCI/PM: Correct pci_pm_thaw_noirq() documentation
  PCI/PM: Always return devices to D0 when thawing
Diffstat (limited to 'drivers/pci/pci.c')
-rw-r--r-- drivers/pci/pci.c 336
1 files changed, 224 insertions, 112 deletions
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index f5532468dd31..61df2e04176d 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -835,14 +835,16 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state)
return -EINVAL;
/*
- * Validate current state:
- * Can enter D0 from any state, but if we can only go deeper
- * to sleep if we're already in a low power state
+ * Validate transition: We can enter D0 from any state, but if
+ * we're already in a low-power state, we can only go deeper. E.g.,
+ * we can go from D1 to D3, but we can't go directly from D3 to D1;
+ * we'd have to go from D3 to D0, then to D1.
*/
if (state != PCI_D0 && dev->current_state <= PCI_D3cold
&& dev->current_state > state) {
- pci_err(dev, "invalid power transition (from state %d to %d)\n",
- dev->current_state, state);
+ pci_err(dev, "invalid power transition (from %s to %s)\n",
+ pci_power_name(dev->current_state),
+ pci_power_name(state));
return -EINVAL;
}
@@ -852,6 +854,12 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state)
return -EIO;
pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
+ if (pmcsr == (u16) ~0) {
+ pci_err(dev, "can't change power state from %s to %s (config space inaccessible)\n",
+ pci_power_name(dev->current_state),
+ pci_power_name(state));
+ return -EIO;
+ }
/*
* If we're (effectively) in D3, force entire word to 0.
@@ -887,13 +895,14 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state)
if (state == PCI_D3hot || dev->current_state == PCI_D3hot)
pci_dev_d3_sleep(dev);
else if (state == PCI_D2 || dev->current_state == PCI_D2)
- udelay(PCI_PM_D2_DELAY);
+ msleep(PCI_PM_D2_DELAY);
pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
if (dev->current_state != state)
- pci_info_ratelimited(dev, "Refused to change power state, currently in D%d\n",
- dev->current_state);
+ pci_info_ratelimited(dev, "refused to change power state from %s to %s\n",
+ pci_power_name(dev->current_state),
+ pci_power_name(state));
/*
* According to section 5.4.1 of the "PCI BUS POWER MANAGEMENT
@@ -964,7 +973,7 @@ void pci_refresh_power_state(struct pci_dev *dev)
* @dev: PCI device to handle.
* @state: State to put the device into.
*/
-static int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state)
+int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state)
{
int error;
@@ -980,6 +989,7 @@ static int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state)
return error;
}
+EXPORT_SYMBOL_GPL(pci_platform_power_transition);
/**
* pci_wakeup - Wake up a PCI device
@@ -1003,34 +1013,70 @@ void pci_wakeup_bus(struct pci_bus *bus)
pci_walk_bus(bus, pci_wakeup, NULL);
}
+static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout)
+{
+ int delay = 1;
+ u32 id;
+
+ /*
+ * After reset, the device should not silently discard config
+ * requests, but it may still indicate that it needs more time by
+ * responding to them with CRS completions. The Root Port will
+ * generally synthesize ~0 data to complete the read (except when
+ * CRS SV is enabled and the read was for the Vendor ID; in that
+ * case it synthesizes 0x0001 data).
+ *
+ * Wait for the device to return a non-CRS completion. Read the
+ * Command register instead of Vendor ID so we don't have to
+ * contend with the CRS SV value.
+ */
+ pci_read_config_dword(dev, PCI_COMMAND, &id);
+ while (id == ~0) {
+ if (delay > timeout) {
+ pci_warn(dev, "not ready %dms after %s; giving up\n",
+ delay - 1, reset_type);
+ return -ENOTTY;
+ }
+
+ if (delay > 1000)
+ pci_info(dev, "not ready %dms after %s; waiting\n",
+ delay - 1, reset_type);
+
+ msleep(delay);
+ delay *= 2;
+ pci_read_config_dword(dev, PCI_COMMAND, &id);
+ }
+
+ if (delay > 1000)
+ pci_info(dev, "ready %dms after %s\n", delay - 1,
+ reset_type);
+
+ return 0;
+}
+
/**
- * __pci_start_power_transition - Start power transition of a PCI device
- * @dev: PCI device to handle.
- * @state: State to put the device into.
+ * pci_power_up - Put the given device into D0
+ * @dev: PCI device to power up
*/
-static void __pci_start_power_transition(struct pci_dev *dev, pci_power_t state)
+int pci_power_up(struct pci_dev *dev)
{
- if (state == PCI_D0) {
- pci_platform_power_transition(dev, PCI_D0);
+ pci_platform_power_transition(dev, PCI_D0);
+
+ /*
+ * Mandatory power management transition delays are handled in
+ * pci_pm_resume_noirq() and pci_pm_runtime_resume() of the
+ * corresponding bridge.
+ */
+ if (dev->runtime_d3cold) {
/*
- * Mandatory power management transition delays, see
- * PCI Express Base Specification Revision 2.0 Section
- * 6.6.1: Conventional Reset. Do not delay for
- * devices powered on/off by corresponding bridge,
- * because have already delayed for the bridge.
+ * When powering on a bridge from D3cold, the whole hierarchy
+ * may be powered on into D0uninitialized state, resume them to
+ * give them a chance to suspend again
*/
- if (dev->runtime_d3cold) {
- if (dev->d3cold_delay && !dev->imm_ready)
- msleep(dev->d3cold_delay);
- /*
- * When powering on a bridge from D3cold, the
- * whole hierarchy may be powered on into
- * D0uninitialized state, resume them to give
- * them a chance to suspend again
- */
- pci_wakeup_bus(dev->subordinate);
- }
+ pci_wakeup_bus(dev->subordinate);
}
+
+ return pci_raw_set_power_state(dev, PCI_D0);
}
/**
@@ -1058,27 +1104,6 @@ void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state)
}
/**
- * __pci_complete_power_transition - Complete power transition of a PCI device
- * @dev: PCI device to handle.
- * @state: State to put the device into.
- *
- * This function should not be called directly by device drivers.
- */
-int __pci_complete_power_transition(struct pci_dev *dev, pci_power_t state)
-{
- int ret;
-
- if (state <= PCI_D0)
- return -EINVAL;
- ret = pci_platform_power_transition(dev, state);
- /* Power off the bridge may power off the whole hierarchy */
- if (!ret && state == PCI_D3cold)
- pci_bus_set_current_state(dev->subordinate, PCI_D3cold);
- return ret;
-}
-EXPORT_SYMBOL_GPL(__pci_complete_power_transition);
-
-/**
* pci_set_power_state - Set the power state of a PCI device
* @dev: PCI device to handle.
* @state: PCI power state (D0, D1, D2, D3hot) to put the device into.
@@ -1118,7 +1143,8 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
if (dev->current_state == state)
return 0;
- __pci_start_power_transition(dev, state);
+ if (state == PCI_D0)
+ return pci_power_up(dev);
/*
* This device is quirked not to be put into D3, so don't put it in
@@ -1134,23 +1160,16 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
PCI_D3hot : state);
- if (!__pci_complete_power_transition(dev, state))
- error = 0;
+ if (pci_platform_power_transition(dev, state))
+ return error;
- return error;
-}
-EXPORT_SYMBOL(pci_set_power_state);
+ /* Powering off a bridge may power off the whole hierarchy */
+ if (state == PCI_D3cold)
+ pci_bus_set_current_state(dev->subordinate, PCI_D3cold);
-/**
- * pci_power_up - Put the given device into D0 forcibly
- * @dev: PCI device to power up
- */
-void pci_power_up(struct pci_dev *dev)
-{
- __pci_start_power_transition(dev, PCI_D0);
- pci_raw_set_power_state(dev, PCI_D0);
- pci_update_current_state(dev, PCI_D0);
+ return 0;
}
+EXPORT_SYMBOL(pci_set_power_state);
/**
* pci_choose_state - Choose the power state of a PCI device
@@ -4431,47 +4450,6 @@ int pci_wait_for_pending_transaction(struct pci_dev *dev)
}
EXPORT_SYMBOL(pci_wait_for_pending_transaction);
-static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout)
-{
- int delay = 1;
- u32 id;
-
- /*
- * After reset, the device should not silently discard config
- * requests, but it may still indicate that it needs more time by
- * responding to them with CRS completions. The Root Port will
- * generally synthesize ~0 data to complete the read (except when
- * CRS SV is enabled and the read was for the Vendor ID; in that
- * case it synthesizes 0x0001 data).
- *
- * Wait for the device to return a non-CRS completion. Read the
- * Command register instead of Vendor ID so we don't have to
- * contend with the CRS SV value.
- */
- pci_read_config_dword(dev, PCI_COMMAND, &id);
- while (id == ~0) {
- if (delay > timeout) {
- pci_warn(dev, "not ready %dms after %s; giving up\n",
- delay - 1, reset_type);
- return -ENOTTY;
- }
-
- if (delay > 1000)
- pci_info(dev, "not ready %dms after %s; waiting\n",
- delay - 1, reset_type);
-
- msleep(delay);
- delay *= 2;
- pci_read_config_dword(dev, PCI_COMMAND, &id);
- }
-
- if (delay > 1000)
- pci_info(dev, "ready %dms after %s\n", delay - 1,
- reset_type);
-
- return 0;
-}
-
/**
* pcie_has_flr - check if a device supports function level resets
* @dev: device to check
@@ -4606,16 +4584,19 @@ static int pci_pm_reset(struct pci_dev *dev, int probe)
pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
pci_dev_d3_sleep(dev);
- return pci_dev_wait(dev, "PM D3->D0", PCIE_RESET_READY_POLL_MS);
+ return pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS);
}
+
/**
- * pcie_wait_for_link - Wait until link is active or inactive
+ * pcie_wait_for_link_delay - Wait until link is active or inactive
* @pdev: Bridge device
* @active: waiting for active or inactive?
+ * @delay: Delay to wait after link has become active (in ms)
*
* Use this to wait till link becomes active or inactive.
*/
-bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
+static bool pcie_wait_for_link_delay(struct pci_dev *pdev, bool active,
+ int delay)
{
int timeout = 1000;
bool ret;
@@ -4652,13 +4633,144 @@ bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
timeout -= 10;
}
if (active && ret)
- msleep(100);
+ msleep(delay);
else if (ret != active)
pci_info(pdev, "Data Link Layer Link Active not %s in 1000 msec\n",
active ? "set" : "cleared");
return ret == active;
}
+/**
+ * pcie_wait_for_link - Wait until link is active or inactive
+ * @pdev: Bridge device
+ * @active: waiting for active or inactive?
+ *
+ * Use this to wait till link becomes active or inactive.
+ */
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
+{
+ return pcie_wait_for_link_delay(pdev, active, 100);
+}
+
+/*
+ * Find maximum D3cold delay required by all the devices on the bus. The
+ * spec says 100 ms, but firmware can lower it and we allow drivers to
+ * increase it as well.
+ *
+ * Called with @pci_bus_sem locked for reading.
+ */
+static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
+{
+ const struct pci_dev *pdev;
+ int min_delay = 100;
+ int max_delay = 0;
+
+ list_for_each_entry(pdev, &bus->devices, bus_list) {
+ if (pdev->d3cold_delay < min_delay)
+ min_delay = pdev->d3cold_delay;
+ if (pdev->d3cold_delay > max_delay)
+ max_delay = pdev->d3cold_delay;
+ }
+
+ return max(min_delay, max_delay);
+}
+
+/**
+ * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
+ * @dev: PCI bridge
+ *
+ * Handle necessary delays before access to the devices on the secondary
+ * side of the bridge are permitted after D3cold to D0 transition.
+ *
+ * For PCIe this means the delays in PCIe 5.0 section 6.6.1. For
+ * conventional PCI it means Tpvrh + Trhfa specified in PCI 3.0 section
+ * 4.3.2.
+ */
+void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev)
+{
+ struct pci_dev *child;
+ int delay;
+
+ if (pci_dev_is_disconnected(dev))
+ return;
+
+ if (!pci_is_bridge(dev) || !dev->bridge_d3)
+ return;
+
+ down_read(&pci_bus_sem);
+
+ /*
+ * We only deal with devices that are present currently on the bus.
+ * For any hot-added devices the access delay is handled in pciehp
+ * board_added(). In case of ACPI hotplug the firmware is expected
+ * to configure the devices before OS is notified.
+ */
+ if (!dev->subordinate || list_empty(&dev->subordinate->devices)) {
+ up_read(&pci_bus_sem);
+ return;
+ }
+
+ /* Take d3cold_delay requirements into account */
+ delay = pci_bus_max_d3cold_delay(dev->subordinate);
+ if (!delay) {
+ up_read(&pci_bus_sem);
+ return;
+ }
+
+ child = list_first_entry(&dev->subordinate->devices, struct pci_dev,
+ bus_list);
+ up_read(&pci_bus_sem);
+
+ /*
+ * Conventional PCI and PCI-X we need to wait Tpvrh + Trhfa before
+ * accessing the device after reset (that is 1000 ms + 100 ms). In
+ * practice this should not be needed because we don't do power
+ * management for them (see pci_bridge_d3_possible()).
+ */
+ if (!pci_is_pcie(dev)) {
+ pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
+ msleep(1000 + delay);
+ return;
+ }
+
+ /*
+ * For PCIe downstream and root ports that do not support speeds
+ * greater than 5 GT/s need to wait minimum 100 ms. For higher
+ * speeds (gen3) we need to wait first for the data link layer to
+ * become active.
+ *
+ * However, 100 ms is the minimum and the PCIe spec says the
+ * software must allow at least 1s before it can determine that the
+ * device that did not respond is a broken device. There is
+ * evidence that 100 ms is not always enough, for example certain
+ * Titan Ridge xHCI controller does not always respond to
+ * configuration requests if we only wait for 100 ms (see
+ * https://bugzilla.kernel.org/show_bug.cgi?id=203885).
+ *
+ * Therefore we wait for 100 ms and check for the device presence.
+ * If it is still not present give it an additional 100 ms.
+ */
+ if (!pcie_downstream_port(dev))
+ return;
+
+ if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
+ pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
+ msleep(delay);
+ } else {
+ pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
+ delay);
+ if (!pcie_wait_for_link_delay(dev, true, delay)) {
+ /* Did not train, no need to wait any further */
+ return;
+ }
+ }
+
+ if (!pci_device_is_present(child)) {
+ pci_dbg(child, "waiting additional %d ms to become accessible\n", delay);
+ msleep(delay);
+ }
+}
+
void pci_reset_secondary_bus(struct pci_dev *dev)
{
u16 ctrl;