From 4132a577a0a7e75b938d2ae49c7a16b358f60661 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 18 Sep 2016 05:39:20 +0200 Subject: PCI: Afford direct-complete to devices with non-standard PM There are devices not power-manageable by the platform, but still able to runtime suspend to D3cold with a non-standard mechanism. One example is laptop hybrid graphics where the discrete GPU and its built-in HDA controller are power-managed either with a _DSM (AMD PowerXpress, Nvidia Optimus) or a separate gmux controller (MacBook Pro). Another example is Thunderbolt on Macs which is power-managed with custom ACPI methods. When putting the system to sleep, we currently handle such devices improperly by transitioning them from D3cold to D3hot (the default power state defined at the top of pci_target_state()). This wastes energy and prolongs the suspend sequence (powering up the Thunderbolt controller takes 2 seconds). Avoid that by assuming that a non-standard PM mechanism is at work if the device is not platform-power-manageable but currently in D3cold. If the device is wakeup enabled, we might still have to wake it up from D3cold if PME cannot be signaled from that power state. The check for devices without PM capability comes before the check for D3cold since such devices could in theory also be powered down by non-standard means and should then be afforded direct-complete as well. Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki --- drivers/pci/pci.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'drivers/pci/pci.c') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index aab9d5115a5f..2f818c3e6571 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1959,9 +1959,22 @@ static pci_power_t pci_target_state(struct pci_dev *dev) default: target_state = state; } - } else if (!dev->pm_cap) { + + return target_state; + } + + if (!dev->pm_cap) target_state = PCI_D0; - } else if (device_may_wakeup(&dev->dev)) { + + /* + * If the device is in D3cold even though it's not power-manageable by + * the platform, it may have been powered down by non-standard means. + * Best to let it slumber. + */ + if (dev->current_state == PCI_D3cold) + target_state = PCI_D3cold; + + if (device_may_wakeup(&dev->dev)) { /* * Find the deepest state from which the device can generate * wake-up events, make it the target state and enable device -- cgit v1.2.3-59-g8ed1b From cc7cc02bada84f0d707aa5b6d2ef8728a2e1f911 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 18 Sep 2016 05:39:20 +0200 Subject: PCI: Query platform firmware for device power state Usually the most accurate way to determine a PCI device's power state is to read its PM Control & Status Register. There are two cases however when this is not an option: If the device doesn't have the PM capability at all, or if it is in D3cold (in which case its config space is inaccessible). In both cases, we can alternatively query the platform firmware for its opinion on the device's power state. To facilitate this, augment struct pci_platform_pm_ops with a ->get_power callback and implement it for acpi_pci_platform_pm (the only pci_platform_pm_ops existing so far). It is used by a forthcoming commit to let pci_update_current_state() recognize D3cold. Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki --- drivers/pci/pci-acpi.c | 22 ++++++++++++++++++++++ drivers/pci/pci.c | 10 ++++++++-- drivers/pci/pci.h | 3 +++ 3 files changed, 33 insertions(+), 2 deletions(-) (limited to 'drivers/pci/pci.c') diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 9a033e8ee9a4..d966d47c9e80 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -452,6 +452,27 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) return error; } +static pci_power_t acpi_pci_get_power_state(struct pci_dev *dev) +{ + struct acpi_device *adev = ACPI_COMPANION(&dev->dev); + static const pci_power_t state_conv[] = { + [ACPI_STATE_D0] = PCI_D0, + [ACPI_STATE_D1] = PCI_D1, + [ACPI_STATE_D2] = PCI_D2, + [ACPI_STATE_D3_HOT] = PCI_D3hot, + [ACPI_STATE_D3_COLD] = PCI_D3cold, + }; + int state; + + if (!adev || !acpi_device_power_manageable(adev)) + return PCI_UNKNOWN; + + if (acpi_device_get_power(adev, &state) || state == ACPI_STATE_UNKNOWN) + return PCI_UNKNOWN; + + return state_conv[state]; +} + static bool acpi_pci_can_wakeup(struct pci_dev *dev) { struct acpi_device *adev = ACPI_COMPANION(&dev->dev); @@ -534,6 +555,7 @@ static bool acpi_pci_need_resume(struct pci_dev *dev) static const struct pci_platform_pm_ops acpi_pci_platform_pm = { .is_manageable = acpi_pci_power_manageable, .set_state = acpi_pci_set_power_state, + .get_state = acpi_pci_get_power_state, .choose_state = acpi_pci_choose_state, .sleep_wake = acpi_pci_sleep_wake, .run_wake = acpi_pci_run_wake, diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 2f818c3e6571..2e18e9adcc15 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -552,8 +552,9 @@ static const struct pci_platform_pm_ops *pci_platform_pm; int pci_set_platform_pm(const struct pci_platform_pm_ops *ops) { - if (!ops->is_manageable || !ops->set_state || !ops->choose_state || - !ops->sleep_wake || !ops->run_wake || !ops->need_resume) + if (!ops->is_manageable || !ops->set_state || !ops->get_state || + !ops->choose_state || !ops->sleep_wake || !ops->run_wake || + !ops->need_resume) return -EINVAL; pci_platform_pm = ops; return 0; @@ -570,6 +571,11 @@ static inline int platform_pci_set_power_state(struct pci_dev *dev, return pci_platform_pm ? pci_platform_pm->set_state(dev, t) : -ENOSYS; } +static inline pci_power_t platform_pci_get_power_state(struct pci_dev *dev) +{ + return pci_platform_pm ? pci_platform_pm->get_state(dev) : PCI_UNKNOWN; +} + static inline pci_power_t platform_pci_choose_state(struct pci_dev *dev) { return pci_platform_pm ? diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 9730c474b016..01d520648e1d 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -42,6 +42,8 @@ int pci_probe_reset_function(struct pci_dev *dev); * * @set_state: invokes the platform firmware to set the device's power state * + * @get_state: queries the platform firmware for a device's current power state + * * @choose_state: returns PCI power state of given device preferred by the * platform; to be used during system-wide transitions from a * sleeping state to the working state and vice versa @@ -62,6 +64,7 @@ int pci_probe_reset_function(struct pci_dev *dev); struct pci_platform_pm_ops { bool (*is_manageable)(struct pci_dev *dev); int (*set_state)(struct pci_dev *dev, pci_power_t state); + pci_power_t (*get_state)(struct pci_dev *dev); pci_power_t (*choose_state)(struct pci_dev *dev); int (*sleep_wake)(struct pci_dev *dev, bool enable); int (*run_wake)(struct pci_dev *dev, bool enable); -- cgit v1.2.3-59-g8ed1b From a6a64026c0cd1a76a0c8ab1c05a421aa4821887b Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 18 Sep 2016 05:39:20 +0200 Subject: PCI: Recognize D3cold in pci_update_current_state() Whenever a device is resumed or its power state is changed using the platform, its new power state is read from the PM Control & Status Register and cached in pci_dev->current_state by calling pci_update_current_state(). If the device is in D3cold, reading from config space typically results in a fabricated "all ones" response. But if it's in D3hot, the two bits representing the power state in the PMCSR are *also* set to 1. Thus D3hot and D3cold are not discernible by just reading the PMCSR. To account for this, pci_update_current_state() uses two workarounds: - When transitioning to D3cold using pci_platform_power_transition(), the new power state is set blindly by pci_update_current_state(), i.e. without verifying that the device actually *is* in D3cold. This is achieved by setting the "state" argument to PCI_D3cold. The "state" argument was originally intended to convey the new state in case the device doesn't have the PM capability. It is *also* used to convey the device state if the PM capability is present and the new state is D3cold, but this was never explained in the kerneldoc. - Once the current_state is set to D3cold, further invocations of pci_update_current_state() will blindly assume that the device is still in D3cold and leave the current_state unmodified. To get out of this impasse, the current_state has to be set directly, typically by calling pci_raw_set_power_state() or pci_enable_device(). It would be desirable if pci_update_current_state() could reliably detect D3cold by itself. That would allow us to do away with these workarounds, and it would allow for a smarter, more energy conserving runtime resume strategy after system sleep: Currently devices which utilize direct_complete are mandatorily runtime resumed in their ->complete stage. This can be avoided if their power state after system sleep is the same as before, but it requires a mechanism to detect the power state reliably. We've just gained the ability to query the platform firmware for its opinion on the device's power state. On platforms conforming to ACPI 4.0 or newer, this allows recognition of D3cold. Pre-4.0 platforms lack _PR3 and therefore the deepest power state that will ever be reported is D3hot, even though the device may actually be in D3cold. To detect D3cold in those cases, accessibility of the vendor ID in config space is probed using pci_device_is_present(). This also works for devices which are not platform-power-manageable at all, but can be suspended to D3cold using a nonstandard mechanism (e.g. some hybrid graphics laptops or Thunderbolt on the Mac). Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki --- drivers/pci/pci.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'drivers/pci/pci.c') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 2e18e9adcc15..b2be8957a290 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -707,26 +707,25 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state) } /** - * pci_update_current_state - Read PCI power state of given device from its - * PCI PM registers and cache it + * pci_update_current_state - Read power state of given device and cache it * @dev: PCI device to handle. * @state: State to cache in case the device doesn't have the PM capability + * + * The power state is read from the PMCSR register, which however is + * inaccessible in D3cold. The platform firmware is therefore queried first + * to detect accessibility of the register. In case the platform firmware + * reports an incorrect state or the device isn't power manageable by the + * platform at all, we try to detect D3cold by testing accessibility of the + * vendor ID in config space. */ void pci_update_current_state(struct pci_dev *dev, pci_power_t state) { - if (dev->pm_cap) { + if (platform_pci_get_power_state(dev) == PCI_D3cold || + !pci_device_is_present(dev)) { + dev->current_state = PCI_D3cold; + } else if (dev->pm_cap) { u16 pmcsr; - /* - * Configuration space is not accessible for device in - * D3cold, so just keep or set D3cold for safety - */ - if (dev->current_state == PCI_D3cold) - return; - if (state == PCI_D3cold) { - dev->current_state = PCI_D3cold; - return; - } pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK); } else { -- cgit v1.2.3-59-g8ed1b From f0b99f70e92dcdc4fecf5cf7ce2f6857ddd82c65 Mon Sep 17 00:00:00 2001 From: Yongji Xie Date: Tue, 13 Sep 2016 17:00:31 +0800 Subject: PCI: Ignore requested alignment for PROBE_ONLY and fixed resources Users may request additional alignment of PCI resources, e.g., to align BARs on page boundaries so they can be shared with guests via VFIO. This of course may require reallocation if firmware has already assigned the BARs with smaller alignments. If the platform has requested PCI_PROBE_ONLY, we should never change any PCI BARs, so we can't provide any additional alignment. Also, if a BAR is marked as IORESOURCE_PCI_FIXED, e.g., for PCI Enhanced Allocation or if the firmware depends on the current BAR value, we can't change the alignment. In these cases, log a message and ignore the user's alignment requests. [bhelgaas: changelog, use goto to simplify PCI_PROBE_ONLY check] Signed-off-by: Yongji Xie Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'drivers/pci/pci.c') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index aab9d5115a5f..ed9447aee863 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4959,6 +4959,13 @@ static resource_size_t pci_specified_resource_alignment(struct pci_dev *dev) spin_lock(&resource_alignment_lock); p = resource_alignment_param; + if (!*p) + goto out; + if (pci_has_flag(PCI_PROBE_ONLY)) { + pr_info_once("PCI: Ignoring requested alignments (PCI_PROBE_ONLY)\n"); + goto out; + } + while (*p) { count = 0; if (sscanf(p, "%d%n", &align_order, &count) == 1 && @@ -5023,6 +5030,7 @@ static resource_size_t pci_specified_resource_alignment(struct pci_dev *dev) } p++; } +out: spin_unlock(&resource_alignment_lock); return align; } @@ -5063,6 +5071,12 @@ void pci_reassigndev_resource_alignment(struct pci_dev *dev) r = &dev->resource[i]; if (!(r->flags & IORESOURCE_MEM)) continue; + if (r->flags & IORESOURCE_PCI_FIXED) { + dev_info(&dev->dev, "Ignoring requested alignment for BAR%d: %pR\n", + i, r); + continue; + } + size = resource_size(r); if (size < align) { size = align; -- cgit v1.2.3-59-g8ed1b From 62d9a78f32d9a1b0f6fdae70751deeae6335e74b Mon Sep 17 00:00:00 2001 From: Yongji Xie Date: Tue, 13 Sep 2016 17:00:32 +0800 Subject: PCI: Ignore requested alignment for VF BARs Resource allocation for VFs is done via the VF BARx registers in the PF's SR-IOV Capability, and the BARs in the VFs themselves are read-only zeros (see SR-IOV spec r1.1, secs 3.3.14 and 3.4.1.11). Even though the actual VF BARs are read-only zeros, the VF dev->resource[] structs describe the space allocated for the VF (this is a piece of the space described by the VF BARx register in the PF's SR-IOV capability). It's meaningless to request additional alignment for a VF: the VF BAR alignment is completely determined by the alignment of the VF BARx in the PF and the size of the VF BAR. Ignore the user's alignment requests for VF devices. Signed-off-by: Yongji Xie Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/pci/pci.c') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index ed9447aee863..b5b83331da26 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5049,6 +5049,15 @@ void pci_reassigndev_resource_alignment(struct pci_dev *dev) resource_size_t align, size; u16 command; + /* + * VF BARs are read-only zero according to SR-IOV spec r1.1, sec + * 3.4.1.11. Their resources are allocated from the space + * described by the VF BARx register in the PF's SR-IOV capability. + * We can't influence their alignment here. + */ + if (dev->is_virtfn) + return; + /* check if specified PCI is target device to reassign */ align = pci_specified_resource_alignment(dev); if (!align) -- cgit v1.2.3-59-g8ed1b