aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/hv/vmbus_drv.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/hv/vmbus_drv.c')
-rw-r--r--drivers/hv/vmbus_drv.c265
1 files changed, 265 insertions, 0 deletions
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index ebd35fc35290..391f0b225c9a 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -24,12 +24,14 @@
#include <linux/sched/task_stack.h>
#include <asm/mshyperv.h>
+#include <linux/delay.h>
#include <linux/notifier.h>
#include <linux/ptrace.h>
#include <linux/screen_info.h>
#include <linux/kdebug.h>
#include <linux/efi.h>
#include <linux/random.h>
+#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
#include "hyperv_vmbus.h"
@@ -910,6 +912,43 @@ static void vmbus_shutdown(struct device *child_device)
drv->shutdown(dev);
}
+/*
+ * vmbus_suspend - Suspend a vmbus device
+ */
+static int vmbus_suspend(struct device *child_device)
+{
+ struct hv_driver *drv;
+ struct hv_device *dev = device_to_hv_device(child_device);
+
+ /* The device may not be attached yet */
+ if (!child_device->driver)
+ return 0;
+
+ drv = drv_to_hv_drv(child_device->driver);
+ if (!drv->suspend)
+ return -EOPNOTSUPP;
+
+ return drv->suspend(dev);
+}
+
+/*
+ * vmbus_resume - Resume a vmbus device
+ */
+static int vmbus_resume(struct device *child_device)
+{
+ struct hv_driver *drv;
+ struct hv_device *dev = device_to_hv_device(child_device);
+
+ /* The device may not be attached yet */
+ if (!child_device->driver)
+ return 0;
+
+ drv = drv_to_hv_drv(child_device->driver);
+ if (!drv->resume)
+ return -EOPNOTSUPP;
+
+ return drv->resume(dev);
+}
/*
* vmbus_device_release - Final callback release of the vmbus child device
@@ -925,6 +964,14 @@ static void vmbus_device_release(struct device *device)
kfree(hv_dev);
}
+/*
+ * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than
+ * SET_SYSTEM_SLEEP_PM_OPS: see the comment before vmbus_bus_pm.
+ */
+static const struct dev_pm_ops vmbus_pm = {
+ SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_suspend, vmbus_resume)
+};
+
/* The one and only one */
static struct bus_type hv_bus = {
.name = "vmbus",
@@ -935,6 +982,7 @@ static struct bus_type hv_bus = {
.uevent = vmbus_uevent,
.dev_groups = vmbus_dev_groups,
.drv_groups = vmbus_drv_groups,
+ .pm = &vmbus_pm,
};
struct onmessage_work_context {
@@ -1022,6 +1070,41 @@ msg_handled:
vmbus_signal_eom(msg, message_type);
}
+/*
+ * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for
+ * hibernation, because hv_sock connections can not persist across hibernation.
+ */
+static void vmbus_force_channel_rescinded(struct vmbus_channel *channel)
+{
+ struct onmessage_work_context *ctx;
+ struct vmbus_channel_rescind_offer *rescind;
+
+ WARN_ON(!is_hvsock_channel(channel));
+
+ /*
+ * sizeof(*ctx) is small and the allocation should really not fail,
+ * otherwise the state of the hv_sock connections ends up in limbo.
+ */
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL);
+
+ /*
+ * So far, these are not really used by Linux. Just set them to the
+ * reasonable values conforming to the definitions of the fields.
+ */
+ ctx->msg.header.message_type = 1;
+ ctx->msg.header.payload_size = sizeof(*rescind);
+
+ /* These values are actually used by Linux. */
+ rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.u.payload;
+ rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER;
+ rescind->child_relid = channel->offermsg.child_relid;
+
+ INIT_WORK(&ctx->work, vmbus_onmessage_work);
+
+ queue_work_on(vmbus_connection.connect_cpu,
+ vmbus_connection.work_queue,
+ &ctx->work);
+}
/*
* Direct callback for channels using other deferred processing
@@ -2042,6 +2125,129 @@ acpi_walk_err:
return ret_val;
}
+static int vmbus_bus_suspend(struct device *dev)
+{
+ struct vmbus_channel *channel, *sc;
+ unsigned long flags;
+
+ while (atomic_read(&vmbus_connection.offer_in_progress) != 0) {
+ /*
+ * We wait here until the completion of any channel
+ * offers that are currently in progress.
+ */
+ msleep(1);
+ }
+
+ mutex_lock(&vmbus_connection.channel_mutex);
+ list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
+ if (!is_hvsock_channel(channel))
+ continue;
+
+ vmbus_force_channel_rescinded(channel);
+ }
+ mutex_unlock(&vmbus_connection.channel_mutex);
+
+ /*
+ * Wait until all the sub-channels and hv_sock channels have been
+ * cleaned up. Sub-channels should be destroyed upon suspend, otherwise
+ * they would conflict with the new sub-channels that will be created
+ * in the resume path. hv_sock channels should also be destroyed, but
+ * a hv_sock channel of an established hv_sock connection can not be
+ * really destroyed since it may still be referenced by the userspace
+ * application, so we just force the hv_sock channel to be rescinded
+ * by vmbus_force_channel_rescinded(), and the userspace application
+ * will thoroughly destroy the channel after hibernation.
+ *
+ * Note: the counter nr_chan_close_on_suspend may never go above 0 if
+ * the VM has no sub-channel and hv_sock channel, e.g. a 1-vCPU VM.
+ */
+ if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0)
+ wait_for_completion(&vmbus_connection.ready_for_suspend_event);
+
+ WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0);
+
+ mutex_lock(&vmbus_connection.channel_mutex);
+
+ list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
+ /*
+ * Invalidate the field. Upon resume, vmbus_onoffer() will fix
+ * up the field, and the other fields (if necessary).
+ */
+ channel->offermsg.child_relid = INVALID_RELID;
+
+ if (is_hvsock_channel(channel)) {
+ if (!channel->rescind) {
+ pr_err("hv_sock channel not rescinded!\n");
+ WARN_ON_ONCE(1);
+ }
+ continue;
+ }
+
+ spin_lock_irqsave(&channel->lock, flags);
+ list_for_each_entry(sc, &channel->sc_list, sc_list) {
+ pr_err("Sub-channel not deleted!\n");
+ WARN_ON_ONCE(1);
+ }
+ spin_unlock_irqrestore(&channel->lock, flags);
+
+ atomic_inc(&vmbus_connection.nr_chan_fixup_on_resume);
+ }
+
+ mutex_unlock(&vmbus_connection.channel_mutex);
+
+ vmbus_initiate_unload(false);
+
+ vmbus_connection.conn_state = DISCONNECTED;
+
+ /* Reset the event for the next resume. */
+ reinit_completion(&vmbus_connection.ready_for_resume_event);
+
+ return 0;
+}
+
+static int vmbus_bus_resume(struct device *dev)
+{
+ struct vmbus_channel_msginfo *msginfo;
+ size_t msgsize;
+ int ret;
+
+ /*
+ * We only use the 'vmbus_proto_version', which was in use before
+ * hibernation, to re-negotiate with the host.
+ */
+ if (vmbus_proto_version == VERSION_INVAL ||
+ vmbus_proto_version == 0) {
+ pr_err("Invalid proto version = 0x%x\n", vmbus_proto_version);
+ return -EINVAL;
+ }
+
+ msgsize = sizeof(*msginfo) +
+ sizeof(struct vmbus_channel_initiate_contact);
+
+ msginfo = kzalloc(msgsize, GFP_KERNEL);
+
+ if (msginfo == NULL)
+ return -ENOMEM;
+
+ ret = vmbus_negotiate_version(msginfo, vmbus_proto_version);
+
+ kfree(msginfo);
+
+ if (ret != 0)
+ return ret;
+
+ WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) == 0);
+
+ vmbus_request_offers();
+
+ wait_for_completion(&vmbus_connection.ready_for_resume_event);
+
+ /* Reset the event for the next suspend. */
+ reinit_completion(&vmbus_connection.ready_for_suspend_event);
+
+ return 0;
+}
+
static const struct acpi_device_id vmbus_acpi_device_ids[] = {
{"VMBUS", 0},
{"VMBus", 0},
@@ -2049,6 +2255,19 @@ static const struct acpi_device_id vmbus_acpi_device_ids[] = {
};
MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids);
+/*
+ * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than
+ * SET_SYSTEM_SLEEP_PM_OPS, otherwise NIC SR-IOV can not work, because the
+ * "pci_dev_pm_ops" uses the "noirq" callbacks: in the resume path, the
+ * pci "noirq" restore callback runs before "non-noirq" callbacks (see
+ * resume_target_kernel() -> dpm_resume_start(), and hibernation_restore() ->
+ * dpm_resume_end()). This means vmbus_bus_resume() and the pci-hyperv's
+ * resume callback must also run via the "noirq" callbacks.
+ */
+static const struct dev_pm_ops vmbus_bus_pm = {
+ SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_bus_suspend, vmbus_bus_resume)
+};
+
static struct acpi_driver vmbus_acpi_driver = {
.name = "vmbus",
.ids = vmbus_acpi_device_ids,
@@ -2056,6 +2275,7 @@ static struct acpi_driver vmbus_acpi_driver = {
.add = vmbus_acpi_add,
.remove = vmbus_acpi_remove,
},
+ .drv.pm = &vmbus_bus_pm,
};
static void hv_kexec_handler(void)
@@ -2086,6 +2306,47 @@ static void hv_crash_handler(struct pt_regs *regs)
hyperv_cleanup();
};
+static int hv_synic_suspend(void)
+{
+ /*
+ * When we reach here, all the non-boot CPUs have been offlined, and
+ * the stimers on them have been unbound in hv_synic_cleanup() ->
+ * hv_stimer_cleanup() -> clockevents_unbind_device().
+ *
+ * hv_synic_suspend() only runs on CPU0 with interrupts disabled. Here
+ * we do not unbind the stimer on CPU0 because: 1) it's unnecessary
+ * because the interrupts remain disabled between syscore_suspend()
+ * and syscore_resume(): see create_image() and resume_target_kernel();
+ * 2) the stimer on CPU0 is automatically disabled later by
+ * syscore_suspend() -> timekeeping_suspend() -> tick_suspend() -> ...
+ * -> clockevents_shutdown() -> ... -> hv_ce_shutdown(); 3) a warning
+ * would be triggered if we call clockevents_unbind_device(), which
+ * may sleep, in an interrupts-disabled context. So, we intentionally
+ * don't call hv_stimer_cleanup(0) here.
+ */
+
+ hv_synic_disable_regs(0);
+
+ return 0;
+}
+
+static void hv_synic_resume(void)
+{
+ hv_synic_enable_regs(0);
+
+ /*
+ * Note: we don't need to call hv_stimer_init(0), because the timer
+ * on CPU0 is not unbound in hv_synic_suspend(), and the timer is
+ * automatically re-enabled in timekeeping_resume().
+ */
+}
+
+/* The callbacks run only on CPU0, with irqs_disabled. */
+static struct syscore_ops hv_synic_syscore_ops = {
+ .suspend = hv_synic_suspend,
+ .resume = hv_synic_resume,
+};
+
static int __init hv_acpi_init(void)
{
int ret, t;
@@ -2116,6 +2377,8 @@ static int __init hv_acpi_init(void)
hv_setup_kexec_handler(hv_kexec_handler);
hv_setup_crash_handler(hv_crash_handler);
+ register_syscore_ops(&hv_synic_syscore_ops);
+
return 0;
cleanup:
@@ -2128,6 +2391,8 @@ static void __exit vmbus_exit(void)
{
int cpu;
+ unregister_syscore_ops(&hv_synic_syscore_ops);
+
hv_remove_kexec_handler();
hv_remove_crash_handler();
vmbus_connection.conn_state = DISCONNECTED;