From 2a31b9db153530df4aa02dac8c32837bf5f47019 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 Oct 2018 02:36:47 +0200 Subject: kvm: introduce manual dirty log reprotect There are two problems with KVM_GET_DIRTY_LOG. First, and less important, it can take kvm->mmu_lock for an extended period of time. Second, its user can actually see many false positives in some cases. The latter is due to a benign race like this: 1. KVM_GET_DIRTY_LOG returns a set of dirty pages and write protects them. 2. The guest modifies the pages, causing them to be marked ditry. 3. Userspace actually copies the pages. 4. KVM_GET_DIRTY_LOG returns those pages as dirty again, even though they were not written to since (3). This is especially a problem for large guests, where the time between (1) and (3) can be substantial. This patch introduces a new capability which, when enabled, makes KVM_GET_DIRTY_LOG not write-protect the pages it returns. Instead, userspace has to explicitly clear the dirty log bits just before using the content of the page. The new KVM_CLEAR_DIRTY_LOG ioctl can also operate on a 64-page granularity rather than requiring to sync a full memslot; this way, the mmu_lock is taken for small amounts of time, and only a small amount of time will pass between write protection of pages and the sending of their content. Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 132 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 115 insertions(+), 17 deletions(-) (limited to 'virt/kvm/kvm_main.c') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 54f0fcfd431e..0041947b7390 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1133,7 +1133,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log); #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT /** * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages - * are dirty write protect them for next write. + * and reenable dirty page tracking for the corresponding pages. * @kvm: pointer to kvm instance * @log: slot id and address to which we copy the log * @is_dirty: flag set if any page is dirty @@ -1176,37 +1176,114 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, return -ENOENT; n = kvm_dirty_bitmap_bytes(memslot); + *flush = false; + if (kvm->manual_dirty_log_protect) { + /* + * Unlike kvm_get_dirty_log, we always return false in *flush, + * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There + * is some code duplication between this function and + * kvm_get_dirty_log, but hopefully all architecture + * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log + * can be eliminated. + */ + dirty_bitmap_buffer = dirty_bitmap; + } else { + dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); + memset(dirty_bitmap_buffer, 0, n); - dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); - memset(dirty_bitmap_buffer, 0, n); + spin_lock(&kvm->mmu_lock); + for (i = 0; i < n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; - spin_lock(&kvm->mmu_lock); + if (!dirty_bitmap[i]) + continue; + + *flush = true; + mask = xchg(&dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; + + if (mask) { + offset = i * BITS_PER_LONG; + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, + offset, mask); + } + } + spin_unlock(&kvm->mmu_lock); + } + + if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); + +/** + * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap + * and reenable dirty page tracking for the corresponding pages. + * @kvm: pointer to kvm instance + * @log: slot id and address from which to fetch the bitmap of dirty pages + */ +int kvm_clear_dirty_log_protect(struct kvm *kvm, + struct kvm_clear_dirty_log *log, bool *flush) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int as_id, id, n; + gfn_t offset; + unsigned long i; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + + as_id = log->slot >> 16; + id = (u16)log->slot; + if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + return -EINVAL; + + if ((log->first_page & 63) || (log->num_pages & 63)) + return -EINVAL; + + slots = __kvm_memslots(kvm, as_id); + memslot = id_to_memslot(slots, id); + + dirty_bitmap = memslot->dirty_bitmap; + if (!dirty_bitmap) + return -ENOENT; + + n = kvm_dirty_bitmap_bytes(memslot); *flush = false; - for (i = 0; i < n / sizeof(long); i++) { - unsigned long mask; - gfn_t offset; + dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); + if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) + return -EFAULT; - if (!dirty_bitmap[i]) + spin_lock(&kvm->mmu_lock); + for (offset = log->first_page, + i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--; + i++, offset += BITS_PER_LONG) { + unsigned long mask = *dirty_bitmap_buffer++; + atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; + if (!mask) continue; - *flush = true; - - mask = xchg(&dirty_bitmap[i], 0); - dirty_bitmap_buffer[i] = mask; + mask &= atomic_long_fetch_andnot(mask, p); + /* + * mask contains the bits that really have been cleared. This + * never includes any bits beyond the length of the memslot (if + * the length is not aligned to 64 pages), therefore it is not + * a problem if userspace sets them in log->dirty_bitmap. + */ if (mask) { - offset = i * BITS_PER_LONG; + *flush = true; kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } } - spin_unlock(&kvm->mmu_lock); - if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) - return -EFAULT; + return 0; } -EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); +EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect); #endif bool kvm_largepages_enabled(void) @@ -2949,6 +3026,9 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: +#endif return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -2982,6 +3062,13 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, struct kvm_enable_cap *cap) { switch (cap->cap) { +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: + if (cap->flags || (cap->args[0] & ~1)) + return -EINVAL; + kvm->manual_dirty_log_protect = cap->args[0]; + return 0; +#endif default: return kvm_vm_ioctl_enable_cap(kvm, cap); } @@ -3029,6 +3116,17 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; } +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CLEAR_DIRTY_LOG: { + struct kvm_clear_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&log, argp, sizeof(log))) + goto out; + r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); + break; + } +#endif #ifdef CONFIG_KVM_MMIO case KVM_REGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; -- cgit v1.2.3-59-g8ed1b