diff options
Diffstat (limited to 'Documentation/virtual')
-rw-r--r-- | Documentation/virtual/kvm/00-INDEX | 2 | ||||
-rw-r--r-- | Documentation/virtual/kvm/api.txt | 222 | ||||
-rw-r--r-- | Documentation/virtual/kvm/devices/arm-vgic-v3.txt | 11 | ||||
-rw-r--r-- | Documentation/virtual/kvm/halt-polling.txt | 127 | ||||
-rw-r--r-- | Documentation/virtual/kvm/hypercalls.txt | 35 | ||||
-rw-r--r-- | Documentation/virtual/kvm/locking.txt | 39 | ||||
-rw-r--r-- | Documentation/virtual/kvm/msr.txt | 9 | ||||
-rw-r--r-- | Documentation/virtual/kvm/review-checklist.txt | 4 | ||||
-rw-r--r-- | Documentation/virtual/uml/UserModeLinux-HOWTO.txt | 6 |
9 files changed, 431 insertions, 24 deletions
diff --git a/Documentation/virtual/kvm/00-INDEX b/Documentation/virtual/kvm/00-INDEX index fee9f2bf9c64..69fe1a8b7ad1 100644 --- a/Documentation/virtual/kvm/00-INDEX +++ b/Documentation/virtual/kvm/00-INDEX @@ -6,6 +6,8 @@ cpuid.txt - KVM-specific cpuid leaves (x86). devices/ - KVM_CAP_DEVICE_CTRL userspace API. +halt-polling.txt + - notes on halt-polling hypercalls.txt - KVM hypercalls. locking.txt diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6bbceb9a3a19..069450938b79 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2034,6 +2034,8 @@ registers, find a list below: PPC | KVM_REG_PPC_WORT | 64 PPC | KVM_REG_PPC_SPRG9 | 64 PPC | KVM_REG_PPC_DBSR | 32 + PPC | KVM_REG_PPC_TIDR | 64 + PPC | KVM_REG_PPC_PSSCR | 64 PPC | KVM_REG_PPC_TM_GPR0 | 64 ... PPC | KVM_REG_PPC_TM_GPR31 | 64 @@ -2050,6 +2052,7 @@ registers, find a list below: PPC | KVM_REG_PPC_TM_VSCR | 32 PPC | KVM_REG_PPC_TM_DSCR | 64 PPC | KVM_REG_PPC_TM_TAR | 64 + PPC | KVM_REG_PPC_TM_XER | 64 | | MIPS | KVM_REG_MIPS_R0 | 64 ... @@ -2058,6 +2061,8 @@ registers, find a list below: MIPS | KVM_REG_MIPS_LO | 64 MIPS | KVM_REG_MIPS_PC | 64 MIPS | KVM_REG_MIPS_CP0_INDEX | 32 + MIPS | KVM_REG_MIPS_CP0_ENTRYLO0 | 64 + MIPS | KVM_REG_MIPS_CP0_ENTRYLO1 | 64 MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64 MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64 MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 @@ -2068,9 +2073,11 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64 MIPS | KVM_REG_MIPS_CP0_COMPARE | 32 MIPS | KVM_REG_MIPS_CP0_STATUS | 32 + MIPS | KVM_REG_MIPS_CP0_INTCTL | 32 MIPS | KVM_REG_MIPS_CP0_CAUSE | 32 MIPS | KVM_REG_MIPS_CP0_EPC | 64 MIPS | KVM_REG_MIPS_CP0_PRID | 32 + MIPS | KVM_REG_MIPS_CP0_EBASE | 64 MIPS | KVM_REG_MIPS_CP0_CONFIG | 32 MIPS | KVM_REG_MIPS_CP0_CONFIG1 | 32 MIPS | KVM_REG_MIPS_CP0_CONFIG2 | 32 @@ -2145,6 +2152,12 @@ patterns depending on whether they're 32-bit or 64-bit registers: 0x7020 0000 0001 00 <reg:5> <sel:3> (32-bit) 0x7030 0000 0001 00 <reg:5> <sel:3> (64-bit) +Note: KVM_REG_MIPS_CP0_ENTRYLO0 and KVM_REG_MIPS_CP0_ENTRYLO1 are the MIPS64 +versions of the EntryLo registers regardless of the word size of the host +hardware, host kernel, guest, and whether XPA is present in the guest, i.e. +with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and +the PFNX field starting at bit 30. + MIPS KVM control registers (see above) have the following id bit patterns: 0x7030 0000 0002 <reg:16> @@ -2209,7 +2222,7 @@ after pausing the vcpu, but before it is resumed. 4.71 KVM_SIGNAL_MSI Capability: KVM_CAP_SIGNAL_MSI -Architectures: x86 arm64 +Architectures: x86 arm arm64 Type: vm ioctl Parameters: struct kvm_msi (in) Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error @@ -2440,18 +2453,20 @@ are, it will do nothing and return an EBUSY error. The parameter is a pointer to a 32-bit unsigned integer variable containing the order (log base 2) of the desired size of the hash table, which must be between 18 and 46. On successful return from the -ioctl, it will have been updated with the order of the hash table that -was allocated. +ioctl, the value will not be changed by the kernel. If no hash table has been allocated when any vcpu is asked to run (with the KVM_RUN ioctl), the host kernel will allocate a default-sized hash table (16 MB). If this ioctl is called when a hash table has already been allocated, -the kernel will clear out the existing hash table (zero all HPTEs) and -return the hash table order in the parameter. (If the guest is using -the virtualized real-mode area (VRMA) facility, the kernel will -re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) +with a different order from the existing hash table, the existing hash +table will be freed and a new one allocated. If this is ioctl is +called when a hash table has already been allocated of the same order +as specified, the kernel will clear out the existing hash table (zero +all HPTEs). In either case, if the guest is using the virtualized +real-mode area (VRMA) facility, the kernel will re-create the VMRA +HPTEs on the next KVM_RUN of any vcpu. 4.77 KVM_S390_INTERRUPT @@ -3174,7 +3189,7 @@ of IOMMU pages. The rest of functionality is identical to KVM_CREATE_SPAPR_TCE. -4.98 KVM_REINJECT_CONTROL +4.99 KVM_REINJECT_CONTROL Capability: KVM_CAP_REINJECT_CONTROL Architectures: x86 @@ -3198,6 +3213,166 @@ struct kvm_reinject_control { pit_reinject = 0 (!reinject mode) is recommended, unless running an old operating system that uses the PIT for timing (e.g. Linux 2.4.x). +4.100 KVM_PPC_CONFIGURE_V3_MMU + +Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_mmuv3_cfg (in) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read, + -EINVAL if the configuration is invalid + +This ioctl controls whether the guest will use radix or HPT (hashed +page table) translation, and sets the pointer to the process table for +the guest. + +struct kvm_ppc_mmuv3_cfg { + __u64 flags; + __u64 process_table; +}; + +There are two bits that can be set in flags; KVM_PPC_MMUV3_RADIX and +KVM_PPC_MMUV3_GTSE. KVM_PPC_MMUV3_RADIX, if set, configures the guest +to use radix tree translation, and if clear, to use HPT translation. +KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest +to be able to use the global TLB and SLB invalidation instructions; +if clear, the guest may not use these instructions. + +The process_table field specifies the address and size of the guest +process table, which is in the guest's space. This field is formatted +as the second doubleword of the partition table entry, as defined in +the Power ISA V3.00, Book III section 5.7.6.1. + +4.101 KVM_PPC_GET_RMMU_INFO + +Capability: KVM_CAP_PPC_RADIX_MMU +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_rmmu_info (out) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_rmmu_info cannot be written, + -EINVAL if no useful information can be returned + +This ioctl returns a structure containing two things: (a) a list +containing supported radix tree geometries, and (b) a list that maps +page sizes to put in the "AP" (actual page size) field for the tlbie +(TLB invalidate entry) instruction. + +struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; +}; + +The geometries[] field gives up to 8 supported geometries for the +radix page table, in terms of the log base 2 of the smallest page +size, and the number of bits indexed at each level of the tree, from +the PTE level up to the PGD level in that order. Any unused entries +will have 0 in the page_shift field. + +The ap_encodings gives the supported page sizes and their AP field +encodings, encoded with the AP value in the top 3 bits and the log +base 2 of the page size in the bottom 6 bits. + +4.102 KVM_PPC_RESIZE_HPT_PREPARE + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + >0 if a new HPT is being prepared, the value is an estimated + number of milliseconds until preparation is complete + -EFAULT if struct kvm_reinject_control cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENOMEM if unable to allocate the new HPT + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this starts, stops or monitors +the preparation of a new potential HPT for the guest, essentially +implementing the H_RESIZE_HPT_PREPARE hypercall. + +If called with shift > 0 when there is no pending HPT for the guest, +this begins preparation of a new pending HPT of size 2^(shift) bytes. +It then returns a positive integer with the estimated number of +milliseconds until preparation is complete. + +If called when there is a pending HPT whose size does not match that +requested in the parameters, discards the existing pending HPT and +creates a new one as above. + +If called when there is a pending HPT of the size requested, will: + * If preparation of the pending HPT is already complete, return 0 + * If preparation of the pending HPT has failed, return an error + code, then discard the pending HPT. + * If preparation of the pending HPT is still in progress, return an + estimated number of milliseconds until preparation is complete. + +If called with shift == 0, discards any currently pending HPT and +returns 0 (i.e. cancels any in-progress preparation). + +flags is reserved for future expansion, currently setting any bits in +flags will result in an -EINVAL. + +Normally this will be called repeatedly with the same parameters until +it returns <= 0. The first call will initiate preparation, subsequent +ones will monitor preparation until it completes or fails. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + +4.103 KVM_PPC_RESIZE_HPT_COMMIT + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + -EFAULT if struct kvm_reinject_control cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENXIO is there is no pending HPT, or the pending HPT doesn't + have the requested size + -EBUSY if the pending HPT is not fully prepared + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this requests that the guest be +transferred to working with the new HPT, essentially implementing the +H_RESIZE_HPT_COMMIT hypercall. + +This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has +returned 0 with the same parameters. In other cases +KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or +-EBUSY, though others may be possible if the preparation was started, +but failed). + +This will have undefined effects on the guest if it has not already +placed itself in a quiescent state where no vcpu will make MMU enabled +memory accesses. + +On succsful completion, the pending HPT will become the guest's active +HPT and the previous HPT will be discarded. + +On failure, the guest will still be operating on its previous HPT. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + 5. The kvm_run structure ------------------------ @@ -3214,7 +3389,18 @@ struct kvm_run { Request that KVM_RUN return when it becomes possible to inject external interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. - __u8 padding1[7]; + __u8 immediate_exit; + +This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN +exits immediately, returning -EINTR. In the common scenario where a +signal is used to "kick" a VCPU out of KVM_RUN, this field can be used +to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability. +Rather than blocking the signal outside KVM_RUN, userspace can set up +a signal handler that sets run->immediate_exit to a non-zero value. + +This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available. + + __u8 padding1[6]; /* out */ __u32 exit_reason; @@ -3939,3 +4125,21 @@ In order to use SynIC, it has to be activated by setting this capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this will disable the use of APIC hardware virtualization even if supported by the CPU, as it's incompatible with SynIC auto-EOI behavior. + +8.3 KVM_CAP_PPC_RADIX_MMU + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel can support guests using the +radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 +processor). + +8.4 KVM_CAP_PPC_HASH_MMU_V3 + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel can support guests using the +hashed page table MMU defined in Power ISA V3.00 (as implemented in +the POWER9 processor), including in-memory segment tables. diff --git a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt index 9348b3caccd7..c1a24612c198 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt @@ -118,7 +118,7 @@ Groups: -EBUSY: One or more VCPUs are running - KVM_DEV_ARM_VGIC_CPU_SYSREGS + KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS Attributes: The attr field of kvm_device_attr encodes two values: bits: | 63 .... 32 | 31 .... 16 | 15 .... 0 | @@ -139,13 +139,15 @@ Groups: All system regs accessed through this API are (rw, 64-bit) and kvm_device_attr.addr points to a __u64 value. - KVM_DEV_ARM_VGIC_CPU_SYSREGS accesses the CPU interface registers for the + KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS accesses the CPU interface registers for the CPU specified by the mpidr field. + CPU interface registers access is not implemented for AArch32 mode. + Error -ENXIO is returned when accessed in AArch32 mode. Errors: -ENXIO: Getting or setting this register is not yet supported -EBUSY: VCPU is running - -EINVAL: Invalid mpidr supplied + -EINVAL: Invalid mpidr or register value supplied KVM_DEV_ARM_VGIC_GRP_NR_IRQS @@ -204,3 +206,6 @@ Groups: architecture defined MPIDR, and the field is encoded as follows: | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | | Aff3 | Aff2 | Aff1 | Aff0 | + Errors: + -EINVAL: vINTID is not multiple of 32 or + info field is not VGIC_LEVEL_INFO_LINE_LEVEL diff --git a/Documentation/virtual/kvm/halt-polling.txt b/Documentation/virtual/kvm/halt-polling.txt new file mode 100644 index 000000000000..4a8418318769 --- /dev/null +++ b/Documentation/virtual/kvm/halt-polling.txt @@ -0,0 +1,127 @@ +The KVM halt polling system +=========================== + +The KVM halt polling system provides a feature within KVM whereby the latency +of a guest can, under some circumstances, be reduced by polling in the host +for some time period after the guest has elected to no longer run by cedeing. +That is, when a guest vcpu has ceded, or in the case of powerpc when all of the +vcpus of a single vcore have ceded, the host kernel polls for wakeup conditions +before giving up the cpu to the scheduler in order to let something else run. + +Polling provides a latency advantage in cases where the guest can be run again +very quickly by at least saving us a trip through the scheduler, normally on +the order of a few micro-seconds, although performance benefits are workload +dependant. In the event that no wakeup source arrives during the polling +interval or some other task on the runqueue is runnable the scheduler is +invoked. Thus halt polling is especially useful on workloads with very short +wakeup periods where the time spent halt polling is minimised and the time +savings of not invoking the scheduler are distinguishable. + +The generic halt polling code is implemented in: + + virt/kvm/kvm_main.c: kvm_vcpu_block() + +The powerpc kvm-hv specific case is implemented in: + + arch/powerpc/kvm/book3s_hv.c: kvmppc_vcore_blocked() + +Halt Polling Interval +===================== + +The maximum time for which to poll before invoking the scheduler, referred to +as the halt polling interval, is increased and decreased based on the perceived +effectiveness of the polling in an attempt to limit pointless polling. +This value is stored in either the vcpu struct: + + kvm_vcpu->halt_poll_ns + +or in the case of powerpc kvm-hv, in the vcore struct: + + kvmppc_vcore->halt_poll_ns + +Thus this is a per vcpu (or vcore) value. + +During polling if a wakeup source is received within the halt polling interval, +the interval is left unchanged. In the event that a wakeup source isn't +received during the polling interval (and thus schedule is invoked) there are +two options, either the polling interval and total block time[0] were less than +the global max polling interval (see module params below), or the total block +time was greater than the global max polling interval. + +In the event that both the polling interval and total block time were less than +the global max polling interval then the polling interval can be increased in +the hope that next time during the longer polling interval the wake up source +will be received while the host is polling and the latency benefits will be +received. The polling interval is grown in the function grow_halt_poll_ns() and +is multiplied by the module parameter halt_poll_ns_grow. + +In the event that the total block time was greater than the global max polling +interval then the host will never poll for long enough (limited by the global +max) to wakeup during the polling interval so it may as well be shrunk in order +to avoid pointless polling. The polling interval is shrunk in the function +shrink_halt_poll_ns() and is divided by the module parameter +halt_poll_ns_shrink, or set to 0 iff halt_poll_ns_shrink == 0. + +It is worth noting that this adjustment process attempts to hone in on some +steady state polling interval but will only really do a good job for wakeups +which come at an approximately constant rate, otherwise there will be constant +adjustment of the polling interval. + +[0] total block time: the time between when the halt polling function is + invoked and a wakeup source received (irrespective of + whether the scheduler is invoked within that function). + +Module Parameters +================= + +The kvm module has 3 tuneable module parameters to adjust the global max +polling interval as well as the rate at which the polling interval is grown and +shrunk. These variables are defined in include/linux/kvm_host.h and as module +parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the +powerpc kvm-hv case. + +Module Parameter | Description | Default Value +-------------------------------------------------------------------------------- +halt_poll_ns | The global max polling interval | KVM_HALT_POLL_NS_DEFAULT + | which defines the ceiling value | + | of the polling interval for | (per arch value) + | each vcpu. | +-------------------------------------------------------------------------------- +halt_poll_ns_grow | The value by which the halt | 2 + | polling interval is multiplied | + | in the grow_halt_poll_ns() | + | function. | +-------------------------------------------------------------------------------- +halt_poll_ns_shrink | The value by which the halt | 0 + | polling interval is divided in | + | the shrink_halt_poll_ns() | + | function. | +-------------------------------------------------------------------------------- + +These module parameters can be set from the debugfs files in: + + /sys/module/kvm/parameters/ + +Note: that these module parameters are system wide values and are not able to + be tuned on a per vm basis. + +Further Notes +============= + +- Care should be taken when setting the halt_poll_ns module parameter as a +large value has the potential to drive the cpu usage to 100% on a machine which +would be almost entirely idle otherwise. This is because even if a guest has +wakeups during which very little work is done and which are quite far apart, if +the period is shorter than the global max polling interval (halt_poll_ns) then +the host will always poll for the entire block time and thus cpu utilisation +will go to 100%. + +- Halt polling essentially presents a trade off between power usage and latency +and the module parameters should be used to tune the affinity for this. Idle +cpu time is essentially converted to host kernel time with the aim of decreasing +latency when entering the guest. + +- Halt polling will only be conducted by the host when no other tasks are +runnable on that cpu, otherwise the polling will cease immediately and +schedule will be invoked to allow that other task to run. Thus this doesn't +allow a guest to denial of service the cpu. diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt index c8d040e27046..feaaa634f154 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -81,3 +81,38 @@ the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall, specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) is used in the hypercall for future use. + + +6. KVM_HC_CLOCK_PAIRING +------------------------ +Architecture: x86 +Status: active +Purpose: Hypercall used to synchronize host and guest clocks. +Usage: + +a0: guest physical address where host copies +"struct kvm_clock_offset" structure. + +a1: clock_type, ATM only KVM_CLOCK_PAIRING_WALLCLOCK (0) +is supported (corresponding to the host's CLOCK_REALTIME clock). + + struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; + }; + + Where: + * sec: seconds from clock_type clock. + * nsec: nanoseconds from clock_type clock. + * tsc: guest TSC value used to calculate sec/nsec pair + * flags: flags, unused (0) at the moment. + +The hypercall lets a guest compute a precise timestamp across +host and guest. The guest can use the returned TSC value to +compute the CLOCK_REALTIME for its clock, at the same instant. + +Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, +or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index e5dd9f4d6100..1bb8bcaf8497 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -13,8 +13,12 @@ The acquisition orders for mutexes are as follows: - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring them together is quite rare. -For spinlocks, kvm_lock is taken outside kvm->mmu_lock. Everything -else is a leaf: no other lock is taken inside the critical sections. +On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. + +For spinlocks, kvm_lock is taken outside kvm->mmu_lock. + +Everything else is a leaf: no other lock is taken inside the critical +sections. 2: Exception ------------ @@ -22,9 +26,16 @@ else is a leaf: no other lock is taken inside the critical sections. Fast page fault: Fast page fault is the fast path which fixes the guest page fault out of -the mmu-lock on x86. Currently, the page fault can be fast only if the -shadow page table is present and it is caused by write-protect, that means -we just need change the W bit of the spte. +the mmu-lock on x86. Currently, the page fault can be fast in one of the +following two cases: + +1. Access Tracking: The SPTE is not present, but it is marked for access +tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to +restore the saved R/X bits. This is described in more detail later below. + +2. Write-Protection: The SPTE is present and the fault is +caused by write-protect. That means we just need to change the W bit of the +spte. What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and SPTE_MMU_WRITEABLE bit on the spte: @@ -34,7 +45,8 @@ SPTE_MMU_WRITEABLE bit on the spte: page write-protection. On fast page fault path, we will use cmpxchg to atomically set the spte W -bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, this +bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or +restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This is safe because whenever changing these bits can be detected by cmpxchg. But we need carefully check these cases: @@ -138,6 +150,21 @@ Since the spte is "volatile" if it can be updated out of mmu-lock, we always atomically update the spte, the race caused by fast page fault can be avoided, See the comments in spte_has_volatile_bits() and mmu_spte_update(). +Lockless Access Tracking: + +This is used for Intel CPUs that are using EPT but do not support the EPT A/D +bits. In this case, when the KVM MMU notifier is called to track accesses to a +page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present +by clearing the RWX bits in the PTE and storing the original R & X bits in +some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the +PTE (using the ignored bit 62). When the VM tries to access the page later on, +a fault is generated and the fast page fault mechanism described above is used +to atomically restore the PTE to a Present state. The W bit is not saved when +the PTE is marked for access tracking and during restoration to the Present +state, the W bit is set depending on whether or not it was a write access. If +it wasn't, then the W bit will remain clear until a write access happens, at +which time it will be set using the Dirty tracking mechanism described above. + 3. Reference ------------ diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt index 2a71c8f29f68..0a9ea515512a 100644 --- a/Documentation/virtual/kvm/msr.txt +++ b/Documentation/virtual/kvm/msr.txt @@ -208,7 +208,9 @@ MSR_KVM_STEAL_TIME: 0x4b564d03 __u64 steal; __u32 version; __u32 flags; - __u32 pad[12]; + __u8 preempted; + __u8 u8_pad[3]; + __u32 pad[11]; } whose data will be filled in by the hypervisor periodically. Only one @@ -232,6 +234,11 @@ MSR_KVM_STEAL_TIME: 0x4b564d03 nanoseconds. Time during which the vcpu is idle, will not be reported as steal time. + preempted: indicate the vCPU who owns this struct is running or + not. Non-zero values mean the vCPU has been preempted. Zero + means the vCPU is not preempted. NOTE, it is always zero if the + the hypervisor doesn't support this field. + MSR_KVM_EOI_EN: 0x4b564d04 data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0 when disabled. Bit 1 is reserved and must be zero. When PV end of diff --git a/Documentation/virtual/kvm/review-checklist.txt b/Documentation/virtual/kvm/review-checklist.txt index a850986ed684..a83b27635fdd 100644 --- a/Documentation/virtual/kvm/review-checklist.txt +++ b/Documentation/virtual/kvm/review-checklist.txt @@ -1,8 +1,8 @@ Review checklist for kvm patches ================================ -1. The patch must follow Documentation/CodingStyle and - Documentation/SubmittingPatches. +1. The patch must follow Documentation/process/coding-style.rst and + Documentation/process/submitting-patches.rst. 2. Patches should be against kvm.git master branch. diff --git a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt index f4099ca6b483..87b80f589e1c 100644 --- a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt +++ b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt @@ -2401,9 +2401,9 @@ This takes one argument, which is a single letter. It calls the generic kernel's SysRq driver, which does whatever is called for by - that argument. See the SysRq documentation in Documentation/sysrq.txt - in your favorite kernel tree to see what letters are valid and what - they do. + that argument. See the SysRq documentation in + Documentation/admin-guide/sysrq.rst in your favorite kernel tree to + see what letters are valid and what they do. |