aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/Kconfig30
-rw-r--r--arch/x86_64/defconfig45
-rw-r--r--arch/x86_64/ia32/ia32_binfmt.c30
-rw-r--r--arch/x86_64/ia32/ia32_signal.c11
-rw-r--r--arch/x86_64/ia32/ia32entry.S1
-rw-r--r--arch/x86_64/kernel/Makefile6
-rw-r--r--arch/x86_64/kernel/acpi/sleep.c2
-rw-r--r--arch/x86_64/kernel/apic.c5
-rw-r--r--arch/x86_64/kernel/e820.c38
-rw-r--r--arch/x86_64/kernel/head.S20
-rw-r--r--arch/x86_64/kernel/hpet.c511
-rw-r--r--arch/x86_64/kernel/i8259.c1
-rw-r--r--arch/x86_64/kernel/io_apic.c28
-rw-r--r--arch/x86_64/kernel/ioport.c2
-rw-r--r--arch/x86_64/kernel/irq.c12
-rw-r--r--arch/x86_64/kernel/mce.c66
-rw-r--r--arch/x86_64/kernel/mce_amd.c44
-rw-r--r--arch/x86_64/kernel/nmi.c75
-rw-r--r--arch/x86_64/kernel/pci-calgary.c17
-rw-r--r--arch/x86_64/kernel/pci-dma.c28
-rw-r--r--arch/x86_64/kernel/pci-gart.c4
-rw-r--r--arch/x86_64/kernel/pmtimer.c58
-rw-r--r--arch/x86_64/kernel/ptrace.c8
-rw-r--r--arch/x86_64/kernel/setup.c169
-rw-r--r--arch/x86_64/kernel/setup64.c1
-rw-r--r--arch/x86_64/kernel/smpboot.c231
-rw-r--r--arch/x86_64/kernel/stacktrace.c5
-rw-r--r--arch/x86_64/kernel/time.c955
-rw-r--r--arch/x86_64/kernel/tsc.c226
-rw-r--r--arch/x86_64/kernel/tsc_sync.c187
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S28
-rw-r--r--arch/x86_64/kernel/vsyscall.c127
-rw-r--r--arch/x86_64/kernel/x8664_ksyms.c5
-rw-r--r--arch/x86_64/lib/Makefile2
-rw-r--r--arch/x86_64/lib/copy_user_nocache.S217
-rw-r--r--arch/x86_64/mm/fault.c18
-rw-r--r--arch/x86_64/mm/init.c24
-rw-r--r--arch/x86_64/mm/numa.c202
-rw-r--r--arch/x86_64/mm/pageattr.c4
-rw-r--r--arch/x86_64/pci/Makefile3
-rw-r--r--arch/x86_64/pci/mmconfig.c118
41 files changed, 1836 insertions, 1728 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 02dd39457bcf..56eb14c98475 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -24,6 +24,14 @@ config X86
bool
default y
+config GENERIC_TIME
+ bool
+ default y
+
+config GENERIC_TIME_VSYSCALL
+ bool
+ default y
+
config ZONE_DMA32
bool
default y
@@ -152,18 +160,18 @@ config MPSC
Optimize for Intel Pentium 4 and older Nocona/Dempsey Xeon CPUs
with Intel Extended Memory 64 Technology(EM64T). For details see
<http://www.intel.com/technology/64bitextensions/>.
- Note the the latest Xeons (Xeon 51xx and 53xx) are not based on the
- Netburst core and shouldn't use this option. You can distingush them
+ Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the
+ Netburst core and shouldn't use this option. You can distinguish them
using the cpu family field
- in /proc/cpuinfo. Family 15 is a older Xeon, Family 6 a newer one
- (this rule only applies to system that support EM64T)
+ in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one
+ (this rule only applies to systems that support EM64T)
config MCORE2
bool "Intel Core2 / newer Xeon"
help
Optimize for Intel Core2 and newer Xeons (51xx)
- You can distingush the newer Xeons from the older ones using
- the cpu family field in /proc/cpuinfo. 15 is a older Xeon
+ You can distinguish the newer Xeons from the older ones using
+ the cpu family field in /proc/cpuinfo. 15 is an older Xeon
(use CONFIG_MPSC then), 6 is a newer one. This rule only
applies to CPUs that support EM64T.
@@ -458,8 +466,8 @@ config IOMMU
on systems with more than 3GB. This is usually needed for USB,
sound, many IDE/SATA chipsets and some other devices.
Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART
- based IOMMU and a software bounce buffer based IOMMU used on Intel
- systems and as fallback.
+ based hardware IOMMU and a software bounce buffer based IOMMU used
+ on Intel systems and as fallback.
The code is only active when needed (enough memory and limited
device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified
too.
@@ -496,6 +504,12 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
bool
+ help
+ Support for software bounce buffers used on x86-64 systems
+ which don't have a hardware IOMMU (e.g. the current generation
+ of Intel's x86-64 CPUs). Using this PCI devices which can only
+ access 32-bits of memory can be used on systems with more than
+ 3 GB of memory. If unsure, say Y.
config X86_MCE
bool "Machine check support" if EMBEDDED
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 69584c295305..293a4a4c609e 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.20-rc3
-# Fri Jan 5 11:54:41 2007
+# Linux kernel version: 2.6.20-git8
+# Tue Feb 13 11:25:16 2007
#
CONFIG_X86_64=y
CONFIG_64BIT=y
@@ -11,6 +11,7 @@ CONFIG_LOCKDEP_SUPPORT=y
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_SEMAPHORE_SLEEPERS=y
CONFIG_MMU=y
+CONFIG_ZONE_DMA=y
CONFIG_RWSEM_GENERIC_SPINLOCK=y
CONFIG_GENERIC_HWEIGHT=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
@@ -153,6 +154,7 @@ CONFIG_NEED_MULTIPLE_NODES=y
CONFIG_SPLIT_PTLOCK_CPUS=4
CONFIG_MIGRATION=y
CONFIG_RESOURCES_64BIT=y
+CONFIG_ZONE_DMA_FLAG=1
CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y
CONFIG_NR_CPUS=32
@@ -201,13 +203,14 @@ CONFIG_ACPI=y
CONFIG_ACPI_SLEEP=y
CONFIG_ACPI_SLEEP_PROC_FS=y
CONFIG_ACPI_SLEEP_PROC_SLEEP=y
+CONFIG_ACPI_PROCFS=y
CONFIG_ACPI_AC=y
CONFIG_ACPI_BATTERY=y
CONFIG_ACPI_BUTTON=y
-# CONFIG_ACPI_VIDEO is not set
# CONFIG_ACPI_HOTKEY is not set
CONFIG_ACPI_FAN=y
# CONFIG_ACPI_DOCK is not set
+# CONFIG_ACPI_BAY is not set
CONFIG_ACPI_PROCESSOR=y
CONFIG_ACPI_HOTPLUG_CPU=y
CONFIG_ACPI_THERMAL=y
@@ -263,7 +266,6 @@ CONFIG_PCI_MMCONFIG=y
CONFIG_PCIEPORTBUS=y
CONFIG_PCIEAER=y
CONFIG_PCI_MSI=y
-# CONFIG_PCI_MULTITHREAD_PROBE is not set
# CONFIG_PCI_DEBUG is not set
# CONFIG_HT_IRQ is not set
@@ -398,6 +400,7 @@ CONFIG_STANDALONE=y
CONFIG_PREVENT_FIRMWARE_BUILD=y
CONFIG_FW_LOADER=y
# CONFIG_DEBUG_DRIVER is not set
+# CONFIG_DEBUG_DEVRES is not set
# CONFIG_SYS_HYPERVISOR is not set
#
@@ -466,6 +469,7 @@ CONFIG_BLK_DEV_IDECD=y
# CONFIG_BLK_DEV_IDETAPE is not set
# CONFIG_BLK_DEV_IDEFLOPPY is not set
# CONFIG_BLK_DEV_IDESCSI is not set
+CONFIG_BLK_DEV_IDEACPI=y
# CONFIG_IDE_TASK_IOCTL is not set
#
@@ -497,6 +501,7 @@ CONFIG_BLK_DEV_ATIIXP=y
# CONFIG_BLK_DEV_JMICRON is not set
# CONFIG_BLK_DEV_SC1200 is not set
CONFIG_BLK_DEV_PIIX=y
+# CONFIG_BLK_DEV_IT8213 is not set
# CONFIG_BLK_DEV_IT821X is not set
# CONFIG_BLK_DEV_NS87415 is not set
# CONFIG_BLK_DEV_PDC202XX_OLD is not set
@@ -507,6 +512,7 @@ CONFIG_BLK_DEV_PDC202XX_NEW=y
# CONFIG_BLK_DEV_SLC90E66 is not set
# CONFIG_BLK_DEV_TRM290 is not set
# CONFIG_BLK_DEV_VIA82CXXX is not set
+# CONFIG_BLK_DEV_TC86C001 is not set
# CONFIG_IDE_ARM is not set
CONFIG_BLK_DEV_IDEDMA=y
# CONFIG_IDEDMA_IVB is not set
@@ -599,6 +605,7 @@ CONFIG_MEGARAID_SAS=y
# Serial ATA (prod) and Parallel ATA (experimental) drivers
#
CONFIG_ATA=y
+# CONFIG_ATA_NONSTANDARD is not set
CONFIG_SATA_AHCI=y
CONFIG_SATA_SVW=y
CONFIG_ATA_PIIX=y
@@ -614,6 +621,7 @@ CONFIG_SATA_SIL=y
# CONFIG_SATA_ULI is not set
CONFIG_SATA_VIA=y
# CONFIG_SATA_VITESSE is not set
+# CONFIG_SATA_INIC162X is not set
CONFIG_SATA_INTEL_COMBINED=y
# CONFIG_PATA_ALI is not set
# CONFIG_PATA_AMD is not set
@@ -630,6 +638,7 @@ CONFIG_SATA_INTEL_COMBINED=y
# CONFIG_PATA_HPT3X2N is not set
# CONFIG_PATA_HPT3X3 is not set
# CONFIG_PATA_IT821X is not set
+# CONFIG_PATA_IT8213 is not set
# CONFIG_PATA_JMICRON is not set
# CONFIG_PATA_TRIFLEX is not set
# CONFIG_PATA_MARVELL is not set
@@ -682,9 +691,7 @@ CONFIG_IEEE1394=y
# Subsystem Options
#
# CONFIG_IEEE1394_VERBOSEDEBUG is not set
-# CONFIG_IEEE1394_OUI_DB is not set
# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set
-# CONFIG_IEEE1394_EXPORT_FULL_API is not set
#
# Device Drivers
@@ -707,6 +714,11 @@ CONFIG_IEEE1394_RAWIO=y
# CONFIG_I2O is not set
#
+# Macintosh device drivers
+#
+# CONFIG_MAC_EMUMOUSEBTN is not set
+
+#
# Network device support
#
CONFIG_NETDEVICES=y
@@ -774,6 +786,7 @@ CONFIG_8139TOO=y
# CONFIG_EPIC100 is not set
# CONFIG_SUNDANCE is not set
# CONFIG_VIA_RHINE is not set
+# CONFIG_SC92031 is not set
#
# Ethernet (1000 Mbit)
@@ -795,11 +808,13 @@ CONFIG_E1000=y
CONFIG_TIGON3=y
CONFIG_BNX2=y
# CONFIG_QLA3XXX is not set
+# CONFIG_ATL1 is not set
#
# Ethernet (10000 Mbit)
#
# CONFIG_CHELSIO_T1 is not set
+# CONFIG_CHELSIO_T3 is not set
# CONFIG_IXGB is not set
CONFIG_S2IO=m
# CONFIG_S2IO_NAPI is not set
@@ -1115,6 +1130,7 @@ CONFIG_SOUND=y
# Open Sound System
#
CONFIG_SOUND_PRIME=y
+CONFIG_OBSOLETE_OSS=y
# CONFIG_SOUND_BT878 is not set
# CONFIG_SOUND_ES1371 is not set
CONFIG_SOUND_ICH=y
@@ -1128,6 +1144,7 @@ CONFIG_SOUND_ICH=y
# HID Devices
#
CONFIG_HID=y
+# CONFIG_HID_DEBUG is not set
#
# USB support
@@ -1142,10 +1159,8 @@ CONFIG_USB=y
# Miscellaneous USB options
#
CONFIG_USB_DEVICEFS=y
-# CONFIG_USB_BANDWIDTH is not set
# CONFIG_USB_DYNAMIC_MINORS is not set
# CONFIG_USB_SUSPEND is not set
-# CONFIG_USB_MULTITHREAD_PROBE is not set
# CONFIG_USB_OTG is not set
#
@@ -1155,9 +1170,11 @@ CONFIG_USB_EHCI_HCD=y
# CONFIG_USB_EHCI_SPLIT_ISO is not set
# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set
# CONFIG_USB_ISP116X_HCD is not set
CONFIG_USB_OHCI_HCD=y
-# CONFIG_USB_OHCI_BIG_ENDIAN is not set
+# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
+# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
CONFIG_USB_OHCI_LITTLE_ENDIAN=y
CONFIG_USB_UHCI_HCD=y
# CONFIG_USB_SL811_HCD is not set
@@ -1208,6 +1225,7 @@ CONFIG_USB_HID=y
# CONFIG_USB_ATI_REMOTE2 is not set
# CONFIG_USB_KEYSPAN_REMOTE is not set
# CONFIG_USB_APPLETOUCH is not set
+# CONFIG_USB_GTCO is not set
#
# USB Imaging devices
@@ -1313,6 +1331,10 @@ CONFIG_USB_MON=y
#
#
+# Auxiliary Display support
+#
+
+#
# Virtualization
#
# CONFIG_KVM is not set
@@ -1512,6 +1534,7 @@ CONFIG_UNUSED_SYMBOLS=y
CONFIG_DEBUG_FS=y
# CONFIG_HEADERS_CHECK is not set
CONFIG_DEBUG_KERNEL=y
+# CONFIG_DEBUG_SHIRQ is not set
CONFIG_LOG_BUF_SHIFT=18
CONFIG_DETECT_SOFTLOCKUP=y
# CONFIG_SCHEDSTATS is not set
@@ -1520,7 +1543,6 @@ CONFIG_DETECT_SOFTLOCKUP=y
# CONFIG_RT_MUTEX_TESTER is not set
# CONFIG_DEBUG_SPINLOCK is not set
# CONFIG_DEBUG_MUTEXES is not set
-# CONFIG_DEBUG_RWSEMS is not set
# CONFIG_DEBUG_LOCK_ALLOC is not set
# CONFIG_PROVE_LOCKING is not set
# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
@@ -1560,4 +1582,5 @@ CONFIG_CRC32=y
# CONFIG_LIBCRC32C is not set
CONFIG_ZLIB_INFLATE=y
CONFIG_PLIST=y
-CONFIG_IOMAP_COPY=y
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
index 6efe04f3cbca..071100ea1251 100644
--- a/arch/x86_64/ia32/ia32_binfmt.c
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -344,20 +344,30 @@ EXPORT_SYMBOL(ia32_setup_arg_pages);
#include <linux/sysctl.h>
static ctl_table abi_table2[] = {
- { 99, "vsyscall32", &sysctl_vsyscall32, sizeof(int), 0644, NULL,
- proc_dointvec },
- { 0, }
-};
+ {
+ .ctl_name = 99,
+ .procname = "vsyscall32",
+ .data = &sysctl_vsyscall32,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {}
+};
-static ctl_table abi_root_table2[] = {
- { .ctl_name = CTL_ABI, .procname = "abi", .mode = 0555,
- .child = abi_table2 },
- { 0 },
-};
+static ctl_table abi_root_table2[] = {
+ {
+ .ctl_name = CTL_ABI,
+ .procname = "abi",
+ .mode = 0555,
+ .child = abi_table2
+ },
+ {}
+};
static __init int ia32_binfmt_init(void)
{
- register_sysctl_table(abi_root_table2, 1);
+ register_sysctl_table(abi_root_table2);
return 0;
}
__initcall(ia32_binfmt_init);
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index ff499ef2a1ba..359eacc38509 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -21,6 +21,7 @@
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/compat.h>
+#include <linux/binfmts.h>
#include <asm/ucontext.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
@@ -449,7 +450,11 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
/* Return stub is in 32bit vsyscall page */
{
- void __user *restorer = VSYSCALL32_SIGRETURN;
+ void __user *restorer;
+ if (current->binfmt->hasvdso)
+ restorer = VSYSCALL32_SIGRETURN;
+ else
+ restorer = (void *)&frame->retcode;
if (ka->sa.sa_flags & SA_RESTORER)
restorer = ka->sa.sa_restorer;
err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
@@ -495,7 +500,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
ptrace_notify(SIGTRAP);
#if DEBUG_SIG
- printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+ printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
current->comm, current->pid, frame, regs->rip, frame->pretcode);
#endif
@@ -601,7 +606,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
ptrace_notify(SIGTRAP);
#if DEBUG_SIG
- printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+ printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
current->comm, current->pid, frame, regs->rip, frame->pretcode);
#endif
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5f32cf4de5fb..eda7a0d4dc15 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -718,4 +718,5 @@ ia32_sys_call_table:
.quad compat_sys_vmsplice
.quad compat_sys_move_pages
.quad sys_getcpu
+ .quad sys_epoll_pwait
ia32_syscall_end:
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 3c7cbff04d3d..bb47e86f3d02 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \
ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
x8664_ksyms.o i387.o syscall.o vsyscall.o \
setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
- pci-dma.o pci-nommu.o alternative.o
+ pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o
@@ -19,7 +19,7 @@ obj-$(CONFIG_ACPI) += acpi/
obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_MICROCODE) += microcode.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
-obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
+obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o
obj-y += apic.o nmi.o
obj-y += io_apic.o mpparse.o \
genapic.o genapic_cluster.o genapic_flat.o
@@ -43,6 +43,7 @@ obj-$(CONFIG_PCI) += early-quirks.o
obj-y += topology.o
obj-y += intel_cacheinfo.o
+obj-y += pcspeaker.o
CFLAGS_vsyscall.o := $(PROFILING) -g0
@@ -56,3 +57,4 @@ quirks-y += ../../i386/kernel/quirks.o
i8237-y += ../../i386/kernel/i8237.o
msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
alternative-y += ../../i386/kernel/alternative.o
+pcspeaker-y += ../../i386/kernel/pcspeaker.o
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
index 5ebf62c7a3d2..23178ce6c783 100644
--- a/arch/x86_64/kernel/acpi/sleep.c
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -58,7 +58,7 @@ unsigned long acpi_wakeup_address = 0;
unsigned long acpi_video_flags;
extern char wakeup_start, wakeup_end;
-extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
+extern unsigned long acpi_copy_wakeup_routine(unsigned long);
static pgd_t low_ptr;
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 124b2d27b4ac..723417d924c0 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -37,6 +37,7 @@
#include <asm/idle.h>
#include <asm/proto.h>
#include <asm/timex.h>
+#include <asm/hpet.h>
#include <asm/apic.h>
int apic_mapped;
@@ -763,7 +764,7 @@ static void setup_APIC_timer(unsigned int clocks)
local_irq_save(flags);
/* wait for irq slice */
- if (vxtime.hpet_address && hpet_use_timer) {
+ if (hpet_address && hpet_use_timer) {
int trigger = hpet_readl(HPET_T0_CMP);
while (hpet_readl(HPET_COUNTER) >= trigger)
/* do nothing */ ;
@@ -785,7 +786,7 @@ static void setup_APIC_timer(unsigned int clocks)
/* Turn off PIT interrupt if we use APIC timer as main timer.
Only works with the PM timer right now
TBD fix it for HPET too. */
- if (vxtime.mode == VXTIME_PMTMR &&
+ if ((pmtmr_ioport != 0) &&
smp_processor_id() == boot_cpu_id &&
apic_runs_main_timer == 1 &&
!cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) {
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 6fe191c58084..4651fd22b213 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -83,6 +83,13 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
return 1;
}
+#ifdef CONFIG_NUMA
+ /* NUMA memory to node map */
+ if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
+ *addrp = nodemap_addr + nodemap_size;
+ return 1;
+ }
+#endif
/* XXX ramdisk image here? */
return 0;
}
@@ -184,6 +191,37 @@ unsigned long __init e820_end_of_ram(void)
}
/*
+ * Find the hole size in the range.
+ */
+unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
+{
+ unsigned long ram = 0;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ unsigned long last, addr;
+
+ if (ei->type != E820_RAM ||
+ ei->addr+ei->size <= start ||
+ ei->addr >= end)
+ continue;
+
+ addr = round_up(ei->addr, PAGE_SIZE);
+ if (addr < start)
+ addr = start;
+
+ last = round_down(ei->addr + ei->size, PAGE_SIZE);
+ if (last >= end)
+ last = end;
+
+ if (last > addr)
+ ram += last - addr;
+ }
+ return ((end - start) - ram);
+}
+
+/*
* Mark e820 reserved areas as busy for the resource manager.
*/
void __init e820_reserve_resources(void)
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 1e6f80870679..598a4d0351fc 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -163,6 +163,20 @@ startup_64:
*/
lgdt cpu_gdt_descr
+ /* set up data segments. actually 0 would do too */
+ movl $__KERNEL_DS,%eax
+ movl %eax,%ds
+ movl %eax,%ss
+ movl %eax,%es
+
+ /*
+ * We don't really need to load %fs or %gs, but load them anyway
+ * to kill any stale realmode selectors. This allows execution
+ * under VT hardware.
+ */
+ movl %eax,%fs
+ movl %eax,%gs
+
/*
* Setup up a dummy PDA. this is just for some early bootup code
* that does in_interrupt()
@@ -173,12 +187,6 @@ startup_64:
shrq $32,%rdx
wrmsr
- /* set up data segments. actually 0 would do too */
- movl $__KERNEL_DS,%eax
- movl %eax,%ds
- movl %eax,%ss
- movl %eax,%es
-
/* esi is pointer to real mode structure with interesting info.
pass it to C */
movl %esi, %edi
diff --git a/arch/x86_64/kernel/hpet.c b/arch/x86_64/kernel/hpet.c
new file mode 100644
index 000000000000..65a0edd71a17
--- /dev/null
+++ b/arch/x86_64/kernel/hpet.c
@@ -0,0 +1,511 @@
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/mc146818rtc.h>
+#include <linux/time.h>
+#include <linux/clocksource.h>
+#include <linux/ioport.h>
+#include <linux/acpi.h>
+#include <linux/hpet.h>
+#include <asm/pgtable.h>
+#include <asm/vsyscall.h>
+#include <asm/timex.h>
+#include <asm/hpet.h>
+
+int nohpet __initdata;
+
+unsigned long hpet_address;
+unsigned long hpet_period; /* fsecs / HPET clock */
+unsigned long hpet_tick; /* HPET clocks / interrupt */
+
+int hpet_use_timer; /* Use counter of hpet for time keeping,
+ * otherwise PIT
+ */
+
+#ifdef CONFIG_HPET
+static __init int late_hpet_init(void)
+{
+ struct hpet_data hd;
+ unsigned int ntimer;
+
+ if (!hpet_address)
+ return 0;
+
+ memset(&hd, 0, sizeof(hd));
+
+ ntimer = hpet_readl(HPET_ID);
+ ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
+ ntimer++;
+
+ /*
+ * Register with driver.
+ * Timer0 and Timer1 is used by platform.
+ */
+ hd.hd_phys_address = hpet_address;
+ hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
+ hd.hd_nirqs = ntimer;
+ hd.hd_flags = HPET_DATA_PLATFORM;
+ hpet_reserve_timer(&hd, 0);
+#ifdef CONFIG_HPET_EMULATE_RTC
+ hpet_reserve_timer(&hd, 1);
+#endif
+ hd.hd_irq[0] = HPET_LEGACY_8254;
+ hd.hd_irq[1] = HPET_LEGACY_RTC;
+ if (ntimer > 2) {
+ struct hpet *hpet;
+ struct hpet_timer *timer;
+ int i;
+
+ hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
+ timer = &hpet->hpet_timers[2];
+ for (i = 2; i < ntimer; timer++, i++)
+ hd.hd_irq[i] = (timer->hpet_config &
+ Tn_INT_ROUTE_CNF_MASK) >>
+ Tn_INT_ROUTE_CNF_SHIFT;
+
+ }
+
+ hpet_alloc(&hd);
+ return 0;
+}
+fs_initcall(late_hpet_init);
+#endif
+
+int hpet_timer_stop_set_go(unsigned long tick)
+{
+ unsigned int cfg;
+
+/*
+ * Stop the timers and reset the main counter.
+ */
+
+ cfg = hpet_readl(HPET_CFG);
+ cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
+ hpet_writel(cfg, HPET_CFG);
+ hpet_writel(0, HPET_COUNTER);
+ hpet_writel(0, HPET_COUNTER + 4);
+
+/*
+ * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
+ * and period also hpet_tick.
+ */
+ if (hpet_use_timer) {
+ hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+ HPET_TN_32BIT, HPET_T0_CFG);
+ hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */
+ hpet_writel(hpet_tick, HPET_T0_CMP); /* period */
+ cfg |= HPET_CFG_LEGACY;
+ }
+/*
+ * Go!
+ */
+
+ cfg |= HPET_CFG_ENABLE;
+ hpet_writel(cfg, HPET_CFG);
+
+ return 0;
+}
+
+int hpet_arch_init(void)
+{
+ unsigned int id;
+
+ if (!hpet_address)
+ return -1;
+ set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
+ __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
+
+/*
+ * Read the period, compute tick and quotient.
+ */
+
+ id = hpet_readl(HPET_ID);
+
+ if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
+ return -1;
+
+ hpet_period = hpet_readl(HPET_PERIOD);
+ if (hpet_period < 100000 || hpet_period > 100000000)
+ return -1;
+
+ hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;
+
+ hpet_use_timer = (id & HPET_ID_LEGSUP);
+
+ return hpet_timer_stop_set_go(hpet_tick);
+}
+
+int hpet_reenable(void)
+{
+ return hpet_timer_stop_set_go(hpet_tick);
+}
+
+/*
+ * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing
+ * it to the HPET timer of known frequency.
+ */
+
+#define TICK_COUNT 100000000
+#define TICK_MIN 5000
+
+/*
+ * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none
+ * occurs between the reads of the hpet & TSC.
+ */
+static void __init read_hpet_tsc(int *hpet, int *tsc)
+{
+ int tsc1, tsc2, hpet1;
+
+ do {
+ tsc1 = get_cycles_sync();
+ hpet1 = hpet_readl(HPET_COUNTER);
+ tsc2 = get_cycles_sync();
+ } while (tsc2 - tsc1 > TICK_MIN);
+ *hpet = hpet1;
+ *tsc = tsc2;
+}
+
+unsigned int __init hpet_calibrate_tsc(void)
+{
+ int tsc_start, hpet_start;
+ int tsc_now, hpet_now;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ read_hpet_tsc(&hpet_start, &tsc_start);
+
+ do {
+ local_irq_disable();
+ read_hpet_tsc(&hpet_now, &tsc_now);
+ local_irq_restore(flags);
+ } while ((tsc_now - tsc_start) < TICK_COUNT &&
+ (hpet_now - hpet_start) < TICK_COUNT);
+
+ return (tsc_now - tsc_start) * 1000000000L
+ / ((hpet_now - hpet_start) * hpet_period / 1000);
+}
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
+ * is enabled, we support RTC interrupt functionality in software.
+ * RTC has 3 kinds of interrupts:
+ * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
+ * is updated
+ * 2) Alarm Interrupt - generate an interrupt at a specific time of day
+ * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
+ * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
+ * (1) and (2) above are implemented using polling at a frequency of
+ * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
+ * overhead. (DEFAULT_RTC_INT_FREQ)
+ * For (3), we use interrupts at 64Hz or user specified periodic
+ * frequency, whichever is higher.
+ */
+#include <linux/rtc.h>
+
+#define DEFAULT_RTC_INT_FREQ 64
+#define RTC_NUM_INTS 1
+
+static unsigned long UIE_on;
+static unsigned long prev_update_sec;
+
+static unsigned long AIE_on;
+static struct rtc_time alarm_time;
+
+static unsigned long PIE_on;
+static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
+static unsigned long PIE_count;
+
+static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
+static unsigned int hpet_t1_cmp; /* cached comparator register */
+
+int is_hpet_enabled(void)
+{
+ return hpet_address != 0;
+}
+
+/*
+ * Timer 1 for RTC, we do not use periodic interrupt feature,
+ * even if HPET supports periodic interrupts on Timer 1.
+ * The reason being, to set up a periodic interrupt in HPET, we need to
+ * stop the main counter. And if we do that everytime someone diables/enables
+ * RTC, we will have adverse effect on main kernel timer running on Timer 0.
+ * So, for the time being, simulate the periodic interrupt in software.
+ *
+ * hpet_rtc_timer_init() is called for the first time and during subsequent
+ * interuppts reinit happens through hpet_rtc_timer_reinit().
+ */
+int hpet_rtc_timer_init(void)
+{
+ unsigned int cfg, cnt;
+ unsigned long flags;
+
+ if (!is_hpet_enabled())
+ return 0;
+ /*
+ * Set the counter 1 and enable the interrupts.
+ */
+ if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
+ hpet_rtc_int_freq = PIE_freq;
+ else
+ hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
+
+ local_irq_save(flags);
+
+ cnt = hpet_readl(HPET_COUNTER);
+ cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
+ hpet_writel(cnt, HPET_T1_CMP);
+ hpet_t1_cmp = cnt;
+
+ cfg = hpet_readl(HPET_T1_CFG);
+ cfg &= ~HPET_TN_PERIODIC;
+ cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_T1_CFG);
+
+ local_irq_restore(flags);
+
+ return 1;
+}
+
+static void hpet_rtc_timer_reinit(void)
+{
+ unsigned int cfg, cnt, ticks_per_int, lost_ints;
+
+ if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
+ cfg = hpet_readl(HPET_T1_CFG);
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_T1_CFG);
+ return;
+ }
+
+ if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
+ hpet_rtc_int_freq = PIE_freq;
+ else
+ hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
+
+ /* It is more accurate to use the comparator value than current count.*/
+ ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
+ hpet_t1_cmp += ticks_per_int;
+ hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+ /*
+ * If the interrupt handler was delayed too long, the write above tries
+ * to schedule the next interrupt in the past and the hardware would
+ * not interrupt until the counter had wrapped around.
+ * So we have to check that the comparator wasn't set to a past time.
+ */
+ cnt = hpet_readl(HPET_COUNTER);
+ if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
+ lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
+ /* Make sure that, even with the time needed to execute
+ * this code, the next scheduled interrupt has been moved
+ * back to the future: */
+ lost_ints++;
+
+ hpet_t1_cmp += lost_ints * ticks_per_int;
+ hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+ if (PIE_on)
+ PIE_count += lost_ints;
+
+ if (printk_ratelimit())
+ printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
+ hpet_rtc_int_freq);
+ }
+}
+
+/*
+ * The functions below are called from rtc driver.
+ * Return 0 if HPET is not being used.
+ * Otherwise do the necessary changes and return 1.
+ */
+int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
+{
+ if (!is_hpet_enabled())
+ return 0;
+
+ if (bit_mask & RTC_UIE)
+ UIE_on = 0;
+ if (bit_mask & RTC_PIE)
+ PIE_on = 0;
+ if (bit_mask & RTC_AIE)
+ AIE_on = 0;
+
+ return 1;
+}
+
+int hpet_set_rtc_irq_bit(unsigned long bit_mask)
+{
+ int timer_init_reqd = 0;
+
+ if (!is_hpet_enabled())
+ return 0;
+
+ if (!(PIE_on | AIE_on | UIE_on))
+ timer_init_reqd = 1;
+
+ if (bit_mask & RTC_UIE) {
+ UIE_on = 1;
+ }
+ if (bit_mask & RTC_PIE) {
+ PIE_on = 1;
+ PIE_count = 0;
+ }
+ if (bit_mask & RTC_AIE) {
+ AIE_on = 1;
+ }
+
+ if (timer_init_reqd)
+ hpet_rtc_timer_init();
+
+ return 1;
+}
+
+int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
+{
+ if (!is_hpet_enabled())
+ return 0;
+
+ alarm_time.tm_hour = hrs;
+ alarm_time.tm_min = min;
+ alarm_time.tm_sec = sec;
+
+ return 1;
+}
+
+int hpet_set_periodic_freq(unsigned long freq)
+{
+ if (!is_hpet_enabled())
+ return 0;
+
+ PIE_freq = freq;
+ PIE_count = 0;
+
+ return 1;
+}
+
+int hpet_rtc_dropped_irq(void)
+{
+ if (!is_hpet_enabled())
+ return 0;
+
+ return 1;
+}
+
+irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+ struct rtc_time curr_time;
+ unsigned long rtc_int_flag = 0;
+ int call_rtc_interrupt = 0;
+
+ hpet_rtc_timer_reinit();
+
+ if (UIE_on | AIE_on) {
+ rtc_get_rtc_time(&curr_time);
+ }
+ if (UIE_on) {
+ if (curr_time.tm_sec != prev_update_sec) {
+ /* Set update int info, call real rtc int routine */
+ call_rtc_interrupt = 1;
+ rtc_int_flag = RTC_UF;
+ prev_update_sec = curr_time.tm_sec;
+ }
+ }
+ if (PIE_on) {
+ PIE_count++;
+ if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
+ /* Set periodic int info, call real rtc int routine */
+ call_rtc_interrupt = 1;
+ rtc_int_flag |= RTC_PF;
+ PIE_count = 0;
+ }
+ }
+ if (AIE_on) {
+ if ((curr_time.tm_sec == alarm_time.tm_sec) &&
+ (curr_time.tm_min == alarm_time.tm_min) &&
+ (curr_time.tm_hour == alarm_time.tm_hour)) {
+ /* Set alarm int info, call real rtc int routine */
+ call_rtc_interrupt = 1;
+ rtc_int_flag |= RTC_AF;
+ }
+ }
+ if (call_rtc_interrupt) {
+ rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
+ rtc_interrupt(rtc_int_flag, dev_id);
+ }
+ return IRQ_HANDLED;
+}
+#endif
+
+static int __init nohpet_setup(char *s)
+{
+ nohpet = 1;
+ return 1;
+}
+
+__setup("nohpet", nohpet_setup);
+
+#define HPET_MASK 0xFFFFFFFF
+#define HPET_SHIFT 22
+
+/* FSEC = 10^-15 NSEC = 10^-9 */
+#define FSEC_PER_NSEC 1000000
+
+static void *hpet_ptr;
+
+static cycle_t read_hpet(void)
+{
+ return (cycle_t)readl(hpet_ptr);
+}
+
+static cycle_t __vsyscall_fn vread_hpet(void)
+{
+ return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
+}
+
+struct clocksource clocksource_hpet = {
+ .name = "hpet",
+ .rating = 250,
+ .read = read_hpet,
+ .mask = (cycle_t)HPET_MASK,
+ .mult = 0, /* set below */
+ .shift = HPET_SHIFT,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .vread = vread_hpet,
+};
+
+static int __init init_hpet_clocksource(void)
+{
+ unsigned long hpet_period;
+ void __iomem *hpet_base;
+ u64 tmp;
+
+ if (!hpet_address)
+ return -ENODEV;
+
+ /* calculate the hpet address: */
+ hpet_base = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+ hpet_ptr = hpet_base + HPET_COUNTER;
+
+ /* calculate the frequency: */
+ hpet_period = readl(hpet_base + HPET_PERIOD);
+
+ /*
+ * hpet period is in femto seconds per cycle
+ * so we need to convert this to ns/cyc units
+ * aproximated by mult/2^shift
+ *
+ * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
+ * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
+ * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
+ * (fsec/cyc << shift)/1000000 = mult
+ * (hpet_period << shift)/FSEC_PER_NSEC = mult
+ */
+ tmp = (u64)hpet_period << HPET_SHIFT;
+ do_div(tmp, FSEC_PER_NSEC);
+ clocksource_hpet.mult = (u32)tmp;
+
+ return clocksource_register(&clocksource_hpet);
+}
+
+module_init(init_hpet_clocksource);
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index d73c79e821f1..01e2cf0bdeb1 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -103,6 +103,7 @@ static void mask_and_ack_8259A(unsigned int);
static struct irq_chip i8259A_chip = {
.name = "XT-PIC",
.mask = disable_8259A_irq,
+ .disable = disable_8259A_irq,
.unmask = enable_8259A_irq,
.mask_ack = mask_and_ack_8259A,
};
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 6be6730acb5c..950682f35766 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -810,11 +810,9 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
trigger == IOAPIC_LEVEL)
set_irq_chip_and_handler_name(irq, &ioapic_chip,
handle_fasteoi_irq, "fasteoi");
- else {
- irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
+ else
set_irq_chip_and_handler_name(irq, &ioapic_chip,
handle_edge_irq, "edge");
- }
}
static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
{
@@ -831,7 +829,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
entry.mask = 0; /* enable IRQ */
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+ entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.trigger = irq_trigger(idx);
entry.polarity = irq_polarity(idx);
@@ -839,7 +837,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
if (irq_trigger(idx)) {
entry.trigger = 1;
entry.mask = 1;
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+ entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
}
if (!apic && !IO_APIC_IRQ(irq))
@@ -851,7 +849,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
if (vector < 0)
return;
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
+ entry.dest = cpu_mask_to_apicid(mask);
entry.vector = vector;
ioapic_register_intr(irq, vector, IOAPIC_AUTO);
@@ -920,7 +918,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
*/
entry.dest_mode = INT_DEST_MODE;
entry.mask = 0; /* unmask IRQ now */
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+ entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.delivery_mode = INT_DELIVERY_MODE;
entry.polarity = 0;
entry.trigger = 0;
@@ -1020,18 +1018,17 @@ void __apicdebuginit print_IO_APIC(void)
printk(KERN_DEBUG ".... IRQ redirection table:\n");
- printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
- " Stat Dest Deli Vect: \n");
+ printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
+ " Stat Dmod Deli Vect: \n");
for (i = 0; i <= reg_01.bits.entries; i++) {
struct IO_APIC_route_entry entry;
entry = ioapic_read_entry(apic, i);
- printk(KERN_DEBUG " %02x %03X %02X ",
+ printk(KERN_DEBUG " %02x %03X ",
i,
- entry.dest.logical.logical_dest,
- entry.dest.physical.physical_dest
+ entry.dest
);
printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
@@ -1293,8 +1290,7 @@ void disable_IO_APIC(void)
entry.dest_mode = 0; /* Physical */
entry.delivery_mode = dest_ExtINT; /* ExtInt */
entry.vector = 0;
- entry.dest.physical.physical_dest =
- GET_APIC_ID(apic_read(APIC_ID));
+ entry.dest = GET_APIC_ID(apic_read(APIC_ID));
/*
* Add it to the IO-APIC irq-routing table:
@@ -1556,7 +1552,7 @@ static inline void unlock_ExtINT_logic(void)
entry1.dest_mode = 0; /* physical delivery */
entry1.mask = 0; /* unmask IRQ now */
- entry1.dest.physical.physical_dest = hard_smp_processor_id();
+ entry1.dest = hard_smp_processor_id();
entry1.delivery_mode = dest_ExtINT;
entry1.polarity = entry0.polarity;
entry1.trigger = 0;
@@ -2131,7 +2127,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
+ entry.dest = cpu_mask_to_apicid(mask);
entry.trigger = triggering;
entry.polarity = polarity;
entry.mask = 1; /* Disabled (masked) */
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
index fe063d3cfe42..745b1f0f494e 100644
--- a/arch/x86_64/kernel/ioport.c
+++ b/arch/x86_64/kernel/ioport.c
@@ -114,6 +114,6 @@ asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
}
- regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12);
+ regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12);
return 0;
}
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 0c06af6c13bc..3bc30d2c13d3 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -18,6 +18,7 @@
#include <asm/uaccess.h>
#include <asm/io_apic.h>
#include <asm/idle.h>
+#include <asm/smp.h>
atomic_t irq_err_count;
@@ -120,9 +121,14 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
if (likely(irq < NR_IRQS))
generic_handle_irq(irq);
- else if (printk_ratelimit())
- printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
- __func__, smp_processor_id(), vector);
+ else {
+ if (!disable_apic)
+ ack_APIC_irq();
+
+ if (printk_ratelimit())
+ printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
+ __func__, smp_processor_id(), vector);
+ }
irq_exit();
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index bdb54a2c9f18..8011a8e1c7d4 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -19,6 +19,7 @@
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
+#include <linux/kmod.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -42,6 +43,10 @@ static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog = 1;
+static atomic_t mce_events;
+
+static char trigger[128];
+static char *trigger_argv[2] = { trigger, NULL };
/*
* Lockless MCE logging infrastructure.
@@ -57,6 +62,7 @@ struct mce_log mcelog = {
void mce_log(struct mce *mce)
{
unsigned next, entry;
+ atomic_inc(&mce_events);
mce->finished = 0;
wmb();
for (;;) {
@@ -161,6 +167,17 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
}
}
+static void do_mce_trigger(void)
+{
+ static atomic_t mce_logged;
+ int events = atomic_read(&mce_events);
+ if (events != atomic_read(&mce_logged) && trigger[0]) {
+ /* Small race window, but should be harmless. */
+ atomic_set(&mce_logged, events);
+ call_usermodehelper(trigger, trigger_argv, NULL, -1);
+ }
+}
+
/*
* The actual machine check handler
*/
@@ -234,8 +251,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
}
/* Never do anything final in the polling timer */
- if (!regs)
+ if (!regs) {
+ /* Normal interrupt context here. Call trigger for any new
+ events. */
+ do_mce_trigger();
goto out;
+ }
/* If we didn't find an uncorrectable error, pick
the last one (shouldn't happen, just being safe). */
@@ -606,17 +627,42 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
+/* TBD should generate these dynamically based on number of available banks */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(bank5ctl,bank[5],mce_restart())
-static struct sysdev_attribute * bank_attributes[NR_BANKS] = {
- &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
- &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl};
+
+static ssize_t show_trigger(struct sys_device *s, char *buf)
+{
+ strcpy(buf, trigger);
+ strcat(buf, "\n");
+ return strlen(trigger) + 1;
+}
+
+static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
+{
+ char *p;
+ int len;
+ strncpy(trigger, buf, sizeof(trigger));
+ trigger[sizeof(trigger)-1] = 0;
+ len = strlen(trigger);
+ p = strchr(trigger, '\n');
+ if (*p) *p = 0;
+ return len;
+}
+
+static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
+static struct sysdev_attribute *mce_attributes[] = {
+ &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
+ &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
+ &attr_tolerant, &attr_check_interval, &attr_trigger,
+ NULL
+};
/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
@@ -632,11 +678,9 @@ static __cpuinit int mce_create_device(unsigned int cpu)
err = sysdev_register(&per_cpu(device_mce,cpu));
if (!err) {
- for (i = 0; i < banks; i++)
+ for (i = 0; mce_attributes[i]; i++)
sysdev_create_file(&per_cpu(device_mce,cpu),
- bank_attributes[i]);
- sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
- sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+ mce_attributes[i]);
}
return err;
}
@@ -645,11 +689,9 @@ static void mce_remove_device(unsigned int cpu)
{
int i;
- for (i = 0; i < banks; i++)
+ for (i = 0; mce_attributes[i]; i++)
sysdev_remove_file(&per_cpu(device_mce,cpu),
- bank_attributes[i]);
- sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
- sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+ mce_attributes[i]);
sysdev_unregister(&per_cpu(device_mce,cpu));
memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
}
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index 93c707257637..d0bd5d66e103 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -37,6 +37,8 @@
#define THRESHOLD_MAX 0xFFF
#define INT_TYPE_APIC 0x00020000
#define MASK_VALID_HI 0x80000000
+#define MASK_CNTP_HI 0x40000000
+#define MASK_LOCKED_HI 0x20000000
#define MASK_LVTOFF_HI 0x00F00000
#define MASK_COUNT_EN_HI 0x00080000
#define MASK_INT_TYPE_HI 0x00060000
@@ -122,14 +124,17 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0)
address = MSR_IA32_MC0_MISC + bank * 4;
- else if (block == 1)
- address = MCG_XBLK_ADDR
- + ((low & MASK_BLKPTR_LO) >> 21);
+ else if (block == 1) {
+ address = (low & MASK_BLKPTR_LO) >> 21;
+ if (!address)
+ break;
+ address += MCG_XBLK_ADDR;
+ }
else
++address;
if (rdmsr_safe(address, &low, &high))
- continue;
+ break;
if (!(high & MASK_VALID_HI)) {
if (block)
@@ -138,8 +143,8 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
break;
}
- if (!(high & MASK_VALID_HI >> 1) ||
- (high & MASK_VALID_HI >> 2))
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
continue;
if (!block)
@@ -187,17 +192,22 @@ asmlinkage void mce_threshold_interrupt(void)
/* assume first bank caused it */
for (bank = 0; bank < NR_BANKS; ++bank) {
+ if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
+ continue;
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0)
address = MSR_IA32_MC0_MISC + bank * 4;
- else if (block == 1)
- address = MCG_XBLK_ADDR
- + ((low & MASK_BLKPTR_LO) >> 21);
+ else if (block == 1) {
+ address = (low & MASK_BLKPTR_LO) >> 21;
+ if (!address)
+ break;
+ address += MCG_XBLK_ADDR;
+ }
else
++address;
if (rdmsr_safe(address, &low, &high))
- continue;
+ break;
if (!(high & MASK_VALID_HI)) {
if (block)
@@ -206,10 +216,14 @@ asmlinkage void mce_threshold_interrupt(void)
break;
}
- if (!(high & MASK_VALID_HI >> 1) ||
- (high & MASK_VALID_HI >> 2))
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
continue;
+ /* Log the machine check that caused the threshold
+ event. */
+ do_machine_check(NULL, 0);
+
if (high & MASK_OVERFLOW_HI) {
rdmsrl(address, m.misc);
rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
@@ -385,7 +399,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
return 0;
if (rdmsr_safe(address, &low, &high))
- goto recurse;
+ return 0;
if (!(high & MASK_VALID_HI)) {
if (block)
@@ -394,8 +408,8 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
return 0;
}
- if (!(high & MASK_VALID_HI >> 1) ||
- (high & MASK_VALID_HI >> 2))
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
goto recurse;
b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 9cb42ecb7f89..486f4c61a948 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -172,7 +172,7 @@ static __cpuinit inline int nmi_known_cpu(void)
{
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
- return boot_cpu_data.x86 == 15;
+ return boot_cpu_data.x86 == 15 || boot_cpu_data.x86 == 16;
case X86_VENDOR_INTEL:
if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
return 1;
@@ -214,6 +214,23 @@ static __init void nmi_cpu_busy(void *data)
}
#endif
+static unsigned int adjust_for_32bit_ctr(unsigned int hz)
+{
+ unsigned int retval = hz;
+
+ /*
+ * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
+ * are writable, with higher bits sign extending from bit 31.
+ * So, we can only program the counter with 31 bit values and
+ * 32nd bit should be 1, for 33.. to be 1.
+ * Find the appropriate nmi_hz
+ */
+ if ((((u64)cpu_khz * 1000) / retval) > 0x7fffffffULL) {
+ retval = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
+ }
+ return retval;
+}
+
int __init check_nmi_watchdog (void)
{
int *counts;
@@ -268,17 +285,8 @@ int __init check_nmi_watchdog (void)
struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
nmi_hz = 1;
- /*
- * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
- * are writable, with higher bits sign extending from bit 31.
- * So, we can only program the counter with 31 bit values and
- * 32nd bit should be 1, for 33.. to be 1.
- * Find the appropriate nmi_hz
- */
- if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
- ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
- nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
- }
+ if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0)
+ nmi_hz = adjust_for_32bit_ctr(nmi_hz);
}
kfree(counts);
@@ -360,6 +368,33 @@ void enable_timer_nmi_watchdog(void)
}
}
+static void __acpi_nmi_disable(void *__unused)
+{
+ apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
+/*
+ * Disable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_disable(void)
+{
+ if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+ on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
+}
+
+static void __acpi_nmi_enable(void *__unused)
+{
+ apic_write(APIC_LVT0, APIC_DM_NMI);
+}
+
+/*
+ * Enable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_enable(void)
+{
+ if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+ on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
+}
#ifdef CONFIG_PM
static int nmi_pm_active; /* nmi_active before suspend */
@@ -634,7 +669,9 @@ static int setup_intel_arch_watchdog(void)
/* setup the timer */
wrmsr(evntsel_msr, evntsel, 0);
- wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
+
+ nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+ wrmsr(perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
@@ -855,15 +892,23 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
dummy &= ~P4_CCCR_OVF;
wrmsrl(wd->cccr_msr, dummy);
apic_write(APIC_LVTPC, APIC_DM_NMI);
+ /* start the cycle over again */
+ wrmsrl(wd->perfctr_msr,
+ -((u64)cpu_khz * 1000 / nmi_hz));
} else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
/*
* ArchPerfom/Core Duo needs to re-unmask
* the apic vector
*/
apic_write(APIC_LVTPC, APIC_DM_NMI);
+ /* ARCH_PERFMON has 32 bit counter writes */
+ wrmsr(wd->perfctr_msr,
+ (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
+ } else {
+ /* start the cycle over again */
+ wrmsrl(wd->perfctr_msr,
+ -((u64)cpu_khz * 1000 / nmi_hz));
}
- /* start the cycle over again */
- wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
rc = 1;
} else if (nmi_watchdog == NMI_IO_APIC) {
/* don't know how to accurately check for this.
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 3d65b1d4c2b3..04480c3b68f5 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -138,6 +138,8 @@ static const unsigned long phb_debug_offsets[] = {
#define PHB_DEBUG_STUFF_OFFSET 0x0020
+#define EMERGENCY_PAGES 32 /* = 128KB */
+
unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
static int translate_empty_slots __read_mostly = 0;
static int calgary_detected __read_mostly = 0;
@@ -296,6 +298,16 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
{
unsigned long entry;
unsigned long badbit;
+ unsigned long badend;
+
+ /* were we called with bad_dma_address? */
+ badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
+ if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
+ printk(KERN_ERR "Calgary: driver tried unmapping bad DMA "
+ "address 0x%Lx\n", dma_addr);
+ WARN_ON(1);
+ return;
+ }
entry = dma_addr >> PAGE_SHIFT;
@@ -656,8 +668,8 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
u64 start;
struct iommu_table *tbl = dev->sysdata;
- /* reserve bad_dma_address in case it's a legal address */
- iommu_range_reserve(tbl, bad_dma_address, 1);
+ /* reserve EMERGENCY_PAGES from bad_dma_address and up */
+ iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
/* avoid the BIOS/VGA first 640KB-1MB region */
start = (640 * 1024);
@@ -1176,6 +1188,7 @@ int __init calgary_iommu_init(void)
}
force_iommu = 1;
+ bad_dma_address = 0x0;
dma_ops = &calgary_dma_ops;
return 0;
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 683b7a5c1ab3..651ccfb06697 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -223,30 +223,10 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_mask);
-/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
- [,forcesac][,fullflush][,nomerge][,biomerge]
- size set size of iommu (in bytes)
- noagp don't initialize the AGP driver and use full aperture.
- off don't use the IOMMU
- leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
- memaper[=order] allocate an own aperture over RAM with size 32MB^order.
- noforce don't force IOMMU usage. Default.
- force Force IOMMU.
- merge Do lazy merging. This may improve performance on some block devices.
- Implies force (experimental)
- biomerge Do merging at the BIO layer. This is more efficient than merge,
- but should be only done with very big IOMMUs. Implies merge,force.
- nomerge Don't do SG merging.
- forcesac For SAC mode for masks <40bits (experimental)
- fullflush Flush IOMMU on each allocation (default)
- nofullflush Don't use IOMMU fullflush
- allowed overwrite iommu off workarounds for specific chipsets.
- soft Use software bounce buffering (default for Intel machines)
- noaperture Don't touch the aperture for AGP.
- allowdac Allow DMA >4GB
- nodac Forbid DMA >4GB
- panic Force panic when IOMMU overflows
-*/
+/*
+ * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
+ * documentation.
+ */
__init int iommu_setup(char *p)
{
iommu_merge = 1;
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index fc1960f1f243..030eb3753358 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -185,7 +185,7 @@ static void iommu_full(struct device *dev, size_t size, int dir)
static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
{
u64 mask = *dev->dma_mask;
- int high = addr + size >= mask;
+ int high = addr + size > mask;
int mmu = high;
if (force_iommu)
mmu = 1;
@@ -195,7 +195,7 @@ static inline int need_iommu(struct device *dev, unsigned long addr, size_t size
static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
{
u64 mask = *dev->dma_mask;
- int high = addr + size >= mask;
+ int high = addr + size > mask;
int mmu = high;
return mmu;
}
diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c
index 7554458dc9cb..ae8f91214f15 100644
--- a/arch/x86_64/kernel/pmtimer.c
+++ b/arch/x86_64/kernel/pmtimer.c
@@ -24,15 +24,6 @@
#include <asm/msr.h>
#include <asm/vsyscall.h>
-/* The I/O port the PMTMR resides at.
- * The location is detected during setup_arch(),
- * in arch/i386/kernel/acpi/boot.c */
-u32 pmtmr_ioport __read_mostly;
-
-/* value of the Power timer at last timer interrupt */
-static u32 offset_delay;
-static u32 last_pmtmr_tick;
-
#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
static inline u32 cyc2us(u32 cycles)
@@ -48,38 +39,6 @@ static inline u32 cyc2us(u32 cycles)
return (cycles >> 10);
}
-int pmtimer_mark_offset(void)
-{
- static int first_run = 1;
- unsigned long tsc;
- u32 lost;
-
- u32 tick = inl(pmtmr_ioport);
- u32 delta;
-
- delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK);
-
- last_pmtmr_tick = tick;
- monotonic_base += delta * NSEC_PER_USEC;
-
- delta += offset_delay;
-
- lost = delta / (USEC_PER_SEC / HZ);
- offset_delay = delta % (USEC_PER_SEC / HZ);
-
- rdtscll(tsc);
- vxtime.last_tsc = tsc - offset_delay * (u64)cpu_khz / 1000;
-
- /* don't calculate delay for first run,
- or if we've got less then a tick */
- if (first_run || (lost < 1)) {
- first_run = 0;
- offset_delay = 0;
- }
-
- return lost - 1;
-}
-
static unsigned pmtimer_wait_tick(void)
{
u32 a, b;
@@ -101,23 +60,6 @@ void pmtimer_wait(unsigned us)
} while (cyc2us(b - a) < us);
}
-void pmtimer_resume(void)
-{
- last_pmtmr_tick = inl(pmtmr_ioport);
-}
-
-unsigned int do_gettimeoffset_pm(void)
-{
- u32 now, offset, delta = 0;
-
- offset = last_pmtmr_tick;
- now = inl(pmtmr_ioport);
- delta = (now - offset) & ACPI_PM_MASK;
-
- return offset_delay + cyc2us(delta);
-}
-
-
static int __init nopmtimer_setup(char *s)
{
pmtmr_ioport = 0;
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index addc14af0c56..4326a690a509 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -536,8 +536,12 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
}
ret = 0;
for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
- ret |= __get_user(tmp, (unsigned long __user *) data);
- putreg(child, ui, tmp);
+ ret = __get_user(tmp, (unsigned long __user *) data);
+ if (ret)
+ break;
+ ret = putreg(child, ui, tmp);
+ if (ret)
+ break;
data += sizeof(long);
}
break;
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 60477244d1a3..3d98b696881d 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -138,128 +138,6 @@ struct resource code_resource = {
.flags = IORESOURCE_RAM,
};
-#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
-
-static struct resource system_rom_resource = {
- .name = "System ROM",
- .start = 0xf0000,
- .end = 0xfffff,
- .flags = IORESOURCE_ROM,
-};
-
-static struct resource extension_rom_resource = {
- .name = "Extension ROM",
- .start = 0xe0000,
- .end = 0xeffff,
- .flags = IORESOURCE_ROM,
-};
-
-static struct resource adapter_rom_resources[] = {
- { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
- .flags = IORESOURCE_ROM },
- { .name = "Adapter ROM", .start = 0, .end = 0,
- .flags = IORESOURCE_ROM },
- { .name = "Adapter ROM", .start = 0, .end = 0,
- .flags = IORESOURCE_ROM },
- { .name = "Adapter ROM", .start = 0, .end = 0,
- .flags = IORESOURCE_ROM },
- { .name = "Adapter ROM", .start = 0, .end = 0,
- .flags = IORESOURCE_ROM },
- { .name = "Adapter ROM", .start = 0, .end = 0,
- .flags = IORESOURCE_ROM }
-};
-
-static struct resource video_rom_resource = {
- .name = "Video ROM",
- .start = 0xc0000,
- .end = 0xc7fff,
- .flags = IORESOURCE_ROM,
-};
-
-static struct resource video_ram_resource = {
- .name = "Video RAM area",
- .start = 0xa0000,
- .end = 0xbffff,
- .flags = IORESOURCE_RAM,
-};
-
-#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
-
-static int __init romchecksum(unsigned char *rom, unsigned long length)
-{
- unsigned char *p, sum = 0;
-
- for (p = rom; p < rom + length; p++)
- sum += *p;
- return sum == 0;
-}
-
-static void __init probe_roms(void)
-{
- unsigned long start, length, upper;
- unsigned char *rom;
- int i;
-
- /* video rom */
- upper = adapter_rom_resources[0].start;
- for (start = video_rom_resource.start; start < upper; start += 2048) {
- rom = isa_bus_to_virt(start);
- if (!romsignature(rom))
- continue;
-
- video_rom_resource.start = start;
-
- /* 0 < length <= 0x7f * 512, historically */
- length = rom[2] * 512;
-
- /* if checksum okay, trust length byte */
- if (length && romchecksum(rom, length))
- video_rom_resource.end = start + length - 1;
-
- request_resource(&iomem_resource, &video_rom_resource);
- break;
- }
-
- start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
- if (start < upper)
- start = upper;
-
- /* system rom */
- request_resource(&iomem_resource, &system_rom_resource);
- upper = system_rom_resource.start;
-
- /* check for extension rom (ignore length byte!) */
- rom = isa_bus_to_virt(extension_rom_resource.start);
- if (romsignature(rom)) {
- length = extension_rom_resource.end - extension_rom_resource.start + 1;
- if (romchecksum(rom, length)) {
- request_resource(&iomem_resource, &extension_rom_resource);
- upper = extension_rom_resource.start;
- }
- }
-
- /* check for adapter roms on 2k boundaries */
- for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper;
- start += 2048) {
- rom = isa_bus_to_virt(start);
- if (!romsignature(rom))
- continue;
-
- /* 0 < length <= 0x7f * 512, historically */
- length = rom[2] * 512;
-
- /* but accept any length that fits if checksum okay */
- if (!length || start + length > upper || !romchecksum(rom, length))
- continue;
-
- adapter_rom_resources[i].start = start;
- adapter_rom_resources[i].end = start + length - 1;
- request_resource(&iomem_resource, &adapter_rom_resources[i]);
-
- start = adapter_rom_resources[i++].end & ~2047UL;
- }
-}
-
#ifdef CONFIG_PROC_VMCORE
/* elfcorehdr= specifies the location of elf core header
* stored by the crashed kernel. This option will be passed
@@ -444,6 +322,11 @@ void __init setup_arch(char **cmdline_p)
/* reserve ebda region */
if (ebda_addr)
reserve_bootmem_generic(ebda_addr, ebda_size);
+#ifdef CONFIG_NUMA
+ /* reserve nodemap region */
+ if (nodemap_addr)
+ reserve_bootmem_generic(nodemap_addr, nodemap_size);
+#endif
#ifdef CONFIG_SMP
/*
@@ -519,15 +402,11 @@ void __init setup_arch(char **cmdline_p)
init_apic_mappings();
/*
- * Request address space for all standard RAM and ROM resources
- * and also for regions reported as reserved by the e820.
- */
- probe_roms();
+ * We trust e820 completely. No explicit ROM probing in memory.
+ */
e820_reserve_resources();
e820_mark_nosave_regions();
- request_resource(&iomem_resource, &video_ram_resource);
-
{
unsigned i;
/* request I/O space for devices used on all i[345]86 PCs */
@@ -1063,7 +942,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
- NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow",
+ NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
+ "3dnowext", "3dnow",
/* Transmeta-defined */
"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
@@ -1081,7 +961,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
/* Intel-defined (#2) */
"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
- NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* VIA/Cyrix/Centaur-defined */
@@ -1091,8 +971,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* AMD-defined (#2) */
- "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy",
+ "altmovcr8", "abm", "sse4a",
+ "misalignsse", "3dnowprefetch",
+ "osvw", "ibs", NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
};
@@ -1103,6 +985,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
"ttp", /* thermal trip */
"tm",
"stc",
+ "100mhzsteps",
+ "hwpstate",
+ NULL, /* tsc invariant mapped to constant_tsc */
NULL,
/* nothing */ /* constant_tsc - moved to flags */
};
@@ -1219,23 +1104,3 @@ struct seq_operations cpuinfo_op = {
.stop = c_stop,
.show = show_cpuinfo,
};
-
-#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE)
-#include <linux/platform_device.h>
-static __init int add_pcspkr(void)
-{
- struct platform_device *pd;
- int ret;
-
- pd = platform_device_alloc("pcspkr", -1);
- if (!pd)
- return -ENOMEM;
-
- ret = platform_device_add(pd);
- if (ret)
- platform_device_put(pd);
-
- return ret;
-}
-device_initcall(add_pcspkr);
-#endif
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 8c4b80fe71a1..6a70b55f719d 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -37,7 +37,6 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
unsigned long __supported_pte_mask __read_mostly = ~0UL;
-EXPORT_SYMBOL(__supported_pte_mask);
static int do_not_nx __cpuinitdata = 0;
/* noexec=on|off
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index daf19332f0dd..35443729aad8 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -148,217 +148,6 @@ static void __cpuinit smp_store_cpu_info(int id)
print_cpu_info(c);
}
-/*
- * New Funky TSC sync algorithm borrowed from IA64.
- * Main advantage is that it doesn't reset the TSCs fully and
- * in general looks more robust and it works better than my earlier
- * attempts. I believe it was written by David Mosberger. Some minor
- * adjustments for x86-64 by me -AK
- *
- * Original comment reproduced below.
- *
- * Synchronize TSC of the current (slave) CPU with the TSC of the
- * MASTER CPU (normally the time-keeper CPU). We use a closed loop to
- * eliminate the possibility of unaccounted-for errors (such as
- * getting a machine check in the middle of a calibration step). The
- * basic idea is for the slave to ask the master what itc value it has
- * and to read its own itc before and after the master responds. Each
- * iteration gives us three timestamps:
- *
- * slave master
- *
- * t0 ---\
- * ---\
- * --->
- * tm
- * /---
- * /---
- * t1 <---
- *
- *
- * The goal is to adjust the slave's TSC such that tm falls exactly
- * half-way between t0 and t1. If we achieve this, the clocks are
- * synchronized provided the interconnect between the slave and the
- * master is symmetric. Even if the interconnect were asymmetric, we
- * would still know that the synchronization error is smaller than the
- * roundtrip latency (t0 - t1).
- *
- * When the interconnect is quiet and symmetric, this lets us
- * synchronize the TSC to within one or two cycles. However, we can
- * only *guarantee* that the synchronization is accurate to within a
- * round-trip time, which is typically in the range of several hundred
- * cycles (e.g., ~500 cycles). In practice, this means that the TSCs
- * are usually almost perfectly synchronized, but we shouldn't assume
- * that the accuracy is much better than half a micro second or so.
- *
- * [there are other errors like the latency of RDTSC and of the
- * WRMSR. These can also account to hundreds of cycles. So it's
- * probably worse. It claims 153 cycles error on a dual Opteron,
- * but I suspect the numbers are actually somewhat worse -AK]
- */
-
-#define MASTER 0
-#define SLAVE (SMP_CACHE_BYTES/8)
-
-/* Intentionally don't use cpu_relax() while TSC synchronization
- because we don't want to go into funky power save modi or cause
- hypervisors to schedule us away. Going to sleep would likely affect
- latency and low latency is the primary objective here. -AK */
-#define no_cpu_relax() barrier()
-
-static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
-static volatile __cpuinitdata unsigned long go[SLAVE + 1];
-static int notscsync __cpuinitdata;
-
-#undef DEBUG_TSC_SYNC
-
-#define NUM_ROUNDS 64 /* magic value */
-#define NUM_ITERS 5 /* likewise */
-
-/* Callback on boot CPU */
-static __cpuinit void sync_master(void *arg)
-{
- unsigned long flags, i;
-
- go[MASTER] = 0;
-
- local_irq_save(flags);
- {
- for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
- while (!go[MASTER])
- no_cpu_relax();
- go[MASTER] = 0;
- rdtscll(go[SLAVE]);
- }
- }
- local_irq_restore(flags);
-}
-
-/*
- * Return the number of cycles by which our tsc differs from the tsc
- * on the master (time-keeper) CPU. A positive number indicates our
- * tsc is ahead of the master, negative that it is behind.
- */
-static inline long
-get_delta(long *rt, long *master)
-{
- unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
- unsigned long tcenter, t0, t1, tm;
- int i;
-
- for (i = 0; i < NUM_ITERS; ++i) {
- rdtscll(t0);
- go[MASTER] = 1;
- while (!(tm = go[SLAVE]))
- no_cpu_relax();
- go[SLAVE] = 0;
- rdtscll(t1);
-
- if (t1 - t0 < best_t1 - best_t0)
- best_t0 = t0, best_t1 = t1, best_tm = tm;
- }
-
- *rt = best_t1 - best_t0;
- *master = best_tm - best_t0;
-
- /* average best_t0 and best_t1 without overflow: */
- tcenter = (best_t0/2 + best_t1/2);
- if (best_t0 % 2 + best_t1 % 2 == 2)
- ++tcenter;
- return tcenter - best_tm;
-}
-
-static __cpuinit void sync_tsc(unsigned int master)
-{
- int i, done = 0;
- long delta, adj, adjust_latency = 0;
- unsigned long flags, rt, master_time_stamp, bound;
-#ifdef DEBUG_TSC_SYNC
- static struct syncdebug {
- long rt; /* roundtrip time */
- long master; /* master's timestamp */
- long diff; /* difference between midpoint and master's timestamp */
- long lat; /* estimate of tsc adjustment latency */
- } t[NUM_ROUNDS] __cpuinitdata;
-#endif
-
- printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n",
- smp_processor_id(), master);
-
- go[MASTER] = 1;
-
- /* It is dangerous to broadcast IPI as cpus are coming up,
- * as they may not be ready to accept them. So since
- * we only need to send the ipi to the boot cpu direct
- * the message, and avoid the race.
- */
- smp_call_function_single(master, sync_master, NULL, 1, 0);
-
- while (go[MASTER]) /* wait for master to be ready */
- no_cpu_relax();
-
- spin_lock_irqsave(&tsc_sync_lock, flags);
- {
- for (i = 0; i < NUM_ROUNDS; ++i) {
- delta = get_delta(&rt, &master_time_stamp);
- if (delta == 0) {
- done = 1; /* let's lock on to this... */
- bound = rt;
- }
-
- if (!done) {
- unsigned long t;
- if (i > 0) {
- adjust_latency += -delta;
- adj = -delta + adjust_latency/4;
- } else
- adj = -delta;
-
- rdtscll(t);
- wrmsrl(MSR_IA32_TSC, t + adj);
- }
-#ifdef DEBUG_TSC_SYNC
- t[i].rt = rt;
- t[i].master = master_time_stamp;
- t[i].diff = delta;
- t[i].lat = adjust_latency/4;
-#endif
- }
- }
- spin_unlock_irqrestore(&tsc_sync_lock, flags);
-
-#ifdef DEBUG_TSC_SYNC
- for (i = 0; i < NUM_ROUNDS; ++i)
- printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
- t[i].rt, t[i].master, t[i].diff, t[i].lat);
-#endif
-
- printk(KERN_INFO
- "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
- "maxerr %lu cycles)\n",
- smp_processor_id(), master, delta, rt);
-}
-
-static void __cpuinit tsc_sync_wait(void)
-{
- /*
- * When the CPU has synchronized TSCs assume the BIOS
- * or the hardware already synced. Otherwise we could
- * mess up a possible perfect synchronization with a
- * not-quite-perfect algorithm.
- */
- if (notscsync || !cpu_has_tsc || !unsynchronized_tsc())
- return;
- sync_tsc(0);
-}
-
-static __init int notscsync_setup(char *s)
-{
- notscsync = 1;
- return 1;
-}
-__setup("notscsync", notscsync_setup);
-
static atomic_t init_deasserted __cpuinitdata;
/*
@@ -546,6 +335,11 @@ void __cpuinit start_secondary(void)
/* otherwise gcc will move up the smp_processor_id before the cpu_init */
barrier();
+ /*
+ * Check TSC sync first:
+ */
+ check_tsc_sync_target();
+
Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
setup_secondary_APIC_clock();
@@ -565,14 +359,6 @@ void __cpuinit start_secondary(void)
*/
set_cpu_sibling_map(smp_processor_id());
- /*
- * Wait for TSC sync to not schedule things before.
- * We still process interrupts, which could see an inconsistent
- * time in that window unfortunately.
- * Do this here because TSC sync has global unprotected state.
- */
- tsc_sync_wait();
-
/*
* We need to hold call_lock, so there is no inconsistency
* between the time smp_call_function() determines number of
@@ -592,6 +378,7 @@ void __cpuinit start_secondary(void)
cpu_set(smp_processor_id(), cpu_online_map);
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
spin_unlock(&vector_lock);
+
unlock_ipi_call_lock();
cpu_idle();
@@ -1168,6 +955,11 @@ int __cpuinit __cpu_up(unsigned int cpu)
/* Unleash the CPU! */
Dprintk("waiting for cpu %d\n", cpu);
+ /*
+ * Make sure and check TSC sync:
+ */
+ check_tsc_sync_source(cpu);
+
while (!cpu_isset(cpu, cpu_online_map))
cpu_relax();
@@ -1190,7 +982,6 @@ void __init smp_cpus_done(unsigned int max_cpus)
smp_cleanup_boot();
setup_ioapic_dest();
check_nmi_watchdog();
- time_init_gtod();
}
#ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c
index 6026b31d037e..65ac2c6b34a6 100644
--- a/arch/x86_64/kernel/stacktrace.c
+++ b/arch/x86_64/kernel/stacktrace.c
@@ -32,7 +32,7 @@ static void save_stack_address(void *data, unsigned long addr)
trace->skip--;
return;
}
- if (trace->nr_entries < trace->max_entries - 1)
+ if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = addr;
}
@@ -49,7 +49,8 @@ static struct stacktrace_ops save_stack_ops = {
void save_stack_trace(struct stack_trace *trace, struct task_struct *task)
{
dump_trace(task, NULL, NULL, &save_stack_ops, trace);
- trace->entries[trace->nr_entries++] = ULONG_MAX;
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
}
EXPORT_SYMBOL(save_stack_trace);
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 335cc91c49b7..a87c51705620 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -42,10 +42,8 @@
#include <linux/cpufreq.h>
#include <linux/hpet.h>
#include <asm/apic.h>
+#include <asm/hpet.h>
-#ifdef CONFIG_CPU_FREQ
-static void cpufreq_delayed_get(void);
-#endif
extern void i8254_timer_resume(void);
extern int using_apic_timer;
@@ -55,128 +53,7 @@ DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);
DEFINE_SPINLOCK(i8253_lock);
-int nohpet __initdata = 0;
-static int notsc __initdata = 0;
-
-#define USEC_PER_TICK (USEC_PER_SEC / HZ)
-#define NSEC_PER_TICK (NSEC_PER_SEC / HZ)
-#define FSEC_PER_TICK (FSEC_PER_SEC / HZ)
-
-#define NS_SCALE 10 /* 2^10, carefully chosen */
-#define US_SCALE 32 /* 2^32, arbitralrily chosen */
-
-unsigned int cpu_khz; /* TSC clocks / usec, not used here */
-EXPORT_SYMBOL(cpu_khz);
-static unsigned long hpet_period; /* fsecs / HPET clock */
-unsigned long hpet_tick; /* HPET clocks / interrupt */
-int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */
-unsigned long vxtime_hz = PIT_TICK_RATE;
-int report_lost_ticks; /* command line option */
-unsigned long long monotonic_base;
-
-struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
-
volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
-struct timespec __xtime __section_xtime;
-struct timezone __sys_tz __section_sys_tz;
-
-/*
- * do_gettimeoffset() returns microseconds since last timer interrupt was
- * triggered by hardware. A memory read of HPET is slower than a register read
- * of TSC, but much more reliable. It's also synchronized to the timer
- * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a
- * timer interrupt has happened already, but vxtime.trigger wasn't updated yet.
- * This is not a problem, because jiffies hasn't updated either. They are bound
- * together by xtime_lock.
- */
-
-static inline unsigned int do_gettimeoffset_tsc(void)
-{
- unsigned long t;
- unsigned long x;
- t = get_cycles_sync();
- if (t < vxtime.last_tsc)
- t = vxtime.last_tsc; /* hack */
- x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE;
- return x;
-}
-
-static inline unsigned int do_gettimeoffset_hpet(void)
-{
- /* cap counter read to one tick to avoid inconsistencies */
- unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last;
- return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE;
-}
-
-unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
-
-/*
- * This version of gettimeofday() has microsecond resolution and better than
- * microsecond precision, as we're using at least a 10 MHz (usually 14.31818
- * MHz) HPET timer.
- */
-
-void do_gettimeofday(struct timeval *tv)
-{
- unsigned long seq;
- unsigned int sec, usec;
-
- do {
- seq = read_seqbegin(&xtime_lock);
-
- sec = xtime.tv_sec;
- usec = xtime.tv_nsec / NSEC_PER_USEC;
-
- /* i386 does some correction here to keep the clock
- monotonous even when ntpd is fixing drift.
- But they didn't work for me, there is a non monotonic
- clock anyways with ntp.
- I dropped all corrections now until a real solution can
- be found. Note when you fix it here you need to do the same
- in arch/x86_64/kernel/vsyscall.c and export all needed
- variables in vmlinux.lds. -AK */
- usec += do_gettimeoffset();
-
- } while (read_seqretry(&xtime_lock, seq));
-
- tv->tv_sec = sec + usec / USEC_PER_SEC;
- tv->tv_usec = usec % USEC_PER_SEC;
-}
-
-EXPORT_SYMBOL(do_gettimeofday);
-
-/*
- * settimeofday() first undoes the correction that gettimeofday would do
- * on the time, and then saves it. This is ugly, but has been like this for
- * ages already.
- */
-
-int do_settimeofday(struct timespec *tv)
-{
- time_t wtm_sec, sec = tv->tv_sec;
- long wtm_nsec, nsec = tv->tv_nsec;
-
- if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
- return -EINVAL;
-
- write_seqlock_irq(&xtime_lock);
-
- nsec -= do_gettimeoffset() * NSEC_PER_USEC;
-
- wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
- wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
- set_normalized_timespec(&xtime, sec, nsec);
- set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
- ntp_clear();
-
- write_sequnlock_irq(&xtime_lock);
- clock_was_set();
- return 0;
-}
-
-EXPORT_SYMBOL(do_settimeofday);
unsigned long profile_pc(struct pt_regs *regs)
{
@@ -267,84 +144,9 @@ static void set_rtc_mmss(unsigned long nowtime)
}
-/* monotonic_clock(): returns # of nanoseconds passed since time_init()
- * Note: This function is required to return accurate
- * time even in the absence of multiple timer ticks.
- */
-static inline unsigned long long cycles_2_ns(unsigned long long cyc);
-unsigned long long monotonic_clock(void)
-{
- unsigned long seq;
- u32 last_offset, this_offset, offset;
- unsigned long long base;
-
- if (vxtime.mode == VXTIME_HPET) {
- do {
- seq = read_seqbegin(&xtime_lock);
-
- last_offset = vxtime.last;
- base = monotonic_base;
- this_offset = hpet_readl(HPET_COUNTER);
- } while (read_seqretry(&xtime_lock, seq));
- offset = (this_offset - last_offset);
- offset *= NSEC_PER_TICK / hpet_tick;
- } else {
- do {
- seq = read_seqbegin(&xtime_lock);
-
- last_offset = vxtime.last_tsc;
- base = monotonic_base;
- } while (read_seqretry(&xtime_lock, seq));
- this_offset = get_cycles_sync();
- offset = cycles_2_ns(this_offset - last_offset);
- }
- return base + offset;
-}
-EXPORT_SYMBOL(monotonic_clock);
-
-static noinline void handle_lost_ticks(int lost)
-{
- static long lost_count;
- static int warned;
- if (report_lost_ticks) {
- printk(KERN_WARNING "time.c: Lost %d timer tick(s)! ", lost);
- print_symbol("rip %s)\n", get_irq_regs()->rip);
- }
-
- if (lost_count == 1000 && !warned) {
- printk(KERN_WARNING "warning: many lost ticks.\n"
- KERN_WARNING "Your time source seems to be instable or "
- "some driver is hogging interupts\n");
- print_symbol("rip %s\n", get_irq_regs()->rip);
- if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) {
- printk(KERN_WARNING "Falling back to HPET\n");
- if (hpet_use_timer)
- vxtime.last = hpet_readl(HPET_T0_CMP) -
- hpet_tick;
- else
- vxtime.last = hpet_readl(HPET_COUNTER);
- vxtime.mode = VXTIME_HPET;
- do_gettimeoffset = do_gettimeoffset_hpet;
- }
- /* else should fall back to PIT, but code missing. */
- warned = 1;
- } else
- lost_count++;
-
-#ifdef CONFIG_CPU_FREQ
- /* In some cases the CPU can change frequency without us noticing
- Give cpufreq a change to catch up. */
- if ((lost_count+1) % 25 == 0)
- cpufreq_delayed_get();
-#endif
-}
-
void main_timer_handler(void)
{
static unsigned long rtc_update = 0;
- unsigned long tsc;
- int delay = 0, offset = 0, lost = 0;
-
/*
* Here we are in the timer irq handler. We have irqs locally disabled (so we
* don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
@@ -354,72 +156,11 @@ void main_timer_handler(void)
write_seqlock(&xtime_lock);
- if (vxtime.hpet_address)
- offset = hpet_readl(HPET_COUNTER);
-
- if (hpet_use_timer) {
- /* if we're using the hpet timer functionality,
- * we can more accurately know the counter value
- * when the timer interrupt occured.
- */
- offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
- delay = hpet_readl(HPET_COUNTER) - offset;
- } else if (!pmtmr_ioport) {
- spin_lock(&i8253_lock);
- outb_p(0x00, 0x43);
- delay = inb_p(0x40);
- delay |= inb(0x40) << 8;
- spin_unlock(&i8253_lock);
- delay = LATCH - 1 - delay;
- }
-
- tsc = get_cycles_sync();
-
- if (vxtime.mode == VXTIME_HPET) {
- if (offset - vxtime.last > hpet_tick) {
- lost = (offset - vxtime.last) / hpet_tick - 1;
- }
-
- monotonic_base +=
- (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick;
-
- vxtime.last = offset;
-#ifdef CONFIG_X86_PM_TIMER
- } else if (vxtime.mode == VXTIME_PMTMR) {
- lost = pmtimer_mark_offset();
-#endif
- } else {
- offset = (((tsc - vxtime.last_tsc) *
- vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK;
-
- if (offset < 0)
- offset = 0;
-
- if (offset > USEC_PER_TICK) {
- lost = offset / USEC_PER_TICK;
- offset %= USEC_PER_TICK;
- }
-
- monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc);
-
- vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;
-
- if ((((tsc - vxtime.last_tsc) *
- vxtime.tsc_quot) >> US_SCALE) < offset)
- vxtime.last_tsc = tsc -
- (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1;
- }
-
- if (lost > 0)
- handle_lost_ticks(lost);
- else
- lost = 0;
-
/*
* Do the timer stuff.
*/
- do_timer(lost + 1);
+ do_timer(1);
#ifndef CONFIG_SMP
update_process_times(user_mode(get_irq_regs()));
#endif
@@ -460,40 +201,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
-static unsigned int cyc2ns_scale __read_mostly;
-
-static inline void set_cyc2ns_scale(unsigned long cpu_khz)
-{
- cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz;
-}
-
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
- return (cyc * cyc2ns_scale) >> NS_SCALE;
-}
-
-unsigned long long sched_clock(void)
-{
- unsigned long a = 0;
-
-#if 0
- /* Don't do a HPET read here. Using TSC always is much faster
- and HPET may not be mapped yet when the scheduler first runs.
- Disadvantage is a small drift between CPUs in some configurations,
- but that should be tolerable. */
- if (__vxtime.mode == VXTIME_HPET)
- return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE;
-#endif
-
- /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
- which means it is not completely exact and may not be monotonous between
- CPUs. But the errors should be too small to matter for scheduling
- purposes. */
-
- rdtscll(a);
- return cycles_2_ns(a);
-}
-
static unsigned long get_cmos_time(void)
{
unsigned int year, mon, day, hour, min, sec;
@@ -545,159 +252,6 @@ static unsigned long get_cmos_time(void)
return mktime(year, mon, day, hour, min, sec);
}
-#ifdef CONFIG_CPU_FREQ
-
-/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
- changes.
-
- RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
- not that important because current Opteron setups do not support
- scaling on SMP anyroads.
-
- Should fix up last_tsc too. Currently gettimeofday in the
- first tick after the change will be slightly wrong. */
-
-#include <linux/workqueue.h>
-
-static unsigned int cpufreq_delayed_issched = 0;
-static unsigned int cpufreq_init = 0;
-static struct work_struct cpufreq_delayed_get_work;
-
-static void handle_cpufreq_delayed_get(struct work_struct *v)
-{
- unsigned int cpu;
- for_each_online_cpu(cpu) {
- cpufreq_get(cpu);
- }
- cpufreq_delayed_issched = 0;
-}
-
-/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries
- * to verify the CPU frequency the timing core thinks the CPU is running
- * at is still correct.
- */
-static void cpufreq_delayed_get(void)
-{
- static int warned;
- if (cpufreq_init && !cpufreq_delayed_issched) {
- cpufreq_delayed_issched = 1;
- if (!warned) {
- warned = 1;
- printk(KERN_DEBUG
- "Losing some ticks... checking if CPU frequency changed.\n");
- }
- schedule_work(&cpufreq_delayed_get_work);
- }
-}
-
-static unsigned int ref_freq = 0;
-static unsigned long loops_per_jiffy_ref = 0;
-
-static unsigned long cpu_khz_ref = 0;
-
-static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
- void *data)
-{
- struct cpufreq_freqs *freq = data;
- unsigned long *lpj, dummy;
-
- if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
- return 0;
-
- lpj = &dummy;
- if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-#ifdef CONFIG_SMP
- lpj = &cpu_data[freq->cpu].loops_per_jiffy;
-#else
- lpj = &boot_cpu_data.loops_per_jiffy;
-#endif
-
- if (!ref_freq) {
- ref_freq = freq->old;
- loops_per_jiffy_ref = *lpj;
- cpu_khz_ref = cpu_khz;
- }
- if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
- (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
- (val == CPUFREQ_RESUMECHANGE)) {
- *lpj =
- cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
-
- cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
- if (!(freq->flags & CPUFREQ_CONST_LOOPS))
- vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
- }
-
- set_cyc2ns_scale(cpu_khz_ref);
-
- return 0;
-}
-
-static struct notifier_block time_cpufreq_notifier_block = {
- .notifier_call = time_cpufreq_notifier
-};
-
-static int __init cpufreq_tsc(void)
-{
- INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
- if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
- CPUFREQ_TRANSITION_NOTIFIER))
- cpufreq_init = 1;
- return 0;
-}
-
-core_initcall(cpufreq_tsc);
-
-#endif
-
-/*
- * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing
- * it to the HPET timer of known frequency.
- */
-
-#define TICK_COUNT 100000000
-#define TICK_MIN 5000
-
-/*
- * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none
- * occurs between the reads of the hpet & TSC.
- */
-static void __init read_hpet_tsc(int *hpet, int *tsc)
-{
- int tsc1, tsc2, hpet1;
-
- do {
- tsc1 = get_cycles_sync();
- hpet1 = hpet_readl(HPET_COUNTER);
- tsc2 = get_cycles_sync();
- } while (tsc2 - tsc1 > TICK_MIN);
- *hpet = hpet1;
- *tsc = tsc2;
-}
-
-
-static unsigned int __init hpet_calibrate_tsc(void)
-{
- int tsc_start, hpet_start;
- int tsc_now, hpet_now;
- unsigned long flags;
-
- local_irq_save(flags);
- local_irq_disable();
-
- read_hpet_tsc(&hpet_start, &tsc_start);
-
- do {
- local_irq_disable();
- read_hpet_tsc(&hpet_now, &tsc_now);
- local_irq_restore(flags);
- } while ((tsc_now - tsc_start) < TICK_COUNT &&
- (hpet_now - hpet_start) < TICK_COUNT);
-
- return (tsc_now - tsc_start) * 1000000000L
- / ((hpet_now - hpet_start) * hpet_period / 1000);
-}
-
/*
* pit_calibrate_tsc() uses the speaker output (channel 2) of
@@ -728,124 +282,6 @@ static unsigned int __init pit_calibrate_tsc(void)
return (end - start) / 50;
}
-#ifdef CONFIG_HPET
-static __init int late_hpet_init(void)
-{
- struct hpet_data hd;
- unsigned int ntimer;
-
- if (!vxtime.hpet_address)
- return 0;
-
- memset(&hd, 0, sizeof (hd));
-
- ntimer = hpet_readl(HPET_ID);
- ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
- ntimer++;
-
- /*
- * Register with driver.
- * Timer0 and Timer1 is used by platform.
- */
- hd.hd_phys_address = vxtime.hpet_address;
- hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
- hd.hd_nirqs = ntimer;
- hd.hd_flags = HPET_DATA_PLATFORM;
- hpet_reserve_timer(&hd, 0);
-#ifdef CONFIG_HPET_EMULATE_RTC
- hpet_reserve_timer(&hd, 1);
-#endif
- hd.hd_irq[0] = HPET_LEGACY_8254;
- hd.hd_irq[1] = HPET_LEGACY_RTC;
- if (ntimer > 2) {
- struct hpet *hpet;
- struct hpet_timer *timer;
- int i;
-
- hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
- timer = &hpet->hpet_timers[2];
- for (i = 2; i < ntimer; timer++, i++)
- hd.hd_irq[i] = (timer->hpet_config &
- Tn_INT_ROUTE_CNF_MASK) >>
- Tn_INT_ROUTE_CNF_SHIFT;
-
- }
-
- hpet_alloc(&hd);
- return 0;
-}
-fs_initcall(late_hpet_init);
-#endif
-
-static int hpet_timer_stop_set_go(unsigned long tick)
-{
- unsigned int cfg;
-
-/*
- * Stop the timers and reset the main counter.
- */
-
- cfg = hpet_readl(HPET_CFG);
- cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
- hpet_writel(cfg, HPET_CFG);
- hpet_writel(0, HPET_COUNTER);
- hpet_writel(0, HPET_COUNTER + 4);
-
-/*
- * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
- * and period also hpet_tick.
- */
- if (hpet_use_timer) {
- hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
- HPET_TN_32BIT, HPET_T0_CFG);
- hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */
- hpet_writel(hpet_tick, HPET_T0_CMP); /* period */
- cfg |= HPET_CFG_LEGACY;
- }
-/*
- * Go!
- */
-
- cfg |= HPET_CFG_ENABLE;
- hpet_writel(cfg, HPET_CFG);
-
- return 0;
-}
-
-static int hpet_init(void)
-{
- unsigned int id;
-
- if (!vxtime.hpet_address)
- return -1;
- set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address);
- __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
-
-/*
- * Read the period, compute tick and quotient.
- */
-
- id = hpet_readl(HPET_ID);
-
- if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
- return -1;
-
- hpet_period = hpet_readl(HPET_PERIOD);
- if (hpet_period < 100000 || hpet_period > 100000000)
- return -1;
-
- hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;
-
- hpet_use_timer = (id & HPET_ID_LEGSUP);
-
- return hpet_timer_stop_set_go(hpet_tick);
-}
-
-static int hpet_reenable(void)
-{
- return hpet_timer_stop_set_go(hpet_tick);
-}
-
#define PIT_MODE 0x43
#define PIT_CH0 0x40
@@ -873,7 +309,7 @@ void __init pit_stop_interrupt(void)
void __init stop_timer_interrupt(void)
{
char *name;
- if (vxtime.hpet_address) {
+ if (hpet_address) {
name = "HPET";
hpet_timer_stop_set_go(0);
} else {
@@ -883,12 +319,6 @@ void __init stop_timer_interrupt(void)
printk(KERN_INFO "timer: %s interrupt stopped.\n", name);
}
-int __init time_setup(char *str)
-{
- report_lost_ticks = 1;
- return 1;
-}
-
static struct irqaction irq0 = {
timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL
};
@@ -896,124 +326,41 @@ static struct irqaction irq0 = {
void __init time_init(void)
{
if (nohpet)
- vxtime.hpet_address = 0;
-
+ hpet_address = 0;
xtime.tv_sec = get_cmos_time();
xtime.tv_nsec = 0;
set_normalized_timespec(&wall_to_monotonic,
-xtime.tv_sec, -xtime.tv_nsec);
- if (!hpet_init())
- vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period;
- else
- vxtime.hpet_address = 0;
+ if (hpet_arch_init())
+ hpet_address = 0;
if (hpet_use_timer) {
/* set tick_nsec to use the proper rate for HPET */
tick_nsec = TICK_NSEC_HPET;
cpu_khz = hpet_calibrate_tsc();
timename = "HPET";
-#ifdef CONFIG_X86_PM_TIMER
- } else if (pmtmr_ioport && !vxtime.hpet_address) {
- vxtime_hz = PM_TIMER_FREQUENCY;
- timename = "PM";
- pit_init();
- cpu_khz = pit_calibrate_tsc();
-#endif
} else {
pit_init();
cpu_khz = pit_calibrate_tsc();
timename = "PIT";
}
- vxtime.mode = VXTIME_TSC;
- vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz;
- vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
- vxtime.last_tsc = get_cycles_sync();
- set_cyc2ns_scale(cpu_khz);
- setup_irq(0, &irq0);
-
-#ifndef CONFIG_SMP
- time_init_gtod();
-#endif
-}
-
-/*
- * Make an educated guess if the TSC is trustworthy and synchronized
- * over all CPUs.
- */
-__cpuinit int unsynchronized_tsc(void)
-{
-#ifdef CONFIG_SMP
- if (apic_is_clustered_box())
- return 1;
-#endif
- /* Most intel systems have synchronized TSCs except for
- multi node systems */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
-#ifdef CONFIG_ACPI
- /* But TSC doesn't tick in C3 so don't use it there */
- if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000)
- return 1;
-#endif
- return 0;
- }
-
- /* Assume multi socket systems are not synchronized */
- return num_present_cpus() > 1;
-}
-
-/*
- * Decide what mode gettimeofday should use.
- */
-void time_init_gtod(void)
-{
- char *timetype;
-
if (unsynchronized_tsc())
- notsc = 1;
+ mark_tsc_unstable();
- if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
vgetcpu_mode = VGETCPU_RDTSCP;
else
vgetcpu_mode = VGETCPU_LSL;
- if (vxtime.hpet_address && notsc) {
- timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
- if (hpet_use_timer)
- vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
- else
- vxtime.last = hpet_readl(HPET_COUNTER);
- vxtime.mode = VXTIME_HPET;
- do_gettimeoffset = do_gettimeoffset_hpet;
-#ifdef CONFIG_X86_PM_TIMER
- /* Using PM for gettimeofday is quite slow, but we have no other
- choice because the TSC is too unreliable on some systems. */
- } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) {
- timetype = "PM";
- do_gettimeoffset = do_gettimeoffset_pm;
- vxtime.mode = VXTIME_PMTMR;
- sysctl_vsyscall = 0;
- printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n");
-#endif
- } else {
- timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC";
- vxtime.mode = VXTIME_TSC;
- }
-
- printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n",
- vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype);
+ set_cyc2ns_scale(cpu_khz);
printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
cpu_khz / 1000, cpu_khz % 1000);
- vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz;
- vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
- vxtime.last_tsc = get_cycles_sync();
-
- set_cyc2ns_scale(cpu_khz);
+ setup_irq(0, &irq0);
}
-__setup("report_lost_ticks", time_setup);
static long clock_cmos_diff;
static unsigned long sleep_start;
@@ -1050,7 +397,7 @@ static int timer_resume(struct sys_device *dev)
sleep_length = 0;
ctime = sleep_start;
}
- if (vxtime.hpet_address)
+ if (hpet_address)
hpet_reenable();
else
i8254_timer_resume();
@@ -1059,20 +406,8 @@ static int timer_resume(struct sys_device *dev)
write_seqlock_irqsave(&xtime_lock,flags);
xtime.tv_sec = sec;
xtime.tv_nsec = 0;
- if (vxtime.mode == VXTIME_HPET) {
- if (hpet_use_timer)
- vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
- else
- vxtime.last = hpet_readl(HPET_COUNTER);
-#ifdef CONFIG_X86_PM_TIMER
- } else if (vxtime.mode == VXTIME_PMTMR) {
- pmtimer_resume();
-#endif
- } else
- vxtime.last_tsc = get_cycles_sync();
- write_sequnlock_irqrestore(&xtime_lock,flags);
jiffies += sleep_length;
- monotonic_base += sleep_length * (NSEC_PER_SEC/HZ);
+ write_sequnlock_irqrestore(&xtime_lock,flags);
touch_softlockup_watchdog();
return 0;
}
@@ -1098,269 +433,3 @@ static int time_init_device(void)
}
device_initcall(time_init_device);
-
-#ifdef CONFIG_HPET_EMULATE_RTC
-/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
- * is enabled, we support RTC interrupt functionality in software.
- * RTC has 3 kinds of interrupts:
- * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
- * is updated
- * 2) Alarm Interrupt - generate an interrupt at a specific time of day
- * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
- * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
- * (1) and (2) above are implemented using polling at a frequency of
- * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
- * overhead. (DEFAULT_RTC_INT_FREQ)
- * For (3), we use interrupts at 64Hz or user specified periodic
- * frequency, whichever is higher.
- */
-#include <linux/rtc.h>
-
-#define DEFAULT_RTC_INT_FREQ 64
-#define RTC_NUM_INTS 1
-
-static unsigned long UIE_on;
-static unsigned long prev_update_sec;
-
-static unsigned long AIE_on;
-static struct rtc_time alarm_time;
-
-static unsigned long PIE_on;
-static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
-static unsigned long PIE_count;
-
-static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
-static unsigned int hpet_t1_cmp; /* cached comparator register */
-
-int is_hpet_enabled(void)
-{
- return vxtime.hpet_address != 0;
-}
-
-/*
- * Timer 1 for RTC, we do not use periodic interrupt feature,
- * even if HPET supports periodic interrupts on Timer 1.
- * The reason being, to set up a periodic interrupt in HPET, we need to
- * stop the main counter. And if we do that everytime someone diables/enables
- * RTC, we will have adverse effect on main kernel timer running on Timer 0.
- * So, for the time being, simulate the periodic interrupt in software.
- *
- * hpet_rtc_timer_init() is called for the first time and during subsequent
- * interuppts reinit happens through hpet_rtc_timer_reinit().
- */
-int hpet_rtc_timer_init(void)
-{
- unsigned int cfg, cnt;
- unsigned long flags;
-
- if (!is_hpet_enabled())
- return 0;
- /*
- * Set the counter 1 and enable the interrupts.
- */
- if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
- hpet_rtc_int_freq = PIE_freq;
- else
- hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
-
- local_irq_save(flags);
-
- cnt = hpet_readl(HPET_COUNTER);
- cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
- hpet_writel(cnt, HPET_T1_CMP);
- hpet_t1_cmp = cnt;
-
- cfg = hpet_readl(HPET_T1_CFG);
- cfg &= ~HPET_TN_PERIODIC;
- cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_T1_CFG);
-
- local_irq_restore(flags);
-
- return 1;
-}
-
-static void hpet_rtc_timer_reinit(void)
-{
- unsigned int cfg, cnt, ticks_per_int, lost_ints;
-
- if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
- cfg = hpet_readl(HPET_T1_CFG);
- cfg &= ~HPET_TN_ENABLE;
- hpet_writel(cfg, HPET_T1_CFG);
- return;
- }
-
- if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
- hpet_rtc_int_freq = PIE_freq;
- else
- hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
-
- /* It is more accurate to use the comparator value than current count.*/
- ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
- hpet_t1_cmp += ticks_per_int;
- hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
-
- /*
- * If the interrupt handler was delayed too long, the write above tries
- * to schedule the next interrupt in the past and the hardware would
- * not interrupt until the counter had wrapped around.
- * So we have to check that the comparator wasn't set to a past time.
- */
- cnt = hpet_readl(HPET_COUNTER);
- if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
- lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
- /* Make sure that, even with the time needed to execute
- * this code, the next scheduled interrupt has been moved
- * back to the future: */
- lost_ints++;
-
- hpet_t1_cmp += lost_ints * ticks_per_int;
- hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
-
- if (PIE_on)
- PIE_count += lost_ints;
-
- printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
- hpet_rtc_int_freq);
- }
-}
-
-/*
- * The functions below are called from rtc driver.
- * Return 0 if HPET is not being used.
- * Otherwise do the necessary changes and return 1.
- */
-int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
-{
- if (!is_hpet_enabled())
- return 0;
-
- if (bit_mask & RTC_UIE)
- UIE_on = 0;
- if (bit_mask & RTC_PIE)
- PIE_on = 0;
- if (bit_mask & RTC_AIE)
- AIE_on = 0;
-
- return 1;
-}
-
-int hpet_set_rtc_irq_bit(unsigned long bit_mask)
-{
- int timer_init_reqd = 0;
-
- if (!is_hpet_enabled())
- return 0;
-
- if (!(PIE_on | AIE_on | UIE_on))
- timer_init_reqd = 1;
-
- if (bit_mask & RTC_UIE) {
- UIE_on = 1;
- }
- if (bit_mask & RTC_PIE) {
- PIE_on = 1;
- PIE_count = 0;
- }
- if (bit_mask & RTC_AIE) {
- AIE_on = 1;
- }
-
- if (timer_init_reqd)
- hpet_rtc_timer_init();
-
- return 1;
-}
-
-int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
-{
- if (!is_hpet_enabled())
- return 0;
-
- alarm_time.tm_hour = hrs;
- alarm_time.tm_min = min;
- alarm_time.tm_sec = sec;
-
- return 1;
-}
-
-int hpet_set_periodic_freq(unsigned long freq)
-{
- if (!is_hpet_enabled())
- return 0;
-
- PIE_freq = freq;
- PIE_count = 0;
-
- return 1;
-}
-
-int hpet_rtc_dropped_irq(void)
-{
- if (!is_hpet_enabled())
- return 0;
-
- return 1;
-}
-
-irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
-{
- struct rtc_time curr_time;
- unsigned long rtc_int_flag = 0;
- int call_rtc_interrupt = 0;
-
- hpet_rtc_timer_reinit();
-
- if (UIE_on | AIE_on) {
- rtc_get_rtc_time(&curr_time);
- }
- if (UIE_on) {
- if (curr_time.tm_sec != prev_update_sec) {
- /* Set update int info, call real rtc int routine */
- call_rtc_interrupt = 1;
- rtc_int_flag = RTC_UF;
- prev_update_sec = curr_time.tm_sec;
- }
- }
- if (PIE_on) {
- PIE_count++;
- if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
- /* Set periodic int info, call real rtc int routine */
- call_rtc_interrupt = 1;
- rtc_int_flag |= RTC_PF;
- PIE_count = 0;
- }
- }
- if (AIE_on) {
- if ((curr_time.tm_sec == alarm_time.tm_sec) &&
- (curr_time.tm_min == alarm_time.tm_min) &&
- (curr_time.tm_hour == alarm_time.tm_hour)) {
- /* Set alarm int info, call real rtc int routine */
- call_rtc_interrupt = 1;
- rtc_int_flag |= RTC_AF;
- }
- }
- if (call_rtc_interrupt) {
- rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
- rtc_interrupt(rtc_int_flag, dev_id);
- }
- return IRQ_HANDLED;
-}
-#endif
-
-static int __init nohpet_setup(char *s)
-{
- nohpet = 1;
- return 1;
-}
-
-__setup("nohpet", nohpet_setup);
-
-int __init notsc_setup(char *s)
-{
- notsc = 1;
- return 1;
-}
-
-__setup("notsc", notsc_setup);
diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c
new file mode 100644
index 000000000000..895831865019
--- /dev/null
+++ b/arch/x86_64/kernel/tsc.c
@@ -0,0 +1,226 @@
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/clocksource.h>
+#include <linux/time.h>
+#include <linux/acpi.h>
+#include <linux/cpufreq.h>
+
+#include <asm/timex.h>
+
+static int notsc __initdata = 0;
+
+unsigned int cpu_khz; /* TSC clocks / usec, not used here */
+EXPORT_SYMBOL(cpu_khz);
+
+static unsigned int cyc2ns_scale __read_mostly;
+
+void set_cyc2ns_scale(unsigned long khz)
+{
+ cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
+}
+
+static unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+ return (cyc * cyc2ns_scale) >> NS_SCALE;
+}
+
+unsigned long long sched_clock(void)
+{
+ unsigned long a = 0;
+
+ /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
+ * which means it is not completely exact and may not be monotonous
+ * between CPUs. But the errors should be too small to matter for
+ * scheduling purposes.
+ */
+
+ rdtscll(a);
+ return cycles_2_ns(a);
+}
+
+static int tsc_unstable;
+
+static inline int check_tsc_unstable(void)
+{
+ return tsc_unstable;
+}
+#ifdef CONFIG_CPU_FREQ
+
+/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
+ * changes.
+ *
+ * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
+ * not that important because current Opteron setups do not support
+ * scaling on SMP anyroads.
+ *
+ * Should fix up last_tsc too. Currently gettimeofday in the
+ * first tick after the change will be slightly wrong.
+ */
+
+#include <linux/workqueue.h>
+
+static unsigned int cpufreq_delayed_issched = 0;
+static unsigned int cpufreq_init = 0;
+static struct work_struct cpufreq_delayed_get_work;
+
+static void handle_cpufreq_delayed_get(struct work_struct *v)
+{
+ unsigned int cpu;
+ for_each_online_cpu(cpu) {
+ cpufreq_get(cpu);
+ }
+ cpufreq_delayed_issched = 0;
+}
+
+static unsigned int ref_freq = 0;
+static unsigned long loops_per_jiffy_ref = 0;
+
+static unsigned long cpu_khz_ref = 0;
+
+static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ unsigned long *lpj, dummy;
+
+ if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
+ return 0;
+
+ lpj = &dummy;
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+#ifdef CONFIG_SMP
+ lpj = &cpu_data[freq->cpu].loops_per_jiffy;
+#else
+ lpj = &boot_cpu_data.loops_per_jiffy;
+#endif
+
+ if (!ref_freq) {
+ ref_freq = freq->old;
+ loops_per_jiffy_ref = *lpj;
+ cpu_khz_ref = cpu_khz;
+ }
+ if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
+ (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+ (val == CPUFREQ_RESUMECHANGE)) {
+ *lpj =
+ cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+
+ cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+ mark_tsc_unstable();
+ }
+
+ set_cyc2ns_scale(cpu_khz_ref);
+
+ return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+ .notifier_call = time_cpufreq_notifier
+};
+
+static int __init cpufreq_tsc(void)
+{
+ INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
+ if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER))
+ cpufreq_init = 1;
+ return 0;
+}
+
+core_initcall(cpufreq_tsc);
+
+#endif
+
+static int tsc_unstable = 0;
+
+/*
+ * Make an educated guess if the TSC is trustworthy and synchronized
+ * over all CPUs.
+ */
+__cpuinit int unsynchronized_tsc(void)
+{
+ if (tsc_unstable)
+ return 1;
+
+#ifdef CONFIG_SMP
+ if (apic_is_clustered_box())
+ return 1;
+#endif
+ /* Most intel systems have synchronized TSCs except for
+ multi node systems */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+#ifdef CONFIG_ACPI
+ /* But TSC doesn't tick in C3 so don't use it there */
+ if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000)
+ return 1;
+#endif
+ return 0;
+ }
+
+ /* Assume multi socket systems are not synchronized */
+ return num_present_cpus() > 1;
+}
+
+int __init notsc_setup(char *s)
+{
+ notsc = 1;
+ return 1;
+}
+
+__setup("notsc", notsc_setup);
+
+
+/* clock source code: */
+static cycle_t read_tsc(void)
+{
+ cycle_t ret = (cycle_t)get_cycles_sync();
+ return ret;
+}
+
+static cycle_t __vsyscall_fn vread_tsc(void)
+{
+ cycle_t ret = (cycle_t)get_cycles_sync();
+ return ret;
+}
+
+static struct clocksource clocksource_tsc = {
+ .name = "tsc",
+ .rating = 300,
+ .read = read_tsc,
+ .mask = CLOCKSOURCE_MASK(64),
+ .shift = 22,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS |
+ CLOCK_SOURCE_MUST_VERIFY,
+ .vread = vread_tsc,
+};
+
+void mark_tsc_unstable(void)
+{
+ if (!tsc_unstable) {
+ tsc_unstable = 1;
+ /* Change only the rating, when not registered */
+ if (clocksource_tsc.mult)
+ clocksource_change_rating(&clocksource_tsc, 0);
+ else
+ clocksource_tsc.rating = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+static int __init init_tsc_clocksource(void)
+{
+ if (!notsc) {
+ clocksource_tsc.mult = clocksource_khz2mult(cpu_khz,
+ clocksource_tsc.shift);
+ if (check_tsc_unstable())
+ clocksource_tsc.rating = 0;
+
+ return clocksource_register(&clocksource_tsc);
+ }
+ return 0;
+}
+
+module_init(init_tsc_clocksource);
diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c
new file mode 100644
index 000000000000..014f0db45dfa
--- /dev/null
+++ b/arch/x86_64/kernel/tsc_sync.c
@@ -0,0 +1,187 @@
+/*
+ * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization.
+ *
+ * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
+ *
+ * We check whether all boot CPUs have their TSC's synchronized,
+ * print a warning if not and turn off the TSC clock-source.
+ *
+ * The warp-check is point-to-point between two CPUs, the CPU
+ * initiating the bootup is the 'source CPU', the freshly booting
+ * CPU is the 'target CPU'.
+ *
+ * Only two CPUs may participate - they can enter in any order.
+ * ( The serial nature of the boot logic and the CPU hotplug lock
+ * protects against more than 2 CPUs entering this code. )
+ */
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <asm/tsc.h>
+
+/*
+ * Entry/exit counters that make sure that both CPUs
+ * run the measurement code at once:
+ */
+static __cpuinitdata atomic_t start_count;
+static __cpuinitdata atomic_t stop_count;
+
+/*
+ * We use a raw spinlock in this exceptional case, because
+ * we want to have the fastest, inlined, non-debug version
+ * of a critical section, to be able to prove TSC time-warps:
+ */
+static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static __cpuinitdata cycles_t last_tsc;
+static __cpuinitdata cycles_t max_warp;
+static __cpuinitdata int nr_warps;
+
+/*
+ * TSC-warp measurement loop running on both CPUs:
+ */
+static __cpuinit void check_tsc_warp(void)
+{
+ cycles_t start, now, prev, end;
+ int i;
+
+ start = get_cycles_sync();
+ /*
+ * The measurement runs for 20 msecs:
+ */
+ end = start + cpu_khz * 20ULL;
+ now = start;
+
+ for (i = 0; ; i++) {
+ /*
+ * We take the global lock, measure TSC, save the
+ * previous TSC that was measured (possibly on
+ * another CPU) and update the previous TSC timestamp.
+ */
+ __raw_spin_lock(&sync_lock);
+ prev = last_tsc;
+ now = get_cycles_sync();
+ last_tsc = now;
+ __raw_spin_unlock(&sync_lock);
+
+ /*
+ * Be nice every now and then (and also check whether
+ * measurement is done [we also insert a 100 million
+ * loops safety exit, so we dont lock up in case the
+ * TSC readout is totally broken]):
+ */
+ if (unlikely(!(i & 7))) {
+ if (now > end || i > 100000000)
+ break;
+ cpu_relax();
+ touch_nmi_watchdog();
+ }
+ /*
+ * Outside the critical section we can now see whether
+ * we saw a time-warp of the TSC going backwards:
+ */
+ if (unlikely(prev > now)) {
+ __raw_spin_lock(&sync_lock);
+ max_warp = max(max_warp, prev - now);
+ nr_warps++;
+ __raw_spin_unlock(&sync_lock);
+ }
+
+ }
+}
+
+/*
+ * Source CPU calls into this - it waits for the freshly booted
+ * target CPU to arrive and then starts the measurement:
+ */
+void __cpuinit check_tsc_sync_source(int cpu)
+{
+ int cpus = 2;
+
+ /*
+ * No need to check if we already know that the TSC is not
+ * synchronized:
+ */
+ if (unsynchronized_tsc())
+ return;
+
+ printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
+ smp_processor_id(), cpu);
+
+ /*
+ * Reset it - in case this is a second bootup:
+ */
+ atomic_set(&stop_count, 0);
+
+ /*
+ * Wait for the target to arrive:
+ */
+ while (atomic_read(&start_count) != cpus-1)
+ cpu_relax();
+ /*
+ * Trigger the target to continue into the measurement too:
+ */
+ atomic_inc(&start_count);
+
+ check_tsc_warp();
+
+ while (atomic_read(&stop_count) != cpus-1)
+ cpu_relax();
+
+ /*
+ * Reset it - just in case we boot another CPU later:
+ */
+ atomic_set(&start_count, 0);
+
+ if (nr_warps) {
+ printk("\n");
+ printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
+ " turning off TSC clock.\n", max_warp);
+ mark_tsc_unstable();
+ nr_warps = 0;
+ max_warp = 0;
+ last_tsc = 0;
+ } else {
+ printk(" passed.\n");
+ }
+
+ /*
+ * Let the target continue with the bootup:
+ */
+ atomic_inc(&stop_count);
+}
+
+/*
+ * Freshly booted CPUs call into this:
+ */
+void __cpuinit check_tsc_sync_target(void)
+{
+ int cpus = 2;
+
+ if (unsynchronized_tsc())
+ return;
+
+ /*
+ * Register this CPU's participation and wait for the
+ * source CPU to start the measurement:
+ */
+ atomic_inc(&start_count);
+ while (atomic_read(&start_count) != cpus)
+ cpu_relax();
+
+ check_tsc_warp();
+
+ /*
+ * Ok, we are done:
+ */
+ atomic_inc(&stop_count);
+
+ /*
+ * Wait for the source CPU to print stuff:
+ */
+ while (atomic_read(&stop_count) != cpus)
+ cpu_relax();
+}
+#undef NR_LOOPS
+
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index c360c4225244..b73212c0a550 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -88,31 +88,25 @@ SECTIONS
__vsyscall_0 = VSYSCALL_VIRT_ADDR;
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) }
- xtime_lock = VVIRT(.xtime_lock);
-
- .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
- vxtime = VVIRT(.vxtime);
+ .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
+ { *(.vsyscall_gtod_data) }
+ vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
.vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
vgetcpu_mode = VVIRT(.vgetcpu_mode);
- .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) }
- sys_tz = VVIRT(.sys_tz);
-
- .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) }
- sysctl_vsyscall = VVIRT(.sysctl_vsyscall);
-
- .xtime : AT(VLOAD(.xtime)) { *(.xtime) }
- xtime = VVIRT(.xtime);
-
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
.jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
jiffies = VVIRT(.jiffies);
- .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) }
- .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) }
- .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) }
+ .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
+ { *(.vsyscall_1) }
+ .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
+ { *(.vsyscall_2) }
+ .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
+ { *(.vsyscall_3) }
. = VSYSCALL_VIRT_ADDR + 4096;
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 2433d6fc68b1..180ff919eaf9 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -26,6 +26,7 @@
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
+#include <linux/clocksource.h>
#include <linux/getcpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
@@ -34,6 +35,7 @@
#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/page.h>
+#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
@@ -44,56 +46,41 @@
#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
#define __syscall_clobber "r11","rcx","memory"
-int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
-seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
+struct vsyscall_gtod_data_t {
+ seqlock_t lock;
+ int sysctl_enabled;
+ struct timeval wall_time_tv;
+ struct timezone sys_tz;
+ cycle_t offset_base;
+ struct clocksource clock;
+};
int __vgetcpu_mode __section_vgetcpu_mode;
-#include <asm/unistd.h>
-
-static __always_inline void timeval_normalize(struct timeval * tv)
+struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data =
{
- time_t __sec;
-
- __sec = tv->tv_usec / 1000000;
- if (__sec) {
- tv->tv_usec %= 1000000;
- tv->tv_sec += __sec;
- }
-}
+ .lock = SEQLOCK_UNLOCKED,
+ .sysctl_enabled = 1,
+};
-static __always_inline void do_vgettimeofday(struct timeval * tv)
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
{
- long sequence, t;
- unsigned long sec, usec;
-
- do {
- sequence = read_seqbegin(&__xtime_lock);
-
- sec = __xtime.tv_sec;
- usec = __xtime.tv_nsec / 1000;
-
- if (__vxtime.mode != VXTIME_HPET) {
- t = get_cycles_sync();
- if (t < __vxtime.last_tsc)
- t = __vxtime.last_tsc;
- usec += ((t - __vxtime.last_tsc) *
- __vxtime.tsc_quot) >> 32;
- /* See comment in x86_64 do_gettimeofday. */
- } else {
- usec += ((readl((void __iomem *)
- fix_to_virt(VSYSCALL_HPET) + 0xf0) -
- __vxtime.last) * __vxtime.quot) >> 32;
- }
- } while (read_seqretry(&__xtime_lock, sequence));
-
- tv->tv_sec = sec + usec / 1000000;
- tv->tv_usec = usec % 1000000;
+ unsigned long flags;
+
+ write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+ /* copy vsyscall data */
+ vsyscall_gtod_data.clock = *clock;
+ vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec;
+ vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000;
+ vsyscall_gtod_data.sys_tz = sys_tz;
+ write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
-/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
+/* RED-PEN may want to readd seq locking, but then the variable should be
+ * write-once.
+ */
static __always_inline void do_get_tz(struct timezone * tz)
{
- *tz = __sys_tz;
+ *tz = __vsyscall_gtod_data.sys_tz;
}
static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
@@ -101,7 +88,8 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
int ret;
asm volatile("vsysc2: syscall"
: "=a" (ret)
- : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
+ : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
+ : __syscall_clobber );
return ret;
}
@@ -114,10 +102,44 @@ static __always_inline long time_syscall(long *t)
return secs;
}
+static __always_inline void do_vgettimeofday(struct timeval * tv)
+{
+ cycle_t now, base, mask, cycle_delta;
+ unsigned long seq, mult, shift, nsec_delta;
+ cycle_t (*vread)(void);
+ do {
+ seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+
+ vread = __vsyscall_gtod_data.clock.vread;
+ if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
+ gettimeofday(tv,0);
+ return;
+ }
+ now = vread();
+ base = __vsyscall_gtod_data.clock.cycle_last;
+ mask = __vsyscall_gtod_data.clock.mask;
+ mult = __vsyscall_gtod_data.clock.mult;
+ shift = __vsyscall_gtod_data.clock.shift;
+
+ *tv = __vsyscall_gtod_data.wall_time_tv;
+
+ } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+
+ /* calculate interval: */
+ cycle_delta = (now - base) & mask;
+ /* convert to nsecs: */
+ nsec_delta = (cycle_delta * mult) >> shift;
+
+ /* convert to usecs and add to timespec: */
+ tv->tv_usec += nsec_delta / NSEC_PER_USEC;
+ while (tv->tv_usec > USEC_PER_SEC) {
+ tv->tv_sec += 1;
+ tv->tv_usec -= USEC_PER_SEC;
+ }
+}
+
int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
{
- if (!__sysctl_vsyscall)
- return gettimeofday(tv,tz);
if (tv)
do_vgettimeofday(tv);
if (tz)
@@ -129,11 +151,11 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
* unlikely */
time_t __vsyscall(1) vtime(time_t *t)
{
- if (!__sysctl_vsyscall)
+ if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
return time_syscall(t);
else if (t)
- *t = __xtime.tv_sec;
- return __xtime.tv_sec;
+ *t = __vsyscall_gtod_data.wall_time_tv.tv_sec;
+ return __vsyscall_gtod_data.wall_time_tv.tv_sec;
}
/* Fast way to get current CPU and node.
@@ -210,7 +232,7 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
ret = -ENOMEM;
goto out;
}
- if (!sysctl_vsyscall) {
+ if (!vsyscall_gtod_data.sysctl_enabled) {
writew(SYSCALL, map1);
writew(SYSCALL, map2);
} else {
@@ -232,16 +254,17 @@ static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
static ctl_table kernel_table2[] = {
{ .ctl_name = 99, .procname = "vsyscall64",
- .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
+ .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
+ .mode = 0644,
.strategy = vsyscall_sysctl_nostrat,
.proc_handler = vsyscall_sysctl_change },
- { 0, }
+ {}
};
static ctl_table kernel_root_table2[] = {
{ .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
.child = kernel_table2 },
- { 0 },
+ {}
};
#endif
@@ -301,7 +324,7 @@ static int __init vsyscall_init(void)
BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
map_vsyscall();
#ifdef CONFIG_SYSCTL
- register_sysctl_table(kernel_root_table2, 0);
+ register_sysctl_table(kernel_root_table2);
#endif
on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
hotcpu_notifier(cpu_vsyscall_notifier, 0);
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 6d77e4797a47..0dffae69f4ad 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4);
EXPORT_SYMBOL(__put_user_8);
EXPORT_SYMBOL(copy_user_generic);
+EXPORT_SYMBOL(__copy_user_nocache);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(__copy_from_user_inatomic);
@@ -34,8 +35,8 @@ EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
#ifdef CONFIG_SMP
-extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
-extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
+extern void __write_lock_failed(rwlock_t *rw);
+extern void __read_lock_failed(rwlock_t *rw);
EXPORT_SYMBOL(__write_lock_failed);
EXPORT_SYMBOL(__read_lock_failed);
#endif
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
index b78d4170fce2..8d5f835af481 100644
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o
lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
usercopy.o getuser.o putuser.o \
thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
diff --git a/arch/x86_64/lib/copy_user_nocache.S b/arch/x86_64/lib/copy_user_nocache.S
new file mode 100644
index 000000000000..4620efb12f13
--- /dev/null
+++ b/arch/x86_64/lib/copy_user_nocache.S
@@ -0,0 +1,217 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination/source out of cache for more performance.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ * rcx zero flag when 1 zero on exception
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(__copy_user_nocache)
+ CFI_STARTPROC
+ pushq %rbx
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rbx, 0
+ pushq %rcx /* save zero flag */
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rcx, 0
+
+ xorl %eax,%eax /* zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+ /* check for bad alignment of destination */
+ movl %edi,%ecx
+ andl $7,%ecx
+ jnz .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+ movq %rdx,%rcx
+
+ movl $64,%ebx
+ shrq $6,%rdx
+ decq %rdx
+ js .Lhandle_tail
+
+ .p2align 4
+.Lloop:
+.Ls1: movq (%rsi),%r11
+.Ls2: movq 1*8(%rsi),%r8
+.Ls3: movq 2*8(%rsi),%r9
+.Ls4: movq 3*8(%rsi),%r10
+.Ld1: movnti %r11,(%rdi)
+.Ld2: movnti %r8,1*8(%rdi)
+.Ld3: movnti %r9,2*8(%rdi)
+.Ld4: movnti %r10,3*8(%rdi)
+
+.Ls5: movq 4*8(%rsi),%r11
+.Ls6: movq 5*8(%rsi),%r8
+.Ls7: movq 6*8(%rsi),%r9
+.Ls8: movq 7*8(%rsi),%r10
+.Ld5: movnti %r11,4*8(%rdi)
+.Ld6: movnti %r8,5*8(%rdi)
+.Ld7: movnti %r9,6*8(%rdi)
+.Ld8: movnti %r10,7*8(%rdi)
+
+ dec %rdx
+
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+
+ jns .Lloop
+
+ .p2align 4
+.Lhandle_tail:
+ movl %ecx,%edx
+ andl $63,%ecx
+ shrl $3,%ecx
+ jz .Lhandle_7
+ movl $8,%ebx
+ .p2align 4
+.Lloop_8:
+.Ls9: movq (%rsi),%r8
+.Ld9: movnti %r8,(%rdi)
+ decl %ecx
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %edx,%ecx
+ andl $7,%ecx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+.Ls10: movb (%rsi),%bl
+.Ld10: movb %bl,(%rdi)
+ incq %rdi
+ incq %rsi
+ decl %ecx
+ jnz .Lloop_1
+
+ CFI_REMEMBER_STATE
+.Lende:
+ popq %rcx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE %rcx
+ popq %rbx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rbx
+ ret
+ CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+ /* align destination */
+ .p2align 4
+.Lbad_alignment:
+ movl $8,%r9d
+ subl %ecx,%r9d
+ movl %r9d,%ecx
+ cmpq %r9,%rdx
+ jz .Lhandle_7
+ js .Lhandle_7
+.Lalign_1:
+.Ls11: movb (%rsi),%bl
+.Ld11: movb %bl,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .Lalign_1
+ subq %r9,%rdx
+ jmp .Lafter_bad_alignment
+#endif
+
+ /* table sorted by exception address */
+ .section __ex_table,"a"
+ .align 8
+ .quad .Ls1,.Ls1e
+ .quad .Ls2,.Ls2e
+ .quad .Ls3,.Ls3e
+ .quad .Ls4,.Ls4e
+ .quad .Ld1,.Ls1e
+ .quad .Ld2,.Ls2e
+ .quad .Ld3,.Ls3e
+ .quad .Ld4,.Ls4e
+ .quad .Ls5,.Ls5e
+ .quad .Ls6,.Ls6e
+ .quad .Ls7,.Ls7e
+ .quad .Ls8,.Ls8e
+ .quad .Ld5,.Ls5e
+ .quad .Ld6,.Ls6e
+ .quad .Ld7,.Ls7e
+ .quad .Ld8,.Ls8e
+ .quad .Ls9,.Le_quad
+ .quad .Ld9,.Le_quad
+ .quad .Ls10,.Le_byte
+ .quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+ .quad .Ls11,.Lzero_rest
+ .quad .Ld11,.Lzero_rest
+#endif
+ .quad .Le5,.Le_zero
+ .previous
+
+ /* compute 64-offset for main loop. 8 bytes accuracy with error on the
+ pessimistic side. this is gross. it would be better to fix the
+ interface. */
+ /* eax: zero, ebx: 64 */
+.Ls1e: addl $8,%eax
+.Ls2e: addl $8,%eax
+.Ls3e: addl $8,%eax
+.Ls4e: addl $8,%eax
+.Ls5e: addl $8,%eax
+.Ls6e: addl $8,%eax
+.Ls7e: addl $8,%eax
+.Ls8e: addl $8,%eax
+ addq %rbx,%rdi /* +64 */
+ subq %rax,%rdi /* correct destination with computed offset */
+
+ shlq $6,%rdx /* loop counter * 64 (stride length) */
+ addq %rax,%rdx /* add offset to loopcnt */
+ andl $63,%ecx /* remaining bytes */
+ addq %rcx,%rdx /* add them */
+ jmp .Lzero_rest
+
+ /* exception on quad word loop in tail handling */
+ /* ecx: loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+ shll $3,%ecx
+ andl $7,%edx
+ addl %ecx,%edx
+ /* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+ cmpl $0,(%rsp) /* zero flag set? */
+ jz .Le_zero
+ movq %rdx,%rcx
+.Le_byte:
+ xorl %eax,%eax
+.Le5: rep
+ stosb
+ /* when there is another exception while zeroing the rest just return */
+.Le_zero:
+ movq %rdx,%rax
+ jmp .Lende
+ CFI_ENDPROC
+ENDPROC(__copy_user_nocache)
+
+
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 49e8cf2e06f8..6ada7231f3ab 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -56,17 +56,17 @@ int unregister_page_fault_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
-static inline int notify_page_fault(enum die_val val, const char *str,
- struct pt_regs *regs, long err, int trap, int sig)
+static inline int notify_page_fault(struct pt_regs *regs, long err)
{
struct die_args args = {
.regs = regs,
- .str = str,
+ .str = "page fault",
.err = err,
- .trapnr = trap,
- .signr = sig
+ .trapnr = 14,
+ .signr = SIGSEGV
};
- return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
+ return atomic_notifier_call_chain(&notify_page_fault_chain,
+ DIE_PAGE_FAULT, &args);
}
/* Sometimes the CPU reports invalid exceptions on prefetch.
@@ -355,8 +355,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
if (vmalloc_fault(address) >= 0)
return;
}
- if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
- SIGSEGV) == NOTIFY_STOP)
+ if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
@@ -365,8 +364,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
goto bad_area_nosemaphore;
}
- if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
- SIGSEGV) == NOTIFY_STOP)
+ if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
return;
if (likely(regs->eflags & X86_EFLAGS_IF))
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 2968b90ef8ad..ec31534eb104 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -711,20 +711,30 @@ int kern_addr_valid(unsigned long addr)
extern int exception_trace, page_fault_trace;
static ctl_table debug_table2[] = {
- { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
- proc_dointvec },
- { 0, }
+ {
+ .ctl_name = 99,
+ .procname = "exception-trace",
+ .data = &exception_trace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {}
};
static ctl_table debug_root_table2[] = {
- { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
- .child = debug_table2 },
- { 0 },
+ {
+ .ctl_name = CTL_DEBUG,
+ .procname = "debug",
+ .mode = 0555,
+ .child = debug_table2
+ },
+ {}
};
static __init int x8664_sysctl_init(void)
{
- register_sysctl_table(debug_root_table2, 1);
+ register_sysctl_table(debug_root_table2);
return 0;
}
__initcall(x8664_sysctl_init);
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 2ee2e003606c..41b8fb069924 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
int numa_off __initdata;
+unsigned long __initdata nodemap_addr;
+unsigned long __initdata nodemap_size;
/*
@@ -52,34 +54,88 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
int res = -1;
unsigned long addr, end;
- if (shift >= 64)
- return -1;
- memset(memnodemap, 0xff, sizeof(memnodemap));
+ memset(memnodemap, 0xff, memnodemapsize);
for (i = 0; i < numnodes; i++) {
addr = nodes[i].start;
end = nodes[i].end;
if (addr >= end)
continue;
- if ((end >> shift) >= NODEMAPSIZE)
+ if ((end >> shift) >= memnodemapsize)
return 0;
do {
if (memnodemap[addr >> shift] != 0xff)
return -1;
memnodemap[addr >> shift] = i;
- addr += (1UL << shift);
+ addr += (1UL << shift);
} while (addr < end);
res = 1;
}
return res;
}
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+static int __init allocate_cachealigned_memnodemap(void)
{
- int shift = 20;
+ unsigned long pad, pad_addr;
+
+ memnodemap = memnode.embedded_map;
+ if (memnodemapsize <= 48)
+ return 0;
+
+ pad = L1_CACHE_BYTES - 1;
+ pad_addr = 0x8000;
+ nodemap_size = pad + memnodemapsize;
+ nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
+ nodemap_size);
+ if (nodemap_addr == -1UL) {
+ printk(KERN_ERR
+ "NUMA: Unable to allocate Memory to Node hash map\n");
+ nodemap_addr = nodemap_size = 0;
+ return -1;
+ }
+ pad_addr = (nodemap_addr + pad) & ~pad;
+ memnodemap = phys_to_virt(pad_addr);
+
+ printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+ nodemap_addr, nodemap_addr + nodemap_size);
+ return 0;
+}
- while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
- shift++;
+/*
+ * The LSB of all start and end addresses in the node map is the value of the
+ * maximum possible shift.
+ */
+static int __init
+extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
+{
+ int i, nodes_used = 0;
+ unsigned long start, end;
+ unsigned long bitfield = 0, memtop = 0;
+
+ for (i = 0; i < numnodes; i++) {
+ start = nodes[i].start;
+ end = nodes[i].end;
+ if (start >= end)
+ continue;
+ bitfield |= start;
+ nodes_used++;
+ if (end > memtop)
+ memtop = end;
+ }
+ if (nodes_used <= 1)
+ i = 63;
+ else
+ i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+ memnodemapsize = (memtop >> i)+1;
+ return i;
+}
+
+int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+{
+ int shift;
+ shift = extract_lsb_from_nodes(nodes, numnodes);
+ if (allocate_cachealigned_memnodemap())
+ return -1;
printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
shift);
@@ -216,31 +272,113 @@ void __init numa_init_array(void)
}
#ifdef CONFIG_NUMA_EMU
+/* Numa emulation */
int numa_fake __initdata = 0;
-/* Numa emulation */
+/*
+ * This function is used to find out if the start and end correspond to
+ * different zones.
+ */
+int zone_cross_over(unsigned long start, unsigned long end)
+{
+ if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
+ (end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
+ return 1;
+ return 0;
+}
+
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
- int i;
+ int i, big;
struct bootnode nodes[MAX_NUMNODES];
- unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
+ unsigned long sz, old_sz;
+ unsigned long hole_size;
+ unsigned long start, end;
+ unsigned long max_addr = (end_pfn << PAGE_SHIFT);
+
+ start = (start_pfn << PAGE_SHIFT);
+ hole_size = e820_hole_size(start, max_addr);
+ sz = (max_addr - start - hole_size) / numa_fake;
/* Kludge needed for the hash function */
- if (hweight64(sz) > 1) {
- unsigned long x = 1;
- while ((x << 1) < sz)
- x <<= 1;
- if (x < sz/2)
- printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
- sz = x;
- }
+ old_sz = sz;
+ /*
+ * Round down to the nearest FAKE_NODE_MIN_SIZE.
+ */
+ sz &= FAKE_NODE_MIN_HASH_MASK;
+
+ /*
+ * We ensure that each node is at least 64MB big. Smaller than this
+ * size can cause VM hiccups.
+ */
+ if (sz == 0) {
+ printk(KERN_INFO "Not enough memory for %d nodes. Reducing "
+ "the number of nodes\n", numa_fake);
+ numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
+ printk(KERN_INFO "Number of fake nodes will be = %d\n",
+ numa_fake);
+ sz = FAKE_NODE_MIN_SIZE;
+ }
+ /*
+ * Find out how many nodes can get an extra NODE_MIN_SIZE granule.
+ * This logic ensures the extra memory gets distributed among as many
+ * nodes as possible (as compared to one single node getting all that
+ * extra memory.
+ */
+ big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
+ printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
+ "%d\n",
+ (sz >> 20), (hole_size >> 20), big);
memset(&nodes,0,sizeof(nodes));
+ end = start;
for (i = 0; i < numa_fake; i++) {
- nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
+ /*
+ * In case we are not able to allocate enough memory for all
+ * the nodes, we reduce the number of fake nodes.
+ */
+ if (end >= max_addr) {
+ numa_fake = i - 1;
+ break;
+ }
+ start = nodes[i].start = end;
+ /*
+ * Final node can have all the remaining memory.
+ */
if (i == numa_fake-1)
- sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
- nodes[i].end = nodes[i].start + sz;
+ sz = max_addr - start;
+ end = nodes[i].start + sz;
+ /*
+ * Fir "big" number of nodes get extra granule.
+ */
+ if (i < big)
+ end += FAKE_NODE_MIN_SIZE;
+ /*
+ * Iterate over the range to ensure that this node gets at
+ * least sz amount of RAM (excluding holes)
+ */
+ while ((end - start - e820_hole_size(start, end)) < sz) {
+ end += FAKE_NODE_MIN_SIZE;
+ if (end >= max_addr)
+ break;
+ }
+ /*
+ * Look at the next node to make sure there is some real memory
+ * to map. Bad things happen when the only memory present
+ * in a zone on a fake node is IO hole.
+ */
+ while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) {
+ if (zone_cross_over(start, end + sz)) {
+ end = (MAX_DMA32_PFN << PAGE_SHIFT);
+ break;
+ }
+ if (end >= max_addr)
+ break;
+ end += FAKE_NODE_MIN_SIZE;
+ }
+ if (end > max_addr)
+ end = max_addr;
+ nodes[i].end = end;
printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
i,
nodes[i].start, nodes[i].end,
@@ -290,6 +428,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
end_pfn << PAGE_SHIFT);
/* setup dummy node covering all memory */
memnode_shift = 63;
+ memnodemap = memnode.embedded_map;
memnodemap[0] = 0;
nodes_clear(node_online_map);
node_set_online(0);
@@ -321,20 +460,6 @@ unsigned long __init numa_free_all_bootmem(void)
return pages;
}
-#ifdef CONFIG_SPARSEMEM
-static void __init arch_sparse_init(void)
-{
- int i;
-
- for_each_online_node(i)
- memory_present(i, node_start_pfn(i), node_end_pfn(i));
-
- sparse_init();
-}
-#else
-#define arch_sparse_init() do {} while (0)
-#endif
-
void __init paging_init(void)
{
int i;
@@ -344,7 +469,8 @@ void __init paging_init(void)
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
max_zone_pfns[ZONE_NORMAL] = end_pfn;
- arch_sparse_init();
+ sparse_memory_present_with_active_regions(MAX_NUMNODES);
+ sparse_init();
for_each_online_node(i) {
setup_node_zones(i);
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index ccb91dd996a9..65c5eaa59905 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -107,6 +107,7 @@ static void revert_page(unsigned long address, pgprot_t ref_prot)
pud_t *pud;
pmd_t *pmd;
pte_t large_pte;
+ unsigned long pfn;
pgd = pgd_offset_k(address);
BUG_ON(pgd_none(*pgd));
@@ -114,7 +115,8 @@ static void revert_page(unsigned long address, pgprot_t ref_prot)
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, address);
BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
- large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
+ pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
+ large_pte = pfn_pte(pfn, ref_prot);
large_pte = pte_mkhuge(large_pte);
set_pte((pte_t *)pmd, large_pte);
}
diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile
index 149aba05a5b8..c9eddc8859c0 100644
--- a/arch/x86_64/pci/Makefile
+++ b/arch/x86_64/pci/Makefile
@@ -11,7 +11,7 @@ obj-y += fixup.o init.o
obj-$(CONFIG_ACPI) += acpi.o
obj-y += legacy.o irq.o common.o early.o
# mmconfig has a 64bit special
-obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o
+obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o
obj-$(CONFIG_NUMA) += k8-bus.o
@@ -24,3 +24,4 @@ fixup-y += ../../i386/pci/fixup.o
i386-y += ../../i386/pci/i386.o
init-y += ../../i386/pci/init.o
early-y += ../../i386/pci/early.o
+mmconfig-shared-y += ../../i386/pci/mmconfig-shared.o
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index 98202cb50d8a..65d82736987e 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -13,16 +13,6 @@
#include "pci.h"
-/* aperture is up to 256MB but BIOS may reserve less */
-#define MMCONFIG_APER_MIN (2 * 1024*1024)
-#define MMCONFIG_APER_MAX (256 * 1024*1024)
-
-/* Verify the first 16 busses. We assume that systems with more busses
- get MCFG right. */
-#define MAX_CHECK_BUS 16
-
-static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS);
-
/* Static virtual mapping of the MMCONFIG aperture */
struct mmcfg_virt {
struct acpi_mcfg_allocation *cfg;
@@ -32,30 +22,17 @@ static struct mmcfg_virt *pci_mmcfg_virt;
static char __iomem *get_virt(unsigned int seg, unsigned bus)
{
- int cfg_num = -1;
struct acpi_mcfg_allocation *cfg;
+ int cfg_num;
- while (1) {
- ++cfg_num;
- if (cfg_num >= pci_mmcfg_config_num)
- break;
+ for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
cfg = pci_mmcfg_virt[cfg_num].cfg;
- if (cfg->pci_segment != seg)
- continue;
- if ((cfg->start_bus_number <= bus) &&
+ if (cfg->pci_segment == seg &&
+ (cfg->start_bus_number <= bus) &&
(cfg->end_bus_number >= bus))
return pci_mmcfg_virt[cfg_num].virt;
}
- /* Handle more broken MCFG tables on Asus etc.
- They only contain a single entry for bus 0-0. Assume
- this applies to all busses. */
- cfg = &pci_mmcfg_config[0];
- if (pci_mmcfg_config_num == 1 &&
- cfg->pci_segment == 0 &&
- (cfg->start_bus_number | cfg->end_bus_number) == 0)
- return pci_mmcfg_virt[0].virt;
-
/* Fall back to type 0 */
return NULL;
}
@@ -63,8 +40,8 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus)
static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
{
char __iomem *addr;
- if (seg == 0 && bus < MAX_CHECK_BUS &&
- test_bit(32*bus + PCI_SLOT(devfn), fallback_slots))
+ if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS &&
+ test_bit(32*bus + PCI_SLOT(devfn), pci_mmcfg_fallback_slots))
return NULL;
addr = get_virt(seg, bus);
if (!addr)
@@ -135,81 +112,46 @@ static struct pci_raw_ops pci_mmcfg = {
.write = pci_mmcfg_write,
};
-/* K8 systems have some devices (typically in the builtin northbridge)
- that are only accessible using type1
- Normally this can be expressed in the MCFG by not listing them
- and assigning suitable _SEGs, but this isn't implemented in some BIOS.
- Instead try to discover all devices on bus 0 that are unreachable using MM
- and fallback for them. */
-static __init void unreachable_devices(void)
+static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
{
- int i, k;
- /* Use the max bus number from ACPI here? */
- for (k = 0; k < MAX_CHECK_BUS; k++) {
- for (i = 0; i < 32; i++) {
- u32 val1;
- char __iomem *addr;
-
- pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1);
- if (val1 == 0xffffffff)
- continue;
- addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
- if (addr == NULL|| readl(addr) != val1) {
- set_bit(i + 32*k, fallback_slots);
- printk(KERN_NOTICE "PCI: No mmconfig possible"
- " on device %02x:%02x\n", k, i);
- }
- }
+ void __iomem *addr;
+ u32 size;
+
+ size = (cfg->end_bus_number + 1) << 20;
+ addr = ioremap_nocache(cfg->address, size);
+ if (addr) {
+ printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n",
+ cfg->address, cfg->address + size - 1);
}
+ return addr;
}
-void __init pci_mmcfg_init(int type)
+int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus,
+ unsigned int devfn)
{
- int i;
-
- if ((pci_probe & PCI_PROBE_MMCONF) == 0)
- return;
-
- if (acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg))
- return;
-
- if ((pci_mmcfg_config_num == 0) ||
- (pci_mmcfg_config == NULL) ||
- (pci_mmcfg_config[0].address == 0))
- return;
-
- /* Only do this check when type 1 works. If it doesn't work
- assume we run on a Mac and always use MCFG */
- if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].address,
- pci_mmcfg_config[0].address + MMCONFIG_APER_MIN,
- E820_RESERVED)) {
- printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %lx is not E820-reserved\n",
- (unsigned long)pci_mmcfg_config[0].address);
- printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
- return;
- }
+ return pci_dev_base(seg, bus, devfn) != NULL;
+}
- pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
+int __init pci_mmcfg_arch_init(void)
+{
+ int i;
+ pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) *
+ pci_mmcfg_config_num, GFP_KERNEL);
if (pci_mmcfg_virt == NULL) {
printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n");
- return;
+ return 0;
}
+
for (i = 0; i < pci_mmcfg_config_num; ++i) {
pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i];
- pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].address,
- MMCONFIG_APER_MAX);
+ pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]);
if (!pci_mmcfg_virt[i].virt) {
printk(KERN_ERR "PCI: Cannot map mmconfig aperture for "
"segment %d\n",
pci_mmcfg_config[i].pci_segment);
- return;
+ return 0;
}
- printk(KERN_INFO "PCI: Using MMCONFIG at %lx\n",
- (unsigned long)pci_mmcfg_config[i].address);
}
-
- unreachable_devices();
-
raw_pci_ops = &pci_mmcfg;
- pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
+ return 1;
}