aboutsummaryrefslogtreecommitdiffstats
path: root/arch/i386
diff options
context:
space:
mode:
Diffstat (limited to 'arch/i386')
-rw-r--r--arch/i386/Kconfig36
-rw-r--r--arch/i386/Kconfig.cpu11
-rw-r--r--arch/i386/Makefile4
-rw-r--r--arch/i386/boot/.gitignore2
-rw-r--r--arch/i386/boot/Makefile48
-rw-r--r--arch/i386/boot/a20.c161
-rw-r--r--arch/i386/boot/apm.c97
-rw-r--r--arch/i386/boot/bitops.h45
-rw-r--r--arch/i386/boot/boot.h296
-rw-r--r--arch/i386/boot/bootsect.S98
-rw-r--r--arch/i386/boot/cmdline.c97
-rw-r--r--arch/i386/boot/code16gcc.h15
-rw-r--r--arch/i386/boot/compressed/Makefile7
-rw-r--r--arch/i386/boot/compressed/head.S6
-rw-r--r--arch/i386/boot/compressed/relocs.c3
-rw-r--r--arch/i386/boot/copy.S101
-rw-r--r--arch/i386/boot/cpu.c69
-rw-r--r--arch/i386/boot/cpucheck.c267
-rw-r--r--arch/i386/boot/edd.S231
-rw-r--r--arch/i386/boot/edd.c196
-rw-r--r--arch/i386/boot/header.S283
-rw-r--r--arch/i386/boot/main.c161
-rw-r--r--arch/i386/boot/mca.c43
-rw-r--r--arch/i386/boot/memory.c99
-rw-r--r--arch/i386/boot/pm.c170
-rw-r--r--arch/i386/boot/pmjump.S54
-rw-r--r--arch/i386/boot/printf.c307
-rw-r--r--arch/i386/boot/setup.S1075
-rw-r--r--arch/i386/boot/setup.ld54
-rw-r--r--arch/i386/boot/string.c52
-rw-r--r--arch/i386/boot/tools/build.c162
-rw-r--r--arch/i386/boot/tty.c112
-rw-r--r--arch/i386/boot/version.c23
-rw-r--r--arch/i386/boot/vesa.h79
-rw-r--r--arch/i386/boot/video-bios.c125
-rw-r--r--arch/i386/boot/video-vesa.c284
-rw-r--r--arch/i386/boot/video-vga.c260
-rw-r--r--arch/i386/boot/video.S2043
-rw-r--r--arch/i386/boot/video.c461
-rw-r--r--arch/i386/boot/video.h152
-rw-r--r--arch/i386/boot/voyager.c46
-rw-r--r--arch/i386/defconfig265
-rw-r--r--arch/i386/kernel/Makefile1
-rw-r--r--arch/i386/kernel/acpi/boot.c44
-rw-r--r--arch/i386/kernel/acpi/sleep.c12
-rw-r--r--arch/i386/kernel/acpi/wakeup.S37
-rw-r--r--arch/i386/kernel/alternative.c69
-rw-r--r--arch/i386/kernel/apic.c10
-rw-r--r--arch/i386/kernel/apm.c2
-rw-r--r--arch/i386/kernel/asm-offsets.c29
-rw-r--r--arch/i386/kernel/cpu/Makefile3
-rw-r--r--arch/i386/kernel/cpu/addon_cpuid_features.c50
-rw-r--r--arch/i386/kernel/cpu/amd.c11
-rw-r--r--arch/i386/kernel/cpu/common.c4
-rw-r--r--arch/i386/kernel/cpu/cpufreq/Kconfig31
-rw-r--r--arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c17
-rw-r--r--arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c6
-rw-r--r--arch/i386/kernel/cpu/cpufreq/gx-suspmod.c6
-rw-r--r--arch/i386/kernel/cpu/cpufreq/longhaul.c216
-rw-r--r--arch/i386/kernel/cpu/cpufreq/longhaul.h12
-rw-r--r--arch/i386/kernel/cpu/cpufreq/powernow-k8.c29
-rw-r--r--arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c276
-rw-r--r--arch/i386/kernel/cpu/cpufreq/speedstep-ich.c4
-rw-r--r--arch/i386/kernel/cpu/cyrix.c2
-rw-r--r--arch/i386/kernel/cpu/intel_cacheinfo.c79
-rw-r--r--arch/i386/kernel/cpu/mcheck/mce.c14
-rw-r--r--arch/i386/kernel/cpu/mcheck/non-fatal.c4
-rw-r--r--arch/i386/kernel/cpu/mcheck/therm_throt.c6
-rw-r--r--arch/i386/kernel/cpu/mtrr/cyrix.c1
-rw-r--r--arch/i386/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/i386/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/i386/kernel/cpu/mtrr/state.c1
-rw-r--r--arch/i386/kernel/cpu/perfctr-watchdog.c10
-rw-r--r--arch/i386/kernel/cpu/proc.c21
-rw-r--r--arch/i386/kernel/cpu/rise.c52
-rw-r--r--arch/i386/kernel/e820.c34
-rw-r--r--arch/i386/kernel/efi.c2
-rw-r--r--arch/i386/kernel/entry.S87
-rw-r--r--arch/i386/kernel/geode.c155
-rw-r--r--arch/i386/kernel/head.S13
-rw-r--r--arch/i386/kernel/hpet.c98
-rw-r--r--arch/i386/kernel/i8253.c32
-rw-r--r--arch/i386/kernel/init_task.c2
-rw-r--r--arch/i386/kernel/io_apic.c27
-rw-r--r--arch/i386/kernel/irq.c10
-rw-r--r--arch/i386/kernel/kprobes.c9
-rw-r--r--arch/i386/kernel/nmi.c10
-rw-r--r--arch/i386/kernel/paravirt.c55
-rw-r--r--arch/i386/kernel/process.c85
-rw-r--r--arch/i386/kernel/ptrace.c39
-rw-r--r--arch/i386/kernel/reboot.c9
-rw-r--r--arch/i386/kernel/setup.c25
-rw-r--r--arch/i386/kernel/signal.c7
-rw-r--r--arch/i386/kernel/smp.c5
-rw-r--r--arch/i386/kernel/smpboot.c8
-rw-r--r--arch/i386/kernel/smpcommon.c8
-rw-r--r--arch/i386/kernel/syscall_table.S1
-rw-r--r--arch/i386/kernel/sysenter.c4
-rw-r--r--arch/i386/kernel/time.c50
-rw-r--r--arch/i386/kernel/traps.c53
-rw-r--r--arch/i386/kernel/tsc.c27
-rw-r--r--arch/i386/kernel/verify_cpu.S94
-rw-r--r--arch/i386/kernel/vmi.c4
-rw-r--r--arch/i386/kernel/vmiclock.c8
-rw-r--r--arch/i386/kernel/vmlinux.lds.S8
-rw-r--r--arch/i386/kernel/vsyscall-note.S52
-rw-r--r--arch/i386/lib/Makefile2
-rw-r--r--arch/i386/lib/string.c257
-rw-r--r--arch/i386/mach-generic/es7000.c2
-rw-r--r--arch/i386/mach-visws/traps.c4
-rw-r--r--arch/i386/mach-voyager/voyager_thread.c2
-rw-r--r--arch/i386/mm/fault.c33
-rw-r--r--arch/i386/mm/init.c27
-rw-r--r--arch/i386/mm/ioremap.c2
-rw-r--r--arch/i386/mm/pageattr.c22
-rw-r--r--arch/i386/mm/pgtable.c6
-rw-r--r--arch/i386/pci/acpi.c32
-rw-r--r--arch/i386/pci/common.c13
-rw-r--r--arch/i386/pci/fixup.c9
-rw-r--r--arch/i386/pci/mmconfig-shared.c48
-rw-r--r--arch/i386/video/Makefile1
-rw-r--r--arch/i386/video/fbdev.c32
-rw-r--r--arch/i386/xen/Kconfig11
-rw-r--r--arch/i386/xen/Makefile4
-rw-r--r--arch/i386/xen/enlighten.c1144
-rw-r--r--arch/i386/xen/events.c591
-rw-r--r--arch/i386/xen/features.c29
-rw-r--r--arch/i386/xen/manage.c143
-rw-r--r--arch/i386/xen/mmu.c564
-rw-r--r--arch/i386/xen/mmu.h60
-rw-r--r--arch/i386/xen/multicalls.c90
-rw-r--r--arch/i386/xen/multicalls.h45
-rw-r--r--arch/i386/xen/setup.c111
-rw-r--r--arch/i386/xen/smp.c404
-rw-r--r--arch/i386/xen/time.c593
-rw-r--r--arch/i386/xen/vdso.h4
-rw-r--r--arch/i386/xen/xen-asm.S291
-rw-r--r--arch/i386/xen/xen-head.S38
-rw-r--r--arch/i386/xen/xen-ops.h71
139 files changed, 10184 insertions, 4685 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index b1b2b30b1b8e..abb582bc218f 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -18,6 +18,10 @@ config GENERIC_TIME
bool
default y
+config GENERIC_CMOS_UPDATE
+ bool
+ default y
+
config CLOCKSOURCE_WATCHDOG
bool
default y
@@ -222,6 +226,8 @@ config PARAVIRT
However, when run without a hypervisor the kernel is
theoretically slower. If in doubt, say N.
+source "arch/i386/xen/Kconfig"
+
config VMI
bool "VMI Paravirt-ops support"
depends on PARAVIRT
@@ -542,6 +548,7 @@ config HIGHMEM4G
config HIGHMEM64G
bool "64GB"
depends on !M386 && !M486
+ select X86_PAE
help
Select this if you have a 32-bit processor and more than 4
gigabytes of physical RAM.
@@ -571,12 +578,12 @@ choice
config VMSPLIT_3G
bool "3G/1G user/kernel split"
config VMSPLIT_3G_OPT
- depends on !HIGHMEM
+ depends on !X86_PAE
bool "3G/1G user/kernel split (for full 1G low memory)"
config VMSPLIT_2G
bool "2G/2G user/kernel split"
config VMSPLIT_2G_OPT
- depends on !HIGHMEM
+ depends on !X86_PAE
bool "2G/2G user/kernel split (for full 2G low memory)"
config VMSPLIT_1G
bool "1G/3G user/kernel split"
@@ -596,10 +603,15 @@ config HIGHMEM
default y
config X86_PAE
- bool
- depends on HIGHMEM64G
- default y
+ bool "PAE (Physical Address Extension) Support"
+ default n
+ depends on !HIGHMEM4G
select RESOURCES_64BIT
+ help
+ PAE is required for NX support, and furthermore enables
+ larger swapspace support for non-overcommit purposes. It
+ has the cost of more pagetable lookup overhead, and also
+ consumes more pagetable space per process.
# Common NUMA Features
config NUMA
@@ -815,6 +827,7 @@ config CRASH_DUMP
config PHYSICAL_START
hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
+ default "0x1000000" if X86_NUMAQ
default "0x100000"
help
This gives the physical address where the kernel is loaded.
@@ -1212,21 +1225,26 @@ source "drivers/Kconfig"
source "fs/Kconfig"
-menu "Instrumentation Support"
+menuconfig INSTRUMENTATION
+ bool "Instrumentation Support"
depends on EXPERIMENTAL
+ default y
+
+if INSTRUMENTATION
source "arch/i386/oprofile/Kconfig"
config KPROBES
- bool "Kprobes (EXPERIMENTAL)"
- depends on KALLSYMS && EXPERIMENTAL && MODULES
+ bool "Kprobes"
+ depends on KALLSYMS && MODULES
help
Kprobes allows you to trap at almost any kernel address and
execute a callback function. register_kprobe() establishes
a probepoint and specifies the callback. Kprobes is useful
for kernel debugging, non-intrusive instrumentation and testing.
If in doubt, say "N".
-endmenu
+
+endif # INSTRUMENTATION
source "arch/i386/Kconfig.debug"
diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu
index 5c95ceb7f122..11a24d54f27b 100644
--- a/arch/i386/Kconfig.cpu
+++ b/arch/i386/Kconfig.cpu
@@ -297,11 +297,6 @@ config X86_POPAD_OK
depends on !M386
default y
-config X86_CMPXCHG64
- bool
- depends on X86_PAE
- default y
-
config X86_ALIGNMENT_16
bool
depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
@@ -344,8 +339,8 @@ config X86_CMOV
depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7)
default y
-config X86_MINIMUM_CPU_MODEL
+config X86_MINIMUM_CPU_FAMILY
int
- default "4" if X86_XADD || X86_CMPXCHG || X86_BSWAP
- default "0"
+ default "4" if X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK
+ default "3"
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index bd28f9f9b4b7..01f0ff0daaf4 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000
mcore-$(CONFIG_X86_ES7000) := mach-default
core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/
+# Xen paravirtualization support
+core-$(CONFIG_XEN) += arch/i386/xen/
+
# default subarch .h files
mflags-y += -Iinclude/asm-i386/mach-default
@@ -108,6 +111,7 @@ drivers-$(CONFIG_PCI) += arch/i386/pci/
# must be linked after kernel/
drivers-$(CONFIG_OPROFILE) += arch/i386/oprofile/
drivers-$(CONFIG_PM) += arch/i386/power/
+drivers-$(CONFIG_FB) += arch/i386/video/
CFLAGS += $(mflags-y)
AFLAGS += $(mflags-y)
diff --git a/arch/i386/boot/.gitignore b/arch/i386/boot/.gitignore
index 495f20c085de..18465143cfa2 100644
--- a/arch/i386/boot/.gitignore
+++ b/arch/i386/boot/.gitignore
@@ -1,3 +1,5 @@
bootsect
bzImage
setup
+setup.bin
+setup.elf
diff --git a/arch/i386/boot/Makefile b/arch/i386/boot/Makefile
index bfbc32098a4a..93386a4e40b4 100644
--- a/arch/i386/boot/Makefile
+++ b/arch/i386/boot/Makefile
@@ -25,27 +25,56 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
#RAMDISK := -DRAMDISK=512
-targets := vmlinux.bin bootsect bootsect.o \
- setup setup.o zImage bzImage
+targets := vmlinux.bin setup.bin setup.elf zImage bzImage
subdir- := compressed
+setup-y += a20.o apm.o cmdline.o copy.o cpu.o cpucheck.o edd.o
+setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
+setup-y += printf.o string.o tty.o video.o version.o voyager.o
+
+# The link order of the video-*.o modules can matter. In particular,
+# video-vga.o *must* be listed first, followed by video-vesa.o.
+# Hardware-specific drivers should follow in the order they should be
+# probed, and video-bios.o should typically be last.
+setup-y += video-vga.o
+setup-y += video-vesa.o
+setup-y += video-bios.o
+targets += $(setup-y)
hostprogs-y := tools/build
HOSTCFLAGS_build.o := $(LINUXINCLUDE)
# ---------------------------------------------------------------------------
+# How to compile the 16-bit code. Note we always compile for -march=i386,
+# that way we can complain to the user if the CPU is insufficient.
+cflags-i386 :=
+cflags-x86_64 := -m32
+CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
+ $(cflags-$(ARCH)) \
+ -Wall -Wstrict-prototypes \
+ -march=i386 -mregparm=3 \
+ -include $(srctree)/$(src)/code16gcc.h \
+ -fno-strict-aliasing -fomit-frame-pointer \
+ $(call cc-option, -ffreestanding) \
+ $(call cc-option, -fno-toplevel-reorder,\
+ $(call cc-option, -fno-unit-at-a-time)) \
+ $(call cc-option, -fno-stack-protector) \
+ $(call cc-option, -mpreferred-stack-boundary=2)
+AFLAGS := $(CFLAGS) -D__ASSEMBLY__
+
$(obj)/zImage: IMAGE_OFFSET := 0x1000
$(obj)/zImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK)
$(obj)/bzImage: IMAGE_OFFSET := 0x100000
+$(obj)/bzImage: EXTRA_CFLAGS := -D__BIG_KERNEL__
$(obj)/bzImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
$(obj)/bzImage: BUILDFLAGS := -b
quiet_cmd_image = BUILD $@
-cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/bootsect $(obj)/setup \
+cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/setup.bin \
$(obj)/vmlinux.bin $(ROOT_DEV) > $@
-$(obj)/zImage $(obj)/bzImage: $(obj)/bootsect $(obj)/setup \
+$(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \
$(obj)/vmlinux.bin $(obj)/tools/build FORCE
$(call if_changed,image)
@echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
@@ -53,12 +82,17 @@ $(obj)/zImage $(obj)/bzImage: $(obj)/bootsect $(obj)/setup \
$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
$(call if_changed,objcopy)
-LDFLAGS_bootsect := -Ttext 0x0 -s --oformat binary
-LDFLAGS_setup := -Ttext 0x0 -s --oformat binary -e begtext
+SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
-$(obj)/setup $(obj)/bootsect: %: %.o FORCE
+LDFLAGS_setup.elf := -T
+$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
$(call if_changed,ld)
+OBJCOPYFLAGS_setup.bin := -O binary
+
+$(obj)/setup.bin: $(obj)/setup.elf FORCE
+ $(call if_changed,objcopy)
+
$(obj)/compressed/vmlinux: FORCE
$(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@
diff --git a/arch/i386/boot/a20.c b/arch/i386/boot/a20.c
new file mode 100644
index 000000000000..31348d054fca
--- /dev/null
+++ b/arch/i386/boot/a20.c
@@ -0,0 +1,161 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/a20.c
+ *
+ * Enable A20 gate (return -1 on failure)
+ */
+
+#include "boot.h"
+
+#define MAX_8042_LOOPS 100000
+
+static int empty_8042(void)
+{
+ u8 status;
+ int loops = MAX_8042_LOOPS;
+
+ while (loops--) {
+ io_delay();
+
+ status = inb(0x64);
+ if (status & 1) {
+ /* Read and discard input data */
+ io_delay();
+ (void)inb(0x60);
+ } else if (!(status & 2)) {
+ /* Buffers empty, finished! */
+ return 0;
+ }
+ }
+
+ return -1;
+}
+
+/* Returns nonzero if the A20 line is enabled. The memory address
+ used as a test is the int $0x80 vector, which should be safe. */
+
+#define A20_TEST_ADDR (4*0x80)
+#define A20_TEST_SHORT 32
+#define A20_TEST_LONG 2097152 /* 2^21 */
+
+static int a20_test(int loops)
+{
+ int ok = 0;
+ int saved, ctr;
+
+ set_fs(0x0000);
+ set_gs(0xffff);
+
+ saved = ctr = rdfs32(A20_TEST_ADDR);
+
+ while (loops--) {
+ wrfs32(++ctr, A20_TEST_ADDR);
+ io_delay(); /* Serialize and make delay constant */
+ ok = rdgs32(A20_TEST_ADDR+0x10) ^ ctr;
+ if (ok)
+ break;
+ }
+
+ wrfs32(saved, A20_TEST_ADDR);
+ return ok;
+}
+
+/* Quick test to see if A20 is already enabled */
+static int a20_test_short(void)
+{
+ return a20_test(A20_TEST_SHORT);
+}
+
+/* Longer test that actually waits for A20 to come on line; this
+ is useful when dealing with the KBC or other slow external circuitry. */
+static int a20_test_long(void)
+{
+ return a20_test(A20_TEST_LONG);
+}
+
+static void enable_a20_bios(void)
+{
+ asm volatile("pushfl; int $0x15; popfl"
+ : : "a" ((u16)0x2401));
+}
+
+static void enable_a20_kbc(void)
+{
+ empty_8042();
+
+ outb(0xd1, 0x64); /* Command write */
+ empty_8042();
+
+ outb(0xdf, 0x60); /* A20 on */
+ empty_8042();
+}
+
+static void enable_a20_fast(void)
+{
+ u8 port_a;
+
+ port_a = inb(0x92); /* Configuration port A */
+ port_a |= 0x02; /* Enable A20 */
+ port_a &= ~0x01; /* Do not reset machine */
+ outb(port_a, 0x92);
+}
+
+/*
+ * Actual routine to enable A20; return 0 on ok, -1 on failure
+ */
+
+#define A20_ENABLE_LOOPS 255 /* Number of times to try */
+
+int enable_a20(void)
+{
+ int loops = A20_ENABLE_LOOPS;
+
+#if defined(CONFIG_X86_ELAN)
+ /* Elan croaks if we try to touch the KBC */
+ enable_a20_fast();
+ while (!a20_test_long())
+ ;
+ return 0;
+#elif defined(CONFIG_X86_VOYAGER)
+ /* On Voyager, a20_test() is unsafe? */
+ enable_a20_kbc();
+ return 0;
+#else
+ while (loops--) {
+ /* First, check to see if A20 is already enabled
+ (legacy free, etc.) */
+ if (a20_test_short())
+ return 0;
+
+ /* Next, try the BIOS (INT 0x15, AX=0x2401) */
+ enable_a20_bios();
+ if (a20_test_short())
+ return 0;
+
+ /* Try enabling A20 through the keyboard controller */
+ empty_8042();
+ if (a20_test_short())
+ return 0; /* BIOS worked, but with delayed reaction */
+
+ enable_a20_kbc();
+ if (a20_test_long())
+ return 0;
+
+ /* Finally, try enabling the "fast A20 gate" */
+ enable_a20_fast();
+ if (a20_test_long())
+ return 0;
+ }
+
+ return -1;
+#endif
+}
diff --git a/arch/i386/boot/apm.c b/arch/i386/boot/apm.c
new file mode 100644
index 000000000000..a34087c370c0
--- /dev/null
+++ b/arch/i386/boot/apm.c
@@ -0,0 +1,97 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * Original APM BIOS checking by Stephen Rothwell, May 1994
+ * (sfr@canb.auug.org.au)
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/apm.c
+ *
+ * Get APM BIOS information
+ */
+
+#include "boot.h"
+
+#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
+
+int query_apm_bios(void)
+{
+ u16 ax, bx, cx, dx, di;
+ u32 ebx, esi;
+ u8 err;
+
+ /* APM BIOS installation check */
+ ax = 0x5300;
+ bx = cx = 0;
+ asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
+ : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
+ : : "esi", "edi");
+
+ if (err)
+ return -1; /* No APM BIOS */
+
+ if (bx != 0x504d) /* "PM" signature */
+ return -1;
+
+ if (cx & 0x02) /* 32 bits supported? */
+ return -1;
+
+ /* Disconnect first, just in case */
+ ax = 0x5304;
+ asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
+ : "+a" (ax)
+ : : "ebx", "ecx", "edx", "esi", "edi");
+
+ /* Paranoia */
+ ebx = esi = 0;
+ cx = dx = di = 0;
+
+ /* 32-bit connect */
+ asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6"
+ : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx),
+ "+S" (esi), "+D" (di), "=m" (err)
+ : "a" (0x5303));
+
+ boot_params.apm_bios_info.cseg = ax;
+ boot_params.apm_bios_info.offset = ebx;
+ boot_params.apm_bios_info.cseg_16 = cx;
+ boot_params.apm_bios_info.dseg = dx;
+ boot_params.apm_bios_info.cseg_len = (u16)esi;
+ boot_params.apm_bios_info.cseg_16_len = esi >> 16;
+ boot_params.apm_bios_info.dseg_len = di;
+
+ if (err)
+ return -1;
+
+ /* Redo the installation check as the 32-bit connect;
+ some BIOSes return different flags this way... */
+
+ ax = 0x5300;
+ bx = cx = 0;
+ asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
+ : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
+ : : "esi", "edi");
+
+ if (err || bx != 0x504d) {
+ /* Failure with 32-bit connect, try to disconect and ignore */
+ ax = 0x5304;
+ bx = 0;
+ asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
+ : "+a" (ax), "+b" (bx)
+ : : "ecx", "edx", "esi", "edi");
+ return -1;
+ }
+
+ boot_params.apm_bios_info.version = ax;
+ boot_params.apm_bios_info.flags = cx;
+ return 0;
+}
+
+#endif
diff --git a/arch/i386/boot/bitops.h b/arch/i386/boot/bitops.h
new file mode 100644
index 000000000000..8dcc8dc7db88
--- /dev/null
+++ b/arch/i386/boot/bitops.h
@@ -0,0 +1,45 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/bitops.h
+ *
+ * Very simple bitops for the boot code.
+ */
+
+#ifndef BOOT_BITOPS_H
+#define BOOT_BITOPS_H
+#define _LINUX_BITOPS_H /* Inhibit inclusion of <linux/bitops.h> */
+
+static inline int constant_test_bit(int nr, const void *addr)
+{
+ const u32 *p = (const u32 *)addr;
+ return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0;
+}
+static inline int variable_test_bit(int nr, const void *addr)
+{
+ u8 v;
+ const u32 *p = (const u32 *)addr;
+
+ asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr));
+ return v;
+}
+
+#define test_bit(nr,addr) \
+(__builtin_constant_p(nr) ? \
+ constant_test_bit((nr),(addr)) : \
+ variable_test_bit((nr),(addr)))
+
+static inline void set_bit(int nr, void *addr)
+{
+ asm("btsl %1,%0" : "+m" (*(u32 *)addr) : "Ir" (nr));
+}
+
+#endif /* BOOT_BITOPS_H */
diff --git a/arch/i386/boot/boot.h b/arch/i386/boot/boot.h
new file mode 100644
index 000000000000..dec70c9b6050
--- /dev/null
+++ b/arch/i386/boot/boot.h
@@ -0,0 +1,296 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/boot.h
+ *
+ * Header file for the real-mode kernel code
+ */
+
+#ifndef BOOT_BOOT_H
+#define BOOT_BOOT_H
+
+#ifndef __ASSEMBLY__
+
+#include <stdarg.h>
+#include <linux/types.h>
+#include <linux/edd.h>
+#include <asm/boot.h>
+#include <asm/bootparam.h>
+
+/* Useful macros */
+#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+
+extern struct setup_header hdr;
+extern struct boot_params boot_params;
+
+/* Basic port I/O */
+static inline void outb(u8 v, u16 port)
+{
+ asm volatile("outb %0,%1" : : "a" (v), "dN" (port));
+}
+static inline u8 inb(u16 port)
+{
+ u8 v;
+ asm volatile("inb %1,%0" : "=a" (v) : "dN" (port));
+ return v;
+}
+
+static inline void outw(u16 v, u16 port)
+{
+ asm volatile("outw %0,%1" : : "a" (v), "dN" (port));
+}
+static inline u16 inw(u16 port)
+{
+ u16 v;
+ asm volatile("inw %1,%0" : "=a" (v) : "dN" (port));
+ return v;
+}
+
+static inline void outl(u32 v, u16 port)
+{
+ asm volatile("outl %0,%1" : : "a" (v), "dN" (port));
+}
+static inline u32 inl(u32 port)
+{
+ u32 v;
+ asm volatile("inl %1,%0" : "=a" (v) : "dN" (port));
+ return v;
+}
+
+static inline void io_delay(void)
+{
+ const u16 DELAY_PORT = 0x80;
+ asm volatile("outb %%al,%0" : : "dN" (DELAY_PORT));
+}
+
+/* These functions are used to reference data in other segments. */
+
+static inline u16 ds(void)
+{
+ u16 seg;
+ asm("movw %%ds,%0" : "=rm" (seg));
+ return seg;
+}
+
+static inline void set_fs(u16 seg)
+{
+ asm volatile("movw %0,%%fs" : : "rm" (seg));
+}
+static inline u16 fs(void)
+{
+ u16 seg;
+ asm("movw %%fs,%0" : "=rm" (seg));
+ return seg;
+}
+
+static inline void set_gs(u16 seg)
+{
+ asm volatile("movw %0,%%gs" : : "rm" (seg));
+}
+static inline u16 gs(void)
+{
+ u16 seg;
+ asm("movw %%gs,%0" : "=rm" (seg));
+ return seg;
+}
+
+typedef unsigned int addr_t;
+
+static inline u8 rdfs8(addr_t addr)
+{
+ u8 v;
+ asm("movb %%fs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr));
+ return v;
+}
+static inline u16 rdfs16(addr_t addr)
+{
+ u16 v;
+ asm("movw %%fs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr));
+ return v;
+}
+static inline u32 rdfs32(addr_t addr)
+{
+ u32 v;
+ asm("movl %%fs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr));
+ return v;
+}
+
+static inline void wrfs8(u8 v, addr_t addr)
+{
+ asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "r" (v));
+}
+static inline void wrfs16(u16 v, addr_t addr)
+{
+ asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "r" (v));
+}
+static inline void wrfs32(u32 v, addr_t addr)
+{
+ asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "r" (v));
+}
+
+static inline u8 rdgs8(addr_t addr)
+{
+ u8 v;
+ asm("movb %%gs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr));
+ return v;
+}
+static inline u16 rdgs16(addr_t addr)
+{
+ u16 v;
+ asm("movw %%gs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr));
+ return v;
+}
+static inline u32 rdgs32(addr_t addr)
+{
+ u32 v;
+ asm("movl %%gs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr));
+ return v;
+}
+
+static inline void wrgs8(u8 v, addr_t addr)
+{
+ asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "r" (v));
+}
+static inline void wrgs16(u16 v, addr_t addr)
+{
+ asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "r" (v));
+}
+static inline void wrgs32(u32 v, addr_t addr)
+{
+ asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "r" (v));
+}
+
+/* Note: these only return true/false, not a signed return value! */
+static inline int memcmp(const void *s1, const void *s2, size_t len)
+{
+ u8 diff;
+ asm("repe; cmpsb; setnz %0"
+ : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
+ return diff;
+}
+
+static inline int memcmp_fs(const void *s1, addr_t s2, size_t len)
+{
+ u8 diff;
+ asm("fs; repe; cmpsb; setnz %0"
+ : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
+ return diff;
+}
+static inline int memcmp_gs(const void *s1, addr_t s2, size_t len)
+{
+ u8 diff;
+ asm("gs; repe; cmpsb; setnz %0"
+ : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
+ return diff;
+}
+
+static inline int isdigit(int ch)
+{
+ return (ch >= '0') && (ch <= '9');
+}
+
+/* Heap -- available for dynamic lists. */
+#define STACK_SIZE 512 /* Minimum number of bytes for stack */
+
+extern char _end[];
+extern char *HEAP;
+extern char *heap_end;
+#define RESET_HEAP() ((void *)( HEAP = _end ))
+static inline char *__get_heap(size_t s, size_t a, size_t n)
+{
+ char *tmp;
+
+ HEAP = (char *)(((size_t)HEAP+(a-1)) & ~(a-1));
+ tmp = HEAP;
+ HEAP += s*n;
+ return tmp;
+}
+#define GET_HEAP(type, n) \
+ ((type *)__get_heap(sizeof(type),__alignof__(type),(n)))
+
+static inline int heap_free(void)
+{
+ return heap_end-HEAP;
+}
+
+/* copy.S */
+
+void copy_to_fs(addr_t dst, void *src, size_t len);
+void *copy_from_fs(void *dst, addr_t src, size_t len);
+void copy_to_gs(addr_t dst, void *src, size_t len);
+void *copy_from_gs(void *dst, addr_t src, size_t len);
+void *memcpy(void *dst, void *src, size_t len);
+void *memset(void *dst, int c, size_t len);
+
+#define memcpy(d,s,l) __builtin_memcpy(d,s,l)
+#define memset(d,c,l) __builtin_memset(d,c,l)
+
+/* a20.c */
+int enable_a20(void);
+
+/* apm.c */
+int query_apm_bios(void);
+
+/* cmdline.c */
+int cmdline_find_option(const char *option, char *buffer, int bufsize);
+
+/* cpu.c, cpucheck.c */
+int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
+int validate_cpu(void);
+
+/* edd.c */
+void query_edd(void);
+
+/* header.S */
+void __attribute__((noreturn)) die(void);
+
+/* mca.c */
+int query_mca(void);
+
+/* memory.c */
+int detect_memory(void);
+
+/* pm.c */
+void __attribute__((noreturn)) go_to_protected_mode(void);
+
+/* pmjump.S */
+void __attribute__((noreturn))
+ protected_mode_jump(u32 entrypoint, u32 bootparams);
+
+/* printf.c */
+int sprintf(char *buf, const char *fmt, ...);
+int vsprintf(char *buf, const char *fmt, va_list args);
+int printf(const char *fmt, ...);
+
+/* string.c */
+int strcmp(const char *str1, const char *str2);
+size_t strnlen(const char *s, size_t maxlen);
+unsigned int atou(const char *s);
+
+/* tty.c */
+void puts(const char *);
+void putchar(int);
+int getchar(void);
+void kbd_flush(void);
+int getchar_timeout(void);
+
+/* video.c */
+void set_video(void);
+
+/* video-vesa.c */
+void vesa_store_edid(void);
+
+/* voyager.c */
+int query_voyager(void);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* BOOT_BOOT_H */
diff --git a/arch/i386/boot/bootsect.S b/arch/i386/boot/bootsect.S
deleted file mode 100644
index 011b7a4993d4..000000000000
--- a/arch/i386/boot/bootsect.S
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * bootsect.S Copyright (C) 1991, 1992 Linus Torvalds
- *
- * modified by Drew Eckhardt
- * modified by Bruce Evans (bde)
- * modified by Chris Noe (May 1999) (as86 -> gas)
- * gutted by H. Peter Anvin (Jan 2003)
- *
- * BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment
- * addresses must be multiplied by 16 to obtain their respective linear
- * addresses. To avoid confusion, linear addresses are written using leading
- * hex while segment addresses are written as segment:offset.
- *
- */
-
-#include <asm/boot.h>
-
-SETUPSECTS = 4 /* default nr of setup-sectors */
-BOOTSEG = 0x07C0 /* original address of boot-sector */
-INITSEG = DEF_INITSEG /* we move boot here - out of the way */
-SETUPSEG = DEF_SETUPSEG /* setup starts here */
-SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
-SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
- /* to be loaded */
-ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
-SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
-
-#ifndef SVGA_MODE
-#define SVGA_MODE ASK_VGA
-#endif
-
-#ifndef RAMDISK
-#define RAMDISK 0
-#endif
-
-#ifndef ROOT_RDONLY
-#define ROOT_RDONLY 1
-#endif
-
-.code16
-.text
-
-.global _start
-_start:
-
- # Normalize the start address
- jmpl $BOOTSEG, $start2
-
-start2:
- movw %cs, %ax
- movw %ax, %ds
- movw %ax, %es
- movw %ax, %ss
- movw $0x7c00, %sp
- sti
- cld
-
- movw $bugger_off_msg, %si
-
-msg_loop:
- lodsb
- andb %al, %al
- jz die
- movb $0xe, %ah
- movw $7, %bx
- int $0x10
- jmp msg_loop
-
-die:
- # Allow the user to press a key, then reboot
- xorw %ax, %ax
- int $0x16
- int $0x19
-
- # int 0x19 should never return. In case it does anyway,
- # invoke the BIOS reset code...
- ljmp $0xf000,$0xfff0
-
-
-bugger_off_msg:
- .ascii "Direct booting from floppy is no longer supported.\r\n"
- .ascii "Please use a boot loader program instead.\r\n"
- .ascii "\n"
- .ascii "Remove disk and press any key to reboot . . .\r\n"
- .byte 0
-
-
- # Kernel attributes; used by setup
-
- .org 497
-setup_sects: .byte SETUPSECTS
-root_flags: .word ROOT_RDONLY
-syssize: .word SYSSIZE
-swap_dev: .word SWAP_DEV
-ram_size: .word RAMDISK
-vid_mode: .word SVGA_MODE
-root_dev: .word ROOT_DEV
-boot_flag: .word 0xAA55
diff --git a/arch/i386/boot/cmdline.c b/arch/i386/boot/cmdline.c
new file mode 100644
index 000000000000..34bb778c4357
--- /dev/null
+++ b/arch/i386/boot/cmdline.c
@@ -0,0 +1,97 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/cmdline.c
+ *
+ * Simple command-line parser for early boot.
+ */
+
+#include "boot.h"
+
+static inline int myisspace(u8 c)
+{
+ return c <= ' '; /* Close enough approximation */
+}
+
+/*
+ * Find a non-boolean option, that is, "option=argument". In accordance
+ * with standard Linux practice, if this option is repeated, this returns
+ * the last instance on the command line.
+ *
+ * Returns the length of the argument (regardless of if it was
+ * truncated to fit in the buffer), or -1 on not found.
+ */
+int cmdline_find_option(const char *option, char *buffer, int bufsize)
+{
+ u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
+ addr_t cptr;
+ char c;
+ int len = -1;
+ const char *opptr = NULL;
+ char *bufptr = buffer;
+ enum {
+ st_wordstart, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ st_bufcpy /* Copying this to buffer */
+ } state = st_wordstart;
+
+ if (!cmdline_ptr || cmdline_ptr >= 0x100000)
+ return -1; /* No command line, or inaccessible */
+
+ cptr = cmdline_ptr & 0xf;
+ set_fs(cmdline_ptr >> 4);
+
+ while (cptr < 0x10000 && (c = rdfs8(cptr++))) {
+ switch (state) {
+ case st_wordstart:
+ if (myisspace(c))
+ break;
+
+ /* else */
+ state = st_wordcmp;
+ opptr = option;
+ /* fall through */
+
+ case st_wordcmp:
+ if (c == '=' && !*opptr) {
+ len = 0;
+ bufptr = buffer;
+ state = st_bufcpy;
+ } else if (myisspace(c)) {
+ state = st_wordstart;
+ } else if (c != *opptr++) {
+ state = st_wordskip;
+ }
+ break;
+
+ case st_wordskip:
+ if (myisspace(c))
+ state = st_wordstart;
+ break;
+
+ case st_bufcpy:
+ if (myisspace(c)) {
+ state = st_wordstart;
+ } else {
+ if (len < bufsize-1)
+ *bufptr++ = c;
+ len++;
+ }
+ break;
+ }
+ }
+
+ if (bufsize)
+ *bufptr = '\0';
+
+ return len;
+}
diff --git a/arch/i386/boot/code16gcc.h b/arch/i386/boot/code16gcc.h
new file mode 100644
index 000000000000..3bd848093b9d
--- /dev/null
+++ b/arch/i386/boot/code16gcc.h
@@ -0,0 +1,15 @@
+/*
+ * code16gcc.h
+ *
+ * This file is -include'd when compiling 16-bit C code.
+ * Note: this asm() needs to be emitted before gcc omits any code.
+ * Depending on gcc version, this requires -fno-unit-at-a-time or
+ * -fno-toplevel-reorder.
+ *
+ * Hopefully gcc will eventually have a real -m16 option so we can
+ * drop this hack long term.
+ */
+
+#ifndef __ASSEMBLY__
+asm(".code16gcc");
+#endif
diff --git a/arch/i386/boot/compressed/Makefile b/arch/i386/boot/compressed/Makefile
index a661217f33ec..189fa1dbefcc 100644
--- a/arch/i386/boot/compressed/Makefile
+++ b/arch/i386/boot/compressed/Makefile
@@ -9,9 +9,14 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o \
EXTRA_AFLAGS := -traditional
LDFLAGS_vmlinux := -T
-CFLAGS_misc.o += -fPIC
hostprogs-y := relocs
+CFLAGS := -m32 -D__KERNEL__ $(LINUX_INCLUDE) -O2 \
+ -fno-strict-aliasing -fPIC \
+ $(call cc-option,-ffreestanding) \
+ $(call cc-option,-fno-stack-protector)
+LDFLAGS := -m elf_i386
+
$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
$(call if_changed,ld)
@:
diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S
index 3517a32aaf41..f35ea2237522 100644
--- a/arch/i386/boot/compressed/head.S
+++ b/arch/i386/boot/compressed/head.S
@@ -45,10 +45,10 @@ startup_32:
* at and where we were actually loaded at. This can only be done
* with a short local call on x86. Nothing else will tell us what
* address we are running at. The reserved chunk of the real-mode
- * data at 0x34-0x3f are used as the stack for this calculation.
- * Only 4 bytes are needed.
+ * data at 0x1e4 (defined as a scratch field) are used as the stack
+ * for this calculation. Only 4 bytes are needed.
*/
- leal 0x40(%esi), %esp
+ leal (0x1e4+4)(%esi), %esp
call 1f
1: popl %ebp
subl $1b, %ebp
diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c
index ce4fda261aaf..2d77ee728f92 100644
--- a/arch/i386/boot/compressed/relocs.c
+++ b/arch/i386/boot/compressed/relocs.c
@@ -31,6 +31,9 @@ static const char* safe_abs_relocs[] = {
"__kernel_rt_sigreturn",
"__kernel_sigreturn",
"SYSENTER_RETURN",
+ "VDSO_NOTE_MASK",
+ "xen_irq_disable_direct_reloc",
+ "xen_save_fl_direct_reloc",
};
static int is_safe_abs_reloc(const char* sym_name)
diff --git a/arch/i386/boot/copy.S b/arch/i386/boot/copy.S
new file mode 100644
index 000000000000..ef127e56a3cf
--- /dev/null
+++ b/arch/i386/boot/copy.S
@@ -0,0 +1,101 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/copy.S
+ *
+ * Memory copy routines
+ */
+
+ .code16gcc
+ .text
+
+ .globl memcpy
+ .type memcpy, @function
+memcpy:
+ pushw %si
+ pushw %di
+ movw %ax, %di
+ movw %dx, %si
+ pushw %cx
+ shrw $2, %cx
+ rep; movsl
+ popw %cx
+ andw $3, %cx
+ rep; movsb
+ popw %di
+ popw %si
+ ret
+ .size memcpy, .-memcpy
+
+ .globl memset
+ .type memset, @function
+memset:
+ pushw %di
+ movw %ax, %di
+ movzbl %dl, %eax
+ imull $0x01010101,%eax
+ pushw %cx
+ shrw $2, %cx
+ rep; stosl
+ popw %cx
+ andw $3, %cx
+ rep; stosb
+ popw %di
+ ret
+ .size memset, .-memset
+
+ .globl copy_from_fs
+ .type copy_from_fs, @function
+copy_from_fs:
+ pushw %ds
+ pushw %fs
+ popw %ds
+ call memcpy
+ popw %ds
+ ret
+ .size copy_from_fs, .-copy_from_fs
+
+ .globl copy_to_fs
+ .type copy_to_fs, @function
+copy_to_fs:
+ pushw %es
+ pushw %fs
+ popw %es
+ call memcpy
+ popw %es
+ ret
+ .size copy_to_fs, .-copy_to_fs
+
+#if 0 /* Not currently used, but can be enabled as needed */
+
+ .globl copy_from_gs
+ .type copy_from_gs, @function
+copy_from_gs:
+ pushw %ds
+ pushw %gs
+ popw %ds
+ call memcpy
+ popw %ds
+ ret
+ .size copy_from_gs, .-copy_from_gs
+ .globl copy_to_gs
+
+ .type copy_to_gs, @function
+copy_to_gs:
+ pushw %es
+ pushw %gs
+ popw %es
+ call memcpy
+ popw %es
+ ret
+ .size copy_to_gs, .-copy_to_gs
+
+#endif
diff --git a/arch/i386/boot/cpu.c b/arch/i386/boot/cpu.c
new file mode 100644
index 000000000000..2a5c32da5852
--- /dev/null
+++ b/arch/i386/boot/cpu.c
@@ -0,0 +1,69 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/cpu.c
+ *
+ * Check for obligatory CPU features and abort if the features are not
+ * present.
+ */
+
+#include "boot.h"
+#include "bitops.h"
+#include <asm/cpufeature.h>
+
+static char *cpu_name(int level)
+{
+ static char buf[6];
+
+ if (level == 64) {
+ return "x86-64";
+ } else {
+ sprintf(buf, "i%d86", level);
+ return buf;
+ }
+}
+
+int validate_cpu(void)
+{
+ u32 *err_flags;
+ int cpu_level, req_level;
+
+ check_cpu(&cpu_level, &req_level, &err_flags);
+
+ if (cpu_level < req_level) {
+ printf("This kernel requires an %s CPU, ",
+ cpu_name(req_level));
+ printf("but only detected an %s CPU.\n",
+ cpu_name(cpu_level));
+ return -1;
+ }
+
+ if (err_flags) {
+ int i, j;
+ puts("This kernel requires the following features "
+ "not present on the CPU:\n");
+
+ for (i = 0; i < NCAPINTS; i++) {
+ u32 e = err_flags[i];
+
+ for (j = 0; j < 32; j++) {
+ if (e & 1)
+ printf("%d:%d ", i, j);
+
+ e >>= 1;
+ }
+ }
+ putchar('\n');
+ return -1;
+ } else {
+ return 0;
+ }
+}
diff --git a/arch/i386/boot/cpucheck.c b/arch/i386/boot/cpucheck.c
new file mode 100644
index 000000000000..991e8ceae1de
--- /dev/null
+++ b/arch/i386/boot/cpucheck.c
@@ -0,0 +1,267 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/cpucheck.c
+ *
+ * Check for obligatory CPU features and abort if the features are not
+ * present. This code should be compilable as 16-, 32- or 64-bit
+ * code, so be very careful with types and inline assembly.
+ *
+ * This code should not contain any messages; that requires an
+ * additional wrapper.
+ *
+ * As written, this code is not safe for inclusion into the kernel
+ * proper (after FPU initialization, in particular).
+ */
+
+#ifdef _SETUP
+# include "boot.h"
+# include "bitops.h"
+#endif
+#include <linux/types.h>
+#include <asm/cpufeature.h>
+#include <asm/processor-flags.h>
+#include <asm/required-features.h>
+#include <asm/msr-index.h>
+
+struct cpu_features {
+ int level; /* Family, or 64 for x86-64 */
+ int model;
+ u32 flags[NCAPINTS];
+};
+
+static struct cpu_features cpu;
+static u32 cpu_vendor[3];
+static u32 err_flags[NCAPINTS];
+
+#ifdef CONFIG_X86_64
+static const int req_level = 64;
+#elif defined(CONFIG_X86_MINIMUM_CPU_FAMILY)
+static const int req_level = CONFIG_X86_MINIMUM_CPU_FAMILY;
+#else
+static const int req_level = 3;
+#endif
+
+static const u32 req_flags[NCAPINTS] =
+{
+ REQUIRED_MASK0,
+ REQUIRED_MASK1,
+ REQUIRED_MASK2,
+ REQUIRED_MASK3,
+ REQUIRED_MASK4,
+ REQUIRED_MASK5,
+ REQUIRED_MASK6,
+ REQUIRED_MASK7,
+};
+
+#define A32(a,b,c,d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
+
+static int is_amd(void)
+{
+ return cpu_vendor[0] == A32('A','u','t','h') &&
+ cpu_vendor[1] == A32('e','n','t','i') &&
+ cpu_vendor[2] == A32('c','A','M','D');
+}
+
+static int is_centaur(void)
+{
+ return cpu_vendor[0] == A32('C','e','n','t') &&
+ cpu_vendor[1] == A32('a','u','r','H') &&
+ cpu_vendor[2] == A32('a','u','l','s');
+}
+
+static int is_transmeta(void)
+{
+ return cpu_vendor[0] == A32('G','e','n','u') &&
+ cpu_vendor[1] == A32('i','n','e','T') &&
+ cpu_vendor[2] == A32('M','x','8','6');
+}
+
+static int has_fpu(void)
+{
+ u16 fcw = -1, fsw = -1;
+ u32 cr0;
+
+ asm("movl %%cr0,%0" : "=r" (cr0));
+ if (cr0 & (X86_CR0_EM|X86_CR0_TS)) {
+ cr0 &= ~(X86_CR0_EM|X86_CR0_TS);
+ asm volatile("movl %0,%%cr0" : : "r" (cr0));
+ }
+
+ asm("fninit ; fnstsw %0 ; fnstcw %1" : "+m" (fsw), "+m" (fcw));
+
+ return fsw == 0 && (fcw & 0x103f) == 0x003f;
+}
+
+static int has_eflag(u32 mask)
+{
+ u32 f0, f1;
+
+ asm("pushfl ; "
+ "pushfl ; "
+ "popl %0 ; "
+ "movl %0,%1 ; "
+ "xorl %2,%1 ; "
+ "pushl %1 ; "
+ "popfl ; "
+ "pushfl ; "
+ "popl %1 ; "
+ "popfl"
+ : "=&r" (f0), "=&r" (f1)
+ : "ri" (mask));
+
+ return !!((f0^f1) & mask);
+}
+
+static void get_flags(void)
+{
+ u32 max_intel_level, max_amd_level;
+ u32 tfms;
+
+ if (has_fpu())
+ set_bit(X86_FEATURE_FPU, cpu.flags);
+
+ if (has_eflag(X86_EFLAGS_ID)) {
+ asm("cpuid"
+ : "=a" (max_intel_level),
+ "=b" (cpu_vendor[0]),
+ "=d" (cpu_vendor[1]),
+ "=c" (cpu_vendor[2])
+ : "a" (0));
+
+ if (max_intel_level >= 0x00000001 &&
+ max_intel_level <= 0x0000ffff) {
+ asm("cpuid"
+ : "=a" (tfms),
+ "=c" (cpu.flags[4]),
+ "=d" (cpu.flags[0])
+ : "a" (0x00000001)
+ : "ebx");
+ cpu.level = (tfms >> 8) & 15;
+ cpu.model = (tfms >> 4) & 15;
+ if (cpu.level >= 6)
+ cpu.model += ((tfms >> 16) & 0xf) << 4;
+ }
+
+ asm("cpuid"
+ : "=a" (max_amd_level)
+ : "a" (0x80000000)
+ : "ebx", "ecx", "edx");
+
+ if (max_amd_level >= 0x80000001 &&
+ max_amd_level <= 0x8000ffff) {
+ u32 eax = 0x80000001;
+ asm("cpuid"
+ : "+a" (eax),
+ "=c" (cpu.flags[6]),
+ "=d" (cpu.flags[1])
+ : : "ebx");
+ }
+ }
+}
+
+/* Returns a bitmask of which words we have error bits in */
+static int check_flags(void)
+{
+ u32 err;
+ int i;
+
+ err = 0;
+ for (i = 0; i < NCAPINTS; i++) {
+ err_flags[i] = req_flags[i] & ~cpu.flags[i];
+ if (err_flags[i])
+ err |= 1 << i;
+ }
+
+ return err;
+}
+
+/*
+ * Returns -1 on error.
+ *
+ * *cpu_level is set to the current CPU level; *req_level to the required
+ * level. x86-64 is considered level 64 for this purpose.
+ *
+ * *err_flags_ptr is set to the flags error array if there are flags missing.
+ */
+int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
+{
+ int err;
+
+ memset(&cpu.flags, 0, sizeof cpu.flags);
+ cpu.level = 3;
+
+ if (has_eflag(X86_EFLAGS_AC))
+ cpu.level = 4;
+
+ get_flags();
+ err = check_flags();
+
+ if (test_bit(X86_FEATURE_LM, cpu.flags))
+ cpu.level = 64;
+
+ if (err == 0x01 &&
+ !(err_flags[0] &
+ ~((1 << X86_FEATURE_XMM)|(1 << X86_FEATURE_XMM2))) &&
+ is_amd()) {
+ /* If this is an AMD and we're only missing SSE+SSE2, try to
+ turn them on */
+
+ u32 ecx = MSR_K7_HWCR;
+ u32 eax, edx;
+
+ asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
+ eax &= ~(1 << 15);
+ asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+
+ get_flags(); /* Make sure it really did something */
+ err = check_flags();
+ } else if (err == 0x01 &&
+ !(err_flags[0] & ~(1 << X86_FEATURE_CX8)) &&
+ is_centaur() && cpu.model >= 6) {
+ /* If this is a VIA C3, we might have to enable CX8
+ explicitly */
+
+ u32 ecx = MSR_VIA_FCR;
+ u32 eax, edx;
+
+ asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
+ eax |= (1<<1)|(1<<7);
+ asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+
+ set_bit(X86_FEATURE_CX8, cpu.flags);
+ err = check_flags();
+ } else if (err == 0x01 && is_transmeta()) {
+ /* Transmeta might have masked feature bits in word 0 */
+
+ u32 ecx = 0x80860004;
+ u32 eax, edx;
+ u32 level = 1;
+
+ asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
+ asm("wrmsr" : : "a" (~0), "d" (edx), "c" (ecx));
+ asm("cpuid"
+ : "+a" (level), "=d" (cpu.flags[0])
+ : : "ecx", "ebx");
+ asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+
+ err = check_flags();
+ }
+
+ if (err_flags_ptr)
+ *err_flags_ptr = err ? err_flags : NULL;
+ if (cpu_level_ptr)
+ *cpu_level_ptr = cpu.level;
+ if (req_level_ptr)
+ *req_level_ptr = req_level;
+
+ return (cpu.level < req_level || err) ? -1 : 0;
+}
diff --git a/arch/i386/boot/edd.S b/arch/i386/boot/edd.S
deleted file mode 100644
index 34321368011a..000000000000
--- a/arch/i386/boot/edd.S
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * BIOS Enhanced Disk Drive support
- * Copyright (C) 2002, 2003, 2004 Dell, Inc.
- * by Matt Domsch <Matt_Domsch@dell.com> October 2002
- * conformant to T13 Committee www.t13.org
- * projects 1572D, 1484D, 1386D, 1226DT
- * disk signature read by Matt Domsch <Matt_Domsch@dell.com>
- * and Andrew Wilks <Andrew_Wilks@dell.com> September 2003, June 2004
- * legacy CHS retrieval by Patrick J. LoPresti <patl@users.sourceforge.net>
- * March 2004
- * Command line option parsing, Matt Domsch, November 2004
- */
-
-#include <linux/edd.h>
-#include <asm/setup.h>
-
-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
-
-# It is assumed that %ds == INITSEG here
-
- movb $0, (EDD_MBR_SIG_NR_BUF)
- movb $0, (EDDNR)
-
-# Check the command line for options:
-# edd=of disables EDD completely (edd=off)
-# edd=sk skips the MBR test (edd=skipmbr)
-# edd=on re-enables EDD (edd=on)
-
- pushl %esi
- movw $edd_mbr_sig_start, %di # Default to edd=on
-
- movl %cs:(cmd_line_ptr), %esi
- andl %esi, %esi
- jz old_cl # Old boot protocol?
-
-# Convert to a real-mode pointer in fs:si
- movl %esi, %eax
- shrl $4, %eax
- movw %ax, %fs
- andw $0xf, %si
- jmp have_cl_pointer
-
-# Old-style boot protocol?
-old_cl:
- push %ds # aka INITSEG
- pop %fs
-
- cmpw $0xa33f, (0x20)
- jne done_cl # No command line at all?
- movw (0x22), %si # Pointer relative to INITSEG
-
-# fs:si has the pointer to the command line now
-have_cl_pointer:
-
-# Loop through kernel command line one byte at a time. Just in
-# case the loader is buggy and failed to null-terminate the command line
-# terminate if we get close enough to the end of the segment that we
-# cannot fit "edd=XX"...
-cl_atspace:
- cmpw $-5, %si # Watch for segment wraparound
- jae done_cl
- movl %fs:(%si), %eax
- andb %al, %al # End of line?
- jz done_cl
- cmpl $EDD_CL_EQUALS, %eax
- jz found_edd_equals
- cmpb $0x20, %al # <= space consider whitespace
- ja cl_skipword
- incw %si
- jmp cl_atspace
-
-cl_skipword:
- cmpw $-5, %si # Watch for segment wraparound
- jae done_cl
- movb %fs:(%si), %al # End of string?
- andb %al, %al
- jz done_cl
- cmpb $0x20, %al
- jbe cl_atspace
- incw %si
- jmp cl_skipword
-
-found_edd_equals:
-# only looking at first two characters after equals
-# late overrides early on the command line, so keep going after finding something
- movw %fs:4(%si), %ax
- cmpw $EDD_CL_OFF, %ax # edd=of
- je do_edd_off
- cmpw $EDD_CL_SKIP, %ax # edd=sk
- je do_edd_skipmbr
- cmpw $EDD_CL_ON, %ax # edd=on
- je do_edd_on
- jmp cl_skipword
-do_edd_skipmbr:
- movw $edd_start, %di
- jmp cl_skipword
-do_edd_off:
- movw $edd_done, %di
- jmp cl_skipword
-do_edd_on:
- movw $edd_mbr_sig_start, %di
- jmp cl_skipword
-
-done_cl:
- popl %esi
- jmpw *%di
-
-# Read the first sector of each BIOS disk device and store the 4-byte signature
-edd_mbr_sig_start:
- movb $0x80, %dl # from device 80
- movw $EDD_MBR_SIG_BUF, %bx # store buffer ptr in bx
-edd_mbr_sig_read:
- movl $0xFFFFFFFF, %eax
- movl %eax, (%bx) # assume failure
- pushw %bx
- movb $READ_SECTORS, %ah
- movb $1, %al # read 1 sector
- movb $0, %dh # at head 0
- movw $1, %cx # cylinder 0, sector 0
- pushw %es
- pushw %ds
- popw %es
- movw $EDDBUF, %bx # disk's data goes into EDDBUF
- pushw %dx # work around buggy BIOSes
- stc # work around buggy BIOSes
- int $0x13
- sti # work around buggy BIOSes
- popw %dx
- popw %es
- popw %bx
- jc edd_mbr_sig_done # on failure, we're done.
- cmpb $0, %ah # some BIOSes do not set CF
- jne edd_mbr_sig_done # on failure, we're done.
- movl (EDDBUF+EDD_MBR_SIG_OFFSET), %eax # read sig out of the MBR
- movl %eax, (%bx) # store success
- incb (EDD_MBR_SIG_NR_BUF) # note that we stored something
- incb %dl # increment to next device
- addw $4, %bx # increment sig buffer ptr
- cmpb $EDD_MBR_SIG_MAX, (EDD_MBR_SIG_NR_BUF) # Out of space?
- jb edd_mbr_sig_read # keep looping
-edd_mbr_sig_done:
-
-# Do the BIOS Enhanced Disk Drive calls
-# This consists of two calls:
-# int 13h ah=41h "Check Extensions Present"
-# int 13h ah=48h "Get Device Parameters"
-# int 13h ah=08h "Legacy Get Device Parameters"
-#
-# A buffer of size EDDMAXNR*(EDDEXTSIZE+EDDPARMSIZE) is reserved for our use
-# in the boot_params at EDDBUF. The first four bytes of which are
-# used to store the device number, interface support map and version
-# results from fn41. The next four bytes are used to store the legacy
-# cylinders, heads, and sectors from fn08. The following 74 bytes are used to
-# store the results from fn48. Starting from device 80h, fn41, then fn48
-# are called and their results stored in EDDBUF+n*(EDDEXTSIZE+EDDPARMIZE).
-# Then the pointer is incremented to store the data for the next call.
-# This repeats until either a device doesn't exist, or until EDDMAXNR
-# devices have been stored.
-# The one tricky part is that ds:si always points EDDEXTSIZE bytes into
-# the structure, and the fn41 and fn08 results are stored at offsets
-# from there. This removes the need to increment the pointer for
-# every store, and leaves it ready for the fn48 call.
-# A second one-byte buffer, EDDNR, in the boot_params stores
-# the number of BIOS devices which exist, up to EDDMAXNR.
-# In setup.c, copy_edd() stores both boot_params buffers away
-# for later use, as they would get overwritten otherwise.
-# This code is sensitive to the size of the structs in edd.h
-edd_start:
- # %ds points to the bootsector
- # result buffer for fn48
- movw $EDDBUF+EDDEXTSIZE, %si # in ds:si, fn41 results
- # kept just before that
- movb $0x80, %dl # BIOS device 0x80
-
-edd_check_ext:
- movb $CHECKEXTENSIONSPRESENT, %ah # Function 41
- movw $EDDMAGIC1, %bx # magic
- int $0x13 # make the call
- jc edd_done # no more BIOS devices
-
- cmpw $EDDMAGIC2, %bx # is magic right?
- jne edd_next # nope, next...
-
- movb %dl, %ds:-8(%si) # store device number
- movb %ah, %ds:-7(%si) # store version
- movw %cx, %ds:-6(%si) # store extensions
- incb (EDDNR) # note that we stored something
-
-edd_get_device_params:
- movw $EDDPARMSIZE, %ds:(%si) # put size
- movw $0x0, %ds:2(%si) # work around buggy BIOSes
- movb $GETDEVICEPARAMETERS, %ah # Function 48
- int $0x13 # make the call
- # Don't check for fail return
- # it doesn't matter.
-edd_get_legacy_chs:
- xorw %ax, %ax
- movw %ax, %ds:-4(%si)
- movw %ax, %ds:-2(%si)
- # Ralf Brown's Interrupt List says to set ES:DI to
- # 0000h:0000h "to guard against BIOS bugs"
- pushw %es
- movw %ax, %es
- movw %ax, %di
- pushw %dx # legacy call clobbers %dl
- movb $LEGACYGETDEVICEPARAMETERS, %ah # Function 08
- int $0x13 # make the call
- jc edd_legacy_done # failed
- movb %cl, %al # Low 6 bits are max
- andb $0x3F, %al # sector number
- movb %al, %ds:-1(%si) # Record max sect
- movb %dh, %ds:-2(%si) # Record max head number
- movb %ch, %al # Low 8 bits of max cyl
- shr $6, %cl
- movb %cl, %ah # High 2 bits of max cyl
- movw %ax, %ds:-4(%si)
-
-edd_legacy_done:
- popw %dx
- popw %es
- movw %si, %ax # increment si
- addw $EDDPARMSIZE+EDDEXTSIZE, %ax
- movw %ax, %si
-
-edd_next:
- incb %dl # increment to next device
- cmpb $EDDMAXNR, (EDDNR) # Out of space?
- jb edd_check_ext # keep looping
-
-edd_done:
-#endif
diff --git a/arch/i386/boot/edd.c b/arch/i386/boot/edd.c
new file mode 100644
index 000000000000..25a282494f4c
--- /dev/null
+++ b/arch/i386/boot/edd.c
@@ -0,0 +1,196 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/edd.c
+ *
+ * Get EDD BIOS disk information
+ */
+
+#include "boot.h"
+#include <linux/edd.h>
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+
+struct edd_dapa {
+ u8 pkt_size;
+ u8 rsvd;
+ u16 sector_cnt;
+ u16 buf_off, buf_seg;
+ u64 lba;
+ u64 buf_lin_addr;
+};
+
+/*
+ * Read the MBR (first sector) from a specific device.
+ */
+static int read_mbr(u8 devno, void *buf)
+{
+ struct edd_dapa dapa;
+ u16 ax, bx, cx, dx, si;
+
+ memset(&dapa, 0, sizeof dapa);
+ dapa.pkt_size = sizeof(dapa);
+ dapa.sector_cnt = 1;
+ dapa.buf_off = (size_t)buf;
+ dapa.buf_seg = ds();
+ /* dapa.lba = 0; */
+
+ ax = 0x4200; /* Extended Read */
+ si = (size_t)&dapa;
+ dx = devno;
+ asm("pushfl; stc; int $0x13; setc %%al; popfl"
+ : "+a" (ax), "+S" (si), "+d" (dx)
+ : "m" (dapa)
+ : "ebx", "ecx", "edi", "memory");
+
+ if (!(u8)ax)
+ return 0; /* OK */
+
+ ax = 0x0201; /* Legacy Read, one sector */
+ cx = 0x0001; /* Sector 0-0-1 */
+ dx = devno;
+ bx = (size_t)buf;
+ asm("pushfl; stc; int $0x13; setc %%al; popfl"
+ : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx)
+ : : "esi", "edi", "memory");
+
+ return -(u8)ax; /* 0 or -1 */
+}
+
+static u32 read_mbr_sig(u8 devno, struct edd_info *ei)
+{
+ int sector_size;
+ char *mbrbuf_ptr, *mbrbuf_end;
+ u32 mbrsig;
+ u32 buf_base, mbr_base;
+ extern char _end[];
+ static char mbr_buf[1024];
+
+ sector_size = ei->params.bytes_per_sector;
+ if (!sector_size)
+ sector_size = 512; /* Best available guess */
+
+ buf_base = (ds() << 4) + (u32)&_end;
+ mbr_base = (buf_base+sector_size-1) & ~(sector_size-1);
+ mbrbuf_ptr = mbr_buf + (mbr_base-buf_base);
+ mbrbuf_end = mbrbuf_ptr + sector_size;
+
+ if (!(boot_params.hdr.loadflags & CAN_USE_HEAP))
+ return 0;
+ if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr)
+ return 0;
+
+ if (read_mbr(devno, mbrbuf_ptr))
+ return 0;
+
+ mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET];
+ return mbrsig;
+}
+
+static int get_edd_info(u8 devno, struct edd_info *ei)
+{
+ u16 ax, bx, cx, dx, di;
+
+ memset(ei, 0, sizeof *ei);
+
+ /* Check Extensions Present */
+
+ ax = 0x4100;
+ bx = EDDMAGIC1;
+ dx = devno;
+ asm("pushfl; stc; int $0x13; setc %%al; popfl"
+ : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx)
+ : : "esi", "edi");
+
+ if ((u8)ax)
+ return -1; /* No extended information */
+
+ if (bx != EDDMAGIC2)
+ return -1;
+
+ ei->device = devno;
+ ei->version = ax >> 8; /* EDD version number */
+ ei->interface_support = cx; /* EDD functionality subsets */
+
+ /* Extended Get Device Parameters */
+
+ ei->params.length = sizeof(ei->params);
+ ax = 0x4800;
+ dx = devno;
+ asm("pushfl; int $0x13; popfl"
+ : "+a" (ax), "+d" (dx)
+ : "S" (&ei->params)
+ : "ebx", "ecx", "edi");
+
+ /* Get legacy CHS parameters */
+
+ /* Ralf Brown recommends setting ES:DI to 0:0 */
+ ax = 0x0800;
+ dx = devno;
+ di = 0;
+ asm("pushw %%es; "
+ "movw %%di,%%es; "
+ "pushfl; stc; int $0x13; setc %%al; popfl; "
+ "popw %%es"
+ : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di)
+ : : "esi");
+
+ if ((u8)ax == 0) {
+ ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2);
+ ei->legacy_max_head = dx >> 8;
+ ei->legacy_sectors_per_track = cx & 0x3f;
+ }
+
+ return 0;
+}
+
+void query_edd(void)
+{
+ char eddarg[8];
+ int do_mbr = 1;
+ int do_edd = 1;
+ int devno;
+ struct edd_info ei, *edp;
+
+ if (cmdline_find_option("edd", eddarg, sizeof eddarg) > 0) {
+ if (!strcmp(eddarg, "skipmbr") || !strcmp(eddarg, "skip"))
+ do_mbr = 0;
+ else if (!strcmp(eddarg, "off"))
+ do_edd = 0;
+ }
+
+ edp = (struct edd_info *)boot_params.eddbuf;
+
+ if (!do_edd)
+ return;
+
+ for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) {
+ /*
+ * Scan the BIOS-supported hard disks and query EDD
+ * information...
+ */
+ get_edd_info(devno, &ei);
+
+ if (boot_params.eddbuf_entries < EDDMAXNR) {
+ memcpy(edp, &ei, sizeof ei);
+ edp++;
+ boot_params.eddbuf_entries++;
+ }
+
+ if (do_mbr) {
+ u32 mbr_sig;
+ mbr_sig = read_mbr_sig(devno, &ei);
+ boot_params.edd_mbr_sig_buffer[devno-0x80] = mbr_sig;
+ }
+ }
+}
+
+#endif
diff --git a/arch/i386/boot/header.S b/arch/i386/boot/header.S
new file mode 100644
index 000000000000..6b9923fb6eae
--- /dev/null
+++ b/arch/i386/boot/header.S
@@ -0,0 +1,283 @@
+/*
+ * header.S
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Based on bootsect.S and setup.S
+ * modified by more people than can be counted
+ *
+ * Rewritten as a common file by H. Peter Anvin (Apr 2007)
+ *
+ * BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment
+ * addresses must be multiplied by 16 to obtain their respective linear
+ * addresses. To avoid confusion, linear addresses are written using leading
+ * hex while segment addresses are written as segment:offset.
+ *
+ */
+
+#include <asm/segment.h>
+#include <linux/utsrelease.h>
+#include <asm/boot.h>
+#include <asm/e820.h>
+#include <asm/page.h>
+#include <asm/setup.h>
+#include "boot.h"
+
+SETUPSECTS = 4 /* default nr of setup-sectors */
+BOOTSEG = 0x07C0 /* original address of boot-sector */
+SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
+SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
+ /* to be loaded */
+ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
+SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
+
+#ifndef SVGA_MODE
+#define SVGA_MODE ASK_VGA
+#endif
+
+#ifndef RAMDISK
+#define RAMDISK 0
+#endif
+
+#ifndef ROOT_RDONLY
+#define ROOT_RDONLY 1
+#endif
+
+ .code16
+ .section ".bstext", "ax"
+
+ .global bootsect_start
+bootsect_start:
+
+ # Normalize the start address
+ ljmp $BOOTSEG, $start2
+
+start2:
+ movw %cs, %ax
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %ss
+ xorw %sp, %sp
+ sti
+ cld
+
+ movw $bugger_off_msg, %si
+
+msg_loop:
+ lodsb
+ andb %al, %al
+ jz bs_die
+ movb $0xe, %ah
+ movw $7, %bx
+ int $0x10
+ jmp msg_loop
+
+bs_die:
+ # Allow the user to press a key, then reboot
+ xorw %ax, %ax
+ int $0x16
+ int $0x19
+
+ # int 0x19 should never return. In case it does anyway,
+ # invoke the BIOS reset code...
+ ljmp $0xf000,$0xfff0
+
+ .section ".bsdata", "a"
+bugger_off_msg:
+ .ascii "Direct booting from floppy is no longer supported.\r\n"
+ .ascii "Please use a boot loader program instead.\r\n"
+ .ascii "\n"
+ .ascii "Remove disk and press any key to reboot . . .\r\n"
+ .byte 0
+
+
+ # Kernel attributes; used by setup. This is part 1 of the
+ # header, from the old boot sector.
+
+ .section ".header", "a"
+ .globl hdr
+hdr:
+setup_sects: .byte SETUPSECTS
+root_flags: .word ROOT_RDONLY
+syssize: .long SYSSIZE
+ram_size: .word RAMDISK
+vid_mode: .word SVGA_MODE
+root_dev: .word ROOT_DEV
+boot_flag: .word 0xAA55
+
+ # offset 512, entry point
+
+ .globl _start
+_start:
+ # Explicitly enter this as bytes, or the assembler
+ # tries to generate a 3-byte jump here, which causes
+ # everything else to push off to the wrong offset.
+ .byte 0xeb # short (2-byte) jump
+ .byte start_of_setup-1f
+1:
+
+ # Part 2 of the header, from the old setup.S
+
+ .ascii "HdrS" # header signature
+ .word 0x0206 # header version number (>= 0x0105)
+ # or else old loadlin-1.5 will fail)
+ .globl realmode_swtch
+realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
+start_sys_seg: .word SYSSEG
+ .word kernel_version-512 # pointing to kernel version string
+ # above section of header is compatible
+ # with loadlin-1.5 (header v1.5). Don't
+ # change it.
+
+type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin,
+ # Bootlin, SYSLX, bootsect...)
+ # See Documentation/i386/boot.txt for
+ # assigned ids
+
+# flags, unused bits must be zero (RFU) bit within loadflags
+loadflags:
+LOADED_HIGH = 1 # If set, the kernel is loaded high
+CAN_USE_HEAP = 0x80 # If set, the loader also has set
+ # heap_end_ptr to tell how much
+ # space behind setup.S can be used for
+ # heap purposes.
+ # Only the loader knows what is free
+#ifndef __BIG_KERNEL__
+ .byte 0
+#else
+ .byte LOADED_HIGH
+#endif
+
+setup_move_size: .word 0x8000 # size to move, when setup is not
+ # loaded at 0x90000. We will move setup
+ # to 0x90000 then just before jumping
+ # into the kernel. However, only the
+ # loader knows how much data behind
+ # us also needs to be loaded.
+
+code32_start: # here loaders can put a different
+ # start address for 32-bit code.
+#ifndef __BIG_KERNEL__
+ .long 0x1000 # 0x1000 = default for zImage
+#else
+ .long 0x100000 # 0x100000 = default for big kernel
+#endif
+
+ramdisk_image: .long 0 # address of loaded ramdisk image
+ # Here the loader puts the 32-bit
+ # address where it loaded the image.
+ # This only will be read by the kernel.
+
+ramdisk_size: .long 0 # its size in bytes
+
+bootsect_kludge:
+ .long 0 # obsolete
+
+heap_end_ptr: .word _end+1024 # (Header version 0x0201 or later)
+ # space from here (exclusive) down to
+ # end of setup code can be used by setup
+ # for local heap purposes.
+
+pad1: .word 0
+cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
+ # If nonzero, a 32-bit pointer
+ # to the kernel command line.
+ # The command line should be
+ # located between the start of
+ # setup and the end of low
+ # memory (0xa0000), or it may
+ # get overwritten before it
+ # gets read. If this field is
+ # used, there is no longer
+ # anything magical about the
+ # 0x90000 segment; the setup
+ # can be located anywhere in
+ # low memory 0x10000 or higher.
+
+ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff
+ # (Header version 0x0203 or later)
+ # The highest safe address for
+ # the contents of an initrd
+
+kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment
+ #required for protected mode
+ #kernel
+#ifdef CONFIG_RELOCATABLE
+relocatable_kernel: .byte 1
+#else
+relocatable_kernel: .byte 0
+#endif
+pad2: .byte 0
+pad3: .word 0
+
+cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
+ #added with boot protocol
+ #version 2.06
+
+# End of setup header #####################################################
+
+ .section ".inittext", "ax"
+start_of_setup:
+#ifdef SAFE_RESET_DISK_CONTROLLER
+# Reset the disk controller.
+ movw $0x0000, %ax # Reset disk controller
+ movb $0x80, %dl # All disks
+ int $0x13
+#endif
+
+# We will have entired with %cs = %ds+0x20, normalize %cs so
+# it is on par with the other segments.
+ pushw %ds
+ pushw $setup2
+ lretw
+
+setup2:
+# Force %es = %ds
+ movw %ds, %ax
+ movw %ax, %es
+ cld
+
+# Stack paranoia: align the stack and make sure it is good
+# for both 16- and 32-bit references. In particular, if we
+# were meant to have been using the full 16-bit segment, the
+# caller might have set %sp to zero, which breaks %esp-based
+# references.
+ andw $~3, %sp # dword align (might as well...)
+ jnz 1f
+ movw $0xfffc, %sp # Make sure we're not zero
+1: movzwl %sp, %esp # Clear upper half of %esp
+ sti
+
+# Check signature at end of setup
+ cmpl $0x5a5aaa55, setup_sig
+ jne setup_bad
+
+# Zero the bss
+ movw $__bss_start, %di
+ movw $_end+3, %cx
+ xorl %eax, %eax
+ subw %di, %cx
+ shrw $2, %cx
+ rep; stosl
+
+# Jump to C code (should not return)
+ calll main
+
+# Setup corrupt somehow...
+setup_bad:
+ movl $setup_corrupt, %eax
+ calll puts
+ # Fall through...
+
+ .globl die
+ .type die, @function
+die:
+ hlt
+ jmp die
+
+ .size die, .-due
+
+ .section ".initdata", "a"
+setup_corrupt:
+ .byte 7
+ .string "No setup signature found..."
diff --git a/arch/i386/boot/main.c b/arch/i386/boot/main.c
new file mode 100644
index 000000000000..7f01f96c4fb8
--- /dev/null
+++ b/arch/i386/boot/main.c
@@ -0,0 +1,161 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/main.c
+ *
+ * Main module for the real-mode kernel code
+ */
+
+#include "boot.h"
+
+struct boot_params boot_params __attribute__((aligned(16)));
+
+char *HEAP = _end;
+char *heap_end = _end; /* Default end of heap = no heap */
+
+/*
+ * Copy the header into the boot parameter block. Since this
+ * screws up the old-style command line protocol, adjust by
+ * filling in the new-style command line pointer instead.
+ */
+#define OLD_CL_MAGIC 0xA33F
+#define OLD_CL_ADDRESS 0x20
+
+static void copy_boot_params(void)
+{
+ struct old_cmdline {
+ u16 cl_magic;
+ u16 cl_offset;
+ };
+ const struct old_cmdline * const oldcmd =
+ (const struct old_cmdline *)OLD_CL_ADDRESS;
+
+ BUILD_BUG_ON(sizeof boot_params != 4096);
+ memcpy(&boot_params.hdr, &hdr, sizeof hdr);
+
+ if (!boot_params.hdr.cmd_line_ptr &&
+ oldcmd->cl_magic == OLD_CL_MAGIC) {
+ /* Old-style command line protocol. */
+ u16 cmdline_seg;
+
+ /* Figure out if the command line falls in the region
+ of memory that an old kernel would have copied up
+ to 0x90000... */
+ if (oldcmd->cl_offset < boot_params.hdr.setup_move_size)
+ cmdline_seg = ds();
+ else
+ cmdline_seg = 0x9000;
+
+ boot_params.hdr.cmd_line_ptr =
+ (cmdline_seg << 4) + oldcmd->cl_offset;
+ }
+}
+
+/*
+ * Set the keyboard repeat rate to maximum. Unclear why this
+ * is done here; this might be possible to kill off as stale code.
+ */
+static void keyboard_set_repeat(void)
+{
+ u16 ax = 0x0305;
+ u16 bx = 0;
+ asm volatile("int $0x16"
+ : "+a" (ax), "+b" (bx)
+ : : "ecx", "edx", "esi", "edi");
+}
+
+/*
+ * Get Intel SpeedStep IST information.
+ */
+static void query_speedstep_ist(void)
+{
+ asm("int $0x15"
+ : "=a" (boot_params.speedstep_info[0]),
+ "=b" (boot_params.speedstep_info[1]),
+ "=c" (boot_params.speedstep_info[2]),
+ "=d" (boot_params.speedstep_info[3])
+ : "a" (0x0000e980), /* IST Support */
+ "d" (0x47534943)); /* Request value */
+}
+
+/*
+ * Tell the BIOS what CPU mode we intend to run in.
+ */
+static void set_bios_mode(void)
+{
+#ifdef CONFIG_X86_64
+ u32 eax, ebx;
+
+ eax = 0xec00;
+ ebx = 2;
+ asm volatile("int $0x15"
+ : "+a" (eax), "+b" (ebx)
+ : : "ecx", "edx", "esi", "edi");
+#endif
+}
+
+void main(void)
+{
+ /* First, copy the boot header into the "zeropage" */
+ copy_boot_params();
+
+ /* End of heap check */
+ if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
+ heap_end = (char *)(boot_params.hdr.heap_end_ptr
+ +0x200-STACK_SIZE);
+ } else {
+ /* Boot protocol 2.00 only, no heap available */
+ puts("WARNING: Ancient bootloader, some functionality "
+ "may be limited!\n");
+ }
+
+ /* Make sure we have all the proper CPU support */
+ if (validate_cpu()) {
+ puts("Unable to boot - please use a kernel appropriate "
+ "for your CPU.\n");
+ die();
+ }
+
+ /* Tell the BIOS what CPU mode we intend to run in. */
+ set_bios_mode();
+
+ /* Detect memory layout */
+ detect_memory();
+
+ /* Set keyboard repeat rate (why?) */
+ keyboard_set_repeat();
+
+ /* Set the video mode */
+ set_video();
+
+ /* Query MCA information */
+ query_mca();
+
+ /* Voyager */
+#ifdef CONFIG_X86_VOYAGER
+ query_voyager();
+#endif
+
+ /* Query SpeedStep IST information */
+ query_speedstep_ist();
+
+ /* Query APM information */
+#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
+ query_apm_bios();
+#endif
+
+ /* Query EDD information */
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+ query_edd();
+#endif
+ /* Do the last things and invoke protected mode */
+ go_to_protected_mode();
+}
diff --git a/arch/i386/boot/mca.c b/arch/i386/boot/mca.c
new file mode 100644
index 000000000000..68222f2d4b67
--- /dev/null
+++ b/arch/i386/boot/mca.c
@@ -0,0 +1,43 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/mca.c
+ *
+ * Get the MCA system description table
+ */
+
+#include "boot.h"
+
+int query_mca(void)
+{
+ u8 err;
+ u16 es, bx, len;
+
+ asm("pushw %%es ; "
+ "int $0x15 ; "
+ "setc %0 ; "
+ "movw %%es, %1 ; "
+ "popw %%es"
+ : "=acd" (err), "=acdSD" (es), "=b" (bx)
+ : "a" (0xc000));
+
+ if (err)
+ return -1; /* No MCA present */
+
+ set_fs(es);
+ len = rdfs16(bx);
+
+ if (len > sizeof(boot_params.sys_desc_table))
+ len = sizeof(boot_params.sys_desc_table);
+
+ copy_from_fs(&boot_params.sys_desc_table, bx, len);
+ return 0;
+}
diff --git a/arch/i386/boot/memory.c b/arch/i386/boot/memory.c
new file mode 100644
index 000000000000..1a2e62db8bed
--- /dev/null
+++ b/arch/i386/boot/memory.c
@@ -0,0 +1,99 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/memory.c
+ *
+ * Memory detection code
+ */
+
+#include "boot.h"
+
+#define SMAP 0x534d4150 /* ASCII "SMAP" */
+
+static int detect_memory_e820(void)
+{
+ u32 next = 0;
+ u32 size, id;
+ u8 err;
+ struct e820entry *desc = boot_params.e820_map;
+
+ do {
+ size = sizeof(struct e820entry);
+ id = SMAP;
+ asm("int $0x15; setc %0"
+ : "=am" (err), "+b" (next), "+d" (id), "+c" (size),
+ "=m" (*desc)
+ : "D" (desc), "a" (0xe820));
+
+ if (err || id != SMAP)
+ break;
+
+ boot_params.e820_entries++;
+ desc++;
+ } while (next && boot_params.e820_entries < E820MAX);
+
+ return boot_params.e820_entries;
+}
+
+static int detect_memory_e801(void)
+{
+ u16 ax, bx, cx, dx;
+ u8 err;
+
+ bx = cx = dx = 0;
+ ax = 0xe801;
+ asm("stc; int $0x15; setc %0"
+ : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx));
+
+ if (err)
+ return -1;
+
+ /* Do we really need to do this? */
+ if (cx || dx) {
+ ax = cx;
+ bx = dx;
+ }
+
+ if (ax > 15*1024)
+ return -1; /* Bogus! */
+
+ /* This ignores memory above 16MB if we have a memory hole
+ there. If someone actually finds a machine with a memory
+ hole at 16MB and no support for 0E820h they should probably
+ generate a fake e820 map. */
+ boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax;
+
+ return 0;
+}
+
+static int detect_memory_88(void)
+{
+ u16 ax;
+ u8 err;
+
+ ax = 0x8800;
+ asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax));
+
+ boot_params.screen_info.ext_mem_k = ax;
+
+ return -err;
+}
+
+int detect_memory(void)
+{
+ if (detect_memory_e820() > 0)
+ return 0;
+
+ if (!detect_memory_e801())
+ return 0;
+
+ return detect_memory_88();
+}
diff --git a/arch/i386/boot/pm.c b/arch/i386/boot/pm.c
new file mode 100644
index 000000000000..1df025c73261
--- /dev/null
+++ b/arch/i386/boot/pm.c
@@ -0,0 +1,170 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/pm.c
+ *
+ * Prepare the machine for transition to protected mode.
+ */
+
+#include "boot.h"
+#include <asm/segment.h>
+
+/*
+ * Invoke the realmode switch hook if present; otherwise
+ * disable all interrupts.
+ */
+static void realmode_switch_hook(void)
+{
+ if (boot_params.hdr.realmode_swtch) {
+ asm volatile("lcallw *%0"
+ : : "m" (boot_params.hdr.realmode_swtch)
+ : "eax", "ebx", "ecx", "edx");
+ } else {
+ asm volatile("cli");
+ outb(0x80, 0x70); /* Disable NMI */
+ io_delay();
+ }
+}
+
+/*
+ * A zImage kernel is loaded at 0x10000 but wants to run at 0x1000.
+ * A bzImage kernel is loaded and runs at 0x100000.
+ */
+static void move_kernel_around(void)
+{
+ /* Note: rely on the compile-time option here rather than
+ the LOADED_HIGH flag. The Qemu kernel loader unconditionally
+ sets the loadflags to zero. */
+#ifndef __BIG_KERNEL__
+ u16 dst_seg, src_seg;
+ u32 syssize;
+
+ dst_seg = 0x1000 >> 4;
+ src_seg = 0x10000 >> 4;
+ syssize = boot_params.hdr.syssize; /* Size in 16-byte paragraphs */
+
+ while (syssize) {
+ int paras = (syssize >= 0x1000) ? 0x1000 : syssize;
+ int dwords = paras << 2;
+
+ asm volatile("pushw %%es ; "
+ "pushw %%ds ; "
+ "movw %1,%%es ; "
+ "movw %2,%%ds ; "
+ "xorw %%di,%%di ; "
+ "xorw %%si,%%si ; "
+ "rep;movsl ; "
+ "popw %%ds ; "
+ "popw %%es"
+ : "+c" (dwords)
+ : "r" (dst_seg), "r" (src_seg)
+ : "esi", "edi");
+
+ syssize -= paras;
+ dst_seg += paras;
+ src_seg += paras;
+ }
+#endif
+}
+
+/*
+ * Disable all interrupts at the legacy PIC.
+ */
+static void mask_all_interrupts(void)
+{
+ outb(0xff, 0xa1); /* Mask all interrupts on the seconday PIC */
+ io_delay();
+ outb(0xfb, 0x21); /* Mask all but cascade on the primary PIC */
+ io_delay();
+}
+
+/*
+ * Reset IGNNE# if asserted in the FPU.
+ */
+static void reset_coprocessor(void)
+{
+ outb(0, 0xf0);
+ io_delay();
+ outb(0, 0xf1);
+ io_delay();
+}
+
+/*
+ * Set up the GDT
+ */
+#define GDT_ENTRY(flags,base,limit) \
+ (((u64)(base & 0xff000000) << 32) | \
+ ((u64)flags << 40) | \
+ ((u64)(limit & 0x00ff0000) << 32) | \
+ ((u64)(base & 0x00ffff00) << 16) | \
+ ((u64)(limit & 0x0000ffff)))
+
+struct gdt_ptr {
+ u16 len;
+ u32 ptr;
+} __attribute__((packed));
+
+static void setup_gdt(void)
+{
+ /* There are machines which are known to not boot with the GDT
+ being 8-byte unaligned. Intel recommends 16 byte alignment. */
+ static const u64 boot_gdt[] __attribute__((aligned(16))) = {
+ /* CS: code, read/execute, 4 GB, base 0 */
+ [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff),
+ /* DS: data, read/write, 4 GB, base 0 */
+ [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff),
+ };
+ struct gdt_ptr gdt;
+
+ gdt.len = sizeof(boot_gdt)-1;
+ gdt.ptr = (u32)&boot_gdt + (ds() << 4);
+
+ asm volatile("lgdtl %0" : : "m" (gdt));
+}
+
+/*
+ * Set up the IDT
+ */
+static void setup_idt(void)
+{
+ static const struct gdt_ptr null_idt = {0, 0};
+ asm volatile("lidtl %0" : : "m" (null_idt));
+}
+
+/*
+ * Actual invocation sequence
+ */
+void go_to_protected_mode(void)
+{
+ /* Hook before leaving real mode, also disables interrupts */
+ realmode_switch_hook();
+
+ /* Move the kernel/setup to their final resting places */
+ move_kernel_around();
+
+ /* Enable the A20 gate */
+ if (enable_a20()) {
+ puts("A20 gate not responding, unable to boot...\n");
+ die();
+ }
+
+ /* Reset coprocessor (IGNNE#) */
+ reset_coprocessor();
+
+ /* Mask all interrupts in the PIC */
+ mask_all_interrupts();
+
+ /* Actual transition to protected mode... */
+ setup_idt();
+ setup_gdt();
+ protected_mode_jump(boot_params.hdr.code32_start,
+ (u32)&boot_params + (ds() << 4));
+}
diff --git a/arch/i386/boot/pmjump.S b/arch/i386/boot/pmjump.S
new file mode 100644
index 000000000000..2e559233725a
--- /dev/null
+++ b/arch/i386/boot/pmjump.S
@@ -0,0 +1,54 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/pmjump.S
+ *
+ * The actual transition into protected mode
+ */
+
+#include <asm/boot.h>
+#include <asm/segment.h>
+
+ .text
+
+ .globl protected_mode_jump
+ .type protected_mode_jump, @function
+
+ .code16
+
+/*
+ * void protected_mode_jump(u32 entrypoint, u32 bootparams);
+ */
+protected_mode_jump:
+ xorl %ebx, %ebx # Flag to indicate this is a boot
+ movl %edx, %esi # Pointer to boot_params table
+ movl %eax, 2f # Patch ljmpl instruction
+ jmp 1f # Short jump to flush instruction q.
+
+1:
+ movw $__BOOT_DS, %cx
+
+ movl %cr0, %edx
+ orb $1, %dl # Protected mode (PE) bit
+ movl %edx, %cr0
+
+ movw %cx, %ds
+ movw %cx, %es
+ movw %cx, %fs
+ movw %cx, %gs
+ movw %cx, %ss
+
+ # Jump to the 32-bit entrypoint
+ .byte 0x66, 0xea # ljmpl opcode
+2: .long 0 # offset
+ .word __BOOT_CS # segment
+
+ .size protected_mode_jump, .-protected_mode_jump
diff --git a/arch/i386/boot/printf.c b/arch/i386/boot/printf.c
new file mode 100644
index 000000000000..1a09f9309d3c
--- /dev/null
+++ b/arch/i386/boot/printf.c
@@ -0,0 +1,307 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/printf.c
+ *
+ * Oh, it's a waste of space, but oh-so-yummy for debugging. This
+ * version of printf() does not include 64-bit support. "Live with
+ * it."
+ *
+ */
+
+#include "boot.h"
+
+static int skip_atoi(const char **s)
+{
+ int i = 0;
+
+ while (isdigit(**s))
+ i = i * 10 + *((*s)++) - '0';
+ return i;
+}
+
+#define ZEROPAD 1 /* pad with zero */
+#define SIGN 2 /* unsigned/signed long */
+#define PLUS 4 /* show plus */
+#define SPACE 8 /* space if plus */
+#define LEFT 16 /* left justified */
+#define SPECIAL 32 /* 0x */
+#define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */
+
+#define do_div(n,base) ({ \
+int __res; \
+__res = ((unsigned long) n) % (unsigned) base; \
+n = ((unsigned long) n) / (unsigned) base; \
+__res; })
+
+static char *number(char *str, long num, int base, int size, int precision,
+ int type)
+{
+ char c, sign, tmp[66];
+ const char *digits = "0123456789abcdefghijklmnopqrstuvwxyz";
+ int i;
+
+ if (type & LARGE)
+ digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+ if (type & LEFT)
+ type &= ~ZEROPAD;
+ if (base < 2 || base > 36)
+ return 0;
+ c = (type & ZEROPAD) ? '0' : ' ';
+ sign = 0;
+ if (type & SIGN) {
+ if (num < 0) {
+ sign = '-';
+ num = -num;
+ size--;
+ } else if (type & PLUS) {
+ sign = '+';
+ size--;
+ } else if (type & SPACE) {
+ sign = ' ';
+ size--;
+ }
+ }
+ if (type & SPECIAL) {
+ if (base == 16)
+ size -= 2;
+ else if (base == 8)
+ size--;
+ }
+ i = 0;
+ if (num == 0)
+ tmp[i++] = '0';
+ else
+ while (num != 0)
+ tmp[i++] = digits[do_div(num, base)];
+ if (i > precision)
+ precision = i;
+ size -= precision;
+ if (!(type & (ZEROPAD + LEFT)))
+ while (size-- > 0)
+ *str++ = ' ';
+ if (sign)
+ *str++ = sign;
+ if (type & SPECIAL) {
+ if (base == 8)
+ *str++ = '0';
+ else if (base == 16) {
+ *str++ = '0';
+ *str++ = digits[33];
+ }
+ }
+ if (!(type & LEFT))
+ while (size-- > 0)
+ *str++ = c;
+ while (i < precision--)
+ *str++ = '0';
+ while (i-- > 0)
+ *str++ = tmp[i];
+ while (size-- > 0)
+ *str++ = ' ';
+ return str;
+}
+
+int vsprintf(char *buf, const char *fmt, va_list args)
+{
+ int len;
+ unsigned long num;
+ int i, base;
+ char *str;
+ const char *s;
+
+ int flags; /* flags to number() */
+
+ int field_width; /* width of output field */
+ int precision; /* min. # of digits for integers; max
+ number of chars for from string */
+ int qualifier; /* 'h', 'l', or 'L' for integer fields */
+
+ for (str = buf; *fmt; ++fmt) {
+ if (*fmt != '%') {
+ *str++ = *fmt;
+ continue;
+ }
+
+ /* process flags */
+ flags = 0;
+ repeat:
+ ++fmt; /* this also skips first '%' */
+ switch (*fmt) {
+ case '-':
+ flags |= LEFT;
+ goto repeat;
+ case '+':
+ flags |= PLUS;
+ goto repeat;
+ case ' ':
+ flags |= SPACE;
+ goto repeat;
+ case '#':
+ flags |= SPECIAL;
+ goto repeat;
+ case '0':
+ flags |= ZEROPAD;
+ goto repeat;
+ }
+
+ /* get field width */
+ field_width = -1;
+ if (isdigit(*fmt))
+ field_width = skip_atoi(&fmt);
+ else if (*fmt == '*') {
+ ++fmt;
+ /* it's the next argument */
+ field_width = va_arg(args, int);
+ if (field_width < 0) {
+ field_width = -field_width;
+ flags |= LEFT;
+ }
+ }
+
+ /* get the precision */
+ precision = -1;
+ if (*fmt == '.') {
+ ++fmt;
+ if (isdigit(*fmt))
+ precision = skip_atoi(&fmt);
+ else if (*fmt == '*') {
+ ++fmt;
+ /* it's the next argument */
+ precision = va_arg(args, int);
+ }
+ if (precision < 0)
+ precision = 0;
+ }
+
+ /* get the conversion qualifier */
+ qualifier = -1;
+ if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L') {
+ qualifier = *fmt;
+ ++fmt;
+ }
+
+ /* default base */
+ base = 10;
+
+ switch (*fmt) {
+ case 'c':
+ if (!(flags & LEFT))
+ while (--field_width > 0)
+ *str++ = ' ';
+ *str++ = (unsigned char)va_arg(args, int);
+ while (--field_width > 0)
+ *str++ = ' ';
+ continue;
+
+ case 's':
+ s = va_arg(args, char *);
+ len = strnlen(s, precision);
+
+ if (!(flags & LEFT))
+ while (len < field_width--)
+ *str++ = ' ';
+ for (i = 0; i < len; ++i)
+ *str++ = *s++;
+ while (len < field_width--)
+ *str++ = ' ';
+ continue;
+
+ case 'p':
+ if (field_width == -1) {
+ field_width = 2 * sizeof(void *);
+ flags |= ZEROPAD;
+ }
+ str = number(str,
+ (unsigned long)va_arg(args, void *), 16,
+ field_width, precision, flags);
+ continue;
+
+ case 'n':
+ if (qualifier == 'l') {
+ long *ip = va_arg(args, long *);
+ *ip = (str - buf);
+ } else {
+ int *ip = va_arg(args, int *);
+ *ip = (str - buf);
+ }
+ continue;
+
+ case '%':
+ *str++ = '%';
+ continue;
+
+ /* integer number formats - set up the flags and "break" */
+ case 'o':
+ base = 8;
+ break;
+
+ case 'X':
+ flags |= LARGE;
+ case 'x':
+ base = 16;
+ break;
+
+ case 'd':
+ case 'i':
+ flags |= SIGN;
+ case 'u':
+ break;
+
+ default:
+ *str++ = '%';
+ if (*fmt)
+ *str++ = *fmt;
+ else
+ --fmt;
+ continue;
+ }
+ if (qualifier == 'l')
+ num = va_arg(args, unsigned long);
+ else if (qualifier == 'h') {
+ num = (unsigned short)va_arg(args, int);
+ if (flags & SIGN)
+ num = (short)num;
+ } else if (flags & SIGN)
+ num = va_arg(args, int);
+ else
+ num = va_arg(args, unsigned int);
+ str = number(str, num, base, field_width, precision, flags);
+ }
+ *str = '\0';
+ return str - buf;
+}
+
+int sprintf(char *buf, const char *fmt, ...)
+{
+ va_list args;
+ int i;
+
+ va_start(args, fmt);
+ i = vsprintf(buf, fmt, args);
+ va_end(args);
+ return i;
+}
+
+int printf(const char *fmt, ...)
+{
+ char printf_buf[1024];
+ va_list args;
+ int printed;
+
+ va_start(args, fmt);
+ printed = vsprintf(printf_buf, fmt, args);
+ va_end(args);
+
+ puts(printf_buf);
+
+ return printed;
+}
diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S
deleted file mode 100644
index 6dbcc95b2120..000000000000
--- a/arch/i386/boot/setup.S
+++ /dev/null
@@ -1,1075 +0,0 @@
-/*
- * setup.S Copyright (C) 1991, 1992 Linus Torvalds
- *
- * setup.s is responsible for getting the system data from the BIOS,
- * and putting them into the appropriate places in system memory.
- * both setup.s and system has been loaded by the bootblock.
- *
- * This code asks the bios for memory/disk/other parameters, and
- * puts them in a "safe" place: 0x90000-0x901FF, ie where the
- * boot-block used to be. It is then up to the protected mode
- * system to read them from there before the area is overwritten
- * for buffer-blocks.
- *
- * Move PS/2 aux init code to psaux.c
- * (troyer@saifr00.cfsat.Honeywell.COM) 03Oct92
- *
- * some changes and additional features by Christoph Niemann,
- * March 1993/June 1994 (Christoph.Niemann@linux.org)
- *
- * add APM BIOS checking by Stephen Rothwell, May 1994
- * (sfr@canb.auug.org.au)
- *
- * High load stuff, initrd support and position independency
- * by Hans Lermen & Werner Almesberger, February 1996
- * <lermen@elserv.ffm.fgan.de>, <almesber@lrc.epfl.ch>
- *
- * Video handling moved to video.S by Martin Mares, March 1996
- * <mj@k332.feld.cvut.cz>
- *
- * Extended memory detection scheme retwiddled by orc@pell.chi.il.us (david
- * parsons) to avoid loadlin confusion, July 1997
- *
- * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999.
- * <stiker@northlink.com>
- *
- * Fix to work around buggy BIOSes which don't use carry bit correctly
- * and/or report extended memory in CX/DX for e801h memory size detection
- * call. As a result the kernel got wrong figures. The int15/e801h docs
- * from Ralf Brown interrupt list seem to indicate AX/BX should be used
- * anyway. So to avoid breaking many machines (presumably there was a reason
- * to orginally use CX/DX instead of AX/BX), we do a kludge to see
- * if CX/DX have been changed in the e801 call and if so use AX/BX .
- * Michael Miller, April 2001 <michaelm@mjmm.org>
- *
- * New A20 code ported from SYSLINUX by H. Peter Anvin. AMD Elan bugfixes
- * by Robert Schwebel, December 2001 <robert@schwebel.de>
- */
-
-#include <asm/segment.h>
-#include <linux/utsrelease.h>
-#include <linux/compile.h>
-#include <asm/boot.h>
-#include <asm/e820.h>
-#include <asm/page.h>
-#include <asm/setup.h>
-
-/* Signature words to ensure LILO loaded us right */
-#define SIG1 0xAA55
-#define SIG2 0x5A5A
-
-INITSEG = DEF_INITSEG # 0x9000, we move boot here, out of the way
-SYSSEG = DEF_SYSSEG # 0x1000, system loaded at 0x10000 (65536).
-SETUPSEG = DEF_SETUPSEG # 0x9020, this is the current segment
- # ... and the former contents of CS
-
-DELTA_INITSEG = SETUPSEG - INITSEG # 0x0020
-
-.code16
-.globl begtext, begdata, begbss, endtext, enddata, endbss
-
-.text
-begtext:
-.data
-begdata:
-.bss
-begbss:
-.text
-
-start:
- jmp trampoline
-
-# This is the setup header, and it must start at %cs:2 (old 0x9020:2)
-
- .ascii "HdrS" # header signature
- .word 0x0206 # header version number (>= 0x0105)
- # or else old loadlin-1.5 will fail)
-realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
-start_sys_seg: .word SYSSEG
- .word kernel_version # pointing to kernel version string
- # above section of header is compatible
- # with loadlin-1.5 (header v1.5). Don't
- # change it.
-
-type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin,
- # Bootlin, SYSLX, bootsect...)
- # See Documentation/i386/boot.txt for
- # assigned ids
-
-# flags, unused bits must be zero (RFU) bit within loadflags
-loadflags:
-LOADED_HIGH = 1 # If set, the kernel is loaded high
-CAN_USE_HEAP = 0x80 # If set, the loader also has set
- # heap_end_ptr to tell how much
- # space behind setup.S can be used for
- # heap purposes.
- # Only the loader knows what is free
-#ifndef __BIG_KERNEL__
- .byte 0
-#else
- .byte LOADED_HIGH
-#endif
-
-setup_move_size: .word 0x8000 # size to move, when setup is not
- # loaded at 0x90000. We will move setup
- # to 0x90000 then just before jumping
- # into the kernel. However, only the
- # loader knows how much data behind
- # us also needs to be loaded.
-
-code32_start: # here loaders can put a different
- # start address for 32-bit code.
-#ifndef __BIG_KERNEL__
- .long 0x1000 # 0x1000 = default for zImage
-#else
- .long 0x100000 # 0x100000 = default for big kernel
-#endif
-
-ramdisk_image: .long 0 # address of loaded ramdisk image
- # Here the loader puts the 32-bit
- # address where it loaded the image.
- # This only will be read by the kernel.
-
-ramdisk_size: .long 0 # its size in bytes
-
-bootsect_kludge:
- .long 0 # obsolete
-
-heap_end_ptr: .word modelist+1024 # (Header version 0x0201 or later)
- # space from here (exclusive) down to
- # end of setup code can be used by setup
- # for local heap purposes.
-
-pad1: .word 0
-cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
- # If nonzero, a 32-bit pointer
- # to the kernel command line.
- # The command line should be
- # located between the start of
- # setup and the end of low
- # memory (0xa0000), or it may
- # get overwritten before it
- # gets read. If this field is
- # used, there is no longer
- # anything magical about the
- # 0x90000 segment; the setup
- # can be located anywhere in
- # low memory 0x10000 or higher.
-
-ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff
- # (Header version 0x0203 or later)
- # The highest safe address for
- # the contents of an initrd
-
-kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment
- #required for protected mode
- #kernel
-#ifdef CONFIG_RELOCATABLE
-relocatable_kernel: .byte 1
-#else
-relocatable_kernel: .byte 0
-#endif
-pad2: .byte 0
-pad3: .word 0
-
-cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
- #added with boot protocol
- #version 2.06
-
-trampoline: call start_of_setup
- .align 16
- # The offset at this point is 0x240
- .space (0xeff-0x240+1) # E820 & EDD space (ending at 0xeff)
-# End of setup header #####################################################
-
-start_of_setup:
-# Bootlin depends on this being done early
- movw $0x01500, %ax
- movb $0x81, %dl
- int $0x13
-
-#ifdef SAFE_RESET_DISK_CONTROLLER
-# Reset the disk controller.
- movw $0x0000, %ax
- movb $0x80, %dl
- int $0x13
-#endif
-
-# Set %ds = %cs, we know that SETUPSEG = %cs at this point
- movw %cs, %ax # aka SETUPSEG
- movw %ax, %ds
-# Check signature at end of setup
- cmpw $SIG1, setup_sig1
- jne bad_sig
-
- cmpw $SIG2, setup_sig2
- jne bad_sig
-
- jmp good_sig1
-
-# Routine to print asciiz string at ds:si
-prtstr:
- lodsb
- andb %al, %al
- jz fin
-
- call prtchr
- jmp prtstr
-
-fin: ret
-
-# Space printing
-prtsp2: call prtspc # Print double space
-prtspc: movb $0x20, %al # Print single space (note: fall-thru)
-
-# Part of above routine, this one just prints ascii al
-prtchr: pushw %ax
- pushw %cx
- movw $7,%bx
- movw $0x01, %cx
- movb $0x0e, %ah
- int $0x10
- popw %cx
- popw %ax
- ret
-
-beep: movb $0x07, %al
- jmp prtchr
-
-no_sig_mess: .string "No setup signature found ..."
-
-good_sig1:
- jmp good_sig
-
-# We now have to find the rest of the setup code/data
-bad_sig:
- movw %cs, %ax # SETUPSEG
- subw $DELTA_INITSEG, %ax # INITSEG
- movw %ax, %ds
- xorb %bh, %bh
- movb (497), %bl # get setup sect from bootsect
- subw $4, %bx # LILO loads 4 sectors of setup
- shlw $8, %bx # convert to words (1sect=2^8 words)
- movw %bx, %cx
- shrw $3, %bx # convert to segment
- addw $SYSSEG, %bx
- movw %bx, %cs:start_sys_seg
-# Move rest of setup code/data to here
- movw $2048, %di # four sectors loaded by LILO
- subw %si, %si
- pushw %cs
- popw %es
- movw $SYSSEG, %ax
- movw %ax, %ds
- rep
- movsw
- movw %cs, %ax # aka SETUPSEG
- movw %ax, %ds
- cmpw $SIG1, setup_sig1
- jne no_sig
-
- cmpw $SIG2, setup_sig2
- jne no_sig
-
- jmp good_sig
-
-no_sig:
- lea no_sig_mess, %si
- call prtstr
-
-no_sig_loop:
- hlt
- jmp no_sig_loop
-
-good_sig:
- movw %cs, %ax # aka SETUPSEG
- subw $DELTA_INITSEG, %ax # aka INITSEG
- movw %ax, %ds
-# Check if an old loader tries to load a big-kernel
- testb $LOADED_HIGH, %cs:loadflags # Do we have a big kernel?
- jz loader_ok # No, no danger for old loaders.
-
- cmpb $0, %cs:type_of_loader # Do we have a loader that
- # can deal with us?
- jnz loader_ok # Yes, continue.
-
- pushw %cs # No, we have an old loader,
- popw %ds # die.
- lea loader_panic_mess, %si
- call prtstr
-
- jmp no_sig_loop
-
-loader_panic_mess: .string "Wrong loader, giving up..."
-
-# check minimum cpuid
-# we do this here because it is the last place we can actually
-# show a user visible error message. Later the video modus
-# might be already messed up.
-loader_ok:
- call verify_cpu
- testl %eax,%eax
- jz cpu_ok
- movw %cs,%ax # aka SETUPSEG
- movw %ax,%ds
- lea cpu_panic_mess,%si
- call prtstr
-1: jmp 1b
-
-cpu_panic_mess:
- .asciz "PANIC: CPU too old for this kernel."
-
-#include "../kernel/verify_cpu.S"
-
-cpu_ok:
-# Get memory size (extended mem, kB)
-
- xorl %eax, %eax
- movl %eax, (0x1e0)
-#ifndef STANDARD_MEMORY_BIOS_CALL
- movb %al, (E820NR)
-# Try three different memory detection schemes. First, try
-# e820h, which lets us assemble a memory map, then try e801h,
-# which returns a 32-bit memory size, and finally 88h, which
-# returns 0-64m
-
-# method E820H:
-# the memory map from hell. e820h returns memory classified into
-# a whole bunch of different types, and allows memory holes and
-# everything. We scan through this memory map and build a list
-# of the first 32 memory areas, which we return at [E820MAP].
-# This is documented at http://www.acpi.info/, in the ACPI 2.0 specification.
-
-#define SMAP 0x534d4150
-
-meme820:
- xorl %ebx, %ebx # continuation counter
- movw $E820MAP, %di # point into the whitelist
- # so we can have the bios
- # directly write into it.
-
-jmpe820:
- movl $0x0000e820, %eax # e820, upper word zeroed
- movl $SMAP, %edx # ascii 'SMAP'
- movl $20, %ecx # size of the e820rec
- pushw %ds # data record.
- popw %es
- int $0x15 # make the call
- jc bail820 # fall to e801 if it fails
-
- cmpl $SMAP, %eax # check the return is `SMAP'
- jne bail820 # fall to e801 if it fails
-
-# cmpl $1, 16(%di) # is this usable memory?
-# jne again820
-
- # If this is usable memory, we save it by simply advancing %di by
- # sizeof(e820rec).
- #
-good820:
- movb (E820NR), %al # up to 128 entries
- cmpb $E820MAX, %al
- jae bail820
-
- incb (E820NR)
- movw %di, %ax
- addw $20, %ax
- movw %ax, %di
-again820:
- cmpl $0, %ebx # check to see if
- jne jmpe820 # %ebx is set to EOF
-bail820:
-
-
-# method E801H:
-# memory size is in 1k chunksizes, to avoid confusing loadlin.
-# we store the 0xe801 memory size in a completely different place,
-# because it will most likely be longer than 16 bits.
-# (use 1e0 because that's what Larry Augustine uses in his
-# alternative new memory detection scheme, and it's sensible
-# to write everything into the same place.)
-
-meme801:
- stc # fix to work around buggy
- xorw %cx,%cx # BIOSes which don't clear/set
- xorw %dx,%dx # carry on pass/error of
- # e801h memory size call
- # or merely pass cx,dx though
- # without changing them.
- movw $0xe801, %ax
- int $0x15
- jc mem88
-
- cmpw $0x0, %cx # Kludge to handle BIOSes
- jne e801usecxdx # which report their extended
- cmpw $0x0, %dx # memory in AX/BX rather than
- jne e801usecxdx # CX/DX. The spec I have read
- movw %ax, %cx # seems to indicate AX/BX
- movw %bx, %dx # are more reasonable anyway...
-
-e801usecxdx:
- andl $0xffff, %edx # clear sign extend
- shll $6, %edx # and go from 64k to 1k chunks
- movl %edx, (0x1e0) # store extended memory size
- andl $0xffff, %ecx # clear sign extend
- addl %ecx, (0x1e0) # and add lower memory into
- # total size.
-
-# Ye Olde Traditional Methode. Returns the memory size (up to 16mb or
-# 64mb, depending on the bios) in ax.
-mem88:
-
-#endif
- movb $0x88, %ah
- int $0x15
- movw %ax, (2)
-
-# Set the keyboard repeat rate to the max
- movw $0x0305, %ax
- xorw %bx, %bx
- int $0x16
-
-# Check for video adapter and its parameters and allow the
-# user to browse video modes.
- call video # NOTE: we need %ds pointing
- # to bootsector
-
-# Get hd0 data...
- xorw %ax, %ax
- movw %ax, %ds
- ldsw (4 * 0x41), %si
- movw %cs, %ax # aka SETUPSEG
- subw $DELTA_INITSEG, %ax # aka INITSEG
- pushw %ax
- movw %ax, %es
- movw $0x0080, %di
- movw $0x10, %cx
- pushw %cx
- cld
- rep
- movsb
-# Get hd1 data...
- xorw %ax, %ax
- movw %ax, %ds
- ldsw (4 * 0x46), %si
- popw %cx
- popw %es
- movw $0x0090, %di
- rep
- movsb
-# Check that there IS a hd1 :-)
- movw $0x01500, %ax
- movb $0x81, %dl
- int $0x13
- jc no_disk1
-
- cmpb $3, %ah
- je is_disk1
-
-no_disk1:
- movw %cs, %ax # aka SETUPSEG
- subw $DELTA_INITSEG, %ax # aka INITSEG
- movw %ax, %es
- movw $0x0090, %di
- movw $0x10, %cx
- xorw %ax, %ax
- cld
- rep
- stosb
-is_disk1:
-# check for Micro Channel (MCA) bus
- movw %cs, %ax # aka SETUPSEG
- subw $DELTA_INITSEG, %ax # aka INITSEG
- movw %ax, %ds
- xorw %ax, %ax
- movw %ax, (0xa0) # set table length to 0
- movb $0xc0, %ah
- stc
- int $0x15 # moves feature table to es:bx
- jc no_mca
-
- pushw %ds
- movw %es, %ax
- movw %ax, %ds
- movw %cs, %ax # aka SETUPSEG
- subw $DELTA_INITSEG, %ax # aka INITSEG
- movw %ax, %es
- movw %bx, %si
- movw $0xa0, %di
- movw (%si), %cx
- addw $2, %cx # table length is a short
- cmpw $0x10, %cx
- jc sysdesc_ok
-
- movw $0x10, %cx # we keep only first 16 bytes
-sysdesc_ok:
- rep
- movsb
- popw %ds
-no_mca:
-#ifdef CONFIG_X86_VOYAGER
- movb $0xff, 0x40 # flag on config found
- movb $0xc0, %al
- mov $0xff, %ah
- int $0x15 # put voyager config info at es:di
- jc no_voyager
- movw $0x40, %si # place voyager info in apm table
- cld
- movw $7, %cx
-voyager_rep:
- movb %es:(%di), %al
- movb %al,(%si)
- incw %di
- incw %si
- decw %cx
- jnz voyager_rep
-no_voyager:
-#endif
-# Check for PS/2 pointing device
- movw %cs, %ax # aka SETUPSEG
- subw $DELTA_INITSEG, %ax # aka INITSEG
- movw %ax, %ds
- movb $0, (0x1ff) # default is no pointing device
- int $0x11 # int 0x11: equipment list
- testb $0x04, %al # check if mouse installed
- jz no_psmouse
-
- movb $0xAA, (0x1ff) # device present
-no_psmouse:
-
-#if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
- movl $0x0000E980, %eax # IST Support
- movl $0x47534943, %edx # Request value
- int $0x15
-
- movl %eax, (96)
- movl %ebx, (100)
- movl %ecx, (104)
- movl %edx, (108)
-#endif
-
-#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
-# Then check for an APM BIOS...
- # %ds points to the bootsector
- movw $0, 0x40 # version = 0 means no APM BIOS
- movw $0x05300, %ax # APM BIOS installation check
- xorw %bx, %bx
- int $0x15
- jc done_apm_bios # Nope, no APM BIOS
-
- cmpw $0x0504d, %bx # Check for "PM" signature
- jne done_apm_bios # No signature, no APM BIOS
-
- andw $0x02, %cx # Is 32 bit supported?
- je done_apm_bios # No 32-bit, no (good) APM BIOS
-
- movw $0x05304, %ax # Disconnect first just in case
- xorw %bx, %bx
- int $0x15 # ignore return code
- movw $0x05303, %ax # 32 bit connect
- xorl %ebx, %ebx
- xorw %cx, %cx # paranoia :-)
- xorw %dx, %dx # ...
- xorl %esi, %esi # ...
- xorw %di, %di # ...
- int $0x15
- jc no_32_apm_bios # Ack, error.
-
- movw %ax, (66) # BIOS code segment
- movl %ebx, (68) # BIOS entry point offset
- movw %cx, (72) # BIOS 16 bit code segment
- movw %dx, (74) # BIOS data segment
- movl %esi, (78) # BIOS code segment lengths
- movw %di, (82) # BIOS data segment length
-# Redo the installation check as the 32 bit connect
-# modifies the flags returned on some BIOSs
- movw $0x05300, %ax # APM BIOS installation check
- xorw %bx, %bx
- xorw %cx, %cx # paranoia
- int $0x15
- jc apm_disconnect # error -> shouldn't happen
-
- cmpw $0x0504d, %bx # check for "PM" signature
- jne apm_disconnect # no sig -> shouldn't happen
-
- movw %ax, (64) # record the APM BIOS version
- movw %cx, (76) # and flags
- jmp done_apm_bios
-
-apm_disconnect: # Tidy up
- movw $0x05304, %ax # Disconnect
- xorw %bx, %bx
- int $0x15 # ignore return code
-
- jmp done_apm_bios
-
-no_32_apm_bios:
- andw $0xfffd, (76) # remove 32 bit support bit
-done_apm_bios:
-#endif
-
-#include "edd.S"
-
-# Now we want to move to protected mode ...
- cmpw $0, %cs:realmode_swtch
- jz rmodeswtch_normal
-
- lcall *%cs:realmode_swtch
-
- jmp rmodeswtch_end
-
-rmodeswtch_normal:
- pushw %cs
- call default_switch
-
-rmodeswtch_end:
-# Now we move the system to its rightful place ... but we check if we have a
-# big-kernel. In that case we *must* not move it ...
- testb $LOADED_HIGH, %cs:loadflags
- jz do_move0 # .. then we have a normal low
- # loaded zImage
- # .. or else we have a high
- # loaded bzImage
- jmp end_move # ... and we skip moving
-
-do_move0:
- movw $0x100, %ax # start of destination segment
- movw %cs, %bp # aka SETUPSEG
- subw $DELTA_INITSEG, %bp # aka INITSEG
- movw %cs:start_sys_seg, %bx # start of source segment
- cld
-do_move:
- movw %ax, %es # destination segment
- incb %ah # instead of add ax,#0x100
- movw %bx, %ds # source segment
- addw $0x100, %bx
- subw %di, %di
- subw %si, %si
- movw $0x800, %cx
- rep
- movsw
- cmpw %bp, %bx # assume start_sys_seg > 0x200,
- # so we will perhaps read one
- # page more than needed, but
- # never overwrite INITSEG
- # because destination is a
- # minimum one page below source
- jb do_move
-
-end_move:
-# then we load the segment descriptors
- movw %cs, %ax # aka SETUPSEG
- movw %ax, %ds
-
-# Check whether we need to be downward compatible with version <=201
- cmpl $0, cmd_line_ptr
- jne end_move_self # loader uses version >=202 features
- cmpb $0x20, type_of_loader
- je end_move_self # bootsect loader, we know of it
-
-# Boot loader doesnt support boot protocol version 2.02.
-# If we have our code not at 0x90000, we need to move it there now.
-# We also then need to move the params behind it (commandline)
-# Because we would overwrite the code on the current IP, we move
-# it in two steps, jumping high after the first one.
- movw %cs, %ax
- cmpw $SETUPSEG, %ax
- je end_move_self
-
- cli # make sure we really have
- # interrupts disabled !
- # because after this the stack
- # should not be used
- subw $DELTA_INITSEG, %ax # aka INITSEG
- movw %ss, %dx
- cmpw %ax, %dx
- jb move_self_1
-
- addw $INITSEG, %dx
- subw %ax, %dx # this will go into %ss after
- # the move
-move_self_1:
- movw %ax, %ds
- movw $INITSEG, %ax # real INITSEG
- movw %ax, %es
- movw %cs:setup_move_size, %cx
- std # we have to move up, so we use
- # direction down because the
- # areas may overlap
- movw %cx, %di
- decw %di
- movw %di, %si
- subw $move_self_here+0x200, %cx
- rep
- movsb
- ljmp $SETUPSEG, $move_self_here
-
-move_self_here:
- movw $move_self_here+0x200, %cx
- rep
- movsb
- movw $SETUPSEG, %ax
- movw %ax, %ds
- movw %dx, %ss
-end_move_self: # now we are at the right place
-
-#
-# Enable A20. This is at the very best an annoying procedure.
-# A20 code ported from SYSLINUX 1.52-1.63 by H. Peter Anvin.
-# AMD Elan bug fix by Robert Schwebel.
-#
-
-#if defined(CONFIG_X86_ELAN)
- movb $0x02, %al # alternate A20 gate
- outb %al, $0x92 # this works on SC410/SC520
-a20_elan_wait:
- call a20_test
- jz a20_elan_wait
- jmp a20_done
-#endif
-
-
-A20_TEST_LOOPS = 32 # Iterations per wait
-A20_ENABLE_LOOPS = 255 # Total loops to try
-
-
-#ifndef CONFIG_X86_VOYAGER
-a20_try_loop:
-
- # First, see if we are on a system with no A20 gate.
-a20_none:
- call a20_test
- jnz a20_done
-
- # Next, try the BIOS (INT 0x15, AX=0x2401)
-a20_bios:
- movw $0x2401, %ax
- pushfl # Be paranoid about flags
- int $0x15
- popfl
-
- call a20_test
- jnz a20_done
-
- # Try enabling A20 through the keyboard controller
-#endif /* CONFIG_X86_VOYAGER */
-a20_kbc:
- call empty_8042
-
-#ifndef CONFIG_X86_VOYAGER
- call a20_test # Just in case the BIOS worked
- jnz a20_done # but had a delayed reaction.
-#endif
-
- movb $0xD1, %al # command write
- outb %al, $0x64
- call empty_8042
-
- movb $0xDF, %al # A20 on
- outb %al, $0x60
- call empty_8042
-
-#ifndef CONFIG_X86_VOYAGER
- # Wait until a20 really *is* enabled; it can take a fair amount of
- # time on certain systems; Toshiba Tecras are known to have this
- # problem.
-a20_kbc_wait:
- xorw %cx, %cx
-a20_kbc_wait_loop:
- call a20_test
- jnz a20_done
- loop a20_kbc_wait_loop
-
- # Final attempt: use "configuration port A"
-a20_fast:
- inb $0x92, %al # Configuration Port A
- orb $0x02, %al # "fast A20" version
- andb $0xFE, %al # don't accidentally reset
- outb %al, $0x92
-
- # Wait for configuration port A to take effect
-a20_fast_wait:
- xorw %cx, %cx
-a20_fast_wait_loop:
- call a20_test
- jnz a20_done
- loop a20_fast_wait_loop
-
- # A20 is still not responding. Try frobbing it again.
- #
- decb (a20_tries)
- jnz a20_try_loop
-
- movw $a20_err_msg, %si
- call prtstr
-
-a20_die:
- hlt
- jmp a20_die
-
-a20_tries:
- .byte A20_ENABLE_LOOPS
-
-a20_err_msg:
- .ascii "linux: fatal error: A20 gate not responding!"
- .byte 13, 10, 0
-
- # If we get here, all is good
-a20_done:
-
-#endif /* CONFIG_X86_VOYAGER */
-# set up gdt and idt and 32bit start address
- lidt idt_48 # load idt with 0,0
- xorl %eax, %eax # Compute gdt_base
- movw %ds, %ax # (Convert %ds:gdt to a linear ptr)
- shll $4, %eax
- addl %eax, code32
- addl $gdt, %eax
- movl %eax, (gdt_48+2)
- lgdt gdt_48 # load gdt with whatever is
- # appropriate
-
-# make sure any possible coprocessor is properly reset..
- xorw %ax, %ax
- outb %al, $0xf0
- call delay
-
- outb %al, $0xf1
- call delay
-
-# well, that went ok, I hope. Now we mask all interrupts - the rest
-# is done in init_IRQ().
- movb $0xFF, %al # mask all interrupts for now
- outb %al, $0xA1
- call delay
-
- movb $0xFB, %al # mask all irq's but irq2 which
- outb %al, $0x21 # is cascaded
-
-# Well, that certainly wasn't fun :-(. Hopefully it works, and we don't
-# need no steenking BIOS anyway (except for the initial loading :-).
-# The BIOS-routine wants lots of unnecessary data, and it's less
-# "interesting" anyway. This is how REAL programmers do it.
-#
-# Well, now's the time to actually move into protected mode. To make
-# things as simple as possible, we do no register set-up or anything,
-# we let the gnu-compiled 32-bit programs do that. We just jump to
-# absolute address 0x1000 (or the loader supplied one),
-# in 32-bit protected mode.
-#
-# Note that the short jump isn't strictly needed, although there are
-# reasons why it might be a good idea. It won't hurt in any case.
- movw $1, %ax # protected mode (PE) bit
- lmsw %ax # This is it!
- jmp flush_instr
-
-flush_instr:
- xorw %bx, %bx # Flag to indicate a boot
- xorl %esi, %esi # Pointer to real-mode code
- movw %cs, %si
- subw $DELTA_INITSEG, %si
- shll $4, %esi # Convert to 32-bit pointer
-
-# jump to startup_32 in arch/i386/boot/compressed/head.S
-#
-# NOTE: For high loaded big kernels we need a
-# jmpi 0x100000,__BOOT_CS
-#
-# but we yet haven't reloaded the CS register, so the default size
-# of the target offset still is 16 bit.
-# However, using an operand prefix (0x66), the CPU will properly
-# take our 48 bit far pointer. (INTeL 80386 Programmer's Reference
-# Manual, Mixing 16-bit and 32-bit code, page 16-6)
-
- .byte 0x66, 0xea # prefix + jmpi-opcode
-code32: .long startup_32 # will be set to %cs+startup_32
- .word __BOOT_CS
-.code32
-startup_32:
- movl $(__BOOT_DS), %eax
- movl %eax, %ds
- movl %eax, %es
- movl %eax, %fs
- movl %eax, %gs
- movl %eax, %ss
-
- xorl %eax, %eax
-1: incl %eax # check that A20 really IS enabled
- movl %eax, 0x00000000 # loop forever if it isn't
- cmpl %eax, 0x00100000
- je 1b
-
- # Jump to the 32bit entry point
- jmpl *(code32_start - start + (DELTA_INITSEG << 4))(%esi)
-.code16
-
-# Here's a bunch of information about your current kernel..
-kernel_version: .ascii UTS_RELEASE
- .ascii " ("
- .ascii LINUX_COMPILE_BY
- .ascii "@"
- .ascii LINUX_COMPILE_HOST
- .ascii ") "
- .ascii UTS_VERSION
- .byte 0
-
-# This is the default real mode switch routine.
-# to be called just before protected mode transition
-default_switch:
- cli # no interrupts allowed !
- movb $0x80, %al # disable NMI for bootup
- # sequence
- outb %al, $0x70
- lret
-
-
-#ifndef CONFIG_X86_VOYAGER
-# This routine tests whether or not A20 is enabled. If so, it
-# exits with zf = 0.
-#
-# The memory address used, 0x200, is the int $0x80 vector, which
-# should be safe.
-
-A20_TEST_ADDR = 4*0x80
-
-a20_test:
- pushw %cx
- pushw %ax
- xorw %cx, %cx
- movw %cx, %fs # Low memory
- decw %cx
- movw %cx, %gs # High memory area
- movw $A20_TEST_LOOPS, %cx
- movw %fs:(A20_TEST_ADDR), %ax
- pushw %ax
-a20_test_wait:
- incw %ax
- movw %ax, %fs:(A20_TEST_ADDR)
- call delay # Serialize and make delay constant
- cmpw %gs:(A20_TEST_ADDR+0x10), %ax
- loope a20_test_wait
-
- popw %fs:(A20_TEST_ADDR)
- popw %ax
- popw %cx
- ret
-
-#endif /* CONFIG_X86_VOYAGER */
-
-# This routine checks that the keyboard command queue is empty
-# (after emptying the output buffers)
-#
-# Some machines have delusions that the keyboard buffer is always full
-# with no keyboard attached...
-#
-# If there is no keyboard controller, we will usually get 0xff
-# to all the reads. With each IO taking a microsecond and
-# a timeout of 100,000 iterations, this can take about half a
-# second ("delay" == outb to port 0x80). That should be ok,
-# and should also be plenty of time for a real keyboard controller
-# to empty.
-#
-
-empty_8042:
- pushl %ecx
- movl $100000, %ecx
-
-empty_8042_loop:
- decl %ecx
- jz empty_8042_end_loop
-
- call delay
-
- inb $0x64, %al # 8042 status port
- testb $1, %al # output buffer?
- jz no_output
-
- call delay
- inb $0x60, %al # read it
- jmp empty_8042_loop
-
-no_output:
- testb $2, %al # is input buffer full?
- jnz empty_8042_loop # yes - loop
-empty_8042_end_loop:
- popl %ecx
- ret
-
-# Read the cmos clock. Return the seconds in al
-gettime:
- pushw %cx
- movb $0x02, %ah
- int $0x1a
- movb %dh, %al # %dh contains the seconds
- andb $0x0f, %al
- movb %dh, %ah
- movb $0x04, %cl
- shrb %cl, %ah
- aad
- popw %cx
- ret
-
-# Delay is needed after doing I/O
-delay:
- outb %al,$0x80
- ret
-
-# Descriptor tables
-#
-# NOTE: The intel manual says gdt should be sixteen bytes aligned for
-# efficiency reasons. However, there are machines which are known not
-# to boot with misaligned GDTs, so alter this at your peril! If you alter
-# GDT_ENTRY_BOOT_CS (in asm/segment.h) remember to leave at least two
-# empty GDT entries (one for NULL and one reserved).
-#
-# NOTE: On some CPUs, the GDT must be 8 byte aligned. This is
-# true for the Voyager Quad CPU card which will not boot without
-# This directive. 16 byte aligment is recommended by intel.
-#
- .align 16
-gdt:
- .fill GDT_ENTRY_BOOT_CS,8,0
-
- .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
- .word 0 # base address = 0
- .word 0x9A00 # code read/exec
- .word 0x00CF # granularity = 4096, 386
- # (+5th nibble of limit)
-
- .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
- .word 0 # base address = 0
- .word 0x9200 # data read/write
- .word 0x00CF # granularity = 4096, 386
- # (+5th nibble of limit)
-gdt_end:
- .align 4
-
- .word 0 # alignment byte
-idt_48:
- .word 0 # idt limit = 0
- .word 0, 0 # idt base = 0L
-
- .word 0 # alignment byte
-gdt_48:
- .word gdt_end - gdt - 1 # gdt limit
- .word 0, 0 # gdt base (filled in later)
-
-# Include video setup & detection code
-
-#include "video.S"
-
-# Setup signature -- must be last
-setup_sig1: .word SIG1
-setup_sig2: .word SIG2
-
-# After this point, there is some free space which is used by the video mode
-# handling code to store the temporary mode table (not used by the kernel).
-
-modelist:
-
-.text
-endtext:
-.data
-enddata:
-.bss
-endbss:
diff --git a/arch/i386/boot/setup.ld b/arch/i386/boot/setup.ld
new file mode 100644
index 000000000000..df9234b3a5e0
--- /dev/null
+++ b/arch/i386/boot/setup.ld
@@ -0,0 +1,54 @@
+/*
+ * setup.ld
+ *
+ * Linker script for the i386 setup code
+ */
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+ENTRY(_start)
+
+SECTIONS
+{
+ . = 0;
+ .bstext : { *(.bstext) }
+ .bsdata : { *(.bsdata) }
+
+ . = 497;
+ .header : { *(.header) }
+ .inittext : { *(.inittext) }
+ .initdata : { *(.initdata) }
+ .text : { *(.text*) }
+
+ . = ALIGN(16);
+ .rodata : { *(.rodata*) }
+
+ .videocards : {
+ video_cards = .;
+ *(.videocards)
+ video_cards_end = .;
+ }
+
+ . = ALIGN(16);
+ .data : { *(.data*) }
+
+ .signature : {
+ setup_sig = .;
+ LONG(0x5a5aaa55)
+ }
+
+
+ . = ALIGN(16);
+ .bss :
+ {
+ __bss_start = .;
+ *(.bss)
+ __bss_end = .;
+ }
+ . = ALIGN(16);
+ _end = .;
+
+ /DISCARD/ : { *(.note*) }
+
+ . = ASSERT(_end <= 0x8000, "Setup too big!");
+ . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
+}
diff --git a/arch/i386/boot/string.c b/arch/i386/boot/string.c
new file mode 100644
index 000000000000..481a22097781
--- /dev/null
+++ b/arch/i386/boot/string.c
@@ -0,0 +1,52 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/string.c
+ *
+ * Very basic string functions
+ */
+
+#include "boot.h"
+
+int strcmp(const char *str1, const char *str2)
+{
+ const unsigned char *s1 = (const unsigned char *)str1;
+ const unsigned char *s2 = (const unsigned char *)str2;
+ int delta = 0;
+
+ while (*s1 || *s2) {
+ delta = *s2 - *s1;
+ if (delta)
+ return delta;
+ s1++;
+ s2++;
+ }
+ return 0;
+}
+
+size_t strnlen(const char *s, size_t maxlen)
+{
+ const char *es = s;
+ while (*es && maxlen) {
+ es++;
+ maxlen--;
+ }
+
+ return (es - s);
+}
+
+unsigned int atou(const char *s)
+{
+ unsigned int i = 0;
+ while (isdigit(*s))
+ i = i * 10 + (*s++ - '0');
+ return i;
+}
diff --git a/arch/i386/boot/tools/build.c b/arch/i386/boot/tools/build.c
index 05798419a6a9..b4248740ff0d 100644
--- a/arch/i386/boot/tools/build.c
+++ b/arch/i386/boot/tools/build.c
@@ -1,13 +1,12 @@
/*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 1997 Martin Mares
+ * Copyright (C) 2007 H. Peter Anvin
*/
/*
- * This file builds a disk-image from three different files:
+ * This file builds a disk-image from two different files:
*
- * - bootsect: compatibility mbr which prints an error message if
- * someone tries to boot the kernel directly.
* - setup: 8086 machine code, sets up system parm
* - system: 80386 code for actual system
*
@@ -21,6 +20,7 @@
* High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
* Cross compiling fixes by Gertjan van Wingerde, July 1996
* Rewritten by Martin Mares, April 1997
+ * Substantially overhauled by H. Peter Anvin, April 2007
*/
#include <stdio.h>
@@ -32,23 +32,25 @@
#include <sys/sysmacros.h>
#include <unistd.h>
#include <fcntl.h>
+#include <sys/mman.h>
#include <asm/boot.h>
-typedef unsigned char byte;
-typedef unsigned short word;
-typedef unsigned long u32;
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned long u32;
#define DEFAULT_MAJOR_ROOT 0
#define DEFAULT_MINOR_ROOT 0
-/* Minimal number of setup sectors (see also bootsect.S) */
-#define SETUP_SECTS 4
+/* Minimal number of setup sectors */
+#define SETUP_SECT_MIN 5
+#define SETUP_SECT_MAX 64
-byte buf[1024];
-int fd;
+/* This must be large enough to hold the entire setup */
+u8 buf[SETUP_SECT_MAX*512];
int is_big_kernel;
-void die(const char * str, ...)
+static void die(const char * str, ...)
{
va_list args;
va_start(args, str);
@@ -57,15 +59,9 @@ void die(const char * str, ...)
exit(1);
}
-void file_open(const char *name)
+static void usage(void)
{
- if ((fd = open(name, O_RDONLY, 0)) < 0)
- die("Unable to open `%s': %m", name);
-}
-
-void usage(void)
-{
- die("Usage: build [-b] bootsect setup system [rootdev] [> image]");
+ die("Usage: build [-b] setup system [rootdev] [> image]");
}
int main(int argc, char ** argv)
@@ -73,27 +69,30 @@ int main(int argc, char ** argv)
unsigned int i, sz, setup_sectors;
int c;
u32 sys_size;
- byte major_root, minor_root;
+ u8 major_root, minor_root;
struct stat sb;
+ FILE *file;
+ int fd;
+ void *kernel;
if (argc > 2 && !strcmp(argv[1], "-b"))
{
is_big_kernel = 1;
argc--, argv++;
}
- if ((argc < 4) || (argc > 5))
+ if ((argc < 3) || (argc > 4))
usage();
- if (argc > 4) {
- if (!strcmp(argv[4], "CURRENT")) {
+ if (argc > 3) {
+ if (!strcmp(argv[3], "CURRENT")) {
if (stat("/", &sb)) {
perror("/");
die("Couldn't stat /");
}
major_root = major(sb.st_dev);
minor_root = minor(sb.st_dev);
- } else if (strcmp(argv[4], "FLOPPY")) {
- if (stat(argv[4], &sb)) {
- perror(argv[4]);
+ } else if (strcmp(argv[3], "FLOPPY")) {
+ if (stat(argv[3], &sb)) {
+ perror(argv[3]);
die("Couldn't stat root device.");
}
major_root = major(sb.st_rdev);
@@ -108,79 +107,62 @@ int main(int argc, char ** argv)
}
fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root);
- file_open(argv[1]);
- i = read(fd, buf, sizeof(buf));
- fprintf(stderr,"Boot sector %d bytes.\n",i);
- if (i != 512)
- die("Boot block must be exactly 512 bytes");
+ /* Copy the setup code */
+ file = fopen(argv[1], "r");
+ if (!file)
+ die("Unable to open `%s': %m", argv[1]);
+ c = fread(buf, 1, sizeof(buf), file);
+ if (ferror(file))
+ die("read-error on `setup'");
+ if (c < 1024)
+ die("The setup must be at least 1024 bytes");
if (buf[510] != 0x55 || buf[511] != 0xaa)
die("Boot block hasn't got boot flag (0xAA55)");
+ fclose(file);
+
+ /* Pad unused space with zeros */
+ setup_sectors = (c + 511) / 512;
+ if (setup_sectors < SETUP_SECT_MIN)
+ setup_sectors = SETUP_SECT_MIN;
+ i = setup_sectors*512;
+ memset(buf+c, 0, i-c);
+
+ /* Set the default root device */
buf[508] = minor_root;
buf[509] = major_root;
- if (write(1, buf, 512) != 512)
- die("Write call failed");
- close (fd);
-
- file_open(argv[2]); /* Copy the setup code */
- for (i=0 ; (c=read(fd, buf, sizeof(buf)))>0 ; i+=c )
- if (write(1, buf, c) != c)
- die("Write call failed");
- if (c != 0)
- die("read-error on `setup'");
- close (fd);
-
- setup_sectors = (i + 511) / 512; /* Pad unused space with zeros */
- /* for compatibility with ancient versions of LILO. */
- if (setup_sectors < SETUP_SECTS)
- setup_sectors = SETUP_SECTS;
- fprintf(stderr, "Setup is %d bytes.\n", i);
- memset(buf, 0, sizeof(buf));
- while (i < setup_sectors * 512) {
- c = setup_sectors * 512 - i;
- if (c > sizeof(buf))
- c = sizeof(buf);
- if (write(1, buf, c) != c)
- die("Write call failed");
- i += c;
- }
- file_open(argv[3]);
- if (fstat (fd, &sb))
- die("Unable to stat `%s': %m", argv[3]);
+ fprintf(stderr, "Setup is %d bytes (padded to %d bytes).\n", c, i);
+
+ /* Open and stat the kernel file */
+ fd = open(argv[2], O_RDONLY);
+ if (fd < 0)
+ die("Unable to open `%s': %m", argv[2]);
+ if (fstat(fd, &sb))
+ die("Unable to stat `%s': %m", argv[2]);
sz = sb.st_size;
- fprintf (stderr, "System is %d kB\n", sz/1024);
+ fprintf (stderr, "System is %d kB\n", (sz+1023)/1024);
+ kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0);
+ if (kernel == MAP_FAILED)
+ die("Unable to mmap '%s': %m", argv[2]);
sys_size = (sz + 15) / 16;
if (!is_big_kernel && sys_size > DEF_SYSSIZE)
die("System is too big. Try using bzImage or modules.");
- while (sz > 0) {
- int l, n;
-
- l = (sz > sizeof(buf)) ? sizeof(buf) : sz;
- if ((n=read(fd, buf, l)) != l) {
- if (n < 0)
- die("Error reading %s: %m", argv[3]);
- else
- die("%s: Unexpected EOF", argv[3]);
- }
- if (write(1, buf, l) != l)
- die("Write failed");
- sz -= l;
- }
+
+ /* Patch the setup code with the appropriate size parameters */
+ buf[0x1f1] = setup_sectors-1;
+ buf[0x1f4] = sys_size;
+ buf[0x1f5] = sys_size >> 8;
+ buf[0x1f6] = sys_size >> 16;
+ buf[0x1f7] = sys_size >> 24;
+
+ if (fwrite(buf, 1, i, stdout) != i)
+ die("Writing setup failed");
+
+ /* Copy the kernel code */
+ if (fwrite(kernel, 1, sz, stdout) != sz)
+ die("Writing kernel failed");
close(fd);
- if (lseek(1, 497, SEEK_SET) != 497) /* Write sizes to the bootsector */
- die("Output: seek failed");
- buf[0] = setup_sectors;
- if (write(1, buf, 1) != 1)
- die("Write of setup sector count failed");
- if (lseek(1, 500, SEEK_SET) != 500)
- die("Output: seek failed");
- buf[0] = (sys_size & 0xff);
- buf[1] = ((sys_size >> 8) & 0xff);
- buf[2] = ((sys_size >> 16) & 0xff);
- buf[3] = ((sys_size >> 24) & 0xff);
- if (write(1, buf, 4) != 4)
- die("Write of image length failed");
-
- return 0; /* Everything is OK */
+ /* Everything is OK */
+ return 0;
}
diff --git a/arch/i386/boot/tty.c b/arch/i386/boot/tty.c
new file mode 100644
index 000000000000..9c668aad3515
--- /dev/null
+++ b/arch/i386/boot/tty.c
@@ -0,0 +1,112 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/tty.c
+ *
+ * Very simple screen I/O
+ * XXX: Probably should add very simple serial I/O?
+ */
+
+#include "boot.h"
+
+/*
+ * These functions are in .inittext so they can be used to signal
+ * error during initialization.
+ */
+
+void __attribute__((section(".inittext"))) putchar(int ch)
+{
+ unsigned char c = ch;
+
+ if (c == '\n')
+ putchar('\r'); /* \n -> \r\n */
+
+ /* int $0x10 is known to have bugs involving touching registers
+ it shouldn't. Be extra conservative... */
+ asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal"
+ : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch));
+}
+
+void __attribute__((section(".inittext"))) puts(const char *str)
+{
+ int n = 0;
+ while (*str) {
+ putchar(*str++);
+ n++;
+ }
+}
+
+/*
+ * Read the CMOS clock through the BIOS, and return the
+ * seconds in BCD.
+ */
+
+static u8 gettime(void)
+{
+ u16 ax = 0x0200;
+ u16 cx, dx;
+
+ asm("int $0x1a"
+ : "+a" (ax), "=c" (cx), "=d" (dx)
+ : : "ebx", "esi", "edi");
+
+ return dx >> 8;
+}
+
+/*
+ * Read from the keyboard
+ */
+int getchar(void)
+{
+ u16 ax = 0;
+ asm("int $0x16" : "+a" (ax));
+
+ return ax & 0xff;
+}
+
+static int kbd_pending(void)
+{
+ u8 pending;
+ asm("int $0x16; setnz %0"
+ : "=rm" (pending)
+ : "a" (0x0100));
+ return pending;
+}
+
+void kbd_flush(void)
+{
+ for (;;) {
+ if (!kbd_pending())
+ break;
+ getchar();
+ }
+}
+
+int getchar_timeout(void)
+{
+ int cnt = 30;
+ int t0, t1;
+
+ t0 = gettime();
+
+ while (cnt) {
+ if (kbd_pending())
+ return getchar();
+
+ t1 = gettime();
+ if (t0 != t1) {
+ cnt--;
+ t0 = t1;
+ }
+ }
+
+ return 0; /* Timeout! */
+}
diff --git a/arch/i386/boot/version.c b/arch/i386/boot/version.c
new file mode 100644
index 000000000000..c61462f7d9a7
--- /dev/null
+++ b/arch/i386/boot/version.c
@@ -0,0 +1,23 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/version.c
+ *
+ * Kernel version string
+ */
+
+#include "boot.h"
+#include <linux/utsrelease.h>
+#include <linux/compile.h>
+
+const char kernel_version[] =
+ UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") "
+ UTS_VERSION;
diff --git a/arch/i386/boot/vesa.h b/arch/i386/boot/vesa.h
new file mode 100644
index 000000000000..ff5b73cd406f
--- /dev/null
+++ b/arch/i386/boot/vesa.h
@@ -0,0 +1,79 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 1999-2007 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
+ * (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+#ifndef BOOT_VESA_H
+#define BOOT_VESA_H
+
+typedef struct {
+ u16 off, seg;
+} far_ptr;
+
+/* VESA General Information table */
+struct vesa_general_info {
+ u32 signature; /* 0 Magic number = "VESA" */
+ u16 version; /* 4 */
+ far_ptr vendor_string; /* 6 */
+ u32 capabilities; /* 10 */
+ far_ptr video_mode_ptr; /* 14 */
+ u16 total_memory; /* 18 */
+
+ u16 oem_software_rev; /* 20 */
+ far_ptr oem_vendor_name_ptr; /* 22 */
+ far_ptr oem_product_name_ptr; /* 26 */
+ far_ptr oem_product_rev_ptr; /* 30 */
+
+ u8 reserved[222]; /* 34 */
+ u8 oem_data[256]; /* 256 */
+} __attribute__ ((packed));
+
+#define VESA_MAGIC ('V' + ('E' << 8) + ('S' << 16) + ('A' << 24))
+#define VBE2_MAGIC ('V' + ('B' << 8) + ('E' << 16) + ('2' << 24))
+
+struct vesa_mode_info {
+ u16 mode_attr; /* 0 */
+ u8 win_attr[2]; /* 2 */
+ u16 win_grain; /* 4 */
+ u16 win_size; /* 6 */
+ u16 win_seg[2]; /* 8 */
+ far_ptr win_scheme; /* 12 */
+ u16 logical_scan; /* 16 */
+
+ u16 h_res; /* 18 */
+ u16 v_res; /* 20 */
+ u8 char_width; /* 22 */
+ u8 char_height; /* 23 */
+ u8 memory_planes; /* 24 */
+ u8 bpp; /* 25 */
+ u8 banks; /* 26 */
+ u8 memory_layout; /* 27 */
+ u8 bank_size; /* 28 */
+ u8 image_planes; /* 29 */
+ u8 page_function; /* 30 */
+
+ u8 rmask; /* 31 */
+ u8 rpos; /* 32 */
+ u8 gmask; /* 33 */
+ u8 gpos; /* 34 */
+ u8 bmask; /* 35 */
+ u8 bpos; /* 36 */
+ u8 resv_mask; /* 37 */
+ u8 resv_pos; /* 38 */
+ u8 dcm_info; /* 39 */
+
+ u32 lfb_ptr; /* 40 Linear frame buffer address */
+ u32 offscreen_ptr; /* 44 Offscreen memory address */
+ u16 offscreen_size; /* 48 */
+
+ u8 reserved[206]; /* 50 */
+} __attribute__ ((packed));
+
+#endif /* LIB_SYS_VESA_H */
diff --git a/arch/i386/boot/video-bios.c b/arch/i386/boot/video-bios.c
new file mode 100644
index 000000000000..afea46c500cc
--- /dev/null
+++ b/arch/i386/boot/video-bios.c
@@ -0,0 +1,125 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/video-bios.c
+ *
+ * Standard video BIOS modes
+ *
+ * We have two options for this; silent and scanned.
+ */
+
+#include "boot.h"
+#include "video.h"
+
+__videocard video_bios;
+
+/* Set a conventional BIOS mode */
+static int set_bios_mode(u8 mode);
+
+static int bios_set_mode(struct mode_info *mi)
+{
+ return set_bios_mode(mi->mode - VIDEO_FIRST_BIOS);
+}
+
+static int set_bios_mode(u8 mode)
+{
+ u16 ax;
+ u8 new_mode;
+
+ ax = mode; /* AH=0x00 Set Video Mode */
+ asm volatile(INT10
+ : "+a" (ax)
+ : : "ebx", "ecx", "edx", "esi", "edi");
+
+ ax = 0x0f00; /* Get Current Video Mode */
+ asm volatile(INT10
+ : "+a" (ax)
+ : : "ebx", "ecx", "edx", "esi", "edi");
+
+ do_restore = 1; /* Assume video contents was lost */
+ new_mode = ax & 0x7f; /* Not all BIOSes are clean with the top bit */
+
+ if (new_mode == mode)
+ return 0; /* Mode change OK */
+
+ if (new_mode != boot_params.screen_info.orig_video_mode) {
+ /* Mode setting failed, but we didn't end up where we
+ started. That's bad. Try to revert to the original
+ video mode. */
+ ax = boot_params.screen_info.orig_video_mode;
+ asm volatile(INT10
+ : "+a" (ax)
+ : : "ebx", "ecx", "edx", "esi", "edi");
+ }
+ return -1;
+}
+
+static int bios_probe(void)
+{
+ u8 mode;
+ u8 saved_mode = boot_params.screen_info.orig_video_mode;
+ u16 crtc;
+ struct mode_info *mi;
+ int nmodes = 0;
+
+ if (adapter != ADAPTER_EGA && adapter != ADAPTER_VGA)
+ return 0;
+
+ set_fs(0);
+ crtc = vga_crtc();
+
+ video_bios.modes = GET_HEAP(struct mode_info, 0);
+
+ for (mode = 0x14; mode <= 0x7f; mode++) {
+ if (heap_free() < sizeof(struct mode_info))
+ break;
+
+ if (mode_defined(VIDEO_FIRST_BIOS+mode))
+ continue;
+
+ if (set_bios_mode(mode))
+ continue;
+
+ /* Try to verify that it's a text mode. */
+
+ /* Attribute Controller: make graphics controller disabled */
+ if (in_idx(0x3c0, 0x10) & 0x01)
+ continue;
+
+ /* Graphics Controller: verify Alpha addressing enabled */
+ if (in_idx(0x3ce, 0x06) & 0x01)
+ continue;
+
+ /* CRTC cursor location low should be zero(?) */
+ if (in_idx(crtc, 0x0f))
+ continue;
+
+ mi = GET_HEAP(struct mode_info, 1);
+ mi->mode = VIDEO_FIRST_BIOS+mode;
+ mi->x = rdfs16(0x44a);
+ mi->y = rdfs8(0x484)+1;
+ nmodes++;
+ }
+
+ set_bios_mode(saved_mode);
+
+ return nmodes;
+}
+
+__videocard video_bios =
+{
+ .card_name = "BIOS (scanned)",
+ .probe = bios_probe,
+ .set_mode = bios_set_mode,
+ .unsafe = 1,
+ .xmode_first = VIDEO_FIRST_BIOS,
+ .xmode_n = 0x80,
+};
diff --git a/arch/i386/boot/video-vesa.c b/arch/i386/boot/video-vesa.c
new file mode 100644
index 000000000000..e6aa9eb8d93a
--- /dev/null
+++ b/arch/i386/boot/video-vesa.c
@@ -0,0 +1,284 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/video-vesa.c
+ *
+ * VESA text modes
+ */
+
+#include "boot.h"
+#include "video.h"
+#include "vesa.h"
+
+/* VESA information */
+static struct vesa_general_info vginfo;
+static struct vesa_mode_info vminfo;
+
+__videocard video_vesa;
+
+static void vesa_store_mode_params_graphics(void);
+
+static int vesa_probe(void)
+{
+#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
+ u16 ax;
+ u16 mode;
+ addr_t mode_ptr;
+ struct mode_info *mi;
+ int nmodes = 0;
+
+ video_vesa.modes = GET_HEAP(struct mode_info, 0);
+
+ vginfo.signature = VBE2_MAGIC;
+
+ /* Optimistically assume a VESA BIOS is register-clean... */
+ ax = 0x4f00;
+ asm("int $0x10" : "+a" (ax), "=m" (vginfo) : "D" (&vginfo));
+
+ if (ax != 0x004f ||
+ vginfo.signature != VESA_MAGIC ||
+ vginfo.version < 0x0102)
+ return 0; /* Not present */
+#endif /* CONFIG_VIDEO_VESA || CONFIG_FIRMWARE_EDID */
+#ifdef CONFIG_VIDEO_VESA
+ set_fs(vginfo.video_mode_ptr.seg);
+ mode_ptr = vginfo.video_mode_ptr.off;
+
+ while ((mode = rdfs16(mode_ptr)) != 0xffff) {
+ mode_ptr += 2;
+
+ if (heap_free() < sizeof(struct mode_info))
+ break; /* Heap full, can't save mode info */
+
+ if (mode & ~0x1ff)
+ continue;
+
+ memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
+
+ ax = 0x4f01;
+ asm("int $0x10"
+ : "+a" (ax), "=m" (vminfo)
+ : "c" (mode), "D" (&vminfo));
+
+ if (ax != 0x004f)
+ continue;
+
+ if ((vminfo.mode_attr & 0x15) == 0x05) {
+ /* Text Mode, TTY BIOS supported,
+ supported by hardware */
+ mi = GET_HEAP(struct mode_info, 1);
+ mi->mode = mode + VIDEO_FIRST_VESA;
+ mi->x = vminfo.h_res;
+ mi->y = vminfo.v_res;
+ nmodes++;
+ } else if ((vminfo.mode_attr & 0x99) == 0x99) {
+#ifdef CONFIG_FB
+ /* Graphics mode, color, linear frame buffer
+ supported -- register the mode but hide from
+ the menu. Only do this if framebuffer is
+ configured, however, otherwise the user will
+ be left without a screen. */
+ mi = GET_HEAP(struct mode_info, 1);
+ mi->mode = mode + VIDEO_FIRST_VESA;
+ mi->x = mi->y = 0;
+ nmodes++;
+#endif
+ }
+ }
+
+ return nmodes;
+#else
+ return 0;
+#endif /* CONFIG_VIDEO_VESA */
+}
+
+static int vesa_set_mode(struct mode_info *mode)
+{
+ u16 ax;
+ int is_graphic;
+ u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
+
+ memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
+
+ ax = 0x4f01;
+ asm("int $0x10"
+ : "+a" (ax), "=m" (vminfo)
+ : "c" (vesa_mode), "D" (&vminfo));
+
+ if (ax != 0x004f)
+ return -1;
+
+ if ((vminfo.mode_attr & 0x15) == 0x05) {
+ /* It's a supported text mode */
+ is_graphic = 0;
+ } else if ((vminfo.mode_attr & 0x99) == 0x99) {
+ /* It's a graphics mode with linear frame buffer */
+ is_graphic = 1;
+ vesa_mode |= 0x4000; /* Request linear frame buffer */
+ } else {
+ return -1; /* Invalid mode */
+ }
+
+
+ ax = 0x4f02;
+ asm volatile("int $0x10"
+ : "+a" (ax)
+ : "b" (vesa_mode), "D" (0));
+
+ if (ax != 0x004f)
+ return -1;
+
+ graphic_mode = is_graphic;
+ if (!is_graphic) {
+ /* Text mode */
+ force_x = mode->x;
+ force_y = mode->y;
+ do_restore = 1;
+ } else {
+ /* Graphics mode */
+ vesa_store_mode_params_graphics();
+ }
+
+ return 0;
+}
+
+
+/* Switch DAC to 8-bit mode */
+static void vesa_dac_set_8bits(void)
+{
+ u8 dac_size = 6;
+
+ /* If possible, switch the DAC to 8-bit mode */
+ if (vginfo.capabilities & 1) {
+ u16 ax, bx;
+
+ ax = 0x4f08;
+ bx = 0x0800;
+ asm volatile(INT10
+ : "+a" (ax), "+b" (bx)
+ : : "ecx", "edx", "esi", "edi");
+
+ if (ax == 0x004f)
+ dac_size = bx >> 8;
+ }
+
+ /* Set the color sizes to the DAC size, and offsets to 0 */
+ boot_params.screen_info.red_size = dac_size;
+ boot_params.screen_info.green_size = dac_size;
+ boot_params.screen_info.blue_size = dac_size;
+ boot_params.screen_info.rsvd_size = dac_size;
+
+ boot_params.screen_info.red_pos = 0;
+ boot_params.screen_info.green_pos = 0;
+ boot_params.screen_info.blue_pos = 0;
+ boot_params.screen_info.rsvd_pos = 0;
+}
+
+/* Save the VESA protected mode info */
+static void vesa_store_pm_info(void)
+{
+ u16 ax, bx, di, es;
+
+ ax = 0x4f0a;
+ bx = di = 0;
+ asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es"
+ : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di)
+ : : "ecx", "esi");
+
+ if (ax != 0x004f)
+ return;
+
+ boot_params.screen_info.vesapm_seg = es;
+ boot_params.screen_info.vesapm_off = di;
+}
+
+/*
+ * Save video mode parameters for graphics mode
+ */
+static void vesa_store_mode_params_graphics(void)
+{
+ /* Tell the kernel we're in VESA graphics mode */
+ boot_params.screen_info.orig_video_isVGA = 0x23;
+
+ /* Mode parameters */
+ boot_params.screen_info.vesa_attributes = vminfo.mode_attr;
+ boot_params.screen_info.lfb_linelength = vminfo.logical_scan;
+ boot_params.screen_info.lfb_width = vminfo.h_res;
+ boot_params.screen_info.lfb_height = vminfo.v_res;
+ boot_params.screen_info.lfb_depth = vminfo.bpp;
+ boot_params.screen_info.pages = vminfo.image_planes;
+ boot_params.screen_info.lfb_base = vminfo.lfb_ptr;
+ memcpy(&boot_params.screen_info.red_size,
+ &vminfo.rmask, 8);
+
+ /* General parameters */
+ boot_params.screen_info.lfb_size = vginfo.total_memory;
+
+ if (vminfo.bpp <= 8)
+ vesa_dac_set_8bits();
+
+ vesa_store_pm_info();
+}
+
+/*
+ * Save EDID information for the kernel; this is invoked, separately,
+ * after mode-setting.
+ */
+void vesa_store_edid(void)
+{
+#ifdef CONFIG_FIRMWARE_EDID
+ u16 ax, bx, cx, dx, di;
+
+ /* Apparently used as a nonsense token... */
+ memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
+
+ if (vginfo.version < 0x0200)
+ return; /* EDID requires VBE 2.0+ */
+
+ ax = 0x4f15; /* VBE DDC */
+ bx = 0x0000; /* Report DDC capabilities */
+ cx = 0; /* Controller 0 */
+ di = 0; /* ES:DI must be 0 by spec */
+
+ /* Note: The VBE DDC spec is different from the main VESA spec;
+ we genuinely have to assume all registers are destroyed here. */
+
+ asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es"
+ : "+a" (ax), "+b" (bx)
+ : "c" (cx), "D" (di)
+ : "esi");
+
+ if (ax != 0x004f)
+ return; /* No EDID */
+
+ /* BH = time in seconds to transfer EDD information */
+ /* BL = DDC level supported */
+
+ ax = 0x4f15; /* VBE DDC */
+ bx = 0x0001; /* Read EDID */
+ cx = 0; /* Controller 0 */
+ dx = 0; /* EDID block number */
+ di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */
+ asm(INT10
+ : "+a" (ax), "+b" (bx), "+d" (dx)
+ : "c" (cx), "D" (di)
+ : "esi");
+#endif /* CONFIG_FIRMWARE_EDID */
+}
+
+__videocard video_vesa =
+{
+ .card_name = "VESA",
+ .probe = vesa_probe,
+ .set_mode = vesa_set_mode,
+ .xmode_first = VIDEO_FIRST_VESA,
+ .xmode_n = 0x200,
+};
diff --git a/arch/i386/boot/video-vga.c b/arch/i386/boot/video-vga.c
new file mode 100644
index 000000000000..700d09a9c9b3
--- /dev/null
+++ b/arch/i386/boot/video-vga.c
@@ -0,0 +1,260 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/video-vga.c
+ *
+ * Common all-VGA modes
+ */
+
+#include "boot.h"
+#include "video.h"
+
+static struct mode_info vga_modes[] = {
+ { VIDEO_80x25, 80, 25 },
+ { VIDEO_8POINT, 80, 50 },
+ { VIDEO_80x43, 80, 43 },
+ { VIDEO_80x28, 80, 28 },
+ { VIDEO_80x30, 80, 30 },
+ { VIDEO_80x34, 80, 34 },
+ { VIDEO_80x60, 80, 60 },
+};
+
+static struct mode_info ega_modes[] = {
+ { VIDEO_80x25, 80, 25 },
+ { VIDEO_8POINT, 80, 43 },
+};
+
+static struct mode_info cga_modes[] = {
+ { VIDEO_80x25, 80, 25 },
+};
+
+__videocard video_vga;
+
+/* Set basic 80x25 mode */
+static u8 vga_set_basic_mode(void)
+{
+ u16 ax;
+ u8 rows;
+ u8 mode;
+
+#ifdef CONFIG_VIDEO_400_HACK
+ if (adapter >= ADAPTER_VGA) {
+ asm(INT10
+ : : "a" (0x1202), "b" (0x0030)
+ : "ecx", "edx", "esi", "edi");
+ }
+#endif
+
+ ax = 0x0f00;
+ asm(INT10
+ : "+a" (ax)
+ : : "ebx", "ecx", "edx", "esi", "edi");
+
+ mode = (u8)ax;
+
+ set_fs(0);
+ rows = rdfs8(0x484); /* rows minus one */
+
+#ifndef CONFIG_VIDEO_400_HACK
+ if ((ax == 0x5003 || ax == 0x5007) &&
+ (rows == 0 || rows == 24))
+ return mode;
+#endif
+
+ if (mode != 3 && mode != 7)
+ mode = 3;
+
+ /* Set the mode */
+ asm volatile(INT10
+ : : "a" (mode)
+ : "ebx", "ecx", "edx", "esi", "edi");
+ do_restore = 1;
+ return mode;
+}
+
+static void vga_set_8font(void)
+{
+ /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */
+
+ /* Set 8x8 font */
+ asm volatile(INT10 : : "a" (0x1112), "b" (0));
+
+ /* Use alternate print screen */
+ asm volatile(INT10 : : "a" (0x1200), "b" (0x20));
+
+ /* Turn off cursor emulation */
+ asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
+
+ /* Cursor is scan lines 6-7 */
+ asm volatile(INT10 : : "a" (0x0100), "c" (0x0607));
+}
+
+static void vga_set_14font(void)
+{
+ /* Set 9x14 font - 80x28 on VGA */
+
+ /* Set 9x14 font */
+ asm volatile(INT10 : : "a" (0x1111), "b" (0));
+
+ /* Turn off cursor emulation */
+ asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
+
+ /* Cursor is scan lines 11-12 */
+ asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c));
+}
+
+static void vga_set_80x43(void)
+{
+ /* Set 80x43 mode on VGA (not EGA) */
+
+ /* Set 350 scans */
+ asm volatile(INT10 : : "a" (0x1201), "b" (0x30));
+
+ /* Reset video mode */
+ asm volatile(INT10 : : "a" (0x0003));
+
+ vga_set_8font();
+}
+
+/* I/O address of the VGA CRTC */
+u16 vga_crtc(void)
+{
+ return (inb(0x3cc) & 1) ? 0x3d4 : 0x3b4;
+}
+
+static void vga_set_480_scanlines(int end)
+{
+ u16 crtc;
+ u8 csel;
+
+ crtc = vga_crtc();
+
+ out_idx(0x0c, crtc, 0x11); /* Vertical sync end, unlock CR0-7 */
+ out_idx(0x0b, crtc, 0x06); /* Vertical total */
+ out_idx(0x3e, crtc, 0x07); /* Vertical overflow */
+ out_idx(0xea, crtc, 0x10); /* Vertical sync start */
+ out_idx(end, crtc, 0x12); /* Vertical display end */
+ out_idx(0xe7, crtc, 0x15); /* Vertical blank start */
+ out_idx(0x04, crtc, 0x16); /* Vertical blank end */
+ csel = inb(0x3cc);
+ csel &= 0x0d;
+ csel |= 0xe2;
+ outb(csel, 0x3cc);
+}
+
+static void vga_set_80x30(void)
+{
+ vga_set_480_scanlines(0xdf);
+}
+
+static void vga_set_80x34(void)
+{
+ vga_set_14font();
+ vga_set_480_scanlines(0xdb);
+}
+
+static void vga_set_80x60(void)
+{
+ vga_set_8font();
+ vga_set_480_scanlines(0xdf);
+}
+
+static int vga_set_mode(struct mode_info *mode)
+{
+ /* Set the basic mode */
+ vga_set_basic_mode();
+
+ /* Override a possibly broken BIOS */
+ force_x = mode->x;
+ force_y = mode->y;
+
+ switch (mode->mode) {
+ case VIDEO_80x25:
+ break;
+ case VIDEO_8POINT:
+ vga_set_8font();
+ break;
+ case VIDEO_80x43:
+ vga_set_80x43();
+ break;
+ case VIDEO_80x28:
+ vga_set_14font();
+ break;
+ case VIDEO_80x30:
+ vga_set_80x30();
+ break;
+ case VIDEO_80x34:
+ vga_set_80x34();
+ break;
+ case VIDEO_80x60:
+ vga_set_80x60();
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Note: this probe includes basic information required by all
+ * systems. It should be executed first, by making sure
+ * video-vga.c is listed first in the Makefile.
+ */
+static int vga_probe(void)
+{
+ static const char *card_name[] = {
+ "CGA/MDA/HGC", "EGA", "VGA"
+ };
+ static struct mode_info *mode_lists[] = {
+ cga_modes,
+ ega_modes,
+ vga_modes,
+ };
+ static int mode_count[] = {
+ sizeof(cga_modes)/sizeof(struct mode_info),
+ sizeof(ega_modes)/sizeof(struct mode_info),
+ sizeof(vga_modes)/sizeof(struct mode_info),
+ };
+ u8 vga_flag;
+
+ asm(INT10
+ : "=b" (boot_params.screen_info.orig_video_ega_bx)
+ : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */
+ : "ecx", "edx", "esi", "edi");
+
+ /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
+ if ((u8)boot_params.screen_info.orig_video_ega_bx != 0x10) {
+ /* EGA/VGA */
+ asm(INT10
+ : "=a" (vga_flag)
+ : "a" (0x1a00)
+ : "ebx", "ecx", "edx", "esi", "edi");
+
+ if (vga_flag == 0x1a) {
+ adapter = ADAPTER_VGA;
+ boot_params.screen_info.orig_video_isVGA = 1;
+ } else {
+ adapter = ADAPTER_EGA;
+ }
+ } else {
+ adapter = ADAPTER_CGA;
+ }
+
+ video_vga.modes = mode_lists[adapter];
+ video_vga.card_name = card_name[adapter];
+ return mode_count[adapter];
+}
+
+__videocard video_vga =
+{
+ .card_name = "VGA",
+ .probe = vga_probe,
+ .set_mode = vga_set_mode,
+};
diff --git a/arch/i386/boot/video.S b/arch/i386/boot/video.S
deleted file mode 100644
index 8143c9516cb4..000000000000
--- a/arch/i386/boot/video.S
+++ /dev/null
@@ -1,2043 +0,0 @@
-/* video.S
- *
- * Display adapter & video mode setup, version 2.13 (14-May-99)
- *
- * Copyright (C) 1995 -- 1998 Martin Mares <mj@ucw.cz>
- * Based on the original setup.S code (C) Linus Torvalds and Mats Anderson
- *
- * Rewritten to use GNU 'as' by Chris Noe <stiker@northlink.com> May 1999
- *
- * For further information, look at Documentation/svga.txt.
- *
- */
-
-/* Enable autodetection of SVGA adapters and modes. */
-#undef CONFIG_VIDEO_SVGA
-
-/* Enable autodetection of VESA modes */
-#define CONFIG_VIDEO_VESA
-
-/* Enable compacting of mode table */
-#define CONFIG_VIDEO_COMPACT
-
-/* Retain screen contents when switching modes */
-#define CONFIG_VIDEO_RETAIN
-
-/* Enable local mode list */
-#undef CONFIG_VIDEO_LOCAL
-
-/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */
-#undef CONFIG_VIDEO_400_HACK
-
-/* Hack that lets you force specific BIOS mode ID and specific dimensions */
-#undef CONFIG_VIDEO_GFX_HACK
-#define VIDEO_GFX_BIOS_AX 0x4f02 /* 800x600 on ThinkPad */
-#define VIDEO_GFX_BIOS_BX 0x0102
-#define VIDEO_GFX_DUMMY_RESOLUTION 0x6425 /* 100x37 */
-
-/* This code uses an extended set of video mode numbers. These include:
- * Aliases for standard modes
- * NORMAL_VGA (-1)
- * EXTENDED_VGA (-2)
- * ASK_VGA (-3)
- * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
- * of compatibility when extending the table. These are between 0x00 and 0xff.
- */
-#define VIDEO_FIRST_MENU 0x0000
-
-/* Standard BIOS video modes (BIOS number + 0x0100) */
-#define VIDEO_FIRST_BIOS 0x0100
-
-/* VESA BIOS video modes (VESA number + 0x0200) */
-#define VIDEO_FIRST_VESA 0x0200
-
-/* Video7 special modes (BIOS number + 0x0900) */
-#define VIDEO_FIRST_V7 0x0900
-
-/* Special video modes */
-#define VIDEO_FIRST_SPECIAL 0x0f00
-#define VIDEO_80x25 0x0f00
-#define VIDEO_8POINT 0x0f01
-#define VIDEO_80x43 0x0f02
-#define VIDEO_80x28 0x0f03
-#define VIDEO_CURRENT_MODE 0x0f04
-#define VIDEO_80x30 0x0f05
-#define VIDEO_80x34 0x0f06
-#define VIDEO_80x60 0x0f07
-#define VIDEO_GFX_HACK 0x0f08
-#define VIDEO_LAST_SPECIAL 0x0f09
-
-/* Video modes given by resolution */
-#define VIDEO_FIRST_RESOLUTION 0x1000
-
-/* The "recalculate timings" flag */
-#define VIDEO_RECALC 0x8000
-
-/* Positions of various video parameters passed to the kernel */
-/* (see also include/linux/tty.h) */
-#define PARAM_CURSOR_POS 0x00
-#define PARAM_VIDEO_PAGE 0x04
-#define PARAM_VIDEO_MODE 0x06
-#define PARAM_VIDEO_COLS 0x07
-#define PARAM_VIDEO_EGA_BX 0x0a
-#define PARAM_VIDEO_LINES 0x0e
-#define PARAM_HAVE_VGA 0x0f
-#define PARAM_FONT_POINTS 0x10
-
-#define PARAM_LFB_WIDTH 0x12
-#define PARAM_LFB_HEIGHT 0x14
-#define PARAM_LFB_DEPTH 0x16
-#define PARAM_LFB_BASE 0x18
-#define PARAM_LFB_SIZE 0x1c
-#define PARAM_LFB_LINELENGTH 0x24
-#define PARAM_LFB_COLORS 0x26
-#define PARAM_VESAPM_SEG 0x2e
-#define PARAM_VESAPM_OFF 0x30
-#define PARAM_LFB_PAGES 0x32
-#define PARAM_VESA_ATTRIB 0x34
-#define PARAM_CAPABILITIES 0x36
-
-/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
-#ifdef CONFIG_VIDEO_RETAIN
-#define DO_STORE call store_screen
-#else
-#define DO_STORE
-#endif /* CONFIG_VIDEO_RETAIN */
-
-# This is the main entry point called by setup.S
-# %ds *must* be pointing to the bootsector
-video: pushw %ds # We use different segments
- pushw %ds # FS contains original DS
- popw %fs
- pushw %cs # DS is equal to CS
- popw %ds
- pushw %cs # ES is equal to CS
- popw %es
- xorw %ax, %ax
- movw %ax, %gs # GS is zero
- cld
- call basic_detect # Basic adapter type testing (EGA/VGA/MDA/CGA)
-#ifdef CONFIG_VIDEO_SELECT
- movw %fs:(0x01fa), %ax # User selected video mode
- cmpw $ASK_VGA, %ax # Bring up the menu
- jz vid2
-
- call mode_set # Set the mode
- jc vid1
-
- leaw badmdt, %si # Invalid mode ID
- call prtstr
-vid2: call mode_menu
-vid1:
-#ifdef CONFIG_VIDEO_RETAIN
- call restore_screen # Restore screen contents
-#endif /* CONFIG_VIDEO_RETAIN */
- call store_edid
-#endif /* CONFIG_VIDEO_SELECT */
- call mode_params # Store mode parameters
- popw %ds # Restore original DS
- ret
-
-# Detect if we have CGA, MDA, EGA or VGA and pass it to the kernel.
-basic_detect:
- movb $0, %fs:(PARAM_HAVE_VGA)
- movb $0x12, %ah # Check EGA/VGA
- movb $0x10, %bl
- int $0x10
- movw %bx, %fs:(PARAM_VIDEO_EGA_BX) # Identifies EGA to the kernel
- cmpb $0x10, %bl # No, it's a CGA/MDA/HGA card.
- je basret
-
- incb adapter
- movw $0x1a00, %ax # Check EGA or VGA?
- int $0x10
- cmpb $0x1a, %al # 1a means VGA...
- jne basret # anything else is EGA.
-
- incb %fs:(PARAM_HAVE_VGA) # We've detected a VGA
- incb adapter
-basret: ret
-
-# Store the video mode parameters for later usage by the kernel.
-# This is done by asking the BIOS except for the rows/columns
-# parameters in the default 80x25 mode -- these are set directly,
-# because some very obscure BIOSes supply insane values.
-mode_params:
-#ifdef CONFIG_VIDEO_SELECT
- cmpb $0, graphic_mode
- jnz mopar_gr
-#endif
- movb $0x03, %ah # Read cursor position
- xorb %bh, %bh
- int $0x10
- movw %dx, %fs:(PARAM_CURSOR_POS)
- movb $0x0f, %ah # Read page/mode/width
- int $0x10
- movw %bx, %fs:(PARAM_VIDEO_PAGE)
- movw %ax, %fs:(PARAM_VIDEO_MODE) # Video mode and screen width
- cmpb $0x7, %al # MDA/HGA => segment differs
- jnz mopar0
-
- movw $0xb000, video_segment
-mopar0: movw %gs:(0x485), %ax # Font size
- movw %ax, %fs:(PARAM_FONT_POINTS) # (valid only on EGA/VGA)
- movw force_size, %ax # Forced size?
- orw %ax, %ax
- jz mopar1
-
- movb %ah, %fs:(PARAM_VIDEO_COLS)
- movb %al, %fs:(PARAM_VIDEO_LINES)
- ret
-
-mopar1: movb $25, %al
- cmpb $0, adapter # If we are on CGA/MDA/HGA, the
- jz mopar2 # screen must have 25 lines.
-
- movb %gs:(0x484), %al # On EGA/VGA, use the EGA+ BIOS
- incb %al # location of max lines.
-mopar2: movb %al, %fs:(PARAM_VIDEO_LINES)
- ret
-
-#ifdef CONFIG_VIDEO_SELECT
-# Fetching of VESA frame buffer parameters
-mopar_gr:
- leaw modelist+1024, %di
- movb $0x23, %fs:(PARAM_HAVE_VGA)
- movw 16(%di), %ax
- movw %ax, %fs:(PARAM_LFB_LINELENGTH)
- movw 18(%di), %ax
- movw %ax, %fs:(PARAM_LFB_WIDTH)
- movw 20(%di), %ax
- movw %ax, %fs:(PARAM_LFB_HEIGHT)
- movb 25(%di), %al
- movb $0, %ah
- movw %ax, %fs:(PARAM_LFB_DEPTH)
- movb 29(%di), %al
- movb $0, %ah
- movw %ax, %fs:(PARAM_LFB_PAGES)
- movl 40(%di), %eax
- movl %eax, %fs:(PARAM_LFB_BASE)
- movl 31(%di), %eax
- movl %eax, %fs:(PARAM_LFB_COLORS)
- movl 35(%di), %eax
- movl %eax, %fs:(PARAM_LFB_COLORS+4)
- movw 0(%di), %ax
- movw %ax, %fs:(PARAM_VESA_ATTRIB)
-
-# get video mem size
- leaw modelist+1024, %di
- movw $0x4f00, %ax
- int $0x10
- xorl %eax, %eax
- movw 18(%di), %ax
- movl %eax, %fs:(PARAM_LFB_SIZE)
-
-# store mode capabilities
- movl 10(%di), %eax
- movl %eax, %fs:(PARAM_CAPABILITIES)
-
-# switching the DAC to 8-bit is for <= 8 bpp only
- movw %fs:(PARAM_LFB_DEPTH), %ax
- cmpw $8, %ax
- jg dac_done
-
-# get DAC switching capability
- xorl %eax, %eax
- movb 10(%di), %al
- testb $1, %al
- jz dac_set
-
-# attempt to switch DAC to 8-bit
- movw $0x4f08, %ax
- movw $0x0800, %bx
- int $0x10
- cmpw $0x004f, %ax
- jne dac_set
- movb %bh, dac_size # store actual DAC size
-
-dac_set:
-# set color size to DAC size
- movb dac_size, %al
- movb %al, %fs:(PARAM_LFB_COLORS+0)
- movb %al, %fs:(PARAM_LFB_COLORS+2)
- movb %al, %fs:(PARAM_LFB_COLORS+4)
- movb %al, %fs:(PARAM_LFB_COLORS+6)
-
-# set color offsets to 0
- movb $0, %fs:(PARAM_LFB_COLORS+1)
- movb $0, %fs:(PARAM_LFB_COLORS+3)
- movb $0, %fs:(PARAM_LFB_COLORS+5)
- movb $0, %fs:(PARAM_LFB_COLORS+7)
-
-dac_done:
-# get protected mode interface informations
- movw $0x4f0a, %ax
- xorw %bx, %bx
- xorw %di, %di
- int $0x10
- cmp $0x004f, %ax
- jnz no_pm
-
- movw %es, %fs:(PARAM_VESAPM_SEG)
- movw %di, %fs:(PARAM_VESAPM_OFF)
-no_pm: ret
-
-# The video mode menu
-mode_menu:
- leaw keymsg, %si # "Return/Space/Timeout" message
- call prtstr
- call flush
-nokey: call getkt
-
- cmpb $0x0d, %al # ENTER ?
- je listm # yes - manual mode selection
-
- cmpb $0x20, %al # SPACE ?
- je defmd1 # no - repeat
-
- call beep
- jmp nokey
-
-defmd1: ret # No mode chosen? Default 80x25
-
-listm: call mode_table # List mode table
-listm0: leaw name_bann, %si # Print adapter name
- call prtstr
- movw card_name, %si
- orw %si, %si
- jnz an2
-
- movb adapter, %al
- leaw old_name, %si
- orb %al, %al
- jz an1
-
- leaw ega_name, %si
- decb %al
- jz an1
-
- leaw vga_name, %si
- jmp an1
-
-an2: call prtstr
- leaw svga_name, %si
-an1: call prtstr
- leaw listhdr, %si # Table header
- call prtstr
- movb $0x30, %dl # DL holds mode number
- leaw modelist, %si
-lm1: cmpw $ASK_VGA, (%si) # End?
- jz lm2
-
- movb %dl, %al # Menu selection number
- call prtchr
- call prtsp2
- lodsw
- call prthw # Mode ID
- call prtsp2
- movb 0x1(%si), %al
- call prtdec # Rows
- movb $0x78, %al # the letter 'x'
- call prtchr
- lodsw
- call prtdec # Columns
- movb $0x0d, %al # New line
- call prtchr
- movb $0x0a, %al
- call prtchr
- incb %dl # Next character
- cmpb $0x3a, %dl
- jnz lm1
-
- movb $0x61, %dl
- jmp lm1
-
-lm2: leaw prompt, %si # Mode prompt
- call prtstr
- leaw edit_buf, %di # Editor buffer
-lm3: call getkey
- cmpb $0x0d, %al # Enter?
- jz lment
-
- cmpb $0x08, %al # Backspace?
- jz lmbs
-
- cmpb $0x20, %al # Printable?
- jc lm3
-
- cmpw $edit_buf+4, %di # Enough space?
- jz lm3
-
- stosb
- call prtchr
- jmp lm3
-
-lmbs: cmpw $edit_buf, %di # Backspace
- jz lm3
-
- decw %di
- movb $0x08, %al
- call prtchr
- call prtspc
- movb $0x08, %al
- call prtchr
- jmp lm3
-
-lment: movb $0, (%di)
- leaw crlft, %si
- call prtstr
- leaw edit_buf, %si
- cmpb $0, (%si) # Empty string = default mode
- jz lmdef
-
- cmpb $0, 1(%si) # One character = menu selection
- jz mnusel
-
- cmpw $0x6373, (%si) # "scan" => mode scanning
- jnz lmhx
-
- cmpw $0x6e61, 2(%si)
- jz lmscan
-
-lmhx: xorw %bx, %bx # Else => mode ID in hex
-lmhex: lodsb
- orb %al, %al
- jz lmuse1
-
- subb $0x30, %al
- jc lmbad
-
- cmpb $10, %al
- jc lmhx1
-
- subb $7, %al
- andb $0xdf, %al
- cmpb $10, %al
- jc lmbad
-
- cmpb $16, %al
- jnc lmbad
-
-lmhx1: shlw $4, %bx
- orb %al, %bl
- jmp lmhex
-
-lmuse1: movw %bx, %ax
- jmp lmuse
-
-mnusel: lodsb # Menu selection
- xorb %ah, %ah
- subb $0x30, %al
- jc lmbad
-
- cmpb $10, %al
- jc lmuse
-
- cmpb $0x61-0x30, %al
- jc lmbad
-
- subb $0x61-0x30-10, %al
- cmpb $36, %al
- jnc lmbad
-
-lmuse: call mode_set
- jc lmdef
-
-lmbad: leaw unknt, %si
- call prtstr
- jmp lm2
-lmscan: cmpb $0, adapter # Scanning only on EGA/VGA
- jz lmbad
-
- movw $0, mt_end # Scanning of modes is
- movb $1, scanning # done as new autodetection.
- call mode_table
- jmp listm0
-lmdef: ret
-
-# Additional parts of mode_set... (relative jumps, you know)
-setv7: # Video7 extended modes
- DO_STORE
- subb $VIDEO_FIRST_V7>>8, %bh
- movw $0x6f05, %ax
- int $0x10
- stc
- ret
-
-_setrec: jmp setrec # Ugly...
-_set_80x25: jmp set_80x25
-
-# Aliases for backward compatibility.
-setalias:
- movw $VIDEO_80x25, %ax
- incw %bx
- jz mode_set
-
- movb $VIDEO_8POINT-VIDEO_FIRST_SPECIAL, %al
- incw %bx
- jnz setbad # Fall-through!
-
-# Setting of user mode (AX=mode ID) => CF=success
-mode_set:
- movw %ax, %fs:(0x01fa) # Store mode for use in acpi_wakeup.S
- movw %ax, %bx
- cmpb $0xff, %ah
- jz setalias
-
- testb $VIDEO_RECALC>>8, %ah
- jnz _setrec
-
- cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah
- jnc setres
-
- cmpb $VIDEO_FIRST_SPECIAL>>8, %ah
- jz setspc
-
- cmpb $VIDEO_FIRST_V7>>8, %ah
- jz setv7
-
- cmpb $VIDEO_FIRST_VESA>>8, %ah
- jnc check_vesa
-
- orb %ah, %ah
- jz setmenu
-
- decb %ah
- jz setbios
-
-setbad: clc
- movb $0, do_restore # The screen needn't be restored
- ret
-
-setvesa:
- DO_STORE
- subb $VIDEO_FIRST_VESA>>8, %bh
- movw $0x4f02, %ax # VESA BIOS mode set call
- int $0x10
- cmpw $0x004f, %ax # AL=4f if implemented
- jnz setbad # AH=0 if OK
-
- stc
- ret
-
-setbios:
- DO_STORE
- int $0x10 # Standard BIOS mode set call
- pushw %bx
- movb $0x0f, %ah # Check if really set
- int $0x10
- popw %bx
- cmpb %bl, %al
- jnz setbad
-
- stc
- ret
-
-setspc: xorb %bh, %bh # Set special mode
- cmpb $VIDEO_LAST_SPECIAL-VIDEO_FIRST_SPECIAL, %bl
- jnc setbad
-
- addw %bx, %bx
- jmp *spec_inits(%bx)
-
-setmenu:
- orb %al, %al # 80x25 is an exception
- jz _set_80x25
-
- pushw %bx # Set mode chosen from menu
- call mode_table # Build the mode table
- popw %ax
- shlw $2, %ax
- addw %ax, %si
- cmpw %di, %si
- jnc setbad
-
- movw (%si), %ax # Fetch mode ID
-_m_s: jmp mode_set
-
-setres: pushw %bx # Set mode chosen by resolution
- call mode_table
- popw %bx
- xchgb %bl, %bh
-setr1: lodsw
- cmpw $ASK_VGA, %ax # End of the list?
- jz setbad
-
- lodsw
- cmpw %bx, %ax
- jnz setr1
-
- movw -4(%si), %ax # Fetch mode ID
- jmp _m_s
-
-check_vesa:
-#ifdef CONFIG_FIRMWARE_EDID
- leaw modelist+1024, %di
- movw $0x4f00, %ax
- int $0x10
- cmpw $0x004f, %ax
- jnz setbad
-
- movw 4(%di), %ax
- movw %ax, vbe_version
-#endif
- leaw modelist+1024, %di
- subb $VIDEO_FIRST_VESA>>8, %bh
- movw %bx, %cx # Get mode information structure
- movw $0x4f01, %ax
- int $0x10
- addb $VIDEO_FIRST_VESA>>8, %bh
- cmpw $0x004f, %ax
- jnz setbad
-
- movb (%di), %al # Check capabilities.
- andb $0x19, %al
- cmpb $0x09, %al
- jz setvesa # This is a text mode
-
- movb (%di), %al # Check capabilities.
- andb $0x99, %al
- cmpb $0x99, %al
- jnz _setbad # Doh! No linear frame buffer.
-
- subb $VIDEO_FIRST_VESA>>8, %bh
- orw $0x4000, %bx # Use linear frame buffer
- movw $0x4f02, %ax # VESA BIOS mode set call
- int $0x10
- cmpw $0x004f, %ax # AL=4f if implemented
- jnz _setbad # AH=0 if OK
-
- movb $1, graphic_mode # flag graphic mode
- movb $0, do_restore # no screen restore
- stc
- ret
-
-_setbad: jmp setbad # Ugly...
-
-# Recalculate vertical display end registers -- this fixes various
-# inconsistencies of extended modes on many adapters. Called when
-# the VIDEO_RECALC flag is set in the mode ID.
-
-setrec: subb $VIDEO_RECALC>>8, %ah # Set the base mode
- call mode_set
- jnc rct3
-
- movw %gs:(0x485), %ax # Font size in pixels
- movb %gs:(0x484), %bl # Number of rows
- incb %bl
- mulb %bl # Number of visible
- decw %ax # scan lines - 1
- movw $0x3d4, %dx
- movw %ax, %bx
- movb $0x12, %al # Lower 8 bits
- movb %bl, %ah
- outw %ax, %dx
- movb $0x07, %al # Bits 8 and 9 in the overflow register
- call inidx
- xchgb %al, %ah
- andb $0xbd, %ah
- shrb %bh
- jnc rct1
- orb $0x02, %ah
-rct1: shrb %bh
- jnc rct2
- orb $0x40, %ah
-rct2: movb $0x07, %al
- outw %ax, %dx
- stc
-rct3: ret
-
-# Table of routines for setting of the special modes.
-spec_inits:
- .word set_80x25
- .word set_8pixel
- .word set_80x43
- .word set_80x28
- .word set_current
- .word set_80x30
- .word set_80x34
- .word set_80x60
- .word set_gfx
-
-# Set the 80x25 mode. If already set, do nothing.
-set_80x25:
- movw $0x5019, force_size # Override possibly broken BIOS
-use_80x25:
-#ifdef CONFIG_VIDEO_400_HACK
- movw $0x1202, %ax # Force 400 scan lines
- movb $0x30, %bl
- int $0x10
-#else
- movb $0x0f, %ah # Get current mode ID
- int $0x10
- cmpw $0x5007, %ax # Mode 7 (80x25 mono) is the only one available
- jz st80 # on CGA/MDA/HGA and is also available on EGAM
-
- cmpw $0x5003, %ax # Unknown mode, force 80x25 color
- jnz force3
-
-st80: cmpb $0, adapter # CGA/MDA/HGA => mode 3/7 is always 80x25
- jz set80
-
- movb %gs:(0x0484), %al # This is EGA+ -- beware of 80x50 etc.
- orb %al, %al # Some buggy BIOS'es set 0 rows
- jz set80
-
- cmpb $24, %al # It's hopefully correct
- jz set80
-#endif /* CONFIG_VIDEO_400_HACK */
-force3: DO_STORE
- movw $0x0003, %ax # Forced set
- int $0x10
-set80: stc
- ret
-
-# Set the 80x50/80x43 8-pixel mode. Simple BIOS calls.
-set_8pixel:
- DO_STORE
- call use_80x25 # The base is 80x25
-set_8pt:
- movw $0x1112, %ax # Use 8x8 font
- xorb %bl, %bl
- int $0x10
- movw $0x1200, %ax # Use alternate print screen
- movb $0x20, %bl
- int $0x10
- movw $0x1201, %ax # Turn off cursor emulation
- movb $0x34, %bl
- int $0x10
- movb $0x01, %ah # Define cursor scan lines 6-7
- movw $0x0607, %cx
- int $0x10
-set_current:
- stc
- ret
-
-# Set the 80x28 mode. This mode works on all VGA's, because it's a standard
-# 80x25 mode with 14-point fonts instead of 16-point.
-set_80x28:
- DO_STORE
- call use_80x25 # The base is 80x25
-set14: movw $0x1111, %ax # Use 9x14 font
- xorb %bl, %bl
- int $0x10
- movb $0x01, %ah # Define cursor scan lines 11-12
- movw $0x0b0c, %cx
- int $0x10
- stc
- ret
-
-# Set the 80x43 mode. This mode is works on all VGA's.
-# It's a 350-scanline mode with 8-pixel font.
-set_80x43:
- DO_STORE
- movw $0x1201, %ax # Set 350 scans
- movb $0x30, %bl
- int $0x10
- movw $0x0003, %ax # Reset video mode
- int $0x10
- jmp set_8pt # Use 8-pixel font
-
-# Set the 80x30 mode (all VGA's). 480 scanlines, 16-pixel font.
-set_80x30:
- call use_80x25 # Start with real 80x25
- DO_STORE
- movw $0x3cc, %dx # Get CRTC port
- inb %dx, %al
- movb $0xd4, %dl
- rorb %al # Mono or color?
- jc set48a
-
- movb $0xb4, %dl
-set48a: movw $0x0c11, %ax # Vertical sync end (also unlocks CR0-7)
- call outidx
- movw $0x0b06, %ax # Vertical total
- call outidx
- movw $0x3e07, %ax # (Vertical) overflow
- call outidx
- movw $0xea10, %ax # Vertical sync start
- call outidx
- movw $0xdf12, %ax # Vertical display end
- call outidx
- movw $0xe715, %ax # Vertical blank start
- call outidx
- movw $0x0416, %ax # Vertical blank end
- call outidx
- pushw %dx
- movb $0xcc, %dl # Misc output register (read)
- inb %dx, %al
- movb $0xc2, %dl # (write)
- andb $0x0d, %al # Preserve clock select bits and color bit
- orb $0xe2, %al # Set correct sync polarity
- outb %al, %dx
- popw %dx
- movw $0x501e, force_size
- stc # That's all.
- ret
-
-# Set the 80x34 mode (all VGA's). 480 scans, 14-pixel font.
-set_80x34:
- call set_80x30 # Set 480 scans
- call set14 # And 14-pt font
- movw $0xdb12, %ax # VGA vertical display end
- movw $0x5022, force_size
-setvde: call outidx
- stc
- ret
-
-# Set the 80x60 mode (all VGA's). 480 scans, 8-pixel font.
-set_80x60:
- call set_80x30 # Set 480 scans
- call set_8pt # And 8-pt font
- movw $0xdf12, %ax # VGA vertical display end
- movw $0x503c, force_size
- jmp setvde
-
-# Special hack for ThinkPad graphics
-set_gfx:
-#ifdef CONFIG_VIDEO_GFX_HACK
- movw $VIDEO_GFX_BIOS_AX, %ax
- movw $VIDEO_GFX_BIOS_BX, %bx
- int $0x10
- movw $VIDEO_GFX_DUMMY_RESOLUTION, force_size
- stc
-#endif
- ret
-
-#ifdef CONFIG_VIDEO_RETAIN
-
-# Store screen contents to temporary buffer.
-store_screen:
- cmpb $0, do_restore # Already stored?
- jnz stsr
-
- testb $CAN_USE_HEAP, loadflags # Have we space for storing?
- jz stsr
-
- pushw %ax
- pushw %bx
- pushw force_size # Don't force specific size
- movw $0, force_size
- call mode_params # Obtain params of current mode
- popw force_size
- movb %fs:(PARAM_VIDEO_LINES), %ah
- movb %fs:(PARAM_VIDEO_COLS), %al
- movw %ax, %bx # BX=dimensions
- mulb %ah
- movw %ax, %cx # CX=number of characters
- addw %ax, %ax # Calculate image size
- addw $modelist+1024+4, %ax
- cmpw heap_end_ptr, %ax
- jnc sts1 # Unfortunately, out of memory
-
- movw %fs:(PARAM_CURSOR_POS), %ax # Store mode params
- leaw modelist+1024, %di
- stosw
- movw %bx, %ax
- stosw
- pushw %ds # Store the screen
- movw video_segment, %ds
- xorw %si, %si
- rep
- movsw
- popw %ds
- incb do_restore # Screen will be restored later
-sts1: popw %bx
- popw %ax
-stsr: ret
-
-# Restore screen contents from temporary buffer.
-restore_screen:
- cmpb $0, do_restore # Has the screen been stored?
- jz res1
-
- call mode_params # Get parameters of current mode
- movb %fs:(PARAM_VIDEO_LINES), %cl
- movb %fs:(PARAM_VIDEO_COLS), %ch
- leaw modelist+1024, %si # Screen buffer
- lodsw # Set cursor position
- movw %ax, %dx
- cmpb %cl, %dh
- jc res2
-
- movb %cl, %dh
- decb %dh
-res2: cmpb %ch, %dl
- jc res3
-
- movb %ch, %dl
- decb %dl
-res3: movb $0x02, %ah
- movb $0x00, %bh
- int $0x10
- lodsw # Display size
- movb %ah, %dl # DL=number of lines
- movb $0, %ah # BX=phys. length of orig. line
- movw %ax, %bx
- cmpb %cl, %dl # Too many?
- jc res4
-
- pushw %ax
- movb %dl, %al
- subb %cl, %al
- mulb %bl
- addw %ax, %si
- addw %ax, %si
- popw %ax
- movb %cl, %dl
-res4: cmpb %ch, %al # Too wide?
- jc res5
-
- movb %ch, %al # AX=width of src. line
-res5: movb $0, %cl
- xchgb %ch, %cl
- movw %cx, %bp # BP=width of dest. line
- pushw %es
- movw video_segment, %es
- xorw %di, %di # Move the data
- addw %bx, %bx # Convert BX and BP to _bytes_
- addw %bp, %bp
-res6: pushw %si
- pushw %di
- movw %ax, %cx
- rep
- movsw
- popw %di
- popw %si
- addw %bp, %di
- addw %bx, %si
- decb %dl
- jnz res6
-
- popw %es # Done
-res1: ret
-#endif /* CONFIG_VIDEO_RETAIN */
-
-# Write to indexed VGA register (AL=index, AH=data, DX=index reg. port)
-outidx: outb %al, %dx
- pushw %ax
- movb %ah, %al
- incw %dx
- outb %al, %dx
- decw %dx
- popw %ax
- ret
-
-# Build the table of video modes (stored after the setup.S code at the
-# `modelist' label. Each video mode record looks like:
-# .word MODE-ID (our special mode ID (see above))
-# .byte rows (number of rows)
-# .byte columns (number of columns)
-# Returns address of the end of the table in DI, the end is marked
-# with a ASK_VGA ID.
-mode_table:
- movw mt_end, %di # Already filled?
- orw %di, %di
- jnz mtab1x
-
- leaw modelist, %di # Store standard modes:
- movl $VIDEO_80x25 + 0x50190000, %eax # The 80x25 mode (ALL)
- stosl
- movb adapter, %al # CGA/MDA/HGA -- no more modes
- orb %al, %al
- jz mtabe
-
- decb %al
- jnz mtabv
-
- movl $VIDEO_8POINT + 0x502b0000, %eax # The 80x43 EGA mode
- stosl
- jmp mtabe
-
-mtab1x: jmp mtab1
-
-mtabv: leaw vga_modes, %si # All modes for std VGA
- movw $vga_modes_end-vga_modes, %cx
- rep # I'm unable to use movsw as I don't know how to store a half
- movsb # of the expression above to cx without using explicit shr.
-
- cmpb $0, scanning # Mode scan requested?
- jz mscan1
-
- call mode_scan
-mscan1:
-
-#ifdef CONFIG_VIDEO_LOCAL
- call local_modes
-#endif /* CONFIG_VIDEO_LOCAL */
-
-#ifdef CONFIG_VIDEO_VESA
- call vesa_modes # Detect VESA VGA modes
-#endif /* CONFIG_VIDEO_VESA */
-
-#ifdef CONFIG_VIDEO_SVGA
- cmpb $0, scanning # Bypass when scanning
- jnz mscan2
-
- call svga_modes # Detect SVGA cards & modes
-mscan2:
-#endif /* CONFIG_VIDEO_SVGA */
-
-mtabe:
-
-#ifdef CONFIG_VIDEO_COMPACT
- leaw modelist, %si
- movw %di, %dx
- movw %si, %di
-cmt1: cmpw %dx, %si # Scan all modes
- jz cmt2
-
- leaw modelist, %bx # Find in previous entries
- movw 2(%si), %cx
-cmt3: cmpw %bx, %si
- jz cmt4
-
- cmpw 2(%bx), %cx # Found => don't copy this entry
- jz cmt5
-
- addw $4, %bx
- jmp cmt3
-
-cmt4: movsl # Copy entry
- jmp cmt1
-
-cmt5: addw $4, %si # Skip entry
- jmp cmt1
-
-cmt2:
-#endif /* CONFIG_VIDEO_COMPACT */
-
- movw $ASK_VGA, (%di) # End marker
- movw %di, mt_end
-mtab1: leaw modelist, %si # SI=mode list, DI=list end
-ret0: ret
-
-# Modes usable on all standard VGAs
-vga_modes:
- .word VIDEO_8POINT
- .word 0x5032 # 80x50
- .word VIDEO_80x43
- .word 0x502b # 80x43
- .word VIDEO_80x28
- .word 0x501c # 80x28
- .word VIDEO_80x30
- .word 0x501e # 80x30
- .word VIDEO_80x34
- .word 0x5022 # 80x34
- .word VIDEO_80x60
- .word 0x503c # 80x60
-#ifdef CONFIG_VIDEO_GFX_HACK
- .word VIDEO_GFX_HACK
- .word VIDEO_GFX_DUMMY_RESOLUTION
-#endif
-
-vga_modes_end:
-# Detect VESA modes.
-
-#ifdef CONFIG_VIDEO_VESA
-vesa_modes:
- cmpb $2, adapter # VGA only
- jnz ret0
-
- movw %di, %bp # BP=original mode table end
- addw $0x200, %di # Buffer space
- movw $0x4f00, %ax # VESA Get card info call
- int $0x10
- movw %bp, %di
- cmpw $0x004f, %ax # Successful?
- jnz ret0
-
- cmpw $0x4556, 0x200(%di)
- jnz ret0
-
- cmpw $0x4153, 0x202(%di)
- jnz ret0
-
- movw $vesa_name, card_name # Set name to "VESA VGA"
- pushw %gs
- lgsw 0x20e(%di), %si # GS:SI=mode list
- movw $128, %cx # Iteration limit
-vesa1:
-# gas version 2.9.1, using BFD version 2.9.1.0.23 buggers the next inst.
-# XXX: lodsw %gs:(%si), %ax # Get next mode in the list
- gs; lodsw
- cmpw $0xffff, %ax # End of the table?
- jz vesar
-
- cmpw $0x0080, %ax # Check validity of mode ID
- jc vesa2
-
- orb %ah, %ah # Valid IDs: 0x0000-0x007f/0x0100-0x07ff
- jz vesan # Certain BIOSes report 0x80-0xff!
-
- cmpw $0x0800, %ax
- jnc vesae
-
-vesa2: pushw %cx
- movw %ax, %cx # Get mode information structure
- movw $0x4f01, %ax
- int $0x10
- movw %cx, %bx # BX=mode number
- addb $VIDEO_FIRST_VESA>>8, %bh
- popw %cx
- cmpw $0x004f, %ax
- jnz vesan # Don't report errors (buggy BIOSES)
-
- movb (%di), %al # Check capabilities. We require
- andb $0x19, %al # a color text mode.
- cmpb $0x09, %al
- jnz vesan
-
- cmpw $0xb800, 8(%di) # Standard video memory address required
- jnz vesan
-
- testb $2, (%di) # Mode characteristics supplied?
- movw %bx, (%di) # Store mode number
- jz vesa3
-
- xorw %dx, %dx
- movw 0x12(%di), %bx # Width
- orb %bh, %bh
- jnz vesan
-
- movb %bl, 0x3(%di)
- movw 0x14(%di), %ax # Height
- orb %ah, %ah
- jnz vesan
-
- movb %al, 2(%di)
- mulb %bl
- cmpw $8193, %ax # Small enough for Linux console driver?
- jnc vesan
-
- jmp vesaok
-
-vesa3: subw $0x8108, %bx # This mode has no detailed info specified,
- jc vesan # so it must be a standard VESA mode.
-
- cmpw $5, %bx
- jnc vesan
-
- movw vesa_text_mode_table(%bx), %ax
- movw %ax, 2(%di)
-vesaok: addw $4, %di # The mode is valid. Store it.
-vesan: loop vesa1 # Next mode. Limit exceeded => error
-vesae: leaw vesaer, %si
- call prtstr
- movw %bp, %di # Discard already found modes.
-vesar: popw %gs
- ret
-
-# Dimensions of standard VESA text modes
-vesa_text_mode_table:
- .byte 60, 80 # 0108
- .byte 25, 132 # 0109
- .byte 43, 132 # 010A
- .byte 50, 132 # 010B
- .byte 60, 132 # 010C
-#endif /* CONFIG_VIDEO_VESA */
-
-# Scan for video modes. A bit dirty, but should work.
-mode_scan:
- movw $0x0100, %cx # Start with mode 0
-scm1: movb $0, %ah # Test the mode
- movb %cl, %al
- int $0x10
- movb $0x0f, %ah
- int $0x10
- cmpb %cl, %al
- jnz scm2 # Mode not set
-
- movw $0x3c0, %dx # Test if it's a text mode
- movb $0x10, %al # Mode bits
- call inidx
- andb $0x03, %al
- jnz scm2
-
- movb $0xce, %dl # Another set of mode bits
- movb $0x06, %al
- call inidx
- shrb %al
- jc scm2
-
- movb $0xd4, %dl # Cursor location
- movb $0x0f, %al
- call inidx
- orb %al, %al
- jnz scm2
-
- movw %cx, %ax # Ok, store the mode
- stosw
- movb %gs:(0x484), %al # Number of rows
- incb %al
- stosb
- movw %gs:(0x44a), %ax # Number of columns
- stosb
-scm2: incb %cl
- jns scm1
-
- movw $0x0003, %ax # Return back to mode 3
- int $0x10
- ret
-
-tstidx: outw %ax, %dx # OUT DX,AX and inidx
-inidx: outb %al, %dx # Read from indexed VGA register
- incw %dx # AL=index, DX=index reg port -> AL=data
- inb %dx, %al
- decw %dx
- ret
-
-# Try to detect type of SVGA card and supply (usually approximate) video
-# mode table for it.
-
-#ifdef CONFIG_VIDEO_SVGA
-svga_modes:
- leaw svga_table, %si # Test all known SVGA adapters
-dosvga: lodsw
- movw %ax, %bp # Default mode table
- orw %ax, %ax
- jz didsv1
-
- lodsw # Pointer to test routine
- pushw %si
- pushw %di
- pushw %es
- movw $0xc000, %bx
- movw %bx, %es
- call *%ax # Call test routine
- popw %es
- popw %di
- popw %si
- orw %bp, %bp
- jz dosvga
-
- movw %bp, %si # Found, copy the modes
- movb svga_prefix, %ah
-cpsvga: lodsb
- orb %al, %al
- jz didsv
-
- stosw
- movsw
- jmp cpsvga
-
-didsv: movw %si, card_name # Store pointer to card name
-didsv1: ret
-
-# Table of all known SVGA cards. For each card, we store a pointer to
-# a table of video modes supported by the card and a pointer to a routine
-# used for testing of presence of the card. The video mode table is always
-# followed by the name of the card or the chipset.
-svga_table:
- .word ati_md, ati_test
- .word oak_md, oak_test
- .word paradise_md, paradise_test
- .word realtek_md, realtek_test
- .word s3_md, s3_test
- .word chips_md, chips_test
- .word video7_md, video7_test
- .word cirrus5_md, cirrus5_test
- .word cirrus6_md, cirrus6_test
- .word cirrus1_md, cirrus1_test
- .word ahead_md, ahead_test
- .word everex_md, everex_test
- .word genoa_md, genoa_test
- .word trident_md, trident_test
- .word tseng_md, tseng_test
- .word 0
-
-# Test routines and mode tables:
-
-# S3 - The test algorithm was taken from the SuperProbe package
-# for XFree86 1.2.1. Report bugs to Christoph.Niemann@linux.org
-s3_test:
- movw $0x0f35, %cx # we store some constants in cl/ch
- movw $0x03d4, %dx
- movb $0x38, %al
- call inidx
- movb %al, %bh # store current CRT-register 0x38
- movw $0x0038, %ax
- call outidx # disable writing to special regs
- movb %cl, %al # check whether we can write special reg 0x35
- call inidx
- movb %al, %bl # save the current value of CRT reg 0x35
- andb $0xf0, %al # clear bits 0-3
- movb %al, %ah
- movb %cl, %al # and write it to CRT reg 0x35
- call outidx
- call inidx # now read it back
- andb %ch, %al # clear the upper 4 bits
- jz s3_2 # the first test failed. But we have a
-
- movb %bl, %ah # second chance
- movb %cl, %al
- call outidx
- jmp s3_1 # do the other tests
-
-s3_2: movw %cx, %ax # load ah with 0xf and al with 0x35
- orb %bl, %ah # set the upper 4 bits of ah with the orig value
- call outidx # write ...
- call inidx # ... and reread
- andb %cl, %al # turn off the upper 4 bits
- pushw %ax
- movb %bl, %ah # restore old value in register 0x35
- movb %cl, %al
- call outidx
- popw %ax
- cmpb %ch, %al # setting lower 4 bits was successful => bad
- je no_s3 # writing is allowed => this is not an S3
-
-s3_1: movw $0x4838, %ax # allow writing to special regs by putting
- call outidx # magic number into CRT-register 0x38
- movb %cl, %al # check whether we can write special reg 0x35
- call inidx
- movb %al, %bl
- andb $0xf0, %al
- movb %al, %ah
- movb %cl, %al
- call outidx
- call inidx
- andb %ch, %al
- jnz no_s3 # no, we can't write => no S3
-
- movw %cx, %ax
- orb %bl, %ah
- call outidx
- call inidx
- andb %ch, %al
- pushw %ax
- movb %bl, %ah # restore old value in register 0x35
- movb %cl, %al
- call outidx
- popw %ax
- cmpb %ch, %al
- jne no_s31 # writing not possible => no S3
- movb $0x30, %al
- call inidx # now get the S3 id ...
- leaw idS3, %di
- movw $0x10, %cx
- repne
- scasb
- je no_s31
-
- movb %bh, %ah
- movb $0x38, %al
- jmp s3rest
-
-no_s3: movb $0x35, %al # restore CRT register 0x35
- movb %bl, %ah
- call outidx
-no_s31: xorw %bp, %bp # Detection failed
-s3rest: movb %bh, %ah
- movb $0x38, %al # restore old value of CRT register 0x38
- jmp outidx
-
-idS3: .byte 0x81, 0x82, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95
- .byte 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xb0
-
-s3_md: .byte 0x54, 0x2b, 0x84
- .byte 0x55, 0x19, 0x84
- .byte 0
- .ascii "S3"
- .byte 0
-
-# ATI cards.
-ati_test:
- leaw idati, %si
- movw $0x31, %di
- movw $0x09, %cx
- repe
- cmpsb
- je atiok
-
- xorw %bp, %bp
-atiok: ret
-
-idati: .ascii "761295520"
-
-ati_md: .byte 0x23, 0x19, 0x84
- .byte 0x33, 0x2c, 0x84
- .byte 0x22, 0x1e, 0x64
- .byte 0x21, 0x19, 0x64
- .byte 0x58, 0x21, 0x50
- .byte 0x5b, 0x1e, 0x50
- .byte 0
- .ascii "ATI"
- .byte 0
-
-# AHEAD
-ahead_test:
- movw $0x200f, %ax
- movw $0x3ce, %dx
- outw %ax, %dx
- incw %dx
- inb %dx, %al
- cmpb $0x20, %al
- je isahed
-
- cmpb $0x21, %al
- je isahed
-
- xorw %bp, %bp
-isahed: ret
-
-ahead_md:
- .byte 0x22, 0x2c, 0x84
- .byte 0x23, 0x19, 0x84
- .byte 0x24, 0x1c, 0x84
- .byte 0x2f, 0x32, 0xa0
- .byte 0x32, 0x22, 0x50
- .byte 0x34, 0x42, 0x50
- .byte 0
- .ascii "Ahead"
- .byte 0
-
-# Chips & Tech.
-chips_test:
- movw $0x3c3, %dx
- inb %dx, %al
- orb $0x10, %al
- outb %al, %dx
- movw $0x104, %dx
- inb %dx, %al
- movb %al, %bl
- movw $0x3c3, %dx
- inb %dx, %al
- andb $0xef, %al
- outb %al, %dx
- cmpb $0xa5, %bl
- je cantok
-
- xorw %bp, %bp
-cantok: ret
-
-chips_md:
- .byte 0x60, 0x19, 0x84
- .byte 0x61, 0x32, 0x84
- .byte 0
- .ascii "Chips & Technologies"
- .byte 0
-
-# Cirrus Logic 5X0
-cirrus1_test:
- movw $0x3d4, %dx
- movb $0x0c, %al
- outb %al, %dx
- incw %dx
- inb %dx, %al
- movb %al, %bl
- xorb %al, %al
- outb %al, %dx
- decw %dx
- movb $0x1f, %al
- outb %al, %dx
- incw %dx
- inb %dx, %al
- movb %al, %bh
- xorb %ah, %ah
- shlb $4, %al
- movw %ax, %cx
- movb %bh, %al
- shrb $4, %al
- addw %ax, %cx
- shlw $8, %cx
- addw $6, %cx
- movw %cx, %ax
- movw $0x3c4, %dx
- outw %ax, %dx
- incw %dx
- inb %dx, %al
- andb %al, %al
- jnz nocirr
-
- movb %bh, %al
- outb %al, %dx
- inb %dx, %al
- cmpb $0x01, %al
- je iscirr
-
-nocirr: xorw %bp, %bp
-iscirr: movw $0x3d4, %dx
- movb %bl, %al
- xorb %ah, %ah
- shlw $8, %ax
- addw $0x0c, %ax
- outw %ax, %dx
- ret
-
-cirrus1_md:
- .byte 0x1f, 0x19, 0x84
- .byte 0x20, 0x2c, 0x84
- .byte 0x22, 0x1e, 0x84
- .byte 0x31, 0x25, 0x64
- .byte 0
- .ascii "Cirrus Logic 5X0"
- .byte 0
-
-# Cirrus Logic 54XX
-cirrus5_test:
- movw $0x3c4, %dx
- movb $6, %al
- call inidx
- movb %al, %bl # BL=backup
- movw $6, %ax
- call tstidx
- cmpb $0x0f, %al
- jne c5fail
-
- movw $0x1206, %ax
- call tstidx
- cmpb $0x12, %al
- jne c5fail
-
- movb $0x1e, %al
- call inidx
- movb %al, %bh
- movb %bh, %ah
- andb $0xc0, %ah
- movb $0x1e, %al
- call tstidx
- andb $0x3f, %al
- jne c5xx
-
- movb $0x1e, %al
- movb %bh, %ah
- orb $0x3f, %ah
- call tstidx
- xorb $0x3f, %al
- andb $0x3f, %al
-c5xx: pushf
- movb $0x1e, %al
- movb %bh, %ah
- outw %ax, %dx
- popf
- je c5done
-
-c5fail: xorw %bp, %bp
-c5done: movb $6, %al
- movb %bl, %ah
- outw %ax, %dx
- ret
-
-cirrus5_md:
- .byte 0x14, 0x19, 0x84
- .byte 0x54, 0x2b, 0x84
- .byte 0
- .ascii "Cirrus Logic 54XX"
- .byte 0
-
-# Cirrus Logic 64XX -- no known extra modes, but must be identified, because
-# it's misidentified by the Ahead test.
-cirrus6_test:
- movw $0x3ce, %dx
- movb $0x0a, %al
- call inidx
- movb %al, %bl # BL=backup
- movw $0xce0a, %ax
- call tstidx
- orb %al, %al
- jne c2fail
-
- movw $0xec0a, %ax
- call tstidx
- cmpb $0x01, %al
- jne c2fail
-
- movb $0xaa, %al
- call inidx # 4X, 5X, 7X and 8X are valid 64XX chip ID's.
- shrb $4, %al
- subb $4, %al
- jz c6done
-
- decb %al
- jz c6done
-
- subb $2, %al
- jz c6done
-
- decb %al
- jz c6done
-
-c2fail: xorw %bp, %bp
-c6done: movb $0x0a, %al
- movb %bl, %ah
- outw %ax, %dx
- ret
-
-cirrus6_md:
- .byte 0
- .ascii "Cirrus Logic 64XX"
- .byte 0
-
-# Everex / Trident
-everex_test:
- movw $0x7000, %ax
- xorw %bx, %bx
- int $0x10
- cmpb $0x70, %al
- jne noevrx
-
- shrw $4, %dx
- cmpw $0x678, %dx
- je evtrid
-
- cmpw $0x236, %dx
- jne evrxok
-
-evtrid: leaw trident_md, %bp
-evrxok: ret
-
-noevrx: xorw %bp, %bp
- ret
-
-everex_md:
- .byte 0x03, 0x22, 0x50
- .byte 0x04, 0x3c, 0x50
- .byte 0x07, 0x2b, 0x64
- .byte 0x08, 0x4b, 0x64
- .byte 0x0a, 0x19, 0x84
- .byte 0x0b, 0x2c, 0x84
- .byte 0x16, 0x1e, 0x50
- .byte 0x18, 0x1b, 0x64
- .byte 0x21, 0x40, 0xa0
- .byte 0x40, 0x1e, 0x84
- .byte 0
- .ascii "Everex/Trident"
- .byte 0
-
-# Genoa.
-genoa_test:
- leaw idgenoa, %si # Check Genoa 'clues'
- xorw %ax, %ax
- movb %es:(0x37), %al
- movw %ax, %di
- movw $0x04, %cx
- decw %si
- decw %di
-l1: incw %si
- incw %di
- movb (%si), %al
- testb %al, %al
- jz l2
-
- cmpb %es:(%di), %al
-l2: loope l1
- orw %cx, %cx
- je isgen
-
- xorw %bp, %bp
-isgen: ret
-
-idgenoa: .byte 0x77, 0x00, 0x99, 0x66
-
-genoa_md:
- .byte 0x58, 0x20, 0x50
- .byte 0x5a, 0x2a, 0x64
- .byte 0x60, 0x19, 0x84
- .byte 0x61, 0x1d, 0x84
- .byte 0x62, 0x20, 0x84
- .byte 0x63, 0x2c, 0x84
- .byte 0x64, 0x3c, 0x84
- .byte 0x6b, 0x4f, 0x64
- .byte 0x72, 0x3c, 0x50
- .byte 0x74, 0x42, 0x50
- .byte 0x78, 0x4b, 0x64
- .byte 0
- .ascii "Genoa"
- .byte 0
-
-# OAK
-oak_test:
- leaw idoakvga, %si
- movw $0x08, %di
- movw $0x08, %cx
- repe
- cmpsb
- je isoak
-
- xorw %bp, %bp
-isoak: ret
-
-idoakvga: .ascii "OAK VGA "
-
-oak_md: .byte 0x4e, 0x3c, 0x50
- .byte 0x4f, 0x3c, 0x84
- .byte 0x50, 0x19, 0x84
- .byte 0x51, 0x2b, 0x84
- .byte 0
- .ascii "OAK"
- .byte 0
-
-# WD Paradise.
-paradise_test:
- leaw idparadise, %si
- movw $0x7d, %di
- movw $0x04, %cx
- repe
- cmpsb
- je ispara
-
- xorw %bp, %bp
-ispara: ret
-
-idparadise: .ascii "VGA="
-
-paradise_md:
- .byte 0x41, 0x22, 0x50
- .byte 0x47, 0x1c, 0x84
- .byte 0x55, 0x19, 0x84
- .byte 0x54, 0x2c, 0x84
- .byte 0
- .ascii "Paradise"
- .byte 0
-
-# Trident.
-trident_test:
- movw $0x3c4, %dx
- movb $0x0e, %al
- outb %al, %dx
- incw %dx
- inb %dx, %al
- xchgb %al, %ah
- xorb %al, %al
- outb %al, %dx
- inb %dx, %al
- xchgb %ah, %al
- movb %al, %bl # Strange thing ... in the book this wasn't
- andb $0x02, %bl # necessary but it worked on my card which
- jz setb2 # is a trident. Without it the screen goes
- # blurred ...
- andb $0xfd, %al
- jmp clrb2
-
-setb2: orb $0x02, %al
-clrb2: outb %al, %dx
- andb $0x0f, %ah
- cmpb $0x02, %ah
- je istrid
-
- xorw %bp, %bp
-istrid: ret
-
-trident_md:
- .byte 0x50, 0x1e, 0x50
- .byte 0x51, 0x2b, 0x50
- .byte 0x52, 0x3c, 0x50
- .byte 0x57, 0x19, 0x84
- .byte 0x58, 0x1e, 0x84
- .byte 0x59, 0x2b, 0x84
- .byte 0x5a, 0x3c, 0x84
- .byte 0
- .ascii "Trident"
- .byte 0
-
-# Tseng.
-tseng_test:
- movw $0x3cd, %dx
- inb %dx, %al # Could things be this simple ! :-)
- movb %al, %bl
- movb $0x55, %al
- outb %al, %dx
- inb %dx, %al
- movb %al, %ah
- movb %bl, %al
- outb %al, %dx
- cmpb $0x55, %ah
- je istsen
-
-isnot: xorw %bp, %bp
-istsen: ret
-
-tseng_md:
- .byte 0x26, 0x3c, 0x50
- .byte 0x2a, 0x28, 0x64
- .byte 0x23, 0x19, 0x84
- .byte 0x24, 0x1c, 0x84
- .byte 0x22, 0x2c, 0x84
- .byte 0x21, 0x3c, 0x84
- .byte 0
- .ascii "Tseng"
- .byte 0
-
-# Video7.
-video7_test:
- movw $0x3cc, %dx
- inb %dx, %al
- movw $0x3b4, %dx
- andb $0x01, %al
- jz even7
-
- movw $0x3d4, %dx
-even7: movb $0x0c, %al
- outb %al, %dx
- incw %dx
- inb %dx, %al
- movb %al, %bl
- movb $0x55, %al
- outb %al, %dx
- inb %dx, %al
- decw %dx
- movb $0x1f, %al
- outb %al, %dx
- incw %dx
- inb %dx, %al
- movb %al, %bh
- decw %dx
- movb $0x0c, %al
- outb %al, %dx
- incw %dx
- movb %bl, %al
- outb %al, %dx
- movb $0x55, %al
- xorb $0xea, %al
- cmpb %bh, %al
- jne isnot
-
- movb $VIDEO_FIRST_V7>>8, svga_prefix # Use special mode switching
- ret
-
-video7_md:
- .byte 0x40, 0x2b, 0x50
- .byte 0x43, 0x3c, 0x50
- .byte 0x44, 0x3c, 0x64
- .byte 0x41, 0x19, 0x84
- .byte 0x42, 0x2c, 0x84
- .byte 0x45, 0x1c, 0x84
- .byte 0
- .ascii "Video 7"
- .byte 0
-
-# Realtek VGA
-realtek_test:
- leaw idrtvga, %si
- movw $0x45, %di
- movw $0x0b, %cx
- repe
- cmpsb
- je isrt
-
- xorw %bp, %bp
-isrt: ret
-
-idrtvga: .ascii "REALTEK VGA"
-
-realtek_md:
- .byte 0x1a, 0x3c, 0x50
- .byte 0x1b, 0x19, 0x84
- .byte 0x1c, 0x1e, 0x84
- .byte 0x1d, 0x2b, 0x84
- .byte 0x1e, 0x3c, 0x84
- .byte 0
- .ascii "REALTEK"
- .byte 0
-
-#endif /* CONFIG_VIDEO_SVGA */
-
-# User-defined local mode table (VGA only)
-#ifdef CONFIG_VIDEO_LOCAL
-local_modes:
- leaw local_mode_table, %si
-locm1: lodsw
- orw %ax, %ax
- jz locm2
-
- stosw
- movsw
- jmp locm1
-
-locm2: ret
-
-# This is the table of local video modes which can be supplied manually
-# by the user. Each entry consists of mode ID (word) and dimensions
-# (byte for column count and another byte for row count). These modes
-# are placed before all SVGA and VESA modes and override them if table
-# compacting is enabled. The table must end with a zero word followed
-# by NUL-terminated video adapter name.
-local_mode_table:
- .word 0x0100 # Example: 40x25
- .byte 25,40
- .word 0
- .ascii "Local"
- .byte 0
-#endif /* CONFIG_VIDEO_LOCAL */
-
-# Read a key and return the ASCII code in al, scan code in ah
-getkey: xorb %ah, %ah
- int $0x16
- ret
-
-# Read a key with a timeout of 30 seconds.
-# The hardware clock is used to get the time.
-getkt: call gettime
- addb $30, %al # Wait 30 seconds
- cmpb $60, %al
- jl lminute
-
- subb $60, %al
-lminute:
- movb %al, %cl
-again: movb $0x01, %ah
- int $0x16
- jnz getkey # key pressed, so get it
-
- call gettime
- cmpb %cl, %al
- jne again
-
- movb $0x20, %al # timeout, return `space'
- ret
-
-# Flush the keyboard buffer
-flush: movb $0x01, %ah
- int $0x16
- jz empty
-
- xorb %ah, %ah
- int $0x16
- jmp flush
-
-empty: ret
-
-# Print hexadecimal number.
-prthw: pushw %ax
- movb %ah, %al
- call prthb
- popw %ax
-prthb: pushw %ax
- shrb $4, %al
- call prthn
- popw %ax
- andb $0x0f, %al
-prthn: cmpb $0x0a, %al
- jc prth1
-
- addb $0x07, %al
-prth1: addb $0x30, %al
- jmp prtchr
-
-# Print decimal number in al
-prtdec: pushw %ax
- pushw %cx
- xorb %ah, %ah
- movb $0x0a, %cl
- idivb %cl
- cmpb $0x09, %al
- jbe lt100
-
- call prtdec
- jmp skip10
-
-lt100: addb $0x30, %al
- call prtchr
-skip10: movb %ah, %al
- addb $0x30, %al
- call prtchr
- popw %cx
- popw %ax
- ret
-
-store_edid:
-#ifdef CONFIG_FIRMWARE_EDID
- pushw %es # just save all registers
- pushw %ax
- pushw %bx
- pushw %cx
- pushw %dx
- pushw %di
-
- pushw %fs
- popw %es
-
- movl $0x13131313, %eax # memset block with 0x13
- movw $32, %cx
- movw $0x140, %di
- cld
- rep
- stosl
-
- cmpw $0x0200, vbe_version # only do EDID on >= VBE2.0
- jl no_edid
-
- pushw %es # save ES
- xorw %di, %di # Report Capability
- pushw %di
- popw %es # ES:DI must be 0:0
- movw $0x4f15, %ax
- xorw %bx, %bx
- xorw %cx, %cx
- int $0x10
- popw %es # restore ES
-
- cmpb $0x00, %ah # call successful
- jne no_edid
-
- cmpb $0x4f, %al # function supported
- jne no_edid
-
- movw $0x4f15, %ax # do VBE/DDC
- movw $0x01, %bx
- movw $0x00, %cx
- movw $0x00, %dx
- movw $0x140, %di
- int $0x10
-
-no_edid:
- popw %di # restore all registers
- popw %dx
- popw %cx
- popw %bx
- popw %ax
- popw %es
-#endif
- ret
-
-# VIDEO_SELECT-only variables
-mt_end: .word 0 # End of video mode table if built
-edit_buf: .space 6 # Line editor buffer
-card_name: .word 0 # Pointer to adapter name
-scanning: .byte 0 # Performing mode scan
-do_restore: .byte 0 # Screen contents altered during mode change
-svga_prefix: .byte VIDEO_FIRST_BIOS>>8 # Default prefix for BIOS modes
-graphic_mode: .byte 0 # Graphic mode with a linear frame buffer
-dac_size: .byte 6 # DAC bit depth
-vbe_version: .word 0 # VBE bios version
-
-# Status messages
-keymsg: .ascii "Press <RETURN> to see video modes available, "
- .ascii "<SPACE> to continue or wait 30 secs"
- .byte 0x0d, 0x0a, 0
-
-listhdr: .byte 0x0d, 0x0a
- .ascii "Mode: COLSxROWS:"
-
-crlft: .byte 0x0d, 0x0a, 0
-
-prompt: .byte 0x0d, 0x0a
- .asciz "Enter mode number or `scan': "
-
-unknt: .asciz "Unknown mode ID. Try again."
-
-badmdt: .ascii "You passed an undefined mode number."
- .byte 0x0d, 0x0a, 0
-
-vesaer: .ascii "Error: Scanning of VESA modes failed. Please "
- .ascii "report to <mj@ucw.cz>."
- .byte 0x0d, 0x0a, 0
-
-old_name: .asciz "CGA/MDA/HGA"
-
-ega_name: .asciz "EGA"
-
-svga_name: .ascii " "
-
-vga_name: .asciz "VGA"
-
-vesa_name: .asciz "VESA"
-
-name_bann: .asciz "Video adapter: "
-#endif /* CONFIG_VIDEO_SELECT */
-
-# Other variables:
-adapter: .byte 0 # Video adapter: 0=CGA/MDA/HGA,1=EGA,2=VGA
-video_segment: .word 0xb800 # Video memory segment
-force_size: .word 0 # Use this size instead of the one in BIOS vars
diff --git a/arch/i386/boot/video.c b/arch/i386/boot/video.c
new file mode 100644
index 000000000000..958130ef0042
--- /dev/null
+++ b/arch/i386/boot/video.c
@@ -0,0 +1,461 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/video.c
+ *
+ * Select video mode
+ */
+
+#include "boot.h"
+#include "video.h"
+#include "vesa.h"
+
+/*
+ * Mode list variables
+ */
+static struct card_info cards[]; /* List of cards to probe for */
+
+/*
+ * Common variables
+ */
+int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
+u16 video_segment;
+int force_x, force_y; /* Don't query the BIOS for cols/rows */
+
+int do_restore = 0; /* Screen contents changed during mode flip */
+int graphic_mode; /* Graphic mode with linear frame buffer */
+
+static void store_cursor_position(void)
+{
+ u16 curpos;
+ u16 ax, bx;
+
+ ax = 0x0300;
+ bx = 0;
+ asm(INT10
+ : "=d" (curpos), "+a" (ax), "+b" (bx)
+ : : "ecx", "esi", "edi");
+
+ boot_params.screen_info.orig_x = curpos;
+ boot_params.screen_info.orig_y = curpos >> 8;
+}
+
+static void store_video_mode(void)
+{
+ u16 ax, page;
+
+ /* N.B.: the saving of the video page here is a bit silly,
+ since we pretty much assume page 0 everywhere. */
+ ax = 0x0f00;
+ asm(INT10
+ : "+a" (ax), "=b" (page)
+ : : "ecx", "edx", "esi", "edi");
+
+ /* Not all BIOSes are clean with respect to the top bit */
+ boot_params.screen_info.orig_video_mode = ax & 0x7f;
+ boot_params.screen_info.orig_video_page = page;
+}
+
+/*
+ * Store the video mode parameters for later usage by the kernel.
+ * This is done by asking the BIOS except for the rows/columns
+ * parameters in the default 80x25 mode -- these are set directly,
+ * because some very obscure BIOSes supply insane values.
+ */
+static void store_mode_params(void)
+{
+ u16 font_size;
+ int x, y;
+
+ /* For graphics mode, it is up to the mode-setting driver
+ (currently only video-vesa.c) to store the parameters */
+ if (graphic_mode)
+ return;
+
+ store_cursor_position();
+ store_video_mode();
+
+ if (boot_params.screen_info.orig_video_mode == 0x07) {
+ /* MDA, HGC, or VGA in monochrome mode */
+ video_segment = 0xb000;
+ } else {
+ /* CGA, EGA, VGA and so forth */
+ video_segment = 0xb800;
+ }
+
+ set_fs(0);
+ font_size = rdfs16(0x485); /* Font size, BIOS area */
+ boot_params.screen_info.orig_video_points = font_size;
+
+ x = rdfs16(0x44a);
+ y = (adapter == ADAPTER_CGA) ? 25 : rdfs8(0x484)+1;
+
+ if (force_x)
+ x = force_x;
+ if (force_y)
+ y = force_y;
+
+ boot_params.screen_info.orig_video_cols = x;
+ boot_params.screen_info.orig_video_lines = y;
+}
+
+/* Probe the video drivers and have them generate their mode lists. */
+static void probe_cards(int unsafe)
+{
+ struct card_info *card;
+ static u8 probed[2];
+
+ if (probed[unsafe])
+ return;
+
+ probed[unsafe] = 1;
+
+ for (card = video_cards; card < video_cards_end; card++) {
+ if (card->unsafe == unsafe) {
+ if (card->probe)
+ card->nmodes = card->probe();
+ else
+ card->nmodes = 0;
+ }
+ }
+}
+
+/* Test if a mode is defined */
+int mode_defined(u16 mode)
+{
+ struct card_info *card;
+ struct mode_info *mi;
+ int i;
+
+ for (card = video_cards; card < video_cards_end; card++) {
+ mi = card->modes;
+ for (i = 0; i < card->nmodes; i++, mi++) {
+ if (mi->mode == mode)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/* Set mode (without recalc) */
+static int raw_set_mode(u16 mode)
+{
+ int nmode, i;
+ struct card_info *card;
+ struct mode_info *mi;
+
+ /* Drop the recalc bit if set */
+ mode &= ~VIDEO_RECALC;
+
+ /* Scan for mode based on fixed ID, position, or resolution */
+ nmode = 0;
+ for (card = video_cards; card < video_cards_end; card++) {
+ mi = card->modes;
+ for (i = 0; i < card->nmodes; i++, mi++) {
+ int visible = mi->x || mi->y;
+
+ if ((mode == nmode && visible) ||
+ mode == mi->mode ||
+ mode == (mi->y << 8)+mi->x)
+ return card->set_mode(mi);
+
+ if (visible)
+ nmode++;
+ }
+ }
+
+ /* Nothing found? Is it an "exceptional" (unprobed) mode? */
+ for (card = video_cards; card < video_cards_end; card++) {
+ if (mode >= card->xmode_first &&
+ mode < card->xmode_first+card->xmode_n) {
+ struct mode_info mix;
+ mix.mode = mode;
+ mix.x = mix.y = 0;
+ return card->set_mode(&mix);
+ }
+ }
+
+ /* Otherwise, failure... */
+ return -1;
+}
+
+/*
+ * Recalculate the vertical video cutoff (hack!)
+ */
+static void vga_recalc_vertical(void)
+{
+ unsigned int font_size, rows;
+ u16 crtc;
+ u8 pt, ov;
+
+ set_fs(0);
+ font_size = rdfs8(0x485); /* BIOS: font size (pixels) */
+ rows = force_y ? force_y : rdfs8(0x484)+1; /* Text rows */
+
+ rows *= font_size; /* Visible scan lines */
+ rows--; /* ... minus one */
+
+ crtc = vga_crtc();
+
+ pt = in_idx(crtc, 0x11);
+ pt &= ~0x80; /* Unlock CR0-7 */
+ out_idx(pt, crtc, 0x11);
+
+ out_idx((u8)rows, crtc, 0x12); /* Lower height register */
+
+ ov = in_idx(crtc, 0x07); /* Overflow register */
+ ov &= 0xbd;
+ ov |= (rows >> (8-1)) & 0x02;
+ ov |= (rows >> (9-6)) & 0x40;
+ out_idx(ov, crtc, 0x07);
+}
+
+/* Set mode (with recalc if specified) */
+static int set_mode(u16 mode)
+{
+ int rv;
+
+ /* Very special mode numbers... */
+ if (mode == VIDEO_CURRENT_MODE)
+ return 0; /* Nothing to do... */
+ else if (mode == NORMAL_VGA)
+ mode = VIDEO_80x25;
+ else if (mode == EXTENDED_VGA)
+ mode = VIDEO_8POINT;
+
+ rv = raw_set_mode(mode);
+ if (rv)
+ return rv;
+
+ if (mode & VIDEO_RECALC)
+ vga_recalc_vertical();
+
+ return 0;
+}
+
+static unsigned int get_entry(void)
+{
+ char entry_buf[4];
+ int i, len = 0;
+ int key;
+ unsigned int v;
+
+ do {
+ key = getchar();
+
+ if (key == '\b') {
+ if (len > 0) {
+ puts("\b \b");
+ len--;
+ }
+ } else if ((key >= '0' && key <= '9') ||
+ (key >= 'A' && key <= 'Z') ||
+ (key >= 'a' && key <= 'z')) {
+ if (len < sizeof entry_buf) {
+ entry_buf[len++] = key;
+ putchar(key);
+ }
+ }
+ } while (key != '\r');
+ putchar('\n');
+
+ if (len == 0)
+ return VIDEO_CURRENT_MODE; /* Default */
+
+ v = 0;
+ for (i = 0; i < len; i++) {
+ v <<= 4;
+ key = entry_buf[i] | 0x20;
+ v += (key > '9') ? key-'a'+10 : key-'0';
+ }
+
+ return v;
+}
+
+static void display_menu(void)
+{
+ struct card_info *card;
+ struct mode_info *mi;
+ char ch;
+ int i;
+
+ puts("Mode: COLSxROWS:\n");
+
+ ch = '0';
+ for (card = video_cards; card < video_cards_end; card++) {
+ mi = card->modes;
+ for (i = 0; i < card->nmodes; i++, mi++) {
+ int visible = mi->x && mi->y;
+ u16 mode_id = mi->mode ? mi->mode :
+ (mi->y << 8)+mi->x;
+
+ if (!visible)
+ continue; /* Hidden mode */
+
+ printf("%c %04X %3dx%-3d %s\n",
+ ch, mode_id, mi->x, mi->y, card->card_name);
+
+ if (ch == '9')
+ ch = 'a';
+ else if (ch == 'z' || ch == ' ')
+ ch = ' '; /* Out of keys... */
+ else
+ ch++;
+ }
+ }
+}
+
+#define H(x) ((x)-'a'+10)
+#define SCAN ((H('s')<<12)+(H('c')<<8)+(H('a')<<4)+H('n'))
+
+static unsigned int mode_menu(void)
+{
+ int key;
+ unsigned int sel;
+
+ puts("Press <ENTER> to see video modes available, "
+ "<SPACE> to continue, or wait 30 sec\n");
+
+ kbd_flush();
+ while (1) {
+ key = getchar_timeout();
+ if (key == ' ' || key == 0)
+ return VIDEO_CURRENT_MODE; /* Default */
+ if (key == '\r')
+ break;
+ putchar('\a'); /* Beep! */
+ }
+
+
+ for (;;) {
+ display_menu();
+
+ puts("Enter a video mode or \"scan\" to scan for "
+ "additional modes: ");
+ sel = get_entry();
+ if (sel != SCAN)
+ return sel;
+
+ probe_cards(1);
+ }
+}
+
+#ifdef CONFIG_VIDEO_RETAIN
+/* Save screen content to the heap */
+struct saved_screen {
+ int x, y;
+ int curx, cury;
+ u16 *data;
+} saved;
+
+static void save_screen(void)
+{
+ /* Should be called after store_mode_params() */
+ saved.x = boot_params.screen_info.orig_video_cols;
+ saved.y = boot_params.screen_info.orig_video_lines;
+ saved.curx = boot_params.screen_info.orig_x;
+ saved.cury = boot_params.screen_info.orig_y;
+
+ if (heap_free() < saved.x*saved.y*sizeof(u16)+512)
+ return; /* Not enough heap to save the screen */
+
+ saved.data = GET_HEAP(u16, saved.x*saved.y);
+
+ set_fs(video_segment);
+ copy_from_fs(saved.data, 0, saved.x*saved.y*sizeof(u16));
+}
+
+static void restore_screen(void)
+{
+ /* Should be called after store_mode_params() */
+ int xs = boot_params.screen_info.orig_video_cols;
+ int ys = boot_params.screen_info.orig_video_lines;
+ int y;
+ addr_t dst = 0;
+ u16 *src = saved.data;
+ u16 ax, bx, dx;
+
+ if (graphic_mode)
+ return; /* Can't restore onto a graphic mode */
+
+ if (!src)
+ return; /* No saved screen contents */
+
+ /* Restore screen contents */
+
+ set_fs(video_segment);
+ for (y = 0; y < ys; y++) {
+ int npad;
+
+ if (y < saved.y) {
+ int copy = (xs < saved.x) ? xs : saved.x;
+ copy_to_fs(dst, src, copy*sizeof(u16));
+ dst += copy*sizeof(u16);
+ src += saved.x;
+ npad = (xs < saved.x) ? 0 : xs-saved.x;
+ } else {
+ npad = xs;
+ }
+
+ /* Writes "npad" blank characters to
+ video_segment:dst and advances dst */
+ asm volatile("pushw %%es ; "
+ "movw %2,%%es ; "
+ "shrw %%cx ; "
+ "jnc 1f ; "
+ "stosw \n\t"
+ "1: rep;stosl ; "
+ "popw %%es"
+ : "+D" (dst), "+c" (npad)
+ : "bdS" (video_segment),
+ "a" (0x07200720));
+ }
+
+ /* Restore cursor position */
+ ax = 0x0200; /* Set cursor position */
+ bx = 0; /* Page number (<< 8) */
+ dx = (saved.cury << 8)+saved.curx;
+ asm volatile(INT10
+ : "+a" (ax), "+b" (bx), "+d" (dx)
+ : : "ecx", "esi", "edi");
+}
+#else
+#define save_screen() ((void)0)
+#define restore_screen() ((void)0)
+#endif
+
+void set_video(void)
+{
+ u16 mode = boot_params.hdr.vid_mode;
+
+ RESET_HEAP();
+
+ store_mode_params();
+ save_screen();
+ probe_cards(0);
+
+ for (;;) {
+ if (mode == ASK_VGA)
+ mode = mode_menu();
+
+ if (!set_mode(mode))
+ break;
+
+ printf("Undefined video mode number: %x\n", mode);
+ mode = ASK_VGA;
+ }
+ vesa_store_edid();
+ store_mode_params();
+
+ if (do_restore)
+ restore_screen();
+}
diff --git a/arch/i386/boot/video.h b/arch/i386/boot/video.h
new file mode 100644
index 000000000000..b92447d51213
--- /dev/null
+++ b/arch/i386/boot/video.h
@@ -0,0 +1,152 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/video.h
+ *
+ * Header file for the real-mode video probing code
+ */
+
+#ifndef BOOT_VIDEO_H
+#define BOOT_VIDEO_H
+
+#include <linux/types.h>
+
+/* Enable autodetection of SVGA adapters and modes. */
+#undef CONFIG_VIDEO_SVGA
+
+/* Enable autodetection of VESA modes */
+#define CONFIG_VIDEO_VESA
+
+/* Retain screen contents when switching modes */
+#define CONFIG_VIDEO_RETAIN
+
+/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */
+#undef CONFIG_VIDEO_400_HACK
+
+/* This code uses an extended set of video mode numbers. These include:
+ * Aliases for standard modes
+ * NORMAL_VGA (-1)
+ * EXTENDED_VGA (-2)
+ * ASK_VGA (-3)
+ * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
+ * of compatibility when extending the table. These are between 0x00 and 0xff.
+ */
+#define VIDEO_FIRST_MENU 0x0000
+
+/* Standard BIOS video modes (BIOS number + 0x0100) */
+#define VIDEO_FIRST_BIOS 0x0100
+
+/* VESA BIOS video modes (VESA number + 0x0200) */
+#define VIDEO_FIRST_VESA 0x0200
+
+/* Video7 special modes (BIOS number + 0x0900) */
+#define VIDEO_FIRST_V7 0x0900
+
+/* Special video modes */
+#define VIDEO_FIRST_SPECIAL 0x0f00
+#define VIDEO_80x25 0x0f00
+#define VIDEO_8POINT 0x0f01
+#define VIDEO_80x43 0x0f02
+#define VIDEO_80x28 0x0f03
+#define VIDEO_CURRENT_MODE 0x0f04
+#define VIDEO_80x30 0x0f05
+#define VIDEO_80x34 0x0f06
+#define VIDEO_80x60 0x0f07
+#define VIDEO_GFX_HACK 0x0f08
+#define VIDEO_LAST_SPECIAL 0x0f09
+
+/* Video modes given by resolution */
+#define VIDEO_FIRST_RESOLUTION 0x1000
+
+/* The "recalculate timings" flag */
+#define VIDEO_RECALC 0x8000
+
+/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
+#ifdef CONFIG_VIDEO_RETAIN
+void store_screen(void);
+#define DO_STORE() store_screen()
+#else
+#define DO_STORE() ((void)0)
+#endif /* CONFIG_VIDEO_RETAIN */
+
+/*
+ * Mode table structures
+ */
+
+struct mode_info {
+ u16 mode; /* Mode number (vga= style) */
+ u8 x, y; /* Width, height */
+};
+
+struct card_info {
+ const char *card_name;
+ int (*set_mode)(struct mode_info *mode);
+ int (*probe)(void);
+ struct mode_info *modes;
+ int nmodes; /* Number of probed modes so far */
+ int unsafe; /* Probing is unsafe, only do after "scan" */
+ u16 xmode_first; /* Unprobed modes to try to call anyway */
+ u16 xmode_n; /* Size of unprobed mode range */
+};
+
+#define __videocard struct card_info __attribute__((section(".videocards")))
+extern struct card_info video_cards[], video_cards_end[];
+
+int mode_defined(u16 mode); /* video.c */
+
+/* Basic video information */
+#define ADAPTER_CGA 0 /* CGA/MDA/HGC */
+#define ADAPTER_EGA 1
+#define ADAPTER_VGA 2
+
+extern int adapter;
+extern u16 video_segment;
+extern int force_x, force_y; /* Don't query the BIOS for cols/rows */
+extern int do_restore; /* Restore screen contents */
+extern int graphic_mode; /* Graphics mode with linear frame buffer */
+
+/*
+ * int $0x10 is notorious for touching registers it shouldn't.
+ * gcc doesn't like %ebp being clobbered, so define it as a push/pop
+ * sequence here.
+ *
+ * A number of systems, including the original PC can clobber %bp in
+ * certain circumstances, like when scrolling. There exists at least
+ * one Trident video card which could clobber DS under a set of
+ * circumstances that we are unlikely to encounter (scrolling when
+ * using an extended graphics mode of more than 800x600 pixels), but
+ * it's cheap insurance to deal with that here.
+ */
+#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp"
+
+/* Accessing VGA indexed registers */
+static inline u8 in_idx(u16 port, u8 index)
+{
+ outb(index, port);
+ return inb(port+1);
+}
+
+static inline void out_idx(u8 v, u16 port, u8 index)
+{
+ outw(index+(v << 8), port);
+}
+
+/* Writes a value to an indexed port and then reads the port again */
+static inline u8 tst_idx(u8 v, u16 port, u8 index)
+{
+ out_idx(port, index, v);
+ return in_idx(port, index);
+}
+
+/* Get the I/O port of the VGA CRTC */
+u16 vga_crtc(void); /* video-vga.c */
+
+#endif /* BOOT_VIDEO_H */
diff --git a/arch/i386/boot/voyager.c b/arch/i386/boot/voyager.c
new file mode 100644
index 000000000000..61c8fe0453be
--- /dev/null
+++ b/arch/i386/boot/voyager.c
@@ -0,0 +1,46 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/voyager.c
+ *
+ * Get the Voyager config information
+ */
+
+#include "boot.h"
+
+#ifdef CONFIG_X86_VOYAGER
+
+int query_voyager(void)
+{
+ u8 err;
+ u16 es, di;
+ /* Abuse the apm_bios_info area for this */
+ u8 *data_ptr = (u8 *)&boot_params.apm_bios_info;
+
+ data_ptr[0] = 0xff; /* Flag on config not found(?) */
+
+ asm("pushw %%es ; "
+ "int $0x15 ; "
+ "setc %0 ; "
+ "movw %%es, %1 ; "
+ "popw %%es"
+ : "=q" (err), "=r" (es), "=D" (di)
+ : "a" (0xffc0));
+
+ if (err)
+ return -1; /* Not Voyager */
+
+ set_fs(es);
+ copy_from_fs(data_ptr, di, 7); /* Table is 7 bytes apparently */
+ return 0;
+}
+
+#endif /* CONFIG_X86_VOYAGER */
diff --git a/arch/i386/defconfig b/arch/i386/defconfig
index 1a3a2217b7c2..54ee1764fdae 100644
--- a/arch/i386/defconfig
+++ b/arch/i386/defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.22-rc2
-# Mon May 21 13:23:44 2007
+# Linux kernel version: 2.6.22-git14
+# Fri Jul 20 09:53:15 2007
#
CONFIG_X86_32=y
CONFIG_GENERIC_TIME=y
@@ -37,19 +37,18 @@ CONFIG_LOCALVERSION=""
CONFIG_LOCALVERSION_AUTO=y
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
-# CONFIG_IPC_NS is not set
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
# CONFIG_BSD_PROCESS_ACCT is not set
# CONFIG_TASKSTATS is not set
-# CONFIG_UTS_NS is not set
+# CONFIG_USER_NS is not set
# CONFIG_AUDIT is not set
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_LOG_BUF_SHIFT=18
# CONFIG_CPUSETS is not set
CONFIG_SYSFS_DEPRECATED=y
-# CONFIG_RELAY is not set
+CONFIG_RELAY=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -73,16 +72,13 @@ CONFIG_TIMERFD=y
CONFIG_EVENTFD=y
CONFIG_SHMEM=y
CONFIG_VM_EVENT_COUNTERS=y
-CONFIG_SLAB=y
-# CONFIG_SLUB is not set
+CONFIG_SLUB_DEBUG=y
+# CONFIG_SLAB is not set
+CONFIG_SLUB=y
# CONFIG_SLOB is not set
CONFIG_RT_MUTEXES=y
# CONFIG_TINY_SHMEM is not set
CONFIG_BASE_SMALL=0
-
-#
-# Loadable module support
-#
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
@@ -90,14 +86,11 @@ CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_MODULE_SRCVERSION_ALL is not set
# CONFIG_KMOD is not set
CONFIG_STOP_MACHINE=y
-
-#
-# Block layer
-#
CONFIG_BLOCK=y
CONFIG_LBD=y
# CONFIG_BLK_DEV_IO_TRACE is not set
# CONFIG_LSF is not set
+# CONFIG_BLK_DEV_BSG is not set
#
# IO Schedulers
@@ -166,7 +159,6 @@ CONFIG_X86_WP_WORKS_OK=y
CONFIG_X86_INVLPG=y
CONFIG_X86_BSWAP=y
CONFIG_X86_POPAD_OK=y
-CONFIG_X86_CMPXCHG64=y
CONFIG_X86_GOOD_APIC=y
CONFIG_X86_INTEL_USERCOPY=y
CONFIG_X86_USE_PPRO_CHECKSUM=y
@@ -202,6 +194,7 @@ CONFIG_X86_CPUID=y
# CONFIG_EDD is not set
# CONFIG_DELL_RBU is not set
# CONFIG_DCDBAS is not set
+CONFIG_DMIID=y
# CONFIG_NOHIGHMEM is not set
CONFIG_HIGHMEM4G=y
# CONFIG_HIGHMEM64G is not set
@@ -218,7 +211,9 @@ CONFIG_FLAT_NODE_MEM_MAP=y
CONFIG_SPLIT_PTLOCK_CPUS=4
CONFIG_RESOURCES_64BIT=y
CONFIG_ZONE_DMA_FLAG=1
+CONFIG_BOUNCE=y
CONFIG_NR_QUICK=1
+CONFIG_VIRT_TO_BUS=y
# CONFIG_HIGHPTE is not set
# CONFIG_MATH_EMULATION is not set
CONFIG_MTRR=y
@@ -245,7 +240,6 @@ CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
CONFIG_PM=y
CONFIG_PM_LEGACY=y
# CONFIG_PM_DEBUG is not set
-# CONFIG_PM_SYSFS_DEPRECATED is not set
#
# ACPI (Advanced Configuration and Power Interface) Support
@@ -285,7 +279,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=y
-# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
#
# CPUFreq processor drivers
@@ -326,7 +320,7 @@ CONFIG_PCI_MMCONFIG=y
CONFIG_ARCH_SUPPORTS_MSI=y
CONFIG_PCI_MSI=y
# CONFIG_PCI_DEBUG is not set
-CONFIG_HT_IRQ=y
+# CONFIG_HT_IRQ is not set
CONFIG_ISA_DMA_API=y
# CONFIG_ISA is not set
# CONFIG_MCA is not set
@@ -382,7 +376,7 @@ CONFIG_IP_PNP_DHCP=y
CONFIG_INET_TUNNEL=y
CONFIG_INET_XFRM_MODE_TRANSPORT=y
CONFIG_INET_XFRM_MODE_TUNNEL=y
-CONFIG_INET_XFRM_MODE_BEET=y
+# CONFIG_INET_XFRM_MODE_BEET is not set
CONFIG_INET_DIAG=y
CONFIG_INET_TCP_DIAG=y
# CONFIG_TCP_CONG_ADVANCED is not set
@@ -401,27 +395,15 @@ CONFIG_IPV6=y
# CONFIG_INET6_TUNNEL is not set
CONFIG_INET6_XFRM_MODE_TRANSPORT=y
CONFIG_INET6_XFRM_MODE_TUNNEL=y
-CONFIG_INET6_XFRM_MODE_BEET=y
+# CONFIG_INET6_XFRM_MODE_BEET is not set
# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
CONFIG_IPV6_SIT=y
# CONFIG_IPV6_TUNNEL is not set
# CONFIG_IPV6_MULTIPLE_TABLES is not set
# CONFIG_NETWORK_SECMARK is not set
# CONFIG_NETFILTER is not set
-
-#
-# DCCP Configuration (EXPERIMENTAL)
-#
# CONFIG_IP_DCCP is not set
-
-#
-# SCTP Configuration (EXPERIMENTAL)
-#
# CONFIG_IP_SCTP is not set
-
-#
-# TIPC Configuration (EXPERIMENTAL)
-#
# CONFIG_TIPC is not set
# CONFIG_ATM is not set
# CONFIG_BRIDGE is not set
@@ -458,6 +440,7 @@ CONFIG_IPV6_SIT=y
# CONFIG_MAC80211 is not set
# CONFIG_IEEE80211 is not set
# CONFIG_RFKILL is not set
+# CONFIG_NET_9P is not set
#
# Device Drivers
@@ -472,21 +455,9 @@ CONFIG_FW_LOADER=y
# CONFIG_DEBUG_DRIVER is not set
# CONFIG_DEBUG_DEVRES is not set
# CONFIG_SYS_HYPERVISOR is not set
-
-#
-# Connector - unified userspace <-> kernelspace linker
-#
# CONFIG_CONNECTOR is not set
# CONFIG_MTD is not set
-
-#
-# Parallel port support
-#
# CONFIG_PARPORT is not set
-
-#
-# Plug and Play support
-#
CONFIG_PNP=y
# CONFIG_PNP_DEBUG is not set
@@ -494,10 +465,7 @@ CONFIG_PNP=y
# Protocols
#
CONFIG_PNPACPI=y
-
-#
-# Block devices
-#
+CONFIG_BLK_DEV=y
CONFIG_BLK_DEV_FD=y
# CONFIG_BLK_CPQ_DA is not set
# CONFIG_BLK_CPQ_CISS_DA is not set
@@ -515,17 +483,14 @@ CONFIG_BLK_DEV_RAM_SIZE=4096
CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024
# CONFIG_CDROM_PKTCDVD is not set
# CONFIG_ATA_OVER_ETH is not set
-
-#
-# Misc devices
-#
+CONFIG_MISC_DEVICES=y
# CONFIG_IBM_ASM is not set
# CONFIG_PHANTOM is not set
+# CONFIG_EEPROM_93CX6 is not set
# CONFIG_SGI_IOC4 is not set
# CONFIG_TIFM_CORE is not set
# CONFIG_SONY_LAPTOP is not set
# CONFIG_THINKPAD_ACPI is not set
-# CONFIG_BLINK is not set
CONFIG_IDE=y
CONFIG_BLK_DEV_IDE=y
@@ -597,6 +562,7 @@ CONFIG_BLK_DEV_IDEDMA=y
#
# CONFIG_RAID_ATTRS is not set
CONFIG_SCSI=y
+CONFIG_SCSI_DMA=y
# CONFIG_SCSI_TGT is not set
CONFIG_SCSI_NETLINK=y
# CONFIG_SCSI_PROC_FS is not set
@@ -607,8 +573,9 @@ CONFIG_SCSI_NETLINK=y
CONFIG_BLK_DEV_SD=y
# CONFIG_CHR_DEV_ST is not set
# CONFIG_CHR_DEV_OSST is not set
-# CONFIG_BLK_DEV_SR is not set
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_BLK_DEV_SR=y
+# CONFIG_BLK_DEV_SR_VENDOR is not set
+CONFIG_CHR_DEV_SG=y
# CONFIG_CHR_DEV_SCH is not set
#
@@ -668,6 +635,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0
# CONFIG_SCSI_INIA100 is not set
# CONFIG_SCSI_STEX is not set
# CONFIG_SCSI_SYM53C8XX_2 is not set
+# CONFIG_SCSI_IPR is not set
# CONFIG_SCSI_QLOGIC_1280 is not set
# CONFIG_SCSI_QLA_FC is not set
# CONFIG_SCSI_QLA_ISCSI is not set
@@ -676,14 +644,73 @@ CONFIG_AIC79XX_DEBUG_MASK=0
# CONFIG_SCSI_DC390T is not set
# CONFIG_SCSI_NSP32 is not set
# CONFIG_SCSI_DEBUG is not set
-# CONFIG_SCSI_ESP_CORE is not set
# CONFIG_SCSI_SRP is not set
-# CONFIG_ATA is not set
-
-#
-# Multi-device support (RAID and LVM)
-#
-# CONFIG_MD is not set
+CONFIG_ATA=y
+# CONFIG_ATA_NONSTANDARD is not set
+CONFIG_ATA_ACPI=y
+CONFIG_SATA_AHCI=y
+CONFIG_SATA_SVW=y
+CONFIG_ATA_PIIX=y
+# CONFIG_SATA_MV is not set
+CONFIG_SATA_NV=y
+# CONFIG_PDC_ADMA is not set
+# CONFIG_SATA_QSTOR is not set
+# CONFIG_SATA_PROMISE is not set
+# CONFIG_SATA_SX4 is not set
+CONFIG_SATA_SIL=y
+# CONFIG_SATA_SIL24 is not set
+# CONFIG_SATA_SIS is not set
+# CONFIG_SATA_ULI is not set
+CONFIG_SATA_VIA=y
+# CONFIG_SATA_VITESSE is not set
+# CONFIG_SATA_INIC162X is not set
+# CONFIG_PATA_ALI is not set
+# CONFIG_PATA_AMD is not set
+# CONFIG_PATA_ARTOP is not set
+# CONFIG_PATA_ATIIXP is not set
+# CONFIG_PATA_CMD640_PCI is not set
+# CONFIG_PATA_CMD64X is not set
+# CONFIG_PATA_CS5520 is not set
+# CONFIG_PATA_CS5530 is not set
+# CONFIG_PATA_CS5535 is not set
+# CONFIG_PATA_CYPRESS is not set
+# CONFIG_PATA_EFAR is not set
+# CONFIG_ATA_GENERIC is not set
+# CONFIG_PATA_HPT366 is not set
+# CONFIG_PATA_HPT37X is not set
+# CONFIG_PATA_HPT3X2N is not set
+# CONFIG_PATA_HPT3X3 is not set
+# CONFIG_PATA_IT821X is not set
+# CONFIG_PATA_IT8213 is not set
+# CONFIG_PATA_JMICRON is not set
+# CONFIG_PATA_TRIFLEX is not set
+# CONFIG_PATA_MARVELL is not set
+# CONFIG_PATA_MPIIX is not set
+# CONFIG_PATA_OLDPIIX is not set
+# CONFIG_PATA_NETCELL is not set
+# CONFIG_PATA_NS87410 is not set
+# CONFIG_PATA_OPTI is not set
+# CONFIG_PATA_OPTIDMA is not set
+# CONFIG_PATA_PDC_OLD is not set
+# CONFIG_PATA_RADISYS is not set
+# CONFIG_PATA_RZ1000 is not set
+# CONFIG_PATA_SC1200 is not set
+# CONFIG_PATA_SERVERWORKS is not set
+# CONFIG_PATA_PDC2027X is not set
+# CONFIG_PATA_SIL680 is not set
+# CONFIG_PATA_SIS is not set
+# CONFIG_PATA_VIA is not set
+# CONFIG_PATA_WINBOND is not set
+CONFIG_MD=y
+# CONFIG_BLK_DEV_MD is not set
+CONFIG_BLK_DEV_DM=y
+# CONFIG_DM_DEBUG is not set
+# CONFIG_DM_CRYPT is not set
+# CONFIG_DM_SNAPSHOT is not set
+# CONFIG_DM_MIRROR is not set
+# CONFIG_DM_ZERO is not set
+# CONFIG_DM_MULTIPATH is not set
+# CONFIG_DM_DELAY is not set
#
# Fusion MPT device support
@@ -724,42 +751,27 @@ CONFIG_IEEE1394_OHCI1394=y
# CONFIG_IEEE1394_ETH1394 is not set
# CONFIG_IEEE1394_DV1394 is not set
CONFIG_IEEE1394_RAWIO=y
-
-#
-# I2O device support
-#
# CONFIG_I2O is not set
-# CONFIG_MACINTOSH_DRIVERS is not set
-
-#
-# Network device support
-#
+CONFIG_MACINTOSH_DRIVERS=y
+# CONFIG_MAC_EMUMOUSEBTN is not set
CONFIG_NETDEVICES=y
+CONFIG_NETDEVICES_MULTIQUEUE=y
# CONFIG_DUMMY is not set
# CONFIG_BONDING is not set
+# CONFIG_MACVLAN is not set
# CONFIG_EQUALIZER is not set
# CONFIG_TUN is not set
# CONFIG_NET_SB1000 is not set
-
-#
-# ARCnet devices
-#
# CONFIG_ARCNET is not set
# CONFIG_PHYLIB is not set
-
-#
-# Ethernet (10 or 100Mbit)
-#
CONFIG_NET_ETHERNET=y
CONFIG_MII=y
# CONFIG_HAPPYMEAL is not set
# CONFIG_SUNGEM is not set
# CONFIG_CASSINI is not set
-# CONFIG_NET_VENDOR_3COM is not set
-
-#
-# Tulip family network device support
-#
+CONFIG_NET_VENDOR_3COM=y
+CONFIG_VORTEX=y
+# CONFIG_TYPHOON is not set
CONFIG_NET_TULIP=y
# CONFIG_DE2104X is not set
CONFIG_TULIP=y
@@ -810,7 +822,6 @@ CONFIG_R8169=y
# CONFIG_SIS190 is not set
# CONFIG_SKGE is not set
CONFIG_SKY2=y
-# CONFIG_SK98LIN is not set
# CONFIG_VIA_VELOCITY is not set
CONFIG_TIGON3=y
CONFIG_BNX2=y
@@ -824,10 +835,6 @@ CONFIG_NETDEV_10000=y
# CONFIG_MYRI10GE is not set
# CONFIG_NETXEN_NIC is not set
# CONFIG_MLX4_CORE is not set
-
-#
-# Token Ring devices
-#
# CONFIG_TR is not set
#
@@ -856,15 +863,7 @@ CONFIG_NETCONSOLE=y
CONFIG_NETPOLL=y
# CONFIG_NETPOLL_TRAP is not set
CONFIG_NET_POLL_CONTROLLER=y
-
-#
-# ISDN subsystem
-#
# CONFIG_ISDN is not set
-
-#
-# Telephony Support
-#
# CONFIG_PHONE is not set
#
@@ -872,6 +871,7 @@ CONFIG_NET_POLL_CONTROLLER=y
#
CONFIG_INPUT=y
# CONFIG_INPUT_FF_MEMLESS is not set
+# CONFIG_INPUT_POLLDEV is not set
#
# Userland interfaces
@@ -937,6 +937,7 @@ CONFIG_HW_CONSOLE=y
#
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_FIX_EARLYCON_MEM=y
CONFIG_SERIAL_8250_PCI=y
CONFIG_SERIAL_8250_PNP=y
CONFIG_SERIAL_8250_NR_UARTS=4
@@ -952,10 +953,6 @@ CONFIG_SERIAL_CORE_CONSOLE=y
CONFIG_UNIX98_PTYS=y
CONFIG_LEGACY_PTYS=y
CONFIG_LEGACY_PTY_COUNT=256
-
-#
-# IPMI
-#
# CONFIG_IPMI_HANDLER is not set
# CONFIG_WATCHDOG is not set
CONFIG_HW_RANDOM=y
@@ -989,11 +986,7 @@ CONFIG_MAX_RAW_DEVS=256
CONFIG_HPET=y
# CONFIG_HPET_RTC_IRQ is not set
CONFIG_HPET_MMAP=y
-CONFIG_HANGCHECK_TIMER=y
-
-#
-# TPM devices
-#
+# CONFIG_HANGCHECK_TIMER is not set
# CONFIG_TCG_TPM is not set
# CONFIG_TELCLOCK is not set
CONFIG_DEVPORT=y
@@ -1004,11 +997,8 @@ CONFIG_DEVPORT=y
#
# CONFIG_SPI is not set
# CONFIG_SPI_MASTER is not set
-
-#
-# Dallas's 1-wire bus
-#
# CONFIG_W1 is not set
+# CONFIG_POWER_SUPPLY is not set
# CONFIG_HWMON is not set
#
@@ -1042,7 +1032,7 @@ CONFIG_DAB=y
CONFIG_VGA_CONSOLE=y
CONFIG_VGACON_SOFT_SCROLLBACK=y
CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=128
-# CONFIG_VIDEO_SELECT is not set
+CONFIG_VIDEO_SELECT=y
CONFIG_DUMMY_CONSOLE=y
#
@@ -1059,15 +1049,11 @@ CONFIG_SOUND=y
# Open Sound System
#
CONFIG_SOUND_PRIME=y
-# CONFIG_OSS_OBSOLETE is not set
# CONFIG_SOUND_TRIDENT is not set
# CONFIG_SOUND_MSNDCLAS is not set
# CONFIG_SOUND_MSNDPIN is not set
# CONFIG_SOUND_OSS is not set
-
-#
-# HID Devices
-#
+CONFIG_HID_SUPPORT=y
CONFIG_HID=y
# CONFIG_HID_DEBUG is not set
@@ -1078,10 +1064,7 @@ CONFIG_USB_HID=y
# CONFIG_USB_HIDINPUT_POWERBOOK is not set
# CONFIG_HID_FF is not set
# CONFIG_USB_HIDDEV is not set
-
-#
-# USB support
-#
+CONFIG_USB_SUPPORT=y
CONFIG_USB_ARCH_HAS_HCD=y
CONFIG_USB_ARCH_HAS_OHCI=y
CONFIG_USB_ARCH_HAS_EHCI=y
@@ -1095,6 +1078,7 @@ CONFIG_USB_DEVICEFS=y
# CONFIG_USB_DEVICE_CLASS is not set
# CONFIG_USB_DYNAMIC_MINORS is not set
# CONFIG_USB_SUSPEND is not set
+# CONFIG_USB_PERSIST is not set
# CONFIG_USB_OTG is not set
#
@@ -1104,7 +1088,6 @@ CONFIG_USB_EHCI_HCD=y
# CONFIG_USB_EHCI_SPLIT_ISO is not set
# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
# CONFIG_USB_EHCI_TT_NEWSCHED is not set
-# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set
# CONFIG_USB_ISP116X_HCD is not set
CONFIG_USB_OHCI_HCD=y
# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
@@ -1112,6 +1095,7 @@ CONFIG_USB_OHCI_HCD=y
CONFIG_USB_OHCI_LITTLE_ENDIAN=y
CONFIG_USB_UHCI_HCD=y
# CONFIG_USB_SL811_HCD is not set
+# CONFIG_USB_R8A66597_HCD is not set
#
# USB Device Class drivers
@@ -1202,15 +1186,7 @@ CONFIG_USB_MON=y
#
# LED Triggers
#
-
-#
-# InfiniBand support
-#
# CONFIG_INFINIBAND is not set
-
-#
-# EDAC - error detection and reporting (RAS) (EXPERIMENTAL)
-#
# CONFIG_EDAC is not set
#
@@ -1230,11 +1206,13 @@ CONFIG_USB_MON=y
#
# DMA Devices
#
+CONFIG_VIRTUALIZATION=y
+# CONFIG_KVM is not set
#
-# Virtualization
+# Userspace I/O
#
-# CONFIG_KVM is not set
+# CONFIG_UIO is not set
#
# File systems
@@ -1272,6 +1250,7 @@ CONFIG_DNOTIFY=y
# CONFIG_AUTOFS_FS is not set
CONFIG_AUTOFS4_FS=y
# CONFIG_FUSE_FS is not set
+CONFIG_GENERIC_ACL=y
#
# CD-ROM/DVD Filesystems
@@ -1299,7 +1278,7 @@ CONFIG_PROC_KCORE=y
CONFIG_PROC_SYSCTL=y
CONFIG_SYSFS=y
CONFIG_TMPFS=y
-# CONFIG_TMPFS_POSIX_ACL is not set
+CONFIG_TMPFS_POSIX_ACL=y
CONFIG_HUGETLBFS=y
CONFIG_HUGETLB_PAGE=y
CONFIG_RAMFS=y
@@ -1349,7 +1328,6 @@ CONFIG_SUNRPC=y
# CONFIG_NCP_FS is not set
# CONFIG_CODA_FS is not set
# CONFIG_AFS_FS is not set
-# CONFIG_9P_FS is not set
#
# Partition Types
@@ -1405,10 +1383,7 @@ CONFIG_NLS_UTF8=y
# Distributed Lock Manager
#
# CONFIG_DLM is not set
-
-#
-# Instrumentation Support
-#
+CONFIG_INSTRUMENTATION=y
CONFIG_PROFILING=y
CONFIG_OPROFILE=y
CONFIG_KPROBES=y
@@ -1418,7 +1393,7 @@ CONFIG_KPROBES=y
#
CONFIG_TRACE_IRQFLAGS_SUPPORT=y
# CONFIG_PRINTK_TIME is not set
-CONFIG_ENABLE_MUST_CHECK=y
+# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_MAGIC_SYSRQ=y
CONFIG_UNUSED_SYMBOLS=y
# CONFIG_DEBUG_FS is not set
@@ -1426,15 +1401,17 @@ CONFIG_UNUSED_SYMBOLS=y
CONFIG_DEBUG_KERNEL=y
# CONFIG_DEBUG_SHIRQ is not set
CONFIG_DETECT_SOFTLOCKUP=y
+# CONFIG_SCHED_DEBUG is not set
# CONFIG_SCHEDSTATS is not set
-# CONFIG_TIMER_STATS is not set
-# CONFIG_DEBUG_SLAB is not set
+CONFIG_TIMER_STATS=y
+# CONFIG_SLUB_DEBUG_ON is not set
# CONFIG_DEBUG_RT_MUTEXES is not set
# CONFIG_RT_MUTEX_TESTER is not set
# CONFIG_DEBUG_SPINLOCK is not set
# CONFIG_DEBUG_MUTEXES is not set
# CONFIG_DEBUG_LOCK_ALLOC is not set
# CONFIG_PROVE_LOCKING is not set
+# CONFIG_LOCK_STAT is not set
# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
# CONFIG_DEBUG_KOBJECT is not set
@@ -1444,7 +1421,6 @@ CONFIG_DEBUG_BUGVERBOSE=y
# CONFIG_DEBUG_VM is not set
# CONFIG_DEBUG_LIST is not set
# CONFIG_FRAME_POINTER is not set
-# CONFIG_UNWIND_INFO is not set
# CONFIG_FORCED_INLINING is not set
# CONFIG_RCU_TORTURE_TEST is not set
# CONFIG_LKDTM is not set
@@ -1463,10 +1439,6 @@ CONFIG_DOUBLEFAULT=y
#
# CONFIG_KEYS is not set
# CONFIG_SECURITY is not set
-
-#
-# Cryptographic options
-#
# CONFIG_CRYPTO is not set
#
@@ -1477,6 +1449,7 @@ CONFIG_BITREVERSE=y
# CONFIG_CRC16 is not set
# CONFIG_CRC_ITU_T is not set
CONFIG_CRC32=y
+# CONFIG_CRC7 is not set
# CONFIG_LIBCRC32C is not set
CONFIG_ZLIB_INFLATE=y
CONFIG_PLIST=y
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 06da59f6f837..dbe5e87e0d66 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_VM86) += vm86.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_K8_NB) += k8.o
+obj-$(CONFIG_MGEODE_LX) += geode.o
obj-$(CONFIG_VMI) += vmi.o vmiclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index a574cd2c8b61..cacdd883bf2b 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -618,6 +618,8 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table)
#ifdef CONFIG_HPET_TIMER
#include <asm/hpet.h>
+static struct __initdata resource *hpet_res;
+
static int __init acpi_parse_hpet(struct acpi_table_header *table)
{
struct acpi_table_hpet *hpet_tbl;
@@ -638,8 +640,42 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
hpet_tbl->id, hpet_address);
+ /*
+ * Allocate and initialize the HPET firmware resource for adding into
+ * the resource tree during the lateinit timeframe.
+ */
+#define HPET_RESOURCE_NAME_SIZE 9
+ hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
+
+ if (!hpet_res)
+ return 0;
+
+ memset(hpet_res, 0, sizeof(*hpet_res));
+ hpet_res->name = (void *)&hpet_res[1];
+ hpet_res->flags = IORESOURCE_MEM;
+ snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u",
+ hpet_tbl->sequence);
+
+ hpet_res->start = hpet_address;
+ hpet_res->end = hpet_address + (1 * 1024) - 1;
+
return 0;
}
+
+/*
+ * hpet_insert_resource inserts the HPET resources used into the resource
+ * tree.
+ */
+static __init int hpet_insert_resource(void)
+{
+ if (!hpet_res)
+ return 1;
+
+ return insert_resource(&iomem_resource, hpet_res);
+}
+
+late_initcall(hpet_insert_resource);
+
#else
#define acpi_parse_hpet NULL
#endif
@@ -950,14 +986,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
},
{
.callback = force_acpi_ht,
- .ident = "DELL GX240",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"),
- DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"),
- },
- },
- {
- .callback = force_acpi_ht,
.ident = "HP VISUALIZE NT Workstation",
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
diff --git a/arch/i386/kernel/acpi/sleep.c b/arch/i386/kernel/acpi/sleep.c
index 4ee83577bf61..c42b5ab49deb 100644
--- a/arch/i386/kernel/acpi/sleep.c
+++ b/arch/i386/kernel/acpi/sleep.c
@@ -14,7 +14,7 @@
/* address in low memory of the wakeup routine. */
unsigned long acpi_wakeup_address = 0;
-unsigned long acpi_video_flags;
+unsigned long acpi_realmode_flags;
extern char wakeup_start, wakeup_end;
extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
@@ -68,9 +68,11 @@ static int __init acpi_sleep_setup(char *str)
{
while ((str != NULL) && (*str != '\0')) {
if (strncmp(str, "s3_bios", 7) == 0)
- acpi_video_flags = 1;
+ acpi_realmode_flags |= 1;
if (strncmp(str, "s3_mode", 7) == 0)
- acpi_video_flags |= 2;
+ acpi_realmode_flags |= 2;
+ if (strncmp(str, "s3_beep", 7) == 0)
+ acpi_realmode_flags |= 4;
str = strchr(str, ',');
if (str != NULL)
str += strspn(str, ", \t");
@@ -80,9 +82,11 @@ static int __init acpi_sleep_setup(char *str)
__setup("acpi_sleep=", acpi_sleep_setup);
+/* Ouch, we want to delete this. We already have better version in userspace, in
+ s2ram from suspend.sf.net project */
static __init int reset_videomode_after_s3(struct dmi_system_id *d)
{
- acpi_video_flags |= 2;
+ acpi_realmode_flags |= 2;
return 0;
}
diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S
index a2295a34b2c7..ed0a0f2c1597 100644
--- a/arch/i386/kernel/acpi/wakeup.S
+++ b/arch/i386/kernel/acpi/wakeup.S
@@ -13,6 +13,21 @@
# cs = 0x1234, eip = 0x05
#
+#define BEEP \
+ inb $97, %al; \
+ outb %al, $0x80; \
+ movb $3, %al; \
+ outb %al, $97; \
+ outb %al, $0x80; \
+ movb $-74, %al; \
+ outb %al, $67; \
+ outb %al, $0x80; \
+ movb $-119, %al; \
+ outb %al, $66; \
+ outb %al, $0x80; \
+ movb $15, %al; \
+ outb %al, $66;
+
ALIGN
.align 4096
ENTRY(wakeup_start)
@@ -31,6 +46,11 @@ wakeup_code:
movw %cs, %ax
movw %ax, %ds # Make ds:0 point to wakeup_start
movw %ax, %ss
+
+ testl $4, realmode_flags - wakeup_code
+ jz 1f
+ BEEP
+1:
mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board
movw $0x0e00 + 'S', %fs:(0x12)
@@ -41,7 +61,7 @@ wakeup_code:
cmpl $0x12345678, %eax
jne bogus_real_magic
- testl $1, video_flags - wakeup_code
+ testl $1, realmode_flags - wakeup_code
jz 1f
lcall $0xc000,$3
movw %cs, %ax
@@ -49,7 +69,7 @@ wakeup_code:
movw %ax, %ss
1:
- testl $2, video_flags - wakeup_code
+ testl $2, realmode_flags - wakeup_code
jz 1f
mov video_mode - wakeup_code, %ax
call mode_set
@@ -88,7 +108,11 @@ wakeup_code:
cmpl $0x12345678, %eax
jne bogus_real_magic
- ljmpl $__KERNEL_CS,$wakeup_pmode_return
+ testl $8, realmode_flags - wakeup_code
+ jz 1f
+ BEEP
+1:
+ ljmpl $__KERNEL_CS, $wakeup_pmode_return
real_save_gdt: .word 0
.long 0
@@ -97,7 +121,8 @@ real_save_cr3: .long 0
real_save_cr4: .long 0
real_magic: .long 0
video_mode: .long 0
-video_flags: .long 0
+realmode_flags: .long 0
+beep_flags: .long 0
real_efer_save_restore: .long 0
real_save_efer_edx: .long 0
real_save_efer_eax: .long 0
@@ -260,8 +285,8 @@ ENTRY(acpi_copy_wakeup_routine)
movl saved_videomode, %edx
movl %edx, video_mode - wakeup_start (%eax)
- movl acpi_video_flags, %edx
- movl %edx, video_flags - wakeup_start (%eax)
+ movl acpi_realmode_flags, %edx
+ movl %edx, realmode_flags - wakeup_start (%eax)
movl $0x12345678, real_magic - wakeup_start (%eax)
movl $0x12345678, saved_magic
popl %ebx
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index d8cda14fff8b..c3750c2c4113 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -2,12 +2,17 @@
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/list.h>
+#include <linux/kprobes.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
#include <asm/alternative.h>
#include <asm/sections.h>
+#include <asm/pgtable.h>
+#include <asm/mce.h>
+#include <asm/nmi.h>
-static int noreplace_smp = 0;
-static int smp_alt_once = 0;
-static int debug_alternative = 0;
+#ifdef CONFIG_HOTPLUG_CPU
+static int smp_alt_once;
static int __init bootonly(char *str)
{
@@ -15,6 +20,11 @@ static int __init bootonly(char *str)
return 1;
}
__setup("smp-alt-boot", bootonly);
+#else
+#define smp_alt_once 1
+#endif
+
+static int debug_alternative;
static int __init debug_alt(char *str)
{
@@ -23,6 +33,8 @@ static int __init debug_alt(char *str)
}
__setup("debug-alternative", debug_alt);
+static int noreplace_smp;
+
static int __init setup_noreplace_smp(char *str)
{
noreplace_smp = 1;
@@ -144,7 +156,7 @@ static void nop_out(void *insns, unsigned int len)
unsigned int noplen = len;
if (noplen > ASM_NOP_MAX)
noplen = ASM_NOP_MAX;
- memcpy(insns, noptable[noplen], noplen);
+ text_poke(insns, noptable[noplen], noplen);
insns += noplen;
len -= noplen;
}
@@ -196,7 +208,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
continue;
if (*ptr > text_end)
continue;
- **ptr = 0xf0; /* lock prefix */
+ text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
};
}
@@ -354,10 +366,6 @@ void apply_paravirt(struct paravirt_patch_site *start,
/* Pad the rest with nops */
nop_out(p->instr + used, p->len - used);
}
-
- /* Sync to be conservative, in case we patched following
- * instructions */
- sync_core();
}
extern struct paravirt_patch_site __start_parainstructions[],
__stop_parainstructions[];
@@ -367,6 +375,14 @@ void __init alternative_instructions(void)
{
unsigned long flags;
+ /* The patching is not fully atomic, so try to avoid local interruptions
+ that might execute the to be patched code.
+ Other CPUs are not running. */
+ stop_nmi();
+#ifdef CONFIG_MCE
+ stop_mce();
+#endif
+
local_irq_save(flags);
apply_alternatives(__alt_instructions, __alt_instructions_end);
@@ -376,8 +392,6 @@ void __init alternative_instructions(void)
#ifdef CONFIG_HOTPLUG_CPU
if (num_possible_cpus() < 2)
smp_alt_once = 1;
-#else
- smp_alt_once = 1;
#endif
#ifdef CONFIG_SMP
@@ -401,4 +415,37 @@ void __init alternative_instructions(void)
#endif
apply_paravirt(__parainstructions, __parainstructions_end);
local_irq_restore(flags);
+
+ restart_nmi();
+#ifdef CONFIG_MCE
+ restart_mce();
+#endif
+}
+
+/*
+ * Warning:
+ * When you use this code to patch more than one byte of an instruction
+ * you need to make sure that other CPUs cannot execute this code in parallel.
+ * Also no thread must be currently preempted in the middle of these instructions.
+ * And on the local CPU you need to be protected again NMI or MCE handlers
+ * seeing an inconsistent instruction while you patch.
+ */
+void __kprobes text_poke(void *oaddr, unsigned char *opcode, int len)
+{
+ u8 *addr = oaddr;
+ if (!pte_write(*lookup_address((unsigned long)addr))) {
+ struct page *p[2] = { virt_to_page(addr), virt_to_page(addr+PAGE_SIZE) };
+ addr = vmap(p, 2, VM_MAP, PAGE_KERNEL);
+ if (!addr)
+ return;
+ addr += ((unsigned long)oaddr) % PAGE_SIZE;
+ }
+ memcpy(addr, opcode, len);
+ sync_core();
+ /* Not strictly needed, but can speed CPU recovery up. Ignore cross cacheline
+ case. */
+ if (cpu_has_clflush)
+ asm("clflush (%0) " :: "r" (oaddr) : "memory");
+ if (addr != oaddr)
+ vunmap(addr);
}
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 67824f3bb974..bfc6cb7df7e7 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -263,6 +263,9 @@ static void lapic_timer_setup(enum clock_event_mode mode,
v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
apic_write_around(APIC_LVTT, v);
break;
+ case CLOCK_EVT_MODE_RESUME:
+ /* Nothing to do here */
+ break;
}
local_irq_restore(flags);
@@ -315,7 +318,7 @@ static void __devinit setup_APIC_timer(void)
#define LAPIC_CAL_LOOPS (HZ/10)
-static __initdata volatile int lapic_cal_loops = -1;
+static __initdata int lapic_cal_loops = -1;
static __initdata long lapic_cal_t1, lapic_cal_t2;
static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
@@ -485,7 +488,7 @@ void __init setup_boot_APIC_clock(void)
/* Let the interrupts run */
local_irq_enable();
- while(lapic_cal_loops <= LAPIC_CAL_LOOPS)
+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
cpu_relax();
local_irq_disable();
@@ -521,6 +524,9 @@ void __init setup_boot_APIC_clock(void)
*/
if (nmi_watchdog != NMI_IO_APIC)
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+ else
+ printk(KERN_WARNING "APIC timer registered as dummy,"
+ " due to nmi_watchdog=1!\n");
}
/* Setup the lapic or request the broadcast */
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 4112afe712b9..47001d50a083 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -222,6 +222,7 @@
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/kernel.h>
+#include <linux/freezer.h>
#include <linux/smp.h>
#include <linux/dmi.h>
#include <linux/suspend.h>
@@ -2311,7 +2312,6 @@ static int __init apm_init(void)
remove_proc_entry("apm", NULL);
return err;
}
- kapmd_task->flags |= PF_NOFREEZE;
wake_up_process(kapmd_task);
if (num_online_cpus() > 1 && !smp ) {
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 27a776c9044d..7288ac88d746 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -17,6 +17,13 @@
#include <asm/thread_info.h>
#include <asm/elf.h>
+#include <xen/interface/xen.h>
+
+#ifdef CONFIG_LGUEST_GUEST
+#include <linux/lguest.h>
+#include "../../../drivers/lguest/lg.h"
+#endif
+
#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -59,6 +66,7 @@ void foo(void)
OFFSET(TI_addr_limit, thread_info, addr_limit);
OFFSET(TI_restart_block, thread_info, restart_block);
OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+ OFFSET(TI_cpu, thread_info, cpu);
BLANK();
OFFSET(GDS_size, Xgt_desc_struct, size);
@@ -115,4 +123,25 @@ void foo(void)
OFFSET(PARAVIRT_iret, paravirt_ops, iret);
OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
#endif
+
+#ifdef CONFIG_XEN
+ BLANK();
+ OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+ OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#endif
+
+#ifdef CONFIG_LGUEST_GUEST
+ BLANK();
+ OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+ OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
+ OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
+ OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
+ OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
+ OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
+ OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
+ OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
+ OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
+ OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
+ OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
+#endif
}
diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile
index 74f27a463db0..778396c78d65 100644
--- a/arch/i386/kernel/cpu/Makefile
+++ b/arch/i386/kernel/cpu/Makefile
@@ -8,8 +8,7 @@ obj-y += amd.o
obj-y += cyrix.o
obj-y += centaur.o
obj-y += transmeta.o
-obj-y += intel.o intel_cacheinfo.o
-obj-y += rise.o
+obj-y += intel.o intel_cacheinfo.o addon_cpuid_features.o
obj-y += nexgen.o
obj-y += umc.o
diff --git a/arch/i386/kernel/cpu/addon_cpuid_features.c b/arch/i386/kernel/cpu/addon_cpuid_features.c
new file mode 100644
index 000000000000..3e91d3ee26ec
--- /dev/null
+++ b/arch/i386/kernel/cpu/addon_cpuid_features.c
@@ -0,0 +1,50 @@
+
+/*
+ * Routines to indentify additional cpu features that are scattered in
+ * cpuid space.
+ */
+
+#include <linux/cpu.h>
+
+#include <asm/processor.h>
+
+struct cpuid_bit {
+ u16 feature;
+ u8 reg;
+ u8 bit;
+ u32 level;
+};
+
+enum cpuid_regs {
+ CR_EAX = 0,
+ CR_ECX,
+ CR_EDX,
+ CR_EBX
+};
+
+void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
+{
+ u32 max_level;
+ u32 regs[4];
+ const struct cpuid_bit *cb;
+
+ static const struct cpuid_bit cpuid_bits[] = {
+ { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
+ { 0, 0, 0, 0 }
+ };
+
+ for (cb = cpuid_bits; cb->feature; cb++) {
+
+ /* Verify that the level is valid */
+ max_level = cpuid_eax(cb->level & 0xffff0000);
+ if (max_level < cb->level ||
+ max_level > (cb->level | 0xffff))
+ continue;
+
+ cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
+ &regs[CR_ECX], &regs[CR_EDX]);
+
+ if (regs[cb->reg] & (1 << cb->bit))
+ set_bit(cb->feature, c->x86_capability);
+ }
+}
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index 6f47eeeb93ea..c7ba455d5ac7 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -231,6 +231,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
switch (c->x86) {
case 15:
+ /* Use K8 tuning for Fam10h and Fam11h */
+ case 0x10:
+ case 0x11:
set_bit(X86_FEATURE_K8, c->x86_capability);
break;
case 6:
@@ -272,8 +275,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
}
#endif
- if (cpuid_eax(0x80000000) >= 0x80000006)
- num_cache_leaves = 3;
+ if (cpuid_eax(0x80000000) >= 0x80000006) {
+ if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000))
+ num_cache_leaves = 4;
+ else
+ num_cache_leaves = 3;
+ }
if (amd_apic_timer_broken())
set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability);
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 794d593c47eb..d506201d397c 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -353,6 +353,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
if ( xlvl >= 0x80000004 )
get_model_name(c); /* Default name */
}
+
+ init_scattered_cpuid_features(c);
}
early_intel_workaround(c);
@@ -604,7 +606,6 @@ extern int nsc_init_cpu(void);
extern int amd_init_cpu(void);
extern int centaur_init_cpu(void);
extern int transmeta_init_cpu(void);
-extern int rise_init_cpu(void);
extern int nexgen_init_cpu(void);
extern int umc_init_cpu(void);
@@ -616,7 +617,6 @@ void __init early_cpu_init(void)
amd_init_cpu();
centaur_init_cpu();
transmeta_init_cpu();
- rise_init_cpu();
nexgen_init_cpu();
umc_init_cpu();
early_cpu_detect();
diff --git a/arch/i386/kernel/cpu/cpufreq/Kconfig b/arch/i386/kernel/cpu/cpufreq/Kconfig
index e912aae9473c..094118ba00da 100644
--- a/arch/i386/kernel/cpu/cpufreq/Kconfig
+++ b/arch/i386/kernel/cpu/cpufreq/Kconfig
@@ -90,10 +90,17 @@ config X86_POWERNOW_K8
If in doubt, say N.
config X86_POWERNOW_K8_ACPI
- bool
- depends on X86_POWERNOW_K8 && ACPI_PROCESSOR
- depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m)
+ bool "ACPI Support"
+ select ACPI_PROCESSOR
+ depends on X86_POWERNOW_K8
default y
+ help
+ This provides access to the K8s Processor Performance States via ACPI.
+ This driver is probably required for CPUFreq to work with multi-socket and
+ SMP systems. It is not required on at least some single-socket yet
+ multi-core systems, even if SMP is enabled.
+
+ It is safe to say Y here.
config X86_GX_SUSPMOD
tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
@@ -109,7 +116,7 @@ config X86_GX_SUSPMOD
config X86_SPEEDSTEP_CENTRINO
tristate "Intel Enhanced SpeedStep"
select CPU_FREQ_TABLE
- select X86_SPEEDSTEP_CENTRINO_TABLE if (!X86_SPEEDSTEP_CENTRINO_ACPI)
+ select X86_SPEEDSTEP_CENTRINO_TABLE
help
This adds the CPUFreq driver for Enhanced SpeedStep enabled
mobile CPUs. This means Intel Pentium M (Centrino) CPUs. However,
@@ -121,20 +128,6 @@ config X86_SPEEDSTEP_CENTRINO
If in doubt, say N.
-config X86_SPEEDSTEP_CENTRINO_ACPI
- bool "Use ACPI tables to decode valid frequency/voltage (deprecated)"
- depends on X86_SPEEDSTEP_CENTRINO && ACPI_PROCESSOR
- depends on !(X86_SPEEDSTEP_CENTRINO = y && ACPI_PROCESSOR = m)
- help
- This is deprecated and this functionality is now merged into
- acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of
- speedstep_centrino.
- Use primarily the information provided in the BIOS ACPI tables
- to determine valid CPU frequency and voltage pairings. It is
- required for the driver to work on non-Banias CPUs.
-
- If in doubt, say Y.
-
config X86_SPEEDSTEP_CENTRINO_TABLE
bool "Built-in tables for Banias CPUs"
depends on X86_SPEEDSTEP_CENTRINO
@@ -230,7 +223,7 @@ comment "shared options"
config X86_ACPI_CPUFREQ_PROC_INTF
bool "/proc/acpi/processor/../performance interface (deprecated)"
depends on PROC_FS
- depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K7_ACPI || X86_POWERNOW_K8_ACPI
+ depends on X86_ACPI_CPUFREQ || X86_POWERNOW_K7_ACPI || X86_POWERNOW_K8_ACPI
help
This enables the deprecated /proc/acpi/processor/../performance
interface. While it is helpful for debugging, the generic,
diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
index 10baa3501ed3..6f846bee2103 100644
--- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -167,11 +167,13 @@ static void do_drv_read(struct drv_cmd *cmd)
static void do_drv_write(struct drv_cmd *cmd)
{
- u32 h = 0;
+ u32 lo, hi;
switch (cmd->type) {
case SYSTEM_INTEL_MSR_CAPABLE:
- wrmsr(cmd->addr.msr.reg, cmd->val, h);
+ rdmsr(cmd->addr.msr.reg, lo, hi);
+ lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
+ wrmsr(cmd->addr.msr.reg, lo, hi);
break;
case SYSTEM_IO_CAPABLE:
acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
@@ -372,7 +374,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
struct cpufreq_freqs freqs;
cpumask_t online_policy_cpus;
struct drv_cmd cmd;
- unsigned int msr;
unsigned int next_state = 0; /* Index into freq_table */
unsigned int next_perf_state = 0; /* Index into perf table */
unsigned int i;
@@ -417,11 +418,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
- msr =
- (u32) perf->states[next_perf_state].
- control & INTEL_MSR_RANGE;
- cmd.val = get_cur_val(online_policy_cpus);
- cmd.val = (cmd.val & ~INTEL_MSR_RANGE) | msr;
+ cmd.val = (u32) perf->states[next_perf_state].control;
break;
case SYSTEM_IO_CAPABLE:
cmd.type = SYSTEM_IO_CAPABLE;
@@ -668,8 +665,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
data->max_freq = perf->states[0].core_frequency * 1000;
/* table init */
for (i=0; i<perf->state_count; i++) {
- if (i>0 && perf->states[i].core_frequency ==
- perf->states[i-1].core_frequency)
+ if (i>0 && perf->states[i].core_frequency >=
+ data->freq_table[valid_states-1].frequency / 1000)
continue;
data->freq_table[valid_states].index = i;
diff --git a/arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c
index 0d49d73d1b71..66acd5039918 100644
--- a/arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ b/arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -391,8 +391,6 @@ static struct cpufreq_driver nforce2_driver = {
*/
static unsigned int nforce2_detect_chipset(void)
{
- u8 revision;
-
nforce2_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
PCI_DEVICE_ID_NVIDIA_NFORCE2,
PCI_ANY_ID, PCI_ANY_ID, NULL);
@@ -400,10 +398,8 @@ static unsigned int nforce2_detect_chipset(void)
if (nforce2_chipset_dev == NULL)
return -ENODEV;
- pci_read_config_byte(nforce2_chipset_dev, PCI_REVISION_ID, &revision);
-
printk(KERN_INFO "cpufreq: Detected nForce2 chipset revision %X\n",
- revision);
+ nforce2_chipset_dev->revision);
printk(KERN_INFO
"cpufreq: FSB changing is maybe unstable and can lead to crashes and data loss.\n");
diff --git a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c
index 6667e9cceb9f..461dabc4e495 100644
--- a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c
@@ -79,7 +79,7 @@
#include <linux/smp.h>
#include <linux/cpufreq.h>
#include <linux/pci.h>
-#include <asm/processor.h>
+#include <asm/processor-cyrix.h>
#include <asm/errno.h>
/* PCI config registers, all at F0 */
@@ -115,7 +115,6 @@ struct gxfreq_params {
u8 pci_suscfg;
u8 pci_pmer1;
u8 pci_pmer2;
- u8 pci_rev;
struct pci_dev *cs55x0;
};
@@ -276,7 +275,7 @@ static void gx_set_cpuspeed(unsigned int khz)
pci_write_config_byte(gx_params->cs55x0, PCI_VIDTC, 100);/* typical 50 to 100ms */
pci_write_config_byte(gx_params->cs55x0, PCI_PMER1, pmer1);
- if (gx_params->pci_rev < 0x10) { /* CS5530(rev 1.2, 1.3) */
+ if (gx_params->cs55x0->revision < 0x10) { /* CS5530(rev 1.2, 1.3) */
suscfg = gx_params->pci_suscfg | SUSMOD;
} else { /* CS5530A,B.. */
suscfg = gx_params->pci_suscfg | SUSMOD | PWRSVE;
@@ -471,7 +470,6 @@ static int __init cpufreq_gx_init(void)
pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
pci_read_config_byte(params->cs55x0, PCI_MODOFF, &(params->off_duration));
- pci_read_config_byte(params->cs55x0, PCI_REVISION_ID, &params->pci_rev);
if ((ret = cpufreq_register_driver(&gx_suspmod_driver))) {
kfree(params);
diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c
index a3df9c039bd4..ef8f0bc3fc71 100644
--- a/arch/i386/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c
@@ -29,6 +29,7 @@
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/delay.h>
#include <asm/msr.h>
#include <asm/timex.h>
@@ -55,7 +56,6 @@
/* Flags */
#define USE_ACPI_C3 (1 << 1)
#define USE_NORTHBRIDGE (1 << 2)
-#define USE_VT8235 (1 << 3)
static int cpu_model;
static unsigned int numscales=16;
@@ -63,22 +63,19 @@ static unsigned int fsb;
static const struct mV_pos *vrm_mV_table;
static const unsigned char *mV_vrm_table;
-struct f_msr {
- u8 vrm;
- u8 pos;
-};
-static struct f_msr f_msr_table[32];
static unsigned int highest_speed, lowest_speed; /* kHz */
static unsigned int minmult, maxmult;
static int can_scale_voltage;
static struct acpi_processor *pr = NULL;
static struct acpi_processor_cx *cx = NULL;
+static u32 acpi_regs_addr;
static u8 longhaul_flags;
-static u8 longhaul_pos;
+static unsigned int longhaul_index;
/* Module parameters */
static int scale_voltage;
+static int disable_acpi_c3;
#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg)
@@ -144,7 +141,7 @@ static void do_longhaul1(unsigned int clock_ratio_index)
rdmsrl(MSR_VIA_BCR2, bcr2.val);
/* Enable software clock multiplier */
bcr2.bits.ESOFTBF = 1;
- bcr2.bits.CLOCKMUL = clock_ratio_index;
+ bcr2.bits.CLOCKMUL = clock_ratio_index & 0xff;
/* Sync to timer tick */
safe_halt();
@@ -163,14 +160,12 @@ static void do_longhaul1(unsigned int clock_ratio_index)
/* For processor with Longhaul MSR */
-static void do_powersaver(int cx_address, unsigned int clock_ratio_index)
+static void do_powersaver(int cx_address, unsigned int clock_ratio_index,
+ unsigned int dir)
{
union msr_longhaul longhaul;
- u8 dest_pos;
u32 t;
- dest_pos = f_msr_table[clock_ratio_index].pos;
-
rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
/* Setup new frequency */
longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
@@ -178,11 +173,11 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index)
longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4;
/* Setup new voltage */
if (can_scale_voltage)
- longhaul.bits.SoftVID = f_msr_table[clock_ratio_index].vrm;
+ longhaul.bits.SoftVID = (clock_ratio_index >> 8) & 0x1f;
/* Sync to timer tick */
safe_halt();
/* Raise voltage if necessary */
- if (can_scale_voltage && longhaul_pos < dest_pos) {
+ if (can_scale_voltage && dir) {
longhaul.bits.EnableSoftVID = 1;
wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
/* Change voltage */
@@ -199,7 +194,6 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index)
}
longhaul.bits.EnableSoftVID = 0;
wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- longhaul_pos = dest_pos;
}
/* Change frequency on next halt or sleep */
@@ -220,7 +214,7 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index)
wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
/* Reduce voltage if necessary */
- if (can_scale_voltage && longhaul_pos > dest_pos) {
+ if (can_scale_voltage && !dir) {
longhaul.bits.EnableSoftVID = 1;
wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
/* Change voltage */
@@ -237,7 +231,6 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index)
}
longhaul.bits.EnableSoftVID = 0;
wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- longhaul_pos = dest_pos;
}
}
@@ -248,25 +241,28 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index)
* Sets a new clock ratio.
*/
-static void longhaul_setstate(unsigned int clock_ratio_index)
+static void longhaul_setstate(unsigned int table_index)
{
+ unsigned int clock_ratio_index;
int speed, mult;
struct cpufreq_freqs freqs;
- static unsigned int old_ratio=-1;
unsigned long flags;
unsigned int pic1_mask, pic2_mask;
+ u16 bm_status = 0;
+ u32 bm_timeout = 1000;
+ unsigned int dir = 0;
- if (old_ratio == clock_ratio_index)
- return;
- old_ratio = clock_ratio_index;
-
- mult = clock_ratio[clock_ratio_index];
+ clock_ratio_index = longhaul_table[table_index].index;
+ /* Safety precautions */
+ mult = clock_ratio[clock_ratio_index & 0x1f];
if (mult == -1)
return;
-
speed = calc_speed(mult);
if ((speed > highest_speed) || (speed < lowest_speed))
return;
+ /* Voltage transition before frequency transition? */
+ if (can_scale_voltage && longhaul_index < table_index)
+ dir = 1;
freqs.old = calc_speed(longhaul_get_cpu_mult());
freqs.new = speed;
@@ -285,11 +281,24 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
outb(0xFF,0xA1); /* Overkill */
outb(0xFE,0x21); /* TMR0 only */
+ /* Wait while PCI bus is busy. */
+ if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
+ || ((pr != NULL) && pr->flags.bm_control))) {
+ bm_status = inw(acpi_regs_addr);
+ bm_status &= 1 << 4;
+ while (bm_status && bm_timeout) {
+ outw(1 << 4, acpi_regs_addr);
+ bm_timeout--;
+ bm_status = inw(acpi_regs_addr);
+ bm_status &= 1 << 4;
+ }
+ }
+
if (longhaul_flags & USE_NORTHBRIDGE) {
/* Disable AGP and PCI arbiters */
outb(3, 0x22);
} else if ((pr != NULL) && pr->flags.bm_control) {
- /* Disable bus master arbitration */
+ /* Disable bus master arbitration */
acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
}
switch (longhaul_version) {
@@ -314,9 +323,9 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
if (longhaul_flags & USE_ACPI_C3) {
/* Don't allow wakeup */
acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
- do_powersaver(cx->address, clock_ratio_index);
+ do_powersaver(cx->address, clock_ratio_index, dir);
} else {
- do_powersaver(0, clock_ratio_index);
+ do_powersaver(0, clock_ratio_index, dir);
}
break;
}
@@ -336,6 +345,9 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
freqs.new = calc_speed(longhaul_get_cpu_mult());
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+
+ if (!bm_timeout)
+ printk(KERN_INFO PFX "Warning: Timeout while waiting for idle PCI bus.\n");
}
/*
@@ -369,7 +381,8 @@ static int guess_fsb(int mult)
static int __init longhaul_get_ranges(void)
{
- unsigned int j, k = 0;
+ unsigned int i, j, k = 0;
+ unsigned int ratio;
int mult;
/* Get current frequency */
@@ -423,8 +436,7 @@ static int __init longhaul_get_ranges(void)
if(!longhaul_table)
return -ENOMEM;
- for (j=0; j < numscales; j++) {
- unsigned int ratio;
+ for (j = 0; j < numscales; j++) {
ratio = clock_ratio[j];
if (ratio == -1)
continue;
@@ -434,13 +446,41 @@ static int __init longhaul_get_ranges(void)
longhaul_table[k].index = j;
k++;
}
+ if (k <= 1) {
+ kfree(longhaul_table);
+ return -ENODEV;
+ }
+ /* Sort */
+ for (j = 0; j < k - 1; j++) {
+ unsigned int min_f, min_i;
+ min_f = longhaul_table[j].frequency;
+ min_i = j;
+ for (i = j + 1; i < k; i++) {
+ if (longhaul_table[i].frequency < min_f) {
+ min_f = longhaul_table[i].frequency;
+ min_i = i;
+ }
+ }
+ if (min_i != j) {
+ unsigned int temp;
+ temp = longhaul_table[j].frequency;
+ longhaul_table[j].frequency = longhaul_table[min_i].frequency;
+ longhaul_table[min_i].frequency = temp;
+ temp = longhaul_table[j].index;
+ longhaul_table[j].index = longhaul_table[min_i].index;
+ longhaul_table[min_i].index = temp;
+ }
+ }
longhaul_table[k].frequency = CPUFREQ_TABLE_END;
- if (!k) {
- kfree (longhaul_table);
- return -EINVAL;
- }
+ /* Find index we are running on */
+ for (j = 0; j < k; j++) {
+ if (clock_ratio[longhaul_table[j].index & 0x1f] == mult) {
+ longhaul_index = j;
+ break;
+ }
+ }
return 0;
}
@@ -448,7 +488,7 @@ static int __init longhaul_get_ranges(void)
static void __init longhaul_setup_voltagescaling(void)
{
union msr_longhaul longhaul;
- struct mV_pos minvid, maxvid;
+ struct mV_pos minvid, maxvid, vid;
unsigned int j, speed, pos, kHz_step, numvscales;
int min_vid_speed;
@@ -459,11 +499,11 @@ static void __init longhaul_setup_voltagescaling(void)
}
if (!longhaul.bits.VRMRev) {
- printk (KERN_INFO PFX "VRM 8.5\n");
+ printk(KERN_INFO PFX "VRM 8.5\n");
vrm_mV_table = &vrm85_mV[0];
mV_vrm_table = &mV_vrm85[0];
} else {
- printk (KERN_INFO PFX "Mobile VRM\n");
+ printk(KERN_INFO PFX "Mobile VRM\n");
if (cpu_model < CPU_NEHEMIAH)
return;
vrm_mV_table = &mobilevrm_mV[0];
@@ -523,7 +563,6 @@ static void __init longhaul_setup_voltagescaling(void)
/* Calculate kHz for one voltage step */
kHz_step = (highest_speed - min_vid_speed) / numvscales;
-
j = 0;
while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
speed = longhaul_table[j].frequency;
@@ -531,15 +570,14 @@ static void __init longhaul_setup_voltagescaling(void)
pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
else
pos = minvid.pos;
- f_msr_table[longhaul_table[j].index].vrm = mV_vrm_table[pos];
- f_msr_table[longhaul_table[j].index].pos = pos;
+ longhaul_table[j].index |= mV_vrm_table[pos] << 8;
+ vid = vrm_mV_table[mV_vrm_table[pos]];
+ printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n", speed, j, vid.mV);
j++;
}
- longhaul_pos = maxvid.pos;
can_scale_voltage = 1;
- printk(KERN_INFO PFX "Voltage scaling enabled. "
- "Use of \"conservative\" governor is highly recommended.\n");
+ printk(KERN_INFO PFX "Voltage scaling enabled.\n");
}
@@ -553,15 +591,44 @@ static int longhaul_target(struct cpufreq_policy *policy,
unsigned int target_freq, unsigned int relation)
{
unsigned int table_index = 0;
- unsigned int new_clock_ratio = 0;
+ unsigned int i;
+ unsigned int dir = 0;
+ u8 vid, current_vid;
if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, relation, &table_index))
return -EINVAL;
- new_clock_ratio = longhaul_table[table_index].index & 0xFF;
-
- longhaul_setstate(new_clock_ratio);
+ /* Don't set same frequency again */
+ if (longhaul_index == table_index)
+ return 0;
+ if (!can_scale_voltage)
+ longhaul_setstate(table_index);
+ else {
+ /* On test system voltage transitions exceeding single
+ * step up or down were turning motherboard off. Both
+ * "ondemand" and "userspace" are unsafe. C7 is doing
+ * this in hardware, C3 is old and we need to do this
+ * in software. */
+ i = longhaul_index;
+ current_vid = (longhaul_table[longhaul_index].index >> 8) & 0x1f;
+ if (table_index > longhaul_index)
+ dir = 1;
+ while (i != table_index) {
+ vid = (longhaul_table[i].index >> 8) & 0x1f;
+ if (vid != current_vid) {
+ longhaul_setstate(i);
+ current_vid = vid;
+ msleep(200);
+ }
+ if (dir)
+ i++;
+ else
+ i--;
+ }
+ longhaul_setstate(table_index);
+ }
+ longhaul_index = table_index;
return 0;
}
@@ -590,11 +657,10 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
static int enable_arbiter_disable(void)
{
struct pci_dev *dev;
- int status;
+ int status = 1;
int reg;
u8 pci_cmd;
- status = 1;
/* Find PLE133 host bridge */
reg = 0x78;
dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0,
@@ -627,13 +693,17 @@ static int enable_arbiter_disable(void)
return 0;
}
-static int longhaul_setup_vt8235(void)
+static int longhaul_setup_southbridge(void)
{
struct pci_dev *dev;
u8 pci_cmd;
/* Find VT8235 southbridge */
dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
+ if (dev == NULL)
+ /* Find VT8237 southbridge */
+ dev = pci_get_device(PCI_VENDOR_ID_VIA,
+ PCI_DEVICE_ID_VIA_8237, NULL);
if (dev != NULL) {
/* Set transition time to max */
pci_read_config_byte(dev, 0xec, &pci_cmd);
@@ -645,6 +715,14 @@ static int longhaul_setup_vt8235(void)
pci_read_config_byte(dev, 0xe5, &pci_cmd);
pci_cmd |= 1 << 7;
pci_write_config_byte(dev, 0xe5, pci_cmd);
+ /* Get address of ACPI registers block*/
+ pci_read_config_byte(dev, 0x81, &pci_cmd);
+ if (pci_cmd & 1 << 7) {
+ pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
+ acpi_regs_addr &= 0xff00;
+ printk(KERN_INFO PFX "ACPI I/O at 0x%x\n", acpi_regs_addr);
+ }
+
pci_dev_put(dev);
return 1;
}
@@ -657,7 +735,6 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
char *cpuname=NULL;
int ret;
u32 lo, hi;
- int vt8235_present;
/* Check what we have on this motherboard */
switch (c->x86_model) {
@@ -755,7 +832,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
};
/* Doesn't hurt */
- vt8235_present = longhaul_setup_vt8235();
+ longhaul_setup_southbridge();
/* Find ACPI data for processor */
acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
@@ -765,35 +842,29 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
/* Check ACPI support for C3 state */
if (pr != NULL && longhaul_version == TYPE_POWERSAVER) {
cx = &pr->power.states[ACPI_STATE_C3];
- if (cx->address > 0 && cx->latency <= 1000) {
+ if (cx->address > 0 && cx->latency <= 1000)
longhaul_flags |= USE_ACPI_C3;
- goto print_support_type;
- }
}
+ /* Disable if it isn't working */
+ if (disable_acpi_c3)
+ longhaul_flags &= ~USE_ACPI_C3;
/* Check if northbridge is friendly */
- if (enable_arbiter_disable()) {
+ if (enable_arbiter_disable())
longhaul_flags |= USE_NORTHBRIDGE;
- goto print_support_type;
- }
- /* Use VT8235 southbridge if present */
- if (longhaul_version == TYPE_POWERSAVER && vt8235_present) {
- longhaul_flags |= USE_VT8235;
- goto print_support_type;
- }
+
/* Check ACPI support for bus master arbiter disable */
- if ((pr == NULL) || !(pr->flags.bm_control)) {
+ if (!(longhaul_flags & USE_ACPI_C3
+ || longhaul_flags & USE_NORTHBRIDGE)
+ && ((pr == NULL) || !(pr->flags.bm_control))) {
printk(KERN_ERR PFX
"No ACPI support. Unsupported northbridge.\n");
return -ENODEV;
}
-print_support_type:
if (longhaul_flags & USE_NORTHBRIDGE)
- printk (KERN_INFO PFX "Using northbridge support.\n");
- else if (longhaul_flags & USE_VT8235)
- printk (KERN_INFO PFX "Using VT8235 support.\n");
- else
- printk (KERN_INFO PFX "Using ACPI support.\n");
+ printk(KERN_INFO PFX "Using northbridge support.\n");
+ if (longhaul_flags & USE_ACPI_C3)
+ printk(KERN_INFO PFX "Using ACPI support.\n");
ret = longhaul_get_ranges();
if (ret != 0)
@@ -885,6 +956,9 @@ static void __exit longhaul_exit(void)
kfree(longhaul_table);
}
+module_param (disable_acpi_c3, int, 0644);
+MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
+
module_param (scale_voltage, int, 0644);
MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.h b/arch/i386/kernel/cpu/cpufreq/longhaul.h
index 102548f12842..4fcc320997df 100644
--- a/arch/i386/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/i386/kernel/cpu/cpufreq/longhaul.h
@@ -180,7 +180,7 @@ static const int __initdata ezrat_clock_ratio[32] = {
-1, /* 0000 -> RESERVED (10.0x) */
110, /* 0001 -> 11.0x */
- 120, /* 0010 -> 12.0x */
+ -1, /* 0010 -> 12.0x */
-1, /* 0011 -> RESERVED (9.0x)*/
105, /* 0100 -> 10.5x */
115, /* 0101 -> 11.5x */
@@ -237,7 +237,7 @@ static const int __initdata ezrat_eblcr[32] = {
static const int __initdata nehemiah_clock_ratio[32] = {
100, /* 0000 -> 10.0x */
- 160, /* 0001 -> 16.0x */
+ -1, /* 0001 -> 16.0x */
40, /* 0010 -> 4.0x */
90, /* 0011 -> 9.0x */
95, /* 0100 -> 9.5x */
@@ -252,10 +252,10 @@ static const int __initdata nehemiah_clock_ratio[32] = {
75, /* 1101 -> 7.5x */
85, /* 1110 -> 8.5x */
120, /* 1111 -> 12.0x */
- 100, /* 0000 -> 10.0x */
+ -1, /* 0000 -> 10.0x */
110, /* 0001 -> 11.0x */
- 120, /* 0010 -> 12.0x */
- 90, /* 0011 -> 9.0x */
+ -1, /* 0010 -> 12.0x */
+ -1, /* 0011 -> 9.0x */
105, /* 0100 -> 10.5x */
115, /* 0101 -> 11.5x */
125, /* 0110 -> 12.5x */
@@ -267,7 +267,7 @@ static const int __initdata nehemiah_clock_ratio[32] = {
145, /* 1100 -> 14.5x */
155, /* 1101 -> 15.5x */
-1, /* 1110 -> RESERVED (13.0x) */
- 120, /* 1111 -> 12.0x */
+ -1, /* 1111 -> 12.0x */
};
static const int __initdata nehemiah_eblcr[32] = {
diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
index 4ade55c5f333..34ed53a06730 100644
--- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
@@ -599,14 +599,17 @@ static void print_basics(struct powernow_k8_data *data)
for (j = 0; j < data->numps; j++) {
if (data->powernow_table[j].frequency != CPUFREQ_ENTRY_INVALID) {
if (cpu_family == CPU_HW_PSTATE) {
- printk(KERN_INFO PFX " %d : fid 0x%x gid 0x%x (%d MHz)\n", j, (data->powernow_table[j].index & 0xff00) >> 8,
- (data->powernow_table[j].index & 0xff0000) >> 16,
- data->powernow_table[j].frequency/1000);
+ printk(KERN_INFO PFX " %d : fid 0x%x did 0x%x (%d MHz)\n",
+ j,
+ (data->powernow_table[j].index & 0xff00) >> 8,
+ (data->powernow_table[j].index & 0xff0000) >> 16,
+ data->powernow_table[j].frequency/1000);
} else {
- printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x\n", j,
- data->powernow_table[j].index & 0xff,
- data->powernow_table[j].frequency/1000,
- data->powernow_table[j].index >> 8);
+ printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x\n",
+ j,
+ data->powernow_table[j].index & 0xff,
+ data->powernow_table[j].frequency/1000,
+ data->powernow_table[j].index >> 8);
}
}
}
@@ -1086,7 +1089,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
if (cpu_family == CPU_HW_PSTATE)
dprintk("targ: curr fid 0x%x, did 0x%x\n",
- data->currfid, data->currvid);
+ data->currfid, data->currdid);
else {
dprintk("targ: curr fid 0x%x, vid 0x%x\n",
data->currfid, data->currvid);
@@ -1322,16 +1325,22 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
static int __cpuinit powernowk8_init(void)
{
unsigned int i, supported_cpus = 0;
+ unsigned int booted_cores = 1;
for_each_online_cpu(i) {
if (check_supported_cpu(i))
supported_cpus++;
}
+#ifdef CONFIG_SMP
+ booted_cores = cpu_data[0].booted_cores;
+#endif
+
if (supported_cpus == num_online_cpus()) {
printk(KERN_INFO PFX "Found %d %s "
- "processors (" VERSION ")\n", supported_cpus,
- boot_cpu_data.x86_model_id);
+ "processors (%d cpu cores) (" VERSION ")\n",
+ supported_cpus/booted_cores,
+ boot_cpu_data.x86_model_id, supported_cpus);
return cpufreq_register_driver(&cpufreq_amd64_driver);
}
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
index 35489fd68852..6c5dc2c85aeb 100644
--- a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -21,12 +21,6 @@
#include <linux/delay.h>
#include <linux/compiler.h>
-#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI
-#include <linux/acpi.h>
-#include <linux/dmi.h>
-#include <acpi/processor.h>
-#endif
-
#include <asm/msr.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
@@ -257,9 +251,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
/* Matched a non-match */
dprintk("no table support for CPU model \"%s\"\n",
cpu->x86_model_id);
-#ifndef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI
- dprintk("try compiling with CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI enabled\n");
-#endif
+ dprintk("try using the acpi-cpufreq driver\n");
return -ENOENT;
}
@@ -346,213 +338,6 @@ static unsigned int get_cur_freq(unsigned int cpu)
}
-#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI
-
-static struct acpi_processor_performance *acpi_perf_data[NR_CPUS];
-
-/*
- * centrino_cpu_early_init_acpi - Do the preregistering with ACPI P-States
- * library
- *
- * Before doing the actual init, we need to do _PSD related setup whenever
- * supported by the BIOS. These are handled by this early_init routine.
- */
-static int centrino_cpu_early_init_acpi(void)
-{
- unsigned int i, j;
- struct acpi_processor_performance *data;
-
- for_each_possible_cpu(i) {
- data = kzalloc(sizeof(struct acpi_processor_performance),
- GFP_KERNEL);
- if (!data) {
- for_each_possible_cpu(j) {
- kfree(acpi_perf_data[j]);
- acpi_perf_data[j] = NULL;
- }
- return (-ENOMEM);
- }
- acpi_perf_data[i] = data;
- }
-
- acpi_processor_preregister_performance(acpi_perf_data);
- return 0;
-}
-
-
-#ifdef CONFIG_SMP
-/*
- * Some BIOSes do SW_ANY coordination internally, either set it up in hw
- * or do it in BIOS firmware and won't inform about it to OS. If not
- * detected, this has a side effect of making CPU run at a different speed
- * than OS intended it to run at. Detect it and handle it cleanly.
- */
-static int bios_with_sw_any_bug;
-static int sw_any_bug_found(struct dmi_system_id *d)
-{
- bios_with_sw_any_bug = 1;
- return 0;
-}
-
-static struct dmi_system_id sw_any_bug_dmi_table[] = {
- {
- .callback = sw_any_bug_found,
- .ident = "Supermicro Server X6DLP",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
- DMI_MATCH(DMI_BIOS_VERSION, "080010"),
- DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
- },
- },
- { }
-};
-#endif
-
-/*
- * centrino_cpu_init_acpi - register with ACPI P-States library
- *
- * Register with the ACPI P-States library (part of drivers/acpi/processor.c)
- * in order to determine correct frequency and voltage pairings by reading
- * the _PSS of the ACPI DSDT or SSDT tables.
- */
-static int centrino_cpu_init_acpi(struct cpufreq_policy *policy)
-{
- unsigned long cur_freq;
- int result = 0, i;
- unsigned int cpu = policy->cpu;
- struct acpi_processor_performance *p;
-
- p = acpi_perf_data[cpu];
-
- /* register with ACPI core */
- if (acpi_processor_register_performance(p, cpu)) {
- dprintk(PFX "obtaining ACPI data failed\n");
- return -EIO;
- }
-
- policy->shared_type = p->shared_type;
- /*
- * Will let policy->cpus know about dependency only when software
- * coordination is required.
- */
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
- policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
- policy->cpus = p->shared_cpu_map;
- }
-
-#ifdef CONFIG_SMP
- dmi_check_system(sw_any_bug_dmi_table);
- if (bios_with_sw_any_bug && cpus_weight(policy->cpus) == 1) {
- policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
- policy->cpus = cpu_core_map[cpu];
- }
-#endif
-
- /* verify the acpi_data */
- if (p->state_count <= 1) {
- dprintk("No P-States\n");
- result = -ENODEV;
- goto err_unreg;
- }
-
- if ((p->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
- (p->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
- dprintk("Invalid control/status registers (%x - %x)\n",
- p->control_register.space_id, p->status_register.space_id);
- result = -EIO;
- goto err_unreg;
- }
-
- for (i=0; i<p->state_count; i++) {
- if ((p->states[i].control & INTEL_MSR_RANGE) !=
- (p->states[i].status & INTEL_MSR_RANGE)) {
- dprintk("Different MSR bits in control (%llu) and status (%llu)\n",
- p->states[i].control, p->states[i].status);
- result = -EINVAL;
- goto err_unreg;
- }
-
- if (!p->states[i].core_frequency) {
- dprintk("Zero core frequency for state %u\n", i);
- result = -EINVAL;
- goto err_unreg;
- }
-
- if (p->states[i].core_frequency > p->states[0].core_frequency) {
- dprintk("P%u has larger frequency (%llu) than P0 (%llu), skipping\n", i,
- p->states[i].core_frequency, p->states[0].core_frequency);
- p->states[i].core_frequency = 0;
- continue;
- }
- }
-
- centrino_model[cpu] = kzalloc(sizeof(struct cpu_model), GFP_KERNEL);
- if (!centrino_model[cpu]) {
- result = -ENOMEM;
- goto err_unreg;
- }
-
- centrino_model[cpu]->model_name=NULL;
- centrino_model[cpu]->max_freq = p->states[0].core_frequency * 1000;
- centrino_model[cpu]->op_points = kmalloc(sizeof(struct cpufreq_frequency_table) *
- (p->state_count + 1), GFP_KERNEL);
- if (!centrino_model[cpu]->op_points) {
- result = -ENOMEM;
- goto err_kfree;
- }
-
- for (i=0; i<p->state_count; i++) {
- centrino_model[cpu]->op_points[i].index = p->states[i].control & INTEL_MSR_RANGE;
- centrino_model[cpu]->op_points[i].frequency = p->states[i].core_frequency * 1000;
- dprintk("adding state %i with frequency %u and control value %04x\n",
- i, centrino_model[cpu]->op_points[i].frequency, centrino_model[cpu]->op_points[i].index);
- }
- centrino_model[cpu]->op_points[p->state_count].frequency = CPUFREQ_TABLE_END;
-
- cur_freq = get_cur_freq(cpu);
-
- for (i=0; i<p->state_count; i++) {
- if (!p->states[i].core_frequency) {
- dprintk("skipping state %u\n", i);
- centrino_model[cpu]->op_points[i].frequency = CPUFREQ_ENTRY_INVALID;
- continue;
- }
-
- if (extract_clock(centrino_model[cpu]->op_points[i].index, cpu, 0) !=
- (centrino_model[cpu]->op_points[i].frequency)) {
- dprintk("Invalid encoded frequency (%u vs. %u)\n",
- extract_clock(centrino_model[cpu]->op_points[i].index, cpu, 0),
- centrino_model[cpu]->op_points[i].frequency);
- result = -EINVAL;
- goto err_kfree_all;
- }
-
- if (cur_freq == centrino_model[cpu]->op_points[i].frequency)
- p->state = i;
- }
-
- /* notify BIOS that we exist */
- acpi_processor_notify_smm(THIS_MODULE);
- printk("speedstep-centrino with X86_SPEEDSTEP_CENTRINO_ACPI "
- "config is deprecated.\n "
- "Use X86_ACPI_CPUFREQ (acpi-cpufreq) instead.\n" );
-
- return 0;
-
- err_kfree_all:
- kfree(centrino_model[cpu]->op_points);
- err_kfree:
- kfree(centrino_model[cpu]);
- err_unreg:
- acpi_processor_unregister_performance(p, cpu);
- dprintk(PFX "invalid ACPI data\n");
- return (result);
-}
-#else
-static inline int centrino_cpu_init_acpi(struct cpufreq_policy *policy) { return -ENODEV; }
-static inline int centrino_cpu_early_init_acpi(void) { return 0; }
-#endif
-
static int centrino_cpu_init(struct cpufreq_policy *policy)
{
struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu];
@@ -568,27 +353,25 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
- if (centrino_cpu_init_acpi(policy)) {
- if (policy->cpu != 0)
- return -ENODEV;
+ if (policy->cpu != 0)
+ return -ENODEV;
- for (i = 0; i < N_IDS; i++)
- if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
- break;
+ for (i = 0; i < N_IDS; i++)
+ if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
+ break;
- if (i != N_IDS)
- centrino_cpu[policy->cpu] = &cpu_ids[i];
+ if (i != N_IDS)
+ centrino_cpu[policy->cpu] = &cpu_ids[i];
- if (!centrino_cpu[policy->cpu]) {
- dprintk("found unsupported CPU with "
- "Enhanced SpeedStep: send /proc/cpuinfo to "
- MAINTAINER "\n");
- return -ENODEV;
- }
+ if (!centrino_cpu[policy->cpu]) {
+ dprintk("found unsupported CPU with "
+ "Enhanced SpeedStep: send /proc/cpuinfo to "
+ MAINTAINER "\n");
+ return -ENODEV;
+ }
- if (centrino_cpu_init_table(policy)) {
- return -ENODEV;
- }
+ if (centrino_cpu_init_table(policy)) {
+ return -ENODEV;
}
/* Check to see if Enhanced SpeedStep is enabled, and try to
@@ -634,20 +417,6 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
cpufreq_frequency_table_put_attr(cpu);
-#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI
- if (!centrino_model[cpu]->model_name) {
- static struct acpi_processor_performance *p;
-
- if (acpi_perf_data[cpu]) {
- p = acpi_perf_data[cpu];
- dprintk("unregistering and freeing ACPI data\n");
- acpi_processor_unregister_performance(p, cpu);
- kfree(centrino_model[cpu]->op_points);
- kfree(centrino_model[cpu]);
- }
- }
-#endif
-
centrino_model[cpu] = NULL;
return 0;
@@ -849,25 +618,12 @@ static int __init centrino_init(void)
if (!cpu_has(cpu, X86_FEATURE_EST))
return -ENODEV;
- centrino_cpu_early_init_acpi();
-
return cpufreq_register_driver(&centrino_driver);
}
static void __exit centrino_exit(void)
{
-#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI
- unsigned int j;
-#endif
-
cpufreq_unregister_driver(&centrino_driver);
-
-#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI
- for_each_possible_cpu(j) {
- kfree(acpi_perf_data[j]);
- acpi_perf_data[j] = NULL;
- }
-#endif
}
MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-ich.c b/arch/i386/kernel/cpu/cpufreq/speedstep-ich.c
index 698f980eb443..a5b2346faf1f 100644
--- a/arch/i386/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-ich.c
@@ -205,7 +205,6 @@ static unsigned int speedstep_detect_chipset (void)
* host brige. Abort on these systems.
*/
static struct pci_dev *hostbridge;
- u8 rev = 0;
hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
PCI_DEVICE_ID_INTEL_82815_MC,
@@ -216,8 +215,7 @@ static unsigned int speedstep_detect_chipset (void)
if (!hostbridge)
return 2; /* 2-M */
- pci_read_config_byte(hostbridge, PCI_REVISION_ID, &rev);
- if (rev < 5) {
+ if (hostbridge->revision < 5) {
dprintk("hostbridge does not support speedstep\n");
speedstep_chipset_dev = NULL;
pci_dev_put(hostbridge);
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index e88d2fba156b..122d2d75aa9f 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -4,7 +4,7 @@
#include <linux/pci.h>
#include <asm/dma.h>
#include <asm/io.h>
-#include <asm/processor.h>
+#include <asm/processor-cyrix.h>
#include <asm/timer.h>
#include <asm/pci-direct.h>
#include <asm/tsc.h>
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index e5be819492ef..d5a456d27d82 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -4,7 +4,7 @@
* Changes:
* Venkatesh Pallipadi : Adding cache identification through cpuid(4)
* Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
- * Andi Kleen : CPUID4 emulation on AMD.
+ * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
*/
#include <linux/init.h>
@@ -135,7 +135,7 @@ unsigned short num_cache_leaves;
/* AMD doesn't have CPUID4. Emulate it here to report the same
information to the user. This makes some assumptions about the machine:
- No L3, L2 not shared, no SMT etc. that is currently true on AMD CPUs.
+ L2 not shared, no SMT etc. that is currently true on AMD CPUs.
In theory the TLBs could be reported as fake type (they are in "dummy").
Maybe later */
@@ -159,13 +159,26 @@ union l2_cache {
unsigned val;
};
+union l3_cache {
+ struct {
+ unsigned line_size : 8;
+ unsigned lines_per_tag : 4;
+ unsigned assoc : 4;
+ unsigned res : 2;
+ unsigned size_encoded : 14;
+ };
+ unsigned val;
+};
+
static const unsigned short assocs[] = {
[1] = 1, [2] = 2, [4] = 4, [6] = 8,
- [8] = 16,
+ [8] = 16, [0xa] = 32, [0xb] = 48,
+ [0xc] = 64,
[0xf] = 0xffff // ??
- };
-static const unsigned char levels[] = { 1, 1, 2 };
-static const unsigned char types[] = { 1, 2, 3 };
+};
+
+static const unsigned char levels[] = { 1, 1, 2, 3 };
+static const unsigned char types[] = { 1, 2, 3, 3 };
static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
union _cpuid4_leaf_ebx *ebx,
@@ -175,37 +188,58 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
unsigned line_size, lines_per_tag, assoc, size_in_kb;
union l1_cache l1i, l1d;
union l2_cache l2;
+ union l3_cache l3;
+ union l1_cache *l1 = &l1d;
eax->full = 0;
ebx->full = 0;
ecx->full = 0;
cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
- cpuid(0x80000006, &dummy, &dummy, &l2.val, &dummy);
-
- if (leaf > 2 || !l1d.val || !l1i.val || !l2.val)
- return;
-
- eax->split.is_self_initializing = 1;
- eax->split.type = types[leaf];
- eax->split.level = levels[leaf];
- eax->split.num_threads_sharing = 0;
- eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
-
- if (leaf <= 1) {
- union l1_cache *l1 = leaf == 0 ? &l1d : &l1i;
+ cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val);
+
+ switch (leaf) {
+ case 1:
+ l1 = &l1i;
+ case 0:
+ if (!l1->val)
+ return;
assoc = l1->assoc;
line_size = l1->line_size;
lines_per_tag = l1->lines_per_tag;
size_in_kb = l1->size_in_kb;
- } else {
+ break;
+ case 2:
+ if (!l2.val)
+ return;
assoc = l2.assoc;
line_size = l2.line_size;
lines_per_tag = l2.lines_per_tag;
/* cpu_data has errata corrections for K7 applied */
size_in_kb = current_cpu_data.x86_cache_size;
+ break;
+ case 3:
+ if (!l3.val)
+ return;
+ assoc = l3.assoc;
+ line_size = l3.line_size;
+ lines_per_tag = l3.lines_per_tag;
+ size_in_kb = l3.size_encoded * 512;
+ break;
+ default:
+ return;
}
+ eax->split.is_self_initializing = 1;
+ eax->split.type = types[leaf];
+ eax->split.level = levels[leaf];
+ if (leaf == 3)
+ eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1;
+ else
+ eax->split.num_threads_sharing = 0;
+ eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
+
+
if (assoc == 0xf)
eax->split.is_fully_associative = 1;
ebx->split.coherency_line_size = line_size - 1;
@@ -239,8 +273,7 @@ static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_le
return 0;
}
-/* will only be called once; __init is safe here */
-static int __init find_num_cache_leaves(void)
+static int __cpuinit find_num_cache_leaves(void)
{
unsigned int eax, ebx, ecx, edx;
union _cpuid4_leaf_eax cache_eax;
@@ -710,7 +743,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
return retval;
}
-static void __cpuexit cache_remove_dev(struct sys_device * sys_dev)
+static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
{
unsigned int cpu = sys_dev->id;
unsigned long i;
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
index 56cd485b127c..34c781eddee4 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.c
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -60,6 +60,20 @@ void mcheck_init(struct cpuinfo_x86 *c)
}
}
+static unsigned long old_cr4 __initdata;
+
+void __init stop_mce(void)
+{
+ old_cr4 = read_cr4();
+ clear_in_cr4(X86_CR4_MCE);
+}
+
+void __init restart_mce(void)
+{
+ if (old_cr4 & X86_CR4_MCE)
+ set_in_cr4(X86_CR4_MCE);
+}
+
static int __init mcheck_disable(char *str)
{
mce_disabled = 1;
diff --git a/arch/i386/kernel/cpu/mcheck/non-fatal.c b/arch/i386/kernel/cpu/mcheck/non-fatal.c
index 6b5d3518a1c0..bf39409b3838 100644
--- a/arch/i386/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/i386/kernel/cpu/mcheck/non-fatal.c
@@ -57,7 +57,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
static void mce_work_fn(struct work_struct *work)
{
on_each_cpu(mce_checkregs, NULL, 1, 1);
- schedule_delayed_work(&mce_work, MCE_RATE);
+ schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
}
static int __init init_nonfatal_mce_checker(void)
@@ -82,7 +82,7 @@ static int __init init_nonfatal_mce_checker(void)
/*
* Check for non-fatal errors every MCE_RATE s
*/
- schedule_delayed_work(&mce_work, MCE_RATE);
+ schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
printk(KERN_INFO "Machine check exception polling timer started.\n");
return 0;
}
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
index 7ba7c3abd3a4..1203dc5ab87a 100644
--- a/arch/i386/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -134,19 +134,21 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
int err;
sys_dev = get_cpu_sysdev(cpu);
- mutex_lock(&therm_cpu_lock);
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
+ mutex_lock(&therm_cpu_lock);
err = thermal_throttle_add_dev(sys_dev);
+ mutex_unlock(&therm_cpu_lock);
WARN_ON(err);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
+ mutex_lock(&therm_cpu_lock);
thermal_throttle_remove_dev(sys_dev);
+ mutex_unlock(&therm_cpu_lock);
break;
}
- mutex_unlock(&therm_cpu_lock);
return NOTIFY_OK;
}
diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c
index 1001f1e0fe6d..2287d4863a8a 100644
--- a/arch/i386/kernel/cpu/mtrr/cyrix.c
+++ b/arch/i386/kernel/cpu/mtrr/cyrix.c
@@ -3,6 +3,7 @@
#include <asm/mtrr.h>
#include <asm/msr.h>
#include <asm/io.h>
+#include <asm/processor-cyrix.h>
#include "mtrr.h"
int arr3_protected;
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index f6e46943e6ef..56f64e34829f 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -79,7 +79,7 @@ static void print_fixed(unsigned base, unsigned step, const mtrr_type*types)
}
/* Grab all of the MTRR state for this CPU into *state */
-void get_mtrr_state(void)
+void __init get_mtrr_state(void)
{
unsigned int i;
struct mtrr_var_range *vrs;
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index 75dc6d5214bc..c48b6fea5ab4 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -643,7 +643,7 @@ static struct sysdev_driver mtrr_sysdev_driver = {
* initialized (i.e. before smp_init()).
*
*/
-__init void mtrr_bp_init(void)
+void __init mtrr_bp_init(void)
{
init_ifs();
diff --git a/arch/i386/kernel/cpu/mtrr/state.c b/arch/i386/kernel/cpu/mtrr/state.c
index 7b39a2f954d9..c9014ca4a575 100644
--- a/arch/i386/kernel/cpu/mtrr/state.c
+++ b/arch/i386/kernel/cpu/mtrr/state.c
@@ -3,6 +3,7 @@
#include <asm/io.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
+#include <asm-i386/processor-cyrix.h>
#include "mtrr.h"
diff --git a/arch/i386/kernel/cpu/perfctr-watchdog.c b/arch/i386/kernel/cpu/perfctr-watchdog.c
index 4d26d514c56f..4be488e73bee 100644
--- a/arch/i386/kernel/cpu/perfctr-watchdog.c
+++ b/arch/i386/kernel/cpu/perfctr-watchdog.c
@@ -325,7 +325,7 @@ static struct wd_ops k7_wd_ops = {
.stop = single_msr_stop_watchdog,
.perfctr = MSR_K7_PERFCTR0,
.evntsel = MSR_K7_EVNTSEL0,
- .checkbit = 1ULL<<63,
+ .checkbit = 1ULL<<47,
};
/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */
@@ -346,7 +346,9 @@ static int setup_p6_watchdog(unsigned nmi_hz)
perfctr_msr = MSR_P6_PERFCTR0;
evntsel_msr = MSR_P6_EVNTSEL0;
- wrmsrl(perfctr_msr, 0UL);
+ /* KVM doesn't implement this MSR */
+ if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
+ return 0;
evntsel = P6_EVNTSEL_INT
| P6_EVNTSEL_OS
@@ -599,8 +601,8 @@ static struct wd_ops intel_arch_wd_ops = {
.setup = setup_intel_arch_watchdog,
.rearm = p6_rearm,
.stop = single_msr_stop_watchdog,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
- .evntsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
+ .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
};
static void probe_nmi_watchdog(void)
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 89d91e6cc972..1e31b6caffb1 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -29,7 +29,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
- NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", "3dnowext", "3dnow",
+ NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
+ "3dnowext", "3dnow",
/* Transmeta-defined */
"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
@@ -40,8 +41,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
/* Other (Linux-defined) */
"cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
NULL, NULL, NULL, NULL,
- "constant_tsc", "up", NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "constant_tsc", "up", NULL, "arch_perfmon",
+ "pebs", "bts", NULL, "sync_rdtsc",
+ "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* Intel-defined (#2) */
@@ -57,9 +59,16 @@ static int show_cpuinfo(struct seq_file *m, void *v)
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* AMD-defined (#2) */
- "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8legacy", "abm",
- "sse4a", "misalignsse",
- "3dnowprefetch", "osvw", "ibs", NULL, NULL, NULL, NULL, NULL,
+ "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy",
+ "altmovcr8", "abm", "sse4a",
+ "misalignsse", "3dnowprefetch",
+ "osvw", "ibs", NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+ /* Auxiliary (Linux-defined) */
+ "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
};
diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c
deleted file mode 100644
index 50076f22e90f..000000000000
--- a/arch/i386/kernel/cpu/rise.c
+++ /dev/null
@@ -1,52 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/bitops.h>
-#include <asm/processor.h>
-
-#include "cpu.h"
-
-static void __cpuinit init_rise(struct cpuinfo_x86 *c)
-{
- printk("CPU: Rise iDragon");
- if (c->x86_model > 2)
- printk(" II");
- printk("\n");
-
- /* Unhide possibly hidden capability flags
- The mp6 iDragon family don't have MSRs.
- We switch on extra features with this cpuid weirdness: */
- __asm__ (
- "movl $0x6363452a, %%eax\n\t"
- "movl $0x3231206c, %%ecx\n\t"
- "movl $0x2a32313a, %%edx\n\t"
- "cpuid\n\t"
- "movl $0x63634523, %%eax\n\t"
- "movl $0x32315f6c, %%ecx\n\t"
- "movl $0x2333313a, %%edx\n\t"
- "cpuid\n\t" : : : "eax", "ebx", "ecx", "edx"
- );
- set_bit(X86_FEATURE_CX8, c->x86_capability);
-}
-
-static struct cpu_dev rise_cpu_dev __cpuinitdata = {
- .c_vendor = "Rise",
- .c_ident = { "RiseRiseRise" },
- .c_models = {
- { .vendor = X86_VENDOR_RISE, .family = 5, .model_names =
- {
- [0] = "iDragon",
- [2] = "iDragon",
- [8] = "iDragon II",
- [9] = "iDragon II"
- }
- },
- },
- .c_init = init_rise,
-};
-
-int __init rise_init_cpu(void)
-{
- cpu_devs[X86_VENDOR_RISE] = &rise_cpu_dev;
- return 0;
-}
-
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index 9645bb51f76a..e60cddbc4cfb 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -10,6 +10,7 @@
#include <linux/efi.h>
#include <linux/pfn.h>
#include <linux/uaccess.h>
+#include <linux/suspend.h>
#include <asm/pgtable.h>
#include <asm/page.h>
@@ -320,6 +321,37 @@ static int __init request_standard_resources(void)
subsys_initcall(request_standard_resources);
+#if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND)
+/**
+ * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
+ * correspond to e820 RAM areas and mark the corresponding pages as nosave for
+ * hibernation.
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(void)
+{
+ int i;
+ unsigned long pfn;
+
+ pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
+ for (i = 1; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (pfn < PFN_UP(ei->addr))
+ register_nosave_region(pfn, PFN_UP(ei->addr));
+
+ pfn = PFN_DOWN(ei->addr + ei->size);
+ if (ei->type != E820_RAM)
+ register_nosave_region(PFN_UP(ei->addr), pfn);
+
+ if (pfn >= max_low_pfn)
+ break;
+ }
+}
+#endif
+
void __init add_memory_region(unsigned long long start,
unsigned long long size, int type)
{
@@ -734,7 +766,7 @@ void __init print_memory_map(char *who)
case E820_NVS:
printk("(ACPI NVS)\n");
break;
- default: printk("type %lu\n", e820.map[i].type);
+ default: printk("type %u\n", e820.map[i].type);
break;
}
}
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index a1808022ea19..2452c6fbe992 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -278,7 +278,7 @@ void efi_memmap_walk(efi_freemem_callback_t callback, void *arg)
struct range {
unsigned long start;
unsigned long end;
- } prev, curr;
+ } uninitialized_var(prev), curr;
efi_memory_desc_t *md;
unsigned long start, end;
void *p;
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 3c3c220488c9..a714d6b43506 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -409,8 +409,6 @@ restore_nocheck_notrace:
1: INTERRUPT_RETURN
.section .fixup,"ax"
iret_exc:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
pushl $0 # no error code
pushl $do_iret_error
jmp error_code
@@ -1023,6 +1021,91 @@ ENTRY(kernel_thread_helper)
CFI_ENDPROC
ENDPROC(kernel_thread_helper)
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+ CFI_STARTPROC
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+
+ /* Check to see if we got the event in the critical
+ region in xen_iret_direct, after we've reenabled
+ events and checked for pending events. This simulates
+ iret instruction's behaviour where it delivers a
+ pending interrupt when enabling interrupts. */
+ movl PT_EIP(%esp),%eax
+ cmpl $xen_iret_start_crit,%eax
+ jb 1f
+ cmpl $xen_iret_end_crit,%eax
+ jae 1f
+
+ call xen_iret_crit_fixup
+
+1: mov %esp, %eax
+ call xen_evtchn_do_upcall
+ jmp ret_from_intr
+ CFI_ENDPROC
+ENDPROC(xen_hypervisor_callback)
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+# 1. Fault while reloading DS, ES, FS or GS
+# 2. Fault while executing IRET
+# Category 1 we fix up by reattempting the load, and zeroing the segment
+# register if the load fails.
+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by maintaining a status value in EAX.
+ENTRY(xen_failsafe_callback)
+ CFI_STARTPROC
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl $1,%eax
+1: mov 4(%esp),%ds
+2: mov 8(%esp),%es
+3: mov 12(%esp),%fs
+4: mov 16(%esp),%gs
+ testl %eax,%eax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ lea 16(%esp),%esp
+ CFI_ADJUST_CFA_OFFSET -16
+ jz 5f
+ addl $16,%esp
+ jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
+5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ jmp ret_from_exception
+ CFI_ENDPROC
+
+.section .fixup,"ax"
+6: xorl %eax,%eax
+ movl %eax,4(%esp)
+ jmp 1b
+7: xorl %eax,%eax
+ movl %eax,8(%esp)
+ jmp 2b
+8: xorl %eax,%eax
+ movl %eax,12(%esp)
+ jmp 3b
+9: xorl %eax,%eax
+ movl %eax,16(%esp)
+ jmp 4b
+.previous
+.section __ex_table,"a"
+ .align 4
+ .long 1b,6b
+ .long 2b,7b
+ .long 3b,8b
+ .long 4b,9b
+.previous
+ENDPROC(xen_failsafe_callback)
+
+#endif /* CONFIG_XEN */
+
.section .rodata,"a"
#include "syscall_table.S"
diff --git a/arch/i386/kernel/geode.c b/arch/i386/kernel/geode.c
new file mode 100644
index 000000000000..41e8aec4c61d
--- /dev/null
+++ b/arch/i386/kernel/geode.c
@@ -0,0 +1,155 @@
+/*
+ * AMD Geode southbridge support code
+ * Copyright (C) 2006, Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ioport.h>
+#include <linux/io.h>
+#include <asm/msr.h>
+#include <asm/geode.h>
+
+static struct {
+ char *name;
+ u32 msr;
+ int size;
+ u32 base;
+} lbars[] = {
+ { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
+ { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
+ { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
+ { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
+};
+
+static void __init init_lbars(void)
+{
+ u32 lo, hi;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lbars); i++) {
+ rdmsr(lbars[i].msr, lo, hi);
+ if (hi & 0x01)
+ lbars[i].base = lo & 0x0000ffff;
+
+ if (lbars[i].base == 0)
+ printk(KERN_ERR "geode: Couldn't initialize '%s'\n",
+ lbars[i].name);
+ }
+}
+
+int geode_get_dev_base(unsigned int dev)
+{
+ BUG_ON(dev >= ARRAY_SIZE(lbars));
+ return lbars[dev].base;
+}
+EXPORT_SYMBOL_GPL(geode_get_dev_base);
+
+/* === GPIO API === */
+
+void geode_gpio_set(unsigned int gpio, unsigned int reg)
+{
+ u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
+
+ if (!base)
+ return;
+
+ if (gpio < 16)
+ outl(1 << gpio, base + reg);
+ else
+ outl(1 << (gpio - 16), base + 0x80 + reg);
+}
+EXPORT_SYMBOL_GPL(geode_gpio_set);
+
+void geode_gpio_clear(unsigned int gpio, unsigned int reg)
+{
+ u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
+
+ if (!base)
+ return;
+
+ if (gpio < 16)
+ outl(1 << (gpio + 16), base + reg);
+ else
+ outl(1 << gpio, base + 0x80 + reg);
+}
+EXPORT_SYMBOL_GPL(geode_gpio_clear);
+
+int geode_gpio_isset(unsigned int gpio, unsigned int reg)
+{
+ u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
+
+ if (!base)
+ return 0;
+
+ if (gpio < 16)
+ return (inl(base + reg) & (1 << gpio)) ? 1 : 0;
+ else
+ return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0;
+}
+EXPORT_SYMBOL_GPL(geode_gpio_isset);
+
+void geode_gpio_set_irq(unsigned int group, unsigned int irq)
+{
+ u32 lo, hi;
+
+ if (group > 7 || irq > 15)
+ return;
+
+ rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
+
+ lo &= ~(0xF << (group * 4));
+ lo |= (irq & 0xF) << (group * 4);
+
+ wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
+}
+EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
+
+void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
+{
+ u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
+ u32 offset, shift, val;
+
+ if (gpio >= 24)
+ offset = GPIO_MAP_W;
+ else if (gpio >= 16)
+ offset = GPIO_MAP_Z;
+ else if (gpio >= 8)
+ offset = GPIO_MAP_Y;
+ else
+ offset = GPIO_MAP_X;
+
+ shift = (gpio % 8) * 4;
+
+ val = inl(base + offset);
+
+ /* Clear whatever was there before */
+ val &= ~(0xF << shift);
+
+ /* And set the new value */
+
+ val |= ((pair & 7) << shift);
+
+ /* Set the PME bit if this is a PME event */
+
+ if (pme)
+ val |= (1 << (shift + 3));
+
+ outl(val, base + offset);
+}
+EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
+
+static int __init geode_southbridge_init(void)
+{
+ if (!is_geode())
+ return -ENODEV;
+
+ init_lbars();
+ return 0;
+}
+
+postcore_initcall(geode_southbridge_init);
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index f74dfc419b56..7c52b222207e 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -168,6 +168,12 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
.section .init.text,"ax",@progbits
#endif
+ /* Do an early initialization of the fixmap area */
+ movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
+ movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax
+ addl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
+ movl %eax, 4092(%edx)
+
#ifdef CONFIG_SMP
ENTRY(startup_32_smp)
cld
@@ -504,9 +510,12 @@ ENTRY(_stext)
/*
* BSS section
*/
-.section ".bss.page_aligned","w"
+.section ".bss.page_aligned","wa"
+ .align PAGE_SIZE_asm
ENTRY(swapper_pg_dir)
.fill 1024,4,0
+ENTRY(swapper_pg_pmd)
+ .fill 1024,4,0
ENTRY(empty_zero_page)
.fill 4096,1,0
@@ -530,6 +539,8 @@ fault_msg:
.ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
.asciz "Stack: %p %p %p %p %p %p %p %p\n"
+#include "../xen/xen-head.S"
+
/*
* The IDT and GDT 'descriptors' are a strange 48-bit object
* only used by the lidt and lgdt instructions. They are not
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 17d73459fc5f..533d4932bc79 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -5,6 +5,7 @@
#include <linux/init.h>
#include <linux/sysdev.h>
#include <linux/pm.h>
+#include <linux/delay.h>
#include <asm/hpet.h>
#include <asm/io.h>
@@ -187,6 +188,10 @@ static void hpet_set_mode(enum clock_event_mode mode,
cfg &= ~HPET_TN_ENABLE;
hpet_writel(cfg, HPET_T0_CFG);
break;
+
+ case CLOCK_EVT_MODE_RESUME:
+ hpet_enable_int();
+ break;
}
}
@@ -217,6 +222,7 @@ static struct clocksource clocksource_hpet = {
.mask = HPET_MASK,
.shift = HPET_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .resume = hpet_start_counter,
};
/*
@@ -226,7 +232,8 @@ int __init hpet_enable(void)
{
unsigned long id;
uint64_t hpet_freq;
- u64 tmp;
+ u64 tmp, start, now;
+ cycle_t t1;
if (!is_hpet_capable())
return 0;
@@ -273,6 +280,27 @@ int __init hpet_enable(void)
/* Start the counter */
hpet_start_counter();
+ /* Verify whether hpet counter works */
+ t1 = read_hpet();
+ rdtscll(start);
+
+ /*
+ * We don't know the TSC frequency yet, but waiting for
+ * 200000 TSC cycles is safe:
+ * 4 GHz == 50us
+ * 1 GHz == 200us
+ */
+ do {
+ rep_nop();
+ rdtscll(now);
+ } while ((now - start) < 200000UL);
+
+ if (t1 == read_hpet()) {
+ printk(KERN_WARNING
+ "HPET counter not counting. HPET disabled\n");
+ goto out_nohpet;
+ }
+
/* Initialize and register HPET clocksource
*
* hpet period is in femto seconds per cycle
@@ -291,7 +319,6 @@ int __init hpet_enable(void)
clocksource_register(&clocksource_hpet);
-
if (id & HPET_ID_LEGSUP) {
hpet_enable_int();
hpet_reserve_platform_timers(id);
@@ -299,7 +326,7 @@ int __init hpet_enable(void)
* Start hpet with the boot cpu mask and make it
* global after the IO_APIC has been initialized.
*/
- hpet_clockevent.cpumask =cpumask_of_cpu(0);
+ hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
clockevents_register_device(&hpet_clockevent);
global_clock_event = &hpet_clockevent;
return 1;
@@ -524,68 +551,3 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
#endif
-
-
-/*
- * Suspend/resume part
- */
-
-#ifdef CONFIG_PM
-
-static int hpet_suspend(struct sys_device *sys_device, pm_message_t state)
-{
- unsigned long cfg = hpet_readl(HPET_CFG);
-
- cfg &= ~(HPET_CFG_ENABLE|HPET_CFG_LEGACY);
- hpet_writel(cfg, HPET_CFG);
-
- return 0;
-}
-
-static int hpet_resume(struct sys_device *sys_device)
-{
- unsigned int id;
-
- hpet_start_counter();
-
- id = hpet_readl(HPET_ID);
-
- if (id & HPET_ID_LEGSUP)
- hpet_enable_int();
-
- return 0;
-}
-
-static struct sysdev_class hpet_class = {
- set_kset_name("hpet"),
- .suspend = hpet_suspend,
- .resume = hpet_resume,
-};
-
-static struct sys_device hpet_device = {
- .id = 0,
- .cls = &hpet_class,
-};
-
-
-static __init int hpet_register_sysfs(void)
-{
- int err;
-
- if (!is_hpet_capable())
- return 0;
-
- err = sysdev_class_register(&hpet_class);
-
- if (!err) {
- err = sysdev_register(&hpet_device);
- if (err)
- sysdev_class_unregister(&hpet_class);
- }
-
- return err;
-}
-
-device_initcall(hpet_register_sysfs);
-
-#endif
diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c
index f8a3c4054c70..6d839f2f1b1a 100644
--- a/arch/i386/kernel/i8253.c
+++ b/arch/i386/kernel/i8253.c
@@ -3,18 +3,17 @@
*
*/
#include <linux/clockchips.h>
-#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
#include <linux/jiffies.h>
-#include <linux/sysdev.h>
#include <linux/module.h>
-#include <linux/init.h>
+#include <linux/spinlock.h>
#include <asm/smp.h>
#include <asm/delay.h>
#include <asm/i8253.h>
#include <asm/io.h>
-
-#include "io_ports.h"
+#include <asm/timer.h>
DEFINE_SPINLOCK(i8253_lock);
EXPORT_SYMBOL(i8253_lock);
@@ -41,26 +40,27 @@ static void init_pit_timer(enum clock_event_mode mode,
case CLOCK_EVT_MODE_PERIODIC:
/* binary, mode 2, LSB/MSB, ch 0 */
outb_p(0x34, PIT_MODE);
- udelay(10);
outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
- udelay(10);
outb(LATCH >> 8 , PIT_CH0); /* MSB */
break;
- /*
- * Avoid unnecessary state transitions, as it confuses
- * Geode / Cyrix based boxen.
- */
case CLOCK_EVT_MODE_SHUTDOWN:
- if (evt->mode == CLOCK_EVT_MODE_UNUSED)
- break;
case CLOCK_EVT_MODE_UNUSED:
- if (evt->mode == CLOCK_EVT_MODE_SHUTDOWN)
- break;
+ if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
+ evt->mode == CLOCK_EVT_MODE_ONESHOT) {
+ outb_p(0x30, PIT_MODE);
+ outb_p(0, PIT_CH0);
+ outb_p(0, PIT_CH0);
+ }
+ break;
+
case CLOCK_EVT_MODE_ONESHOT:
/* One shot setup */
outb_p(0x38, PIT_MODE);
- udelay(10);
+ break;
+
+ case CLOCK_EVT_MODE_RESUME:
+ /* Nothing to do here */
break;
}
spin_unlock_irqrestore(&i8253_lock, flags);
diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c
index cff95d10a4d8..d26fc063a760 100644
--- a/arch/i386/kernel/init_task.c
+++ b/arch/i386/kernel/init_task.c
@@ -42,5 +42,5 @@ EXPORT_SYMBOL(init_task);
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
* no more per-task TSS's.
*/
-DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 7f8b7af2b95f..893df8280756 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -353,14 +353,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
# include <linux/slab.h> /* kmalloc() */
# include <linux/timer.h> /* time_after() */
-#ifdef CONFIG_BALANCED_IRQ_DEBUG
-# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
-# define Dprintk(x...) do { TDprintk(x); } while (0)
-# else
-# define TDprintk(x...)
-# define Dprintk(x...)
-# endif
-
#define IRQBALANCE_CHECK_ARCH -999
#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
@@ -443,7 +435,7 @@ static inline void balance_irq(int cpu, int irq)
static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
{
int i, j;
- Dprintk("Rotating IRQs among CPUs.\n");
+
for_each_online_cpu(i) {
for (j = 0; j < NR_IRQS; j++) {
if (!irq_desc[j].action)
@@ -560,19 +552,11 @@ tryanothercpu:
max_loaded = tmp_loaded; /* processor */
imbalance = (max_cpu_irq - min_cpu_irq) / 2;
- Dprintk("max_loaded cpu = %d\n", max_loaded);
- Dprintk("min_loaded cpu = %d\n", min_loaded);
- Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
- Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
- Dprintk("load imbalance = %lu\n", imbalance);
-
/* if imbalance is less than approx 10% of max load, then
* observe diminishing returns action. - quit
*/
- if (imbalance < (max_cpu_irq >> 3)) {
- Dprintk("Imbalance too trivial\n");
+ if (imbalance < (max_cpu_irq >> 3))
goto not_worth_the_effort;
- }
tryanotherirq:
/* if we select an IRQ to move that can't go where we want, then
@@ -629,9 +613,6 @@ tryanotherirq:
cpus_and(tmp, target_cpu_mask, allowed_mask);
if (!cpus_empty(tmp)) {
-
- Dprintk("irq = %d moved to cpu = %d\n",
- selected_irq, min_loaded);
/* mark for change destination */
set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
@@ -651,7 +632,6 @@ not_worth_the_effort:
*/
balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
- Dprintk("IRQ worth rotating not found\n");
return;
}
@@ -667,6 +647,7 @@ static int balanced_irq(void *unused)
set_pending_irq(i, cpumask_of_cpu(0));
}
+ set_freezable();
for ( ; ; ) {
time_remaining = schedule_timeout_interruptible(time_remaining);
try_to_freeze();
@@ -1901,7 +1882,7 @@ __setup("no_timer_check", notimercheck);
* - if this function detects that timer IRQs are defunct, then we fall
* back to ISA timer IRQs
*/
-int __init timer_irq_works(void)
+static int __init timer_irq_works(void)
{
unsigned long t1 = jiffies;
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index d2daf672f4a2..dd2b97fc00b2 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -21,7 +21,7 @@
#include <asm/apic.h>
#include <asm/uaccess.h>
-DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
EXPORT_PER_CPU_SYMBOL(irq_stat);
DEFINE_PER_CPU(struct pt_regs *, irq_regs);
@@ -149,15 +149,11 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
#ifdef CONFIG_4KSTACKS
-/*
- * These should really be __section__(".bss.page_aligned") as well, but
- * gcc's 3.0 and earlier don't handle that correctly.
- */
static char softirq_stack[NR_CPUS * THREAD_SIZE]
- __attribute__((__aligned__(THREAD_SIZE)));
+ __attribute__((__section__(".bss.page_aligned")));
static char hardirq_stack[NR_CPUS * THREAD_SIZE]
- __attribute__((__aligned__(THREAD_SIZE)));
+ __attribute__((__section__(".bss.page_aligned")));
/*
* allocate per-cpu stacks for hardirq and for softirq processing
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index dde828a333c3..448a50b1324c 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -35,6 +35,7 @@
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <asm/uaccess.h>
+#include <asm/alternative.h>
void jprobe_return_end(void);
@@ -169,16 +170,12 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
void __kprobes arch_arm_kprobe(struct kprobe *p)
{
- *p->addr = BREAKPOINT_INSTRUCTION;
- flush_icache_range((unsigned long) p->addr,
- (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+ text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
}
void __kprobes arch_disarm_kprobe(struct kprobe *p)
{
- *p->addr = p->opcode;
- flush_icache_range((unsigned long) p->addr,
- (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+ text_poke(p->addr, &p->opcode, 1);
}
void __kprobes arch_remove_kprobe(struct kprobe *p)
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index fba121f7973f..99beac7f96ce 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -295,7 +295,7 @@ static unsigned int
last_irq_sums [NR_CPUS],
alert_counter [NR_CPUS];
-void touch_nmi_watchdog (void)
+void touch_nmi_watchdog(void)
{
if (nmi_watchdog > 0) {
unsigned cpu;
@@ -304,8 +304,10 @@ void touch_nmi_watchdog (void)
* Just reset the alert counters, (other CPUs might be
* spinning on locks we hold):
*/
- for_each_present_cpu (cpu)
- alert_counter[cpu] = 0;
+ for_each_present_cpu(cpu) {
+ if (alert_counter[cpu])
+ alert_counter[cpu] = 0;
+ }
}
/*
@@ -351,7 +353,7 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
* Take the local apic timer and PIT/HPET into account. We don't
* know which one is active, when we have highres/dyntick on
*/
- sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0);
+ sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_cpu(cpu).irqs[0];
/* if the none of the timers isn't firing, this cpu isn't doing much */
if (!touched && last_irq_sums[cpu] == sum) {
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index faab09abca5e..ea962c0667d5 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -124,20 +124,28 @@ unsigned paravirt_patch_ignore(unsigned len)
return len;
}
+struct branch {
+ unsigned char opcode;
+ u32 delta;
+} __attribute__((packed));
+
unsigned paravirt_patch_call(void *target, u16 tgt_clobbers,
void *site, u16 site_clobbers,
unsigned len)
{
unsigned char *call = site;
unsigned long delta = (unsigned long)target - (unsigned long)(call+5);
+ struct branch b;
if (tgt_clobbers & ~site_clobbers)
return len; /* target would clobber too much for this site */
if (len < 5)
return len; /* call too long for patch site */
- *call++ = 0xe8; /* call */
- *(unsigned long *)call = delta;
+ b.opcode = 0xe8; /* call */
+ b.delta = delta;
+ BUILD_BUG_ON(sizeof(b) != 5);
+ text_poke(call, (unsigned char *)&b, 5);
return 5;
}
@@ -146,12 +154,14 @@ unsigned paravirt_patch_jmp(void *target, void *site, unsigned len)
{
unsigned char *jmp = site;
unsigned long delta = (unsigned long)target - (unsigned long)(jmp+5);
+ struct branch b;
if (len < 5)
return len; /* call too long for patch site */
- *jmp++ = 0xe9; /* jmp */
- *(unsigned long *)jmp = delta;
+ b.opcode = 0xe9; /* jmp */
+ b.delta = delta;
+ text_poke(jmp, (unsigned char *)&b, 5);
return 5;
}
@@ -228,6 +238,41 @@ static int __init print_banner(void)
}
core_initcall(print_banner);
+static struct resource reserve_ioports = {
+ .start = 0,
+ .end = IO_SPACE_LIMIT,
+ .name = "paravirt-ioport",
+ .flags = IORESOURCE_IO | IORESOURCE_BUSY,
+};
+
+static struct resource reserve_iomem = {
+ .start = 0,
+ .end = -1,
+ .name = "paravirt-iomem",
+ .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
+/*
+ * Reserve the whole legacy IO space to prevent any legacy drivers
+ * from wasting time probing for their hardware. This is a fairly
+ * brute-force approach to disabling all non-virtual drivers.
+ *
+ * Note that this must be called very early to have any effect.
+ */
+int paravirt_disable_iospace(void)
+{
+ int ret;
+
+ ret = request_resource(&ioport_resource, &reserve_ioports);
+ if (ret == 0) {
+ ret = request_resource(&iomem_resource, &reserve_iomem);
+ if (ret)
+ release_resource(&reserve_ioports);
+ }
+
+ return ret;
+}
+
struct paravirt_ops paravirt_ops = {
.name = "bare hardware",
.paravirt_enabled = 0,
@@ -267,7 +312,7 @@ struct paravirt_ops paravirt_ops = {
.write_msr = native_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
- .get_scheduled_cycles = native_read_tsc,
+ .sched_clock = native_sched_clock,
.get_cpu_khz = native_calculate_cpu_khz,
.load_tr_desc = native_load_tr_desc,
.set_ldt = native_set_ldt,
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 06dfa65ad180..84664710b784 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -300,6 +300,7 @@ early_param("idle", idle_setup);
void show_regs(struct pt_regs * regs)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
+ unsigned long d0, d1, d2, d3, d6, d7;
printk("\n");
printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
@@ -324,6 +325,17 @@ void show_regs(struct pt_regs * regs)
cr3 = read_cr3();
cr4 = read_cr4_safe();
printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
+
+ get_debugreg(d0, 0);
+ get_debugreg(d1, 1);
+ get_debugreg(d2, 2);
+ get_debugreg(d3, 3);
+ printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+ d0, d1, d2, d3);
+ get_debugreg(d6, 6);
+ get_debugreg(d7, 7);
+ printk("DR6: %08lx DR7: %08lx\n", d6, d7);
+
show_trace(NULL, regs, &regs->esp);
}
@@ -538,8 +550,31 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
return 1;
}
-static noinline void __switch_to_xtra(struct task_struct *next_p,
- struct tss_struct *tss)
+#ifdef CONFIG_SECCOMP
+void hard_disable_TSC(void)
+{
+ write_cr4(read_cr4() | X86_CR4_TSD);
+}
+void disable_TSC(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_disable_TSC();
+ preempt_enable();
+}
+void hard_enable_TSC(void)
+{
+ write_cr4(read_cr4() & ~X86_CR4_TSD);
+}
+#endif /* CONFIG_SECCOMP */
+
+static noinline void
+__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+ struct tss_struct *tss)
{
struct thread_struct *next;
@@ -555,6 +590,17 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
set_debugreg(next->debugreg[7], 7);
}
+#ifdef CONFIG_SECCOMP
+ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+ test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+ /* prev and next are different */
+ if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+ hard_disable_TSC();
+ else
+ hard_enable_TSC();
+ }
+#endif
+
if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
/*
* Disable the bitmap via an invalid offset. We still cache
@@ -586,33 +632,6 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
}
/*
- * This function selects if the context switch from prev to next
- * has to tweak the TSC disable bit in the cr4.
- */
-static inline void disable_tsc(struct task_struct *prev_p,
- struct task_struct *next_p)
-{
- struct thread_info *prev, *next;
-
- /*
- * gcc should eliminate the ->thread_info dereference if
- * has_secure_computing returns 0 at compile time (SECCOMP=n).
- */
- prev = task_thread_info(prev_p);
- next = task_thread_info(next_p);
-
- if (has_secure_computing(prev) || has_secure_computing(next)) {
- /* slow path here */
- if (has_secure_computing(prev) &&
- !has_secure_computing(next)) {
- write_cr4(read_cr4() & ~X86_CR4_TSD);
- } else if (!has_secure_computing(prev) &&
- has_secure_computing(next))
- write_cr4(read_cr4() | X86_CR4_TSD);
- }
-}
-
-/*
* switch_to(x,yn) should switch tasks from x to y.
*
* We fsave/fwait so that an exception goes off at the right time
@@ -689,11 +708,9 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
/*
* Now maybe handle debug registers and/or IO bitmaps
*/
- if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)
- || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)))
- __switch_to_xtra(next_p, tss);
-
- disable_tsc(prev_p, next_p);
+ if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
+ task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
+ __switch_to_xtra(prev_p, next_p, tss);
/*
* Leave lazy mode, flushing any hypercalls made here.
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 0c0ceec5de00..0c8f00e69c4d 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -164,14 +164,22 @@ static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_
u32 *desc;
unsigned long base;
- down(&child->mm->context.sem);
- desc = child->mm->context.ldt + (seg & ~7);
- base = (desc[0] >> 16) | ((desc[1] & 0xff) << 16) | (desc[1] & 0xff000000);
+ seg &= ~7UL;
- /* 16-bit code segment? */
- if (!((desc[1] >> 22) & 1))
- addr &= 0xffff;
- addr += base;
+ down(&child->mm->context.sem);
+ if (unlikely((seg >> 3) >= child->mm->context.size))
+ addr = -1L; /* bogus selector, access would fault */
+ else {
+ desc = child->mm->context.ldt + seg;
+ base = ((desc[0] >> 16) |
+ ((desc[1] & 0xff) << 16) |
+ (desc[1] & 0xff000000));
+
+ /* 16-bit code segment? */
+ if (!((desc[1] >> 22) & 1))
+ addr &= 0xffff;
+ addr += base;
+ }
up(&child->mm->context.sem);
}
return addr;
@@ -358,17 +366,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
switch (request) {
/* when I and D space are separate, these will need to be fixed. */
case PTRACE_PEEKTEXT: /* read word at location addr. */
- case PTRACE_PEEKDATA: {
- unsigned long tmp;
- int copied;
-
- copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
- ret = -EIO;
- if (copied != sizeof(tmp))
- break;
- ret = put_user(tmp, datap);
+ case PTRACE_PEEKDATA:
+ ret = generic_ptrace_peekdata(child, addr, data);
break;
- }
/* read the word at location addr in the USER area. */
case PTRACE_PEEKUSR: {
@@ -395,10 +395,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
/* when I and D space are separate, this will have to be fixed. */
case PTRACE_POKETEXT: /* write the word at location addr. */
case PTRACE_POKEDATA:
- ret = 0;
- if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
- break;
- ret = -EIO;
+ ret = generic_ptrace_pokedata(child, addr, data);
break;
case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 5513f8d5b5be..0d796248866c 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -113,6 +113,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
},
},
+ { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 745",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+ DMI_MATCH(DMI_BOARD_NAME, "0WF810"),
+ },
+ },
{ /* Handle problems with rebooting on Dell 2400's */
.callback = set_bios_reboot,
.ident = "Dell PowerEdge 2400",
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 698c24fe482e..d474cd639bcb 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -102,19 +102,10 @@ static unsigned int highmem_pages = -1;
/*
* Setup options
*/
-struct drive_info_struct { char dummy[32]; } drive_info;
-#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \
- defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
-EXPORT_SYMBOL(drive_info);
-#endif
struct screen_info screen_info;
EXPORT_SYMBOL(screen_info);
struct apm_info apm_info;
EXPORT_SYMBOL(apm_info);
-struct sys_desc_table_struct {
- unsigned short length;
- unsigned char table[0];
-};
struct edid_info edid_info;
EXPORT_SYMBOL_GPL(edid_info);
struct ist_info ist_info;
@@ -134,7 +125,7 @@ unsigned long saved_videomode;
static char __initdata command_line[COMMAND_LINE_SIZE];
-unsigned char __initdata boot_params[PARAM_SIZE];
+struct boot_params __initdata boot_params;
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
@@ -282,18 +273,18 @@ unsigned long __init find_max_low_pfn(void)
printk(KERN_WARNING "Warning only %ldMB will be used.\n",
MAXMEM>>20);
if (max_pfn > MAX_NONPAE_PFN)
- printk(KERN_WARNING "Use a PAE enabled kernel.\n");
+ printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
else
printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */
-#ifndef CONFIG_X86_PAE
+#ifndef CONFIG_HIGHMEM64G
if (max_pfn > MAX_NONPAE_PFN) {
max_pfn = MAX_NONPAE_PFN;
printk(KERN_WARNING "Warning only 4GB will be used.\n");
- printk(KERN_WARNING "Use a PAE enabled kernel.\n");
+ printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
}
-#endif /* !CONFIG_X86_PAE */
+#endif /* !CONFIG_HIGHMEM64G */
#endif /* !CONFIG_HIGHMEM */
} else {
if (highmem_pages == -1)
@@ -475,7 +466,7 @@ void __init setup_bootmem_allocator(void)
*
* This should all compile down to nothing when NUMA is off.
*/
-void __init remapped_pgdat_init(void)
+static void __init remapped_pgdat_init(void)
{
int nid;
@@ -528,7 +519,6 @@ void __init setup_arch(char **cmdline_p)
#endif
ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
- drive_info = DRIVE_INFO;
screen_info = SCREEN_INFO;
edid_info = EDID_INFO;
apm_info.bios = APM_BIOS_INFO;
@@ -611,6 +601,8 @@ void __init setup_arch(char **cmdline_p)
* NOTE: at this point the bootmem allocator is fully available.
*/
+ paravirt_post_allocator_init();
+
dmi_scan_machine();
#ifdef CONFIG_X86_GENERICARCH
@@ -648,6 +640,7 @@ void __init setup_arch(char **cmdline_p)
#endif
e820_register_memory();
+ e820_mark_nosave_regions();
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index d574e38f0f77..f5dd85656c18 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -199,6 +199,13 @@ asmlinkage int sys_sigreturn(unsigned long __unused)
return eax;
badframe:
+ if (show_unhandled_signals && printk_ratelimit())
+ printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx"
+ " esp:%lx oeax:%lx\n",
+ current->pid > 1 ? KERN_INFO : KERN_EMERG,
+ current->comm, current->pid, frame, regs->eip,
+ regs->esp, regs->orig_eax);
+
force_sig(SIGSEGV, current);
return 0;
}
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 6299c080f6e2..2d35d8502029 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -22,6 +22,7 @@
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
#include <mach_apic.h>
/*
@@ -249,13 +250,13 @@ static unsigned long flush_va;
static DEFINE_SPINLOCK(tlbstate_lock);
/*
- * We cannot call mmdrop() because we are in interrupt context,
+ * We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask.
*
* We need to reload %cr3 since the page tables may be going
* away from under us..
*/
-static inline void leave_mm (unsigned long cpu)
+void leave_mm(unsigned long cpu)
{
if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
BUG();
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 0b2954534b8e..e4f61d1c6248 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -148,7 +148,7 @@ void __init smp_alloc_memory(void)
* a given CPU
*/
-static void __cpuinit smp_store_cpu_info(int id)
+void __cpuinit smp_store_cpu_info(int id)
{
struct cpuinfo_x86 *c = cpu_data + id;
@@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu)
/* representing cpus for which sibling maps can be computed */
static cpumask_t cpu_sibling_setup_map;
-static inline void
-set_cpu_sibling_map(int cpu)
+void __cpuinit set_cpu_sibling_map(int cpu)
{
int i;
struct cpuinfo_x86 *c = cpu_data;
@@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void)
}
#ifdef CONFIG_HOTPLUG_CPU
-static void
-remove_siblinginfo(int cpu)
+void remove_siblinginfo(int cpu)
{
int sibling;
struct cpuinfo_x86 *c = cpu_data;
diff --git a/arch/i386/kernel/smpcommon.c b/arch/i386/kernel/smpcommon.c
index 1868ae18eb4d..bbfe85a0f699 100644
--- a/arch/i386/kernel/smpcommon.c
+++ b/arch/i386/kernel/smpcommon.c
@@ -47,7 +47,7 @@ int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
EXPORT_SYMBOL(smp_call_function);
/**
- * smp_call_function_single - Run a function on another CPU
+ * smp_call_function_single - Run a function on a specific CPU
* @cpu: The target CPU. Cannot be the calling CPU.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
@@ -66,9 +66,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int ret;
int me = get_cpu();
if (cpu == me) {
- WARN_ON(1);
+ local_irq_disable();
+ func(info);
+ local_irq_enable();
put_cpu();
- return -EBUSY;
+ return 0;
}
ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index bf6adce52267..8344c70adf61 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -323,3 +323,4 @@ ENTRY(sys_call_table)
.long sys_signalfd
.long sys_timerfd
.long sys_eventfd
+ .long sys_fallocate
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index ff4ee6f3326b..6deb159d08e0 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -336,7 +336,9 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
int in_gate_area(struct task_struct *task, unsigned long addr)
{
- return 0;
+ const struct vm_area_struct *vma = get_gate_vma(task);
+
+ return vma && addr >= vma->vm_start && addr < vma->vm_end;
}
int in_gate_area_no_task(unsigned long addr)
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index a665df61f08c..19a6c678d02e 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -207,55 +207,9 @@ unsigned long read_persistent_clock(void)
return retval;
}
-static void sync_cmos_clock(unsigned long dummy);
-
-static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
-int no_sync_cmos_clock;
-
-static void sync_cmos_clock(unsigned long dummy)
-{
- struct timeval now, next;
- int fail = 1;
-
- /*
- * If we have an externally synchronized Linux clock, then update
- * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
- * called as close as possible to 500 ms before the new second starts.
- * This code is run on a timer. If the clock is set, that timer
- * may not expire at the correct time. Thus, we adjust...
- */
- if (!ntp_synced())
- /*
- * Not synced, exit, do not restart a timer (if one is
- * running, let it run out).
- */
- return;
-
- do_gettimeofday(&now);
- if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
- now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
- fail = set_rtc_mmss(now.tv_sec);
-
- next.tv_usec = USEC_AFTER - now.tv_usec;
- if (next.tv_usec <= 0)
- next.tv_usec += USEC_PER_SEC;
-
- if (!fail)
- next.tv_sec = 659;
- else
- next.tv_sec = 0;
-
- if (next.tv_usec >= USEC_PER_SEC) {
- next.tv_sec++;
- next.tv_usec -= USEC_PER_SEC;
- }
- mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
-}
-
-void notify_arch_cmos_timer(void)
+int update_persistent_clock(struct timespec now)
{
- if (!no_sync_cmos_clock)
- mod_timer(&sync_cmos_timer, jiffies + 1);
+ return set_rtc_mmss(now.tv_sec);
}
extern void (*late_time_init)(void);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 90da0575fcff..cfffe3dd9e83 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -41,6 +41,10 @@
#include <linux/mca.h>
#endif
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/io.h>
@@ -148,7 +152,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
if (!stack) {
unsigned long dummy;
stack = &dummy;
- if (task && task != current)
+ if (task != current)
stack = (unsigned long *)task->thread.esp;
}
@@ -207,6 +211,7 @@ static void print_trace_address(void *data, unsigned long addr)
{
printk("%s [<%08lx>] ", (char *)data, addr);
print_symbol("%s\n", addr);
+ touch_nmi_watchdog();
}
static struct stacktrace_ops print_trace_ops = {
@@ -390,7 +395,7 @@ void die(const char * str, struct pt_regs * regs, long err)
unsigned long esp;
unsigned short ss;
- report_bug(regs->eip);
+ report_bug(regs->eip, regs);
printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
#ifdef CONFIG_PREEMPT
@@ -433,6 +438,7 @@ void die(const char * str, struct pt_regs * regs, long err)
bust_spinlocks(0);
die.lock_owner = -1;
+ add_taint(TAINT_DIE);
spin_unlock_irqrestore(&die.lock, flags);
if (!regs)
@@ -517,10 +523,12 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
}
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
fastcall void do_##name(struct pt_regs * regs, long error_code) \
{ \
siginfo_t info; \
+ if (irq) \
+ local_irq_enable(); \
info.si_signo = signr; \
info.si_errno = 0; \
info.si_code = sicode; \
@@ -560,13 +568,13 @@ DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
#endif
DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip)
+DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
-DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
-DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
+DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
fastcall void __kprobes do_general_protection(struct pt_regs * regs,
long error_code)
@@ -610,6 +618,13 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
current->thread.error_code = error_code;
current->thread.trap_no = 13;
+ if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
+ printk_ratelimit())
+ printk(KERN_INFO
+ "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
+ current->comm, current->pid,
+ regs->eip, regs->esp, error_code);
+
force_sig(SIGSEGV, current);
return;
@@ -635,6 +650,14 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
"CPU %d.\n", reason, smp_processor_id());
printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
+
+#if defined(CONFIG_EDAC)
+ if(edac_handler_set()) {
+ edac_atomic_assert_error();
+ return;
+ }
+#endif
+
if (panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
@@ -752,6 +775,8 @@ static __kprobes void default_do_nmi(struct pt_regs * regs)
reassert_nmi();
}
+static int ignore_nmis;
+
fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
{
int cpu;
@@ -762,11 +787,24 @@ fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
++nmi_count(cpu);
- default_do_nmi(regs);
+ if (!ignore_nmis)
+ default_do_nmi(regs);
nmi_exit();
}
+void stop_nmi(void)
+{
+ acpi_nmi_disable();
+ ignore_nmis++;
+}
+
+void restart_nmi(void)
+{
+ ignore_nmis--;
+ acpi_nmi_enable();
+}
+
#ifdef CONFIG_KPROBES
fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
{
@@ -1053,6 +1091,7 @@ asmlinkage void math_state_restore(void)
thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
tsk->fpu_counter++;
}
+EXPORT_SYMBOL_GPL(math_state_restore);
#ifndef CONFIG_MATH_EMULATION
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index ea63a30ca3e8..debd7dbb4158 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -27,6 +27,7 @@ static int tsc_enabled;
* an extra value to store the TSC freq
*/
unsigned int tsc_khz;
+EXPORT_SYMBOL_GPL(tsc_khz);
int tsc_disable;
@@ -58,10 +59,11 @@ __setup("notsc", tsc_setup);
*/
static int tsc_unstable;
-static inline int check_tsc_unstable(void)
+int check_tsc_unstable(void)
{
return tsc_unstable;
}
+EXPORT_SYMBOL_GPL(check_tsc_unstable);
/* Accellerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)
@@ -84,7 +86,7 @@ static inline int check_tsc_unstable(void)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
-static unsigned long cyc2ns_scale __read_mostly;
+unsigned long cyc2ns_scale __read_mostly;
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
@@ -93,15 +95,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz)
cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
}
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
- return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
-}
-
/*
* Scheduler clock - returns current time in nanosec units.
*/
-unsigned long long sched_clock(void)
+unsigned long long native_sched_clock(void)
{
unsigned long long this_offset;
@@ -118,12 +115,24 @@ unsigned long long sched_clock(void)
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
/* read the Time Stamp Counter: */
- get_scheduled_cycles(this_offset);
+ rdtscll(this_offset);
/* return the value in ns */
return cycles_2_ns(this_offset);
}
+/* We need to define a real function for sched_clock, to override the
+ weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+ return paravirt_sched_clock();
+}
+#else
+unsigned long long sched_clock(void)
+ __attribute__((alias("native_sched_clock")));
+#endif
+
unsigned long native_calculate_cpu_khz(void)
{
unsigned long long start, end;
diff --git a/arch/i386/kernel/verify_cpu.S b/arch/i386/kernel/verify_cpu.S
deleted file mode 100644
index f1d1eacf4ab0..000000000000
--- a/arch/i386/kernel/verify_cpu.S
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Check if CPU has some minimum CPUID bits
- This runs in 16bit mode so that the caller can still use the BIOS
- to output errors on the screen */
-#include <asm/cpufeature.h>
-#include <asm/msr.h>
-
-verify_cpu:
- pushfl # Save caller passed flags
- pushl $0 # Kill any dangerous flags
- popfl
-
-#if CONFIG_X86_MINIMUM_CPU_MODEL >= 4
- pushfl
- pop %eax
- orl $(1<<18),%eax # try setting AC
- push %eax
- popfl
- pushfl
- popl %eax
- testl $(1<<18),%eax
- jz bad
-#endif
-#if REQUIRED_MASK1 != 0
- pushfl # standard way to check for cpuid
- popl %eax
- movl %eax,%ebx
- xorl $0x200000,%eax
- pushl %eax
- popfl
- pushfl
- popl %eax
- cmpl %eax,%ebx
- pushfl # standard way to check for cpuid
- popl %eax
- movl %eax,%ebx
- xorl $0x200000,%eax
- pushl %eax
- popfl
- pushfl
- popl %eax
- cmpl %eax,%ebx
- jz bad # REQUIRED_MASK1 != 0 requires CPUID
-
- movl $0x0,%eax # See if cpuid 1 is implemented
- cpuid
- cmpl $0x1,%eax
- jb bad # no cpuid 1
-
-#if REQUIRED_MASK1 & NEED_CMPXCHG64
- /* Some VIA C3s need magic MSRs to enable CX64. Do this here */
- cmpl $0x746e6543,%ebx # Cent
- jne 1f
- cmpl $0x48727561,%edx # aurH
- jne 1f
- cmpl $0x736c7561,%ecx # auls
- jne 1f
- movl $1,%eax # check model
- cpuid
- movl %eax,%ebx
- shr $8,%ebx
- andl $0xf,%ebx
- cmp $6,%ebx # check family == 6
- jne 1f
- shr $4,%eax
- andl $0xf,%eax
- cmpl $6,%eax # check model >= 6
- jb 1f
- # assume models >= 6 all support this MSR
- movl $MSR_VIA_FCR,%ecx
- rdmsr
- orl $((1<<1)|(1<<7)),%eax # enable CMPXCHG64 and PGE
- wrmsr
-1:
-#endif
- movl $0x1,%eax # Does the cpu have what it takes
- cpuid
-
-#if CONFIG_X86_MINIMUM_CPU_MODEL > 4
-#error add proper model checking here
-#endif
-
- andl $REQUIRED_MASK1,%edx
- xorl $REQUIRED_MASK1,%edx
- jnz bad
-#endif /* REQUIRED_MASK1 */
-
- popfl
- xor %eax,%eax
- ret
-
-bad:
- popfl
- movl $1,%eax
- ret
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
index c12720d7cbc5..72042bb7ec94 100644
--- a/arch/i386/kernel/vmi.c
+++ b/arch/i386/kernel/vmi.c
@@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
}
#endif
-static void vmi_allocate_pt(u32 pfn)
+static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
{
vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -891,7 +891,7 @@ static inline int __init activate_vmi(void)
paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
#endif
- paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
+ paravirt_ops.sched_clock = vmi_sched_clock;
paravirt_ops.get_cpu_khz = vmi_cpu_khz;
/* We have true wallclock functions; disable CMOS clock sync */
diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c
index 26a37f8a8762..b1b5ab08b26e 100644
--- a/arch/i386/kernel/vmiclock.c
+++ b/arch/i386/kernel/vmiclock.c
@@ -32,6 +32,7 @@
#include <asm/apicdef.h>
#include <asm/apic.h>
#include <asm/timer.h>
+#include <asm/i8253.h>
#include <irq_vectors.h>
#include "io_ports.h"
@@ -64,10 +65,10 @@ int vmi_set_wallclock(unsigned long now)
return 0;
}
-/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */
-unsigned long long vmi_get_sched_cycles(void)
+/* paravirt_ops.sched_clock = vmi_sched_clock */
+unsigned long long vmi_sched_clock(void)
{
- return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
+ return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
}
/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
@@ -142,6 +143,7 @@ static void vmi_timer_set_mode(enum clock_event_mode mode,
switch (mode) {
case CLOCK_EVT_MODE_ONESHOT:
+ case CLOCK_EVT_MODE_RESUME:
break;
case CLOCK_EVT_MODE_PERIODIC:
cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index aa87b06c7c82..7d72cce00529 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -60,7 +60,9 @@ SECTIONS
__stop___ex_table = .;
}
- BUG_TABLE
+ NOTES :text :note
+
+ BUG_TABLE :text
. = ALIGN(4);
.tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
@@ -88,6 +90,7 @@ SECTIONS
. = ALIGN(4096);
.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+ *(.data.page_aligned)
*(.data.idt)
}
@@ -180,6 +183,7 @@ SECTIONS
.data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
__per_cpu_start = .;
*(.data.percpu)
+ *(.data.percpu.shared_aligned)
__per_cpu_end = .;
}
. = ALIGN(4096);
@@ -206,6 +210,4 @@ SECTIONS
STABS_DEBUG
DWARF_DEBUG
-
- NOTES
}
diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S
index d4b5be4f3d5f..07c0daf78237 100644
--- a/arch/i386/kernel/vsyscall-note.S
+++ b/arch/i386/kernel/vsyscall-note.S
@@ -3,23 +3,43 @@
* Here we can supply some information useful to userland.
*/
-#include <linux/uts.h>
#include <linux/version.h>
+#include <linux/elfnote.h>
-#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \
- .section name, flags; \
- .balign 4; \
- .long 1f - 0f; /* name length */ \
- .long 3f - 2f; /* data length */ \
- .long type; /* note type */ \
-0: .asciz vendor; /* vendor name */ \
-1: .balign 4; \
-2:
+/* Ideally this would use UTS_NAME, but using a quoted string here
+ doesn't work. Remember to change this when changing the
+ kernel's name. */
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
-#define ASM_ELF_NOTE_END \
-3: .balign 4; /* pad out section */ \
- .previous
+#ifdef CONFIG_XEN
+/*
+ * Add a special note telling glibc's dynamic linker a fake hardware
+ * flavor that it will use to choose the search path for libraries in the
+ * same way it uses real hardware capabilities like "mmx".
+ * We supply "nosegneg" as the fake capability, to indicate that we
+ * do not like negative offsets in instructions using segment overrides,
+ * since we implement those inefficiently. This makes it possible to
+ * install libraries optimized to avoid those access patterns in someplace
+ * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file
+ * corresponding to the bits here is needed to make ldconfig work right.
+ * It should contain:
+ * hwcap 1 nosegneg
+ * to match the mapping of bit to name that we give here.
+ *
+ * At runtime, the fake hardware feature will be considered to be present
+ * if its bit is set in the mask word. So, we start with the mask 0, and
+ * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
+ */
- ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0)
- .long LINUX_VERSION_CODE
- ASM_ELF_NOTE_END
+#include "../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
+
+ .globl VDSO_NOTE_MASK
+ELFNOTE_START(GNU, 2, "a")
+ .long 1 /* ncaps */
+VDSO_NOTE_MASK:
+ .long 0 /* mask */
+ .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
+ELFNOTE_END
+#endif
diff --git a/arch/i386/lib/Makefile b/arch/i386/lib/Makefile
index 22d8ac5815f0..4d105fdfe817 100644
--- a/arch/i386/lib/Makefile
+++ b/arch/i386/lib/Makefile
@@ -4,7 +4,7 @@
lib-y = checksum.o delay.o usercopy.o getuser.o putuser.o memcpy.o strstr.o \
- bitops.o semaphore.o
+ bitops.o semaphore.o string.o
lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
diff --git a/arch/i386/lib/string.c b/arch/i386/lib/string.c
new file mode 100644
index 000000000000..2c773fefa3dd
--- /dev/null
+++ b/arch/i386/lib/string.c
@@ -0,0 +1,257 @@
+/*
+ * Most of the string-functions are rather heavily hand-optimized,
+ * see especially strsep,strstr,str[c]spn. They should work, but are not
+ * very easy to understand. Everything is done entirely within the register
+ * set, making the functions fast and clean. String instructions have been
+ * used through-out, making for "slightly" unclear code :-)
+ *
+ * AK: On P4 and K7 using non string instruction implementations might be faster
+ * for large memory blocks. But most of them are unlikely to be used on large
+ * strings.
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+
+#ifdef __HAVE_ARCH_STRCPY
+char *strcpy(char * dest,const char *src)
+{
+ int d0, d1, d2;
+ asm volatile( "1:\tlodsb\n\t"
+ "stosb\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b"
+ : "=&S" (d0), "=&D" (d1), "=&a" (d2)
+ :"0" (src),"1" (dest) : "memory");
+ return dest;
+}
+EXPORT_SYMBOL(strcpy);
+#endif
+
+#ifdef __HAVE_ARCH_STRNCPY
+char *strncpy(char * dest,const char *src,size_t count)
+{
+ int d0, d1, d2, d3;
+ asm volatile( "1:\tdecl %2\n\t"
+ "js 2f\n\t"
+ "lodsb\n\t"
+ "stosb\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n\t"
+ "rep\n\t"
+ "stosb\n"
+ "2:"
+ : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
+ :"0" (src),"1" (dest),"2" (count) : "memory");
+ return dest;
+}
+EXPORT_SYMBOL(strncpy);
+#endif
+
+#ifdef __HAVE_ARCH_STRCAT
+char *strcat(char * dest,const char * src)
+{
+ int d0, d1, d2, d3;
+ asm volatile( "repne\n\t"
+ "scasb\n\t"
+ "decl %1\n"
+ "1:\tlodsb\n\t"
+ "stosb\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b"
+ : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
+ : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory");
+ return dest;
+}
+EXPORT_SYMBOL(strcat);
+#endif
+
+#ifdef __HAVE_ARCH_STRNCAT
+char *strncat(char * dest,const char * src,size_t count)
+{
+ int d0, d1, d2, d3;
+ asm volatile( "repne\n\t"
+ "scasb\n\t"
+ "decl %1\n\t"
+ "movl %8,%3\n"
+ "1:\tdecl %3\n\t"
+ "js 2f\n\t"
+ "lodsb\n\t"
+ "stosb\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n"
+ "2:\txorl %2,%2\n\t"
+ "stosb"
+ : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
+ : "0" (src),"1" (dest),"2" (0),"3" (0xffffffffu), "g" (count)
+ : "memory");
+ return dest;
+}
+EXPORT_SYMBOL(strncat);
+#endif
+
+#ifdef __HAVE_ARCH_STRCMP
+int strcmp(const char * cs,const char * ct)
+{
+ int d0, d1;
+ int res;
+ asm volatile( "1:\tlodsb\n\t"
+ "scasb\n\t"
+ "jne 2f\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n\t"
+ "xorl %%eax,%%eax\n\t"
+ "jmp 3f\n"
+ "2:\tsbbl %%eax,%%eax\n\t"
+ "orb $1,%%al\n"
+ "3:"
+ :"=a" (res), "=&S" (d0), "=&D" (d1)
+ :"1" (cs),"2" (ct)
+ :"memory");
+ return res;
+}
+EXPORT_SYMBOL(strcmp);
+#endif
+
+#ifdef __HAVE_ARCH_STRNCMP
+int strncmp(const char * cs,const char * ct,size_t count)
+{
+ int res;
+ int d0, d1, d2;
+ asm volatile( "1:\tdecl %3\n\t"
+ "js 2f\n\t"
+ "lodsb\n\t"
+ "scasb\n\t"
+ "jne 3f\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n"
+ "2:\txorl %%eax,%%eax\n\t"
+ "jmp 4f\n"
+ "3:\tsbbl %%eax,%%eax\n\t"
+ "orb $1,%%al\n"
+ "4:"
+ :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
+ :"1" (cs),"2" (ct),"3" (count)
+ :"memory");
+ return res;
+}
+EXPORT_SYMBOL(strncmp);
+#endif
+
+#ifdef __HAVE_ARCH_STRCHR
+char *strchr(const char * s, int c)
+{
+ int d0;
+ char * res;
+ asm volatile( "movb %%al,%%ah\n"
+ "1:\tlodsb\n\t"
+ "cmpb %%ah,%%al\n\t"
+ "je 2f\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n\t"
+ "movl $1,%1\n"
+ "2:\tmovl %1,%0\n\t"
+ "decl %0"
+ :"=a" (res), "=&S" (d0)
+ :"1" (s),"0" (c)
+ :"memory");
+ return res;
+}
+EXPORT_SYMBOL(strchr);
+#endif
+
+#ifdef __HAVE_ARCH_STRRCHR
+char *strrchr(const char * s, int c)
+{
+ int d0, d1;
+ char * res;
+ asm volatile( "movb %%al,%%ah\n"
+ "1:\tlodsb\n\t"
+ "cmpb %%ah,%%al\n\t"
+ "jne 2f\n\t"
+ "leal -1(%%esi),%0\n"
+ "2:\ttestb %%al,%%al\n\t"
+ "jne 1b"
+ :"=g" (res), "=&S" (d0), "=&a" (d1)
+ :"0" (0),"1" (s),"2" (c)
+ :"memory");
+ return res;
+}
+EXPORT_SYMBOL(strrchr);
+#endif
+
+#ifdef __HAVE_ARCH_STRLEN
+size_t strlen(const char * s)
+{
+ int d0;
+ int res;
+ asm volatile( "repne\n\t"
+ "scasb\n\t"
+ "notl %0\n\t"
+ "decl %0"
+ :"=c" (res), "=&D" (d0)
+ :"1" (s),"a" (0), "0" (0xffffffffu)
+ :"memory");
+ return res;
+}
+EXPORT_SYMBOL(strlen);
+#endif
+
+#ifdef __HAVE_ARCH_MEMCHR
+void *memchr(const void *cs,int c,size_t count)
+{
+ int d0;
+ void *res;
+ if (!count)
+ return NULL;
+ asm volatile( "repne\n\t"
+ "scasb\n\t"
+ "je 1f\n\t"
+ "movl $1,%0\n"
+ "1:\tdecl %0"
+ :"=D" (res), "=&c" (d0)
+ :"a" (c),"0" (cs),"1" (count)
+ :"memory");
+ return res;
+}
+EXPORT_SYMBOL(memchr);
+#endif
+
+#ifdef __HAVE_ARCH_MEMSCAN
+void *memscan(void * addr, int c, size_t size)
+{
+ if (!size)
+ return addr;
+ asm volatile("repnz; scasb\n\t"
+ "jnz 1f\n\t"
+ "dec %%edi\n"
+ "1:"
+ : "=D" (addr), "=c" (size)
+ : "0" (addr), "1" (size), "a" (c)
+ : "memory");
+ return addr;
+}
+EXPORT_SYMBOL(memscan);
+#endif
+
+#ifdef __HAVE_ARCH_STRNLEN
+size_t strnlen(const char *s, size_t count)
+{
+ int d0;
+ int res;
+ asm volatile( "movl %2,%0\n\t"
+ "jmp 2f\n"
+ "1:\tcmpb $0,(%0)\n\t"
+ "je 3f\n\t"
+ "incl %0\n"
+ "2:\tdecl %1\n\t"
+ "cmpl $-1,%1\n\t"
+ "jne 1b\n"
+ "3:\tsubl %2,%0"
+ :"=a" (res), "=&d" (d0)
+ :"c" (s),"1" (count)
+ :"memory");
+ return res;
+}
+EXPORT_SYMBOL(strnlen);
+#endif
diff --git a/arch/i386/mach-generic/es7000.c b/arch/i386/mach-generic/es7000.c
index b47f951c0ec2..4742626f08c4 100644
--- a/arch/i386/mach-generic/es7000.c
+++ b/arch/i386/mach-generic/es7000.c
@@ -66,4 +66,4 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
}
#endif
-struct genapic apic_es7000 = APIC_INIT("es7000", probe_es7000);
+struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
diff --git a/arch/i386/mach-visws/traps.c b/arch/i386/mach-visws/traps.c
index 5199bd03254a..843b67acf43b 100644
--- a/arch/i386/mach-visws/traps.c
+++ b/arch/i386/mach-visws/traps.c
@@ -23,13 +23,13 @@ static __init void lithium_init(void)
set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
- (li_pcia_read16(PCI_DEVICE_ID) != PCI_VENDOR_ID_SGI_LITHIUM)) {
+ (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
panic("This machine is not SGI Visual Workstation 320/540");
}
if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
- (li_pcib_read16(PCI_DEVICE_ID) != PCI_VENDOR_ID_SGI_LITHIUM)) {
+ (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
panic("This machine is not SGI Visual Workstation 320/540");
}
diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c
index b4b24e0e45e1..f9d595338159 100644
--- a/arch/i386/mach-voyager/voyager_thread.c
+++ b/arch/i386/mach-voyager/voyager_thread.c
@@ -52,7 +52,7 @@ execute(const char *string)
NULL,
};
- if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) {
+ if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
printk(KERN_ERR "Voyager failed to run \"%s\": %i\n",
string, ret);
}
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index 1ecb3e43b523..01ffdd4964f0 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -283,6 +283,8 @@ static inline int vmalloc_fault(unsigned long address)
return 0;
}
+int show_unhandled_signals = 1;
+
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
@@ -303,6 +305,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
struct vm_area_struct * vma;
unsigned long address;
int write, si_code;
+ int fault;
/* get the address */
address = read_cr2();
@@ -422,20 +425,18 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- switch (handle_mm_fault(mm, vma, address, write)) {
- case VM_FAULT_MINOR:
- tsk->min_flt++;
- break;
- case VM_FAULT_MAJOR:
- tsk->maj_flt++;
- break;
- case VM_FAULT_SIGBUS:
- goto do_sigbus;
- case VM_FAULT_OOM:
+ fault = handle_mm_fault(mm, vma, address, write);
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+ if (fault & VM_FAULT_OOM)
goto out_of_memory;
- default:
- BUG();
+ else if (fault & VM_FAULT_SIGBUS)
+ goto do_sigbus;
+ BUG();
}
+ if (fault & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
/*
* Did it hit the DOS screen memory VA from vm86 mode?
@@ -470,6 +471,14 @@ bad_area_nosemaphore:
if (is_prefetch(regs, address, error_code))
return;
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+ printk("%s%s[%d]: segfault at %08lx eip %08lx "
+ "esp %08lx error %lx\n",
+ tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
+ tsk->comm, tsk->pid, address, regs->eip,
+ regs->esp, error_code);
+ }
tsk->thread.cr2 = address;
/* Kernel addresses are always protection faults */
tsk->thread.error_code = error_code | (address >= TASK_SIZE);
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 7135946d3663..c3b9905af2d5 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
- paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
+ paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
BUG_ON(page_table != pte_offset_kernel(pmd, 0));
}
@@ -471,8 +471,13 @@ void zap_low_mappings (void)
flush_tlb_all();
}
+int nx_enabled = 0;
+
+#ifdef CONFIG_X86_PAE
+
static int disable_nx __initdata = 0;
u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
/*
* noexec = on|off
@@ -499,9 +504,6 @@ static int __init noexec_setup(char *str)
}
early_param("noexec", noexec_setup);
-int nx_enabled = 0;
-#ifdef CONFIG_X86_PAE
-
static void __init set_nx(void)
{
unsigned int v[4], l, h;
@@ -751,8 +753,7 @@ void __init pgtable_cache_init(void)
PTRS_PER_PMD*sizeof(pmd_t),
PTRS_PER_PMD*sizeof(pmd_t),
SLAB_PANIC,
- pmd_ctor,
- NULL);
+ pmd_ctor);
if (!SHARED_KERNEL_PMD) {
/* If we're in PAE mode and have a non-shared
kernel pmd, then the pgd size must be a
@@ -799,17 +800,9 @@ void mark_rodata_ro(void)
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
-#ifndef CONFIG_KPROBES
-#ifdef CONFIG_HOTPLUG_CPU
- /* It must still be possible to apply SMP alternatives. */
- if (num_possible_cpus() <= 1)
-#endif
- {
- change_page_attr(virt_to_page(start),
- size >> PAGE_SHIFT, PAGE_KERNEL_RX);
- printk("Write protecting the kernel text: %luk\n", size >> 10);
- }
-#endif
+ change_page_attr(virt_to_page(start),
+ size >> PAGE_SHIFT, PAGE_KERNEL_RX);
+ printk("Write protecting the kernel text: %luk\n", size >> 10);
start += size;
size = (unsigned long)__end_rodata - start;
change_page_attr(virt_to_page(start),
diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c
index fff08ae7b5ed..0b278315d737 100644
--- a/arch/i386/mm/ioremap.c
+++ b/arch/i386/mm/ioremap.c
@@ -196,7 +196,7 @@ void iounmap(volatile void __iomem *addr)
/* Reset the direct mapping. Can block */
if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
change_page_attr(virt_to_page(__va(p->phys_addr)),
- p->size >> PAGE_SHIFT,
+ get_vm_area_size(p) >> PAGE_SHIFT,
PAGE_KERNEL);
global_flush_tlb();
}
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 2eb14a73be9c..8927222b3ab2 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
address = __pa(address);
addr = address & LARGE_PAGE_MASK;
pbase = (pte_t *)page_address(base);
- paravirt_alloc_pt(page_to_pfn(base));
+ paravirt_alloc_pt(&init_mm, page_to_pfn(base));
for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
addr == address ? prot : ref_prot));
@@ -82,7 +82,7 @@ static void flush_kernel_map(void *arg)
struct page *p;
/* High level code is not ready for clflush yet */
- if (0 && cpu_has_clflush) {
+ if (cpu_has_clflush) {
list_for_each_entry (p, lh, lru)
cache_flush_page(p);
} else if (boot_cpu_data.x86_model >= 4)
@@ -136,6 +136,12 @@ static inline void revert_page(struct page *kpte_page, unsigned long address)
ref_prot));
}
+static inline void save_page(struct page *kpte_page)
+{
+ if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
+ list_add(&kpte_page->lru, &df_list);
+}
+
static int
__change_page_attr(struct page *page, pgprot_t prot)
{
@@ -150,6 +156,9 @@ __change_page_attr(struct page *page, pgprot_t prot)
if (!kpte)
return -EINVAL;
kpte_page = virt_to_page(kpte);
+ BUG_ON(PageLRU(kpte_page));
+ BUG_ON(PageCompound(kpte_page));
+
if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
if (!pte_huge(*kpte)) {
set_pte_atomic(kpte, mk_pte(page, prot));
@@ -179,11 +188,11 @@ __change_page_attr(struct page *page, pgprot_t prot)
* time (not via split_large_page) and in turn we must not
* replace it with a largepage.
*/
+
+ save_page(kpte_page);
if (!PageReserved(kpte_page)) {
if (cpu_has_pse && (page_private(kpte_page) == 0)) {
- ClearPagePrivate(kpte_page);
paravirt_release_pt(page_to_pfn(kpte_page));
- list_add(&kpte_page->lru, &df_list);
revert_page(kpte_page, address);
}
}
@@ -236,6 +245,11 @@ void global_flush_tlb(void)
spin_unlock_irq(&cpa_lock);
flush_map(&l);
list_for_each_entry_safe(pg, next, &l, lru) {
+ list_del(&pg->lru);
+ clear_bit(PG_arch_1, &pg->flags);
+ if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
+ continue;
+ ClearPagePrivate(pg);
__free_page(pg);
}
}
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index 8d7c0864cc04..01437c46baae 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -235,7 +235,7 @@ static inline void pgd_list_del(pgd_t *pgd)
#if (PTRS_PER_PMD == 1)
/* Non-PAE pgd constructor */
-void pgd_ctor(void *pgd)
+static void pgd_ctor(void *pgd)
{
unsigned long flags;
@@ -257,7 +257,7 @@ void pgd_ctor(void *pgd)
}
#else /* PTRS_PER_PMD > 1 */
/* PAE pgd constructor */
-void pgd_ctor(void *pgd)
+static void pgd_ctor(void *pgd)
{
/* PAE, kernel PMD may be shared */
@@ -276,7 +276,7 @@ void pgd_ctor(void *pgd)
}
#endif /* PTRS_PER_PMD */
-void pgd_dtor(void *pgd)
+static void pgd_dtor(void *pgd)
{
unsigned long flags; /* can be called from interrupt context */
diff --git a/arch/i386/pci/acpi.c b/arch/i386/pci/acpi.c
index b33aea845f58..bc8a44bddaa7 100644
--- a/arch/i386/pci/acpi.c
+++ b/arch/i386/pci/acpi.c
@@ -8,20 +8,42 @@
struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum)
{
struct pci_bus *bus;
+ struct pci_sysdata *sd;
+ int pxm;
+
+ /* Allocate per-root-bus (not per bus) arch-specific data.
+ * TODO: leak; this memory is never freed.
+ * It's arguable whether it's worth the trouble to care.
+ */
+ sd = kzalloc(sizeof(*sd), GFP_KERNEL);
+ if (!sd) {
+ printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum);
+ return NULL;
+ }
if (domain != 0) {
printk(KERN_WARNING "PCI: Multiple domains not supported\n");
+ kfree(sd);
return NULL;
}
- bus = pcibios_scan_root(busnum);
+ sd->node = -1;
+
+ pxm = acpi_get_pxm(device->handle);
+#ifdef CONFIG_ACPI_NUMA
+ if (pxm >= 0)
+ sd->node = pxm_to_node(pxm);
+#endif
+
+ bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
+ if (!bus)
+ kfree(sd);
+
#ifdef CONFIG_ACPI_NUMA
if (bus != NULL) {
- int pxm = acpi_get_pxm(device->handle);
if (pxm >= 0) {
- bus->sysdata = (void *)(unsigned long)pxm_to_node(pxm);
- printk("bus %d -> pxm %d -> node %ld\n",
- busnum, pxm, (long)(bus->sysdata));
+ printk("bus %d -> pxm %d -> node %d\n",
+ busnum, pxm, sd->node);
}
}
#endif
diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c
index 3f78d4d8ecf3..85503deeda46 100644
--- a/arch/i386/pci/common.c
+++ b/arch/i386/pci/common.c
@@ -293,6 +293,7 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
struct pci_bus * __devinit pcibios_scan_root(int busnum)
{
struct pci_bus *bus = NULL;
+ struct pci_sysdata *sd;
dmi_check_system(pciprobe_dmi_table);
@@ -303,9 +304,19 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
}
}
+ /* Allocate per-root-bus (not per bus) arch-specific data.
+ * TODO: leak; this memory is never freed.
+ * It's arguable whether it's worth the trouble to care.
+ */
+ sd = kzalloc(sizeof(*sd), GFP_KERNEL);
+ if (!sd) {
+ printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum);
+ return NULL;
+ }
+
printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
- return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, NULL);
+ return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
}
extern u8 pci_cache_line_size;
diff --git a/arch/i386/pci/fixup.c b/arch/i386/pci/fixup.c
index b95b42950ed4..e7306dbf6c42 100644
--- a/arch/i386/pci/fixup.c
+++ b/arch/i386/pci/fixup.c
@@ -118,12 +118,9 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3, pci
static void pci_fixup_via_northbridge_bug(struct pci_dev *d)
{
u8 v;
- u8 revision;
int where = 0x55;
int mask = 0x1f; /* clear bits 5, 6, 7 by default */
- pci_read_config_byte(d, PCI_REVISION_ID, &revision);
-
if (d->device == PCI_DEVICE_ID_VIA_8367_0) {
/* fix pci bus latency issues resulted by NB bios error
it appears on bug free^Wreduced kt266x's bios forces
@@ -133,8 +130,8 @@ static void pci_fixup_via_northbridge_bug(struct pci_dev *d)
where = 0x95; /* the memory write queue timer register is
different for the KT266x's: 0x95 not 0x55 */
} else if (d->device == PCI_DEVICE_ID_VIA_8363_0 &&
- (revision == VIA_8363_KL133_REVISION_ID ||
- revision == VIA_8363_KM133_REVISION_ID)) {
+ (d->revision == VIA_8363_KL133_REVISION_ID ||
+ d->revision == VIA_8363_KM133_REVISION_ID)) {
mask = 0x3f; /* clear only bits 6 and 7; clearing bit 5
causes screen corruption on the KL133/KM133 */
}
@@ -142,7 +139,7 @@ static void pci_fixup_via_northbridge_bug(struct pci_dev *d)
pci_read_config_byte(d, where, &v);
if (v & ~mask) {
printk(KERN_WARNING "Disabling VIA memory write queue (PCI ID %04x, rev %02x): [%02x] %02x & %02x -> %02x\n", \
- d->device, revision, where, v, mask, v & mask);
+ d->device, d->revision, where, v, mask, v & mask);
v &= mask;
pci_write_config_byte(d, where, v);
}
diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c
index c7cabeed4d7b..4df637e34f81 100644
--- a/arch/i386/pci/mmconfig-shared.c
+++ b/arch/i386/pci/mmconfig-shared.c
@@ -24,6 +24,9 @@
DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS);
+/* Indicate if the mmcfg resources have been placed into the resource table. */
+static int __initdata pci_mmcfg_resources_inserted;
+
/* K8 systems have some devices (typically in the builtin northbridge)
that are only accessible using type1
Normally this can be expressed in the MCFG by not listing them
@@ -170,7 +173,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
return name != NULL;
}
-static void __init pci_mmcfg_insert_resources(void)
+static void __init pci_mmcfg_insert_resources(unsigned long resource_flags)
{
#define PCI_MMCFG_RESOURCE_NAME_LEN 19
int i;
@@ -194,10 +197,13 @@ static void __init pci_mmcfg_insert_resources(void)
cfg->pci_segment);
res->start = cfg->address;
res->end = res->start + (num_buses << 20) - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ res->flags = IORESOURCE_MEM | resource_flags;
insert_resource(&iomem_resource, res);
names += PCI_MMCFG_RESOURCE_NAME_LEN;
}
+
+ /* Mark that the resources have been inserted. */
+ pci_mmcfg_resources_inserted = 1;
}
static void __init pci_mmcfg_reject_broken(int type)
@@ -267,7 +273,43 @@ void __init pci_mmcfg_init(int type)
if (type == 1)
unreachable_devices();
if (known_bridge)
- pci_mmcfg_insert_resources();
+ pci_mmcfg_insert_resources(IORESOURCE_BUSY);
pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
+ } else {
+ /*
+ * Signal not to attempt to insert mmcfg resources because
+ * the architecture mmcfg setup could not initialize.
+ */
+ pci_mmcfg_resources_inserted = 1;
}
}
+
+static int __init pci_mmcfg_late_insert_resources(void)
+{
+ /*
+ * If resources are already inserted or we are not using MMCONFIG,
+ * don't insert the resources.
+ */
+ if ((pci_mmcfg_resources_inserted == 1) ||
+ (pci_probe & PCI_PROBE_MMCONF) == 0 ||
+ (pci_mmcfg_config_num == 0) ||
+ (pci_mmcfg_config == NULL) ||
+ (pci_mmcfg_config[0].address == 0))
+ return 1;
+
+ /*
+ * Attempt to insert the mmcfg resources but not with the busy flag
+ * marked so it won't cause request errors when __request_region is
+ * called.
+ */
+ pci_mmcfg_insert_resources(0);
+
+ return 0;
+}
+
+/*
+ * Perform MMCONFIG resource insertion after PCI initialization to allow for
+ * misprogrammed MCFG tables that state larger sizes but actually conflict
+ * with other system resources.
+ */
+late_initcall(pci_mmcfg_late_insert_resources);
diff --git a/arch/i386/video/Makefile b/arch/i386/video/Makefile
new file mode 100644
index 000000000000..2c447c94adcc
--- /dev/null
+++ b/arch/i386/video/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_FB) += fbdev.o
diff --git a/arch/i386/video/fbdev.c b/arch/i386/video/fbdev.c
new file mode 100644
index 000000000000..48fb38d7d2c0
--- /dev/null
+++ b/arch/i386/video/fbdev.c
@@ -0,0 +1,32 @@
+/*
+ * arch/i386/video/fbdev.c - i386 Framebuffer
+ *
+ * Copyright (C) 2007 Antonino Daplas <adaplas@gmail.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of this archive
+ * for more details.
+ *
+ */
+#include <linux/fb.h>
+#include <linux/pci.h>
+
+int fb_is_primary_device(struct fb_info *info)
+{
+ struct device *device = info->device;
+ struct pci_dev *pci_dev = NULL;
+ struct resource *res = NULL;
+ int retval = 0;
+
+ if (device)
+ pci_dev = to_pci_dev(device);
+
+ if (pci_dev)
+ res = &pci_dev->resource[PCI_ROM_RESOURCE];
+
+ if (res && res->flags & IORESOURCE_ROM_SHADOW)
+ retval = 1;
+
+ return retval;
+}
+EXPORT_SYMBOL(fb_is_primary_device);
diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig
new file mode 100644
index 000000000000..9df99e1885a4
--- /dev/null
+++ b/arch/i386/xen/Kconfig
@@ -0,0 +1,11 @@
+#
+# This Kconfig describes xen options
+#
+
+config XEN
+ bool "Enable support for Xen hypervisor"
+ depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
+ help
+ This is the Linux Xen port. Enabling this will allow the
+ kernel to boot in a paravirtualized environment under the
+ Xen hypervisor.
diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile
new file mode 100644
index 000000000000..343df246bd3e
--- /dev/null
+++ b/arch/i386/xen/Makefile
@@ -0,0 +1,4 @@
+obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \
+ events.o time.o manage.o xen-asm.o
+
+obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
new file mode 100644
index 000000000000..9a8c1181c001
--- /dev/null
+++ b/arch/i386/xen/enlighten.c
@@ -0,0 +1,1144 @@
+/*
+ * Core of Xen paravirt_ops implementation.
+ *
+ * This file contains the xen_paravirt_ops structure itself, and the
+ * implementations for:
+ * - privileged instructions
+ * - interrupt flags
+ * - segment operations
+ * - booting and setup
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/preempt.h>
+#include <linux/hardirq.h>
+#include <linux/percpu.h>
+#include <linux/delay.h>
+#include <linux/start_kernel.h>
+#include <linux/sched.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/sched.h>
+#include <xen/features.h>
+#include <xen/page.h>
+
+#include <asm/paravirt.h>
+#include <asm/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/reboot.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+#include "multicalls.h"
+
+EXPORT_SYMBOL_GPL(hypercall_page);
+
+DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+
+DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
+DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DEFINE_PER_CPU(unsigned long, xen_cr3);
+
+struct start_info *xen_start_info;
+EXPORT_SYMBOL_GPL(xen_start_info);
+
+static /* __initdata */ struct shared_info dummy_shared_info;
+
+/*
+ * Point at some empty memory to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
+
+/*
+ * Flag to determine whether vcpu info placement is available on all
+ * VCPUs. We assume it is to start with, and then set it to zero on
+ * the first failure. This is because it can succeed on some VCPUs
+ * and not others, since it can involve hypervisor memory allocation,
+ * or because the guest failed to guarantee all the appropriate
+ * constraints on all VCPUs (ie buffer can't cross a page boundary).
+ *
+ * Note that any particular CPU may be using a placed vcpu structure,
+ * but we can only optimise if the all are.
+ *
+ * 0: not available, 1: available
+ */
+static int have_vcpu_info_placement = 1;
+
+static void __init xen_vcpu_setup(int cpu)
+{
+ struct vcpu_register_vcpu_info info;
+ int err;
+ struct vcpu_info *vcpup;
+
+ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+
+ if (!have_vcpu_info_placement)
+ return; /* already tested, not available */
+
+ vcpup = &per_cpu(xen_vcpu_info, cpu);
+
+ info.mfn = virt_to_mfn(vcpup);
+ info.offset = offset_in_page(vcpup);
+
+ printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
+ cpu, vcpup, info.mfn, info.offset);
+
+ /* Check to see if the hypervisor will put the vcpu_info
+ structure where we want it, which allows direct access via
+ a percpu-variable. */
+ err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
+
+ if (err) {
+ printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
+ have_vcpu_info_placement = 0;
+ } else {
+ /* This cpu is using the registered vcpu info, even if
+ later ones fail to. */
+ per_cpu(xen_vcpu, cpu) = vcpup;
+
+ printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
+ cpu, vcpup);
+ }
+}
+
+static void __init xen_banner(void)
+{
+ printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+ paravirt_ops.name);
+ printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
+}
+
+static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ unsigned maskedx = ~0;
+
+ /*
+ * Mask out inconvenient features, to try and disable as many
+ * unsupported kernel subsystems as possible.
+ */
+ if (*eax == 1)
+ maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
+ (1 << X86_FEATURE_ACPI) | /* disable ACPI */
+ (1 << X86_FEATURE_ACC)); /* thermal monitoring */
+
+ asm(XEN_EMULATE_PREFIX "cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+ *edx &= maskedx;
+}
+
+static void xen_set_debugreg(int reg, unsigned long val)
+{
+ HYPERVISOR_set_debugreg(reg, val);
+}
+
+static unsigned long xen_get_debugreg(int reg)
+{
+ return HYPERVISOR_get_debugreg(reg);
+}
+
+static unsigned long xen_save_fl(void)
+{
+ struct vcpu_info *vcpu;
+ unsigned long flags;
+
+ vcpu = x86_read_percpu(xen_vcpu);
+
+ /* flag has opposite sense of mask */
+ flags = !vcpu->evtchn_upcall_mask;
+
+ /* convert to IF type flag
+ -0 -> 0x00000000
+ -1 -> 0xffffffff
+ */
+ return (-flags) & X86_EFLAGS_IF;
+}
+
+static void xen_restore_fl(unsigned long flags)
+{
+ struct vcpu_info *vcpu;
+
+ /* convert from IF type flag */
+ flags = !(flags & X86_EFLAGS_IF);
+
+ /* There's a one instruction preempt window here. We need to
+ make sure we're don't switch CPUs between getting the vcpu
+ pointer and updating the mask. */
+ preempt_disable();
+ vcpu = x86_read_percpu(xen_vcpu);
+ vcpu->evtchn_upcall_mask = flags;
+ preempt_enable_no_resched();
+
+ /* Doesn't matter if we get preempted here, because any
+ pending event will get dealt with anyway. */
+
+ if (flags == 0) {
+ preempt_check_resched();
+ barrier(); /* unmask then check (avoid races) */
+ if (unlikely(vcpu->evtchn_upcall_pending))
+ force_evtchn_callback();
+ }
+}
+
+static void xen_irq_disable(void)
+{
+ /* There's a one instruction preempt window here. We need to
+ make sure we're don't switch CPUs between getting the vcpu
+ pointer and updating the mask. */
+ preempt_disable();
+ x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+ preempt_enable_no_resched();
+}
+
+static void xen_irq_enable(void)
+{
+ struct vcpu_info *vcpu;
+
+ /* There's a one instruction preempt window here. We need to
+ make sure we're don't switch CPUs between getting the vcpu
+ pointer and updating the mask. */
+ preempt_disable();
+ vcpu = x86_read_percpu(xen_vcpu);
+ vcpu->evtchn_upcall_mask = 0;
+ preempt_enable_no_resched();
+
+ /* Doesn't matter if we get preempted here, because any
+ pending event will get dealt with anyway. */
+
+ barrier(); /* unmask then check (avoid races) */
+ if (unlikely(vcpu->evtchn_upcall_pending))
+ force_evtchn_callback();
+}
+
+static void xen_safe_halt(void)
+{
+ /* Blocking includes an implicit local_irq_enable(). */
+ if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
+ BUG();
+}
+
+static void xen_halt(void)
+{
+ if (irqs_disabled())
+ HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+ else
+ xen_safe_halt();
+}
+
+static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
+{
+ BUG_ON(preemptible());
+
+ switch (mode) {
+ case PARAVIRT_LAZY_NONE:
+ BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
+ break;
+
+ case PARAVIRT_LAZY_MMU:
+ case PARAVIRT_LAZY_CPU:
+ BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
+ break;
+
+ case PARAVIRT_LAZY_FLUSH:
+ /* flush if necessary, but don't change state */
+ if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
+ xen_mc_flush();
+ return;
+ }
+
+ xen_mc_flush();
+ x86_write_percpu(xen_lazy_mode, mode);
+}
+
+static unsigned long xen_store_tr(void)
+{
+ return 0;
+}
+
+static void xen_set_ldt(const void *addr, unsigned entries)
+{
+ unsigned long linear_addr = (unsigned long)addr;
+ struct mmuext_op *op;
+ struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_SET_LDT;
+ if (linear_addr) {
+ /* ldt my be vmalloced, use arbitrary_virt_to_machine */
+ xmaddr_t maddr;
+ maddr = arbitrary_virt_to_machine((unsigned long)addr);
+ linear_addr = (unsigned long)maddr.maddr;
+ }
+ op->arg1.linear_addr = linear_addr;
+ op->arg2.nr_ents = entries;
+
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+ unsigned long *frames;
+ unsigned long va = dtr->address;
+ unsigned int size = dtr->size + 1;
+ unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+ int f;
+ struct multicall_space mcs;
+
+ /* A GDT can be up to 64k in size, which corresponds to 8192
+ 8-byte entries, or 16 4k pages.. */
+
+ BUG_ON(size > 65536);
+ BUG_ON(va & ~PAGE_MASK);
+
+ mcs = xen_mc_entry(sizeof(*frames) * pages);
+ frames = mcs.args;
+
+ for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
+ frames[f] = virt_to_mfn(va);
+ make_lowmem_page_readonly((void *)va);
+ }
+
+ MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void load_TLS_descriptor(struct thread_struct *t,
+ unsigned int cpu, unsigned int i)
+{
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+ xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+ struct multicall_space mc = __xen_mc_entry(0);
+
+ MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
+}
+
+static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ xen_mc_batch();
+
+ load_TLS_descriptor(t, cpu, 0);
+ load_TLS_descriptor(t, cpu, 1);
+ load_TLS_descriptor(t, cpu, 2);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+ /*
+ * XXX sleazy hack: If we're being called in a lazy-cpu zone,
+ * it means we're in a context switch, and %gs has just been
+ * saved. This means we can zero it out to prevent faults on
+ * exit from the hypervisor if the next process has no %gs.
+ * Either way, it has been saved, and the new value will get
+ * loaded properly. This will go away as soon as Xen has been
+ * modified to not save/restore %gs for normal hypercalls.
+ */
+ if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+ loadsegment(gs, 0);
+}
+
+static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
+ u32 low, u32 high)
+{
+ unsigned long lp = (unsigned long)&dt[entrynum];
+ xmaddr_t mach_lp = virt_to_machine(lp);
+ u64 entry = (u64)high << 32 | low;
+
+ preempt_disable();
+
+ xen_mc_flush();
+ if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
+ BUG();
+
+ preempt_enable();
+}
+
+static int cvt_gate_to_trap(int vector, u32 low, u32 high,
+ struct trap_info *info)
+{
+ u8 type, dpl;
+
+ type = (high >> 8) & 0x1f;
+ dpl = (high >> 13) & 3;
+
+ if (type != 0xf && type != 0xe)
+ return 0;
+
+ info->vector = vector;
+ info->address = (high & 0xffff0000) | (low & 0x0000ffff);
+ info->cs = low >> 16;
+ info->flags = dpl;
+ /* interrupt gates clear IF */
+ if (type == 0xe)
+ info->flags |= 4;
+
+ return 1;
+}
+
+/* Locations of each CPU's IDT */
+static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
+
+/* Set an IDT entry. If the entry is part of the current IDT, then
+ also update Xen. */
+static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
+ u32 low, u32 high)
+{
+ unsigned long p = (unsigned long)&dt[entrynum];
+ unsigned long start, end;
+
+ preempt_disable();
+
+ start = __get_cpu_var(idt_desc).address;
+ end = start + __get_cpu_var(idt_desc).size + 1;
+
+ xen_mc_flush();
+
+ write_dt_entry(dt, entrynum, low, high);
+
+ if (p >= start && (p + 8) <= end) {
+ struct trap_info info[2];
+
+ info[1].address = 0;
+
+ if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
+ if (HYPERVISOR_set_trap_table(info))
+ BUG();
+ }
+
+ preempt_enable();
+}
+
+static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
+ struct trap_info *traps)
+{
+ unsigned in, out, count;
+
+ count = (desc->size+1) / 8;
+ BUG_ON(count > 256);
+
+ for (in = out = 0; in < count; in++) {
+ const u32 *entry = (u32 *)(desc->address + in * 8);
+
+ if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+ out++;
+ }
+ traps[out].address = 0;
+}
+
+void xen_copy_trap_info(struct trap_info *traps)
+{
+ const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
+
+ xen_convert_trap_info(desc, traps);
+}
+
+/* Load a new IDT into Xen. In principle this can be per-CPU, so we
+ hold a spinlock to protect the static traps[] array (static because
+ it avoids allocation, and saves stack space). */
+static void xen_load_idt(const struct Xgt_desc_struct *desc)
+{
+ static DEFINE_SPINLOCK(lock);
+ static struct trap_info traps[257];
+
+ spin_lock(&lock);
+
+ __get_cpu_var(idt_desc) = *desc;
+
+ xen_convert_trap_info(desc, traps);
+
+ xen_mc_flush();
+ if (HYPERVISOR_set_trap_table(traps))
+ BUG();
+
+ spin_unlock(&lock);
+}
+
+/* Write a GDT descriptor entry. Ignore LDT descriptors, since
+ they're handled differently. */
+static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
+ u32 low, u32 high)
+{
+ preempt_disable();
+
+ switch ((high >> 8) & 0xff) {
+ case DESCTYPE_LDT:
+ case DESCTYPE_TSS:
+ /* ignore */
+ break;
+
+ default: {
+ xmaddr_t maddr = virt_to_machine(&dt[entry]);
+ u64 desc = (u64)high << 32 | low;
+
+ xen_mc_flush();
+ if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
+ BUG();
+ }
+
+ }
+
+ preempt_enable();
+}
+
+static void xen_load_esp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ struct multicall_space mcs = xen_mc_entry(0);
+ MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_set_iopl_mask(unsigned mask)
+{
+ struct physdev_set_iopl set_iopl;
+
+ /* Force the change at ring 0. */
+ set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+}
+
+static void xen_io_delay(void)
+{
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned long xen_apic_read(unsigned long reg)
+{
+ return 0;
+}
+
+static void xen_apic_write(unsigned long reg, unsigned long val)
+{
+ /* Warn to see if there's any stray references */
+ WARN_ON(1);
+}
+#endif
+
+static void xen_flush_tlb(void)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_flush_tlb_single(unsigned long addr)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_INVLPG_LOCAL;
+ op->arg1.linear_addr = addr & PAGE_MASK;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
+ unsigned long va)
+{
+ struct {
+ struct mmuext_op op;
+ cpumask_t mask;
+ } *args;
+ cpumask_t cpumask = *cpus;
+ struct multicall_space mcs;
+
+ /*
+ * A couple of (to be removed) sanity checks:
+ *
+ * - current CPU must not be in mask
+ * - mask must exist :)
+ */
+ BUG_ON(cpus_empty(cpumask));
+ BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+ BUG_ON(!mm);
+
+ /* If a CPU which we ran on has gone down, OK. */
+ cpus_and(cpumask, cpumask, cpu_online_map);
+ if (cpus_empty(cpumask))
+ return;
+
+ mcs = xen_mc_entry(sizeof(*args));
+ args = mcs.args;
+ args->mask = cpumask;
+ args->op.arg2.vcpumask = &args->mask;
+
+ if (va == TLB_FLUSH_ALL) {
+ args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+ } else {
+ args->op.cmd = MMUEXT_INVLPG_MULTI;
+ args->op.arg1.linear_addr = va;
+ }
+
+ MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_write_cr2(unsigned long cr2)
+{
+ x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
+}
+
+static unsigned long xen_read_cr2(void)
+{
+ return x86_read_percpu(xen_vcpu)->arch.cr2;
+}
+
+static unsigned long xen_read_cr2_direct(void)
+{
+ return x86_read_percpu(xen_vcpu_info.arch.cr2);
+}
+
+static void xen_write_cr4(unsigned long cr4)
+{
+ /* never allow TSC to be disabled */
+ native_write_cr4(cr4 & ~X86_CR4_TSD);
+}
+
+static unsigned long xen_read_cr3(void)
+{
+ return x86_read_percpu(xen_cr3);
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+ BUG_ON(preemptible());
+
+ if (cr3 == x86_read_percpu(xen_cr3)) {
+ /* just a simple tlb flush */
+ xen_flush_tlb();
+ return;
+ }
+
+ x86_write_percpu(xen_cr3, cr3);
+
+
+ {
+ struct mmuext_op *op;
+ struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+ unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_NEW_BASEPTR;
+ op->arg1.mfn = mfn;
+
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+ }
+}
+
+/* Early in boot, while setting up the initial pagetable, assume
+ everything is pinned. */
+static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
+{
+ BUG_ON(mem_map); /* should only be used early */
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* This needs to make sure the new pte page is pinned iff its being
+ attached to a pinned pagetable. */
+static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+
+ if (PagePinned(virt_to_page(mm->pgd))) {
+ SetPagePinned(page);
+
+ if (!PageHighMem(page))
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+ else
+ /* make sure there are no stray mappings of
+ this page */
+ kmap_flush_unused();
+ }
+}
+
+/* This should never happen until we're OK to use struct page */
+static void xen_release_pt(u32 pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+
+ if (PagePinned(page)) {
+ if (!PageHighMem(page))
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+ }
+}
+
+#ifdef CONFIG_HIGHPTE
+static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+{
+ pgprot_t prot = PAGE_KERNEL;
+
+ if (PagePinned(page))
+ prot = PAGE_KERNEL_RO;
+
+ if (0 && PageHighMem(page))
+ printk("mapping highpte %lx type %d prot %s\n",
+ page_to_pfn(page), type,
+ (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
+
+ return kmap_atomic_prot(page, type, prot);
+}
+#endif
+
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+ /* If there's an existing pte, then don't allow _PAGE_RW to be set */
+ if (pte_val_ma(*ptep) & _PAGE_PRESENT)
+ pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+ pte_val_ma(pte));
+
+ return pte;
+}
+
+/* Init-time set_pte while constructing initial pagetables, which
+ doesn't allow RO pagetable pages to be remapped RW */
+static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+ pte = mask_rw_pte(ptep, pte);
+
+ xen_set_pte(ptep, pte);
+}
+
+static __init void xen_pagetable_setup_start(pgd_t *base)
+{
+ pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
+
+ /* special set_pte for pagetable initialization */
+ paravirt_ops.set_pte = xen_set_pte_init;
+
+ init_mm.pgd = base;
+ /*
+ * copy top-level of Xen-supplied pagetable into place. For
+ * !PAE we can use this as-is, but for PAE it is a stand-in
+ * while we copy the pmd pages.
+ */
+ memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
+
+ if (PTRS_PER_PMD > 1) {
+ int i;
+ /*
+ * For PAE, need to allocate new pmds, rather than
+ * share Xen's, since Xen doesn't like pmd's being
+ * shared between address spaces.
+ */
+ for (i = 0; i < PTRS_PER_PGD; i++) {
+ if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
+ pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+
+ memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
+ PAGE_SIZE);
+
+ make_lowmem_page_readonly(pmd);
+
+ set_pgd(&base[i], __pgd(1 + __pa(pmd)));
+ } else
+ pgd_clear(&base[i]);
+ }
+ }
+
+ /* make sure zero_page is mapped RO so we can use it in pagetables */
+ make_lowmem_page_readonly(empty_zero_page);
+ make_lowmem_page_readonly(base);
+ /*
+ * Switch to new pagetable. This is done before
+ * pagetable_init has done anything so that the new pages
+ * added to the table can be prepared properly for Xen.
+ */
+ xen_write_cr3(__pa(base));
+}
+
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+ /* This will work as long as patching hasn't happened yet
+ (which it hasn't) */
+ paravirt_ops.alloc_pt = xen_alloc_pt;
+ paravirt_ops.set_pte = xen_set_pte;
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /*
+ * Create a mapping for the shared info page.
+ * Should be set_fixmap(), but shared_info is a machine
+ * address with no corresponding pseudo-phys address.
+ */
+ set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
+ PFN_DOWN(xen_start_info->shared_info),
+ PAGE_KERNEL);
+
+ HYPERVISOR_shared_info =
+ (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
+
+ } else
+ HYPERVISOR_shared_info =
+ (struct shared_info *)__va(xen_start_info->shared_info);
+
+ /* Actually pin the pagetable down, but we can't set PG_pinned
+ yet because the page structures don't exist yet. */
+ {
+ struct mmuext_op op;
+#ifdef CONFIG_X86_PAE
+ op.cmd = MMUEXT_PIN_L3_TABLE;
+#else
+ op.cmd = MMUEXT_PIN_L3_TABLE;
+#endif
+ op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+ }
+}
+
+/* This is called once we have the cpu_possible_map */
+void __init xen_setup_vcpu_info_placement(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ xen_vcpu_setup(cpu);
+
+ /* xen_vcpu_setup managed to place the vcpu_info within the
+ percpu area for all cpus, so make use of it */
+ if (have_vcpu_info_placement) {
+ printk(KERN_INFO "Xen: using vcpu_info placement\n");
+
+ paravirt_ops.save_fl = xen_save_fl_direct;
+ paravirt_ops.restore_fl = xen_restore_fl_direct;
+ paravirt_ops.irq_disable = xen_irq_disable_direct;
+ paravirt_ops.irq_enable = xen_irq_enable_direct;
+ paravirt_ops.read_cr2 = xen_read_cr2_direct;
+ paravirt_ops.iret = xen_iret_direct;
+ }
+}
+
+static unsigned xen_patch(u8 type, u16 clobbers, void *insns, unsigned len)
+{
+ char *start, *end, *reloc;
+ unsigned ret;
+
+ start = end = reloc = NULL;
+
+#define SITE(x) \
+ case PARAVIRT_PATCH(x): \
+ if (have_vcpu_info_placement) { \
+ start = (char *)xen_##x##_direct; \
+ end = xen_##x##_direct_end; \
+ reloc = xen_##x##_direct_reloc; \
+ } \
+ goto patch_site
+
+ switch (type) {
+ SITE(irq_enable);
+ SITE(irq_disable);
+ SITE(save_fl);
+ SITE(restore_fl);
+#undef SITE
+
+ patch_site:
+ if (start == NULL || (end-start) > len)
+ goto default_patch;
+
+ ret = paravirt_patch_insns(insns, len, start, end);
+
+ /* Note: because reloc is assigned from something that
+ appears to be an array, gcc assumes it's non-null,
+ but doesn't know its relationship with start and
+ end. */
+ if (reloc > start && reloc < end) {
+ int reloc_off = reloc - start;
+ long *relocp = (long *)(insns + reloc_off);
+ long delta = start - (char *)insns;
+
+ *relocp += delta;
+ }
+ break;
+
+ default_patch:
+ default:
+ ret = paravirt_patch_default(type, clobbers, insns, len);
+ break;
+ }
+
+ return ret;
+}
+
+static const struct paravirt_ops xen_paravirt_ops __initdata = {
+ .paravirt_enabled = 1,
+ .shared_kernel_pmd = 0,
+
+ .name = "Xen",
+ .banner = xen_banner,
+
+ .patch = xen_patch,
+
+ .memory_setup = xen_memory_setup,
+ .arch_setup = xen_arch_setup,
+ .init_IRQ = xen_init_IRQ,
+ .post_allocator_init = xen_mark_init_mm_pinned,
+
+ .time_init = xen_time_init,
+ .set_wallclock = xen_set_wallclock,
+ .get_wallclock = xen_get_wallclock,
+ .get_cpu_khz = xen_cpu_khz,
+ .sched_clock = xen_sched_clock,
+
+ .cpuid = xen_cpuid,
+
+ .set_debugreg = xen_set_debugreg,
+ .get_debugreg = xen_get_debugreg,
+
+ .clts = native_clts,
+
+ .read_cr0 = native_read_cr0,
+ .write_cr0 = native_write_cr0,
+
+ .read_cr2 = xen_read_cr2,
+ .write_cr2 = xen_write_cr2,
+
+ .read_cr3 = xen_read_cr3,
+ .write_cr3 = xen_write_cr3,
+
+ .read_cr4 = native_read_cr4,
+ .read_cr4_safe = native_read_cr4_safe,
+ .write_cr4 = xen_write_cr4,
+
+ .save_fl = xen_save_fl,
+ .restore_fl = xen_restore_fl,
+ .irq_disable = xen_irq_disable,
+ .irq_enable = xen_irq_enable,
+ .safe_halt = xen_safe_halt,
+ .halt = xen_halt,
+ .wbinvd = native_wbinvd,
+
+ .read_msr = native_read_msr_safe,
+ .write_msr = native_write_msr_safe,
+ .read_tsc = native_read_tsc,
+ .read_pmc = native_read_pmc,
+
+ .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
+ .irq_enable_sysexit = NULL, /* never called */
+
+ .load_tr_desc = paravirt_nop,
+ .set_ldt = xen_set_ldt,
+ .load_gdt = xen_load_gdt,
+ .load_idt = xen_load_idt,
+ .load_tls = xen_load_tls,
+
+ .store_gdt = native_store_gdt,
+ .store_idt = native_store_idt,
+ .store_tr = xen_store_tr,
+
+ .write_ldt_entry = xen_write_ldt_entry,
+ .write_gdt_entry = xen_write_gdt_entry,
+ .write_idt_entry = xen_write_idt_entry,
+ .load_esp0 = xen_load_esp0,
+
+ .set_iopl_mask = xen_set_iopl_mask,
+ .io_delay = xen_io_delay,
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ .apic_write = xen_apic_write,
+ .apic_write_atomic = xen_apic_write,
+ .apic_read = xen_apic_read,
+ .setup_boot_clock = paravirt_nop,
+ .setup_secondary_clock = paravirt_nop,
+ .startup_ipi_hook = paravirt_nop,
+#endif
+
+ .flush_tlb_user = xen_flush_tlb,
+ .flush_tlb_kernel = xen_flush_tlb,
+ .flush_tlb_single = xen_flush_tlb_single,
+ .flush_tlb_others = xen_flush_tlb_others,
+
+ .pte_update = paravirt_nop,
+ .pte_update_defer = paravirt_nop,
+
+ .pagetable_setup_start = xen_pagetable_setup_start,
+ .pagetable_setup_done = xen_pagetable_setup_done,
+
+ .alloc_pt = xen_alloc_pt_init,
+ .release_pt = xen_release_pt,
+ .alloc_pd = paravirt_nop,
+ .alloc_pd_clone = paravirt_nop,
+ .release_pd = paravirt_nop,
+
+#ifdef CONFIG_HIGHPTE
+ .kmap_atomic_pte = xen_kmap_atomic_pte,
+#endif
+
+ .set_pte = NULL, /* see xen_pagetable_setup_* */
+ .set_pte_at = xen_set_pte_at,
+ .set_pmd = xen_set_pmd,
+
+ .pte_val = xen_pte_val,
+ .pgd_val = xen_pgd_val,
+
+ .make_pte = xen_make_pte,
+ .make_pgd = xen_make_pgd,
+
+#ifdef CONFIG_X86_PAE
+ .set_pte_atomic = xen_set_pte_atomic,
+ .set_pte_present = xen_set_pte_at,
+ .set_pud = xen_set_pud,
+ .pte_clear = xen_pte_clear,
+ .pmd_clear = xen_pmd_clear,
+
+ .make_pmd = xen_make_pmd,
+ .pmd_val = xen_pmd_val,
+#endif /* PAE */
+
+ .activate_mm = xen_activate_mm,
+ .dup_mmap = xen_dup_mmap,
+ .exit_mmap = xen_exit_mmap,
+
+ .set_lazy_mode = xen_set_lazy_mode,
+};
+
+#ifdef CONFIG_SMP
+static const struct smp_ops xen_smp_ops __initdata = {
+ .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = xen_smp_prepare_cpus,
+ .cpu_up = xen_cpu_up,
+ .smp_cpus_done = xen_smp_cpus_done,
+
+ .smp_send_stop = xen_smp_send_stop,
+ .smp_send_reschedule = xen_smp_send_reschedule,
+ .smp_call_function_mask = xen_smp_call_function_mask,
+};
+#endif /* CONFIG_SMP */
+
+static void xen_reboot(int reason)
+{
+#ifdef CONFIG_SMP
+ smp_send_stop();
+#endif
+
+ if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
+ BUG();
+}
+
+static void xen_restart(char *msg)
+{
+ xen_reboot(SHUTDOWN_reboot);
+}
+
+static void xen_emergency_restart(void)
+{
+ xen_reboot(SHUTDOWN_reboot);
+}
+
+static void xen_machine_halt(void)
+{
+ xen_reboot(SHUTDOWN_poweroff);
+}
+
+static void xen_crash_shutdown(struct pt_regs *regs)
+{
+ xen_reboot(SHUTDOWN_crash);
+}
+
+static const struct machine_ops __initdata xen_machine_ops = {
+ .restart = xen_restart,
+ .halt = xen_machine_halt,
+ .power_off = xen_machine_halt,
+ .shutdown = xen_machine_halt,
+ .crash_shutdown = xen_crash_shutdown,
+ .emergency_restart = xen_emergency_restart,
+};
+
+
+/* First C function to be called on Xen boot */
+asmlinkage void __init xen_start_kernel(void)
+{
+ pgd_t *pgd;
+
+ if (!xen_start_info)
+ return;
+
+ BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
+
+ /* Install Xen paravirt ops */
+ paravirt_ops = xen_paravirt_ops;
+ machine_ops = xen_machine_ops;
+
+#ifdef CONFIG_SMP
+ smp_ops = xen_smp_ops;
+#endif
+
+ xen_setup_features();
+
+ /* Get mfn list */
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
+
+ pgd = (pgd_t *)xen_start_info->pt_base;
+
+ init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+
+ init_mm.pgd = pgd; /* use the Xen pagetables to start */
+
+ /* keep using Xen gdt for now; no urgent need to change it */
+
+ x86_write_percpu(xen_cr3, __pa(pgd));
+
+#ifdef CONFIG_SMP
+ /* Don't do the full vcpu_info placement stuff until we have a
+ possible map. */
+ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+#else
+ /* May as well do it now, since there's no good time to call
+ it later on UP. */
+ xen_setup_vcpu_info_placement();
+#endif
+
+ paravirt_ops.kernel_rpl = 1;
+ if (xen_feature(XENFEAT_supervisor_mode_kernel))
+ paravirt_ops.kernel_rpl = 0;
+
+ /* set the limit of our address space */
+ reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
+
+ /* set up basic CPUID stuff */
+ cpu_detect(&new_cpu_data);
+ new_cpu_data.hard_math = 1;
+ new_cpu_data.x86_capability[0] = cpuid_edx(1);
+
+ /* Poke various useful things into boot_params */
+ LOADER_TYPE = (9 << 4) | 0;
+ INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
+ INITRD_SIZE = xen_start_info->mod_len;
+
+ /* Start the world */
+ start_kernel();
+}
diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c
new file mode 100644
index 000000000000..da1b173547a1
--- /dev/null
+++ b/arch/i386/xen/events.c
@@ -0,0 +1,591 @@
+/*
+ * Xen event channels
+ *
+ * Xen models interrupts with abstract event channels. Because each
+ * domain gets 1024 event channels, but NR_IRQ is not that large, we
+ * must dynamically map irqs<->event channels. The event channels
+ * interface with the rest of the kernel by defining a xen interrupt
+ * chip. When an event is recieved, it is mapped to an irq and sent
+ * through the normal interrupt processing path.
+ *
+ * There are four kinds of events which can be mapped to an event
+ * channel:
+ *
+ * 1. Inter-domain notifications. This includes all the virtual
+ * device events, since they're driven by front-ends in another domain
+ * (typically dom0).
+ * 2. VIRQs, typically used for timers. These are per-cpu events.
+ * 3. IPIs.
+ * 4. Hardware interrupts. Not supported at present.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <asm/ptrace.h>
+#include <asm/irq.h>
+#include <asm/sync_bitops.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+
+#include "xen-ops.h"
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static DEFINE_SPINLOCK(irq_mapping_update_lock);
+
+/* IRQ <-> VIRQ mapping. */
+static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
+
+/* IRQ <-> IPI mapping */
+static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
+
+/* Packed IRQ information: binding type, sub-type index, and event channel. */
+struct packed_irq
+{
+ unsigned short evtchn;
+ unsigned char index;
+ unsigned char type;
+};
+
+static struct packed_irq irq_info[NR_IRQS];
+
+/* Binding types. */
+enum {
+ IRQT_UNBOUND,
+ IRQT_PIRQ,
+ IRQT_VIRQ,
+ IRQT_IPI,
+ IRQT_EVTCHN
+};
+
+/* Convenient shorthand for packed representation of an unbound IRQ. */
+#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
+
+static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+ [0 ... NR_EVENT_CHANNELS-1] = -1
+};
+static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
+static u8 cpu_evtchn[NR_EVENT_CHANNELS];
+
+/* Reference counts for bindings to IRQs. */
+static int irq_bindcount[NR_IRQS];
+
+/* Xen will never allocate port zero for any purpose. */
+#define VALID_EVTCHN(chn) ((chn) != 0)
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void force_evtchn_callback(void)
+{
+ (void)HYPERVISOR_xen_version(0, NULL);
+}
+EXPORT_SYMBOL_GPL(force_evtchn_callback);
+
+static struct irq_chip xen_dynamic_chip;
+
+/* Constructor for packed IRQ information. */
+static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
+{
+ return (struct packed_irq) { evtchn, index, type };
+}
+
+/*
+ * Accessors for packed IRQ information.
+ */
+static inline unsigned int evtchn_from_irq(int irq)
+{
+ return irq_info[irq].evtchn;
+}
+
+static inline unsigned int index_from_irq(int irq)
+{
+ return irq_info[irq].index;
+}
+
+static inline unsigned int type_from_irq(int irq)
+{
+ return irq_info[irq].type;
+}
+
+static inline unsigned long active_evtchns(unsigned int cpu,
+ struct shared_info *sh,
+ unsigned int idx)
+{
+ return (sh->evtchn_pending[idx] &
+ cpu_evtchn_mask[cpu][idx] &
+ ~sh->evtchn_mask[idx]);
+}
+
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+ int irq = evtchn_to_irq[chn];
+
+ BUG_ON(irq == -1);
+#ifdef CONFIG_SMP
+ irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+#endif
+
+ __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
+ __set_bit(chn, cpu_evtchn_mask[cpu]);
+
+ cpu_evtchn[chn] = cpu;
+}
+
+static void init_evtchn_cpu_bindings(void)
+{
+#ifdef CONFIG_SMP
+ int i;
+ /* By default all event channels notify CPU#0. */
+ for (i = 0; i < NR_IRQS; i++)
+ irq_desc[i].affinity = cpumask_of_cpu(0);
+#endif
+
+ memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
+ memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
+}
+
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+ return cpu_evtchn[evtchn];
+}
+
+static inline void clear_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_clear_bit(port, &s->evtchn_pending[0]);
+}
+
+static inline void set_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_set_bit(port, &s->evtchn_pending[0]);
+}
+
+
+/**
+ * notify_remote_via_irq - send event to remote end of event channel via irq
+ * @irq: irq of event channel to send event to
+ *
+ * Unlike notify_remote_via_evtchn(), this is safe to use across
+ * save/restore. Notifications on a broken connection are silently
+ * dropped.
+ */
+void notify_remote_via_irq(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ notify_remote_via_evtchn(evtchn);
+}
+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+
+static void mask_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_set_bit(port, &s->evtchn_mask[0]);
+}
+
+static void unmask_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ unsigned int cpu = get_cpu();
+
+ BUG_ON(!irqs_disabled());
+
+ /* Slow path (hypercall) if this is a non-local port. */
+ if (unlikely(cpu != cpu_from_evtchn(port))) {
+ struct evtchn_unmask unmask = { .port = port };
+ (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+ } else {
+ struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+
+ sync_clear_bit(port, &s->evtchn_mask[0]);
+
+ /*
+ * The following is basically the equivalent of
+ * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
+ * the interrupt edge' if the channel is masked.
+ */
+ if (sync_test_bit(port, &s->evtchn_pending[0]) &&
+ !sync_test_and_set_bit(port / BITS_PER_LONG,
+ &vcpu_info->evtchn_pending_sel))
+ vcpu_info->evtchn_upcall_pending = 1;
+ }
+
+ put_cpu();
+}
+
+static int find_unbound_irq(void)
+{
+ int irq;
+
+ /* Only allocate from dynirq range */
+ for (irq = 0; irq < NR_IRQS; irq++)
+ if (irq_bindcount[irq] == 0)
+ break;
+
+ if (irq == NR_IRQS)
+ panic("No available IRQ to bind to: increase NR_IRQS!\n");
+
+ return irq;
+}
+
+int bind_evtchn_to_irq(unsigned int evtchn)
+{
+ int irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = evtchn_to_irq[evtchn];
+
+ if (irq == -1) {
+ irq = find_unbound_irq();
+
+ dynamic_irq_init(irq);
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "event");
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
+ }
+
+ irq_bindcount[irq]++;
+
+ spin_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
+
+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+{
+ struct evtchn_bind_ipi bind_ipi;
+ int evtchn, irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = per_cpu(ipi_to_irq, cpu)[ipi];
+ if (irq == -1) {
+ irq = find_unbound_irq();
+ if (irq < 0)
+ goto out;
+
+ dynamic_irq_init(irq);
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "ipi");
+
+ bind_ipi.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+ &bind_ipi) != 0)
+ BUG();
+ evtchn = bind_ipi.port;
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
+
+ per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+
+ bind_evtchn_to_cpu(evtchn, cpu);
+ }
+
+ irq_bindcount[irq]++;
+
+ out:
+ spin_unlock(&irq_mapping_update_lock);
+ return irq;
+}
+
+
+static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+{
+ struct evtchn_bind_virq bind_virq;
+ int evtchn, irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = per_cpu(virq_to_irq, cpu)[virq];
+
+ if (irq == -1) {
+ bind_virq.virq = virq;
+ bind_virq.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+ &bind_virq) != 0)
+ BUG();
+ evtchn = bind_virq.port;
+
+ irq = find_unbound_irq();
+
+ dynamic_irq_init(irq);
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "virq");
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
+
+ per_cpu(virq_to_irq, cpu)[virq] = irq;
+
+ bind_evtchn_to_cpu(evtchn, cpu);
+ }
+
+ irq_bindcount[irq]++;
+
+ spin_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+
+static void unbind_from_irq(unsigned int irq)
+{
+ struct evtchn_close close;
+ int evtchn = evtchn_from_irq(irq);
+
+ spin_lock(&irq_mapping_update_lock);
+
+ if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
+ close.port = evtchn;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+ BUG();
+
+ switch (type_from_irq(irq)) {
+ case IRQT_VIRQ:
+ per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
+ [index_from_irq(irq)] = -1;
+ break;
+ default:
+ break;
+ }
+
+ /* Closed ports are implicitly re-bound to VCPU0. */
+ bind_evtchn_to_cpu(evtchn, 0);
+
+ evtchn_to_irq[evtchn] = -1;
+ irq_info[irq] = IRQ_UNBOUND;
+
+ dynamic_irq_init(irq);
+ }
+
+ spin_unlock(&irq_mapping_update_lock);
+}
+
+int bind_evtchn_to_irqhandler(unsigned int evtchn,
+ irqreturn_t (*handler)(int, void *),
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ unsigned int irq;
+ int retval;
+
+ irq = bind_evtchn_to_irq(evtchn);
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ irqreturn_t (*handler)(int, void *),
+ unsigned long irqflags, const char *devname, void *dev_id)
+{
+ unsigned int irq;
+ int retval;
+
+ irq = bind_virq_to_irq(virq, cpu);
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
+
+int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+ unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_ipi_to_irq(ipi, cpu);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+
+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
+{
+ free_irq(irq, dev_id);
+ unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+
+void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
+{
+ int irq = per_cpu(ipi_to_irq, cpu)[vector];
+ BUG_ON(irq < 0);
+ notify_remote_via_irq(irq);
+}
+
+
+/*
+ * Search the CPUs pending events bitmasks. For each one found, map
+ * the event number to an irq, and feed it into do_IRQ() for
+ * handling.
+ *
+ * Xen uses a two-level bitmap to speed searching. The first level is
+ * a bitset of words which contain pending event bits. The second
+ * level is a bitset of pending events themselves.
+ */
+fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+ int cpu = get_cpu();
+ struct shared_info *s = HYPERVISOR_shared_info;
+ struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+ unsigned long pending_words;
+
+ vcpu_info->evtchn_upcall_pending = 0;
+
+ /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
+ pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
+ while (pending_words != 0) {
+ unsigned long pending_bits;
+ int word_idx = __ffs(pending_words);
+ pending_words &= ~(1UL << word_idx);
+
+ while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
+ int bit_idx = __ffs(pending_bits);
+ int port = (word_idx * BITS_PER_LONG) + bit_idx;
+ int irq = evtchn_to_irq[port];
+
+ if (irq != -1) {
+ regs->orig_eax = ~irq;
+ do_IRQ(regs);
+ }
+ }
+ }
+
+ put_cpu();
+}
+
+/* Rebind an evtchn so that it gets delivered to a specific cpu */
+static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+{
+ struct evtchn_bind_vcpu bind_vcpu;
+ int evtchn = evtchn_from_irq(irq);
+
+ if (!VALID_EVTCHN(evtchn))
+ return;
+
+ /* Send future instances of this interrupt to other vcpu. */
+ bind_vcpu.port = evtchn;
+ bind_vcpu.vcpu = tcpu;
+
+ /*
+ * If this fails, it usually just indicates that we're dealing with a
+ * virq or IPI channel, which don't actually need to be rebound. Ignore
+ * it, but don't do the xenlinux-level rebind in that case.
+ */
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
+ bind_evtchn_to_cpu(evtchn, tcpu);
+}
+
+
+static void set_affinity_irq(unsigned irq, cpumask_t dest)
+{
+ unsigned tcpu = first_cpu(dest);
+ rebind_irq_to_cpu(irq, tcpu);
+}
+
+static void enable_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ unmask_evtchn(evtchn);
+}
+
+static void disable_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ mask_evtchn(evtchn);
+}
+
+static void ack_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ move_native_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ clear_evtchn(evtchn);
+}
+
+static int retrigger_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+ int ret = 0;
+
+ if (VALID_EVTCHN(evtchn)) {
+ set_evtchn(evtchn);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static struct irq_chip xen_dynamic_chip __read_mostly = {
+ .name = "xen-dyn",
+ .mask = disable_dynirq,
+ .unmask = enable_dynirq,
+ .ack = ack_dynirq,
+ .set_affinity = set_affinity_irq,
+ .retrigger = retrigger_dynirq,
+};
+
+void __init xen_init_IRQ(void)
+{
+ int i;
+
+ init_evtchn_cpu_bindings();
+
+ /* No event channels are 'live' right now. */
+ for (i = 0; i < NR_EVENT_CHANNELS; i++)
+ mask_evtchn(i);
+
+ /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
+ for (i = 0; i < NR_IRQS; i++)
+ irq_bindcount[i] = 0;
+
+ irq_ctx_init(smp_processor_id());
+}
diff --git a/arch/i386/xen/features.c b/arch/i386/xen/features.c
new file mode 100644
index 000000000000..0707714e40d6
--- /dev/null
+++ b/arch/i386/xen/features.c
@@ -0,0 +1,29 @@
+/******************************************************************************
+ * features.c
+ *
+ * Xen feature flags.
+ *
+ * Copyright (c) 2006, Ian Campbell, XenSource Inc.
+ */
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/features.h>
+
+u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
+EXPORT_SYMBOL_GPL(xen_features);
+
+void xen_setup_features(void)
+{
+ struct xen_feature_info fi;
+ int i, j;
+
+ for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
+ fi.submap_idx = i;
+ if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
+ break;
+ for (j = 0; j < 32; j++)
+ xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
+ }
+}
diff --git a/arch/i386/xen/manage.c b/arch/i386/xen/manage.c
new file mode 100644
index 000000000000..aa7af9e6abc0
--- /dev/null
+++ b/arch/i386/xen/manage.c
@@ -0,0 +1,143 @@
+/*
+ * Handle extern requests for shutdown, reboot and sysrq
+ */
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/reboot.h>
+#include <linux/sysrq.h>
+
+#include <xen/xenbus.h>
+
+#define SHUTDOWN_INVALID -1
+#define SHUTDOWN_POWEROFF 0
+#define SHUTDOWN_SUSPEND 2
+/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
+ * report a crash, not be instructed to crash!
+ * HALT is the same as POWEROFF, as far as we're concerned. The tools use
+ * the distinction when we return the reason code to them.
+ */
+#define SHUTDOWN_HALT 4
+
+/* Ignore multiple shutdown requests. */
+static int shutting_down = SHUTDOWN_INVALID;
+
+static void shutdown_handler(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ char *str;
+ struct xenbus_transaction xbt;
+ int err;
+
+ if (shutting_down != SHUTDOWN_INVALID)
+ return;
+
+ again:
+ err = xenbus_transaction_start(&xbt);
+ if (err)
+ return;
+
+ str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
+ /* Ignore read errors and empty reads. */
+ if (XENBUS_IS_ERR_READ(str)) {
+ xenbus_transaction_end(xbt, 1);
+ return;
+ }
+
+ xenbus_write(xbt, "control", "shutdown", "");
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN) {
+ kfree(str);
+ goto again;
+ }
+
+ if (strcmp(str, "poweroff") == 0 ||
+ strcmp(str, "halt") == 0)
+ orderly_poweroff(false);
+ else if (strcmp(str, "reboot") == 0)
+ ctrl_alt_del();
+ else {
+ printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
+ shutting_down = SHUTDOWN_INVALID;
+ }
+
+ kfree(str);
+}
+
+static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
+ unsigned int len)
+{
+ char sysrq_key = '\0';
+ struct xenbus_transaction xbt;
+ int err;
+
+ again:
+ err = xenbus_transaction_start(&xbt);
+ if (err)
+ return;
+ if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
+ printk(KERN_ERR "Unable to read sysrq code in "
+ "control/sysrq\n");
+ xenbus_transaction_end(xbt, 1);
+ return;
+ }
+
+ if (sysrq_key != '\0')
+ xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN)
+ goto again;
+
+ if (sysrq_key != '\0')
+ handle_sysrq(sysrq_key, NULL);
+}
+
+static struct xenbus_watch shutdown_watch = {
+ .node = "control/shutdown",
+ .callback = shutdown_handler
+};
+
+static struct xenbus_watch sysrq_watch = {
+ .node = "control/sysrq",
+ .callback = sysrq_handler
+};
+
+static int setup_shutdown_watcher(void)
+{
+ int err;
+
+ err = register_xenbus_watch(&shutdown_watch);
+ if (err) {
+ printk(KERN_ERR "Failed to set shutdown watcher\n");
+ return err;
+ }
+
+ err = register_xenbus_watch(&sysrq_watch);
+ if (err) {
+ printk(KERN_ERR "Failed to set sysrq watcher\n");
+ return err;
+ }
+
+ return 0;
+}
+
+static int shutdown_event(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ setup_shutdown_watcher();
+ return NOTIFY_DONE;
+}
+
+static int __init setup_shutdown_event(void)
+{
+ static struct notifier_block xenstore_notifier = {
+ .notifier_call = shutdown_event
+ };
+ register_xenstore_notifier(&xenstore_notifier);
+
+ return 0;
+}
+
+subsys_initcall(setup_shutdown_event);
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
new file mode 100644
index 000000000000..4ae038aa6c24
--- /dev/null
+++ b/arch/i386/xen/mmu.c
@@ -0,0 +1,564 @@
+/*
+ * Xen mmu operations
+ *
+ * This file contains the various mmu fetch and update operations.
+ * The most important job they must perform is the mapping between the
+ * domain's pfn and the overall machine mfns.
+ *
+ * Xen allows guests to directly update the pagetable, in a controlled
+ * fashion. In other words, the guest modifies the same pagetable
+ * that the CPU actually uses, which eliminates the overhead of having
+ * a separate shadow pagetable.
+ *
+ * In order to allow this, it falls on the guest domain to map its
+ * notion of a "physical" pfn - which is just a domain-local linear
+ * address - into a real "machine address" which the CPU's MMU can
+ * use.
+ *
+ * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
+ * inserted directly into the pagetable. When creating a new
+ * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
+ * when reading the content back with __(pgd|pmd|pte)_val, it converts
+ * the mfn back into a pfn.
+ *
+ * The other constraint is that all pages which make up a pagetable
+ * must be mapped read-only in the guest. This prevents uncontrolled
+ * guest updates to the pagetable. Xen strictly enforces this, and
+ * will disallow any pagetable update which will end up mapping a
+ * pagetable page RW, and will disallow using any writable page as a
+ * pagetable.
+ *
+ * Naively, when loading %cr3 with the base of a new pagetable, Xen
+ * would need to validate the whole pagetable before going on.
+ * Naturally, this is quite slow. The solution is to "pin" a
+ * pagetable, which enforces all the constraints on the pagetable even
+ * when it is not actively in use. This menas that Xen can be assured
+ * that it is still valid when you do load it into %cr3, and doesn't
+ * need to revalidate it.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/bug.h>
+#include <linux/sched.h>
+
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/paravirt.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+
+#include "multicalls.h"
+#include "mmu.h"
+
+xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+{
+ pte_t *pte = lookup_address(address);
+ unsigned offset = address & PAGE_MASK;
+
+ BUG_ON(pte == NULL);
+
+ return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+}
+
+void make_lowmem_page_readonly(void *vaddr)
+{
+ pte_t *pte, ptev;
+ unsigned long address = (unsigned long)vaddr;
+
+ pte = lookup_address(address);
+ BUG_ON(pte == NULL);
+
+ ptev = pte_wrprotect(*pte);
+
+ if (HYPERVISOR_update_va_mapping(address, ptev, 0))
+ BUG();
+}
+
+void make_lowmem_page_readwrite(void *vaddr)
+{
+ pte_t *pte, ptev;
+ unsigned long address = (unsigned long)vaddr;
+
+ pte = lookup_address(address);
+ BUG_ON(pte == NULL);
+
+ ptev = pte_mkwrite(*pte);
+
+ if (HYPERVISOR_update_va_mapping(address, ptev, 0))
+ BUG();
+}
+
+
+void xen_set_pmd(pmd_t *ptr, pmd_t val)
+{
+ struct multicall_space mcs;
+ struct mmu_update *u;
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*u));
+ u = mcs.args;
+ u->ptr = virt_to_machine(ptr).maddr;
+ u->val = pmd_val_ma(val);
+ MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ */
+void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ if (pgd_none(*pgd)) {
+ BUG();
+ return;
+ }
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ BUG();
+ return;
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ BUG();
+ return;
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+ /* <mfn,flags> stored as-is, to permit clearing entries */
+ xen_set_pte(pte, mfn_pte(mfn, flags));
+
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
+
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval)
+{
+ if (mm == current->mm || mm == &init_mm) {
+ if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+ struct multicall_space mcs;
+ mcs = xen_mc_entry(0);
+
+ MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+ return;
+ } else
+ if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
+ return;
+ }
+ xen_set_pte(ptep, pteval);
+}
+
+#ifdef CONFIG_X86_PAE
+void xen_set_pud(pud_t *ptr, pud_t val)
+{
+ struct multicall_space mcs;
+ struct mmu_update *u;
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*u));
+ u = mcs.args;
+ u->ptr = virt_to_machine(ptr).maddr;
+ u->val = pud_val_ma(val);
+ MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ set_64bit((u64 *)ptep, pte_val_ma(pte));
+}
+
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ ptep->pte_low = 0;
+ smp_wmb(); /* make sure low gets written first */
+ ptep->pte_high = 0;
+}
+
+void xen_pmd_clear(pmd_t *pmdp)
+{
+ xen_set_pmd(pmdp, __pmd(0));
+}
+
+unsigned long long xen_pte_val(pte_t pte)
+{
+ unsigned long long ret = 0;
+
+ if (pte.pte_low) {
+ ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
+ ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ }
+
+ return ret;
+}
+
+unsigned long long xen_pmd_val(pmd_t pmd)
+{
+ unsigned long long ret = pmd.pmd;
+ if (ret)
+ ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ return ret;
+}
+
+unsigned long long xen_pgd_val(pgd_t pgd)
+{
+ unsigned long long ret = pgd.pgd;
+ if (ret)
+ ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ return ret;
+}
+
+pte_t xen_make_pte(unsigned long long pte)
+{
+ if (pte & 1)
+ pte = phys_to_machine(XPADDR(pte)).maddr;
+
+ return (pte_t){ pte, pte >> 32 };
+}
+
+pmd_t xen_make_pmd(unsigned long long pmd)
+{
+ if (pmd & 1)
+ pmd = phys_to_machine(XPADDR(pmd)).maddr;
+
+ return (pmd_t){ pmd };
+}
+
+pgd_t xen_make_pgd(unsigned long long pgd)
+{
+ if (pgd & _PAGE_PRESENT)
+ pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+ return (pgd_t){ pgd };
+}
+#else /* !PAE */
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+ *ptep = pte;
+}
+
+unsigned long xen_pte_val(pte_t pte)
+{
+ unsigned long ret = pte.pte_low;
+
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr;
+
+ return ret;
+}
+
+unsigned long xen_pgd_val(pgd_t pgd)
+{
+ unsigned long ret = pgd.pgd;
+ if (ret)
+ ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ return ret;
+}
+
+pte_t xen_make_pte(unsigned long pte)
+{
+ if (pte & _PAGE_PRESENT)
+ pte = phys_to_machine(XPADDR(pte)).maddr;
+
+ return (pte_t){ pte };
+}
+
+pgd_t xen_make_pgd(unsigned long pgd)
+{
+ if (pgd & _PAGE_PRESENT)
+ pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+ return (pgd_t){ pgd };
+}
+#endif /* CONFIG_X86_PAE */
+
+
+
+/*
+ (Yet another) pagetable walker. This one is intended for pinning a
+ pagetable. This means that it walks a pagetable and calls the
+ callback function on each page it finds making up the page table,
+ at every level. It walks the entire pagetable, but it only bothers
+ pinning pte pages which are below pte_limit. In the normal case
+ this will be TASK_SIZE, but at boot we need to pin up to
+ FIXADDR_TOP. But the important bit is that we don't pin beyond
+ there, because then we start getting into Xen's ptes.
+*/
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+ unsigned long limit)
+{
+ pgd_t *pgd = pgd_base;
+ int flush = 0;
+ unsigned long addr = 0;
+ unsigned long pgd_next;
+
+ BUG_ON(limit > FIXADDR_TOP);
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+ pud_t *pud;
+ unsigned long pud_limit, pud_next;
+
+ pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
+
+ if (!pgd_val(*pgd))
+ continue;
+
+ pud = pud_offset(pgd, 0);
+
+ if (PTRS_PER_PUD > 1) /* not folded */
+ flush |= (*func)(virt_to_page(pud), 0);
+
+ for (; addr != pud_limit; pud++, addr = pud_next) {
+ pmd_t *pmd;
+ unsigned long pmd_limit;
+
+ pud_next = pud_addr_end(addr, pud_limit);
+
+ if (pud_next < limit)
+ pmd_limit = pud_next;
+ else
+ pmd_limit = limit;
+
+ if (pud_none(*pud))
+ continue;
+
+ pmd = pmd_offset(pud, 0);
+
+ if (PTRS_PER_PMD > 1) /* not folded */
+ flush |= (*func)(virt_to_page(pmd), 0);
+
+ for (; addr != pmd_limit; pmd++) {
+ addr += (PAGE_SIZE * PTRS_PER_PTE);
+ if ((pmd_limit-1) < (addr-1)) {
+ addr = pmd_limit;
+ break;
+ }
+
+ if (pmd_none(*pmd))
+ continue;
+
+ flush |= (*func)(pmd_page(*pmd), 0);
+ }
+ }
+ }
+
+ flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+
+ return flush;
+}
+
+static int pin_page(struct page *page, unsigned flags)
+{
+ unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
+ int flush;
+
+ if (pgfl)
+ flush = 0; /* already pinned */
+ else if (PageHighMem(page))
+ /* kmaps need flushing if we found an unpinned
+ highpage */
+ flush = 1;
+ else {
+ void *pt = lowmem_page_address(page);
+ unsigned long pfn = page_to_pfn(page);
+ struct multicall_space mcs = __xen_mc_entry(0);
+
+ flush = 0;
+
+ MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+ pfn_pte(pfn, PAGE_KERNEL_RO),
+ flags);
+ }
+
+ return flush;
+}
+
+/* This is called just after a mm has been created, but it has not
+ been used yet. We need to make sure that its pagetable is all
+ read-only, and can be pinned. */
+void xen_pgd_pin(pgd_t *pgd)
+{
+ struct multicall_space mcs;
+ struct mmuext_op *op;
+
+ xen_mc_batch();
+
+ if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
+ /* re-enable interrupts for kmap_flush_unused */
+ xen_mc_issue(0);
+ kmap_flush_unused();
+ xen_mc_batch();
+ }
+
+ mcs = __xen_mc_entry(sizeof(*op));
+ op = mcs.args;
+
+#ifdef CONFIG_X86_PAE
+ op->cmd = MMUEXT_PIN_L3_TABLE;
+#else
+ op->cmd = MMUEXT_PIN_L2_TABLE;
+#endif
+ op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(0);
+}
+
+/* The init_mm pagetable is really pinned as soon as its created, but
+ that's before we have page structures to store the bits. So do all
+ the book-keeping now. */
+static __init int mark_pinned(struct page *page, unsigned flags)
+{
+ SetPagePinned(page);
+ return 0;
+}
+
+void __init xen_mark_init_mm_pinned(void)
+{
+ pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
+}
+
+static int unpin_page(struct page *page, unsigned flags)
+{
+ unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
+
+ if (pgfl && !PageHighMem(page)) {
+ void *pt = lowmem_page_address(page);
+ unsigned long pfn = page_to_pfn(page);
+ struct multicall_space mcs = __xen_mc_entry(0);
+
+ MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+ pfn_pte(pfn, PAGE_KERNEL),
+ flags);
+ }
+
+ return 0; /* never need to flush on unpin */
+}
+
+/* Release a pagetables pages back as normal RW */
+static void xen_pgd_unpin(pgd_t *pgd)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ xen_mc_batch();
+
+ mcs = __xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_UNPIN_TABLE;
+ op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ pgd_walk(pgd, unpin_page, TASK_SIZE);
+
+ xen_mc_issue(0);
+}
+
+void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
+{
+ spin_lock(&next->page_table_lock);
+ xen_pgd_pin(next->pgd);
+ spin_unlock(&next->page_table_lock);
+}
+
+void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ spin_lock(&mm->page_table_lock);
+ xen_pgd_pin(mm->pgd);
+ spin_unlock(&mm->page_table_lock);
+}
+
+
+#ifdef CONFIG_SMP
+/* Another cpu may still have their %cr3 pointing at the pagetable, so
+ we need to repoint it somewhere else before we can unpin it. */
+static void drop_other_mm_ref(void *info)
+{
+ struct mm_struct *mm = info;
+
+ if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
+ leave_mm(smp_processor_id());
+}
+
+static void drop_mm_ref(struct mm_struct *mm)
+{
+ if (current->active_mm == mm) {
+ if (current->mm == mm)
+ load_cr3(swapper_pg_dir);
+ else
+ leave_mm(smp_processor_id());
+ }
+
+ if (!cpus_empty(mm->cpu_vm_mask))
+ xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
+ mm, 1);
+}
+#else
+static void drop_mm_ref(struct mm_struct *mm)
+{
+ if (current->active_mm == mm)
+ load_cr3(swapper_pg_dir);
+}
+#endif
+
+/*
+ * While a process runs, Xen pins its pagetables, which means that the
+ * hypervisor forces it to be read-only, and it controls all updates
+ * to it. This means that all pagetable updates have to go via the
+ * hypervisor, which is moderately expensive.
+ *
+ * Since we're pulling the pagetable down, we switch to use init_mm,
+ * unpin old process pagetable and mark it all read-write, which
+ * allows further operations on it to be simple memory accesses.
+ *
+ * The only subtle point is that another CPU may be still using the
+ * pagetable because of lazy tlb flushing. This means we need need to
+ * switch all CPUs off this pagetable before we can unpin it.
+ */
+void xen_exit_mmap(struct mm_struct *mm)
+{
+ get_cpu(); /* make sure we don't move around */
+ drop_mm_ref(mm);
+ put_cpu();
+
+ spin_lock(&mm->page_table_lock);
+ xen_pgd_unpin(mm->pgd);
+ spin_unlock(&mm->page_table_lock);
+}
diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h
new file mode 100644
index 000000000000..c9ff27f3ac3a
--- /dev/null
+++ b/arch/i386/xen/mmu.h
@@ -0,0 +1,60 @@
+#ifndef _XEN_MMU_H
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ *
+ * Note that Xen is using the fact that the pagetable base is always
+ * page-aligned, and putting the 12 MSB of the address into the 12 LSB
+ * of cr3.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+
+
+void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+
+void xen_set_pte(pte_t *ptep, pte_t pteval);
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval);
+void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
+
+void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
+void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+void xen_exit_mmap(struct mm_struct *mm);
+
+void xen_pgd_pin(pgd_t *pgd);
+//void xen_pgd_unpin(pgd_t *pgd);
+
+#ifdef CONFIG_X86_PAE
+unsigned long long xen_pte_val(pte_t);
+unsigned long long xen_pmd_val(pmd_t);
+unsigned long long xen_pgd_val(pgd_t);
+
+pte_t xen_make_pte(unsigned long long);
+pmd_t xen_make_pmd(unsigned long long);
+pgd_t xen_make_pgd(unsigned long long);
+
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval);
+void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
+void xen_set_pud(pud_t *ptr, pud_t val);
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void xen_pmd_clear(pmd_t *pmdp);
+
+
+#else
+unsigned long xen_pte_val(pte_t);
+unsigned long xen_pmd_val(pmd_t);
+unsigned long xen_pgd_val(pgd_t);
+
+pte_t xen_make_pte(unsigned long);
+pmd_t xen_make_pmd(unsigned long);
+pgd_t xen_make_pgd(unsigned long);
+#endif
+
+#endif /* _XEN_MMU_H */
diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c
new file mode 100644
index 000000000000..c837e8e463db
--- /dev/null
+++ b/arch/i386/xen/multicalls.c
@@ -0,0 +1,90 @@
+/*
+ * Xen hypercall batching.
+ *
+ * Xen allows multiple hypercalls to be issued at once, using the
+ * multicall interface. This allows the cost of trapping into the
+ * hypervisor to be amortized over several calls.
+ *
+ * This file implements a simple interface for multicalls. There's a
+ * per-cpu buffer of outstanding multicalls. When you want to queue a
+ * multicall for issuing, you can allocate a multicall slot for the
+ * call and its arguments, along with storage for space which is
+ * pointed to by the arguments (for passing pointers to structures,
+ * etc). When the multicall is actually issued, all the space for the
+ * commands and allocated memory is freed for reuse.
+ *
+ * Multicalls are flushed whenever any of the buffers get full, or
+ * when explicitly requested. There's no way to get per-multicall
+ * return results back. It will BUG if any of the multicalls fail.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+
+#include <asm/xen/hypercall.h>
+
+#include "multicalls.h"
+
+#define MC_BATCH 32
+#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
+
+struct mc_buffer {
+ struct multicall_entry entries[MC_BATCH];
+ u64 args[MC_ARGS];
+ unsigned mcidx, argidx;
+};
+
+static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
+DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+void xen_mc_flush(void)
+{
+ struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ int ret = 0;
+ unsigned long flags;
+
+ BUG_ON(preemptible());
+
+ /* Disable interrupts in case someone comes in and queues
+ something in the middle */
+ local_irq_save(flags);
+
+ if (b->mcidx) {
+ int i;
+
+ if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
+ BUG();
+ for (i = 0; i < b->mcidx; i++)
+ if (b->entries[i].result < 0)
+ ret++;
+ b->mcidx = 0;
+ b->argidx = 0;
+ } else
+ BUG_ON(b->argidx != 0);
+
+ local_irq_restore(flags);
+
+ BUG_ON(ret);
+}
+
+struct multicall_space __xen_mc_entry(size_t args)
+{
+ struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ struct multicall_space ret;
+ unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
+
+ BUG_ON(preemptible());
+ BUG_ON(argspace > MC_ARGS);
+
+ if (b->mcidx == MC_BATCH ||
+ (b->argidx + argspace) > MC_ARGS)
+ xen_mc_flush();
+
+ ret.mc = &b->entries[b->mcidx];
+ b->mcidx++;
+ ret.args = &b->args[b->argidx];
+ b->argidx += argspace;
+
+ return ret;
+}
diff --git a/arch/i386/xen/multicalls.h b/arch/i386/xen/multicalls.h
new file mode 100644
index 000000000000..e6f7530b156c
--- /dev/null
+++ b/arch/i386/xen/multicalls.h
@@ -0,0 +1,45 @@
+#ifndef _XEN_MULTICALLS_H
+#define _XEN_MULTICALLS_H
+
+#include "xen-ops.h"
+
+/* Multicalls */
+struct multicall_space
+{
+ struct multicall_entry *mc;
+ void *args;
+};
+
+/* Allocate room for a multicall and its args */
+struct multicall_space __xen_mc_entry(size_t args);
+
+DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+/* Call to start a batch of multiple __xen_mc_entry()s. Must be
+ paired with xen_mc_issue() */
+static inline void xen_mc_batch(void)
+{
+ /* need to disable interrupts until this entry is complete */
+ local_irq_save(__get_cpu_var(xen_mc_irq_flags));
+}
+
+static inline struct multicall_space xen_mc_entry(size_t args)
+{
+ xen_mc_batch();
+ return __xen_mc_entry(args);
+}
+
+/* Flush all pending multicalls */
+void xen_mc_flush(void);
+
+/* Issue a multicall if we're not in a lazy mode */
+static inline void xen_mc_issue(unsigned mode)
+{
+ if ((xen_get_lazy_mode() & mode) == 0)
+ xen_mc_flush();
+
+ /* restore flags saved in xen_mc_batch */
+ local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+}
+
+#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c
new file mode 100644
index 000000000000..f84e77226646
--- /dev/null
+++ b/arch/i386/xen/setup.c
@@ -0,0 +1,111 @@
+/*
+ * Machine specific setup for xen
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/pm.h>
+
+#include <asm/elf.h>
+#include <asm/e820.h>
+#include <asm/setup.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/interface/physdev.h>
+#include <xen/features.h>
+
+#include "xen-ops.h"
+#include "vdso.h"
+
+/* These are code, but not functions. Defined in entry.S */
+extern const char xen_hypervisor_callback[];
+extern const char xen_failsafe_callback[];
+
+unsigned long *phys_to_machine_mapping;
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+/**
+ * machine_specific_memory_setup - Hook for machine specific memory setup.
+ **/
+
+char * __init xen_memory_setup(void)
+{
+ unsigned long max_pfn = xen_start_info->nr_pages;
+
+ e820.nr_map = 0;
+ add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
+
+ return "Xen";
+}
+
+static void xen_idle(void)
+{
+ local_irq_disable();
+
+ if (need_resched())
+ local_irq_enable();
+ else {
+ current_thread_info()->status &= ~TS_POLLING;
+ smp_mb__after_clear_bit();
+ safe_halt();
+ current_thread_info()->status |= TS_POLLING;
+ }
+}
+
+/*
+ * Set the bit indicating "nosegneg" library variants should be used.
+ */
+static void fiddle_vdso(void)
+{
+ extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */
+ extern char vsyscall_int80_start;
+ u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK +
+ &vsyscall_int80_start);
+ *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+}
+
+void __init xen_arch_setup(void)
+{
+ struct physdev_set_iopl set_iopl;
+ int rc;
+
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
+
+ HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
+ __KERNEL_CS, (unsigned long)xen_failsafe_callback);
+
+ set_iopl.iopl = 1;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+ if (rc != 0)
+ printk(KERN_INFO "physdev_op failed %d\n", rc);
+
+#ifdef CONFIG_ACPI
+ if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
+ printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+ disable_acpi();
+ }
+#endif
+
+ memcpy(boot_command_line, xen_start_info->cmd_line,
+ MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
+ COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
+
+ pm_idle = xen_idle;
+
+#ifdef CONFIG_SMP
+ /* fill cpus_possible with all available cpus */
+ xen_fill_possible_map();
+#endif
+
+ paravirt_disable_iospace();
+
+ fiddle_vdso();
+}
diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c
new file mode 100644
index 000000000000..557b8e24706a
--- /dev/null
+++ b/arch/i386/xen/smp.c
@@ -0,0 +1,404 @@
+/*
+ * Xen SMP support
+ *
+ * This file implements the Xen versions of smp_ops. SMP under Xen is
+ * very straightforward. Bringing a CPU up is simply a matter of
+ * loading its initial context and setting it running.
+ *
+ * IPIs are handled through the Xen event mechanism.
+ *
+ * Because virtual CPUs can be scheduled onto any real CPU, there's no
+ * useful topology information for the kernel to make use of. As a
+ * result, all CPUs are treated as if they're single-core and
+ * single-threaded.
+ *
+ * This does not handle HOTPLUG_CPU yet.
+ */
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/smp.h>
+
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/cpu.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/page.h>
+#include <xen/events.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+
+static cpumask_t cpu_initialized_map;
+static DEFINE_PER_CPU(int, resched_irq);
+static DEFINE_PER_CPU(int, callfunc_irq);
+
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ */
+static DEFINE_SPINLOCK(call_lock);
+
+struct call_data_struct {
+ void (*func) (void *info);
+ void *info;
+ atomic_t started;
+ atomic_t finished;
+ int wait;
+};
+
+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
+
+static struct call_data_struct *call_data;
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */
+static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
+{
+ return IRQ_HANDLED;
+}
+
+static __cpuinit void cpu_bringup_and_idle(void)
+{
+ int cpu = smp_processor_id();
+
+ cpu_init();
+
+ preempt_disable();
+ per_cpu(cpu_state, cpu) = CPU_ONLINE;
+
+ xen_setup_cpu_clockevents();
+
+ /* We can take interrupts now: we're officially "up". */
+ local_irq_enable();
+
+ wmb(); /* make sure everything is out */
+ cpu_idle();
+}
+
+static int xen_smp_intr_init(unsigned int cpu)
+{
+ int rc;
+ const char *resched_name, *callfunc_name;
+
+ per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
+
+ resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
+ rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
+ cpu,
+ xen_reschedule_interrupt,
+ IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ resched_name,
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(resched_irq, cpu) = rc;
+
+ callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
+ rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
+ cpu,
+ xen_call_function_interrupt,
+ IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ callfunc_name,
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(callfunc_irq, cpu) = rc;
+
+ return 0;
+
+ fail:
+ if (per_cpu(resched_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+ if (per_cpu(callfunc_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+ return rc;
+}
+
+void __init xen_fill_possible_map(void)
+{
+ int i, rc;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+ if (rc >= 0)
+ cpu_set(i, cpu_possible_map);
+ }
+}
+
+void __init xen_smp_prepare_boot_cpu(void)
+{
+ int cpu;
+
+ BUG_ON(smp_processor_id() != 0);
+ native_smp_prepare_boot_cpu();
+
+ /* We've switched to the "real" per-cpu gdt, so make sure the
+ old memory can be recycled */
+ make_lowmem_page_readwrite(&per_cpu__gdt_page);
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ cpus_clear(cpu_sibling_map[cpu]);
+ cpus_clear(cpu_core_map[cpu]);
+ }
+
+ xen_setup_vcpu_info_placement();
+}
+
+void __init xen_smp_prepare_cpus(unsigned int max_cpus)
+{
+ unsigned cpu;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ cpus_clear(cpu_sibling_map[cpu]);
+ cpus_clear(cpu_core_map[cpu]);
+ }
+
+ smp_store_cpu_info(0);
+ set_cpu_sibling_map(0);
+
+ if (xen_smp_intr_init(0))
+ BUG();
+
+ cpu_initialized_map = cpumask_of_cpu(0);
+
+ /* Restrict the possible_map according to max_cpus. */
+ while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
+ for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
+ continue;
+ cpu_clear(cpu, cpu_possible_map);
+ }
+
+ for_each_possible_cpu (cpu) {
+ struct task_struct *idle;
+
+ if (cpu == 0)
+ continue;
+
+ idle = fork_idle(cpu);
+ if (IS_ERR(idle))
+ panic("failed fork for CPU %d", cpu);
+
+ cpu_set(cpu, cpu_present_map);
+ }
+
+ //init_xenbus_allowed_cpumask();
+}
+
+static __cpuinit int
+cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
+{
+ struct vcpu_guest_context *ctxt;
+ struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
+
+ if (cpu_test_and_set(cpu, cpu_initialized_map))
+ return 0;
+
+ ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+ if (ctxt == NULL)
+ return -ENOMEM;
+
+ ctxt->flags = VGCF_IN_KERNEL;
+ ctxt->user_regs.ds = __USER_DS;
+ ctxt->user_regs.es = __USER_DS;
+ ctxt->user_regs.fs = __KERNEL_PERCPU;
+ ctxt->user_regs.gs = 0;
+ ctxt->user_regs.ss = __KERNEL_DS;
+ ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+ ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+
+ memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
+
+ xen_copy_trap_info(ctxt->trap_ctxt);
+
+ ctxt->ldt_ents = 0;
+
+ BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
+ make_lowmem_page_readonly(gdt->gdt);
+
+ ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
+ ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
+
+ ctxt->user_regs.cs = __KERNEL_CS;
+ ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
+
+ ctxt->kernel_ss = __KERNEL_DS;
+ ctxt->kernel_sp = idle->thread.esp0;
+
+ ctxt->event_callback_cs = __KERNEL_CS;
+ ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
+ ctxt->failsafe_callback_cs = __KERNEL_CS;
+ ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
+
+ per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+ ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
+ BUG();
+
+ kfree(ctxt);
+ return 0;
+}
+
+int __cpuinit xen_cpu_up(unsigned int cpu)
+{
+ struct task_struct *idle = idle_task(cpu);
+ int rc;
+
+#if 0
+ rc = cpu_up_check(cpu);
+ if (rc)
+ return rc;
+#endif
+
+ init_gdt(cpu);
+ per_cpu(current_task, cpu) = idle;
+ irq_ctx_init(cpu);
+ xen_setup_timer(cpu);
+
+ /* make sure interrupts start blocked */
+ per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
+
+ rc = cpu_initialize_context(cpu, idle);
+ if (rc)
+ return rc;
+
+ if (num_online_cpus() == 1)
+ alternatives_smp_switch(1);
+
+ rc = xen_smp_intr_init(cpu);
+ if (rc)
+ return rc;
+
+ smp_store_cpu_info(cpu);
+ set_cpu_sibling_map(cpu);
+ /* This must be done before setting cpu_online_map */
+ wmb();
+
+ cpu_set(cpu, cpu_online_map);
+
+ rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
+ BUG_ON(rc);
+
+ return 0;
+}
+
+void xen_smp_cpus_done(unsigned int max_cpus)
+{
+}
+
+static void stop_self(void *v)
+{
+ int cpu = smp_processor_id();
+
+ /* make sure we're not pinning something down */
+ load_cr3(swapper_pg_dir);
+ /* should set up a minimal gdt */
+
+ HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
+ BUG();
+}
+
+void xen_smp_send_stop(void)
+{
+ smp_call_function(stop_self, NULL, 0, 0);
+}
+
+void xen_smp_send_reschedule(int cpu)
+{
+ xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
+}
+
+
+static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
+{
+ unsigned cpu;
+
+ cpus_and(mask, mask, cpu_online_map);
+
+ for_each_cpu_mask(cpu, mask)
+ xen_send_IPI_one(cpu, vector);
+}
+
+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
+{
+ void (*func) (void *info) = call_data->func;
+ void *info = call_data->info;
+ int wait = call_data->wait;
+
+ /*
+ * Notify initiating CPU that I've grabbed the data and am
+ * about to execute the function
+ */
+ mb();
+ atomic_inc(&call_data->started);
+ /*
+ * At this point the info structure may be out of scope unless wait==1
+ */
+ irq_enter();
+ (*func)(info);
+ irq_exit();
+
+ if (wait) {
+ mb(); /* commit everything before setting finished */
+ atomic_inc(&call_data->finished);
+ }
+
+ return IRQ_HANDLED;
+}
+
+int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
+ void *info, int wait)
+{
+ struct call_data_struct data;
+ int cpus;
+
+ /* Holding any lock stops cpus from going down. */
+ spin_lock(&call_lock);
+
+ cpu_clear(smp_processor_id(), mask);
+
+ cpus = cpus_weight(mask);
+ if (!cpus) {
+ spin_unlock(&call_lock);
+ return 0;
+ }
+
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON(irqs_disabled());
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ mb(); /* write everything before IPI */
+
+ /* Send a message to other CPUs and wait for them to respond */
+ xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
+
+ /* Make sure other vcpus get a chance to run.
+ XXX too severe? Maybe we should check the other CPU's states? */
+ HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus ||
+ (wait && atomic_read(&data.finished) != cpus))
+ cpu_relax();
+
+ spin_unlock(&call_lock);
+
+ return 0;
+}
diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c
new file mode 100644
index 000000000000..dfd6db69ead5
--- /dev/null
+++ b/arch/i386/xen/time.c
@@ -0,0 +1,593 @@
+/*
+ * Xen time implementation.
+ *
+ * This is implemented in terms of a clocksource driver which uses
+ * the hypervisor clock as a nanosecond timebase, and a clockevent
+ * driver which uses the hypervisor's timer mechanism.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/kernel_stat.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+
+#include "xen-ops.h"
+
+#define XEN_SHIFT 22
+
+/* Xen may fire a timer up to this many ns early */
+#define TIMER_SLOP 100000
+#define NS_PER_TICK (1000000000LL / HZ)
+
+static cycle_t xen_clocksource_read(void);
+
+/* These are perodically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+ u64 tsc_timestamp; /* TSC at last update of time vals. */
+ u64 system_timestamp; /* Time, in nanosecs, since boot. */
+ u32 tsc_to_nsec_mul;
+ int tsc_shift;
+ u32 version;
+};
+
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+
+/* runstate info updated by Xen */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+
+/* snapshots of runstate info */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
+
+/* unused ns of stolen and blocked time */
+static DEFINE_PER_CPU(u64, residual_stolen);
+static DEFINE_PER_CPU(u64, residual_blocked);
+
+/* return an consistent snapshot of 64-bit time/counter value */
+static u64 get64(const u64 *p)
+{
+ u64 ret;
+
+ if (BITS_PER_LONG < 64) {
+ u32 *p32 = (u32 *)p;
+ u32 h, l;
+
+ /*
+ * Read high then low, and then make sure high is
+ * still the same; this will only loop if low wraps
+ * and carries into high.
+ * XXX some clean way to make this endian-proof?
+ */
+ do {
+ h = p32[1];
+ barrier();
+ l = p32[0];
+ barrier();
+ } while (p32[1] != h);
+
+ ret = (((u64)h) << 32) | l;
+ } else
+ ret = *p;
+
+ return ret;
+}
+
+/*
+ * Runstate accounting
+ */
+static void get_runstate_snapshot(struct vcpu_runstate_info *res)
+{
+ u64 state_time;
+ struct vcpu_runstate_info *state;
+
+ BUG_ON(preemptible());
+
+ state = &__get_cpu_var(runstate);
+
+ /*
+ * The runstate info is always updated by the hypervisor on
+ * the current CPU, so there's no need to use anything
+ * stronger than a compiler barrier when fetching it.
+ */
+ do {
+ state_time = get64(&state->state_entry_time);
+ barrier();
+ *res = *state;
+ barrier();
+ } while (get64(&state->state_entry_time) != state_time);
+}
+
+static void setup_runstate_info(int cpu)
+{
+ struct vcpu_register_runstate_memory_area area;
+
+ area.addr.v = &per_cpu(runstate, cpu);
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
+ cpu, &area))
+ BUG();
+}
+
+static void do_stolen_accounting(void)
+{
+ struct vcpu_runstate_info state;
+ struct vcpu_runstate_info *snap;
+ s64 blocked, runnable, offline, stolen;
+ cputime_t ticks;
+
+ get_runstate_snapshot(&state);
+
+ WARN_ON(state.state != RUNSTATE_running);
+
+ snap = &__get_cpu_var(runstate_snapshot);
+
+ /* work out how much time the VCPU has not been runn*ing* */
+ blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
+ runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
+ offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
+
+ *snap = state;
+
+ /* Add the appropriate number of ticks of stolen time,
+ including any left-overs from last time. Passing NULL to
+ account_steal_time accounts the time as stolen. */
+ stolen = runnable + offline + __get_cpu_var(residual_stolen);
+
+ if (stolen < 0)
+ stolen = 0;
+
+ ticks = 0;
+ while (stolen >= NS_PER_TICK) {
+ ticks++;
+ stolen -= NS_PER_TICK;
+ }
+ __get_cpu_var(residual_stolen) = stolen;
+ account_steal_time(NULL, ticks);
+
+ /* Add the appropriate number of ticks of blocked time,
+ including any left-overs from last time. Passing idle to
+ account_steal_time accounts the time as idle/wait. */
+ blocked += __get_cpu_var(residual_blocked);
+
+ if (blocked < 0)
+ blocked = 0;
+
+ ticks = 0;
+ while (blocked >= NS_PER_TICK) {
+ ticks++;
+ blocked -= NS_PER_TICK;
+ }
+ __get_cpu_var(residual_blocked) = blocked;
+ account_steal_time(idle_task(smp_processor_id()), ticks);
+}
+
+/*
+ * Xen sched_clock implementation. Returns the number of unstolen
+ * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
+ * states.
+ */
+unsigned long long xen_sched_clock(void)
+{
+ struct vcpu_runstate_info state;
+ cycle_t now;
+ u64 ret;
+ s64 offset;
+
+ /*
+ * Ideally sched_clock should be called on a per-cpu basis
+ * anyway, so preempt should already be disabled, but that's
+ * not current practice at the moment.
+ */
+ preempt_disable();
+
+ now = xen_clocksource_read();
+
+ get_runstate_snapshot(&state);
+
+ WARN_ON(state.state != RUNSTATE_running);
+
+ offset = now - state.state_entry_time;
+ if (offset < 0)
+ offset = 0;
+
+ ret = state.time[RUNSTATE_blocked] +
+ state.time[RUNSTATE_running] +
+ offset;
+
+ preempt_enable();
+
+ return ret;
+}
+
+
+/* Get the CPU speed from Xen */
+unsigned long xen_cpu_khz(void)
+{
+ u64 cpu_khz = 1000000ULL << 32;
+ const struct vcpu_time_info *info =
+ &HYPERVISOR_shared_info->vcpu_info[0].time;
+
+ do_div(cpu_khz, info->tsc_to_system_mul);
+ if (info->tsc_shift < 0)
+ cpu_khz <<= -info->tsc_shift;
+ else
+ cpu_khz >>= info->tsc_shift;
+
+ return cpu_khz;
+}
+
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area.
+ */
+static unsigned get_time_values_from_xen(void)
+{
+ struct vcpu_time_info *src;
+ struct shadow_time_info *dst;
+
+ /* src is shared memory with the hypervisor, so we need to
+ make sure we get a consistent snapshot, even in the face of
+ being preempted. */
+ src = &__get_cpu_var(xen_vcpu)->time;
+ dst = &__get_cpu_var(shadow_time);
+
+ do {
+ dst->version = src->version;
+ rmb(); /* fetch version before data */
+ dst->tsc_timestamp = src->tsc_timestamp;
+ dst->system_timestamp = src->system_time;
+ dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
+ dst->tsc_shift = src->tsc_shift;
+ rmb(); /* test version after fetching data */
+ } while ((src->version & 1) | (dst->version ^ src->version));
+
+ return dst->version;
+}
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+ u64 product;
+#ifdef __i386__
+ u32 tmp1, tmp2;
+#endif
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#ifdef __i386__
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif __x86_64__
+ __asm__ (
+ "mul %%rdx ; shrd $32,%%rdx,%%rax"
+ : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+ return product;
+}
+
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+ u64 now, delta;
+ now = native_read_tsc();
+ delta = now - shadow->tsc_timestamp;
+ return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+static cycle_t xen_clocksource_read(void)
+{
+ struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
+ cycle_t ret;
+ unsigned version;
+
+ do {
+ version = get_time_values_from_xen();
+ barrier();
+ ret = shadow->system_timestamp + get_nsec_offset(shadow);
+ barrier();
+ } while (version != __get_cpu_var(xen_vcpu)->time.version);
+
+ put_cpu_var(shadow_time);
+
+ return ret;
+}
+
+static void xen_read_wallclock(struct timespec *ts)
+{
+ const struct shared_info *s = HYPERVISOR_shared_info;
+ u32 version;
+ u64 delta;
+ struct timespec now;
+
+ /* get wallclock at system boot */
+ do {
+ version = s->wc_version;
+ rmb(); /* fetch version before time */
+ now.tv_sec = s->wc_sec;
+ now.tv_nsec = s->wc_nsec;
+ rmb(); /* fetch time before checking version */
+ } while ((s->wc_version & 1) | (version ^ s->wc_version));
+
+ delta = xen_clocksource_read(); /* time since system boot */
+ delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+
+ now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+ now.tv_sec = delta;
+
+ set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+}
+
+unsigned long xen_get_wallclock(void)
+{
+ struct timespec ts;
+
+ xen_read_wallclock(&ts);
+
+ return ts.tv_sec;
+}
+
+int xen_set_wallclock(unsigned long now)
+{
+ /* do nothing for domU */
+ return -1;
+}
+
+static struct clocksource xen_clocksource __read_mostly = {
+ .name = "xen",
+ .rating = 400,
+ .read = xen_clocksource_read,
+ .mask = ~0,
+ .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
+ .shift = XEN_SHIFT,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+/*
+ Xen clockevent implementation
+
+ Xen has two clockevent implementations:
+
+ The old timer_op one works with all released versions of Xen prior
+ to version 3.0.4. This version of the hypervisor provides a
+ single-shot timer with nanosecond resolution. However, sharing the
+ same event channel is a 100Hz tick which is delivered while the
+ vcpu is running. We don't care about or use this tick, but it will
+ cause the core time code to think the timer fired too soon, and
+ will end up resetting it each time. It could be filtered, but
+ doing so has complications when the ktime clocksource is not yet
+ the xen clocksource (ie, at boot time).
+
+ The new vcpu_op-based timer interface allows the tick timer period
+ to be changed or turned off. The tick timer is not useful as a
+ periodic timer because events are only delivered to running vcpus.
+ The one-shot timer can report when a timeout is in the past, so
+ set_next_event is capable of returning -ETIME when appropriate.
+ This interface is used when available.
+*/
+
+
+/*
+ Get a hypervisor absolute time. In theory we could maintain an
+ offset between the kernel's time and the hypervisor's time, and
+ apply that to a kernel's absolute timeout. Unfortunately the
+ hypervisor and kernel times can drift even if the kernel is using
+ the Xen clocksource, because ntp can warp the kernel's clocksource.
+*/
+static s64 get_abs_timeout(unsigned long delta)
+{
+ return xen_clocksource_read() + delta;
+}
+
+static void xen_timerop_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ switch (mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ /* unsupported */
+ WARN_ON(1);
+ break;
+
+ case CLOCK_EVT_MODE_ONESHOT:
+ case CLOCK_EVT_MODE_RESUME:
+ break;
+
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ HYPERVISOR_set_timer_op(0); /* cancel timeout */
+ break;
+ }
+}
+
+static int xen_timerop_set_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+ if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
+ BUG();
+
+ /* We may have missed the deadline, but there's no real way of
+ knowing for sure. If the event was in the past, then we'll
+ get an immediate interrupt. */
+
+ return 0;
+}
+
+static const struct clock_event_device xen_timerop_clockevent = {
+ .name = "xen",
+ .features = CLOCK_EVT_FEAT_ONESHOT,
+
+ .max_delta_ns = 0xffffffff,
+ .min_delta_ns = TIMER_SLOP,
+
+ .mult = 1,
+ .shift = 0,
+ .rating = 500,
+
+ .set_mode = xen_timerop_set_mode,
+ .set_next_event = xen_timerop_set_next_event,
+};
+
+
+
+static void xen_vcpuop_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ int cpu = smp_processor_id();
+
+ switch (mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ WARN_ON(1); /* unsupported */
+ break;
+
+ case CLOCK_EVT_MODE_ONESHOT:
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+ BUG();
+ break;
+
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
+ HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+ BUG();
+ break;
+ case CLOCK_EVT_MODE_RESUME:
+ break;
+ }
+}
+
+static int xen_vcpuop_set_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ int cpu = smp_processor_id();
+ struct vcpu_set_singleshot_timer single;
+ int ret;
+
+ WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+ single.timeout_abs_ns = get_abs_timeout(delta);
+ single.flags = VCPU_SSHOTTMR_future;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
+
+ BUG_ON(ret != 0 && ret != -ETIME);
+
+ return ret;
+}
+
+static const struct clock_event_device xen_vcpuop_clockevent = {
+ .name = "xen",
+ .features = CLOCK_EVT_FEAT_ONESHOT,
+
+ .max_delta_ns = 0xffffffff,
+ .min_delta_ns = TIMER_SLOP,
+
+ .mult = 1,
+ .shift = 0,
+ .rating = 500,
+
+ .set_mode = xen_vcpuop_set_mode,
+ .set_next_event = xen_vcpuop_set_next_event,
+};
+
+static const struct clock_event_device *xen_clockevent =
+ &xen_timerop_clockevent;
+static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
+
+static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
+{
+ struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
+ irqreturn_t ret;
+
+ ret = IRQ_NONE;
+ if (evt->event_handler) {
+ evt->event_handler(evt);
+ ret = IRQ_HANDLED;
+ }
+
+ do_stolen_accounting();
+
+ return ret;
+}
+
+void xen_setup_timer(int cpu)
+{
+ const char *name;
+ struct clock_event_device *evt;
+ int irq;
+
+ printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
+
+ name = kasprintf(GFP_KERNEL, "timer%d", cpu);
+ if (!name)
+ name = "<timer kasprintf failed>";
+
+ irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
+ IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ name, NULL);
+
+ evt = &per_cpu(xen_clock_events, cpu);
+ memcpy(evt, xen_clockevent, sizeof(*evt));
+
+ evt->cpumask = cpumask_of_cpu(cpu);
+ evt->irq = irq;
+
+ setup_runstate_info(cpu);
+}
+
+void xen_setup_cpu_clockevents(void)
+{
+ BUG_ON(preemptible());
+
+ clockevents_register_device(&__get_cpu_var(xen_clock_events));
+}
+
+__init void xen_time_init(void)
+{
+ int cpu = smp_processor_id();
+
+ get_time_values_from_xen();
+
+ clocksource_register(&xen_clocksource);
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
+ /* Successfully turned off 100Hz tick, so we have the
+ vcpuop-based timer interface */
+ printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
+ xen_clockevent = &xen_vcpuop_clockevent;
+ }
+
+ /* Set initial system time with full resolution */
+ xen_read_wallclock(&xtime);
+ set_normalized_timespec(&wall_to_monotonic,
+ -xtime.tv_sec, -xtime.tv_nsec);
+
+ tsc_disable = 0;
+
+ xen_setup_timer(cpu);
+ xen_setup_cpu_clockevents();
+}
diff --git a/arch/i386/xen/vdso.h b/arch/i386/xen/vdso.h
new file mode 100644
index 000000000000..861fedfe5230
--- /dev/null
+++ b/arch/i386/xen/vdso.h
@@ -0,0 +1,4 @@
+/* Bit used for the pseudo-hwcap for non-negative segments. We use
+ bit 1 to avoid bugs in some versions of glibc when bit 0 is
+ used; the choice is otherwise arbitrary. */
+#define VDSO_NOTE_NONEGSEG_BIT 1
diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S
new file mode 100644
index 000000000000..1a43b60c0c62
--- /dev/null
+++ b/arch/i386/xen/xen-asm.S
@@ -0,0 +1,291 @@
+/*
+ Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+ The inline versions are the same as the direct-use versions, with the
+ pre- and post-amble chopped off.
+
+ This code is encoded for size rather than absolute efficiency,
+ with a view to being able to inline as much as possible.
+
+ We only bother with direct forms (ie, vcpu in pda) of the operations
+ here; the indirect forms are better handled in C, since they're
+ generally too large to inline anyway.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
+
+#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x) .globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+/*
+ Enable events. This clears the event mask and tests the pending
+ event status with one and operation. If there are pending
+ events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+ /* Clear mask and test pending */
+ andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+ /* Preempt here doesn't matter because that will deal with
+ any pending interrupts. The pending check may end up being
+ run on the wrong CPU, but that doesn't hurt. */
+ jz 1f
+2: call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+ ret
+ ENDPROC(xen_irq_enable_direct)
+ RELOC(xen_irq_enable_direct, 2b+1)
+
+
+/*
+ Disabling events is simply a matter of making the event mask
+ non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+ movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ENDPATCH(xen_irq_disable_direct)
+ ret
+ ENDPROC(xen_irq_disable_direct)
+ RELOC(xen_irq_disable_direct, 0)
+
+/*
+ (xen_)save_fl is used to get the current interrupt enable status.
+ Callers expect the status to be in X86_EFLAGS_IF, and other bits
+ may be set in the return value. We take advantage of this by
+ making sure that X86_EFLAGS_IF has the right value (and other bits
+ in that byte are 0), but other bits in the return value are
+ undefined. We need to toggle the state of the bit, because
+ Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ setz %ah
+ addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+ ret
+ ENDPROC(xen_save_fl_direct)
+ RELOC(xen_save_fl_direct, 0)
+
+
+/*
+ In principle the caller should be passing us a value return
+ from xen_save_fl_direct, but for robustness sake we test only
+ the X86_EFLAGS_IF flag rather than the whole byte. After
+ setting the interrupt mask state, it checks for unmasked
+ pending events and enters the hypervisor to get them delivered
+ if so.
+ */
+ENTRY(xen_restore_fl_direct)
+ testb $X86_EFLAGS_IF>>8, %ah
+ setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ /* Preempt here doesn't matter because that will deal with
+ any pending interrupts. The pending check may end up being
+ run on the wrong CPU, but that doesn't hurt. */
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+ jz 1f
+2: call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+ ret
+ ENDPROC(xen_restore_fl_direct)
+ RELOC(xen_restore_fl_direct, 2b+1)
+
+/*
+ This is run where a normal iret would be run, with the same stack setup:
+ 8: eflags
+ 4: cs
+ esp-> 0: eip
+
+ This attempts to make sure that any pending events are dealt
+ with on return to usermode, but there is a small window in
+ which an event can happen just before entering usermode. If
+ the nested interrupt ends up setting one of the TIF_WORK_MASK
+ pending work flags, they will not be tested again before
+ returning to usermode. This means that a process can end up
+ with pending work, which will be unprocessed until the process
+ enters and leaves the kernel again, which could be an
+ unbounded amount of time. This means that a pending signal or
+ reschedule event could be indefinitely delayed.
+
+ The fix is to notice a nested interrupt in the critical
+ window, and if one occurs, then fold the nested interrupt into
+ the current interrupt stack frame, and re-process it
+ iteratively rather than recursively. This means that it will
+ exit via the normal path, and all pending work will be dealt
+ with appropriately.
+
+ Because the nested interrupt handler needs to deal with the
+ current stack state in whatever form its in, we keep things
+ simple by only using a single register which is pushed/popped
+ on the stack.
+
+ Non-direct iret could be done in the same way, but it would
+ require an annoying amount of code duplication. We'll assume
+ that direct mode will be the common case once the hypervisor
+ support becomes commonplace.
+ */
+ENTRY(xen_iret_direct)
+ /* test eflags for special cases */
+ testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
+ jnz hyper_iret
+
+ push %eax
+ ESP_OFFSET=4 # bytes pushed onto stack
+
+ /* Store vcpu_info pointer for easy access. Do it this
+ way to avoid having to reload %fs */
+#ifdef CONFIG_SMP
+ GET_THREAD_INFO(%eax)
+ movl TI_cpu(%eax),%eax
+ movl __per_cpu_offset(,%eax,4),%eax
+ lea per_cpu__xen_vcpu_info(%eax),%eax
+#else
+ movl $per_cpu__xen_vcpu_info, %eax
+#endif
+
+ /* check IF state we're restoring */
+ testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
+
+ /* Maybe enable events. Once this happens we could get a
+ recursive event, so the critical region starts immediately
+ afterwards. However, if that happens we don't end up
+ resuming the code, so we don't have to be worried about
+ being preempted to another CPU. */
+ setz XEN_vcpu_info_mask(%eax)
+xen_iret_start_crit:
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, XEN_vcpu_info_pending(%eax)
+
+ /* If there's something pending, mask events again so we
+ can jump back into xen_hypervisor_callback */
+ sete XEN_vcpu_info_mask(%eax)
+
+ popl %eax
+
+ /* From this point on the registers are restored and the stack
+ updated, so we don't need to worry about it if we're preempted */
+iret_restore_end:
+
+ /* Jump to hypervisor_callback after fixing up the stack.
+ Events are masked, so jumping out of the critical
+ region is OK. */
+ je xen_hypervisor_callback
+
+ iret
+xen_iret_end_crit:
+
+hyper_iret:
+ /* put this out of line since its very rarely used */
+ jmp hypercall_page + __HYPERVISOR_iret * 32
+
+ .globl xen_iret_start_crit, xen_iret_end_crit
+
+/*
+ This is called by xen_hypervisor_callback in entry.S when it sees
+ that the EIP at the time of interrupt was between xen_iret_start_crit
+ and xen_iret_end_crit. We're passed the EIP in %eax so we can do
+ a more refined determination of what to do.
+
+ The stack format at this point is:
+ ----------------
+ ss : (ss/esp may be present if we came from usermode)
+ esp :
+ eflags } outer exception info
+ cs }
+ eip }
+ ---------------- <- edi (copy dest)
+ eax : outer eax if it hasn't been restored
+ ----------------
+ eflags } nested exception info
+ cs } (no ss/esp because we're nested
+ eip } from the same ring)
+ orig_eax }<- esi (copy src)
+ - - - - - - - -
+ fs }
+ es }
+ ds } SAVE_ALL state
+ eax }
+ : :
+ ebx }
+ ----------------
+ return addr <- esp
+ ----------------
+
+ In order to deliver the nested exception properly, we need to shift
+ everything from the return addr up to the error code so it
+ sits just under the outer exception info. This means that when we
+ handle the exception, we do it in the context of the outer exception
+ rather than starting a new one.
+
+ The only caveat is that if the outer eax hasn't been
+ restored yet (ie, it's still on stack), we need to insert
+ its value into the SAVE_ALL state before going on, since
+ it's usermode state which we eventually need to restore.
+ */
+ENTRY(xen_iret_crit_fixup)
+ /* offsets +4 for return address */
+
+ /*
+ Paranoia: Make sure we're really coming from userspace.
+ One could imagine a case where userspace jumps into the
+ critical range address, but just before the CPU delivers a GP,
+ it decides to deliver an interrupt instead. Unlikely?
+ Definitely. Easy to avoid? Yes. The Intel documents
+ explicitly say that the reported EIP for a bad jump is the
+ jump instruction itself, not the destination, but some virtual
+ environments get this wrong.
+ */
+ movl PT_CS+4(%esp), %ecx
+ andl $SEGMENT_RPL_MASK, %ecx
+ cmpl $USER_RPL, %ecx
+ je 2f
+
+ lea PT_ORIG_EAX+4(%esp), %esi
+ lea PT_EFLAGS+4(%esp), %edi
+
+ /* If eip is before iret_restore_end then stack
+ hasn't been restored yet. */
+ cmp $iret_restore_end, %eax
+ jae 1f
+
+ movl 0+4(%edi),%eax /* copy EAX */
+ movl %eax, PT_EAX+4(%esp)
+
+ lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
+
+ /* set up the copy */
+1: std
+ mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
+ rep movsl
+ cld
+
+ lea 4(%edi),%esp /* point esp to new frame */
+2: ret
+
+
+/*
+ Force an event check by making a hypercall,
+ but preserve regs before making the call.
+ */
+check_events:
+ push %eax
+ push %ecx
+ push %edx
+ call force_evtchn_callback
+ pop %edx
+ pop %ecx
+ pop %eax
+ ret
diff --git a/arch/i386/xen/xen-head.S b/arch/i386/xen/xen-head.S
new file mode 100644
index 000000000000..bc71f3bc4014
--- /dev/null
+++ b/arch/i386/xen/xen-head.S
@@ -0,0 +1,38 @@
+/* Xen-specific pieces of head.S, intended to be included in the right
+ place in head.S */
+
+#ifdef CONFIG_XEN
+
+#include <linux/elfnote.h>
+#include <asm/boot.h>
+#include <xen/interface/elfnote.h>
+
+ .section .init.text
+ENTRY(startup_xen)
+ movl %esi,xen_start_info
+ cld
+ movl $(init_thread_union+THREAD_SIZE),%esp
+ jmp xen_start_kernel
+
+.pushsection ".bss.page_aligned"
+ .align PAGE_SIZE_asm
+ENTRY(hypercall_page)
+ .skip 0x1000
+.popsection
+
+ .section .text
+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
+ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
+#ifdef CONFIG_X86_PAE
+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
+#else
+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
+#endif
+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
+
+#endif /*CONFIG_XEN */
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h
new file mode 100644
index 000000000000..b9aaea45f07f
--- /dev/null
+++ b/arch/i386/xen/xen-ops.h
@@ -0,0 +1,71 @@
+#ifndef XEN_OPS_H
+#define XEN_OPS_H
+
+#include <linux/init.h>
+
+/* These are code, but not functions. Defined in entry.S */
+extern const char xen_hypervisor_callback[];
+extern const char xen_failsafe_callback[];
+
+void xen_copy_trap_info(struct trap_info *traps);
+
+DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
+DECLARE_PER_CPU(unsigned long, xen_cr3);
+
+extern struct start_info *xen_start_info;
+extern struct shared_info *HYPERVISOR_shared_info;
+
+char * __init xen_memory_setup(void);
+void __init xen_arch_setup(void);
+void __init xen_init_IRQ(void);
+
+void xen_setup_timer(int cpu);
+void xen_setup_cpu_clockevents(void);
+unsigned long xen_cpu_khz(void);
+void __init xen_time_init(void);
+unsigned long xen_get_wallclock(void);
+int xen_set_wallclock(unsigned long time);
+unsigned long long xen_sched_clock(void);
+
+void xen_mark_init_mm_pinned(void);
+
+DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+
+static inline unsigned xen_get_lazy_mode(void)
+{
+ return x86_read_percpu(xen_lazy_mode);
+}
+
+void __init xen_fill_possible_map(void);
+
+void __init xen_setup_vcpu_info_placement(void);
+void xen_smp_prepare_boot_cpu(void);
+void xen_smp_prepare_cpus(unsigned int max_cpus);
+int xen_cpu_up(unsigned int cpu);
+void xen_smp_cpus_done(unsigned int max_cpus);
+
+void xen_smp_send_stop(void);
+void xen_smp_send_reschedule(int cpu);
+int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+ int wait);
+int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+ int nonatomic, int wait);
+
+int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
+ void *info, int wait);
+
+
+/* Declare an asm function, along with symbols needed to make it
+ inlineable */
+#define DECL_ASM(ret, name, ...) \
+ ret name(__VA_ARGS__); \
+ extern char name##_end[]; \
+ extern char name##_reloc[] \
+
+DECL_ASM(void, xen_irq_enable_direct, void);
+DECL_ASM(void, xen_irq_disable_direct, void);
+DECL_ASM(unsigned long, xen_save_fl_direct, void);
+DECL_ASM(void, xen_restore_fl_direct, unsigned long);
+
+void xen_iret_direct(void);
+#endif /* XEN_OPS_H */