aboutsummaryrefslogtreecommitdiffstats
path: root/arch/s390/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/s390/mm')
-rw-r--r--arch/s390/mm/fault.c41
-rw-r--r--arch/s390/mm/gmap.c4
-rw-r--r--arch/s390/mm/init.c3
-rw-r--r--arch/s390/mm/mmap.c7
-rw-r--r--arch/s390/mm/pgalloc.c85
-rw-r--r--arch/s390/mm/vmem.c8
6 files changed, 73 insertions, 75 deletions
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index cce577feab1e..7a3144017301 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -631,6 +631,29 @@ void pfault_fini(void)
static DEFINE_SPINLOCK(pfault_lock);
static LIST_HEAD(pfault_list);
+#define PF_COMPLETE 0x0080
+
+/*
+ * The mechanism of our pfault code: if Linux is running as guest, runs a user
+ * space process and the user space process accesses a page that the host has
+ * paged out we get a pfault interrupt.
+ *
+ * This allows us, within the guest, to schedule a different process. Without
+ * this mechanism the host would have to suspend the whole virtual cpu until
+ * the page has been paged in.
+ *
+ * So when we get such an interrupt then we set the state of the current task
+ * to uninterruptible and also set the need_resched flag. Both happens within
+ * interrupt context(!). If we later on want to return to user space we
+ * recognize the need_resched flag and then call schedule(). It's not very
+ * obvious how this works...
+ *
+ * Of course we have a lot of additional fun with the completion interrupt (->
+ * host signals that a page of a process has been paged in and the process can
+ * continue to run). This interrupt can arrive on any cpu and, since we have
+ * virtual cpus, actually appear before the interrupt that signals that a page
+ * is missing.
+ */
static void pfault_interrupt(struct ext_code ext_code,
unsigned int param32, unsigned long param64)
{
@@ -639,10 +662,9 @@ static void pfault_interrupt(struct ext_code ext_code,
pid_t pid;
/*
- * Get the external interruption subcode & pfault
- * initial/completion signal bit. VM stores this
- * in the 'cpu address' field associated with the
- * external interrupt.
+ * Get the external interruption subcode & pfault initial/completion
+ * signal bit. VM stores this in the 'cpu address' field associated
+ * with the external interrupt.
*/
subcode = ext_code.subcode;
if ((subcode & 0xff00) != __SUBCODE_MASK)
@@ -658,7 +680,7 @@ static void pfault_interrupt(struct ext_code ext_code,
if (!tsk)
return;
spin_lock(&pfault_lock);
- if (subcode & 0x0080) {
+ if (subcode & PF_COMPLETE) {
/* signal bit is set -> a page has been swapped in by VM */
if (tsk->thread.pfault_wait == 1) {
/* Initial interrupt was faster than the completion
@@ -687,8 +709,7 @@ static void pfault_interrupt(struct ext_code ext_code,
goto out;
if (tsk->thread.pfault_wait == 1) {
/* Already on the list with a reference: put to sleep */
- __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- set_tsk_need_resched(tsk);
+ goto block;
} else if (tsk->thread.pfault_wait == -1) {
/* Completion interrupt was faster than the initial
* interrupt (pfault_wait == -1). Set pfault_wait
@@ -703,7 +724,11 @@ static void pfault_interrupt(struct ext_code ext_code,
get_task_struct(tsk);
tsk->thread.pfault_wait = 1;
list_add(&tsk->thread.list, &pfault_list);
- __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+block:
+ /* Since this must be a userspace fault, there
+ * is no kernel task state to trample. Rely on the
+ * return to userspace schedule() to block. */
+ __set_current_state(TASK_UNINTERRUPTIBLE);
set_tsk_need_resched(tsk);
}
}
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 69247b4dcc43..cace818d86eb 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -23,7 +23,7 @@
/**
* gmap_alloc - allocate a guest address space
* @mm: pointer to the parent mm_struct
- * @limit: maximum size of the gmap address space
+ * @limit: maximum address of the gmap address space
*
* Returns a guest address space structure.
*/
@@ -292,7 +292,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
if ((from | to | len) & (PMD_SIZE - 1))
return -EINVAL;
if (len == 0 || from + len < from || to + len < to ||
- from + len > TASK_MAX_SIZE || to + len > gmap->asce_end)
+ from + len - 1 > TASK_MAX_SIZE || to + len - 1 > gmap->asce_end)
return -EINVAL;
flush = 0;
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index c7b0451397d6..2489b2e917c8 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -89,7 +89,8 @@ void __init paging_init(void)
asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
pgd_type = _REGION3_ENTRY_EMPTY;
}
- S390_lowcore.kernel_asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
+ init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
+ S390_lowcore.kernel_asce = init_mm.context.asce;
clear_table((unsigned long *) init_mm.pgd, pgd_type,
sizeof(unsigned long)*2048);
vmem_map_init();
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 45c4daa49930..eb9df2822da1 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -22,6 +22,7 @@
* Started by Ingo Molnar <mingo@elte.hu>
*/
+#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/mm.h>
#include <linux/mman.h>
@@ -174,7 +175,7 @@ int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags)
if (!(flags & MAP_FIXED))
addr = 0;
if ((addr + len) >= TASK_SIZE)
- return crst_table_upgrade(current->mm, TASK_MAX_SIZE);
+ return crst_table_upgrade(current->mm);
return 0;
}
@@ -191,7 +192,7 @@ s390_get_unmapped_area(struct file *filp, unsigned long addr,
return area;
if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) {
/* Upgrade the page table to 4 levels and retry. */
- rc = crst_table_upgrade(mm, TASK_MAX_SIZE);
+ rc = crst_table_upgrade(mm);
if (rc)
return (unsigned long) rc;
area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
@@ -213,7 +214,7 @@ s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
return area;
if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) {
/* Upgrade the page table to 4 levels and retry. */
- rc = crst_table_upgrade(mm, TASK_MAX_SIZE);
+ rc = crst_table_upgrade(mm);
if (rc)
return (unsigned long) rc;
area = arch_get_unmapped_area_topdown(filp, addr, len,
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index f6c3de26cda8..e8b5962ac12a 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -76,81 +76,52 @@ static void __crst_table_upgrade(void *arg)
__tlb_flush_local();
}
-int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
+int crst_table_upgrade(struct mm_struct *mm)
{
unsigned long *table, *pgd;
- unsigned long entry;
- int flush;
- BUG_ON(limit > TASK_MAX_SIZE);
- flush = 0;
-repeat:
+ /* upgrade should only happen from 3 to 4 levels */
+ BUG_ON(mm->context.asce_limit != (1UL << 42));
+
table = crst_table_alloc(mm);
if (!table)
return -ENOMEM;
+
spin_lock_bh(&mm->page_table_lock);
- if (mm->context.asce_limit < limit) {
- pgd = (unsigned long *) mm->pgd;
- if (mm->context.asce_limit <= (1UL << 31)) {
- entry = _REGION3_ENTRY_EMPTY;
- mm->context.asce_limit = 1UL << 42;
- mm->context.asce_bits = _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS |
- _ASCE_TYPE_REGION3;
- } else {
- entry = _REGION2_ENTRY_EMPTY;
- mm->context.asce_limit = 1UL << 53;
- mm->context.asce_bits = _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS |
- _ASCE_TYPE_REGION2;
- }
- crst_table_init(table, entry);
- pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
- mm->pgd = (pgd_t *) table;
- mm->task_size = mm->context.asce_limit;
- table = NULL;
- flush = 1;
- }
+ pgd = (unsigned long *) mm->pgd;
+ crst_table_init(table, _REGION2_ENTRY_EMPTY);
+ pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
+ mm->pgd = (pgd_t *) table;
+ mm->context.asce_limit = 1UL << 53;
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
+ mm->task_size = mm->context.asce_limit;
spin_unlock_bh(&mm->page_table_lock);
- if (table)
- crst_table_free(mm, table);
- if (mm->context.asce_limit < limit)
- goto repeat;
- if (flush)
- on_each_cpu(__crst_table_upgrade, mm, 0);
+
+ on_each_cpu(__crst_table_upgrade, mm, 0);
return 0;
}
-void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
+void crst_table_downgrade(struct mm_struct *mm)
{
pgd_t *pgd;
+ /* downgrade should only happen from 3 to 2 levels (compat only) */
+ BUG_ON(mm->context.asce_limit != (1UL << 42));
+
if (current->active_mm == mm) {
clear_user_asce();
__tlb_flush_mm(mm);
}
- while (mm->context.asce_limit > limit) {
- pgd = mm->pgd;
- switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
- case _REGION_ENTRY_TYPE_R2:
- mm->context.asce_limit = 1UL << 42;
- mm->context.asce_bits = _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS |
- _ASCE_TYPE_REGION3;
- break;
- case _REGION_ENTRY_TYPE_R3:
- mm->context.asce_limit = 1UL << 31;
- mm->context.asce_bits = _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS |
- _ASCE_TYPE_SEGMENT;
- break;
- default:
- BUG();
- }
- mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
- mm->task_size = mm->context.asce_limit;
- crst_table_free(mm, (unsigned long *) pgd);
- }
+
+ pgd = mm->pgd;
+ mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
+ mm->context.asce_limit = 1UL << 31;
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
+ mm->task_size = mm->context.asce_limit;
+ crst_table_free(mm, (unsigned long *) pgd);
+
if (current->active_mm == mm)
set_user_asce(mm);
}
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index d27fccbad7c1..d48cf25cfe99 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -56,7 +56,7 @@ static inline pmd_t *vmem_pmd_alloc(void)
return pmd;
}
-static pte_t __ref *vmem_pte_alloc(unsigned long address)
+static pte_t __ref *vmem_pte_alloc(void)
{
pte_t *pte;
@@ -121,7 +121,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
continue;
}
if (pmd_none(*pm_dir)) {
- pt_dir = vmem_pte_alloc(address);
+ pt_dir = vmem_pte_alloc();
if (!pt_dir)
goto out;
pmd_populate(&init_mm, pm_dir, pt_dir);
@@ -233,7 +233,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
address = (address + PMD_SIZE) & PMD_MASK;
continue;
}
- pt_dir = vmem_pte_alloc(address);
+ pt_dir = vmem_pte_alloc();
if (!pt_dir)
goto out;
pmd_populate(&init_mm, pm_dir, pt_dir);
@@ -370,7 +370,7 @@ void __init vmem_map_init(void)
ro_end = (unsigned long)&_eshared & PAGE_MASK;
for_each_memblock(memory, reg) {
start = reg->base;
- end = reg->base + reg->size - 1;
+ end = reg->base + reg->size;
if (start >= ro_end || end <= ro_start)
vmem_add_mem(start, end - start, 0);
else if (start >= ro_start && end <= ro_end)