author     oga <oga@openbsd.org>  2010-04-22 19:02:44 +0000
committer  oga <oga@openbsd.org>  2010-04-22 19:02:44 +0000
commit     a3544580456680ac17bea7051ae709d5e34e7208 (patch)
tree       21a35190754e62a9ed9f6031aab7d11fd1256cc3
parent     zap trailing whitespace; (diff)
Committing on behalf of ariane@.
recommit pmemrange: physmem allocator: change the view of free memory from single free pages to free ranges. Classify memory based on region, with an associated use counter (which is used to construct a priority list of where to allocate memory). Based on code from tedu@, help from many. Usable now that bugs have been found and fixed in most architectures' pmap.c. ok by everyone who has done a pmap or uvm commit in the last year.
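For orientation, here is a minimal sketch of how a single page is now obtained from the range allocator. It is illustrative only and simply mirrors the call that the new uvm_pagealloc() makes in the uvm_page.c hunk below (one page, no address limits, no alignment or boundary constraint, one segment):

	#include <sys/param.h>
	#include <uvm/uvm.h>

	struct pglist pgl;
	struct vm_page *pg;

	TAILQ_INIT(&pgl);
	/* count=1, start=0, end=0, align=1, boundary=0, maxseg=1 */
	if (uvm_pmr_getpages(1, 0, 0, 1, 0, 1, UVM_PLA_NOWAIT, &pgl) != 0)
		pg = NULL;		/* no free pages */
	else
		pg = TAILQ_FIRST(&pgl);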
-rw-r--r--  sys/arch/i386/i386/pmapae.c         |   15
-rw-r--r--  sys/arch/sparc/include/vmparam.h    |    5
-rw-r--r--  sys/arch/sparc64/include/vmparam.h  |    5
-rw-r--r--  sys/conf/files                      |    3
-rw-r--r--  sys/uvm/uvm.h                       |   33
-rw-r--r--  sys/uvm/uvm_extern.h                |   17
-rw-r--r--  sys/uvm/uvm_map.c                   |   10
-rw-r--r--  sys/uvm/uvm_page.c                  |  197
-rw-r--r--  sys/uvm/uvm_page.h                  |    3
-rw-r--r--  sys/uvm/uvm_pglist.c                |  347
-rw-r--r--  sys/uvm/uvm_pmemrange.c             | 1813
-rw-r--r--  sys/uvm/uvm_pmemrange.h             |   83
12 files changed, 2038 insertions(+), 493 deletions(-)
diff --git a/sys/arch/i386/i386/pmapae.c b/sys/arch/i386/i386/pmapae.c
index 8b17da23f22..317295656a2 100644
--- a/sys/arch/i386/i386/pmapae.c
+++ b/sys/arch/i386/i386/pmapae.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: pmapae.c,v 1.20 2009/08/06 15:28:14 oga Exp $ */
+/* $OpenBSD: pmapae.c,v 1.21 2010/04/22 19:02:44 oga Exp $ */
/*
* Copyright (c) 2006 Michael Shalayeff
@@ -364,7 +364,7 @@
* is a void function.
*
* [B] new page tables pages (PTP)
- * call pae_pagealloc()
+ * call uvm_pagealloc()
* => success: zero page, add to pm_pdir
* => failure: we are out of free vm_pages, let pmap_enter()
* tell UVM about it.
@@ -553,13 +553,6 @@ extern int pmap_pg_g;
extern struct pmap_head pmaps;
/*
- * a towards larger memory prioritised version opf uvm_pagealloc()
- */
-#define pae_pagealloc(obj, off, anon, flags) \
- uvm_pagealloc_strat((obj), (off), (anon), (flags), \
- UVM_PGA_STRAT_FALLBACK, VM_FREELIST_ABOVE4G)
-
-/*
* local prototypes
*/
@@ -801,7 +794,7 @@ pmap_bootstrap_pae()
for (va = KERNBASE, eva = va + (nkpde << 22);
va < eva; va += PAGE_SIZE) {
if (!pmap_valid_entry(PDE(kpm, pdei(va)))) {
- ptp = pae_pagealloc(&kpm->pm_obj, va, NULL,
+ ptp = uvm_pagealloc(&kpm->pm_obj, va, NULL,
UVM_PGA_ZERO);
ptaddr = VM_PAGE_TO_PHYS(ptp);
PDE(kpm, pdei(va)) = ptaddr | PG_KW | PG_V;
@@ -977,7 +970,7 @@ pmap_alloc_ptp_pae(struct pmap *pmap, int pde_index, boolean_t just_try)
{
struct vm_page *ptp;
- ptp = pae_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
+ ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
UVM_PGA_USERESERVE|UVM_PGA_ZERO);
if (ptp == NULL)
return(NULL);
diff --git a/sys/arch/sparc/include/vmparam.h b/sys/arch/sparc/include/vmparam.h
index 8dc59a3729f..a7ba9e169b3 100644
--- a/sys/arch/sparc/include/vmparam.h
+++ b/sys/arch/sparc/include/vmparam.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmparam.h,v 1.33 2008/07/22 18:15:48 miod Exp $ */
+/* $OpenBSD: vmparam.h,v 1.34 2010/04/22 19:02:47 oga Exp $ */
/* $NetBSD: vmparam.h,v 1.13 1997/07/12 16:20:03 perry Exp $ */
/*
@@ -130,6 +130,9 @@ struct vm_page_md {
#define VM_NFREELIST 1
#define VM_FREELIST_DEFAULT 0
+/* No UVM_IO_RANGES required: IOMMU takes care of this. */
+#define UVM_IO_RANGES {}
+
#if defined (_KERNEL) && !defined(_LOCORE)
struct vm_map;
#define dvma_mapin(map,va,len,canwait) dvma_mapin_space(map,va,len,canwait,0)
diff --git a/sys/arch/sparc64/include/vmparam.h b/sys/arch/sparc64/include/vmparam.h
index f2239697466..b3eca0ba6a4 100644
--- a/sys/arch/sparc64/include/vmparam.h
+++ b/sys/arch/sparc64/include/vmparam.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmparam.h,v 1.18 2008/07/18 16:40:17 kurt Exp $ */
+/* $OpenBSD: vmparam.h,v 1.19 2010/04/22 19:02:49 oga Exp $ */
/* $NetBSD: vmparam.h,v 1.18 2001/05/01 02:19:19 thorpej Exp $ */
/*
@@ -145,6 +145,9 @@
#define VM_NFREELIST 1
#define VM_FREELIST_DEFAULT 0
+/* No UVM_IO_RANGES required: IOMMU takes care of this. */
+#define UVM_IO_RANGES {}
+
#define __HAVE_VM_PAGE_MD
/*
* For each struct vm_page, there is a list of all currently valid virtual
diff --git a/sys/conf/files b/sys/conf/files
index 32e9a761cb2..9c06c5eabb2 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1,4 +1,4 @@
-# $OpenBSD: files,v 1.486 2010/04/20 22:53:24 miod Exp $
+# $OpenBSD: files,v 1.487 2010/04/22 19:02:52 oga Exp $
# $NetBSD: files,v 1.87 1996/05/19 17:17:50 jonathan Exp $
# @(#)files.newconf 7.5 (Berkeley) 5/10/93
@@ -1009,6 +1009,7 @@ file uvm/uvm_page.c
file uvm/uvm_pager.c
file uvm/uvm_pdaemon.c
file uvm/uvm_pglist.c
+file uvm/uvm_pmemrange.c
file uvm/uvm_stat.c
file uvm/uvm_swap.c
file uvm/uvm_swap_encrypt.c uvm_swap_encrypt
diff --git a/sys/uvm/uvm.h b/sys/uvm/uvm.h
index 0e4966bec1a..a8d42714cbb 100644
--- a/sys/uvm/uvm.h
+++ b/sys/uvm/uvm.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: uvm.h,v 1.37 2009/06/16 23:54:57 oga Exp $ */
+/* $OpenBSD: uvm.h,v 1.38 2010/04/22 19:02:55 oga Exp $ */
/* $NetBSD: uvm.h,v 1.24 2000/11/27 08:40:02 chs Exp $ */
/*
@@ -58,6 +58,7 @@
#include <uvm/uvm_pager.h>
#include <uvm/uvm_pdaemon.h>
#include <uvm/uvm_swap.h>
+#include <uvm/uvm_pmemrange.h>
#ifdef UVM_SWAP_ENCRYPT
#include <uvm/uvm_swap_encrypt.h>
#endif
@@ -68,6 +69,34 @@
#include <machine/vmparam.h>
/*
+ * UVM_IO_RANGES: paddr_t pairs, describing the lowest and highest address
+ * that should be reserved. These ranges (which may overlap) will have their
+ * use counter increased, causing them to be avoided if an allocation can be
+ * satisfied from another range of memory.
+ *
+ * UVM_IO_RANGES actually results in a call to uvm_pmr_use_inc() per range
+ * at uvm initialization. uvm_pmr_use_inc() can also be called after uvm_init()
+ * has completed.
+ *
+ * Note: the upper bound is specified in the same way as to uvm_pglistalloc.
+ * Ex: a 16-bit memory range is specified as: { 0, 0xffff }.
+ * Default: no special ranges in use.
+ */
+#ifndef UVM_IO_RANGES
+#define UVM_IO_RANGES \
+ { \
+ { 0, 0x00ffffffUL }, /* ISA memory */ \
+ { 0, 0xffffffffUL }, /* 32-bit PCI memory */ \
+ }
+#endif
+
+/* UVM IO ranges are described in an array of struct uvm_io_ranges. */
+struct uvm_io_ranges {
+ paddr_t low;
+ paddr_t high;
+};
+
+/*
* uvm structure (vm global state: collected in one structure for ease
* of reference...)
*/
@@ -76,7 +105,6 @@ struct uvm {
/* vm_page related parameters */
/* vm_page queues */
- struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */
struct pglist page_active; /* allocated pages, in use */
struct pglist page_inactive_swp;/* pages inactive (reclaim or free) */
struct pglist page_inactive_obj;/* pages inactive (reclaim or free) */
@@ -86,6 +114,7 @@ struct uvm {
boolean_t page_init_done; /* TRUE if uvm_page_init() finished */
boolean_t page_idle_zero; /* TRUE if we should try to zero
pages in the idle loop */
+ struct uvm_pmr_control pmr_control; /* pmemrange data */
/* page daemon trigger */
int pagedaemon; /* daemon sleeps on this */
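To make the UVM_IO_RANGES comment in the hunk above concrete: a machine-dependent vmparam.h may provide its own list (the sparc headers later in this diff define it empty because the IOMMU removes the constraint). The override below is purely hypothetical, and the loop only paraphrases what the comment says happens at uvm initialization, i.e. one uvm_pmr_use_inc() call per entry:

	/* Hypothetical <machine/vmparam.h>: only ISA DMA memory is precious. */
	#define UVM_IO_RANGES \
		{ \
			{ 0, 0x00ffffffUL },	/* ISA memory */ \
		}

	/* Roughly what uvm does with the list at initialization time: */
	struct uvm_io_ranges uvm_io_ranges[] = UVM_IO_RANGES;
	int i;

	for (i = 0; i < nitems(uvm_io_ranges); i++)
		uvm_pmr_use_inc(uvm_io_ranges[i].low, uvm_io_ranges[i].high);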
diff --git a/sys/uvm/uvm_extern.h b/sys/uvm/uvm_extern.h
index f3a156d923a..5de889ddb48 100644
--- a/sys/uvm/uvm_extern.h
+++ b/sys/uvm/uvm_extern.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: uvm_extern.h,v 1.84 2010/03/24 00:36:04 oga Exp $ */
+/* $OpenBSD: uvm_extern.h,v 1.85 2010/04/22 19:02:55 oga Exp $ */
/* $NetBSD: uvm_extern.h,v 1.57 2001/03/09 01:02:12 chs Exp $ */
/*
@@ -208,14 +208,14 @@ typedef int vm_prot_t;
#define UVM_KMF_TRYLOCK UVM_FLAG_TRYLOCK /* try locking only */
/*
- * the following defines the strategies for uvm_pagealloc_strat()
+ * the following defines the strategies for uvm_pagealloc()
*/
#define UVM_PGA_STRAT_NORMAL 0 /* high -> low free list walk */
#define UVM_PGA_STRAT_ONLY 1 /* only specified free list */
#define UVM_PGA_STRAT_FALLBACK 2 /* ONLY falls back on NORMAL */
/*
- * flags for uvm_pagealloc_strat()
+ * flags for uvm_pagealloc()
*/
#define UVM_PGA_USERESERVE 0x0001 /* ok to use reserve pages */
#define UVM_PGA_ZERO 0x0002 /* returned page must be zeroed */
@@ -226,6 +226,7 @@ typedef int vm_prot_t;
#define UVM_PLA_WAITOK 0x0001 /* may sleep */
#define UVM_PLA_NOWAIT 0x0002 /* can't sleep (need one of the two) */
#define UVM_PLA_ZERO 0x0004 /* zero all pages before returning */
+#define UVM_PLA_TRYCONTIG 0x0008 /* try to allocate contig physmem */
/*
* lockflags that control the locking behavior of various functions.
@@ -564,11 +565,8 @@ int uvm_mmap(vm_map_t, vaddr_t *, vsize_t,
caddr_t, voff_t, vsize_t, struct proc *);
/* uvm_page.c */
-struct vm_page *uvm_pagealloc_strat(struct uvm_object *,
- voff_t, struct vm_anon *, int, int, int);
-#define uvm_pagealloc(obj, off, anon, flags) \
- uvm_pagealloc_strat((obj), (off), (anon), (flags), \
- UVM_PGA_STRAT_NORMAL, 0)
+struct vm_page *uvm_pagealloc(struct uvm_object *,
+ voff_t, struct vm_anon *, int);
vaddr_t uvm_pagealloc_contig(vaddr_t, vaddr_t,
vaddr_t, vaddr_t);
void uvm_pagerealloc(struct vm_page *,
@@ -596,6 +594,9 @@ int uvm_pglistalloc(psize_t, paddr_t,
struct pglist *, int, int);
void uvm_pglistfree(struct pglist *);
+/* uvm_pmemrange.c */
+void uvm_pmr_use_inc(paddr_t, paddr_t);
+
/* uvm_swap.c */
void uvm_swap_init(void);
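A caller's-eye view of the uvm_pagealloc() prototype change above: users of the old uvm_pagealloc() macro compile unchanged, while direct uvm_pagealloc_strat() callers lose the strategy/free-list arguments. Illustrative before/after, modelled on the pae_pagealloc() macro removed in the pmapae.c hunk earlier in this diff:

	/* Before: explicit strategy and free list. */
	pg = uvm_pagealloc_strat(obj, off, NULL, UVM_PGA_ZERO,
	    UVM_PGA_STRAT_FALLBACK, VM_FREELIST_ABOVE4G);

	/* After: no strategy argument; the pmemrange use counters now
	 * steer allocations away from the precious physical ranges. */
	pg = uvm_pagealloc(obj, off, NULL, UVM_PGA_ZERO);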
diff --git a/sys/uvm/uvm_map.c b/sys/uvm/uvm_map.c
index c8abc87aa70..29028e79629 100644
--- a/sys/uvm/uvm_map.c
+++ b/sys/uvm/uvm_map.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: uvm_map.c,v 1.123 2009/08/28 00:40:03 ariane Exp $ */
+/* $OpenBSD: uvm_map.c,v 1.124 2010/04/22 19:02:55 oga Exp $ */
/* $NetBSD: uvm_map.c,v 1.86 2000/11/27 08:40:03 chs Exp $ */
/*
@@ -3999,9 +3999,11 @@ uvm_page_printit(pg, full, pr)
/* cross-verify page queue */
if (pg->pg_flags & PQ_FREE) {
- int fl = uvm_page_lookup_freelist(pg);
- pgl = &uvm.page_free[fl].pgfl_queues[((pg)->pg_flags & PG_ZERO) ?
- PGFL_ZEROS : PGFL_UNKNOWN];
+ if (uvm_pmr_isfree(pg))
+ printf(" page found in uvm_pmemrange\n");
+ else
+ printf(" >>> page not found in uvm_pmemrange <<<\n");
+ pgl = NULL;
} else if (pg->pg_flags & PQ_INACTIVE) {
pgl = (pg->pg_flags & PQ_SWAPBACKED) ?
&uvm.page_inactive_swp : &uvm.page_inactive_obj;
diff --git a/sys/uvm/uvm_page.c b/sys/uvm/uvm_page.c
index 467e072ff43..d40e6a41abb 100644
--- a/sys/uvm/uvm_page.c
+++ b/sys/uvm/uvm_page.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: uvm_page.c,v 1.99 2010/04/20 22:05:44 tedu Exp $ */
+/* $OpenBSD: uvm_page.c,v 1.100 2010/04/22 19:02:55 oga Exp $ */
/* $NetBSD: uvm_page.c,v 1.44 2000/11/27 08:40:04 chs Exp $ */
/*
@@ -73,7 +73,6 @@
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/malloc.h>
#include <sys/sched.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
@@ -212,15 +211,12 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
* init the page queues and page queue locks
*/
- for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
- for (i = 0; i < PGFL_NQUEUES; i++)
- TAILQ_INIT(&uvm.page_free[lcv].pgfl_queues[i]);
- }
TAILQ_INIT(&uvm.page_active);
TAILQ_INIT(&uvm.page_inactive_swp);
TAILQ_INIT(&uvm.page_inactive_obj);
simple_lock_init(&uvm.pageqlock);
mtx_init(&uvm.fpageqlock, IPL_VM);
+ uvm_pmr_init();
/*
* allocate vm_page structures.
@@ -271,9 +267,9 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
for (lcv = 0 ; lcv < vm_nphysseg ; lcv++) {
n = vm_physmem[lcv].end - vm_physmem[lcv].start;
if (n > pagecount) {
- printf("uvm_page_init: lost %ld page(s) in init\n",
+ panic("uvm_page_init: lost %ld page(s) in init\n",
(long)(n - pagecount));
- panic("uvm_page_init"); /* XXXCDC: shouldn't happen? */
+ /* XXXCDC: shouldn't happen? */
/* n = pagecount; */
}
@@ -293,10 +289,15 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
if (atop(paddr) >= vm_physmem[lcv].avail_start &&
atop(paddr) <= vm_physmem[lcv].avail_end) {
uvmexp.npages++;
- /* add page to free pool */
- uvm_pagefree(&vm_physmem[lcv].pgs[i]);
}
}
+
+ /*
+ * Add pages to free pool.
+ */
+ uvm_pmr_freepages(&vm_physmem[lcv].pgs[
+ vm_physmem[lcv].avail_start - vm_physmem[lcv].start],
+ vm_physmem[lcv].avail_end - vm_physmem[lcv].avail_start);
}
/*
@@ -651,13 +652,19 @@ uvm_page_physload_flags(paddr_t start, paddr_t end, paddr_t avail_start,
} else {
#if defined(VM_PHYSSEG_NOADD)
panic("uvm_page_physload: tried to add RAM after vm_mem_init");
-#else
- uvm_pagefree(&pgs[lcv]);
#endif
}
}
}
- /* XXXCDC: incomplete: need to update uvmexp.free, what else? */
+
+ /*
+ * Add pages to free pool.
+ */
+ if ((flags & PHYSLOAD_DEVICE) == 0) {
+ uvm_pmr_freepages(&pgs[avail_start - start],
+ avail_end - avail_start);
+ }
+
/* XXXCDC: need hook to tell pmap to rebuild pv_list, etc... */
} else {
@@ -778,31 +785,21 @@ uvm_shutdown(void)
* => if anon != NULL, anon must be locked (to put in anon)
* => only one of obj or anon can be non-null
* => caller must activate/deactivate page if it is not wired.
- * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
- * => policy decision: it is more important to pull a page off of the
- * appropriate priority free list than it is to get a zero'd or
- * unknown contents page. This is because we live with the
- * consequences of a bad free list decision for the entire
- * lifetime of the page, e.g. if the page comes from memory that
- * is slower to access.
*/
struct vm_page *
-uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
- int flags, int strat, int free_list)
+uvm_pagealloc(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
+ int flags)
{
- int lcv, try1, try2, zeroit = 0;
struct vm_page *pg;
- struct pglist *freeq;
- struct pgfreelist *pgfl;
+ struct pglist pgl;
+ int pmr_flags;
boolean_t use_reserve;
- UVMHIST_FUNC("uvm_pagealloc_strat"); UVMHIST_CALLED(pghist);
+ UVMHIST_FUNC("uvm_pagealloc"); UVMHIST_CALLED(pghist);
KASSERT(obj == NULL || anon == NULL);
KASSERT(off == trunc_page(off));
- uvm_lock_fpageq();
-
/*
* check to see if we need to generate some free pages waking
* the pagedaemon.
@@ -829,124 +826,39 @@ uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
(curproc == syncerproc))))
goto fail;
-#if PGFL_NQUEUES != 2
-#error uvm_pagealloc_strat needs to be updated
-#endif
-
- /*
- * If we want a zero'd page, try the ZEROS queue first, otherwise
- * we try the UNKNOWN queue first.
- */
- if (flags & UVM_PGA_ZERO) {
- try1 = PGFL_ZEROS;
- try2 = PGFL_UNKNOWN;
- } else {
- try1 = PGFL_UNKNOWN;
- try2 = PGFL_ZEROS;
- }
-
- UVMHIST_LOG(pghist, "obj=%p off=%lx anon=%p flags=%lx",
- obj, (u_long)off, anon, flags);
- UVMHIST_LOG(pghist, "strat=%ld free_list=%ld", strat, free_list, 0, 0);
- again:
- switch (strat) {
- case UVM_PGA_STRAT_NORMAL:
- /* Check all freelists in descending priority order. */
- for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
- pgfl = &uvm.page_free[lcv];
- if ((pg = TAILQ_FIRST((freeq =
- &pgfl->pgfl_queues[try1]))) != NULL ||
- (pg = TAILQ_FIRST((freeq =
- &pgfl->pgfl_queues[try2]))) != NULL)
- goto gotit;
- }
-
- /* No pages free! */
- goto fail;
-
- case UVM_PGA_STRAT_ONLY:
- case UVM_PGA_STRAT_FALLBACK:
- /* Attempt to allocate from the specified free list. */
- KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
- pgfl = &uvm.page_free[free_list];
- if ((pg = TAILQ_FIRST((freeq =
- &pgfl->pgfl_queues[try1]))) != NULL ||
- (pg = TAILQ_FIRST((freeq =
- &pgfl->pgfl_queues[try2]))) != NULL)
- goto gotit;
-
- /* Fall back, if possible. */
- if (strat == UVM_PGA_STRAT_FALLBACK) {
- strat = UVM_PGA_STRAT_NORMAL;
- goto again;
- }
-
- /* No pages free! */
+ pmr_flags = UVM_PLA_NOWAIT;
+ if (flags & UVM_PGA_ZERO)
+ pmr_flags |= UVM_PLA_ZERO;
+ TAILQ_INIT(&pgl);
+ if (uvm_pmr_getpages(1, 0, 0, 1, 0, 1, pmr_flags, &pgl) != 0)
goto fail;
- default:
- panic("uvm_pagealloc_strat: bad strat %d", strat);
- /* NOTREACHED */
- }
-
- gotit:
- TAILQ_REMOVE(freeq, pg, pageq);
- uvmexp.free--;
-
- /* update zero'd page count */
- if (pg->pg_flags & PG_ZERO)
- uvmexp.zeropages--;
-
- /*
- * update allocation statistics and remember if we have to
- * zero the page
- */
- if (flags & UVM_PGA_ZERO) {
- if (pg->pg_flags & PG_ZERO) {
- uvmexp.pga_zerohit++;
- zeroit = 0;
- } else {
- uvmexp.pga_zeromiss++;
- zeroit = 1;
- }
- }
-
- uvm_unlock_fpageq(); /* unlock free page queue */
+ pg = TAILQ_FIRST(&pgl);
+ KASSERT(pg != NULL && TAILQ_NEXT(pg, pageq) == NULL);
pg->offset = off;
pg->uobject = obj;
pg->uanon = anon;
KASSERT((pg->pg_flags & PG_DEV) == 0);
- pg->pg_flags = PG_BUSY|PG_CLEAN|PG_FAKE;
- pg->pg_version++;
+ atomic_setbits_int(&pg->pg_flags, PG_BUSY|PG_CLEAN|PG_FAKE);
+ if (flags & UVM_PGA_ZERO)
+ atomic_clearbits_int(&pg->pg_flags, PG_CLEAN);
if (anon) {
anon->an_page = pg;
atomic_setbits_int(&pg->pg_flags, PQ_ANON);
- } else {
- if (obj)
- uvm_pageinsert(pg);
- }
+ } else if (obj)
+ uvm_pageinsert(pg);
+
#if defined(UVM_PAGE_TRKOWN)
pg->owner_tag = NULL;
#endif
UVM_PAGE_OWN(pg, "new alloc");
- if (flags & UVM_PGA_ZERO) {
- /*
- * A zero'd page is not clean. If we got a page not already
- * zero'd, then we have to zero it ourselves.
- */
- atomic_clearbits_int(&pg->pg_flags, PG_CLEAN);
- if (zeroit)
- pmap_zero_page(pg);
- }
-
UVMHIST_LOG(pghist, "allocated pg %p/%lx", pg,
(u_long)VM_PAGE_TO_PHYS(pg), 0, 0);
return(pg);
fail:
- uvm_unlock_fpageq();
UVMHIST_LOG(pghist, "failed!", 0, 0, 0, 0);
return (NULL);
}
@@ -1030,7 +942,7 @@ uvm_pagefree(struct vm_page *pg)
if (saved_loan_count)
atomic_clearbits_int(&pg->pg_flags, PG_CLEAN);
uvm_pageremove(pg);
-
+
/*
* if our page was on loan, then we just lost control over it
* (in fact, if it was loaned to an anon, the anon may have
@@ -1085,38 +997,31 @@ uvm_pagefree(struct vm_page *pg)
}
if (pg->uanon) {
pg->uanon->an_page = NULL;
-#ifdef UBC
- uvm_pgcnt_anon--;
-#endif
+ pg->uanon = NULL;
+ atomic_clearbits_int(&pg->pg_flags, PQ_ANON);
}
/*
- * and put on free queue
+ * Clean page state bits.
*/
+ atomic_clearbits_int(&pg->pg_flags, PQ_AOBJ); /* XXX: find culprit */
+ atomic_clearbits_int(&pg->pg_flags, PQ_ENCRYPT|
+ PG_ZERO|PG_FAKE|PG_BUSY|PG_RELEASED|PG_CLEAN|PG_CLEANCHK);
- atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
+ /*
+ * and put on free queue
+ */
- uvm_lock_fpageq();
-#ifdef PAGEFASTRECYCLE
- TAILQ_INSERT_HEAD(&uvm.page_free[
- uvm_page_lookup_freelist(pg)].pgfl_queues[PGFL_UNKNOWN], pg, pageq);
-#else
- TAILQ_INSERT_TAIL(&uvm.page_free[
- uvm_page_lookup_freelist(pg)].pgfl_queues[PGFL_UNKNOWN], pg, pageq);
-#endif
- atomic_clearbits_int(&pg->pg_flags, PQ_MASK);
- atomic_setbits_int(&pg->pg_flags, PQ_FREE);
#ifdef DEBUG
pg->uobject = (void *)0xdeadbeef;
pg->offset = 0xdeadbeef;
pg->uanon = (void *)0xdeadbeef;
#endif
- uvmexp.free++;
+
+ uvm_pmr_freepages(pg, 1);
if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
uvm.page_idle_zero = vm_page_zero_enable;
-
- uvm_unlock_fpageq();
}
/*
@@ -1216,6 +1121,7 @@ uvm_page_own(struct vm_page *pg, char *tag)
void
uvm_pageidlezero(void)
{
+#if 0 /* disabled: need new code */
struct vm_page *pg;
struct pgfreelist *pgfl;
int free_list;
@@ -1282,6 +1188,7 @@ uvm_pageidlezero(void)
uvmexp.zeropages++;
uvm_unlock_fpageq();
} while (curcpu_is_idle());
+#endif /* 0 */
}
/*
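A worked example (numbers invented) for the uvm_page_init() hunk above, which now hands each physseg's available pages to the range allocator in one call instead of freeing them page by page: a physseg with start = 0x100, avail_start = 0x110 and avail_end = 0x200 (page frame numbers) ends up doing

	uvm_pmr_freepages(&vm_physmem[lcv].pgs[0x110 - 0x100],	/* &pgs[16] */
	    0x200 - 0x110);					/* 240 pages */

so page frames 0x110 through 0x1ff become a single free range.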
diff --git a/sys/uvm/uvm_page.h b/sys/uvm/uvm_page.h
index 00324b59037..eda9030fe63 100644
--- a/sys/uvm/uvm_page.h
+++ b/sys/uvm/uvm_page.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: uvm_page.h,v 1.41 2010/03/24 00:36:04 oga Exp $ */
+/* $OpenBSD: uvm_page.h,v 1.42 2010/04/22 19:02:55 oga Exp $ */
/* $NetBSD: uvm_page.h,v 1.19 2000/12/28 08:24:55 chs Exp $ */
/*
@@ -116,6 +116,7 @@ struct vm_page {
* to read: [O or P]
* to modify: [O _and_ P] */
paddr_t phys_addr; /* physical address of page */
+ psize_t fpgsz; /* free page range size */
#ifdef __HAVE_VM_PAGE_MD
struct vm_page_md mdpage; /* pmap-specific data */
diff --git a/sys/uvm/uvm_pglist.c b/sys/uvm/uvm_pglist.c
index b9826f6ee62..b4b15f326c0 100644
--- a/sys/uvm/uvm_pglist.c
+++ b/sys/uvm/uvm_pglist.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: uvm_pglist.c,v 1.35 2009/08/13 15:29:59 deraadt Exp $ */
+/* $OpenBSD: uvm_pglist.c,v 1.36 2010/04/22 19:02:55 oga Exp $ */
/* $NetBSD: uvm_pglist.c,v 1.13 2001/02/18 21:19:08 chs Exp $ */
/*-
@@ -56,112 +56,6 @@ u_long uvm_pglistalloc_npages;
#define STAT_DECR(v)
#endif
-int uvm_pglistalloc_simple(psize_t, paddr_t, paddr_t, struct pglist *);
-
-/*
- * Simple page allocation: pages do not need to be contiguous. We just
- * attempt to find enough free pages in the given range.
- */
-int
-uvm_pglistalloc_simple(psize_t size, paddr_t low, paddr_t high,
- struct pglist *rlist)
-{
- psize_t todo;
- int psi;
- struct vm_page *pg;
- struct vm_physseg *seg;
- paddr_t slow, shigh;
- int pgflidx, error, free_list;
- UVMHIST_FUNC("uvm_pglistalloc_simple"); UVMHIST_CALLED(pghist);
-#ifdef DEBUG
- vm_page_t tp;
-#endif
-
- /* Default to "lose". */
- error = ENOMEM;
-
- todo = atop(size);
-
- /*
- * Block all memory allocation and lock the free list.
- */
- uvm_lock_fpageq();
-
- /* Are there even any free pages? */
- if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
- goto out;
-
- for (psi = 0, seg = vm_physmem; psi < vm_nphysseg; psi++, seg++) {
- /*
- * Skip this segment if incompatible with the address range.
- */
- if (seg->avail_end <= atop(low))
- continue;
- if (seg->avail_start >= atop(high))
- continue;
-
- slow = MAX(atop(low), seg->avail_start);
- shigh = MIN(atop(high), seg->avail_end);
-
- /* we want to be able to allocate at least a page... */
- if (slow == shigh)
- continue;
-
- for (pg = &seg->pgs[slow - seg->start]; slow != shigh;
- slow++, pg++) {
- if (VM_PAGE_IS_FREE(pg) == 0)
- continue;
-
- free_list = uvm_page_lookup_freelist(pg);
- pgflidx = (pg->pg_flags & PG_ZERO) ?
- PGFL_ZEROS : PGFL_UNKNOWN;
-#ifdef DEBUG
- for (tp = TAILQ_FIRST(&uvm.page_free[free_list].pgfl_queues[pgflidx]);
- tp != NULL; tp = TAILQ_NEXT(tp, pageq)) {
- if (tp == pg)
- break;
- }
- if (tp == NULL)
- panic("uvm_pglistalloc_simple: page not on freelist");
-#endif
- TAILQ_REMOVE(&uvm.page_free[free_list].pgfl_queues[pgflidx],
- pg, pageq);
- uvmexp.free--;
- if (pg->pg_flags & PG_ZERO)
- uvmexp.zeropages--;
- pg->uobject = NULL;
- pg->uanon = NULL;
- pg->pg_version++;
- TAILQ_INSERT_TAIL(rlist, pg, pageq);
- STAT_INCR(uvm_pglistalloc_npages);
- if (--todo == 0) {
- error = 0;
- goto out;
- }
- }
-
- }
-
-out:
- /*
- * check to see if we need to generate some free pages waking
- * the pagedaemon.
- */
-
- if (!error && (uvmexp.free + uvmexp.paging < uvmexp.freemin ||
- (uvmexp.free + uvmexp.paging < uvmexp.freetarg &&
- uvmexp.inactive < uvmexp.inactarg))) {
- wakeup(&uvm.pagedaemon);
- }
-
- uvm_unlock_fpageq();
-
- if (error)
- uvm_pglistfree(rlist);
-
- return (error);
-}
-
/*
* uvm_pglistalloc: allocate a list of pages
*
@@ -179,202 +73,54 @@ out:
* alignment memory must be aligned to this power-of-two boundary.
* boundary no segment in the allocation may cross this
* power-of-two boundary (relative to zero).
+ * => flags:
+ * UVM_PLA_NOWAIT fail if allocation fails
+ * UVM_PLA_WAITOK wait for memory to become avail
+ * UVM_PLA_ZERO return zeroed memory
+ * UVM_PLA_TRYCONTIG caller (device) prefers physically contiguous memory
*/
int
uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment,
paddr_t boundary, struct pglist *rlist, int nsegs, int flags)
{
- int psi;
- struct vm_page *pgs;
- struct vm_physseg *seg;
- paddr_t slow, shigh;
- paddr_t try, idxpa, lastidxpa;
- int tryidx, idx, pgflidx, endidx, error, free_list;
- vm_page_t m;
- u_long pagemask;
-#ifdef DEBUG
- vm_page_t tp;
-#endif
UVMHIST_FUNC("uvm_pglistalloc"); UVMHIST_CALLED(pghist);
KASSERT((alignment & (alignment - 1)) == 0);
KASSERT((boundary & (boundary - 1)) == 0);
- /*
- * This argument is always ignored for now, but ensure drivers always
- * show intention.
- */
KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
-
- /*
- * Our allocations are always page granularity, so our alignment
- * must be, too.
- */
- if (alignment < PAGE_SIZE)
- alignment = PAGE_SIZE;
if (size == 0)
return (EINVAL);
- size = round_page(size);
- low = roundup(low, alignment);
-
- /*
- * If we are allowed to allocate as many segments as pages,
- * no need to be smart.
- */
- if ((nsegs >= size / PAGE_SIZE) && (alignment == PAGE_SIZE) &&
- (boundary == 0)) {
- error = uvm_pglistalloc_simple(size, low, high, rlist);
- goto done;
- }
-
- if (boundary != 0 && boundary < size)
- return (EINVAL);
-
- pagemask = ~(boundary - 1);
-
- /* Default to "lose". */
- error = ENOMEM;
-
- /*
- * Block all memory allocation and lock the free list.
- */
- uvm_lock_fpageq();
-
- /* Are there even any free pages? */
- if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
- goto out;
-
- for (psi = 0, seg = vm_physmem; psi < vm_nphysseg; psi++, seg++) {
- /*
- * Skip this segment if incompatible with the address range.
- */
- if (seg->avail_end <= atop(low))
- continue;
- if (seg->avail_start >= atop(high))
- continue;
-
- slow = MAX(low, ptoa(seg->avail_start));
- shigh = MIN(high, ptoa(seg->avail_end));
-
- try = roundup(slow, alignment);
- for (;; try += alignment) {
- if (try + size > shigh) {
- /*
- * We've run past the allowable range, or
- * the segment. Try another.
- */
- break;
- }
-
- tryidx = idx = atop(try) - seg->start;
- endidx = idx + atop(size);
- pgs = vm_physmem[psi].pgs;
-
- /*
- * Found a suitable starting page. See if the
- * range is free.
- */
-
- for (; idx < endidx; idx++) {
- if (VM_PAGE_IS_FREE(&pgs[idx]) == 0) {
- break;
- }
- idxpa = VM_PAGE_TO_PHYS(&pgs[idx]);
- if (idx == tryidx)
- continue;
-
- /*
- * Check that the region is contiguous
- * (it really should...) and does not
- * cross an alignment boundary.
- */
- lastidxpa = VM_PAGE_TO_PHYS(&pgs[idx - 1]);
- if ((lastidxpa + PAGE_SIZE) != idxpa)
- break;
-
- if (boundary != 0 &&
- ((lastidxpa ^ idxpa) & pagemask) != 0)
- break;
- }
-
- if (idx == endidx) {
- goto found;
- }
- }
+ if ((high & PAGE_MASK) != PAGE_MASK) {
+ printf("uvm_pglistalloc: Upper boundary 0x%lx "
+ "not on pagemask.\n", (unsigned long)high);
}
/*
- * We could not allocate a contiguous range. This is where
- * we should try harder if nsegs > 1...
- */
- goto out;
-
-#if PGFL_NQUEUES != 2
-#error uvm_pglistalloc needs to be updated
-#endif
-
-found:
- /*
- * we have a chunk of memory that conforms to the requested constraints.
+ * Our allocations are always page granularity, so our alignment
+ * must be, too.
*/
- idx = tryidx;
- while (idx < endidx) {
- m = &pgs[idx];
- free_list = uvm_page_lookup_freelist(m);
- pgflidx = (m->pg_flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN;
-#ifdef DEBUG
- for (tp = TAILQ_FIRST(&uvm.page_free[
- free_list].pgfl_queues[pgflidx]);
- tp != NULL;
- tp = TAILQ_NEXT(tp, pageq)) {
- if (tp == m)
- break;
- }
- if (tp == NULL)
- panic("uvm_pglistalloc: page not on freelist");
-#endif
- TAILQ_REMOVE(&uvm.page_free[free_list].pgfl_queues[pgflidx],
- m, pageq);
- uvmexp.free--;
- if (m->pg_flags & PG_ZERO)
- uvmexp.zeropages--;
- m->uobject = NULL;
- m->uanon = NULL;
- m->pg_version++;
- TAILQ_INSERT_TAIL(rlist, m, pageq);
- idx++;
- STAT_INCR(uvm_pglistalloc_npages);
- }
- error = 0;
+ if (alignment < PAGE_SIZE)
+ alignment = PAGE_SIZE;
-out:
+ low = atop(roundup(low, alignment));
/*
- * check to see if we need to generate some free pages waking
- * the pagedaemon.
+ * high + 1 may result in overflow, in which case high becomes 0x0,
+ * which is the 'don't care' value.
+ * The only requirement in that case is that low is also 0x0, or the
+ * low<high assert will fail.
*/
-
- if (uvmexp.free + uvmexp.paging < uvmexp.freemin ||
- (uvmexp.free + uvmexp.paging < uvmexp.freetarg &&
- uvmexp.inactive < uvmexp.inactarg)) {
- wakeup(&uvm.pagedaemon);
- }
-
- uvm_unlock_fpageq();
-
-done:
- /* No locking needed here, pages are not on any queue. */
- if (error == 0) {
- TAILQ_FOREACH(m, rlist, pageq) {
- if (flags & UVM_PLA_ZERO &&
- (m->pg_flags & PG_ZERO) == 0)
- uvm_pagezero(m);
- m->pg_flags = PG_CLEAN;
- }
- }
-
- return (error);
+ high = atop(high + 1);
+ size = atop(round_page(size));
+ alignment = atop(alignment);
+ if (boundary < PAGE_SIZE && boundary != 0)
+ boundary = PAGE_SIZE;
+ boundary = atop(boundary);
+
+ return uvm_pmr_getpages(size, low, high, alignment, boundary, nsegs,
+ flags, rlist);
}
/*
@@ -386,43 +132,6 @@ done:
void
uvm_pglistfree(struct pglist *list)
{
- struct vm_page *m;
UVMHIST_FUNC("uvm_pglistfree"); UVMHIST_CALLED(pghist);
-
- /*
- * Block all memory allocation and lock the free list.
- */
- uvm_lock_fpageq();
-
- while ((m = TAILQ_FIRST(list)) != NULL) {
- KASSERT((m->pg_flags & (PQ_ACTIVE|PQ_INACTIVE)) == 0);
- TAILQ_REMOVE(list, m, pageq);
-#ifdef DEBUG
- if (m->uobject == (void *)0xdeadbeef &&
- m->uanon == (void *)0xdeadbeef) {
- panic("uvm_pglistfree: freeing free page %p", m);
- }
-
- m->uobject = (void *)0xdeadbeef;
- m->offset = 0xdeadbeef;
- m->uanon = (void *)0xdeadbeef;
-#endif
- atomic_clearbits_int(&m->pg_flags, PQ_MASK);
- atomic_setbits_int(&m->pg_flags, PQ_FREE);
-#ifdef PAGEFASTRECYCLE
- TAILQ_INSERT_HEAD(&uvm.page_free[
- uvm_page_lookup_freelist(m)].pgfl_queues[PGFL_UNKNOWN],
- m, pageq);
-#else
- TAILQ_INSERT_TAIL(&uvm.page_free[
- uvm_page_lookup_freelist(m)].pgfl_queues[PGFL_UNKNOWN],
- m, pageq);
-#endif
- uvmexp.free++;
- if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
- uvm.page_idle_zero = vm_page_zero_enable;
- STAT_DECR(uvm_pglistalloc_npages);
- }
-
- uvm_unlock_fpageq();
+ uvm_pmr_freepageq(list);
}
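To illustrate the unit conversions in the new uvm_pglistalloc() above, here is a hypothetical driver request (assuming 4 KB pages): a 64 KB buffer below 16 MB, page aligned, in at most 8 segments, contiguous memory preferred:

	struct pglist pgl;
	int error;

	TAILQ_INIT(&pgl);
	error = uvm_pglistalloc(64 * 1024, 0, 0x00ffffffUL, PAGE_SIZE, 0,
	    &pgl, 8, UVM_PLA_WAITOK | UVM_PLA_TRYCONTIG);

The wrapper converts this to page units before calling the range allocator:

	size      = atop(round_page(65536))  = 16 pages
	low       = atop(roundup(0, 4096))   = 0
	high      = atop(0x00ffffff + 1)     = 0x1000 (exclusive upper bound)
	alignment = atop(4096)               = 1
	boundary  = 0

and the request ends up as uvm_pmr_getpages(16, 0, 0x1000, 1, 0, 8, flags, &pgl).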
diff --git a/sys/uvm/uvm_pmemrange.c b/sys/uvm/uvm_pmemrange.c
new file mode 100644
index 00000000000..19a6a4f94f1
--- /dev/null
+++ b/sys/uvm/uvm_pmemrange.c
@@ -0,0 +1,1813 @@
+/* $OpenBSD: uvm_pmemrange.c,v 1.10 2010/04/22 19:02:55 oga Exp $ */
+
+/*
+ * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <uvm/uvm.h>
+#include <sys/malloc.h>
+#include <sys/proc.h> /* XXX for atomic */
+
+/*
+ * 2 trees: addr tree and size tree.
+ *
+ * The allocator keeps chunks of free pages (called a range).
+ * Two pages are part of the same range if:
+ * - all pages in between are part of that range,
+ * - they are of the same memory type (zeroed or non-zeroed),
+ * - they are part of the same pmemrange.
+ * A pmemrange is a range of memory which is part of the same vm_physseg
+ * and has a use-count.
+ *
+ * addr tree is vm_page[0].objt
+ * size tree is vm_page[1].objt
+ *
+ * The size tree is not used for memory ranges of 1 page, instead,
+ * single queue is vm_page[0].pageq
+ *
+ * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
+ * are joined, unless:
+ * - they have pages in between them which are not free
+ * - they belong to different memtypes (zeroed vs dirty memory)
+ * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
+ * - they are not a continuation of the same array
+ * The latter issue is caused by vm_physseg ordering and splitting from the
+ * MD initialization machinery. The MD code is dependent on freelists and
+ * happens to split ISA memory from non-ISA memory.
+ * (Note: freelists die die die!)
+ *
+ * uvm_page_init guarantees that every vm_physseg contains an array of
+ * struct vm_page. Also, uvm_page_physload allocates an array of struct
+ * vm_page. This code depends on that array. The array may break across
+ * vm_physsegs boundaries.
+ */
+
+/*
+ * Validate the flags of the page. (Used in asserts.)
+ * Any free page must have the PQ_FREE flag set.
+ * Free pages may be zeroed.
+ * Pmap flags are left untouched.
+ *
+ * The PQ_FREE flag is not checked here: by not checking, we can easily use
+ * this check in pages which are freed.
+ */
+#define VALID_FLAGS(pg_flags) \
+ (((pg_flags) & ~(PQ_FREE|PG_ZERO| \
+ PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3)) == 0x0)
+
+/* Tree comparators. */
+int uvm_pmemrange_addr_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
+int uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
+int uvm_pmr_addr_cmp(struct vm_page *, struct vm_page *);
+int uvm_pmr_size_cmp(struct vm_page *, struct vm_page *);
+int uvm_pmr_pg_to_memtype(struct vm_page *);
+
+#ifdef DDB
+void uvm_pmr_print(void);
+#endif
+
+/*
+ * Memory types. The page flags are used to derive what the current memory
+ * type of a page is.
+ */
+int
+uvm_pmr_pg_to_memtype(struct vm_page *pg)
+{
+ if (pg->pg_flags & PG_ZERO)
+ return UVM_PMR_MEMTYPE_ZERO;
+ /* Default: dirty memory. */
+ return UVM_PMR_MEMTYPE_DIRTY;
+}
+
+/* Trees. */
+RB_PROTOTYPE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
+RB_PROTOTYPE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
+RB_PROTOTYPE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
+ uvm_pmemrange_addr_cmp);
+RB_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
+RB_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
+RB_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
+ uvm_pmemrange_addr_cmp);
+
+/* Validation. */
+#ifdef DEBUG
+void uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
+#else
+#define uvm_pmr_assertvalid(pmr) do {} while (0)
+#endif
+
+
+int uvm_pmr_get1page(psize_t, int, struct pglist *,
+ paddr_t, paddr_t);
+
+struct uvm_pmemrange *uvm_pmr_allocpmr(void);
+struct vm_page *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
+struct vm_page *uvm_pmr_nextsz(struct uvm_pmemrange *,
+ struct vm_page *, int);
+void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
+ struct vm_page *pg, struct vm_page **pg_prev,
+ struct vm_page **pg_next);
+struct vm_page *uvm_pmr_insert_addr(struct uvm_pmemrange *,
+ struct vm_page *, int);
+void uvm_pmr_insert_size(struct uvm_pmemrange *,
+ struct vm_page *);
+struct vm_page *uvm_pmr_insert(struct uvm_pmemrange *,
+ struct vm_page *, int);
+void uvm_pmr_remove_size(struct uvm_pmemrange *,
+ struct vm_page *);
+void uvm_pmr_remove_addr(struct uvm_pmemrange *,
+ struct vm_page *);
+void uvm_pmr_remove(struct uvm_pmemrange *,
+ struct vm_page *);
+struct vm_page *uvm_pmr_findnextsegment(struct uvm_pmemrange *,
+ struct vm_page *, paddr_t);
+psize_t uvm_pmr_remove_1strange(struct pglist *, paddr_t,
+ struct vm_page **, int);
+void uvm_pmr_split(paddr_t);
+struct uvm_pmemrange *uvm_pmemrange_find(paddr_t);
+struct uvm_pmemrange *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
+ struct uvm_pmemrange *);
+struct vm_page *uvm_pmr_extract_range(struct uvm_pmemrange *,
+ struct vm_page *, paddr_t, paddr_t,
+ struct pglist *);
+psize_t pow2divide(psize_t, psize_t);
+struct vm_page *uvm_pmr_rootupdate(struct uvm_pmemrange *,
+ struct vm_page *, paddr_t, paddr_t, int);
+
+/*
+ * Computes num/denom and rounds it up to the next power-of-2.
+ *
+ * This is a division function which calculates an approximation of
+ * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
+ * have to be accurate.
+ *
+ * Providing too large a value makes the allocator slightly faster, at the
+ * risk of hitting the failure case more often. Providing too small a value
+ * makes the allocator a bit slower, but less likely to hit a failure case.
+ */
+psize_t
+pow2divide(psize_t num, psize_t denom)
+{
+ int rshift;
+
+ for (rshift = 0; num > denom; rshift++, denom <<= 1);
+ return (paddr_t)1 << rshift;
+}
+
+/*
+ * Predicate: lhs is a subrange of rhs.
+ *
+ * If rhs_low == 0: don't care about lower bound.
+ * If rhs_high == 0: don't care about upper bound.
+ */
+#define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high) \
+ (((rhs_low) == 0 || (lhs_low) >= (rhs_low)) && \
+ ((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
+
+/*
+ * Predicate: lhs intersects with rhs.
+ *
+ * If rhs_low == 0: don't care about lower bound.
+ * If rhs_high == 0: don't care about upper bound.
+ * Ranges don't intersect if they don't have any page in common, array
+ * semantics mean that < instead of <= should be used here.
+ */
+#define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high) \
+ (((rhs_low) == 0 || (rhs_low) < (lhs_high)) && \
+ ((rhs_high) == 0 || (lhs_low) < (rhs_high)))
+
+/*
+ * Align to power-of-2 alignment.
+ */
+#define PMR_ALIGN(pgno, align) \
+ (((pgno) + ((align) - 1)) & ~((align) - 1))
+
+
+/*
+ * Comparator: sort by address ascending.
+ */
+int
+uvm_pmemrange_addr_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
+{
+ return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
+}
+
+/*
+ * Comparator: sort by use ascending.
+ *
+ * The higher the use value of a range, the more devices need memory in
+ * this range. Therefore, allocate from the range with the lowest use first.
+ */
+int
+uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
+{
+ int result;
+
+ result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
+ if (result == 0)
+ result = uvm_pmemrange_addr_cmp(lhs, rhs);
+ return result;
+}
+
+int
+uvm_pmr_addr_cmp(struct vm_page *lhs, struct vm_page *rhs)
+{
+ paddr_t lhs_addr, rhs_addr;
+
+ lhs_addr = VM_PAGE_TO_PHYS(lhs);
+ rhs_addr = VM_PAGE_TO_PHYS(rhs);
+
+ return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
+}
+
+int
+uvm_pmr_size_cmp(struct vm_page *lhs, struct vm_page *rhs)
+{
+ psize_t lhs_size, rhs_size;
+ int cmp;
+
+ /* Using second tree, so we receive pg[1] instead of pg[0]. */
+ lhs_size = (lhs - 1)->fpgsz;
+ rhs_size = (rhs - 1)->fpgsz;
+
+ cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
+ if (cmp == 0)
+ cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
+ return cmp;
+}
+
+/*
+ * Find the first range of free pages that is at least sz pages long.
+ */
+struct vm_page *
+uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
+{
+ struct vm_page *node, *best;
+
+ KASSERT(sz >= 1);
+
+ if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
+ return TAILQ_FIRST(&pmr->single[mti]);
+
+ node = RB_ROOT(&pmr->size[mti]);
+ best = NULL;
+ while (node != NULL) {
+ if ((node - 1)->fpgsz >= sz) {
+ best = (node - 1);
+ node = RB_LEFT(node, objt);
+ } else
+ node = RB_RIGHT(node, objt);
+ }
+ return best;
+}
+
+/*
+ * Finds the next range. The next range has a size >= pg->fpgsz.
+ * Returns NULL if no more ranges are available.
+ */
+struct vm_page *
+uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
+{
+ struct vm_page *npg;
+
+ KASSERT(pmr != NULL && pg != NULL);
+ if (pg->fpgsz == 1) {
+ if (TAILQ_NEXT(pg, pageq) != NULL)
+ return TAILQ_NEXT(pg, pageq);
+ else
+ npg = RB_MIN(uvm_pmr_size, &pmr->size[mt]);
+ } else
+ npg = RB_NEXT(uvm_pmr_size, &pmr->size[mt], pg + 1);
+
+ return npg == NULL ? NULL : npg - 1;
+}
+
+/*
+ * Finds the previous and next ranges relative to the (uninserted) pg range.
+ *
+ * *pg_prev == NULL if no previous range is available, that can join with
+ * pg.
+ * *pg_next == NULL if no next range is available, that can join with
+ * pg.
+ */
+void
+uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
+ struct vm_page **pg_prev, struct vm_page **pg_next)
+{
+ KASSERT(pg_prev != NULL && pg_next != NULL);
+
+ *pg_next = RB_NFIND(uvm_pmr_addr, &pmr->addr, pg);
+ if (*pg_next == NULL)
+ *pg_prev = RB_MAX(uvm_pmr_addr, &pmr->addr);
+ else
+ *pg_prev = RB_PREV(uvm_pmr_addr, &pmr->addr, *pg_next);
+
+ KDASSERT(*pg_next == NULL ||
+ VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
+ KDASSERT(*pg_prev == NULL ||
+ VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
+
+ /* Reset if not contig. */
+ if (*pg_prev != NULL &&
+ (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
+ != atop(VM_PAGE_TO_PHYS(pg)) ||
+ *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
+ uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
+ *pg_prev = NULL;
+ if (*pg_next != NULL &&
+ (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
+ != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
+ pg + pg->fpgsz != *pg_next || /* Array broke. */
+ uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
+ *pg_next = NULL;
+ return;
+}
+
+/*
+ * Remove a range from the address tree.
+ * Address tree maintains pmr counters.
+ */
+void
+uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
+{
+ KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
+ KDASSERT(pg->pg_flags & PQ_FREE);
+ RB_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
+
+ pmr->nsegs--;
+}
+/*
+ * Remove a range from the size tree.
+ */
+void
+uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
+{
+ int memtype;
+#ifdef DEBUG
+ struct vm_page *i;
+#endif
+
+ KDASSERT(pg->fpgsz >= 1);
+ KDASSERT(pg->pg_flags & PQ_FREE);
+ memtype = uvm_pmr_pg_to_memtype(pg);
+
+ if (pg->fpgsz == 1) {
+#ifdef DEBUG
+ TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
+ if (i == pg)
+ break;
+ }
+ KDASSERT(i == pg);
+#endif
+ TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
+ } else {
+ KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[memtype],
+ pg + 1) == pg + 1);
+ RB_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
+ }
+}
+/* Remove from both trees. */
+void
+uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
+{
+ uvm_pmr_assertvalid(pmr);
+ uvm_pmr_remove_size(pmr, pg);
+ uvm_pmr_remove_addr(pmr, pg);
+ uvm_pmr_assertvalid(pmr);
+}
+
+/*
+ * Insert the range described in pg.
+ * Returns the range thus created (which may be joined with the previous and
+ * next ranges).
+ * If no_join, the caller guarantees that the range cannot possibly join
+ * with adjacent ranges.
+ */
+struct vm_page *
+uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
+{
+ struct vm_page *prev, *next;
+
+#ifdef DEBUG
+ struct vm_page *i;
+ int mt;
+#endif
+
+ KDASSERT(pg->pg_flags & PQ_FREE);
+ KDASSERT(pg->fpgsz >= 1);
+
+#ifdef DEBUG
+ for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
+ TAILQ_FOREACH(i, &pmr->single[mt], pageq)
+ KDASSERT(i != pg);
+ if (pg->fpgsz > 1) {
+ KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[mt],
+ pg + 1) == NULL);
+ }
+ KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
+ }
+#endif
+
+ if (!no_join) {
+ uvm_pmr_pnaddr(pmr, pg, &prev, &next);
+ if (next != NULL) {
+ uvm_pmr_remove_size(pmr, next);
+ uvm_pmr_remove_addr(pmr, next);
+ pg->fpgsz += next->fpgsz;
+ next->fpgsz = 0;
+ }
+ if (prev != NULL) {
+ uvm_pmr_remove_size(pmr, prev);
+ prev->fpgsz += pg->fpgsz;
+ pg->fpgsz = 0;
+ return prev;
+ }
+ }
+
+ RB_INSERT(uvm_pmr_addr, &pmr->addr, pg);
+
+ pmr->nsegs++;
+
+ return pg;
+}
+/*
+ * Insert the range described in pg.
+ * Returns the range thus created (which may be joined with the previous and
+ * next ranges).
+ * Page must already be in the address tree.
+ */
+void
+uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
+{
+ int memtype;
+#ifdef DEBUG
+ struct vm_page *i;
+ int mti;
+#endif
+
+ KDASSERT(pg->fpgsz >= 1);
+ KDASSERT(pg->pg_flags & PQ_FREE);
+
+ memtype = uvm_pmr_pg_to_memtype(pg);
+#ifdef DEBUG
+ for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
+ TAILQ_FOREACH(i, &pmr->single[mti], pageq)
+ KDASSERT(i != pg);
+ if (pg->fpgsz > 1) {
+ KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[mti],
+ pg + 1) == NULL);
+ }
+ KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
+ }
+ for (i = pg; i < pg + pg->fpgsz; i++)
+ KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
+#endif
+
+ if (pg->fpgsz == 1)
+ TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
+ else
+ RB_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
+}
+/* Insert in both trees. */
+struct vm_page *
+uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
+{
+ uvm_pmr_assertvalid(pmr);
+ pg = uvm_pmr_insert_addr(pmr, pg, no_join);
+ uvm_pmr_insert_size(pmr, pg);
+ uvm_pmr_assertvalid(pmr);
+ return pg;
+}
+
+/*
+ * Find the last page that is part of this segment.
+ * => pg: the range at which to start the search.
+ * => boundary: the page number boundary specification (0 = no boundary).
+ * => pmr: the pmemrange of the page.
+ *
+ * This function returns the last page before the next range, so if you want
+ * the next range, you need to run TAILQ_NEXT(result, pageq) on the result.
+ * The reason is that this way, the length of the segment is easily
+ * calculated using: atop(result) - atop(pg) + 1.
+ * Hence this function also never returns NULL.
+ */
+struct vm_page *
+uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
+ struct vm_page *pg, paddr_t boundary)
+{
+ paddr_t first_boundary;
+ struct vm_page *next;
+ struct vm_page *prev;
+
+ KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
+ pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
+ if (boundary != 0) {
+ first_boundary =
+ PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
+ } else
+ first_boundary = 0;
+
+ /*
+ * Increase next until it hits the first page of the next segment.
+ *
+ * While loop checks the following:
+ * - next != NULL we have not reached the end of pgl
+ * - boundary == 0 || next < first_boundary
+ * we do not cross a boundary
+ * - atop(prev) + 1 == atop(next)
+ * still in the same segment
+ * - low <= last
+ * - high > last still in the same memory range
+ * - memtype is equal allocator is unable to view different memtypes
+ * as part of the same segment
+ * - prev + 1 == next no array breakage occurs
+ */
+ prev = pg;
+ next = TAILQ_NEXT(prev, pageq);
+ while (next != NULL &&
+ (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
+ atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
+ pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
+ pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
+ uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
+ prev + 1 == next) {
+ prev = next;
+ next = TAILQ_NEXT(prev, pageq);
+ }
+
+ /*
+ * End of this segment.
+ */
+ return prev;
+}
+
+/*
+ * Remove the first segment of contiguous pages from pgl.
+ * A segment ends if it crosses boundary (unless boundary = 0) or
+ * if it would enter a different uvm_pmemrange.
+ *
+ * Work: the page range that the caller is currently working with.
+ * May be null.
+ *
+ * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
+ * the first segment is erased (which, if called by uvm_pmr_getpages(),
+ * probably is the smallest or very close to it).
+ */
+psize_t
+uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
+ struct vm_page **work, int is_desperate)
+{
+ struct vm_page *start, *end, *iter, *iter_end, *inserted;
+ psize_t count;
+ struct uvm_pmemrange *pmr, *pmr_iter;
+
+ KASSERT(!TAILQ_EMPTY(pgl));
+
+ /*
+ * Initialize to first page.
+ * Unless desperate scan finds a better candidate, this is what'll be
+ * erased.
+ */
+ start = TAILQ_FIRST(pgl);
+ pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
+ end = uvm_pmr_findnextsegment(pmr, start, boundary);
+
+ /*
+ * If we are desperate, we _really_ want to get rid of the smallest
+ * element (rather than a close match to the smallest element).
+ */
+ if (is_desperate) {
+ /* Linear search for smallest segment. */
+ pmr_iter = pmr;
+ for (iter = TAILQ_NEXT(end, pageq);
+ iter != NULL && start != end;
+ iter = TAILQ_NEXT(iter_end, pageq)) {
+ /*
+ * Only update pmr if it doesn't match current
+ * iteration.
+ */
+ if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
+ pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
+ pmr_iter = uvm_pmemrange_find(atop(
+ VM_PAGE_TO_PHYS(iter)));
+ }
+
+ iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
+ boundary);
+
+ /*
+ * Current iteration is smaller than best match so
+ * far; update.
+ */
+ if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
+ VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
+ start = iter;
+ end = iter_end;
+ pmr = pmr_iter;
+ }
+ }
+ }
+
+ /*
+ * Calculate count and end of the list.
+ */
+ count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
+ end = TAILQ_NEXT(end, pageq);
+
+ /*
+ * Actually remove the range of pages.
+ *
+ * Sadly, this cannot be done using pointer iteration:
+ * vm_physseg is not guaranteed to be sorted on address, hence
+ * uvm_page_init() may not have initialized its array sorted by
+ * page number.
+ */
+ for (iter = start; iter != end; iter = iter_end) {
+ iter_end = TAILQ_NEXT(iter, pageq);
+ TAILQ_REMOVE(pgl, iter, pageq);
+ }
+
+ start->fpgsz = count;
+ inserted = uvm_pmr_insert(pmr, start, 0);
+
+ /*
+ * If the caller was working on a range and this function modified
+ * that range, update the pointer.
+ */
+ if (work != NULL && *work != NULL &&
+ atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
+ atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
+ atop(VM_PAGE_TO_PHYS(*work)))
+ *work = inserted;
+ return count;
+}
+
+/*
+ * Extract a number of pages from a segment of free pages.
+ * Called by uvm_pmr_getpages.
+ *
+ * Returns the segment that was created from pages left over at the tail
+ * of the remove set of pages, or NULL if no pages were left at the tail.
+ */
+struct vm_page *
+uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
+ paddr_t start, paddr_t end, struct pglist *result)
+{
+ struct vm_page *after, *pg_i;
+ psize_t before_sz, after_sz;
+#ifdef DEBUG
+ psize_t i;
+#endif
+
+ KDASSERT(end > start);
+ KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
+ KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
+ KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
+ KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
+
+ before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
+ after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
+ KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
+ uvm_pmr_assertvalid(pmr);
+
+ uvm_pmr_remove_size(pmr, pg);
+ if (before_sz == 0)
+ uvm_pmr_remove_addr(pmr, pg);
+
+ /* Add selected pages to result. */
+ for (pg_i = pg + before_sz; atop(VM_PAGE_TO_PHYS(pg_i)) < end;
+ pg_i++) {
+ KDASSERT(pg_i->pg_flags & PQ_FREE);
+ pg_i->fpgsz = 0;
+ TAILQ_INSERT_TAIL(result, pg_i, pageq);
+ }
+
+ /* Before handling. */
+ if (before_sz > 0) {
+ pg->fpgsz = before_sz;
+ uvm_pmr_insert_size(pmr, pg);
+ }
+
+ /* After handling. */
+ after = NULL;
+ if (after_sz > 0) {
+ after = pg + before_sz + (end - start);
+#ifdef DEBUG
+ for (i = 0; i < after_sz; i++) {
+ KASSERT(!uvm_pmr_isfree(after + i));
+ }
+#endif
+ KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
+ after->fpgsz = after_sz;
+ after = uvm_pmr_insert_addr(pmr, after, 1);
+ uvm_pmr_insert_size(pmr, after);
+ }
+
+ uvm_pmr_assertvalid(pmr);
+ return after;
+}
+
+/*
+ * Acquire a number of pages.
+ *
+ * count: the number of pages returned
+ * start: lowest page number
+ * end: highest page number +1
+ * (start = end = 0: no limitation)
+ * align: power-of-2 alignment constraint (align = 1: no alignment)
+ * boundary: power-of-2 boundary (boundary = 0: no boundary)
+ * maxseg: maximum number of segments to return
+ * flags: UVM_PLA_* flags
+ * result: returned pages storage (uses pageq)
+ */
+int
+uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
+ paddr_t boundary, int maxseg, int flags, struct pglist *result)
+{
+ struct uvm_pmemrange *pmr; /* Iterate memory ranges. */
+ struct vm_page *found, *f_next; /* Iterate chunks. */
+ psize_t fcount; /* Current found pages. */
+ int fnsegs; /* Current segment counter. */
+ int try, start_try;
+ psize_t search[3];
+ paddr_t fstart, fend; /* Pages to be taken from found. */
+ int memtype; /* Requested memtype. */
+ int memtype_init; /* Best memtype. */
+ int desperate; /* True if allocation failed. */
+
+ /*
+ * Validate arguments.
+ */
+ KASSERT(count > 0 &&
+ (start == 0 || end == 0 || start < end) &&
+ align >= 1 && powerof2(align) &&
+ maxseg > 0 &&
+ (boundary == 0 || powerof2(boundary)) &&
+ (boundary == 0 || maxseg * boundary >= count) &&
+ TAILQ_EMPTY(result));
+
+ /*
+ * TRYCONTIG is a noop if you only want a single segment.
+ * Remove it if that's the case: otherwise it'll deny the fast
+ * allocation.
+ */
+ if (maxseg == 1 || count == 1)
+ flags &= ~UVM_PLA_TRYCONTIG;
+
+ /*
+ * Configure search.
+ *
+ * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
+ * search[1] is multiple segments, chosen to fulfill the search in
+ * approximately even-sized segments.
+ * This is a good trade-off between slightly reduced allocation speed
+ * and less fragmentation.
+ * search[2] is the worst case, in which all segments are evaluated.
+ * This provides the least fragmentation, but makes the search
+ * possibly longer (although in the case it is selected, that no
+ * longer matters most).
+ *
+ * The exception is when maxseg == 1: since we can only fulfill that
+ * with one segment of size pages, only a single search type has to
+ * be attempted.
+ */
+ if (maxseg == 1 || count == 1) {
+ start_try = 2;
+ search[2] = count;
+ } else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
+ start_try = 2;
+ search[2] = 1;
+ } else {
+ start_try = 0;
+ search[0] = count;
+ search[1] = pow2divide(count, maxseg);
+ search[2] = 1;
+ if ((flags & UVM_PLA_TRYCONTIG) == 0)
+ start_try = 1;
+ if (search[1] >= search[0]) {
+ search[1] = search[0];
+ start_try = 1;
+ }
+ if (search[2] >= search[start_try]) {
+ start_try = 2;
+ }
+ }
+
+ /*
+ * Memory type: if zeroed memory is requested, traverse the zero set.
+ * Otherwise, traverse the dirty set.
+ *
+ * The memtype iterator is reinitialized to memtype_init on entrance
+ * of a pmemrange.
+ */
+ if (flags & UVM_PLA_ZERO)
+ memtype_init = UVM_PMR_MEMTYPE_ZERO;
+ else
+ memtype_init = UVM_PMR_MEMTYPE_DIRTY;
+
+ /*
+ * Initially, we're not desperate.
+ *
+ * Note that if we return from a sleep, we are still desperate.
+ * Chances are that memory pressure is still high, so resetting
+ * seems over-optimistic to me.
+ */
+ desperate = 0;
+
+ReTry: /* Return point after sleeping. */
+ fcount = 0;
+ fnsegs = 0;
+
+ uvm_lock_fpageq();
+
+ReTryDesperate:
+ /*
+ * If we just want any page(s), go for the really fast option.
+ */
+ if (count <= maxseg && align == 1 && boundary == 0 &&
+ (flags & UVM_PLA_TRYCONTIG) == 0) {
+ fcount += uvm_pmr_get1page(count - fcount, memtype_init,
+ result, start, end);
+
+ /*
+ * If we found sufficient pages, go to the success exit code.
+ *
+ * Otherwise, go immediately to fail, since we collected
+ * all we could anyway.
+ */
+ if (fcount == count)
+ goto Out;
+ else
+ goto Fail;
+ }
+
+ /*
+ * The heart of the contig case.
+ *
+ * The code actually looks like this:
+ *
+ * foreach (struct pmemrange) {
+ * foreach (memtype) {
+ * foreach(try) {
+ * foreach (free range of memtype in pmemrange,
+ * starting at search[try]) {
+ * while (range has space left)
+ * take from range
+ * }
+ * }
+ * }
+ *
+ * if next pmemrange has higher usecount than current:
+ * enter desperate case (which will drain the pmemranges
+ * until empty prior to moving to the next one)
+ * }
+ *
+ * When desperate is activated, try always starts at the highest
+ * value. The memtype loop is using a goto ReScanMemtype.
+ * The try loop is using a goto ReScan.
+ * The 'range has space left' loop uses label DrainFound.
+ *
+ * Writing them all as loops would take up a lot of screen space in
+ * the form of indentation and some parts are easier to express
+ * using the labels.
+ */
+
+ TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
+ /* Empty range. */
+ if (pmr->nsegs == 0)
+ continue;
+
+ /* Outside requested range. */
+ if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
+ continue;
+
+ memtype = memtype_init;
+
+ReScanMemtype: /* Return point at memtype++. */
+ try = start_try;
+
+ReScan: /* Return point at try++. */
+ for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
+ found != NULL;
+ found = f_next) {
+ f_next = uvm_pmr_nextsz(pmr, found, memtype);
+
+ fstart = atop(VM_PAGE_TO_PHYS(found));
+ if (start != 0)
+ fstart = MAX(start, fstart);
+DrainFound:
+ /*
+ * Throw away the first segment if fnsegs == maxseg
+ *
+ * Note that f_next is still valid after this call,
+ * since we only allocated from entries before f_next.
+ * We don't revisit the entries we already extracted
+ * from unless we entered the desperate case.
+ */
+ if (fnsegs == maxseg) {
+ fnsegs--;
+ fcount -=
+ uvm_pmr_remove_1strange(result, boundary,
+ &found, desperate);
+ }
+
+ fstart = PMR_ALIGN(fstart, align);
+ fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
+ if (fstart >= fend)
+ continue;
+ if (boundary != 0) {
+ fend =
+ MIN(fend, PMR_ALIGN(fstart + 1, boundary));
+ }
+ if (end != 0)
+ fend = MIN(end, fend);
+ if (fend - fstart > count - fcount)
+ fend = fstart + (count - fcount);
+
+ fcount += fend - fstart;
+ fnsegs++;
+ found = uvm_pmr_extract_range(pmr, found,
+ fstart, fend, result);
+
+ if (fcount == count)
+ goto Out;
+
+ /*
+ * If there's still space left in found, try to
+			 * fully drain it prior to continuing.
+ */
+ if (found != NULL) {
+ fstart = fend;
+ goto DrainFound;
+ }
+ }
+
+ /*
+ * Try a smaller search now.
+ */
+ if (++try < nitems(search))
+ goto ReScan;
+
+ /*
+ * Exhaust all memory types prior to going to the next memory
+ * segment.
+		 * This means that both zeroed and dirty memory are consumed
+		 * before moving to a pmemrange with a higher use count.
+ *
+ * Code is basically a difficult way of writing:
+ * memtype = memtype_init;
+ * do {
+ * ...;
+ * memtype += 1;
+ * memtype %= MEMTYPE_MAX;
+ * } while (memtype != memtype_init);
+ */
+ memtype += 1;
+ if (memtype == UVM_PMR_MEMTYPE_MAX)
+ memtype = 0;
+ if (memtype != memtype_init)
+ goto ReScanMemtype;
+
+ /*
+ * If not desperate, enter desperate case prior to eating all
+ * the good stuff in the next range.
+ */
+ if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
+ TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
+ break;
+ }
+
+ /*
+ * Not enough memory of the requested type available. Fall back to
+ * less good memory that we'll clean up better later.
+ *
+	 * This algorithm is not very smart though: it just starts scanning
+	 * a differently typed range, and the nicer ranges of the previous
+	 * iteration may fall out. Hence there is a small chance of a false
+ * negative.
+ *
+	 * When desperate: scan all sizes starting at the smallest
+ * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may
+ * allow us to hit the fast path now).
+ *
+ * Also, because we will revisit entries we scanned before, we need
+ * to reset the page queue, or we may end up releasing entries in
+ * such a way as to invalidate f_next.
+ */
+ if (!desperate) {
+ desperate = 1;
+ start_try = nitems(search) - 1;
+ flags &= ~UVM_PLA_TRYCONTIG;
+
+ while (!TAILQ_EMPTY(result))
+ uvm_pmr_remove_1strange(result, 0, NULL, 0);
+ fnsegs = 0;
+ fcount = 0;
+ goto ReTryDesperate;
+ }
+
+Fail:
+ /*
+ * Allocation failed.
+ */
+
+ /* XXX: claim from memory reserve here */
+
+ while (!TAILQ_EMPTY(result))
+ uvm_pmr_remove_1strange(result, 0, NULL, 0);
+ uvm_unlock_fpageq();
+
+ if (flags & UVM_PLA_WAITOK) {
+ uvm_wait("uvm_pmr_getpages");
+ goto ReTry;
+ } else
+ wakeup(&uvm.pagedaemon_proc);
+
+ return ENOMEM;
+
+Out:
+
+ /*
+	 * Allocation successful.
+ */
+
+ uvmexp.free -= fcount;
+
+ uvm_unlock_fpageq();
+
+ /* Update statistics and zero pages if UVM_PLA_ZERO. */
+ TAILQ_FOREACH(found, result, pageq) {
+ atomic_clearbits_int(&found->pg_flags,
+ PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
+
+ if (found->pg_flags & PG_ZERO) {
+ uvmexp.zeropages--;
+ }
+ if (flags & UVM_PLA_ZERO) {
+ if (found->pg_flags & PG_ZERO)
+ uvmexp.pga_zerohit++;
+ else {
+ uvmexp.pga_zeromiss++;
+ uvm_pagezero(found);
+ }
+ }
+ atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
+
+ found->uobject = NULL;
+ found->uanon = NULL;
+ found->pg_version++;
+
+ /*
+		 * Validate that the page matches the range criteria.
+ */
+ KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
+ KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
+ }
+
+ return 0;
+}
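+
+/*
+ * Usage sketch (hypothetical caller; all values below are illustrative):
+ * request 16 zeroed pages anywhere in memory (start and end of 0 mean
+ * no address constraint), with no alignment or boundary requirement and
+ * up to 16 segments. Argument order follows the prototype in
+ * uvm_pmemrange.h: count, start, end, align, boundary, maxseg, flags,
+ * result; start, end, align and boundary are page numbers.
+ *
+ *	struct pglist pgl;
+ *	int error;
+ *
+ *	TAILQ_INIT(&pgl);
+ *	error = uvm_pmr_getpages(16, 0, 0, 1, 0, 16,
+ *	    UVM_PLA_WAITOK | UVM_PLA_ZERO, &pgl);
+ *	if (error == 0) {
+ *		... use the pages on pgl ...
+ *		uvm_pmr_freepageq(&pgl);
+ *	}
+ */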
+
+/*
+ * Free a number of contig pages (invoked by uvm_page_init).
+ */
+void
+uvm_pmr_freepages(struct vm_page *pg, psize_t count)
+{
+ struct uvm_pmemrange *pmr;
+ psize_t i, pmr_count;
+
+ for (i = 0; i < count; i++) {
+ KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
+ atop(VM_PAGE_TO_PHYS(pg)) + i);
+
+ if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
+ VALID_FLAGS(pg[i].pg_flags))) {
+ printf("Flags: 0x%x, will panic now.\n",
+ pg[i].pg_flags);
+ }
+ KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
+ VALID_FLAGS(pg[i].pg_flags));
+ atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
+ atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
+ }
+
+ uvm_lock_fpageq();
+
+ while (count > 0) {
+ pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
+ KASSERT(pmr != NULL);
+
+ pmr_count = MIN(count, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
+ pg->fpgsz = pmr_count;
+ uvm_pmr_insert(pmr, pg, 0);
+
+ uvmexp.free += pmr_count;
+ count -= pmr_count;
+ pg += pmr_count;
+ }
+ wakeup(&uvmexp.free);
+
+ uvm_unlock_fpageq();
+}
+
+/*
+ * Free all pages in the queue.
+ */
+void
+uvm_pmr_freepageq(struct pglist *pgl)
+{
+ struct vm_page *pg;
+
+ TAILQ_FOREACH(pg, pgl, pageq) {
+ if (!((pg->pg_flags & PQ_FREE) == 0 &&
+ VALID_FLAGS(pg->pg_flags))) {
+ printf("Flags: 0x%x, will panic now.\n",
+ pg->pg_flags);
+ }
+ KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
+ VALID_FLAGS(pg->pg_flags));
+ atomic_setbits_int(&pg->pg_flags, PQ_FREE);
+ atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
+ }
+
+ uvm_lock_fpageq();
+ while (!TAILQ_EMPTY(pgl))
+ uvmexp.free += uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
+ wakeup(&uvmexp.free);
+ uvm_unlock_fpageq();
+
+ return;
+}
+
+/*
+ * Store a pmemrange in the list.
+ *
+ * The list is sorted by use.
+ */
+struct uvm_pmemrange *
+uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
+ struct uvm_pmemrange *pmr)
+{
+ struct uvm_pmemrange *iter;
+ int cmp = 1;
+
+ TAILQ_FOREACH(iter, useq, pmr_use) {
+ cmp = uvm_pmemrange_use_cmp(pmr, iter);
+ if (cmp == 0)
+ return iter;
+ if (cmp == -1)
+ break;
+ }
+
+ if (iter == NULL)
+ TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
+ else
+ TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
+ return NULL;
+}
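+
+/*
+ * Example (hypothetical use counters): after inserting ranges with use
+ * counters 0, 0 and 2, the queue is ordered
+ *
+ *	use=0 range -> use=0 range -> use=2 range
+ *
+ * so the allocation loops, which walk uvm.pmr_control.use from the head,
+ * drain the cheapest ranges first.
+ */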
+
+#ifdef DEBUG
+/*
+ * Validation of the whole pmemrange.
+ * Called with fpageq locked.
+ */
+void
+uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
+{
+ struct vm_page *prev, *next, *i, *xref;
+ int lcv, mti;
+
+ /* Validate address tree. */
+ RB_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
+ /* Validate the range. */
+ KASSERT(i->fpgsz > 0);
+ KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
+ KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
+ <= pmr->high);
+
+ /* Validate each page in this range. */
+ for (lcv = 0; lcv < i->fpgsz; lcv++) {
+ /*
+ * Only the first page has a size specification.
+			 * The rest have size 0.
+ */
+ KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
+ /*
+ * Flag check.
+ */
+ KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
+ (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
+ /*
+ * Free pages are:
+ * - not wired
+ * - not loaned
+ * - have no vm_anon
+ * - have no uvm_object
+ */
+ KASSERT(i[lcv].wire_count == 0);
+ KASSERT(i[lcv].loan_count == 0);
+ KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
+ i[lcv].uanon == NULL);
+ KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
+ i[lcv].uobject == NULL);
+ /*
+ * Pages in a single range always have the same
+ * memtype.
+ */
+ KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
+ uvm_pmr_pg_to_memtype(&i[lcv]));
+ }
+
+ /* Check that it shouldn't be joined with its predecessor. */
+ prev = RB_PREV(uvm_pmr_addr, &pmr->addr, i);
+ if (prev != NULL) {
+ KASSERT(uvm_pmr_pg_to_memtype(i) !=
+ uvm_pmr_pg_to_memtype(prev) ||
+ atop(VM_PAGE_TO_PHYS(i)) >
+ atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
+ prev + prev->fpgsz != i);
+ }
+
+ /* Assert i is in the size tree as well. */
+ if (i->fpgsz == 1) {
+ TAILQ_FOREACH(xref,
+ &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
+ if (xref == i)
+ break;
+ }
+ KASSERT(xref == i);
+ } else {
+ KASSERT(RB_FIND(uvm_pmr_size,
+ &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
+ i + 1);
+ }
+ }
+
+ /* Validate size tree. */
+ for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
+ for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
+ next = uvm_pmr_nextsz(pmr, i, mti);
+ if (next != NULL) {
+ KASSERT(i->fpgsz <=
+ next->fpgsz);
+ }
+
+ /* Assert i is in the addr tree as well. */
+ KASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
+
+ /* Assert i is of the correct memory type. */
+ KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
+ }
+ }
+
+ /* Validate nsegs statistic. */
+ lcv = 0;
+ RB_FOREACH(i, uvm_pmr_addr, &pmr->addr)
+ lcv++;
+ KASSERT(pmr->nsegs == lcv);
+}
+#endif /* DEBUG */
+
+/*
+ * Split pmr at split point pageno.
+ * Called with fpageq unlocked.
+ *
+ * Split is only applied if a pmemrange spans pageno.
+ */
+void
+uvm_pmr_split(paddr_t pageno)
+{
+ struct uvm_pmemrange *pmr, *drain;
+ struct vm_page *rebuild, *prev, *next;
+ psize_t prev_sz;
+
+ uvm_lock_fpageq();
+ pmr = uvm_pmemrange_find(pageno);
+ if (pmr == NULL || !(pmr->low < pageno)) {
+ /* No split required. */
+ uvm_unlock_fpageq();
+ return;
+ }
+
+ KASSERT(pmr->low < pageno);
+ KASSERT(pmr->high > pageno);
+
+ drain = uvm_pmr_allocpmr();
+ drain->low = pageno;
+ drain->high = pmr->high;
+ drain->use = pmr->use;
+
+ uvm_pmr_assertvalid(pmr);
+ uvm_pmr_assertvalid(drain);
+ KASSERT(drain->nsegs == 0);
+
+ RB_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
+ if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
+ break;
+ }
+ if (rebuild == NULL)
+ prev = RB_MAX(uvm_pmr_addr, &pmr->addr);
+ else
+ prev = RB_PREV(uvm_pmr_addr, &pmr->addr, rebuild);
+ KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
+
+ /*
+ * Handle free chunk that spans the split point.
+ */
+ if (prev != NULL &&
+ atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
+ psize_t before, after;
+
+ KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
+
+ uvm_pmr_remove(pmr, prev);
+ prev_sz = prev->fpgsz;
+ before = pageno - atop(VM_PAGE_TO_PHYS(prev));
+ after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
+
+ KASSERT(before > 0);
+ KASSERT(after > 0);
+
+ prev->fpgsz = before;
+ uvm_pmr_insert(pmr, prev, 1);
+ (prev + before)->fpgsz = after;
+ uvm_pmr_insert(drain, prev + before, 1);
+ }
+
+ /*
+ * Move free chunks that no longer fall in the range.
+ */
+ for (; rebuild != NULL; rebuild = next) {
+ next = RB_NEXT(uvm_pmr_addr, &pmr->addr, rebuild);
+
+ uvm_pmr_remove(pmr, rebuild);
+ uvm_pmr_insert(drain, rebuild, 1);
+ }
+
+ pmr->high = pageno;
+ uvm_pmr_assertvalid(pmr);
+ uvm_pmr_assertvalid(drain);
+
+ RB_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
+ uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
+ uvm_unlock_fpageq();
+}
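+
+/*
+ * Split illustration (hypothetical page numbers): a free chunk covering
+ * pages [8, 16) in a pmemrange that is split at pageno 12 is divided
+ * into [8, 12), which stays in the original range, and [12, 16), which
+ * moves to the newly allocated drain range.
+ */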
+
+/*
+ * Increase the usage counter for the given range of memory.
+ *
+ * The higher the use counter of a given range of memory, the harder
+ * the allocator will try to avoid allocating from it.
+ *
+ * Addresses here are paddr_t values, not page numbers; low and high
+ * are the lowest and highest allowed addresses.
+ */
+void
+uvm_pmr_use_inc(paddr_t low, paddr_t high)
+{
+ struct uvm_pmemrange *pmr;
+
+ /*
+ * If high+1 == 0 and low == 0, then you are increasing use
+ * of the whole address space, which won't make any difference.
+ * Skip in that case.
+ */
+ high++;
+ if (high == 0 && low == 0)
+ return;
+
+ /*
+ * pmr uses page numbers, translate low and high.
+ */
+ low = atop(round_page(low));
+ high = atop(trunc_page(high));
+ uvm_pmr_split(low);
+ uvm_pmr_split(high);
+
+ uvm_lock_fpageq();
+
+ /* Increase use count on segments in range. */
+ RB_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
+ if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
+ TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
+ pmr->use++;
+ uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
+ }
+ uvm_pmr_assertvalid(pmr);
+ }
+
+ uvm_unlock_fpageq();
+}
+
+/*
+ * Allocate a pmemrange.
+ *
+ * If called from uvm_page_init, uvm_pageboot_alloc is used.
+ * If called after uvm_init, malloc is used.
+ * (And if called in between, you're dead.)
+ */
+struct uvm_pmemrange *
+uvm_pmr_allocpmr()
+{
+ struct uvm_pmemrange *nw;
+ int i;
+
+ if (!uvm.page_init_done) {
+ nw = (struct uvm_pmemrange *)
+ uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
+ bzero(nw, sizeof(struct uvm_pmemrange));
+ } else {
+ nw = malloc(sizeof(struct uvm_pmemrange),
+ M_VMMAP, M_NOWAIT | M_ZERO);
+ }
+ RB_INIT(&nw->addr);
+ for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
+ RB_INIT(&nw->size[i]);
+ TAILQ_INIT(&nw->single[i]);
+ }
+ return nw;
+}
+
+static const struct uvm_io_ranges uvm_io_ranges[] = UVM_IO_RANGES;
+
+/*
+ * Initialization of pmr.
+ * Called by uvm_page_init.
+ *
+ * Sets up pmemranges.
+ */
+void
+uvm_pmr_init(void)
+{
+ struct uvm_pmemrange *new_pmr;
+ int i;
+
+ TAILQ_INIT(&uvm.pmr_control.use);
+ RB_INIT(&uvm.pmr_control.addr);
+
+ new_pmr = uvm_pmr_allocpmr();
+ new_pmr->low = 0;
+ new_pmr->high = atop((paddr_t)-1) + 1;
+
+ RB_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
+ uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
+
+ for (i = 0; i < nitems(uvm_io_ranges); i++)
+ uvm_pmr_use_inc(uvm_io_ranges[i].low, uvm_io_ranges[i].high);
+}
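+
+/*
+ * Illustration: UVM_IO_RANGES is a machine-dependent list of physical
+ * address ranges that should stay available for I/O. uvm_pmr_use_inc()
+ * splits the pmemranges at those boundaries and raises their use
+ * counter, so the allocators prefer other memory and leave these ranges
+ * to callers with strict address constraints. A hypothetical definition
+ * (assuming field order low, high) that deprioritizes memory below 4GB:
+ *
+ *	#define UVM_IO_RANGES	{ { 0x0, 0xffffffffULL } }
+ */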
+
+/*
+ * Find the pmemrange that contains the given page number.
+ *
+ * (Manually traverses the binary tree, because that is cheaper in
+ * terms of stack usage.)
+ */
+struct uvm_pmemrange *
+uvm_pmemrange_find(paddr_t pageno)
+{
+ struct uvm_pmemrange *pmr;
+
+ pmr = RB_ROOT(&uvm.pmr_control.addr);
+ while (pmr != NULL) {
+ if (pmr->low > pageno)
+ pmr = RB_LEFT(pmr, pmr_addr);
+ else if (pmr->high <= pageno)
+ pmr = RB_RIGHT(pmr, pmr_addr);
+ else
+ break;
+ }
+
+ return pmr;
+}
+
+#if defined(DDB) || defined(DEBUG)
+/*
+ * Return true if the given page is in any of the free lists.
+ * Used by uvm_page_printit.
+ * This function is safe, even if the page is not on the freeq.
+ * Note: does not apply locking, only called from ddb.
+ */
+int
+uvm_pmr_isfree(struct vm_page *pg)
+{
+ struct vm_page *r;
+ struct uvm_pmemrange *pmr;
+
+ pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
+ if (pmr == NULL)
+ return 0;
+ r = RB_NFIND(uvm_pmr_addr, &pmr->addr, pg);
+ if (r == NULL)
+ r = RB_MAX(uvm_pmr_addr, &pmr->addr);
+ else
+ r = RB_PREV(uvm_pmr_addr, &pmr->addr, r);
+ if (r == NULL)
+ return 0; /* Empty tree. */
+
+ KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
+ return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
+ atop(VM_PAGE_TO_PHYS(pg));
+}
+#endif /* DDB || DEBUG */
+
+/*
+ * Given the root of a (sub)tree, find a range that intersects [start, end)
+ * and is of the given memtype.
+ *
+ * Page must be in the address tree.
+ */
+struct vm_page*
+uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
+ paddr_t start, paddr_t end, int memtype)
+{
+ int direction;
+ struct vm_page *root;
+ struct vm_page *high, *high_next;
+ struct vm_page *low, *low_next;
+
+ KDASSERT(pmr != NULL && init_root != NULL);
+ root = init_root;
+
+ /*
+ * Which direction to use for searching.
+ */
+ if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
+ direction = 1;
+ else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
+ direction = -1;
+ else /* nothing to do */
+ return root;
+
+ /*
+ * First, update root to fall within the chosen range.
+ */
+ while (root && !PMR_INTERSECTS_WITH(
+ atop(VM_PAGE_TO_PHYS(root)),
+ atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
+ start, end)) {
+ if (direction == 1)
+ root = RB_RIGHT(root, objt);
+ else
+ root = RB_LEFT(root, objt);
+ }
+ if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
+ return root;
+
+ /*
+ * Root is valid, but of the wrong memtype.
+ *
+ * Try to find a range that has the given memtype in the subtree
+ * (memtype mismatches are costly, either because the conversion
+ * is expensive, or a later allocation will need to do the opposite
+ * conversion, which will be expensive).
+	 *
+	 * First, simply increase the address until we hit something we can use.
+ * Cache the upper page, so we can page-walk later.
+ */
+ high = root;
+ high_next = RB_RIGHT(high, objt);
+ while (high_next != NULL && PMR_INTERSECTS_WITH(
+ atop(VM_PAGE_TO_PHYS(high_next)),
+ atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
+ start, end)) {
+ high = high_next;
+ if (uvm_pmr_pg_to_memtype(high) == memtype)
+ return high;
+ high_next = RB_RIGHT(high, objt);
+ }
+
+ /*
+ * Second, decrease the address until we hit something we can use.
+ * Cache the lower page, so we can page-walk later.
+ */
+ low = root;
+	low_next = RB_LEFT(low, objt);
+ while (low_next != NULL && PMR_INTERSECTS_WITH(
+ atop(VM_PAGE_TO_PHYS(low_next)),
+ atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
+ start, end)) {
+ low = low_next;
+ if (uvm_pmr_pg_to_memtype(low) == memtype)
+ return low;
+		low_next = RB_LEFT(low, objt);
+ }
+
+ /*
+	 * Ack, no hits. Walk the address tree until we find something usable.
+ */
+ for (low = RB_NEXT(uvm_pmr_addr, &pmr->addr, low);
+ low != high;
+ low = RB_NEXT(uvm_pmr_addr, &pmr->addr, low)) {
+ KASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(high_next)),
+ atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
+ start, end));
+ if (uvm_pmr_pg_to_memtype(low) == memtype)
+ return low;
+ }
+
+ /*
+ * Nothing found.
+ */
+ return NULL;
+}
+
+/*
+ * Allocate any page, the fastest way. Page number constraints only.
+ */
+int
+uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
+ paddr_t start, paddr_t end)
+{
+ struct uvm_pmemrange *pmr;
+ struct vm_page *found, *splitpg;
+ psize_t fcount;
+ int memtype;
+
+ fcount = 0;
+ for (pmr = TAILQ_FIRST(&uvm.pmr_control.use);
+ pmr != NULL && fcount != count; pmr = TAILQ_NEXT(pmr, pmr_use)) {
+ /* Outside requested range. */
+ if (!(start == 0 && end == 0) &&
+ !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
+ continue;
+
+ /* Range is empty. */
+ if (pmr->nsegs == 0)
+ continue;
+
+ /*
+ * Loop over all memtypes, starting at memtype_init.
+ */
+ memtype = memtype_init;
+ do {
+ found = TAILQ_FIRST(&pmr->single[memtype]);
+ /*
+ * If found is outside the range, walk the list
+ * until we find something that intersects with
+ * boundaries.
+ */
+ while (found && !PMR_INTERSECTS_WITH(
+ atop(VM_PAGE_TO_PHYS(found)),
+ atop(VM_PAGE_TO_PHYS(found)) + 1,
+ start, end))
+ found = TAILQ_NEXT(found, pageq);
+
+ if (found == NULL) {
+ found = RB_ROOT(&pmr->size[memtype]);
+ /* Size tree gives pg[1] instead of pg[0] */
+ if (found != NULL)
+ found--;
+
+ found = uvm_pmr_rootupdate(pmr, found,
+ start, end, memtype);
+ }
+ if (found != NULL) {
+ uvm_pmr_assertvalid(pmr);
+ uvm_pmr_remove_size(pmr, found);
+
+ /*
+ * If the page intersects the end, then it'll
+ * need splitting.
+ *
+ * Note that we don't need to split if the page
+ * intersects start: the drain function will
+ * simply stop on hitting start.
+ */
+ if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
+ found->fpgsz > end) {
+ psize_t splitsz =
+ atop(VM_PAGE_TO_PHYS(found)) +
+ found->fpgsz - end;
+
+ uvm_pmr_remove_addr(pmr, found);
+ uvm_pmr_assertvalid(pmr);
+ found->fpgsz -= splitsz;
+ splitpg = found + found->fpgsz;
+ splitpg->fpgsz = splitsz;
+ uvm_pmr_insert(pmr, splitpg, 1);
+
+ /*
+ * At this point, splitpg and found
+ * actually should be joined.
+ * But we explicitly disable that,
+ * because we will start subtracting
+ * from found.
+ */
+ KASSERT(start == 0 ||
+ atop(VM_PAGE_TO_PHYS(found)) +
+ found->fpgsz > start);
+ uvm_pmr_insert_addr(pmr, found, 1);
+ }
+
+ /*
+ * Fetch pages from the end.
+ * If the range is larger than the requested
+ * number of pages, this saves us an addr-tree
+ * update.
+ *
+				 * Since we take from the end and insert at
+				 * the head, the pages of a range end up
+				 * adjacent and in order on the result list.
+ */
+ while (found->fpgsz > 0 && fcount < count &&
+ (start == 0 ||
+ atop(VM_PAGE_TO_PHYS(found)) +
+ found->fpgsz > start)) {
+ found->fpgsz--;
+ fcount++;
+ TAILQ_INSERT_HEAD(result,
+ &found[found->fpgsz], pageq);
+ }
+ if (found->fpgsz > 0) {
+ uvm_pmr_insert_size(pmr, found);
+ KDASSERT(fcount == count);
+ uvm_pmr_assertvalid(pmr);
+ return fcount;
+ }
+
+ /*
+ * Delayed addr-tree removal.
+ */
+ uvm_pmr_remove_addr(pmr, found);
+ uvm_pmr_assertvalid(pmr);
+ } else {
+ /*
+ * Skip to the next memtype.
+ */
+ memtype += 1;
+ if (memtype == UVM_PMR_MEMTYPE_MAX)
+ memtype = 0;
+ }
+ } while (memtype != memtype_init && fcount != count);
+ }
+
+ /*
+ * Search finished.
+ *
+ * Ran out of ranges before enough pages were gathered, or we hit the
+ * case where found->fpgsz == count - fcount, in which case the
+ * above exit condition didn't trigger.
+ *
+ * On failure, caller will free the pages.
+ */
+ return fcount;
+}
+
+#ifdef DDB
+/*
+ * Print information about pmemrange.
+ * Does not do locking (so either call it from DDB or acquire the fpageq
+ * lock before invoking).
+ */
+void
+uvm_pmr_print(void)
+{
+ struct uvm_pmemrange *pmr;
+ struct vm_page *pg;
+ psize_t size[UVM_PMR_MEMTYPE_MAX];
+ psize_t free;
+ int useq_len;
+ int mt;
+
+ printf("Ranges, use queue:\n");
+ useq_len = 0;
+ TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
+ useq_len++;
+ free = 0;
+ for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
+ pg = RB_MAX(uvm_pmr_size, &pmr->size[mt]);
+ if (pg != NULL)
+ pg--;
+ else
+ pg = TAILQ_FIRST(&pmr->single[mt]);
+ size[mt] = (pg == NULL ? 0 : pg->fpgsz);
+
+ RB_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
+ free += pg->fpgsz;
+ }
+
+ printf("* [0x%lx-0x%lx] use=%d nsegs=%ld",
+ (unsigned long)pmr->low, (unsigned long)pmr->high,
+ pmr->use, (unsigned long)pmr->nsegs);
+ for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
+ printf(" maxsegsz[%d]=0x%lx", mt,
+ (unsigned long)size[mt]);
+ }
+ printf(" free=0x%lx\n", (unsigned long)free);
+ }
+ printf("#ranges = %d\n", useq_len);
+}
+#endif
diff --git a/sys/uvm/uvm_pmemrange.h b/sys/uvm/uvm_pmemrange.h
new file mode 100644
index 00000000000..3fb477e6ef7
--- /dev/null
+++ b/sys/uvm/uvm_pmemrange.h
@@ -0,0 +1,83 @@
+/* $OpenBSD: uvm_pmemrange.h,v 1.5 2010/04/22 19:02:55 oga Exp $ */
+
+/*
+ * Copyright (c) 2009 Ariane van der Steldt <ariane@stack.nl>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * uvm_pmemrange.h: describe and manage free physical memory.
+ */
+
+#ifndef _UVM_UVM_PMEMRANGE_H_
+#define _UVM_UVM_PMEMRANGE_H_
+
+#include <uvm/uvm_extern.h>
+#include <uvm/uvm_page.h>
+
+RB_HEAD(uvm_pmr_addr, vm_page);
+RB_HEAD(uvm_pmr_size, vm_page);
+
+/*
+ * Page types available:
+ * - DIRTY: this page may contain random data.
+ * - ZERO: this page has been zeroed.
+ */
+#define UVM_PMR_MEMTYPE_DIRTY 0
+#define UVM_PMR_MEMTYPE_ZERO 1
+#define UVM_PMR_MEMTYPE_MAX 2
+
+/*
+ * An address range of memory.
+ */
+struct uvm_pmemrange {
+ struct uvm_pmr_addr addr; /* Free page chunks, sorted by addr. */
+ struct uvm_pmr_size size[UVM_PMR_MEMTYPE_MAX];
+ /* Free page chunks, sorted by size. */
+ TAILQ_HEAD(, vm_page) single[UVM_PMR_MEMTYPE_MAX];
+ /* single page regions (uses pageq) */
+
+ paddr_t low; /* Start of address range (pgno). */
+ paddr_t high; /* End +1 (pgno). */
+ int use; /* Use counter. */
+ psize_t nsegs; /* Current range count. */
+
+ TAILQ_ENTRY(uvm_pmemrange) pmr_use;
+ /* pmr, sorted by use */
+ RB_ENTRY(uvm_pmemrange) pmr_addr;
+ /* pmr, sorted by address */
+};
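+
+/*
+ * Representation sketch: a free chunk of N contiguous pages is keyed by
+ * its first page. Only pg[0] has fpgsz set (to N); the other N-1 pages
+ * have fpgsz 0. pg[0] is the node in the addr tree; for the size index,
+ * chunks with N == 1 sit on single[memtype], while chunks with N > 1 are
+ * entered into size[memtype] through pg[1] (hence the "size tree gives
+ * pg[1] instead of pg[0]" adjustments in uvm_pmemrange.c).
+ */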
+
+RB_HEAD(uvm_pmemrange_addr, uvm_pmemrange);
+TAILQ_HEAD(uvm_pmemrange_use, uvm_pmemrange);
+
+/*
+ * pmr control structure. Contained in uvm.pmr_control.
+ */
+struct uvm_pmr_control {
+ struct uvm_pmemrange_addr addr;
+ struct uvm_pmemrange_use use;
+};
+
+void uvm_pmr_freepages(struct vm_page *, psize_t);
+void uvm_pmr_freepageq(struct pglist *pgl);
+int uvm_pmr_getpages(psize_t, paddr_t, paddr_t, paddr_t, paddr_t,
+ int, int, struct pglist *);
+void uvm_pmr_init(void);
+
+#ifdef DDB
+int uvm_pmr_isfree(struct vm_page *pg);
+#endif
+
+#endif /* _UVM_UVM_PMEMRANGE_H_ */