author | 2010-04-22 19:02:44 +0000
---|---
committer | 2010-04-22 19:02:44 +0000
commit | a3544580456680ac17bea7051ae709d5e34e7208 (patch)
tree | 21a35190754e62a9ed9f6031aab7d11fd1256cc3
parent | zap trailing whitespace; (diff)
download | wireguard-openbsd-a3544580456680ac17bea7051ae709d5e34e7208.tar.xz wireguard-openbsd-a3544580456680ac17bea7051ae709d5e34e7208.zip
Committing on behalf of ariane@.
recommit pmemrange:
physmem allocator: change the view of free memory from single
free pages to free ranges. Classify memory based on region with
associated use-counter (which is used to construct a priority
list of where to allocate memory).
Based on code from tedu@, help from many.
Usable now that bugs have been found and fixed in most architectures'
pmap.c.
ok by everyone who has done a pmap or uvm commit in the last year.
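From a caller's point of view, the change boils down to a simpler allocation interface. The sketch below is a hedged illustration, not code from this commit: uvm_pagealloc() replaces the old uvm_pagealloc_strat()/free-list pair, and physically constrained multi-page allocations go through uvm_pglistalloc() with the new UVM_PLA_* flags, which now land in uvm_pmr_getpages(). The example_* function names, the buffer size, and the segment limit are invented for the example.

```c
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/queue.h>
#include <uvm/uvm_extern.h>

/*
 * Single page: the strategy/free-list arguments are gone; only the
 * UVM_PGA_* flags remain.
 */
struct vm_page *
example_single_page(struct uvm_object *uobj, voff_t off)
{
	return (uvm_pagealloc(uobj, off, NULL, UVM_PGA_ZERO));
}

/*
 * Multi-page, physically constrained allocation: 64 KB below 16 MB
 * (inclusive upper bound), page aligned, no boundary, at most 16
 * segments.  UVM_PLA_TRYCONTIG asks the range allocator to prefer
 * contiguous memory; UVM_PLA_NOWAIT fails instead of sleeping.
 */
int
example_dma_pages(struct pglist *pgl)
{
	int error;

	TAILQ_INIT(pgl);
	error = uvm_pglistalloc(64 * 1024, 0, 0x00ffffffUL, PAGE_SIZE, 0,
	    pgl, 16, UVM_PLA_NOWAIT | UVM_PLA_TRYCONTIG);
	if (error)
		return (error);

	/* ... use the pages ..., then give them back to the allocator. */
	uvm_pglistfree(pgl);
	return (0);
}
```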
-rw-r--r-- | sys/arch/i386/i386/pmapae.c | 15
-rw-r--r-- | sys/arch/sparc/include/vmparam.h | 5
-rw-r--r-- | sys/arch/sparc64/include/vmparam.h | 5
-rw-r--r-- | sys/conf/files | 3
-rw-r--r-- | sys/uvm/uvm.h | 33
-rw-r--r-- | sys/uvm/uvm_extern.h | 17
-rw-r--r-- | sys/uvm/uvm_map.c | 10
-rw-r--r-- | sys/uvm/uvm_page.c | 197
-rw-r--r-- | sys/uvm/uvm_page.h | 3
-rw-r--r-- | sys/uvm/uvm_pglist.c | 347
-rw-r--r-- | sys/uvm/uvm_pmemrange.c | 1813
-rw-r--r-- | sys/uvm/uvm_pmemrange.h | 83
12 files changed, 2038 insertions, 493 deletions
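The header hunks below introduce a machine-dependent UVM_IO_RANGES hook (uvm.h, sparc and sparc64 vmparam.h) and a uvm_pmr_use_inc() prototype (uvm_extern.h) for raising the use counter of a range. As a hedged sketch only, assuming uvm_pmr_use_inc() takes the same { low, high } byte-address pair documented for UVM_IO_RANGES, a port or driver could express the default ranges like this; example_penalize_isa() is an invented name.

```c
/*
 * In <machine/vmparam.h>: paddr_t pairs with an inclusive upper bound
 * (the same convention as uvm_pglistalloc).  Each range has its use
 * counter increased at uvm initialization, so allocations are steered
 * away from it whenever other memory can satisfy the request.
 */
#define UVM_IO_RANGES \
	{ \
		{ 0, 0x00ffffffUL },	/* ISA memory */ \
		{ 0, 0xffffffffUL },	/* 32-bit PCI memory */ \
	}

/*
 * The same effect after uvm_init(): bump the use count of one range at
 * run time via uvm_pmr_use_inc() (arguments assumed to be byte addresses).
 */
void
example_penalize_isa(void)
{
	uvm_pmr_use_inc(0, 0x00ffffffUL);
}
```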
diff --git a/sys/arch/i386/i386/pmapae.c b/sys/arch/i386/i386/pmapae.c index 8b17da23f22..317295656a2 100644 --- a/sys/arch/i386/i386/pmapae.c +++ b/sys/arch/i386/i386/pmapae.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pmapae.c,v 1.20 2009/08/06 15:28:14 oga Exp $ */ +/* $OpenBSD: pmapae.c,v 1.21 2010/04/22 19:02:44 oga Exp $ */ /* * Copyright (c) 2006 Michael Shalayeff @@ -364,7 +364,7 @@ * is a void function. * * [B] new page tables pages (PTP) - * call pae_pagealloc() + * call uvm_pagealloc() * => success: zero page, add to pm_pdir * => failure: we are out of free vm_pages, let pmap_enter() * tell UVM about it. @@ -553,13 +553,6 @@ extern int pmap_pg_g; extern struct pmap_head pmaps; /* - * a towards larger memory prioritised version opf uvm_pagealloc() - */ -#define pae_pagealloc(obj, off, anon, flags) \ - uvm_pagealloc_strat((obj), (off), (anon), (flags), \ - UVM_PGA_STRAT_FALLBACK, VM_FREELIST_ABOVE4G) - -/* * local prototypes */ @@ -801,7 +794,7 @@ pmap_bootstrap_pae() for (va = KERNBASE, eva = va + (nkpde << 22); va < eva; va += PAGE_SIZE) { if (!pmap_valid_entry(PDE(kpm, pdei(va)))) { - ptp = pae_pagealloc(&kpm->pm_obj, va, NULL, + ptp = uvm_pagealloc(&kpm->pm_obj, va, NULL, UVM_PGA_ZERO); ptaddr = VM_PAGE_TO_PHYS(ptp); PDE(kpm, pdei(va)) = ptaddr | PG_KW | PG_V; @@ -977,7 +970,7 @@ pmap_alloc_ptp_pae(struct pmap *pmap, int pde_index, boolean_t just_try) { struct vm_page *ptp; - ptp = pae_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL, + ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL, UVM_PGA_USERESERVE|UVM_PGA_ZERO); if (ptp == NULL) return(NULL); diff --git a/sys/arch/sparc/include/vmparam.h b/sys/arch/sparc/include/vmparam.h index 8dc59a3729f..a7ba9e169b3 100644 --- a/sys/arch/sparc/include/vmparam.h +++ b/sys/arch/sparc/include/vmparam.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmparam.h,v 1.33 2008/07/22 18:15:48 miod Exp $ */ +/* $OpenBSD: vmparam.h,v 1.34 2010/04/22 19:02:47 oga Exp $ */ /* $NetBSD: vmparam.h,v 1.13 1997/07/12 16:20:03 perry Exp $ */ /* @@ -130,6 +130,9 @@ struct vm_page_md { #define VM_NFREELIST 1 #define VM_FREELIST_DEFAULT 0 +/* No UVM_IO_RANGES required: IOMMU takes care of this. */ +#define UVM_IO_RANGES {} + #if defined (_KERNEL) && !defined(_LOCORE) struct vm_map; #define dvma_mapin(map,va,len,canwait) dvma_mapin_space(map,va,len,canwait,0) diff --git a/sys/arch/sparc64/include/vmparam.h b/sys/arch/sparc64/include/vmparam.h index f2239697466..b3eca0ba6a4 100644 --- a/sys/arch/sparc64/include/vmparam.h +++ b/sys/arch/sparc64/include/vmparam.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmparam.h,v 1.18 2008/07/18 16:40:17 kurt Exp $ */ +/* $OpenBSD: vmparam.h,v 1.19 2010/04/22 19:02:49 oga Exp $ */ /* $NetBSD: vmparam.h,v 1.18 2001/05/01 02:19:19 thorpej Exp $ */ /* @@ -145,6 +145,9 @@ #define VM_NFREELIST 1 #define VM_FREELIST_DEFAULT 0 +/* No UVM_IO_RANGES required: IOMMU takes care of this. 
*/ +#define UVM_IO_RANGES {} + #define __HAVE_VM_PAGE_MD /* * For each struct vm_page, there is a list of all currently valid virtual diff --git a/sys/conf/files b/sys/conf/files index 32e9a761cb2..9c06c5eabb2 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1,4 +1,4 @@ -# $OpenBSD: files,v 1.486 2010/04/20 22:53:24 miod Exp $ +# $OpenBSD: files,v 1.487 2010/04/22 19:02:52 oga Exp $ # $NetBSD: files,v 1.87 1996/05/19 17:17:50 jonathan Exp $ # @(#)files.newconf 7.5 (Berkeley) 5/10/93 @@ -1009,6 +1009,7 @@ file uvm/uvm_page.c file uvm/uvm_pager.c file uvm/uvm_pdaemon.c file uvm/uvm_pglist.c +file uvm/uvm_pmemrange.c file uvm/uvm_stat.c file uvm/uvm_swap.c file uvm/uvm_swap_encrypt.c uvm_swap_encrypt diff --git a/sys/uvm/uvm.h b/sys/uvm/uvm.h index 0e4966bec1a..a8d42714cbb 100644 --- a/sys/uvm/uvm.h +++ b/sys/uvm/uvm.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm.h,v 1.37 2009/06/16 23:54:57 oga Exp $ */ +/* $OpenBSD: uvm.h,v 1.38 2010/04/22 19:02:55 oga Exp $ */ /* $NetBSD: uvm.h,v 1.24 2000/11/27 08:40:02 chs Exp $ */ /* @@ -58,6 +58,7 @@ #include <uvm/uvm_pager.h> #include <uvm/uvm_pdaemon.h> #include <uvm/uvm_swap.h> +#include <uvm/uvm_pmemrange.h> #ifdef UVM_SWAP_ENCRYPT #include <uvm/uvm_swap_encrypt.h> #endif @@ -68,6 +69,34 @@ #include <machine/vmparam.h> /* + * UVM_IO_RANGES: paddr_t pairs, describing the lowest and highest address + * that should be reserved. These ranges (which may overlap) will have their + * use counter increased, causing them to be avoided if an allocation can be + * satisfied from another range of memory. + * + * UVM_IO_RANGES actually results into a call to uvm_pmr_use_inc() per range + * at uvm initialization. uvm_pmr_use_inc() can also be called after uvm_init() + * has completed. + * + * Note: the upper bound is specified in the same way as to uvm_pglistalloc. + * Ex: a memory range of 16 bit is specified as: { 0, 0xffff }. + * Default: no special ranges in use. + */ +#ifndef UVM_IO_RANGES +#define UVM_IO_RANGES \ + { \ + { 0, 0x00ffffffUL }, /* ISA memory */ \ + { 0, 0xffffffffUL }, /* 32-bit PCI memory */ \ + } +#endif + +/* UVM IO ranges are described in an array of struct uvm_io_ranges. */ +struct uvm_io_ranges { + paddr_t low; + paddr_t high; +}; + +/* * uvm structure (vm global state: collected in one structure for ease * of reference...) 
*/ @@ -76,7 +105,6 @@ struct uvm { /* vm_page related parameters */ /* vm_page queues */ - struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */ struct pglist page_active; /* allocated pages, in use */ struct pglist page_inactive_swp;/* pages inactive (reclaim or free) */ struct pglist page_inactive_obj;/* pages inactive (reclaim or free) */ @@ -86,6 +114,7 @@ struct uvm { boolean_t page_init_done; /* TRUE if uvm_page_init() finished */ boolean_t page_idle_zero; /* TRUE if we should try to zero pages in the idle loop */ + struct uvm_pmr_control pmr_control; /* pmemrange data */ /* page daemon trigger */ int pagedaemon; /* daemon sleeps on this */ diff --git a/sys/uvm/uvm_extern.h b/sys/uvm/uvm_extern.h index f3a156d923a..5de889ddb48 100644 --- a/sys/uvm/uvm_extern.h +++ b/sys/uvm/uvm_extern.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_extern.h,v 1.84 2010/03/24 00:36:04 oga Exp $ */ +/* $OpenBSD: uvm_extern.h,v 1.85 2010/04/22 19:02:55 oga Exp $ */ /* $NetBSD: uvm_extern.h,v 1.57 2001/03/09 01:02:12 chs Exp $ */ /* @@ -208,14 +208,14 @@ typedef int vm_prot_t; #define UVM_KMF_TRYLOCK UVM_FLAG_TRYLOCK /* try locking only */ /* - * the following defines the strategies for uvm_pagealloc_strat() + * the following defines the strategies for uvm_pagealloc() */ #define UVM_PGA_STRAT_NORMAL 0 /* high -> low free list walk */ #define UVM_PGA_STRAT_ONLY 1 /* only specified free list */ #define UVM_PGA_STRAT_FALLBACK 2 /* ONLY falls back on NORMAL */ /* - * flags for uvm_pagealloc_strat() + * flags for uvm_pagealloc() */ #define UVM_PGA_USERESERVE 0x0001 /* ok to use reserve pages */ #define UVM_PGA_ZERO 0x0002 /* returned page must be zeroed */ @@ -226,6 +226,7 @@ typedef int vm_prot_t; #define UVM_PLA_WAITOK 0x0001 /* may sleep */ #define UVM_PLA_NOWAIT 0x0002 /* can't sleep (need one of the two) */ #define UVM_PLA_ZERO 0x0004 /* zero all pages before returning */ +#define UVM_PLA_TRYCONTIG 0x0008 /* try to allocate contig physmem */ /* * lockflags that control the locking behavior of various functions. @@ -564,11 +565,8 @@ int uvm_mmap(vm_map_t, vaddr_t *, vsize_t, caddr_t, voff_t, vsize_t, struct proc *); /* uvm_page.c */ -struct vm_page *uvm_pagealloc_strat(struct uvm_object *, - voff_t, struct vm_anon *, int, int, int); -#define uvm_pagealloc(obj, off, anon, flags) \ - uvm_pagealloc_strat((obj), (off), (anon), (flags), \ - UVM_PGA_STRAT_NORMAL, 0) +struct vm_page *uvm_pagealloc(struct uvm_object *, + voff_t, struct vm_anon *, int); vaddr_t uvm_pagealloc_contig(vaddr_t, vaddr_t, vaddr_t, vaddr_t); void uvm_pagerealloc(struct vm_page *, @@ -596,6 +594,9 @@ int uvm_pglistalloc(psize_t, paddr_t, struct pglist *, int, int); void uvm_pglistfree(struct pglist *); +/* uvm_pmemrange.c */ +void uvm_pmr_use_inc(paddr_t, paddr_t); + /* uvm_swap.c */ void uvm_swap_init(void); diff --git a/sys/uvm/uvm_map.c b/sys/uvm/uvm_map.c index c8abc87aa70..29028e79629 100644 --- a/sys/uvm/uvm_map.c +++ b/sys/uvm/uvm_map.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_map.c,v 1.123 2009/08/28 00:40:03 ariane Exp $ */ +/* $OpenBSD: uvm_map.c,v 1.124 2010/04/22 19:02:55 oga Exp $ */ /* $NetBSD: uvm_map.c,v 1.86 2000/11/27 08:40:03 chs Exp $ */ /* @@ -3999,9 +3999,11 @@ uvm_page_printit(pg, full, pr) /* cross-verify page queue */ if (pg->pg_flags & PQ_FREE) { - int fl = uvm_page_lookup_freelist(pg); - pgl = &uvm.page_free[fl].pgfl_queues[((pg)->pg_flags & PG_ZERO) ? 
- PGFL_ZEROS : PGFL_UNKNOWN]; + if (uvm_pmr_isfree(pg)) + printf(" page found in uvm_pmemrange\n"); + else + printf(" >>> page not found in uvm_pmemrange <<<\n"); + pgl = NULL; } else if (pg->pg_flags & PQ_INACTIVE) { pgl = (pg->pg_flags & PQ_SWAPBACKED) ? &uvm.page_inactive_swp : &uvm.page_inactive_obj; diff --git a/sys/uvm/uvm_page.c b/sys/uvm/uvm_page.c index 467e072ff43..d40e6a41abb 100644 --- a/sys/uvm/uvm_page.c +++ b/sys/uvm/uvm_page.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_page.c,v 1.99 2010/04/20 22:05:44 tedu Exp $ */ +/* $OpenBSD: uvm_page.c,v 1.100 2010/04/22 19:02:55 oga Exp $ */ /* $NetBSD: uvm_page.c,v 1.44 2000/11/27 08:40:04 chs Exp $ */ /* @@ -73,7 +73,6 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/malloc.h> #include <sys/sched.h> #include <sys/kernel.h> #include <sys/vnode.h> @@ -212,15 +211,12 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp) * init the page queues and page queue locks */ - for (lcv = 0; lcv < VM_NFREELIST; lcv++) { - for (i = 0; i < PGFL_NQUEUES; i++) - TAILQ_INIT(&uvm.page_free[lcv].pgfl_queues[i]); - } TAILQ_INIT(&uvm.page_active); TAILQ_INIT(&uvm.page_inactive_swp); TAILQ_INIT(&uvm.page_inactive_obj); simple_lock_init(&uvm.pageqlock); mtx_init(&uvm.fpageqlock, IPL_VM); + uvm_pmr_init(); /* * allocate vm_page structures. @@ -271,9 +267,9 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp) for (lcv = 0 ; lcv < vm_nphysseg ; lcv++) { n = vm_physmem[lcv].end - vm_physmem[lcv].start; if (n > pagecount) { - printf("uvm_page_init: lost %ld page(s) in init\n", + panic("uvm_page_init: lost %ld page(s) in init\n", (long)(n - pagecount)); - panic("uvm_page_init"); /* XXXCDC: shouldn't happen? */ + /* XXXCDC: shouldn't happen? */ /* n = pagecount; */ } @@ -293,10 +289,15 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp) if (atop(paddr) >= vm_physmem[lcv].avail_start && atop(paddr) <= vm_physmem[lcv].avail_end) { uvmexp.npages++; - /* add page to free pool */ - uvm_pagefree(&vm_physmem[lcv].pgs[i]); } } + + /* + * Add pages to free pool. + */ + uvm_pmr_freepages(&vm_physmem[lcv].pgs[ + vm_physmem[lcv].avail_start - vm_physmem[lcv].start], + vm_physmem[lcv].avail_end - vm_physmem[lcv].avail_start); } /* @@ -651,13 +652,19 @@ uvm_page_physload_flags(paddr_t start, paddr_t end, paddr_t avail_start, } else { #if defined(VM_PHYSSEG_NOADD) panic("uvm_page_physload: tried to add RAM after vm_mem_init"); -#else - uvm_pagefree(&pgs[lcv]); #endif } } } - /* XXXCDC: incomplete: need to update uvmexp.free, what else? */ + + /* + * Add pages to free pool. + */ + if ((flags & PHYSLOAD_DEVICE) == 0) { + uvm_pmr_freepages(&pgs[avail_start - start], + avail_end - avail_start); + } + /* XXXCDC: need hook to tell pmap to rebuild pv_list, etc... */ } else { @@ -778,31 +785,21 @@ uvm_shutdown(void) * => if anon != NULL, anon must be locked (to put in anon) * => only one of obj or anon can be non-null * => caller must activate/deactivate page if it is not wired. - * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL. - * => policy decision: it is more important to pull a page off of the - * appropriate priority free list than it is to get a zero'd or - * unknown contents page. This is because we live with the - * consequences of a bad free list decision for the entire - * lifetime of the page, e.g. if the page comes from memory that - * is slower to access. 
*/ struct vm_page * -uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon, - int flags, int strat, int free_list) +uvm_pagealloc(struct uvm_object *obj, voff_t off, struct vm_anon *anon, + int flags) { - int lcv, try1, try2, zeroit = 0; struct vm_page *pg; - struct pglist *freeq; - struct pgfreelist *pgfl; + struct pglist pgl; + int pmr_flags; boolean_t use_reserve; - UVMHIST_FUNC("uvm_pagealloc_strat"); UVMHIST_CALLED(pghist); + UVMHIST_FUNC("uvm_pagealloc"); UVMHIST_CALLED(pghist); KASSERT(obj == NULL || anon == NULL); KASSERT(off == trunc_page(off)); - uvm_lock_fpageq(); - /* * check to see if we need to generate some free pages waking * the pagedaemon. @@ -829,124 +826,39 @@ uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon, (curproc == syncerproc)))) goto fail; -#if PGFL_NQUEUES != 2 -#error uvm_pagealloc_strat needs to be updated -#endif - - /* - * If we want a zero'd page, try the ZEROS queue first, otherwise - * we try the UNKNOWN queue first. - */ - if (flags & UVM_PGA_ZERO) { - try1 = PGFL_ZEROS; - try2 = PGFL_UNKNOWN; - } else { - try1 = PGFL_UNKNOWN; - try2 = PGFL_ZEROS; - } - - UVMHIST_LOG(pghist, "obj=%p off=%lx anon=%p flags=%lx", - obj, (u_long)off, anon, flags); - UVMHIST_LOG(pghist, "strat=%ld free_list=%ld", strat, free_list, 0, 0); - again: - switch (strat) { - case UVM_PGA_STRAT_NORMAL: - /* Check all freelists in descending priority order. */ - for (lcv = 0; lcv < VM_NFREELIST; lcv++) { - pgfl = &uvm.page_free[lcv]; - if ((pg = TAILQ_FIRST((freeq = - &pgfl->pgfl_queues[try1]))) != NULL || - (pg = TAILQ_FIRST((freeq = - &pgfl->pgfl_queues[try2]))) != NULL) - goto gotit; - } - - /* No pages free! */ - goto fail; - - case UVM_PGA_STRAT_ONLY: - case UVM_PGA_STRAT_FALLBACK: - /* Attempt to allocate from the specified free list. */ - KASSERT(free_list >= 0 && free_list < VM_NFREELIST); - pgfl = &uvm.page_free[free_list]; - if ((pg = TAILQ_FIRST((freeq = - &pgfl->pgfl_queues[try1]))) != NULL || - (pg = TAILQ_FIRST((freeq = - &pgfl->pgfl_queues[try2]))) != NULL) - goto gotit; - - /* Fall back, if possible. */ - if (strat == UVM_PGA_STRAT_FALLBACK) { - strat = UVM_PGA_STRAT_NORMAL; - goto again; - } - - /* No pages free! 
*/ + pmr_flags = UVM_PLA_NOWAIT; + if (flags & UVM_PGA_ZERO) + pmr_flags |= UVM_PLA_ZERO; + TAILQ_INIT(&pgl); + if (uvm_pmr_getpages(1, 0, 0, 1, 0, 1, pmr_flags, &pgl) != 0) goto fail; - default: - panic("uvm_pagealloc_strat: bad strat %d", strat); - /* NOTREACHED */ - } - - gotit: - TAILQ_REMOVE(freeq, pg, pageq); - uvmexp.free--; - - /* update zero'd page count */ - if (pg->pg_flags & PG_ZERO) - uvmexp.zeropages--; - - /* - * update allocation statistics and remember if we have to - * zero the page - */ - if (flags & UVM_PGA_ZERO) { - if (pg->pg_flags & PG_ZERO) { - uvmexp.pga_zerohit++; - zeroit = 0; - } else { - uvmexp.pga_zeromiss++; - zeroit = 1; - } - } - - uvm_unlock_fpageq(); /* unlock free page queue */ + pg = TAILQ_FIRST(&pgl); + KASSERT(pg != NULL && TAILQ_NEXT(pg, pageq) == NULL); pg->offset = off; pg->uobject = obj; pg->uanon = anon; KASSERT((pg->pg_flags & PG_DEV) == 0); - pg->pg_flags = PG_BUSY|PG_CLEAN|PG_FAKE; - pg->pg_version++; + atomic_setbits_int(&pg->pg_flags, PG_BUSY|PG_CLEAN|PG_FAKE); + if (flags & UVM_PGA_ZERO) + atomic_clearbits_int(&pg->pg_flags, PG_CLEAN); if (anon) { anon->an_page = pg; atomic_setbits_int(&pg->pg_flags, PQ_ANON); - } else { - if (obj) - uvm_pageinsert(pg); - } + } else if (obj) + uvm_pageinsert(pg); + #if defined(UVM_PAGE_TRKOWN) pg->owner_tag = NULL; #endif UVM_PAGE_OWN(pg, "new alloc"); - if (flags & UVM_PGA_ZERO) { - /* - * A zero'd page is not clean. If we got a page not already - * zero'd, then we have to zero it ourselves. - */ - atomic_clearbits_int(&pg->pg_flags, PG_CLEAN); - if (zeroit) - pmap_zero_page(pg); - } - UVMHIST_LOG(pghist, "allocated pg %p/%lx", pg, (u_long)VM_PAGE_TO_PHYS(pg), 0, 0); return(pg); fail: - uvm_unlock_fpageq(); UVMHIST_LOG(pghist, "failed!", 0, 0, 0, 0); return (NULL); } @@ -1030,7 +942,7 @@ uvm_pagefree(struct vm_page *pg) if (saved_loan_count) atomic_clearbits_int(&pg->pg_flags, PG_CLEAN); uvm_pageremove(pg); - + /* * if our page was on loan, then we just lost control over it * (in fact, if it was loaned to an anon, the anon may have @@ -1085,38 +997,31 @@ uvm_pagefree(struct vm_page *pg) } if (pg->uanon) { pg->uanon->an_page = NULL; -#ifdef UBC - uvm_pgcnt_anon--; -#endif + pg->uanon = NULL; + atomic_clearbits_int(&pg->pg_flags, PQ_ANON); } /* - * and put on free queue + * Clean page state bits. 
*/ + atomic_clearbits_int(&pg->pg_flags, PQ_AOBJ); /* XXX: find culprit */ + atomic_clearbits_int(&pg->pg_flags, PQ_ENCRYPT| + PG_ZERO|PG_FAKE|PG_BUSY|PG_RELEASED|PG_CLEAN|PG_CLEANCHK); - atomic_clearbits_int(&pg->pg_flags, PG_ZERO); + /* + * and put on free queue + */ - uvm_lock_fpageq(); -#ifdef PAGEFASTRECYCLE - TAILQ_INSERT_HEAD(&uvm.page_free[ - uvm_page_lookup_freelist(pg)].pgfl_queues[PGFL_UNKNOWN], pg, pageq); -#else - TAILQ_INSERT_TAIL(&uvm.page_free[ - uvm_page_lookup_freelist(pg)].pgfl_queues[PGFL_UNKNOWN], pg, pageq); -#endif - atomic_clearbits_int(&pg->pg_flags, PQ_MASK); - atomic_setbits_int(&pg->pg_flags, PQ_FREE); #ifdef DEBUG pg->uobject = (void *)0xdeadbeef; pg->offset = 0xdeadbeef; pg->uanon = (void *)0xdeadbeef; #endif - uvmexp.free++; + + uvm_pmr_freepages(pg, 1); if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) uvm.page_idle_zero = vm_page_zero_enable; - - uvm_unlock_fpageq(); } /* @@ -1216,6 +1121,7 @@ uvm_page_own(struct vm_page *pg, char *tag) void uvm_pageidlezero(void) { +#if 0 /* disabled: need new code */ struct vm_page *pg; struct pgfreelist *pgfl; int free_list; @@ -1282,6 +1188,7 @@ uvm_pageidlezero(void) uvmexp.zeropages++; uvm_unlock_fpageq(); } while (curcpu_is_idle()); +#endif /* 0 */ } /* diff --git a/sys/uvm/uvm_page.h b/sys/uvm/uvm_page.h index 00324b59037..eda9030fe63 100644 --- a/sys/uvm/uvm_page.h +++ b/sys/uvm/uvm_page.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_page.h,v 1.41 2010/03/24 00:36:04 oga Exp $ */ +/* $OpenBSD: uvm_page.h,v 1.42 2010/04/22 19:02:55 oga Exp $ */ /* $NetBSD: uvm_page.h,v 1.19 2000/12/28 08:24:55 chs Exp $ */ /* @@ -116,6 +116,7 @@ struct vm_page { * to read: [O or P] * to modify: [O _and_ P] */ paddr_t phys_addr; /* physical address of page */ + psize_t fpgsz; /* free page range size */ #ifdef __HAVE_VM_PAGE_MD struct vm_page_md mdpage; /* pmap-specific data */ diff --git a/sys/uvm/uvm_pglist.c b/sys/uvm/uvm_pglist.c index b9826f6ee62..b4b15f326c0 100644 --- a/sys/uvm/uvm_pglist.c +++ b/sys/uvm/uvm_pglist.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_pglist.c,v 1.35 2009/08/13 15:29:59 deraadt Exp $ */ +/* $OpenBSD: uvm_pglist.c,v 1.36 2010/04/22 19:02:55 oga Exp $ */ /* $NetBSD: uvm_pglist.c,v 1.13 2001/02/18 21:19:08 chs Exp $ */ /*- @@ -56,112 +56,6 @@ u_long uvm_pglistalloc_npages; #define STAT_DECR(v) #endif -int uvm_pglistalloc_simple(psize_t, paddr_t, paddr_t, struct pglist *); - -/* - * Simple page allocation: pages do not need to be contiguous. We just - * attempt to find enough free pages in the given range. - */ -int -uvm_pglistalloc_simple(psize_t size, paddr_t low, paddr_t high, - struct pglist *rlist) -{ - psize_t todo; - int psi; - struct vm_page *pg; - struct vm_physseg *seg; - paddr_t slow, shigh; - int pgflidx, error, free_list; - UVMHIST_FUNC("uvm_pglistalloc_simple"); UVMHIST_CALLED(pghist); -#ifdef DEBUG - vm_page_t tp; -#endif - - /* Default to "lose". */ - error = ENOMEM; - - todo = atop(size); - - /* - * Block all memory allocation and lock the free list. - */ - uvm_lock_fpageq(); - - /* Are there even any free pages? */ - if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel)) - goto out; - - for (psi = 0, seg = vm_physmem; psi < vm_nphysseg; psi++, seg++) { - /* - * Skip this segment if incompatible with the address range. - */ - if (seg->avail_end <= atop(low)) - continue; - if (seg->avail_start >= atop(high)) - continue; - - slow = MAX(atop(low), seg->avail_start); - shigh = MIN(atop(high), seg->avail_end); - - /* we want to be able to allocate at least a page... 
*/ - if (slow == shigh) - continue; - - for (pg = &seg->pgs[slow - seg->start]; slow != shigh; - slow++, pg++) { - if (VM_PAGE_IS_FREE(pg) == 0) - continue; - - free_list = uvm_page_lookup_freelist(pg); - pgflidx = (pg->pg_flags & PG_ZERO) ? - PGFL_ZEROS : PGFL_UNKNOWN; -#ifdef DEBUG - for (tp = TAILQ_FIRST(&uvm.page_free[free_list].pgfl_queues[pgflidx]); - tp != NULL; tp = TAILQ_NEXT(tp, pageq)) { - if (tp == pg) - break; - } - if (tp == NULL) - panic("uvm_pglistalloc_simple: page not on freelist"); -#endif - TAILQ_REMOVE(&uvm.page_free[free_list].pgfl_queues[pgflidx], - pg, pageq); - uvmexp.free--; - if (pg->pg_flags & PG_ZERO) - uvmexp.zeropages--; - pg->uobject = NULL; - pg->uanon = NULL; - pg->pg_version++; - TAILQ_INSERT_TAIL(rlist, pg, pageq); - STAT_INCR(uvm_pglistalloc_npages); - if (--todo == 0) { - error = 0; - goto out; - } - } - - } - -out: - /* - * check to see if we need to generate some free pages waking - * the pagedaemon. - */ - - if (!error && (uvmexp.free + uvmexp.paging < uvmexp.freemin || - (uvmexp.free + uvmexp.paging < uvmexp.freetarg && - uvmexp.inactive < uvmexp.inactarg))) { - wakeup(&uvm.pagedaemon); - } - - uvm_unlock_fpageq(); - - if (error) - uvm_pglistfree(rlist); - - return (error); -} - /* * uvm_pglistalloc: allocate a list of pages * @@ -179,202 +73,54 @@ out: * alignment memory must be aligned to this power-of-two boundary. * boundary no segment in the allocation may cross this * power-of-two boundary (relative to zero). + * => flags: + * UVM_PLA_NOWAIT fail if allocation fails + * UVM_PLA_WAITOK wait for memory to become avail + * UVM_PLA_ZERO return zeroed memory + * UVM_PLA_TRYCONTIG caller (device) prefers p-lineair memory */ int uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment, paddr_t boundary, struct pglist *rlist, int nsegs, int flags) { - int psi; - struct vm_page *pgs; - struct vm_physseg *seg; - paddr_t slow, shigh; - paddr_t try, idxpa, lastidxpa; - int tryidx, idx, pgflidx, endidx, error, free_list; - vm_page_t m; - u_long pagemask; -#ifdef DEBUG - vm_page_t tp; -#endif UVMHIST_FUNC("uvm_pglistalloc"); UVMHIST_CALLED(pghist); KASSERT((alignment & (alignment - 1)) == 0); KASSERT((boundary & (boundary - 1)) == 0); - /* - * This argument is always ignored for now, but ensure drivers always - * show intention. - */ KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT)); - - /* - * Our allocations are always page granularity, so our alignment - * must be, too. - */ - if (alignment < PAGE_SIZE) - alignment = PAGE_SIZE; if (size == 0) return (EINVAL); - size = round_page(size); - low = roundup(low, alignment); - - /* - * If we are allowed to allocate as many segments as pages, - * no need to be smart. - */ - if ((nsegs >= size / PAGE_SIZE) && (alignment == PAGE_SIZE) && - (boundary == 0)) { - error = uvm_pglistalloc_simple(size, low, high, rlist); - goto done; - } - - if (boundary != 0 && boundary < size) - return (EINVAL); - - pagemask = ~(boundary - 1); - - /* Default to "lose". */ - error = ENOMEM; - - /* - * Block all memory allocation and lock the free list. - */ - uvm_lock_fpageq(); - - /* Are there even any free pages? */ - if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel)) - goto out; - - for (psi = 0, seg = vm_physmem; psi < vm_nphysseg; psi++, seg++) { - /* - * Skip this segment if incompatible with the address range. 
- */ - if (seg->avail_end <= atop(low)) - continue; - if (seg->avail_start >= atop(high)) - continue; - - slow = MAX(low, ptoa(seg->avail_start)); - shigh = MIN(high, ptoa(seg->avail_end)); - - try = roundup(slow, alignment); - for (;; try += alignment) { - if (try + size > shigh) { - /* - * We've run past the allowable range, or - * the segment. Try another. - */ - break; - } - - tryidx = idx = atop(try) - seg->start; - endidx = idx + atop(size); - pgs = vm_physmem[psi].pgs; - - /* - * Found a suitable starting page. See if the - * range is free. - */ - - for (; idx < endidx; idx++) { - if (VM_PAGE_IS_FREE(&pgs[idx]) == 0) { - break; - } - idxpa = VM_PAGE_TO_PHYS(&pgs[idx]); - if (idx == tryidx) - continue; - - /* - * Check that the region is contiguous - * (it really should...) and does not - * cross an alignment boundary. - */ - lastidxpa = VM_PAGE_TO_PHYS(&pgs[idx - 1]); - if ((lastidxpa + PAGE_SIZE) != idxpa) - break; - - if (boundary != 0 && - ((lastidxpa ^ idxpa) & pagemask) != 0) - break; - } - - if (idx == endidx) { - goto found; - } - } + if ((high & PAGE_MASK) != PAGE_MASK) { + printf("uvm_pglistalloc: Upper boundary 0x%lx " + "not on pagemask.\n", (unsigned long)high); } /* - * We could not allocate a contiguous range. This is where - * we should try harder if nsegs > 1... - */ - goto out; - -#if PGFL_NQUEUES != 2 -#error uvm_pglistalloc needs to be updated -#endif - -found: - /* - * we have a chunk of memory that conforms to the requested constraints. + * Our allocations are always page granularity, so our alignment + * must be, too. */ - idx = tryidx; - while (idx < endidx) { - m = &pgs[idx]; - free_list = uvm_page_lookup_freelist(m); - pgflidx = (m->pg_flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN; -#ifdef DEBUG - for (tp = TAILQ_FIRST(&uvm.page_free[ - free_list].pgfl_queues[pgflidx]); - tp != NULL; - tp = TAILQ_NEXT(tp, pageq)) { - if (tp == m) - break; - } - if (tp == NULL) - panic("uvm_pglistalloc: page not on freelist"); -#endif - TAILQ_REMOVE(&uvm.page_free[free_list].pgfl_queues[pgflidx], - m, pageq); - uvmexp.free--; - if (m->pg_flags & PG_ZERO) - uvmexp.zeropages--; - m->uobject = NULL; - m->uanon = NULL; - m->pg_version++; - TAILQ_INSERT_TAIL(rlist, m, pageq); - idx++; - STAT_INCR(uvm_pglistalloc_npages); - } - error = 0; + if (alignment < PAGE_SIZE) + alignment = PAGE_SIZE; -out: + low = atop(roundup(low, alignment)); /* - * check to see if we need to generate some free pages waking - * the pagedaemon. + * high + 1 may result in overflow, in which case high becomes 0x0, + * which is the 'don't care' value. + * The only requirement in that case is that low is also 0x0, or the + * low<high assert will fail. */ - - if (uvmexp.free + uvmexp.paging < uvmexp.freemin || - (uvmexp.free + uvmexp.paging < uvmexp.freetarg && - uvmexp.inactive < uvmexp.inactarg)) { - wakeup(&uvm.pagedaemon); - } - - uvm_unlock_fpageq(); - -done: - /* No locking needed here, pages are not on any queue. 
*/ - if (error == 0) { - TAILQ_FOREACH(m, rlist, pageq) { - if (flags & UVM_PLA_ZERO && - (m->pg_flags & PG_ZERO) == 0) - uvm_pagezero(m); - m->pg_flags = PG_CLEAN; - } - } - - return (error); + high = atop(high + 1); + size = atop(round_page(size)); + alignment = atop(alignment); + if (boundary < PAGE_SIZE && boundary != 0) + boundary = PAGE_SIZE; + boundary = atop(boundary); + + return uvm_pmr_getpages(size, low, high, alignment, boundary, nsegs, + flags, rlist); } /* @@ -386,43 +132,6 @@ done: void uvm_pglistfree(struct pglist *list) { - struct vm_page *m; UVMHIST_FUNC("uvm_pglistfree"); UVMHIST_CALLED(pghist); - - /* - * Block all memory allocation and lock the free list. - */ - uvm_lock_fpageq(); - - while ((m = TAILQ_FIRST(list)) != NULL) { - KASSERT((m->pg_flags & (PQ_ACTIVE|PQ_INACTIVE)) == 0); - TAILQ_REMOVE(list, m, pageq); -#ifdef DEBUG - if (m->uobject == (void *)0xdeadbeef && - m->uanon == (void *)0xdeadbeef) { - panic("uvm_pglistfree: freeing free page %p", m); - } - - m->uobject = (void *)0xdeadbeef; - m->offset = 0xdeadbeef; - m->uanon = (void *)0xdeadbeef; -#endif - atomic_clearbits_int(&m->pg_flags, PQ_MASK); - atomic_setbits_int(&m->pg_flags, PQ_FREE); -#ifdef PAGEFASTRECYCLE - TAILQ_INSERT_HEAD(&uvm.page_free[ - uvm_page_lookup_freelist(m)].pgfl_queues[PGFL_UNKNOWN], - m, pageq); -#else - TAILQ_INSERT_TAIL(&uvm.page_free[ - uvm_page_lookup_freelist(m)].pgfl_queues[PGFL_UNKNOWN], - m, pageq); -#endif - uvmexp.free++; - if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) - uvm.page_idle_zero = vm_page_zero_enable; - STAT_DECR(uvm_pglistalloc_npages); - } - - uvm_unlock_fpageq(); + uvm_pmr_freepageq(list); } diff --git a/sys/uvm/uvm_pmemrange.c b/sys/uvm/uvm_pmemrange.c new file mode 100644 index 00000000000..19a6a4f94f1 --- /dev/null +++ b/sys/uvm/uvm_pmemrange.c @@ -0,0 +1,1813 @@ +/* $OpenBSD: uvm_pmemrange.c,v 1.10 2010/04/22 19:02:55 oga Exp $ */ + +/* + * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <sys/param.h> +#include <uvm/uvm.h> +#include <sys/malloc.h> +#include <sys/proc.h> /* XXX for atomic */ + +/* + * 2 trees: addr tree and size tree. + * + * The allocator keeps chunks of free pages (called a range). + * Two pages are part of the same range if: + * - all pages in between are part of that range, + * - they are of the same memory type (zeroed or non-zeroed), + * - they are part of the same pmemrange. + * A pmemrange is a range of memory which is part of the same vm_physseg + * and has a use-count. + * + * addr tree is vm_page[0].objt + * size tree is vm_page[1].objt + * + * The size tree is not used for memory ranges of 1 page, instead, + * single queue is vm_page[0].pageq + * + * vm_page[0].fpgsz describes the length of a free range. 
Two adjecent ranges + * are joined, unless: + * - they have pages in between them which are not free + * - they belong to different memtypes (zeroed vs dirty memory) + * - they are in different pmemrange areas (ISA vs non-ISA memory for instance) + * - they are not a continuation of the same array + * The latter issue is caused by vm_physseg ordering and splitting from the + * MD initialization machinery. The MD code is dependant on freelists and + * happens to split ISA memory from non-ISA memory. + * (Note: freelists die die die!) + * + * uvm_page_init guarantees that every vm_physseg contains an array of + * struct vm_page. Also, uvm_page_physload allocates an array of struct + * vm_page. This code depends on that array. The array may break across + * vm_physsegs boundaries. + */ + +/* + * Validate the flags of the page. (Used in asserts.) + * Any free page must have the PQ_FREE flag set. + * Free pages may be zeroed. + * Pmap flags are left untouched. + * + * The PQ_FREE flag is not checked here: by not checking, we can easily use + * this check in pages which are freed. + */ +#define VALID_FLAGS(pg_flags) \ + (((pg_flags) & ~(PQ_FREE|PG_ZERO| \ + PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3)) == 0x0) + +/* Tree comparators. */ +int uvm_pmemrange_addr_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *); +int uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *); +int uvm_pmr_addr_cmp(struct vm_page *, struct vm_page *); +int uvm_pmr_size_cmp(struct vm_page *, struct vm_page *); +int uvm_pmr_pg_to_memtype(struct vm_page *); + +#ifdef DDB +void uvm_pmr_print(void); +#endif + +/* + * Memory types. The page flags are used to derive what the current memory + * type of a page is. + */ +int +uvm_pmr_pg_to_memtype(struct vm_page *pg) +{ + if (pg->pg_flags & PG_ZERO) + return UVM_PMR_MEMTYPE_ZERO; + /* Default: dirty memory. */ + return UVM_PMR_MEMTYPE_DIRTY; +} + +/* Trees. */ +RB_PROTOTYPE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp); +RB_PROTOTYPE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp); +RB_PROTOTYPE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr, + uvm_pmemrange_addr_cmp); +RB_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp); +RB_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp); +RB_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr, + uvm_pmemrange_addr_cmp); + +/* Validation. 
*/ +#ifdef DEBUG +void uvm_pmr_assertvalid(struct uvm_pmemrange *pmr); +#else +#define uvm_pmr_assertvalid(pmr) do {} while (0) +#endif + + +int uvm_pmr_get1page(psize_t, int, struct pglist *, + paddr_t, paddr_t); + +struct uvm_pmemrange *uvm_pmr_allocpmr(void); +struct vm_page *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int); +struct vm_page *uvm_pmr_nextsz(struct uvm_pmemrange *, + struct vm_page *, int); +void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, + struct vm_page *pg, struct vm_page **pg_prev, + struct vm_page **pg_next); +struct vm_page *uvm_pmr_insert_addr(struct uvm_pmemrange *, + struct vm_page *, int); +void uvm_pmr_insert_size(struct uvm_pmemrange *, + struct vm_page *); +struct vm_page *uvm_pmr_insert(struct uvm_pmemrange *, + struct vm_page *, int); +void uvm_pmr_remove_size(struct uvm_pmemrange *, + struct vm_page *); +void uvm_pmr_remove_addr(struct uvm_pmemrange *, + struct vm_page *); +void uvm_pmr_remove(struct uvm_pmemrange *, + struct vm_page *); +struct vm_page *uvm_pmr_findnextsegment(struct uvm_pmemrange *, + struct vm_page *, paddr_t); +psize_t uvm_pmr_remove_1strange(struct pglist *, paddr_t, + struct vm_page **, int); +void uvm_pmr_split(paddr_t); +struct uvm_pmemrange *uvm_pmemrange_find(paddr_t); +struct uvm_pmemrange *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *, + struct uvm_pmemrange *); +struct vm_page *uvm_pmr_extract_range(struct uvm_pmemrange *, + struct vm_page *, paddr_t, paddr_t, + struct pglist *); +psize_t pow2divide(psize_t, psize_t); +struct vm_page *uvm_pmr_rootupdate(struct uvm_pmemrange *, + struct vm_page *, paddr_t, paddr_t, int); + +/* + * Computes num/denom and rounds it up to the next power-of-2. + * + * This is a division function which calculates an approximation of + * num/denom, with result =~ num/denom. It is meant to be fast and doesn't + * have to be accurate. + * + * Providing too large a value makes the allocator slightly faster, at the + * risk of hitting the failure case more often. Providing too small a value + * makes the allocator a bit slower, but less likely to hit a failure case. + */ +psize_t +pow2divide(psize_t num, psize_t denom) +{ + int rshift; + + for (rshift = 0; num > denom; rshift++, denom <<= 1); + return (paddr_t)1 << rshift; +} + +/* + * Predicate: lhs is a subrange or rhs. + * + * If rhs_low == 0: don't care about lower bound. + * If rhs_high == 0: don't care about upper bound. + */ +#define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high) \ + (((rhs_low) == 0 || (lhs_low) >= (rhs_low)) && \ + ((rhs_high) == 0 || (lhs_high) <= (rhs_high))) + +/* + * Predicate: lhs intersects with rhs. + * + * If rhs_low == 0: don't care about lower bound. + * If rhs_high == 0: don't care about upper bound. + * Ranges don't intersect if they don't have any page in common, array + * semantics mean that < instead of <= should be used here. + */ +#define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high) \ + (((rhs_low) == 0 || (rhs_low) < (lhs_high)) && \ + ((rhs_high) == 0 || (lhs_low) < (rhs_high))) + +/* + * Align to power-of-2 alignment. + */ +#define PMR_ALIGN(pgno, align) \ + (((pgno) + ((align) - 1)) & ~((align) - 1)) + + +/* + * Comparator: sort by address ascending. + */ +int +uvm_pmemrange_addr_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs) +{ + return lhs->low < rhs->low ? -1 : lhs->low > rhs->low; +} + +/* + * Comparator: sort by use ascending. + * + * The higher the use value of a range, the more devices need memory in + * this range. 
Therefor allocate from the range with the lowest use first. + */ +int +uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs) +{ + int result; + + result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use; + if (result == 0) + result = uvm_pmemrange_addr_cmp(lhs, rhs); + return result; +} + +int +uvm_pmr_addr_cmp(struct vm_page *lhs, struct vm_page *rhs) +{ + paddr_t lhs_addr, rhs_addr; + + lhs_addr = VM_PAGE_TO_PHYS(lhs); + rhs_addr = VM_PAGE_TO_PHYS(rhs); + + return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr); +} + +int +uvm_pmr_size_cmp(struct vm_page *lhs, struct vm_page *rhs) +{ + psize_t lhs_size, rhs_size; + int cmp; + + /* Using second tree, so we receive pg[1] instead of pg[0]. */ + lhs_size = (lhs - 1)->fpgsz; + rhs_size = (rhs - 1)->fpgsz; + + cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size); + if (cmp == 0) + cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1); + return cmp; +} + +/* + * Find the first range of free pages that is at least sz pages long. + */ +struct vm_page * +uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti) +{ + struct vm_page *node, *best; + + KASSERT(sz >= 1); + + if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti])) + return TAILQ_FIRST(&pmr->single[mti]); + + node = RB_ROOT(&pmr->size[mti]); + best = NULL; + while (node != NULL) { + if ((node - 1)->fpgsz >= sz) { + best = (node - 1); + node = RB_LEFT(node, objt); + } else + node = RB_RIGHT(node, objt); + } + return best; +} + +/* + * Finds the next range. The next range has a size >= pg->fpgsz. + * Returns NULL if no more ranges are available. + */ +struct vm_page * +uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt) +{ + struct vm_page *npg; + + KASSERT(pmr != NULL && pg != NULL); + if (pg->fpgsz == 1) { + if (TAILQ_NEXT(pg, pageq) != NULL) + return TAILQ_NEXT(pg, pageq); + else + npg = RB_MIN(uvm_pmr_size, &pmr->size[mt]); + } else + npg = RB_NEXT(uvm_pmr_size, &pmr->size[mt], pg + 1); + + return npg == NULL ? NULL : npg - 1; +} + +/* + * Finds the previous and next ranges relative to the (uninserted) pg range. + * + * *pg_prev == NULL if no previous range is available, that can join with + * pg. + * *pg_next == NULL if no next range is available, that can join with + * pg. + */ +void +uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg, + struct vm_page **pg_prev, struct vm_page **pg_next) +{ + KASSERT(pg_prev != NULL && pg_next != NULL); + + *pg_next = RB_NFIND(uvm_pmr_addr, &pmr->addr, pg); + if (*pg_next == NULL) + *pg_prev = RB_MAX(uvm_pmr_addr, &pmr->addr); + else + *pg_prev = RB_PREV(uvm_pmr_addr, &pmr->addr, *pg_next); + + KDASSERT(*pg_next == NULL || + VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg)); + KDASSERT(*pg_prev == NULL || + VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg)); + + /* Reset if not contig. */ + if (*pg_prev != NULL && + (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz + != atop(VM_PAGE_TO_PHYS(pg)) || + *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */ + uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg))) + *pg_prev = NULL; + if (*pg_next != NULL && + (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz + != atop(VM_PAGE_TO_PHYS(*pg_next)) || + pg + pg->fpgsz != *pg_next || /* Array broke. */ + uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg))) + *pg_next = NULL; + return; +} + +/* + * Remove a range from the address tree. + * Address tree maintains pmr counters. 
+ */ +void +uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg) +{ + KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); + KDASSERT(pg->pg_flags & PQ_FREE); + RB_REMOVE(uvm_pmr_addr, &pmr->addr, pg); + + pmr->nsegs--; +} +/* + * Remove a range from the size tree. + */ +void +uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg) +{ + int memtype; +#ifdef DEBUG + struct vm_page *i; +#endif + + KDASSERT(pg->fpgsz >= 1); + KDASSERT(pg->pg_flags & PQ_FREE); + memtype = uvm_pmr_pg_to_memtype(pg); + + if (pg->fpgsz == 1) { +#ifdef DEBUG + TAILQ_FOREACH(i, &pmr->single[memtype], pageq) { + if (i == pg) + break; + } + KDASSERT(i == pg); +#endif + TAILQ_REMOVE(&pmr->single[memtype], pg, pageq); + } else { + KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[memtype], + pg + 1) == pg + 1); + RB_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1); + } +} +/* Remove from both trees. */ +void +uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg) +{ + uvm_pmr_assertvalid(pmr); + uvm_pmr_remove_size(pmr, pg); + uvm_pmr_remove_addr(pmr, pg); + uvm_pmr_assertvalid(pmr); +} + +/* + * Insert the range described in pg. + * Returns the range thus created (which may be joined with the previous and + * next ranges). + * If no_join, the caller guarantees that the range cannot possibly join + * with adjecent ranges. + */ +struct vm_page * +uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) +{ + struct vm_page *prev, *next; + +#ifdef DEBUG + struct vm_page *i; + int mt; +#endif + + KDASSERT(pg->pg_flags & PQ_FREE); + KDASSERT(pg->fpgsz >= 1); + +#ifdef DEBUG + for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { + TAILQ_FOREACH(i, &pmr->single[mt], pageq) + KDASSERT(i != pg); + if (pg->fpgsz > 1) { + KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[mt], + pg + 1) == NULL); + } + KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL); + } +#endif + + if (!no_join) { + uvm_pmr_pnaddr(pmr, pg, &prev, &next); + if (next != NULL) { + uvm_pmr_remove_size(pmr, next); + uvm_pmr_remove_addr(pmr, next); + pg->fpgsz += next->fpgsz; + next->fpgsz = 0; + } + if (prev != NULL) { + uvm_pmr_remove_size(pmr, prev); + prev->fpgsz += pg->fpgsz; + pg->fpgsz = 0; + return prev; + } + } + + RB_INSERT(uvm_pmr_addr, &pmr->addr, pg); + + pmr->nsegs++; + + return pg; +} +/* + * Insert the range described in pg. + * Returns the range thus created (which may be joined with the previous and + * next ranges). + * Page must already be in the address tree. + */ +void +uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg) +{ + int memtype; +#ifdef DEBUG + struct vm_page *i; + int mti; +#endif + + KDASSERT(pg->fpgsz >= 1); + KDASSERT(pg->pg_flags & PQ_FREE); + + memtype = uvm_pmr_pg_to_memtype(pg); +#ifdef DEBUG + for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { + TAILQ_FOREACH(i, &pmr->single[mti], pageq) + KDASSERT(i != pg); + if (pg->fpgsz > 1) { + KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[mti], + pg + 1) == NULL); + } + KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); + } + for (i = pg; i < pg + pg->fpgsz; i++) + KASSERT(uvm_pmr_pg_to_memtype(i) == memtype); +#endif + + if (pg->fpgsz == 1) + TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq); + else + RB_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1); +} +/* Insert in both trees. 
*/ +struct vm_page * +uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) +{ + uvm_pmr_assertvalid(pmr); + pg = uvm_pmr_insert_addr(pmr, pg, no_join); + uvm_pmr_insert_size(pmr, pg); + uvm_pmr_assertvalid(pmr); + return pg; +} + +/* + * Find the last page that is part of this segment. + * => pg: the range at which to start the search. + * => boundary: the page number boundary specification (0 = no boundary). + * => pmr: the pmemrange of the page. + * + * This function returns 1 before the next range, so if you want to have the + * next range, you need to run TAILQ_NEXT(result, pageq) after calling. + * The reason is that this way, the length of the segment is easily + * calculated using: atop(result) - atop(pg) + 1. + * Hence this function also never returns NULL. + */ +struct vm_page * +uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr, + struct vm_page *pg, paddr_t boundary) +{ + paddr_t first_boundary; + struct vm_page *next; + struct vm_page *prev; + + KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) && + pmr->high > atop(VM_PAGE_TO_PHYS(pg))); + if (boundary != 0) { + first_boundary = + PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary); + } else + first_boundary = 0; + + /* + * Increase next until it hits the first page of the next segment. + * + * While loop checks the following: + * - next != NULL we have not reached the end of pgl + * - boundary == 0 || next < first_boundary + * we do not cross a boundary + * - atop(prev) + 1 == atop(next) + * still in the same segment + * - low <= last + * - high > last still in the same memory range + * - memtype is equal allocator is unable to view different memtypes + * as part of the same segment + * - prev + 1 == next no array breakage occurs + */ + prev = pg; + next = TAILQ_NEXT(prev, pageq); + while (next != NULL && + (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) && + atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) && + pmr->low <= atop(VM_PAGE_TO_PHYS(next)) && + pmr->high > atop(VM_PAGE_TO_PHYS(next)) && + uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) && + prev + 1 == next) { + prev = next; + next = TAILQ_NEXT(prev, pageq); + } + + /* + * End of this segment. + */ + return prev; +} + +/* + * Remove the first segment of contiguous pages from pgl. + * A segment ends if it crosses boundary (unless boundary = 0) or + * if it would enter a different uvm_pmemrange. + * + * Work: the page range that the caller is currently working with. + * May be null. + * + * If is_desperate is non-zero, the smallest segment is erased. Otherwise, + * the first segment is erased (which, if called by uvm_pmr_getpages(), + * probably is the smallest or very close to it). + */ +psize_t +uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary, + struct vm_page **work, int is_desperate) +{ + struct vm_page *start, *end, *iter, *iter_end, *inserted; + psize_t count; + struct uvm_pmemrange *pmr, *pmr_iter; + + KASSERT(!TAILQ_EMPTY(pgl)); + + /* + * Initialize to first page. + * Unless desperate scan finds a better candidate, this is what'll be + * erased. + */ + start = TAILQ_FIRST(pgl); + pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start))); + end = uvm_pmr_findnextsegment(pmr, start, boundary); + + /* + * If we are desperate, we _really_ want to get rid of the smallest + * element (rather than a close match to the smallest element). + */ + if (is_desperate) { + /* Lineair search for smallest segment. 
*/ + pmr_iter = pmr; + for (iter = TAILQ_NEXT(end, pageq); + iter != NULL && start != end; + iter = TAILQ_NEXT(iter_end, pageq)) { + /* + * Only update pmr if it doesn't match current + * iteration. + */ + if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) || + pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) { + pmr_iter = uvm_pmemrange_find(atop( + VM_PAGE_TO_PHYS(iter))); + } + + iter_end = uvm_pmr_findnextsegment(pmr_iter, iter, + boundary); + + /* + * Current iteration is smaller than best match so + * far; update. + */ + if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) < + VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) { + start = iter; + end = iter_end; + pmr = pmr_iter; + } + } + } + + /* + * Calculate count and end of the list. + */ + count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1; + end = TAILQ_NEXT(end, pageq); + + /* + * Actually remove the range of pages. + * + * Sadly, this cannot be done using pointer iteration: + * vm_physseg is not guaranteed to be sorted on address, hence + * uvm_page_init() may not have initialized its array sorted by + * page number. + */ + for (iter = start; iter != end; iter = iter_end) { + iter_end = TAILQ_NEXT(iter, pageq); + TAILQ_REMOVE(pgl, iter, pageq); + } + + start->fpgsz = count; + inserted = uvm_pmr_insert(pmr, start, 0); + + /* + * If the caller was working on a range and this function modified + * that range, update the pointer. + */ + if (work != NULL && *work != NULL && + atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) && + atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz > + atop(VM_PAGE_TO_PHYS(*work))) + *work = inserted; + return count; +} + +/* + * Extract a number of pages from a segment of free pages. + * Called by uvm_pmr_getpages. + * + * Returns the segment that was created from pages left over at the tail + * of the remove set of pages, or NULL if no pages were left at the tail. + */ +struct vm_page * +uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg, + paddr_t start, paddr_t end, struct pglist *result) +{ + struct vm_page *after, *pg_i; + psize_t before_sz, after_sz; +#ifdef DEBUG + psize_t i; +#endif + + KDASSERT(end > start); + KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg))); + KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz); + KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start); + KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end); + + before_sz = start - atop(VM_PAGE_TO_PHYS(pg)); + after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end; + KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz); + uvm_pmr_assertvalid(pmr); + + uvm_pmr_remove_size(pmr, pg); + if (before_sz == 0) + uvm_pmr_remove_addr(pmr, pg); + + /* Add selected pages to result. */ + for (pg_i = pg + before_sz; atop(VM_PAGE_TO_PHYS(pg_i)) < end; + pg_i++) { + KDASSERT(pg_i->pg_flags & PQ_FREE); + pg_i->fpgsz = 0; + TAILQ_INSERT_TAIL(result, pg_i, pageq); + } + + /* Before handling. */ + if (before_sz > 0) { + pg->fpgsz = before_sz; + uvm_pmr_insert_size(pmr, pg); + } + + /* After handling. */ + after = NULL; + if (after_sz > 0) { + after = pg + before_sz + (end - start); +#ifdef DEBUG + for (i = 0; i < after_sz; i++) { + KASSERT(!uvm_pmr_isfree(after + i)); + } +#endif + KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end); + after->fpgsz = after_sz; + after = uvm_pmr_insert_addr(pmr, after, 1); + uvm_pmr_insert_size(pmr, after); + } + + uvm_pmr_assertvalid(pmr); + return after; +} + +/* + * Acquire a number of pages. 
+ * + * count: the number of pages returned + * start: lowest page number + * end: highest page number +1 + * (start = end = 0: no limitation) + * align: power-of-2 alignment constraint (align = 1: no alignment) + * boundary: power-of-2 boundary (boundary = 0: no boundary) + * maxseg: maximum number of segments to return + * flags: UVM_PLA_* flags + * result: returned pages storage (uses pageq) + */ +int +uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align, + paddr_t boundary, int maxseg, int flags, struct pglist *result) +{ + struct uvm_pmemrange *pmr; /* Iterate memory ranges. */ + struct vm_page *found, *f_next; /* Iterate chunks. */ + psize_t fcount; /* Current found pages. */ + int fnsegs; /* Current segment counter. */ + int try, start_try; + psize_t search[3]; + paddr_t fstart, fend; /* Pages to be taken from found. */ + int memtype; /* Requested memtype. */ + int memtype_init; /* Best memtype. */ + int desperate; /* True if allocation failed. */ + + /* + * Validate arguments. + */ + KASSERT(count > 0 && + (start == 0 || end == 0 || start < end) && + align >= 1 && powerof2(align) && + maxseg > 0 && + (boundary == 0 || powerof2(boundary)) && + (boundary == 0 || maxseg * boundary >= count) && + TAILQ_EMPTY(result)); + + /* + * TRYCONTIG is a noop if you only want a single segment. + * Remove it if that's the case: otherwise it'll deny the fast + * allocation. + */ + if (maxseg == 1 || count == 1) + flags &= ~UVM_PLA_TRYCONTIG; + + /* + * Configure search. + * + * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case. + * search[1] is multiple segments, chosen to fulfill the search in + * approximately even-sized segments. + * This is a good trade-off between slightly reduced allocation speed + * and less fragmentation. + * search[2] is the worst case, in which all segments are evaluated. + * This provides the least fragmentation, but makes the search + * possibly longer (although in the case it is selected, that no + * longer matters most). + * + * The exception is when maxseg == 1: since we can only fulfill that + * with one segment of size pages, only a single search type has to + * be attempted. + */ + if (maxseg == 1 || count == 1) { + start_try = 2; + search[2] = count; + } else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) { + start_try = 2; + search[2] = 1; + } else { + start_try = 0; + search[0] = count; + search[1] = pow2divide(count, maxseg); + search[2] = 1; + if ((flags & UVM_PLA_TRYCONTIG) == 0) + start_try = 1; + if (search[1] >= search[0]) { + search[1] = search[0]; + start_try = 1; + } + if (search[2] >= search[start_try]) { + start_try = 2; + } + } + + /* + * Memory type: if zeroed memory is requested, traverse the zero set. + * Otherwise, traverse the dirty set. + * + * The memtype iterator is reinitialized to memtype_init on entrance + * of a pmemrange. + */ + if (flags & UVM_PLA_ZERO) + memtype_init = UVM_PMR_MEMTYPE_ZERO; + else + memtype_init = UVM_PMR_MEMTYPE_DIRTY; + + /* + * Initially, we're not desperate. + * + * Note that if we return from a sleep, we are still desperate. + * Chances are that memory pressure is still high, so resetting + * seems over-optimistic to me. + */ + desperate = 0; + +ReTry: /* Return point after sleeping. */ + fcount = 0; + fnsegs = 0; + + uvm_lock_fpageq(); + +ReTryDesperate: + /* + * If we just want any page(s), go for the really fast option. 
+ */ + if (count <= maxseg && align == 1 && boundary == 0 && + (flags & UVM_PLA_TRYCONTIG) == 0) { + fcount += uvm_pmr_get1page(count - fcount, memtype_init, + result, start, end); + + /* + * If we found sufficient pages, go to the succes exit code. + * + * Otherwise, go immediately to fail, since we collected + * all we could anyway. + */ + if (fcount == count) + goto Out; + else + goto Fail; + } + + /* + * The hart of the contig case. + * + * The code actually looks like this: + * + * foreach (struct pmemrange) { + * foreach (memtype) { + * foreach(try) { + * foreach (free range of memtype in pmemrange, + * starting at search[try]) { + * while (range has space left) + * take from range + * } + * } + * } + * + * if next pmemrange has higher usecount than current: + * enter desperate case (which will drain the pmemranges + * until empty prior to moving to the next one) + * } + * + * When desperate is activated, try always starts at the highest + * value. The memtype loop is using a goto ReScanMemtype. + * The try loop is using a goto ReScan. + * The 'range has space left' loop uses label DrainFound. + * + * Writing them all as loops would take up a lot of screen space in + * the form of indentation and some parts are easier to express + * using the labels. + */ + + TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { + /* Empty range. */ + if (pmr->nsegs == 0) + continue; + + /* Outside requested range. */ + if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end)) + continue; + + memtype = memtype_init; + +ReScanMemtype: /* Return point at memtype++. */ + try = start_try; + +ReScan: /* Return point at try++. */ + for (found = uvm_pmr_nfindsz(pmr, search[try], memtype); + found != NULL; + found = f_next) { + f_next = uvm_pmr_nextsz(pmr, found, memtype); + + fstart = atop(VM_PAGE_TO_PHYS(found)); + if (start != 0) + fstart = MAX(start, fstart); +DrainFound: + /* + * Throw away the first segment if fnsegs == maxseg + * + * Note that f_next is still valid after this call, + * since we only allocated from entries before f_next. + * We don't revisit the entries we already extracted + * from unless we entered the desperate case. + */ + if (fnsegs == maxseg) { + fnsegs--; + fcount -= + uvm_pmr_remove_1strange(result, boundary, + &found, desperate); + } + + fstart = PMR_ALIGN(fstart, align); + fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz; + if (fstart >= fend) + continue; + if (boundary != 0) { + fend = + MIN(fend, PMR_ALIGN(fstart + 1, boundary)); + } + if (end != 0) + fend = MIN(end, fend); + if (fend - fstart > count - fcount) + fend = fstart + (count - fcount); + + fcount += fend - fstart; + fnsegs++; + found = uvm_pmr_extract_range(pmr, found, + fstart, fend, result); + + if (fcount == count) + goto Out; + + /* + * If there's still space left in found, try to + * fully drain it prior to continueing. + */ + if (found != NULL) { + fstart = fend; + goto DrainFound; + } + } + + /* + * Try a smaller search now. + */ + if (++try < nitems(search)) + goto ReScan; + + /* + * Exhaust all memory types prior to going to the next memory + * segment. + * This means that zero-vs-dirty are eaten prior to moving + * to a pmemrange with a higher use-count. 
+ * + * Code is basically a difficult way of writing: + * memtype = memtype_init; + * do { + * ...; + * memtype += 1; + * memtype %= MEMTYPE_MAX; + * } while (memtype != memtype_init); + */ + memtype += 1; + if (memtype == UVM_PMR_MEMTYPE_MAX) + memtype = 0; + if (memtype != memtype_init) + goto ReScanMemtype; + + /* + * If not desperate, enter desperate case prior to eating all + * the good stuff in the next range. + */ + if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL && + TAILQ_NEXT(pmr, pmr_use)->use != pmr->use) + break; + } + + /* + * Not enough memory of the requested type available. Fall back to + * less good memory that we'll clean up better later. + * + * This algorithm is not very smart though, it just starts scanning + * a different typed range, but the nicer ranges of the previous + * iteration may fall out. Hence there is a small chance of a false + * negative. + * + * When desparate: scan all sizes starting at the smallest + * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may + * allow us to hit the fast path now). + * + * Also, because we will revisit entries we scanned before, we need + * to reset the page queue, or we may end up releasing entries in + * such a way as to invalidate f_next. + */ + if (!desperate) { + desperate = 1; + start_try = nitems(search) - 1; + flags &= ~UVM_PLA_TRYCONTIG; + + while (!TAILQ_EMPTY(result)) + uvm_pmr_remove_1strange(result, 0, NULL, 0); + fnsegs = 0; + fcount = 0; + goto ReTryDesperate; + } + +Fail: + /* + * Allocation failed. + */ + + /* XXX: claim from memory reserve here */ + + while (!TAILQ_EMPTY(result)) + uvm_pmr_remove_1strange(result, 0, NULL, 0); + uvm_unlock_fpageq(); + + if (flags & UVM_PLA_WAITOK) { + uvm_wait("uvm_pmr_getpages"); + goto ReTry; + } else + wakeup(&uvm.pagedaemon_proc); + + return ENOMEM; + +Out: + + /* + * Allocation succesful. + */ + + uvmexp.free -= fcount; + + uvm_unlock_fpageq(); + + /* Update statistics and zero pages if UVM_PLA_ZERO. */ + TAILQ_FOREACH(found, result, pageq) { + atomic_clearbits_int(&found->pg_flags, + PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3); + + if (found->pg_flags & PG_ZERO) { + uvmexp.zeropages--; + } + if (flags & UVM_PLA_ZERO) { + if (found->pg_flags & PG_ZERO) + uvmexp.pga_zerohit++; + else { + uvmexp.pga_zeromiss++; + uvm_pagezero(found); + } + } + atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE); + + found->uobject = NULL; + found->uanon = NULL; + found->pg_version++; + + /* + * Validate that the page matches range criterium. + */ + KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start); + KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end); + } + + return 0; +} + +/* + * Free a number of contig pages (invoked by uvm_page_init). 
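+ *
+ * (Illustrative sketch, not part of the committed code: boot code that
+ * hands N consecutive managed pages starting at pg to the allocator
+ * simply calls
+ *
+ *     uvm_pmr_freepages(pg, N);
+ *
+ * and the loop below takes care of splitting the span wherever it
+ * crosses a pmemrange boundary.)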
+ */ +void +uvm_pmr_freepages(struct vm_page *pg, psize_t count) +{ + struct uvm_pmemrange *pmr; + psize_t i, pmr_count; + + for (i = 0; i < count; i++) { + KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) == + atop(VM_PAGE_TO_PHYS(pg)) + i); + + if (!((pg[i].pg_flags & PQ_FREE) == 0 && + VALID_FLAGS(pg[i].pg_flags))) { + printf("Flags: 0x%x, will panic now.\n", + pg[i].pg_flags); + } + KASSERT((pg[i].pg_flags & PQ_FREE) == 0 && + VALID_FLAGS(pg[i].pg_flags)); + atomic_setbits_int(&pg[i].pg_flags, PQ_FREE); + atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO); + } + + uvm_lock_fpageq(); + + while (count > 0) { + pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); + KASSERT(pmr != NULL); + + pmr_count = MIN(count, pmr->high - atop(VM_PAGE_TO_PHYS(pg))); + pg->fpgsz = pmr_count; + uvm_pmr_insert(pmr, pg, 0); + + uvmexp.free += pmr_count; + count -= pmr_count; + pg += pmr_count; + } + wakeup(&uvmexp.free); + + uvm_unlock_fpageq(); +} + +/* + * Free all pages in the queue. + */ +void +uvm_pmr_freepageq(struct pglist *pgl) +{ + struct vm_page *pg; + + TAILQ_FOREACH(pg, pgl, pageq) { + if (!((pg->pg_flags & PQ_FREE) == 0 && + VALID_FLAGS(pg->pg_flags))) { + printf("Flags: 0x%x, will panic now.\n", + pg->pg_flags); + } + KASSERT((pg->pg_flags & PQ_FREE) == 0 && + VALID_FLAGS(pg->pg_flags)); + atomic_setbits_int(&pg->pg_flags, PQ_FREE); + atomic_clearbits_int(&pg->pg_flags, PG_ZERO); + } + + uvm_lock_fpageq(); + while (!TAILQ_EMPTY(pgl)) + uvmexp.free += uvm_pmr_remove_1strange(pgl, 0, NULL, 0); + wakeup(&uvmexp.free); + uvm_unlock_fpageq(); + + return; +} + +/* + * Store a pmemrange in the list. + * + * The list is sorted by use. + */ +struct uvm_pmemrange * +uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq, + struct uvm_pmemrange *pmr) +{ + struct uvm_pmemrange *iter; + int cmp = 1; + + TAILQ_FOREACH(iter, useq, pmr_use) { + cmp = uvm_pmemrange_use_cmp(pmr, iter); + if (cmp == 0) + return iter; + if (cmp == -1) + break; + } + + if (iter == NULL) + TAILQ_INSERT_TAIL(useq, pmr, pmr_use); + else + TAILQ_INSERT_BEFORE(iter, pmr, pmr_use); + return NULL; +} + +#ifdef DEBUG +/* + * Validation of the whole pmemrange. + * Called with fpageq locked. + */ +void +uvm_pmr_assertvalid(struct uvm_pmemrange *pmr) +{ + struct vm_page *prev, *next, *i, *xref; + int lcv, mti; + + /* Validate address tree. */ + RB_FOREACH(i, uvm_pmr_addr, &pmr->addr) { + /* Validate the range. */ + KASSERT(i->fpgsz > 0); + KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low); + KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz + <= pmr->high); + + /* Validate each page in this range. */ + for (lcv = 0; lcv < i->fpgsz; lcv++) { + /* + * Only the first page has a size specification. + * Rest is size 0. + */ + KASSERT(lcv == 0 || i[lcv].fpgsz == 0); + /* + * Flag check. + */ + KASSERT(VALID_FLAGS(i[lcv].pg_flags) && + (i[lcv].pg_flags & PQ_FREE) == PQ_FREE); + /* + * Free pages are: + * - not wired + * - not loaned + * - have no vm_anon + * - have no uvm_object + */ + KASSERT(i[lcv].wire_count == 0); + KASSERT(i[lcv].loan_count == 0); + KASSERT(i[lcv].uanon == (void*)0xdeadbeef || + i[lcv].uanon == NULL); + KASSERT(i[lcv].uobject == (void*)0xdeadbeef || + i[lcv].uobject == NULL); + /* + * Pages in a single range always have the same + * memtype. + */ + KASSERT(uvm_pmr_pg_to_memtype(&i[0]) == + uvm_pmr_pg_to_memtype(&i[lcv])); + } + + /* Check that it shouldn't be joined with its predecessor. 
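+ * Two neighbouring free chunks are joinable only if they have the
+ * same memtype and are adjacent both in page numbers and in the
+ * vm_page array (prev + prev->fpgsz == i); the assertion below
+ * requires at least one of those conditions to fail.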
*/ + prev = RB_PREV(uvm_pmr_addr, &pmr->addr, i); + if (prev != NULL) { + KASSERT(uvm_pmr_pg_to_memtype(i) != + uvm_pmr_pg_to_memtype(prev) || + atop(VM_PAGE_TO_PHYS(i)) > + atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz || + prev + prev->fpgsz != i); + } + + /* Assert i is in the size tree as well. */ + if (i->fpgsz == 1) { + TAILQ_FOREACH(xref, + &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) { + if (xref == i) + break; + } + KASSERT(xref == i); + } else { + KASSERT(RB_FIND(uvm_pmr_size, + &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) == + i + 1); + } + } + + /* Validate size tree. */ + for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { + for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) { + next = uvm_pmr_nextsz(pmr, i, mti); + if (next != NULL) { + KASSERT(i->fpgsz <= + next->fpgsz); + } + + /* Assert i is in the addr tree as well. */ + KASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, i) == i); + + /* Assert i is of the correct memory type. */ + KASSERT(uvm_pmr_pg_to_memtype(i) == mti); + } + } + + /* Validate nsegs statistic. */ + lcv = 0; + RB_FOREACH(i, uvm_pmr_addr, &pmr->addr) + lcv++; + KASSERT(pmr->nsegs == lcv); +} +#endif /* DEBUG */ + +/* + * Split pmr at split point pageno. + * Called with fpageq unlocked. + * + * Split is only applied if a pmemrange spans pageno. + */ +void +uvm_pmr_split(paddr_t pageno) +{ + struct uvm_pmemrange *pmr, *drain; + struct vm_page *rebuild, *prev, *next; + psize_t prev_sz; + + uvm_lock_fpageq(); + pmr = uvm_pmemrange_find(pageno); + if (pmr == NULL || !(pmr->low < pageno)) { + /* No split required. */ + uvm_unlock_fpageq(); + return; + } + + KASSERT(pmr->low < pageno); + KASSERT(pmr->high > pageno); + + drain = uvm_pmr_allocpmr(); + drain->low = pageno; + drain->high = pmr->high; + drain->use = pmr->use; + + uvm_pmr_assertvalid(pmr); + uvm_pmr_assertvalid(drain); + KASSERT(drain->nsegs == 0); + + RB_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) { + if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno) + break; + } + if (rebuild == NULL) + prev = RB_MAX(uvm_pmr_addr, &pmr->addr); + else + prev = RB_PREV(uvm_pmr_addr, &pmr->addr, rebuild); + KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno); + + /* + * Handle free chunk that spans the split point. + */ + if (prev != NULL && + atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) { + psize_t before, after; + + KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno); + + uvm_pmr_remove(pmr, prev); + prev_sz = prev->fpgsz; + before = pageno - atop(VM_PAGE_TO_PHYS(prev)); + after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno; + + KASSERT(before > 0); + KASSERT(after > 0); + + prev->fpgsz = before; + uvm_pmr_insert(pmr, prev, 1); + (prev + before)->fpgsz = after; + uvm_pmr_insert(drain, prev + before, 1); + } + + /* + * Move free chunks that no longer fall in the range. + */ + for (; rebuild != NULL; rebuild = next) { + next = RB_NEXT(uvm_pmr_addr, &pmr->addr, rebuild); + + uvm_pmr_remove(pmr, rebuild); + uvm_pmr_insert(drain, rebuild, 1); + } + + pmr->high = pageno; + uvm_pmr_assertvalid(pmr); + uvm_pmr_assertvalid(drain); + + RB_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain); + uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain); + uvm_unlock_fpageq(); +} + +/* + * Increase the usage counter for the given range of memory. + * + * The more usage counters a given range of memory has, the more will be + * attempted not to allocate from it. + * + * Addresses here are in paddr_t, not page-numbers. + * The lowest and highest allowed address are specified. 
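+ *
+ * (Illustrative example, not from this commit: an architecture that
+ * wants ordinary allocations steered away from its ISA DMA area could
+ * list { 0x0, 0xffffff } in its UVM_IO_RANGES definition; uvm_pmr_init()
+ * below feeds every such range to this function.)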
+ */ +void +uvm_pmr_use_inc(paddr_t low, paddr_t high) +{ + struct uvm_pmemrange *pmr; + + /* + * If high+1 == 0 and low == 0, then you are increasing use + * of the whole address space, which won't make any difference. + * Skip in that case. + */ + high++; + if (high == 0 && low == 0) + return; + + /* + * pmr uses page numbers, translate low and high. + */ + low = atop(round_page(low)); + high = atop(trunc_page(high)); + uvm_pmr_split(low); + uvm_pmr_split(high); + + uvm_lock_fpageq(); + + /* Increase use count on segments in range. */ + RB_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) { + if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) { + TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use); + pmr->use++; + uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr); + } + uvm_pmr_assertvalid(pmr); + } + + uvm_unlock_fpageq(); +} + +/* + * Allocate a pmemrange. + * + * If called from uvm_page_init, the uvm_pageboot_alloc is used. + * If called after uvm_init, malloc is used. + * (And if called in between, you're dead.) + */ +struct uvm_pmemrange * +uvm_pmr_allocpmr() +{ + struct uvm_pmemrange *nw; + int i; + + if (!uvm.page_init_done) { + nw = (struct uvm_pmemrange *) + uvm_pageboot_alloc(sizeof(struct uvm_pmemrange)); + bzero(nw, sizeof(struct uvm_pmemrange)); + } else { + nw = malloc(sizeof(struct uvm_pmemrange), + M_VMMAP, M_NOWAIT | M_ZERO); + } + RB_INIT(&nw->addr); + for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) { + RB_INIT(&nw->size[i]); + TAILQ_INIT(&nw->single[i]); + } + return nw; +} + +static const struct uvm_io_ranges uvm_io_ranges[] = UVM_IO_RANGES; + +/* + * Initialization of pmr. + * Called by uvm_page_init. + * + * Sets up pmemranges. + */ +void +uvm_pmr_init(void) +{ + struct uvm_pmemrange *new_pmr; + int i; + + TAILQ_INIT(&uvm.pmr_control.use); + RB_INIT(&uvm.pmr_control.addr); + + new_pmr = uvm_pmr_allocpmr(); + new_pmr->low = 0; + new_pmr->high = atop((paddr_t)-1) + 1; + + RB_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr); + uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr); + + for (i = 0; i < nitems(uvm_io_ranges); i++) + uvm_pmr_use_inc(uvm_io_ranges[i].low, uvm_io_ranges[i].high); +} + +/* + * Find the pmemrange that contains the given page number. + * + * (Manually traverses the binary tree, because that is cheaper on stack + * usage.) + */ +struct uvm_pmemrange * +uvm_pmemrange_find(paddr_t pageno) +{ + struct uvm_pmemrange *pmr; + + pmr = RB_ROOT(&uvm.pmr_control.addr); + while (pmr != NULL) { + if (pmr->low > pageno) + pmr = RB_LEFT(pmr, pmr_addr); + else if (pmr->high <= pageno) + pmr = RB_RIGHT(pmr, pmr_addr); + else + break; + } + + return pmr; +} + +#if defined(DDB) || defined(DEBUG) +/* + * Return true if the given page is in any of the free lists. + * Used by uvm_page_printit. + * This function is safe, even if the page is not on the freeq. + * Note: does not apply locking, only called from ddb. + */ +int +uvm_pmr_isfree(struct vm_page *pg) +{ + struct vm_page *r; + struct uvm_pmemrange *pmr; + + pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); + if (pmr == NULL) + return 0; + r = RB_NFIND(uvm_pmr_addr, &pmr->addr, pg); + if (r == NULL) + r = RB_MAX(uvm_pmr_addr, &pmr->addr); + else + r = RB_PREV(uvm_pmr_addr, &pmr->addr, r); + if (r == NULL) + return 0; /* Empty tree. 
*/ + + KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg))); + return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz > + atop(VM_PAGE_TO_PHYS(pg)); +} +#endif /* DEBUG */ + +/* + * Given a root of a tree, find a range which intersects start, end and + * is of the same memtype. + * + * Page must be in the address tree. + */ +struct vm_page* +uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root, + paddr_t start, paddr_t end, int memtype) +{ + int direction; + struct vm_page *root; + struct vm_page *high, *high_next; + struct vm_page *low, *low_next; + + KDASSERT(pmr != NULL && init_root != NULL); + root = init_root; + + /* + * Which direction to use for searching. + */ + if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start) + direction = 1; + else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end) + direction = -1; + else /* nothing to do */ + return root; + + /* + * First, update root to fall within the chosen range. + */ + while (root && !PMR_INTERSECTS_WITH( + atop(VM_PAGE_TO_PHYS(root)), + atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz, + start, end)) { + if (direction == 1) + root = RB_RIGHT(root, objt); + else + root = RB_LEFT(root, objt); + } + if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype) + return root; + + /* + * Root is valid, but of the wrong memtype. + * + * Try to find a range that has the given memtype in the subtree + * (memtype mismatches are costly, either because the conversion + * is expensive, or a later allocation will need to do the opposite + * conversion, which will be expensive). + * + * + * First, simply increase address until we hit something we can use. + * Cache the upper page, so we can page-walk later. + */ + high = root; + high_next = RB_RIGHT(high, objt); + while (high_next != NULL && PMR_INTERSECTS_WITH( + atop(VM_PAGE_TO_PHYS(high_next)), + atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz, + start, end)) { + high = high_next; + if (uvm_pmr_pg_to_memtype(high) == memtype) + return high; + high_next = RB_RIGHT(high, objt); + } + + /* + * Second, decrease the address until we hit something we can use. + * Cache the lower page, so we can page-walk later. + */ + low = root; + low_next = RB_RIGHT(low, objt); + while (low_next != NULL && PMR_INTERSECTS_WITH( + atop(VM_PAGE_TO_PHYS(low_next)), + atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz, + start, end)) { + low = low_next; + if (uvm_pmr_pg_to_memtype(low) == memtype) + return low; + low_next = RB_RIGHT(low, objt); + } + + /* + * Ack, no hits. Walk the address tree until to find something usable. + */ + for (low = RB_NEXT(uvm_pmr_addr, &pmr->addr, low); + low != high; + low = RB_NEXT(uvm_pmr_addr, &pmr->addr, low)) { + KASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(high_next)), + atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz, + start, end)); + if (uvm_pmr_pg_to_memtype(low) == memtype) + return low; + } + + /* + * Nothing found. + */ + return NULL; +} + +/* + * Allocate any page, the fastest way. Page number constraints only. + */ +int +uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result, + paddr_t start, paddr_t end) +{ + struct uvm_pmemrange *pmr; + struct vm_page *found, *splitpg; + psize_t fcount; + int memtype; + + fcount = 0; + for (pmr = TAILQ_FIRST(&uvm.pmr_control.use); + pmr != NULL && fcount != count; pmr = TAILQ_NEXT(pmr, pmr_use)) { + /* Outside requested range. */ + if (!(start == 0 && end == 0) && + !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end)) + continue; + + /* Range is empty. 
*/ + if (pmr->nsegs == 0) + continue; + + /* + * Loop over all memtypes, starting at memtype_init. + */ + memtype = memtype_init; + do { + found = TAILQ_FIRST(&pmr->single[memtype]); + /* + * If found is outside the range, walk the list + * until we find something that intersects with + * boundaries. + */ + while (found && !PMR_INTERSECTS_WITH( + atop(VM_PAGE_TO_PHYS(found)), + atop(VM_PAGE_TO_PHYS(found)) + 1, + start, end)) + found = TAILQ_NEXT(found, pageq); + + if (found == NULL) { + found = RB_ROOT(&pmr->size[memtype]); + /* Size tree gives pg[1] instead of pg[0] */ + if (found != NULL) + found--; + + found = uvm_pmr_rootupdate(pmr, found, + start, end, memtype); + } + if (found != NULL) { + uvm_pmr_assertvalid(pmr); + uvm_pmr_remove_size(pmr, found); + + /* + * If the page intersects the end, then it'll + * need splitting. + * + * Note that we don't need to split if the page + * intersects start: the drain function will + * simply stop on hitting start. + */ + if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) + + found->fpgsz > end) { + psize_t splitsz = + atop(VM_PAGE_TO_PHYS(found)) + + found->fpgsz - end; + + uvm_pmr_remove_addr(pmr, found); + uvm_pmr_assertvalid(pmr); + found->fpgsz -= splitsz; + splitpg = found + found->fpgsz; + splitpg->fpgsz = splitsz; + uvm_pmr_insert(pmr, splitpg, 1); + + /* + * At this point, splitpg and found + * actually should be joined. + * But we explicitly disable that, + * because we will start subtracting + * from found. + */ + KASSERT(start == 0 || + atop(VM_PAGE_TO_PHYS(found)) + + found->fpgsz > start); + uvm_pmr_insert_addr(pmr, found, 1); + } + + /* + * Fetch pages from the end. + * If the range is larger than the requested + * number of pages, this saves us an addr-tree + * update. + * + * Since we take from the end and insert at + * the head, any ranges keep preserved. + */ + while (found->fpgsz > 0 && fcount < count && + (start == 0 || + atop(VM_PAGE_TO_PHYS(found)) + + found->fpgsz > start)) { + found->fpgsz--; + fcount++; + TAILQ_INSERT_HEAD(result, + &found[found->fpgsz], pageq); + } + if (found->fpgsz > 0) { + uvm_pmr_insert_size(pmr, found); + KDASSERT(fcount == count); + uvm_pmr_assertvalid(pmr); + return fcount; + } + + /* + * Delayed addr-tree removal. + */ + uvm_pmr_remove_addr(pmr, found); + uvm_pmr_assertvalid(pmr); + } else { + /* + * Skip to the next memtype. + */ + memtype += 1; + if (memtype == UVM_PMR_MEMTYPE_MAX) + memtype = 0; + } + } while (memtype != memtype_init && fcount != count); + } + + /* + * Search finished. + * + * Ran out of ranges before enough pages were gathered, or we hit the + * case where found->fpgsz == count - fcount, in which case the + * above exit condition didn't trigger. + * + * On failure, caller will free the pages. + */ + return fcount; +} + +#ifdef DDB +/* + * Print information about pmemrange. + * Does not do locking (so either call it from DDB or acquire fpageq lock + * before invoking. + */ +void +uvm_pmr_print(void) +{ + struct uvm_pmemrange *pmr; + struct vm_page *pg; + psize_t size[UVM_PMR_MEMTYPE_MAX]; + psize_t free; + int useq_len; + int mt; + + printf("Ranges, use queue:\n"); + useq_len = 0; + TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { + useq_len++; + free = 0; + for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { + pg = RB_MAX(uvm_pmr_size, &pmr->size[mt]); + if (pg != NULL) + pg--; + else + pg = TAILQ_FIRST(&pmr->single[mt]); + size[mt] = (pg == NULL ? 
0 : pg->fpgsz); + + RB_FOREACH(pg, uvm_pmr_addr, &pmr->addr) + free += pg->fpgsz; + } + + printf("* [0x%lx-0x%lx] use=%d nsegs=%ld", + (unsigned long)pmr->low, (unsigned long)pmr->high, + pmr->use, (unsigned long)pmr->nsegs); + for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { + printf(" maxsegsz[%d]=0x%lx", mt, + (unsigned long)size[mt]); + } + printf(" free=0x%lx\n", (unsigned long)free); + } + printf("#ranges = %d\n", useq_len); +} +#endif diff --git a/sys/uvm/uvm_pmemrange.h b/sys/uvm/uvm_pmemrange.h new file mode 100644 index 00000000000..3fb477e6ef7 --- /dev/null +++ b/sys/uvm/uvm_pmemrange.h @@ -0,0 +1,83 @@ +/* $OpenBSD: uvm_pmemrange.h,v 1.5 2010/04/22 19:02:55 oga Exp $ */ + +/* + * Copyright (c) 2009 Ariane van der Steldt <ariane@stack.nl> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * uvm_pmemrange.h: describe and manage free physical memory. + */ + +#ifndef _UVM_UVM_PMEMRANGE_H_ +#define _UVM_UVM_PMEMRANGE_H_ + +#include <uvm/uvm_extern.h> +#include <uvm/uvm_page.h> + +RB_HEAD(uvm_pmr_addr, vm_page); +RB_HEAD(uvm_pmr_size, vm_page); + +/* + * Page types available: + * - DIRTY: this page may contain random data. + * - ZERO: this page has been zeroed. + */ +#define UVM_PMR_MEMTYPE_DIRTY 0 +#define UVM_PMR_MEMTYPE_ZERO 1 +#define UVM_PMR_MEMTYPE_MAX 2 + +/* + * An address range of memory. + */ +struct uvm_pmemrange { + struct uvm_pmr_addr addr; /* Free page chunks, sorted by addr. */ + struct uvm_pmr_size size[UVM_PMR_MEMTYPE_MAX]; + /* Free page chunks, sorted by size. */ + TAILQ_HEAD(, vm_page) single[UVM_PMR_MEMTYPE_MAX]; + /* single page regions (uses pageq) */ + + paddr_t low; /* Start of address range (pgno). */ + paddr_t high; /* End +1 (pgno). */ + int use; /* Use counter. */ + psize_t nsegs; /* Current range count. */ + + TAILQ_ENTRY(uvm_pmemrange) pmr_use; + /* pmr, sorted by use */ + RB_ENTRY(uvm_pmemrange) pmr_addr; + /* pmr, sorted by address */ +}; + +RB_HEAD(uvm_pmemrange_addr, uvm_pmemrange); +TAILQ_HEAD(uvm_pmemrange_use, uvm_pmemrange); + +/* + * pmr control structure. Contained in uvm.pmr_control. + */ +struct uvm_pmr_control { + struct uvm_pmemrange_addr addr; + struct uvm_pmemrange_use use; +}; + +void uvm_pmr_freepages(struct vm_page *, psize_t); +void uvm_pmr_freepageq(struct pglist *pgl); +int uvm_pmr_getpages(psize_t, paddr_t, paddr_t, paddr_t, paddr_t, + int, int, struct pglist *); +void uvm_pmr_init(void); + +#ifdef DDB +int uvm_pmr_isfree(struct vm_page *pg); +#endif + +#endif /* _UVM_UVM_PMEMRANGE_H_ */ |
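
For illustration only, not part of the commit above: a rough sketch of how a
hypothetical consumer might exercise the interface declared in
uvm_pmemrange.h. In-tree callers normally reach this layer through
uvm_pglistalloc(9)/uvm_pglistfree(9); the direct calls below merely spell out
the parameter meanings, and the 16 MB device constraint and function name are
made up for the example.

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <uvm/uvm.h>
	#include <uvm/uvm_pmemrange.h>

	/*
	 * Hypothetical example: fetch 16 zeroed, physically contiguous
	 * pages below 16 MB, report their addresses, then give them back.
	 */
	int
	example_pmr_usage(void)
	{
		struct pglist pgl;
		struct vm_page *pg;
		int error;

		TAILQ_INIT(&pgl);
		/* count, start, end, align, boundary, maxseg, flags, result */
		error = uvm_pmr_getpages(16, 0, atop(0x1000000), 1, 0, 1,
		    UVM_PLA_ZERO | UVM_PLA_WAITOK, &pgl);
		if (error != 0)
			return error;

		TAILQ_FOREACH(pg, &pgl, pageq)
			printf("got page at 0x%lx\n",
			    (unsigned long)VM_PAGE_TO_PHYS(pg));

		uvm_pmr_freepageq(&pgl);
		return 0;
	}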