author     1999-02-26 01:30:10 +0000
committer  1999-02-26 01:30:10 +0000
commit     cd7ee8acd30fe8d4b178a6bcda689f469732e4bc (patch)
tree       00ca09c99c7798adde771b6c8afd33bbf1e14fc0
parent     convert to mdoc, document changes from db 1.8.6 (diff)
Import of uvm from NetBSD. Some local changes, some code disabled
46 files changed, 23052 insertions, 0 deletions
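The amap implementation imported below keeps its in-use slots in a contig-packed array (am_slots) with back pointers (am_bckptr), as the comments in uvm_amap.h describe and as amap_add()/amap_unadd() in uvm_amap_i.h implement. What follows is a minimal, self-contained sketch of just that bookkeeping, kept in the same C style as the import; the toy_* names and the stand-in anon type are illustrative only and are not part of the imported code.

#include <assert.h>
#include <stddef.h>

#define NSLOT 8				/* toy amap with 8 slots */

struct toy_anon { int dummy; };

struct toy_amap {
	struct toy_anon *am_anon[NSLOT];/* anon per slot, or NULL */
	int am_slots[NSLOT];		/* contig-packed list of used slots */
	int am_bckptr[NSLOT];		/* slot -> its index in am_slots */
	int am_nused;			/* number of entries in am_slots */
};

/* add an anon at "slot" (mirrors the !replace path of amap_add) */
static void
toy_add(struct toy_amap *amap, int slot, struct toy_anon *anon)
{
	assert(amap->am_anon[slot] == NULL);
	amap->am_bckptr[slot] = amap->am_nused;
	amap->am_slots[amap->am_nused++] = slot;
	amap->am_anon[slot] = anon;
}

/* remove the anon at "slot" (mirrors amap_unadd) */
static void
toy_unadd(struct toy_amap *amap, int slot)
{
	int ptr = amap->am_bckptr[slot];

	assert(amap->am_anon[slot] != NULL);
	amap->am_anon[slot] = NULL;
	/* keep am_slots contig-packed: move the last entry into the hole */
	if (ptr != amap->am_nused - 1) {
		amap->am_slots[ptr] = amap->am_slots[amap->am_nused - 1];
		amap->am_bckptr[amap->am_slots[ptr]] = ptr;
	}
	amap->am_nused--;
}

int
main(void)
{
	struct toy_amap amap = { .am_nused = 0 };
	struct toy_anon a, b;
	int i, slot;

	toy_add(&amap, 1, &a);
	toy_add(&amap, 3, &b);
	toy_unadd(&amap, 1);		/* slot 3 gets moved into the hole */

	/* the invariant documented in uvm_amap.h: am_slots[am_bckptr[X]] == X */
	for (i = 0; i < amap.am_nused; i++) {
		slot = amap.am_slots[i];
		assert(amap.am_anon[slot] != NULL);
		assert(amap.am_bckptr[slot] == i);
	}
	return 0;
}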
diff --git a/sys/uvm/uvm.h b/sys/uvm/uvm.h new file mode 100644 index 00000000000..4f4d5164527 --- /dev/null +++ b/sys/uvm/uvm.h @@ -0,0 +1,181 @@ +/* $NetBSD: uvm.h,v 1.13 1998/10/11 22:59:53 chuck Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ + +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm.h,v 1.1.2.14 1998/02/02 20:07:19 chuck Exp + */ + +#ifndef _UVM_UVM_H_ +#define _UVM_UVM_H_ + +#include <uvm/uvm_extern.h> + +#include <uvm/uvm_stat.h> + +/* + * pull in prototypes + */ + +#include <uvm/uvm_amap.h> +#include <uvm/uvm_aobj.h> +#include <uvm/uvm_fault.h> +#include <uvm/uvm_glue.h> +#include <uvm/uvm_km.h> +#include <uvm/uvm_loan.h> +#include <uvm/uvm_map.h> +#include <uvm/uvm_object.h> +#include <uvm/uvm_page.h> +#include <uvm/uvm_pager.h> +#include <uvm/uvm_pdaemon.h> +#include <uvm/uvm_swap.h> + +/* + * pull in VM_NFREELIST + */ +#include <machine/vmparam.h> + +/* + * uvm structure (vm global state: collected in one structure for ease + * of reference...) 
+ */ + +struct uvm { + /* vm_page related parameters */ + /* vm_page queues */ + struct pglist page_free[VM_NFREELIST]; /* unallocated pages */ + struct pglist page_active; /* allocated pages, in use */ + struct pglist page_inactive_swp;/* pages inactive (reclaim or free) */ + struct pglist page_inactive_obj;/* pages inactive (reclaim or free) */ + simple_lock_data_t pageqlock; /* lock for active/inactive page q */ + simple_lock_data_t fpageqlock; /* lock for free page q */ + /* page daemon trigger */ + int pagedaemon; /* daemon sleeps on this */ + struct proc *pagedaemon_proc; /* daemon's pid */ + simple_lock_data_t pagedaemon_lock; + /* page hash */ + struct pglist *page_hash; /* page hash table (vp/off->page) */ + int page_nhash; /* number of buckets */ + int page_hashmask; /* hash mask */ + simple_lock_data_t hashlock; /* lock on page_hash array */ + /* anon stuff */ + struct vm_anon *afree; /* anon free list */ + simple_lock_data_t afreelock; /* lock on anon free list */ + + /* static kernel map entry pool */ + vm_map_entry_t kentry_free; /* free page pool */ + simple_lock_data_t kentry_lock; + + /* aio_done is locked by uvm.pagedaemon_lock and splbio! */ + struct uvm_aiohead aio_done; /* done async i/o reqs */ + + /* pager VM area bounds */ + vaddr_t pager_sva; /* start of pager VA area */ + vaddr_t pager_eva; /* end of pager VA area */ + + /* kernel object: to support anonymous pageable kernel memory */ + struct uvm_object *kernel_object; +}; + +extern struct uvm uvm; + +/* + * historys + */ + +UVMHIST_DECL(maphist); +UVMHIST_DECL(pdhist); + +/* + * vm_map_entry etype bits: + */ + +#define UVM_ET_OBJ 0x01 /* it is a uvm_object */ +#define UVM_ET_SUBMAP 0x02 /* it is a vm_map submap */ +#define UVM_ET_COPYONWRITE 0x04 /* copy_on_write */ +#define UVM_ET_NEEDSCOPY 0x08 /* needs_copy */ + +#define UVM_ET_ISOBJ(E) (((E)->etype & UVM_ET_OBJ) != 0) +#define UVM_ET_ISSUBMAP(E) (((E)->etype & UVM_ET_SUBMAP) != 0) +#define UVM_ET_ISCOPYONWRITE(E) (((E)->etype & UVM_ET_COPYONWRITE) != 0) +#define UVM_ET_ISNEEDSCOPY(E) (((E)->etype & UVM_ET_NEEDSCOPY) != 0) + +/* + * macros + */ + +/* + * UVM_UNLOCK_AND_WAIT: atomic unlock+wait... front end for the + * (poorly named) thread_sleep_msg function. + */ + +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) + +#define UVM_UNLOCK_AND_WAIT(event,lock,intr,msg, timo) \ + thread_sleep_msg(event,lock,intr,msg, timo) + +#else + +#define UVM_UNLOCK_AND_WAIT(event,lock,intr,msg, timo) \ + thread_sleep_msg(event,NULL,intr,msg, timo) + +#endif + +/* + * UVM_PAGE_OWN: track page ownership (only if UVM_PAGE_TRKOWN) + */ + +#if defined(UVM_PAGE_TRKOWN) + +#define UVM_PAGE_OWN(PG, TAG) uvm_page_own(PG, TAG) + +#else /* UVM_PAGE_TRKOWN */ + +#define UVM_PAGE_OWN(PG, TAG) /* nothing */ + +#endif /* UVM_PAGE_TRKOWN */ + +/* + * pull in inlines + */ + +#include <uvm/uvm_amap_i.h> +#include <uvm/uvm_fault_i.h> +#include <uvm/uvm_map_i.h> +#include <uvm/uvm_page_i.h> +#include <uvm/uvm_pager_i.h> + +#endif /* _UVM_UVM_H_ */ diff --git a/sys/uvm/uvm_amap.c b/sys/uvm/uvm_amap.c new file mode 100644 index 00000000000..8685f643392 --- /dev/null +++ b/sys/uvm/uvm_amap.c @@ -0,0 +1,1066 @@ +/* $NetBSD: uvm_amap.c,v 1.19 1999/01/28 14:46:27 chuck Exp $ */ + +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * uvm_amap.c: amap operations + */ + +/* + * this file contains functions that perform operations on amaps. see + * uvm_amap.h for a brief explanation of the role of amaps in uvm. + */ + +#undef UVM_AMAP_INLINE /* enable/disable amap inlines */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/pool.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#define UVM_AMAP_C /* ensure disabled inlines are in */ +#include <uvm/uvm.h> +#include <uvm/uvm_swap.h> + +/* + * pool for allocation of vm_map structures. note that the pool has + * its own simplelock for its protection. also note that in order to + * avoid an endless loop, the amap pool's allocator cannot allocate + * memory from an amap (it currently goes through the kernel uobj, so + * we are ok). + */ + +struct pool uvm_amap_pool; + +/* + * local functions + */ + +static struct vm_amap *amap_alloc1 __P((int, int, int)); + +#ifdef UVM_AMAP_PPREF +/* + * what is ppref? ppref is an _optional_ amap feature which is used + * to keep track of reference counts on a per-page basis. it is enabled + * when UVM_AMAP_PPREF is defined. + * + * when enabled, an array of ints is allocated for the pprefs. this + * array is allocated only when a partial reference is added to the + * map (either by unmapping part of the amap, or gaining a reference + * to only a part of an amap). if the malloc of the array fails + * (M_NOWAIT), then we set the array pointer to PPREF_NONE to indicate + * that we tried to do ppref's but couldn't alloc the array so just + * give up (after all, this is an optional feature!). + * + * the array is divided into page sized "chunks." for chunks of length 1, + * the chunk reference count plus one is stored in that chunk's slot. + * for chunks of length > 1 the first slot contains (the reference count + * plus one) * -1. [the negative value indicates that the length is + * greater than one.] the second slot of the chunk contains the length + * of the chunk. 
here is an example: + * + * actual REFS: 2 2 2 2 3 1 1 0 0 0 4 4 0 1 1 1 + * ppref: -3 4 x x 4 -2 2 -1 3 x -5 2 1 -2 3 x + * <----------><-><----><-------><----><-><-------> + * (x = don't care) + * + * this allows us to allow one int to contain the ref count for the whole + * chunk. note that the "plus one" part is needed because a reference + * count of zero is neither positive or negative (need a way to tell + * if we've got one zero or a bunch of them). + * + * here are some in-line functions to help us. + */ + +static __inline void pp_getreflen __P((int *, int, int *, int *)); +static __inline void pp_setreflen __P((int *, int, int, int)); + +/* + * pp_getreflen: get the reference and length for a specific offset + * + * => ppref's amap must be locked + */ +static __inline void +pp_getreflen(ppref, offset, refp, lenp) + int *ppref, offset, *refp, *lenp; +{ + + if (ppref[offset] > 0) { /* chunk size must be 1 */ + *refp = ppref[offset] - 1; /* don't forget to adjust */ + *lenp = 1; + } else { + *refp = (ppref[offset] * -1) - 1; + *lenp = ppref[offset+1]; + } +} + +/* + * pp_setreflen: set the reference and length for a specific offset + * + * => ppref's amap must be locked + */ +static __inline void +pp_setreflen(ppref, offset, ref, len) + int *ppref, offset, ref, len; +{ + if (len == 1) { + ppref[offset] = ref + 1; + } else { + ppref[offset] = (ref + 1) * -1; + ppref[offset+1] = len; + } +} +#endif + +/* + * amap_init: called at boot time to init global amap data structures + */ + +void +amap_init() + +{ + /* + * Initialize the vm_amap pool. + */ + pool_init(&uvm_amap_pool, sizeof(struct vm_amap), 0, 0, 0, + "amappl", 0, pool_page_alloc_nointr, pool_page_free_nointr, + M_UVMAMAP); +} + +/* + * amap_alloc1: internal function that allocates an amap, but does not + * init the overlay. + * + * => lock on returned amap is init'd + */ +static inline struct vm_amap * +amap_alloc1(slots, padslots, waitf) + int slots, padslots, waitf; +{ + struct vm_amap *amap; + int totalslots = slots + padslots; + + amap = pool_get(&uvm_amap_pool, (waitf == M_WAITOK) ? 
PR_WAITOK : 0); + if (amap == NULL) + return(NULL); + + simple_lock_init(&amap->am_l); + amap->am_ref = 1; + amap->am_flags = 0; +#ifdef UVM_AMAP_PPREF + amap->am_ppref = NULL; +#endif + amap->am_maxslot = totalslots; + amap->am_nslot = slots; + amap->am_nused = 0; + MALLOC(amap->am_slots, int *, totalslots * sizeof(int), M_UVMAMAP, waitf); + if (amap->am_slots) { + MALLOC(amap->am_bckptr, int *, totalslots * sizeof(int), M_UVMAMAP, waitf); + if (amap->am_bckptr) { + MALLOC(amap->am_anon, struct vm_anon **, + totalslots * sizeof(struct vm_anon *), M_UVMAMAP, waitf); + } + } + + if (amap->am_anon) + return(amap); + + if (amap->am_slots) { + FREE(amap->am_slots, M_UVMAMAP); + if (amap->am_bckptr) + FREE(amap->am_bckptr, M_UVMAMAP); + } + pool_put(&uvm_amap_pool, amap); + return (NULL); +} + +/* + * amap_alloc: allocate an amap to manage "sz" bytes of anonymous VM + * + * => caller should ensure sz is a multiple of PAGE_SIZE + * => reference count to new amap is set to one + * => new amap is returned unlocked + */ + +struct vm_amap * +amap_alloc(sz, padsz, waitf) + vaddr_t sz, padsz; + int waitf; +{ + struct vm_amap *amap; + int slots, padslots; + UVMHIST_FUNC("amap_alloc"); UVMHIST_CALLED(maphist); + + AMAP_B2SLOT(slots, sz); /* load slots */ + AMAP_B2SLOT(padslots, padsz); + + amap = amap_alloc1(slots, padslots, waitf); + if (amap) + bzero(amap->am_anon, (slots + padslots) * sizeof(struct vm_anon *)); + + UVMHIST_LOG(maphist,"<- done, amap = 0x%x, sz=%d", amap, sz, 0, 0); + return(amap); +} + + +/* + * amap_free: free an amap + * + * => the amap must be locked (mainly for simplelock accounting) + * => the amap should have a zero reference count and be empty + */ +void +amap_free(amap) + struct vm_amap *amap; +{ + UVMHIST_FUNC("amap_free"); UVMHIST_CALLED(maphist); + +#ifdef DIAGNOSTIC + if (amap->am_ref || amap->am_nused) + panic("amap_free"); +#endif + + FREE(amap->am_slots, M_UVMAMAP); + FREE(amap->am_bckptr, M_UVMAMAP); + FREE(amap->am_anon, M_UVMAMAP); +#ifdef UVM_AMAP_PPREF + if (amap->am_ppref && amap->am_ppref != PPREF_NONE) + FREE(amap->am_ppref, M_UVMAMAP); +#endif + amap_unlock(amap); /* mainly for lock debugging */ + pool_put(&uvm_amap_pool, amap); + + UVMHIST_LOG(maphist,"<- done, freed amap = 0x%x", amap, 0, 0, 0); +} + +/* + * amap_extend: extend the size of an amap (if needed) + * + * => called from uvm_map when we want to extend an amap to cover + * a new mapping (rather than allocate a new one) + * => amap should be unlocked (we will lock it) + * => to safely extend an amap it should have a reference count of + * one (thus it can't be shared) + * => XXXCDC: needs a waitflag or failure return value? + * => XXXCDC: support padding at this level? + */ +void +amap_extend(entry, addsize) + vm_map_entry_t entry; + vsize_t addsize; +{ + struct vm_amap *amap = entry->aref.ar_amap; + int slotoff = entry->aref.ar_pageoff; + int slotmapped, slotadd, slotneed; +#ifdef UVM_AMAP_PPREF + int *newppref, *oldppref; +#endif + u_int *newsl, *newbck, *oldsl, *oldbck; + struct vm_anon **newover, **oldover; + int slotadded; + UVMHIST_FUNC("amap_extend"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist, " (entry=0x%x, addsize=0x%x)", entry,addsize,0,0); + + /* + * first, determine how many slots we need in the amap. don't + * forget that ar_pageoff could be non-zero: this means that + * there are some unused slots before us in the amap. + */ + + amap_lock(amap); /* lock! 
*/ + + AMAP_B2SLOT(slotmapped, entry->end - entry->start); /* slots mapped */ + AMAP_B2SLOT(slotadd, addsize); /* slots to add */ + slotneed = slotoff + slotmapped + slotadd; + + /* + * case 1: we already have enough slots in the map and thus + * only need to bump the reference counts on the slots we are + * adding. + */ + + if (amap->am_nslot >= slotneed) { +#ifdef UVM_AMAP_PPREF + if (amap->am_ppref && amap->am_ppref != PPREF_NONE) { + amap_pp_adjref(amap, slotoff + slotmapped, addsize, 1); + } +#endif + amap_unlock(amap); + UVMHIST_LOG(maphist,"<- done (case 1), amap = 0x%x, sltneed=%d", + amap, slotneed, 0, 0); + return; /* done! */ + } + + /* + * case 2: we pre-allocated slots for use and we just need to + * bump nslot up to take account for these slots. + */ + if (amap->am_maxslot >= slotneed) { +#ifdef UVM_AMAP_PPREF + if (amap->am_ppref && amap->am_ppref != PPREF_NONE) { + if ((slotoff + slotmapped) < amap->am_nslot) + amap_pp_adjref(amap, slotoff + slotmapped, + (amap->am_nslot - (slotoff + slotmapped)) << + PAGE_SHIFT, 1); + pp_setreflen(amap->am_ppref, amap->am_nslot, 1, + slotneed - amap->am_nslot); + } +#endif + amap->am_nslot = slotneed; + amap_unlock(amap); + /* + * no need to zero am_anon since that was done at + * alloc time and we never shrink an allocation. + */ + UVMHIST_LOG(maphist,"<- done (case 2), amap = 0x%x, slotneed=%d", + amap, slotneed, 0, 0); + return; + } + + /* + * case 3: we need to malloc a new amap and copy all the amap + * data over from old amap to the new one. + * + * XXXCDC: could we take advantage of a kernel realloc()? + */ + + amap_unlock(amap); /* unlock in case we sleep in malloc */ +#ifdef UVM_AMAP_PPREF + newppref = NULL; + if (amap->am_ppref && amap->am_ppref != PPREF_NONE) { + MALLOC(newppref, int *, slotneed * sizeof(int), M_UVMAMAP, + M_NOWAIT); + if (newppref == NULL) { + /* give up if malloc fails */ + FREE(amap->am_ppref, M_UVMAMAP); + amap->am_ppref = PPREF_NONE; + } + } +#endif + MALLOC(newsl, int *, slotneed * sizeof(int), M_UVMAMAP, M_WAITOK); + MALLOC(newbck, int *, slotneed * sizeof(int), M_UVMAMAP, M_WAITOK); + MALLOC(newover, struct vm_anon **, slotneed * sizeof(struct vm_anon *), + M_UVMAMAP, M_WAITOK); + amap_lock(amap); /* re-lock! */ + +#ifdef DIAGNOSTIC + if (amap->am_maxslot >= slotneed) + panic("amap_extend: amap changed during malloc"); +#endif + + /* + * now copy everything over to new malloc'd areas... + */ + + slotadded = slotneed - amap->am_nslot; + + /* do am_slots */ + oldsl = amap->am_slots; + bcopy(oldsl, newsl, sizeof(int) * amap->am_nused); + amap->am_slots = newsl; + + /* do am_anon */ + oldover = amap->am_anon; + bcopy(oldover, newover, sizeof(struct vm_anon *) * amap->am_nslot); + bzero(newover + amap->am_nslot, sizeof(struct vm_anon *) * slotadded); + amap->am_anon = newover; + + /* do am_bckptr */ + oldbck = amap->am_bckptr; + bcopy(oldbck, newbck, sizeof(int) * amap->am_nslot); + bzero(newbck + amap->am_nslot, sizeof(int) * slotadded); /* XXX: needed? 
*/ + amap->am_bckptr = newbck; + +#ifdef UVM_AMAP_PPREF + /* do ppref */ + oldppref = amap->am_ppref; + if (newppref) { + bcopy(oldppref, newppref, sizeof(int) * amap->am_nslot); + bzero(newppref + amap->am_nslot, sizeof(int) * slotadded); + amap->am_ppref = newppref; + if ((slotoff + slotmapped) < amap->am_nslot) + amap_pp_adjref(amap, slotoff + slotmapped, + (amap->am_nslot - (slotoff + slotmapped)) << + PAGE_SHIFT, 1); + pp_setreflen(newppref, amap->am_nslot, 1, slotadded); + } +#endif + + /* update master values */ + amap->am_nslot = slotneed; + amap->am_maxslot = slotneed; + + /* unlock */ + amap_unlock(amap); + + /* and free */ + FREE(oldsl, M_UVMAMAP); + FREE(oldbck, M_UVMAMAP); + FREE(oldover, M_UVMAMAP); +#ifdef UVM_AMAP_PPREF + if (oldppref && oldppref != PPREF_NONE) + FREE(oldppref, M_UVMAMAP); +#endif + UVMHIST_LOG(maphist,"<- done (case 3), amap = 0x%x, slotneed=%d", + amap, slotneed, 0, 0); +} + +/* + * amap_share_protect: change protection of anons in a shared amap + * + * for shared amaps, given the current data structure layout, it is + * not possible for us to directly locate all maps referencing the + * shared anon (to change the protection). in order to protect data + * in shared maps we use pmap_page_protect(). [this is useful for IPC + * mechanisms like map entry passing that may want to write-protect + * all mappings of a shared amap.] we traverse am_anon or am_slots + * depending on the current state of the amap. + * + * => entry's map and amap must be locked by the caller + */ +void +amap_share_protect(entry, prot) + vm_map_entry_t entry; + vm_prot_t prot; +{ + struct vm_amap *amap = entry->aref.ar_amap; + int slots, lcv, slot, stop; + + AMAP_B2SLOT(slots, (entry->end - entry->start)); + stop = entry->aref.ar_pageoff + slots; + + if (slots < amap->am_nused) { + /* cheaper to traverse am_anon */ + for (lcv = entry->aref.ar_pageoff ; lcv < stop ; lcv++) { + if (amap->am_anon[lcv] == NULL) + continue; + if (amap->am_anon[lcv]->u.an_page != NULL) + pmap_page_protect( + PMAP_PGARG(amap->am_anon[lcv]->u.an_page), + prot); + } + return; + } + + /* cheaper to traverse am_slots */ + for (lcv = 0 ; lcv < amap->am_nused ; lcv++) { + slot = amap->am_slots[lcv]; + if (slot < entry->aref.ar_pageoff || slot >= stop) + continue; + if (amap->am_anon[slot]->u.an_page != NULL) + pmap_page_protect( + PMAP_PGARG(amap->am_anon[slot]->u.an_page), prot); + } + return; +} + +/* + * amap_wipeout: wipeout all anon's in an amap; then free the amap! + * + * => called from amap_unref when the final reference to an amap is + * discarded (i.e. when reference count == 1) + * => the amap should be locked (by the caller) + */ + +void +amap_wipeout(amap) + struct vm_amap *amap; +{ + int lcv, slot; + struct vm_anon *anon; + UVMHIST_FUNC("amap_wipeout"); UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist,"(amap=0x%x)", amap, 0,0,0); + + for (lcv = 0 ; lcv < amap->am_nused ; lcv++) { + int refs; + + slot = amap->am_slots[lcv]; + anon = amap->am_anon[slot]; + + if (anon == NULL || anon->an_ref == 0) + panic("amap_wipeout: corrupt amap"); + + simple_lock(&anon->an_lock); /* lock anon */ + + UVMHIST_LOG(maphist," processing anon 0x%x, ref=%d", anon, + anon->an_ref, 0, 0); + + refs = --anon->an_ref; + simple_unlock(&anon->an_lock); + if (refs == 0) { + /* + * we had the last reference to a vm_anon. free it. + */ + uvm_anfree(anon); + } + } + + /* + * now we free the map + */ + + amap->am_ref = 0; /* ... 
was one */ + amap->am_nused = 0; + amap_free(amap); /* will unlock and free amap */ + UVMHIST_LOG(maphist,"<- done!", 0,0,0,0); +} + +/* + * amap_copy: ensure that a map entry's "needs_copy" flag is false + * by copying the amap if necessary. + * + * => an entry with a null amap pointer will get a new (blank) one. + * => the map that the map entry belongs to must be locked by caller. + * => the amap currently attached to "entry" (if any) must be unlocked. + * => if canchunk is true, then we may clip the entry into a chunk + * => "startva" and "endva" are used only if canchunk is true. they are + * used to limit chunking (e.g. if you have a large space that you + * know you are going to need to allocate amaps for, there is no point + * in allowing that to be chunked) + */ + +void +amap_copy(map, entry, waitf, canchunk, startva, endva) + vm_map_t map; + vm_map_entry_t entry; + int waitf; + boolean_t canchunk; + vaddr_t startva, endva; +{ + struct vm_amap *amap, *srcamap; + int slots, lcv; + vaddr_t chunksize; + UVMHIST_FUNC("amap_copy"); UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist, " (map=%p, entry=%p, waitf=%d)", map, entry, waitf, 0); + + /* + * is there a map to copy? if not, create one from scratch. + */ + + if (entry->aref.ar_amap == NULL) { + + /* + * check to see if we have a large amap that we can + * chunk. we align startva/endva to chunk-sized + * boundaries and then clip to them. + */ + + if (canchunk && atop(entry->end - entry->start) >= + UVM_AMAP_LARGE) { + /* convert slots to bytes */ + chunksize = UVM_AMAP_CHUNK << PAGE_SHIFT; + startva = (startva / chunksize) * chunksize; + endva = roundup(endva, chunksize); + UVMHIST_LOG(maphist, " chunk amap ==> clip 0x%x->0x%x" + "to 0x%x->0x%x", entry->start, entry->end, startva, + endva); + UVM_MAP_CLIP_START(map, entry, startva); + /* watch out for endva wrap-around! */ + if (endva >= startva) + UVM_MAP_CLIP_END(map, entry, endva); + } + + UVMHIST_LOG(maphist, "<- done [creating new amap 0x%x->0x%x]", + entry->start, entry->end, 0, 0); + entry->aref.ar_pageoff = 0; + entry->aref.ar_amap = amap_alloc(entry->end - entry->start, 0, + waitf); + if (entry->aref.ar_amap != NULL) + entry->etype &= ~UVM_ET_NEEDSCOPY; + return; + } + + /* + * first check and see if we are the only map entry + * referencing the amap we currently have. if so, then we can + * just take it over rather than copying it. note that we are + * reading am_ref with the amap unlocked... the value can only + * be one if we have the only reference to the amap (via our + * locked map). if we are greater than one we fall through to + * the next case (where we double check the value). + */ + + if (entry->aref.ar_amap->am_ref == 1) { + entry->etype &= ~UVM_ET_NEEDSCOPY; + UVMHIST_LOG(maphist, "<- done [ref cnt = 1, took it over]", + 0, 0, 0, 0); + return; + } + + /* + * looks like we need to copy the map. + */ + + UVMHIST_LOG(maphist," amap=%p, ref=%d, must copy it", + entry->aref.ar_amap, entry->aref.ar_amap->am_ref, 0, 0); + AMAP_B2SLOT(slots, entry->end - entry->start); + amap = amap_alloc1(slots, 0, waitf); + if (amap == NULL) { + UVMHIST_LOG(maphist, " amap_alloc1 failed", 0,0,0,0); + return; + } + srcamap = entry->aref.ar_amap; + amap_lock(srcamap); + + /* + * need to double check reference count now that we've got the + * src amap locked down. the reference count could have + * changed while we were in malloc. if the reference count + * dropped down to one we take over the old map rather than + * copying the amap. + */ + + if (srcamap->am_ref == 1) { /* take it over? 
*/ + entry->etype &= ~UVM_ET_NEEDSCOPY; + amap->am_ref--; /* drop final reference to map */ + amap_free(amap); /* dispose of new (unused) amap */ + amap_unlock(srcamap); + return; + } + + /* + * we must copy it now. + */ + + UVMHIST_LOG(maphist, " copying amap now",0, 0, 0, 0); + for (lcv = 0 ; lcv < slots; lcv++) { + amap->am_anon[lcv] = + srcamap->am_anon[entry->aref.ar_pageoff + lcv]; + if (amap->am_anon[lcv] == NULL) + continue; + simple_lock(&amap->am_anon[lcv]->an_lock); + amap->am_anon[lcv]->an_ref++; + simple_unlock(&amap->am_anon[lcv]->an_lock); + amap->am_bckptr[lcv] = amap->am_nused; + amap->am_slots[amap->am_nused] = lcv; + amap->am_nused++; + } + + /* + * drop our reference to the old amap (srcamap) and unlock. + * we know that the reference count on srcamap is greater than + * one (we checked above), so there is no way we could drop + * the count to zero. [and no need to worry about freeing it] + */ + + srcamap->am_ref--; + if (srcamap->am_ref == 1 && (srcamap->am_flags & AMAP_SHARED) != 0) + srcamap->am_flags &= ~AMAP_SHARED; /* clear shared flag */ +#ifdef UVM_AMAP_PPREF + if (srcamap->am_ppref && srcamap->am_ppref != PPREF_NONE) { + amap_pp_adjref(srcamap, entry->aref.ar_pageoff, + entry->end - entry->start, -1); + } +#endif + + amap_unlock(srcamap); + + /* + * install new amap. + */ + + entry->aref.ar_pageoff = 0; + entry->aref.ar_amap = amap; + entry->etype &= ~UVM_ET_NEEDSCOPY; + + /* + * done! + */ + UVMHIST_LOG(maphist, "<- done",0, 0, 0, 0); +} + +/* + * amap_cow_now: resolve all copy-on-write faults in an amap now for fork(2) + * + * called during fork(2) when the parent process has a wired map + * entry. in that case we want to avoid write-protecting pages + * in the parent's map (e.g. like what you'd do for a COW page) + * so we resolve the COW here. + * + * => assume parent's entry was wired, thus all pages are resident. + * => assume pages that are loaned out (loan_count) are already mapped + * read-only in all maps, and thus no need for us to worry about them + * => assume both parent and child vm_map's are locked + * => caller passes child's map/entry in to us + * => if we run out of memory we will unlock the amap and sleep _with_ the + * parent and child vm_map's locked(!). we have to do this since + * we are in the middle of a fork(2) and we can't let the parent + * map change until we are done copying all the map entrys. + * => XXXCDC: out of memory should cause fork to fail, but there is + * currently no easy way to do this (needs fix) + * => page queues must be unlocked (we may lock them) + */ + +void +amap_cow_now(map, entry) + struct vm_map *map; + struct vm_map_entry *entry; +{ + struct vm_amap *amap = entry->aref.ar_amap; + int lcv, slot; + struct vm_anon *anon, *nanon; + struct vm_page *pg, *npg; + + /* + * note that if we unlock the amap then we must ReStart the "lcv" for + * loop because some other process could reorder the anon's in the + * am_anon[] array on us while the lock is dropped. + */ +ReStart: + amap_lock(amap); + + for (lcv = 0 ; lcv < amap->am_nused ; lcv++) { + + /* + * get the page + */ + + slot = amap->am_slots[lcv]; + anon = amap->am_anon[slot]; + simple_lock(&anon->an_lock); + pg = anon->u.an_page; + + /* + * page must be resident since parent is wired + */ + + if (pg == NULL) + panic("amap_cow_now: non-resident wired page in anon %p", + anon); + + /* + * if the anon ref count is one and the page is not loaned, + * then we are safe (the child has exclusive access to the + * page). 
if the page is loaned, then it must already be + * mapped read-only. + * + * we only need to get involved when these are not true. + * [note: if loan_count == 0, then the anon must own the page] + */ + + if (anon->an_ref > 1 && pg->loan_count == 0) { + + /* + * if the page is busy then we have to unlock, wait for + * it and then restart. + */ + if (pg->flags & PG_BUSY) { + pg->flags |= PG_WANTED; + amap_unlock(amap); + UVM_UNLOCK_AND_WAIT(pg, &anon->an_lock, FALSE, + "cownow", 0); + goto ReStart; + } + + /* + * ok, time to do a copy-on-write to a new anon + */ + nanon = uvm_analloc(); + if (nanon) + npg = uvm_pagealloc(NULL, 0, nanon); + else + npg = NULL; /* XXX: quiet gcc warning */ + + if (nanon == NULL || npg == NULL) { + /* out of memory */ + /* + * XXXCDC: we should cause fork to fail, but + * we can't ... + */ + if (nanon) + uvm_anfree(nanon); + simple_unlock(&anon->an_lock); + amap_unlock(amap); + uvm_wait("cownowpage"); + goto ReStart; + } + + /* + * got it... now we can copy the data and replace anon + * with our new one... + */ + uvm_pagecopy(pg, npg); /* old -> new */ + anon->an_ref--; /* can't drop to zero */ + amap->am_anon[slot] = nanon; /* replace */ + + /* + * drop PG_BUSY on new page ... since we have had it's + * owner locked the whole time it can't be + * PG_RELEASED | PG_WANTED. + */ + npg->flags &= ~(PG_BUSY|PG_FAKE); + UVM_PAGE_OWN(npg, NULL); + uvm_lock_pageq(); + uvm_pageactivate(npg); + uvm_unlock_pageq(); + } + + simple_unlock(&anon->an_lock); + /* + * done with this anon, next ...! + */ + + } /* end of 'for' loop */ + + return; +} + +/* + * amap_splitref: split a single reference into two seperate references + * + * => called from uvm_map's clip routines + * => origref's map should be locked + * => origref->ar_amap should be unlocked (we will lock) + */ +void +amap_splitref(origref, splitref, offset) + struct vm_aref *origref, *splitref; + vaddr_t offset; +{ + int leftslots; + UVMHIST_FUNC("amap_splitref"); UVMHIST_CALLED(maphist); + + AMAP_B2SLOT(leftslots, offset); + if (leftslots == 0) + panic("amap_splitref: split at zero offset"); + + /* + * lock the amap + */ + amap_lock(origref->ar_amap); + + /* + * now: amap is locked and we have a valid am_mapped array. + */ + + if (origref->ar_amap->am_nslot - origref->ar_pageoff - leftslots <= 0) + panic("amap_splitref: map size check failed"); + +#ifdef UVM_AMAP_PPREF + /* + * establish ppref before we add a duplicate reference to the amap + */ + if (origref->ar_amap->am_ppref == NULL) + amap_pp_establish(origref->ar_amap); +#endif + + splitref->ar_amap = origref->ar_amap; + splitref->ar_amap->am_ref++; /* not a share reference */ + splitref->ar_pageoff = origref->ar_pageoff + leftslots; + + amap_unlock(origref->ar_amap); +} + +#ifdef UVM_AMAP_PPREF + +/* + * amap_pp_establish: add a ppref array to an amap, if possible + * + * => amap locked by caller + */ +void +amap_pp_establish(amap) + struct vm_amap *amap; +{ + + MALLOC(amap->am_ppref, int *, sizeof(int) * amap->am_maxslot, + M_UVMAMAP, M_NOWAIT); + + /* + * if we fail then we just won't use ppref for this amap + */ + if (amap->am_ppref == NULL) { + amap->am_ppref = PPREF_NONE; /* not using it */ + return; + } + + /* + * init ppref + */ + bzero(amap->am_ppref, sizeof(int) * amap->am_maxslot); + pp_setreflen(amap->am_ppref, 0, amap->am_ref, amap->am_nslot); + return; +} + +/* + * amap_pp_adjref: adjust reference count to a part of an amap using the + * per-page reference count array. 
+ * + * => map and amap locked by caller + * => caller must check that ppref != PPREF_NONE before calling + */ +void +amap_pp_adjref(amap, curslot, bytelen, adjval) + struct vm_amap *amap; + int curslot; + vsize_t bytelen; + int adjval; +{ + int slots, stopslot, *ppref, lcv; + int ref, len; + + /* + * get init values + */ + + AMAP_B2SLOT(slots, bytelen); + stopslot = curslot + slots; + ppref = amap->am_ppref; + + /* + * first advance to the correct place in the ppref array, fragment + * if needed. + */ + + for (lcv = 0 ; lcv < curslot ; lcv += len) { + pp_getreflen(ppref, lcv, &ref, &len); + if (lcv + len > curslot) { /* goes past start? */ + pp_setreflen(ppref, lcv, ref, curslot - lcv); + pp_setreflen(ppref, curslot, ref, len - (curslot -lcv)); + len = curslot - lcv; /* new length of entry @ lcv */ + } + } + + /* + * now adjust reference counts in range (make sure we dont overshoot) + */ + + if (lcv != curslot) + panic("amap_pp_adjref: overshot target"); + + for (/* lcv already set */; lcv < stopslot ; lcv += len) { + pp_getreflen(ppref, lcv, &ref, &len); + if (lcv + len > stopslot) { /* goes past end? */ + pp_setreflen(ppref, lcv, ref, stopslot - lcv); + pp_setreflen(ppref, stopslot, ref, + len - (stopslot - lcv)); + len = stopslot - lcv; + } + ref = ref + adjval; /* ADJUST! */ + if (ref < 0) + panic("amap_pp_adjref: negative reference count"); + pp_setreflen(ppref, lcv, ref, len); + if (ref == 0) + amap_wiperange(amap, lcv, len); + } + +} + +/* + * amap_wiperange: wipe out a range of an amap + * [different from amap_wipeout because the amap is kept intact] + * + * => both map and amap must be locked by caller. + */ +void +amap_wiperange(amap, slotoff, slots) + struct vm_amap *amap; + int slotoff, slots; +{ + int byanon, lcv, stop, curslot, ptr; + struct vm_anon *anon; + UVMHIST_FUNC("amap_wiperange"); UVMHIST_CALLED(maphist); + + /* + * we can either traverse the amap by am_anon or by am_slots depending + * on which is cheaper. decide now. + */ + + if (slots < amap->am_nused) { + byanon = TRUE; + lcv = slotoff; + stop = slotoff + slots; + } else { + byanon = FALSE; + lcv = 0; + stop = amap->am_nused; + } + + /* + * ok, now do it! + */ + + for (; lcv < stop; lcv++) { + int refs; + + /* + * verify the anon is ok. + */ + if (byanon) { + if (amap->am_anon[lcv] == NULL) + continue; + curslot = lcv; + } else { + curslot = amap->am_slots[lcv]; + if (curslot < slotoff || curslot >= stop) + continue; + } + anon = amap->am_anon[curslot]; + + /* + * remove it from the amap + */ + amap->am_anon[curslot] = NULL; + ptr = amap->am_bckptr[curslot]; + if (ptr != (amap->am_nused - 1)) { + amap->am_slots[ptr] = + amap->am_slots[amap->am_nused - 1]; + amap->am_bckptr[amap->am_slots[ptr]] = + ptr; /* back ptr. */ + } + amap->am_nused--; + + /* + * drop anon reference count + */ + simple_lock(&anon->an_lock); + refs = --anon->an_ref; + simple_unlock(&anon->an_lock); + if (refs == 0) { + /* + * we just eliminated the last reference to an anon. + * free it. + */ + uvm_anfree(anon); + } + } +} + +#endif diff --git a/sys/uvm/uvm_amap.h b/sys/uvm/uvm_amap.h new file mode 100644 index 00000000000..8783790017f --- /dev/null +++ b/sys/uvm/uvm_amap.h @@ -0,0 +1,282 @@ +/* $NetBSD: uvm_amap.h,v 1.10 1999/01/28 14:46:27 chuck Exp $ */ + +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _UVM_UVM_AMAP_H_ +#define _UVM_UVM_AMAP_H_ + +/* + * uvm_amap.h: general amap interface and amap implementation-specific info + */ + +/* + * an amap structure contains pointers to a set of anons that are + * mapped together in virtual memory (an anon is a single page of + * anonymous virtual memory -- see uvm_anon.h). in uvm we hide the + * details of the implementation of amaps behind a general amap + * interface. this allows us to change the amap implementation + * without having to touch the rest of the code. this file is divided + * into two parts: the definition of the uvm amap interface and the + * amap implementation-specific definitions. + */ + +/* + * part 1: amap interface + */ + +/* + * forward definition of vm_amap structure. only amap + * implementation-specific code should directly access the fields of + * this structure. + */ + +struct vm_amap; + +/* + * handle inline options... we allow amap ops to be inline, but we also + * provide a hook to turn this off. macros can also be used. 
+ */ + +#ifdef UVM_AMAP_INLINE /* defined/undef'd in uvm_amap.c */ +#define AMAP_INLINE static __inline /* inline enabled */ +#else +#define AMAP_INLINE /* inline disabled */ +#endif /* UVM_AMAP_INLINE */ + + +/* + * prototypes for the amap interface + */ + +AMAP_INLINE +vaddr_t amap_add /* add an anon to an amap */ + __P((struct vm_aref *, vaddr_t, + struct vm_anon *, int)); +struct vm_amap *amap_alloc /* allocate a new amap */ + __P((vaddr_t, vaddr_t, int)); +void amap_copy /* clear amap needs-copy flag */ + __P((vm_map_t, vm_map_entry_t, int, + boolean_t, vaddr_t, vaddr_t)); +void amap_cow_now /* resolve all COW faults now */ + __P((vm_map_t, vm_map_entry_t)); +void amap_extend /* make amap larger */ + __P((vm_map_entry_t, vsize_t)); +int amap_flags /* get amap's flags */ + __P((struct vm_amap *)); +void amap_free /* free amap */ + __P((struct vm_amap *)); +void amap_init /* init amap module (at boot time) */ + __P((void)); +void amap_lock /* lock amap */ + __P((struct vm_amap *)); +AMAP_INLINE +struct vm_anon *amap_lookup /* lookup an anon @ offset in amap */ + __P((struct vm_aref *, vaddr_t)); +AMAP_INLINE +void amap_lookups /* lookup multiple anons */ + __P((struct vm_aref *, vaddr_t, + struct vm_anon **, int)); +AMAP_INLINE +void amap_ref /* add a reference to an amap */ + __P((vm_map_entry_t, int)); +int amap_refs /* get number of references of amap */ + __P((struct vm_amap *)); +void amap_share_protect /* protect pages in a shared amap */ + __P((vm_map_entry_t, vm_prot_t)); +void amap_splitref /* split reference to amap into two */ + __P((struct vm_aref *, struct vm_aref *, + vaddr_t)); +AMAP_INLINE +void amap_unadd /* remove an anon from an amap */ + __P((struct vm_amap *, vaddr_t)); +void amap_unlock /* unlock amap */ + __P((struct vm_amap *)); +AMAP_INLINE +void amap_unref /* drop reference to an amap */ + __P((vm_map_entry_t, int)); +void amap_wipeout /* remove all anons from amap */ + __P((struct vm_amap *)); + +/* + * amap flag values + */ + +#define AMAP_SHARED 0x1 /* amap is shared */ +#define AMAP_REFALL 0x2 /* amap_ref: reference entire amap */ + + +/**********************************************************************/ + +/* + * part 2: amap implementation-specific info + */ + +/* + * we currently provide an array-based amap implementation. in this + * implementation we provide the option of tracking split references + * so that we don't lose track of references during partial unmaps + * ... this is enabled with the "UVM_AMAP_PPREF" define. + */ + +#define UVM_AMAP_PPREF /* track partial references */ + +/* + * here is the definition of the vm_amap structure for this implementation. + */ + +struct vm_amap { + simple_lock_data_t am_l; /* simple lock [locks all vm_amap fields] */ + int am_ref; /* reference count */ + int am_flags; /* flags */ + int am_maxslot; /* max # of slots allocated */ + int am_nslot; /* # of slots currently in map ( <= maxslot) */ + int am_nused; /* # of slots currently in use */ + int *am_slots; /* contig array of active slots */ + int *am_bckptr; /* back pointer array to am_slots */ + struct vm_anon **am_anon; /* array of anonymous pages */ +#ifdef UVM_AMAP_PPREF + int *am_ppref; /* per page reference count (if !NULL) */ +#endif +}; + +/* + * note that am_slots, am_bckptr, and am_anon are arrays. this allows + * fast lookup of pages based on their virual address at the expense of + * some extra memory. 
in the future we should be smarter about memory + * usage and fall back to a non-array based implementation on systems + * that are short of memory (XXXCDC). + * + * the entries in the array are called slots... for example an amap that + * covers four pages of virtual memory is said to have four slots. here + * is an example of the array usage for a four slot amap. note that only + * slots one and three have anons assigned to them. "D/C" means that we + * "don't care" about the value. + * + * 0 1 2 3 + * am_anon: NULL, anon0, NULL, anon1 (actual pointers to anons) + * am_bckptr: D/C, 1, D/C, 0 (points to am_slots entry) + * + * am_slots: 3, 1, D/C, D/C (says slots 3 and 1 are in use) + * + * note that am_bckptr is D/C if the slot in am_anon is set to NULL. + * to find the entry in am_slots for an anon, look at am_bckptr[slot], + * thus the entry for slot 3 in am_slots[] is at am_slots[am_bckptr[3]]. + * in general, if am_anon[X] is non-NULL, then the following must be + * true: am_slots[am_bckptr[X]] == X + * + * note that am_slots is always contig-packed. + */ + +/* + * defines for handling of large sparce amaps: + * + * one of the problems of array-based amaps is that if you allocate a + * large sparcely-used area of virtual memory you end up allocating + * large arrays that, for the most part, don't get used. this is a + * problem for BSD in that the kernel likes to make these types of + * allocations to "reserve" memory for possible future use. + * + * for example, the kernel allocates (reserves) a large chunk of user + * VM for possible stack growth. most of the time only a page or two + * of this VM is actually used. since the stack is anonymous memory + * it makes sense for it to live in an amap, but if we allocated an + * amap for the entire stack range we could end up wasting a large + * amount of malloc'd KVM. + * + * for example, on the i386 at boot time we allocate two amaps for the stack + * of /sbin/init: + * 1. a 7680 slot amap at protection 0 (reserve space for stack) + * 2. a 512 slot amap at protection 7 (top of stack) + * + * most of the array allocated for the amaps for this is never used. + * the amap interface provides a way for us to avoid this problem by + * allowing amap_copy() to break larger amaps up into smaller sized + * chunks (controlled by the "canchunk" option). we use this feature + * to reduce our memory usage with the BSD stack management. if we + * are asked to create an amap with more than UVM_AMAP_LARGE slots in it, + * we attempt to break it up into a UVM_AMAP_CHUNK sized amap if the + * "canchunk" flag is set. + * + * so, in the i386 example, the 7680 slot area is never referenced so + * nothing gets allocated (amap_copy is never called because the protection + * is zero). the 512 slot area for the top of the stack is referenced. + * the chunking code breaks it up into 16 slot chunks (hopefully a single + * 16 slot chunk is enough to handle the whole stack). 
+ */ + +#define UVM_AMAP_LARGE 256 /* # of slots in "large" amap */ +#define UVM_AMAP_CHUNK 16 /* # of slots to chunk large amaps in */ + + +/* + * macros + */ + +/* AMAP_B2SLOT: convert byte offset to slot */ +#ifdef DIAGNOSTIC +#define AMAP_B2SLOT(S,B) { \ + if ((B) & (PAGE_SIZE - 1)) \ + panic("AMAP_B2SLOT: invalid byte count"); \ + (S) = (B) >> PAGE_SHIFT; \ +} +#else +#define AMAP_B2SLOT(S,B) (S) = (B) >> PAGE_SHIFT +#endif + +/* + * lock/unlock/refs/flags macros + */ + +#define amap_flags(AMAP) ((AMAP)->am_flags) +#define amap_lock(AMAP) simple_lock(&(AMAP)->am_l) +#define amap_refs(AMAP) ((AMAP)->am_ref) +#define amap_unlock(AMAP) simple_unlock(&(AMAP)->am_l) + +/* + * if we enable PPREF, then we have a couple of extra functions that + * we need to prototype here... + */ + +#ifdef UVM_AMAP_PPREF + +#define PPREF_NONE ((int *) -1) /* not using ppref */ + +void amap_pp_adjref /* adjust references */ + __P((struct vm_amap *, int, vsize_t, int)); +void amap_pp_establish /* establish ppref */ + __P((struct vm_amap *)); +void amap_wiperange /* wipe part of an amap */ + __P((struct vm_amap *, int, int)); +#endif /* UVM_AMAP_PPREF */ + +#endif /* _UVM_UVM_AMAP_H_ */ diff --git a/sys/uvm/uvm_amap_i.h b/sys/uvm/uvm_amap_i.h new file mode 100644 index 00000000000..d5bbe11c054 --- /dev/null +++ b/sys/uvm/uvm_amap_i.h @@ -0,0 +1,291 @@ +/* $NetBSD: uvm_amap_i.h,v 1.11 1999/01/28 14:46:27 chuck Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * from: Id: uvm_amap_i.h,v 1.1.2.4 1998/01/05 18:12:57 chuck Exp + */ + +#ifndef _UVM_UVM_AMAP_I_H_ +#define _UVM_UVM_AMAP_I_H_ + +/* + * uvm_amap_i.h + */ + +/* + * if inlines are enabled always pull in these functions, otherwise + * pull them in only once (when we are compiling uvm_amap.c). + */ + +#if defined(UVM_AMAP_INLINE) || defined(UVM_AMAP_C) + +/* + * amap_lookup: look up a page in an amap + * + * => amap should be locked by caller. + */ +AMAP_INLINE struct vm_anon * +amap_lookup(aref, offset) + struct vm_aref *aref; + vaddr_t offset; +{ + int slot; + struct vm_amap *amap = aref->ar_amap; + UVMHIST_FUNC("amap_lookup"); UVMHIST_CALLED(maphist); + + AMAP_B2SLOT(slot, offset); + slot += aref->ar_pageoff; + + if (slot >= amap->am_nslot) + panic("amap_lookup: offset out of range"); + + UVMHIST_LOG(maphist, "<- done (amap=0x%x, offset=0x%x, result=0x%x)", + amap, offset, amap->am_anon[slot], 0); + return(amap->am_anon[slot]); +} + +/* + * amap_lookups: look up a range of pages in an amap + * + * => amap should be locked by caller. + * => XXXCDC: this interface is biased toward array-based amaps. fix. + */ +AMAP_INLINE void +amap_lookups(aref, offset, anons, npages) + struct vm_aref *aref; + vaddr_t offset; + struct vm_anon **anons; + int npages; +{ + int slot; + struct vm_amap *amap = aref->ar_amap; + UVMHIST_FUNC("amap_lookups"); UVMHIST_CALLED(maphist); + + AMAP_B2SLOT(slot, offset); + slot += aref->ar_pageoff; + + UVMHIST_LOG(maphist, " slot=%d, npages=%d, nslot=%d", slot, npages, + amap->am_nslot, 0); + + if ((slot + (npages - 1)) >= amap->am_nslot) + panic("amap_lookups: offset out of range"); + + bcopy(&amap->am_anon[slot], anons, npages * sizeof(struct vm_anon *)); + + UVMHIST_LOG(maphist, "<- done", 0, 0, 0, 0); + return; +} + +/* + * amap_add: add (or replace) a page to an amap + * + * => caller must lock amap. + * => if (replace) caller must lock anon because we might have to call + * pmap_page_protect on the anon's page. + * => returns an "offset" which is meaningful to amap_unadd(). + */ +AMAP_INLINE vaddr_t +amap_add(aref, offset, anon, replace) + struct vm_aref *aref; + vaddr_t offset; + struct vm_anon *anon; + int replace; +{ + int slot; + struct vm_amap *amap = aref->ar_amap; + UVMHIST_FUNC("amap_add"); UVMHIST_CALLED(maphist); + + AMAP_B2SLOT(slot, offset); + slot += aref->ar_pageoff; + + if (slot >= amap->am_nslot) + panic("amap_add: offset out of range"); + + if (replace) { + + if (amap->am_anon[slot] == NULL) + panic("amap_add: replacing null anon"); + if (amap->am_anon[slot]->u.an_page != NULL && + (amap->am_flags & AMAP_SHARED) != 0) { + pmap_page_protect( + PMAP_PGARG(amap->am_anon[slot]->u.an_page), + VM_PROT_NONE); + /* + * XXX: suppose page is supposed to be wired somewhere? + */ + } + } else { /* !replace */ + if (amap->am_anon[slot] != NULL) + panic("amap_add: slot in use"); + + amap->am_bckptr[slot] = amap->am_nused; + amap->am_slots[amap->am_nused] = slot; + amap->am_nused++; + } + amap->am_anon[slot] = anon; + UVMHIST_LOG(maphist, + "<- done (amap=0x%x, offset=0x%x, anon=0x%x, rep=%d)", + amap, offset, anon, replace); + + return(slot); +} + +/* + * amap_unadd: remove a page from an amap, given we know the slot #. 
+ * + * => caller must lock amap + */ +AMAP_INLINE void +amap_unadd(amap, slot) + struct vm_amap *amap; + vaddr_t slot; +{ + int ptr; + UVMHIST_FUNC("amap_unadd"); UVMHIST_CALLED(maphist); + + if (slot >= amap->am_nslot) + panic("amap_add: offset out of range"); + + if (amap->am_anon[slot] == NULL) + panic("amap_unadd: nothing there"); + + amap->am_anon[slot] = NULL; + ptr = amap->am_bckptr[slot]; + + if (ptr != (amap->am_nused - 1)) { /* swap to keep slots contig? */ + amap->am_slots[ptr] = amap->am_slots[amap->am_nused - 1]; + amap->am_bckptr[amap->am_slots[ptr]] = ptr; /* back link */ + } + amap->am_nused--; + UVMHIST_LOG(maphist, "<- done (amap=0x%x, slot=0x%x)", amap, slot,0, 0); +} + +/* + * amap_ref: gain a reference to an amap + * + * => amap must not be locked (we will lock) + * => called at fork time to gain the child's reference + */ +AMAP_INLINE void +amap_ref(entry, flags) + vm_map_entry_t entry; + int flags; +{ + struct vm_amap *amap = entry->aref.ar_amap; + UVMHIST_FUNC("amap_ref"); UVMHIST_CALLED(maphist); + + amap_lock(amap); + amap->am_ref++; + if (flags & AMAP_SHARED) + amap->am_flags |= AMAP_SHARED; +#ifdef UVM_AMAP_PPREF + if (amap->am_ppref == NULL && (flags & AMAP_REFALL) == 0 && + (entry->start - entry->end) >> PAGE_SHIFT != amap->am_nslot) + amap_pp_establish(amap); + if (amap->am_ppref && amap->am_ppref != PPREF_NONE) { + if (flags & AMAP_REFALL) + amap_pp_adjref(amap, 0, amap->am_nslot << PAGE_SHIFT, 1); + else + amap_pp_adjref(amap, entry->aref.ar_pageoff, + entry->end - entry->start, 1); + } +#endif + amap_unlock(amap); + UVMHIST_LOG(maphist,"<- done! amap=0x%x", amap, 0, 0, 0); +} + +/* + * amap_unref: remove a reference to an amap + * + * => caller must remove all pmap-level references to this amap before + * dropping the reference + * => called from uvm_unmap_detach [only] ... note that entry is no + * longer part of a map and thus has no need for locking + * => amap must be unlocked (we will lock it). + */ +AMAP_INLINE void +amap_unref(entry, all) + vm_map_entry_t entry; + int all; +{ + struct vm_amap *amap = entry->aref.ar_amap; + UVMHIST_FUNC("amap_unref"); UVMHIST_CALLED(maphist); + + /* + * lock it + */ + amap_lock(amap); + + UVMHIST_LOG(maphist,"(entry=0x%x) amap=0x%x refs=%d, nused=%d", + entry, amap, amap->am_ref, amap->am_nused); + + /* + * if we are the last reference, free the amap and return. 
+ */ + + if (amap->am_ref == 1) { + amap_wipeout(amap); /* drops final ref and frees */ + UVMHIST_LOG(maphist,"<- done (was last ref)!", 0, 0, 0, 0); + return; /* no need to unlock */ + } + + /* + * otherwise just drop the reference count(s) + */ + + amap->am_ref--; + if (amap->am_ref == 1 && (amap->am_flags & AMAP_SHARED) != 0) + amap->am_flags &= ~AMAP_SHARED; /* clear shared flag */ +#ifdef UVM_AMAP_PPREF + if (amap->am_ppref == NULL && all == 0 && + (entry->start - entry->end) >> PAGE_SHIFT != amap->am_nslot) + amap_pp_establish(amap); + if (amap->am_ppref && amap->am_ppref != PPREF_NONE) { + if (all) + amap_pp_adjref(amap, 0, amap->am_nslot << PAGE_SHIFT, -1); + else + amap_pp_adjref(amap, entry->aref.ar_pageoff, + entry->end - entry->start, -1); + } +#endif + amap_unlock(amap); + + UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0); +} + +#endif /* defined(UVM_AMAP_INLINE) || defined(UVM_AMAP_C) */ + +#endif /* _UVM_UVM_AMAP_I_H_ */ diff --git a/sys/uvm/uvm_anon.c b/sys/uvm/uvm_anon.c new file mode 100644 index 00000000000..214e12df701 --- /dev/null +++ b/sys/uvm/uvm_anon.c @@ -0,0 +1,345 @@ +/* $NetBSD: uvm_anon.c,v 1.1 1999/01/24 23:53:15 chuck Exp $ */ + +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * uvm_anon.c: uvm anon ops + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/pool.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#include <uvm/uvm.h> +#include <uvm/uvm_swap.h> + +/* + * allocate anons + */ +void +uvm_anon_init() +{ + struct vm_anon *anon; + int nanon = uvmexp.free - (uvmexp.free / 16); /* XXXCDC ??? */ + int lcv; + + /* + * Allocate the initial anons. 
+ */ + anon = (struct vm_anon *)uvm_km_alloc(kernel_map, + sizeof(*anon) * nanon); + if (anon == NULL) { + printf("uvm_anon_init: can not allocate %d anons\n", nanon); + panic("uvm_anon_init"); + } + + bzero(anon, sizeof(*anon) * nanon); + uvm.afree = NULL; + uvmexp.nanon = uvmexp.nfreeanon = nanon; + for (lcv = 0 ; lcv < nanon ; lcv++) { + anon[lcv].u.an_nxt = uvm.afree; + uvm.afree = &anon[lcv]; + } + simple_lock_init(&uvm.afreelock); +} + +/* + * add some more anons to the free pool. called when we add + * more swap space. + */ +void +uvm_anon_add(pages) + int pages; +{ + struct vm_anon *anon; + int lcv; + + anon = (struct vm_anon *)uvm_km_alloc(kernel_map, + sizeof(*anon) * pages); + + /* XXX Should wait for VM to free up. */ + if (anon == NULL) { + printf("uvm_anon_add: can not allocate %d anons\n", pages); + panic("uvm_anon_add"); + } + + simple_lock(&uvm.afreelock); + bzero(anon, sizeof(*anon) * pages); + uvmexp.nanon += pages; + uvmexp.nfreeanon += pages; + for (lcv = 0; lcv < pages; lcv++) { + simple_lock_init(&anon->an_lock); + anon[lcv].u.an_nxt = uvm.afree; + uvm.afree = &anon[lcv]; + } + simple_unlock(&uvm.afreelock); +} + +/* + * allocate an anon + */ +struct vm_anon * +uvm_analloc() +{ + struct vm_anon *a; + + simple_lock(&uvm.afreelock); + a = uvm.afree; + if (a) { + uvm.afree = a->u.an_nxt; + uvmexp.nfreeanon--; + a->an_ref = 1; + a->an_swslot = 0; + a->u.an_page = NULL; /* so we can free quickly */ + } + simple_unlock(&uvm.afreelock); + return(a); +} + +/* + * uvm_anfree: free a single anon structure + * + * => caller must remove anon from its amap before calling (if it was in + * an amap). + * => anon must be unlocked and have a zero reference count. + * => we may lock the pageq's. + */ +void +uvm_anfree(anon) + struct vm_anon *anon; +{ + struct vm_page *pg; + UVMHIST_FUNC("uvm_anfree"); UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist,"(anon=0x%x)", anon, 0,0,0); + + /* + * get page + */ + + pg = anon->u.an_page; + + /* + * if there is a resident page and it is loaned, then anon may not + * own it. call out to uvm_anon_lockpage() to ensure the real owner + * of the page has been identified and locked. + */ + + if (pg && pg->loan_count) + pg = uvm_anon_lockloanpg(anon); + + /* + * if we have a resident page, we must dispose of it before freeing + * the anon. + */ + + if (pg) { + + /* + * if the page is owned by a uobject (now locked), then we must + * kill the loan on the page rather than free it. + */ + + if (pg->uobject) { + + /* kill loan */ + uvm_lock_pageq(); +#ifdef DIAGNOSTIC + if (pg->loan_count < 1) + panic("uvm_anfree: obj owned page " + "with no loan count"); +#endif + pg->loan_count--; + pg->uanon = NULL; + uvm_unlock_pageq(); + simple_unlock(&pg->uobject->vmobjlock); + + } else { + + /* + * page has no uobject, so we must be the owner of it. + * + * if page is busy then we just mark it as released + * (who ever has it busy must check for this when they + * wake up). if the page is not busy then we can + * free it now. 
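/*
 * The anon allocator in this file (uvm_analloc/uvm_anfree) is a LIFO
 * free list threaded through a union: a free anon reuses the space of
 * its page pointer as the "next free" link.  A standalone sketch of
 * that layout and the push/pop operations, with the kernel's afreelock
 * locking omitted for brevity; the names here are illustrative:
 */
#include <stddef.h>
#include <stdio.h>

struct anon {
    int ref;
    union {
        struct anon *nxt;     /* valid while on the free list */
        void        *page;    /* valid while allocated */
    } u;
};

static struct anon pool[4];
static struct anon *afree;    /* head of the free list */

static void
anon_init(void)
{
    size_t i;

    for (i = 0; i < sizeof(pool) / sizeof(pool[0]); i++) {
        pool[i].u.nxt = afree;    /* push onto the free list */
        afree = &pool[i];
    }
}

static struct anon *
anon_alloc(void)
{
    struct anon *a = afree;

    if (a != NULL) {
        afree = a->u.nxt;         /* pop */
        a->ref = 1;
        a->u.page = NULL;         /* link field reused as the page slot */
    }
    return a;
}

static void
anon_free(struct anon *a)
{
    a->u.nxt = afree;             /* push back */
    afree = a;
}

int
main(void)
{
    struct anon *a;

    anon_init();
    a = anon_alloc();
    printf("allocated anon with ref %d\n", a->ref);
    anon_free(a);
    return 0;
}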
+ */ + + if ((pg->flags & PG_BUSY) != 0) { + /* tell them to dump it when done */ + pg->flags |= PG_RELEASED; + simple_unlock(&anon->an_lock); + UVMHIST_LOG(maphist, + " anon 0x%x, page 0x%x: BUSY (released!)", + anon, pg, 0, 0); + return; + } + + pmap_page_protect(PMAP_PGARG(pg), VM_PROT_NONE); + uvm_lock_pageq(); /* lock out pagedaemon */ + uvm_pagefree(pg); /* bye bye */ + uvm_unlock_pageq(); /* free the daemon */ + + UVMHIST_LOG(maphist," anon 0x%x, page 0x%x: freed now!", + anon, pg, 0, 0); + } + } + + /* + * are we using any backing store resources? if so, free them. + */ + if (anon->an_swslot) { + /* + * on backing store: no I/O in progress. sole amap reference + * is ours and we've got it locked down. thus we can free, + * and be done. + */ + UVMHIST_LOG(maphist," freeing anon 0x%x, paged to swslot 0x%x", + anon, anon->an_swslot, 0, 0); + uvm_swap_free(anon->an_swslot, 1); + anon->an_swslot = 0; + } + + /* + * now that we've stripped the data areas from the anon, free the anon + * itself! + */ + simple_lock(&uvm.afreelock); + anon->u.an_nxt = uvm.afree; + uvm.afree = anon; + uvmexp.nfreeanon++; + simple_unlock(&uvm.afreelock); + UVMHIST_LOG(maphist,"<- done!",0,0,0,0); +} + +/* + * uvm_anon_lockloanpg: given a locked anon, lock its resident page + * + * => anon is locked by caller + * => on return: anon is locked + * if there is a resident page: + * if it has a uobject, it is locked by us + * if it is ownerless, we take over as owner + * we return the resident page (it can change during + * this function) + * => note that the only time an anon has an ownerless resident page + * is if the page was loaned from a uvm_object and the uvm_object + * disowned it + * => this only needs to be called when you want to do an operation + * on an anon's resident page and that page has a non-zero loan + * count. + */ +struct vm_page * +uvm_anon_lockloanpg(anon) + struct vm_anon *anon; +{ + struct vm_page *pg; + boolean_t locked = FALSE; + + /* + * loop while we have a resident page that has a non-zero loan count. + * if we successfully get our lock, we will "break" the loop. + * note that the test for pg->loan_count is not protected -- this + * may produce false positive results. note that a false positive + * result may cause us to do more work than we need to, but it will + * not produce an incorrect result. + */ + + while (((pg = anon->u.an_page) != NULL) && pg->loan_count != 0) { + + /* + * quickly check to see if the page has an object before + * bothering to lock the page queues. this may also produce + * a false positive result, but that's ok because we do a real + * check after that. + * + * XXX: quick check -- worth it? need volatile? + */ + + if (pg->uobject) { + + uvm_lock_pageq(); + if (pg->uobject) { /* the "real" check */ + locked = + simple_lock_try(&pg->uobject->vmobjlock); + } else { + /* object disowned before we got PQ lock */ + locked = TRUE; + } + uvm_unlock_pageq(); + + /* + * if we didn't get a lock (try lock failed), then we + * toggle our anon lock and try again + */ + + if (!locked) { + simple_unlock(&anon->an_lock); + /* + * someone locking the object has a chance to + * lock us right now + */ + simple_lock(&anon->an_lock); + continue; /* start over */ + } + } + + /* + * if page is un-owned [i.e. the object dropped its ownership], + * then we can take over as owner! + */ + + if (pg->uobject == NULL && (pg->pqflags & PQ_ANON) == 0) { + uvm_lock_pageq(); + pg->pqflags |= PQ_ANON; /* take ownership... */ + pg->loan_count--; /* ... 
and drop our loan */ + uvm_unlock_pageq(); + } + + /* + * we did it! break the loop + */ + break; + } + + /* + * done! + */ + + return(pg); +} diff --git a/sys/uvm/uvm_anon.h b/sys/uvm/uvm_anon.h new file mode 100644 index 00000000000..f52f6f646f4 --- /dev/null +++ b/sys/uvm/uvm_anon.h @@ -0,0 +1,105 @@ +/* $NetBSD: uvm_anon.h,v 1.9 1999/01/24 23:53:15 chuck Exp $ */ + +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _UVM_UVM_ANON_H_ +#define _UVM_UVM_ANON_H_ + +/* + * uvm_anon.h + */ + +/* + * anonymous memory management + * + * anonymous virtual memory is short term virtual memory that goes away + * when the processes referencing it go away. an anonymous page of + * virtual memory is described by the following data structure: + */ + +struct vm_anon { + int an_ref; /* reference count [an_lock] */ + simple_lock_data_t an_lock; /* lock for an_ref */ + union { + struct vm_anon *an_nxt; /* if on free list [afreelock] */ + struct vm_page *an_page;/* if in RAM [an_lock] */ + } u; + int an_swslot; /* drum swap slot # (if != 0) + [an_lock. also, it is ok to read + an_swslot if we hold an_page PG_BUSY] */ +}; + +/* + * a pool of vm_anon data structures is allocated and put on a global + * free list at boot time. vm_anon's on the free list use "an_nxt" as + * a pointer to the next item on the free list. for active vm_anon's + * the data can be in one of the following state: [1] in a vm_page + * with no backing store allocated yet, [2] in a vm_page with backing + * store allocated, or [3] paged out to backing store (no vm_page). + * + * for pageout in case [2]: if the page has been modified then we must + * flush it out to backing store, otherwise we can just dump the + * vm_page. + */ + +/* + * anons are grouped together in anonymous memory maps, or amaps. + * amaps are defined in uvm_amap.h. 
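/*
 * uvm_anon_lockloanpg in uvm_anon.c above avoids deadlock by never
 * blocking on the object lock while the anon lock is held: it try-locks
 * the object and, on failure, briefly drops and re-takes the anon lock
 * so that whoever holds the object lock gets a chance to lock the anon
 * in the usual order.  A pthread-based sketch of that back-off loop;
 * the two mutexes are illustrative stand-ins for the kernel locks:
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t anon_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t obj_lock  = PTHREAD_MUTEX_INITIALIZER;

/* call with anon_lock held; returns with anon_lock and obj_lock held */
static void
lock_obj_with_backoff(void)
{
    for (;;) {
        if (pthread_mutex_trylock(&obj_lock) == 0)
            return;                        /* got it, done */
        /* contended: back off so the other lock holder can progress */
        pthread_mutex_unlock(&anon_lock);
        pthread_mutex_lock(&anon_lock);
    }
}

int
main(void)
{
    pthread_mutex_lock(&anon_lock);
    lock_obj_with_backoff();               /* uncontended here, succeeds */
    printf("holding both locks\n");
    pthread_mutex_unlock(&obj_lock);
    pthread_mutex_unlock(&anon_lock);
    return 0;
}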
+ */ + +/* + * processes reference anonymous virtual memory maps with an anonymous + * reference structure: + */ + +struct vm_aref { + int ar_pageoff; /* page offset into amap we start */ + struct vm_amap *ar_amap; /* pointer to amap */ +}; + +/* + * the offset field indicates which part of the amap we are referencing. + * locked by vm_map lock. + */ + +/* + * prototypes + */ + +struct vm_anon *uvm_analloc __P((void)); +void uvm_anfree __P((struct vm_anon *)); +void uvm_anon_init __P((void)); +void uvm_anon_add __P((int)); +struct vm_page *uvm_anon_lockloanpg __P((struct vm_anon *)); + +#endif /* _UVM_UVM_ANON_H_ */ diff --git a/sys/uvm/uvm_aobj.c b/sys/uvm/uvm_aobj.c new file mode 100644 index 00000000000..8e0d3fc22ef --- /dev/null +++ b/sys/uvm/uvm_aobj.c @@ -0,0 +1,1090 @@ +/* $NetBSD: uvm_aobj.c,v 1.15 1998/10/18 23:49:59 chs Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1998 Chuck Silvers, Charles D. Cranor and + * Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_aobj.c,v 1.1.2.5 1998/02/06 05:14:38 chs Exp + */ +/* + * uvm_aobj.c: anonymous memory uvm_object pager + * + * author: Chuck Silvers <chuq@chuq.com> + * started: Jan-1998 + * + * - design mostly from Chuck Cranor + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/pool.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#include <uvm/uvm.h> + +/* + * an aobj manages anonymous-memory backed uvm_objects. in addition + * to keeping the list of resident pages, it also keeps a list of + * allocated swap blocks. depending on the size of the aobj this list + * of allocated swap blocks is either stored in an array (small objects) + * or in a hash table (large objects). 
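/*
 * The hash-table case works in clusters of pages: the low bits of a
 * page index select a slot within a cluster, and the high bits form the
 * cluster "tag", which is also what gets hashed to pick a bucket.  A
 * standalone sketch of the arithmetic behind the UAO_SWHASH_* macros
 * defined below, using the same cluster shift of 4; the page index and
 * bucket mask values here are only examples:
 */
#include <stdio.h>

#define CLUSTER_SHIFT 4
#define CLUSTER_SIZE  (1 << CLUSTER_SHIFT)    /* 16 pages per cluster */

int
main(void)
{
    unsigned long pageidx  = 0x123;           /* example page index */
    unsigned long hashmask = 0x3f;            /* example: 64 buckets */

    unsigned long tag    = pageidx >> CLUSTER_SHIFT;         /* cluster tag */
    unsigned long slot   = pageidx & (CLUSTER_SIZE - 1);     /* index in cluster */
    unsigned long bucket = (pageidx >> CLUSTER_SHIFT) & hashmask;

    printf("pageidx 0x%lx: tag 0x%lx, cluster slot %lu, bucket %lu\n",
        pageidx, tag, slot, bucket);
    return 0;
}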
+ */ + +/* + * local structures + */ + +/* + * for hash tables, we break the address space of the aobj into blocks + * of UAO_SWHASH_CLUSTER_SIZE pages. we require the cluster size to + * be a power of two. + */ + +#define UAO_SWHASH_CLUSTER_SHIFT 4 +#define UAO_SWHASH_CLUSTER_SIZE (1 << UAO_SWHASH_CLUSTER_SHIFT) + +/* get the "tag" for this page index */ +#define UAO_SWHASH_ELT_TAG(PAGEIDX) \ + ((PAGEIDX) >> UAO_SWHASH_CLUSTER_SHIFT) + +/* given an ELT and a page index, find the swap slot */ +#define UAO_SWHASH_ELT_PAGESLOT(ELT, PAGEIDX) \ + ((ELT)->slots[(PAGEIDX) & (UAO_SWHASH_CLUSTER_SIZE - 1)]) + +/* given an ELT, return its pageidx base */ +#define UAO_SWHASH_ELT_PAGEIDX_BASE(ELT) \ + ((ELT)->tag << UAO_SWHASH_CLUSTER_SHIFT) + +/* + * the swhash hash function + */ +#define UAO_SWHASH_HASH(AOBJ, PAGEIDX) \ + (&(AOBJ)->u_swhash[(((PAGEIDX) >> UAO_SWHASH_CLUSTER_SHIFT) \ + & (AOBJ)->u_swhashmask)]) + +/* + * the swhash threshhold determines if we will use an array or a + * hash table to store the list of allocated swap blocks. + */ + +#define UAO_SWHASH_THRESHOLD (UAO_SWHASH_CLUSTER_SIZE * 4) +#define UAO_USES_SWHASH(AOBJ) \ + ((AOBJ)->u_pages > UAO_SWHASH_THRESHOLD) /* use hash? */ + +/* + * the number of buckets in a swhash, with an upper bound + */ +#define UAO_SWHASH_MAXBUCKETS 256 +#define UAO_SWHASH_BUCKETS(AOBJ) \ + (min((AOBJ)->u_pages >> UAO_SWHASH_CLUSTER_SHIFT, \ + UAO_SWHASH_MAXBUCKETS)) + + +/* + * uao_swhash_elt: when a hash table is being used, this structure defines + * the format of an entry in the bucket list. + */ + +struct uao_swhash_elt { + LIST_ENTRY(uao_swhash_elt) list; /* the hash list */ + vaddr_t tag; /* our 'tag' */ + int count; /* our number of active slots */ + int slots[UAO_SWHASH_CLUSTER_SIZE]; /* the slots */ +}; + +/* + * uao_swhash: the swap hash table structure + */ + +LIST_HEAD(uao_swhash, uao_swhash_elt); + +/* + * uao_swhash_elt_pool: pool of uao_swhash_elt structures + */ + +struct pool uao_swhash_elt_pool; + +/* + * uvm_aobj: the actual anon-backed uvm_object + * + * => the uvm_object is at the top of the structure, this allows + * (struct uvm_device *) == (struct uvm_object *) + * => only one of u_swslots and u_swhash is used in any given aobj + */ + +struct uvm_aobj { + struct uvm_object u_obj; /* has: lock, pgops, memq, #pages, #refs */ + int u_pages; /* number of pages in entire object */ + int u_flags; /* the flags (see uvm_aobj.h) */ + int *u_swslots; /* array of offset->swapslot mappings */ + /* + * hashtable of offset->swapslot mappings + * (u_swhash is an array of bucket heads) + */ + struct uao_swhash *u_swhash; + u_long u_swhashmask; /* mask for hashtable */ + LIST_ENTRY(uvm_aobj) u_list; /* global list of aobjs */ +}; + +/* + * uvm_aobj_pool: pool of uvm_aobj structures + */ + +struct pool uvm_aobj_pool; + +/* + * local functions + */ + +static void uao_init __P((void)); +static struct uao_swhash_elt *uao_find_swhash_elt __P((struct uvm_aobj *, + int, boolean_t)); +static int uao_find_swslot __P((struct uvm_aobj *, + int)); +static boolean_t uao_flush __P((struct uvm_object *, + vaddr_t, vaddr_t, + int)); +static void uao_free __P((struct uvm_aobj *)); +static int uao_get __P((struct uvm_object *, vaddr_t, + vm_page_t *, int *, int, + vm_prot_t, int, int)); +static boolean_t uao_releasepg __P((struct vm_page *, + struct vm_page **)); + + + +/* + * aobj_pager + * + * note that some functions (e.g. 
put) are handled elsewhere + */ + +struct uvm_pagerops aobj_pager = { + uao_init, /* init */ + NULL, /* attach */ + uao_reference, /* reference */ + uao_detach, /* detach */ + NULL, /* fault */ + uao_flush, /* flush */ + uao_get, /* get */ + NULL, /* asyncget */ + NULL, /* put (done by pagedaemon) */ + NULL, /* cluster */ + NULL, /* mk_pcluster */ + uvm_shareprot, /* shareprot */ + NULL, /* aiodone */ + uao_releasepg /* releasepg */ +}; + +/* + * uao_list: global list of active aobjs, locked by uao_list_lock + */ + +static LIST_HEAD(aobjlist, uvm_aobj) uao_list; +static simple_lock_data_t uao_list_lock; + + +/* + * functions + */ + +/* + * hash table/array related functions + */ + +/* + * uao_find_swhash_elt: find (or create) a hash table entry for a page + * offset. + * + * => the object should be locked by the caller + */ + +static struct uao_swhash_elt * +uao_find_swhash_elt(aobj, pageidx, create) + struct uvm_aobj *aobj; + int pageidx; + boolean_t create; +{ + struct uao_swhash *swhash; + struct uao_swhash_elt *elt; + int page_tag; + + swhash = UAO_SWHASH_HASH(aobj, pageidx); /* first hash to get bucket */ + page_tag = UAO_SWHASH_ELT_TAG(pageidx); /* tag to search for */ + + /* + * now search the bucket for the requested tag + */ + for (elt = swhash->lh_first; elt != NULL; elt = elt->list.le_next) { + if (elt->tag == page_tag) + return(elt); + } + + /* fail now if we are not allowed to create a new entry in the bucket */ + if (!create) + return NULL; + + + /* + * allocate a new entry for the bucket and init/insert it in + */ + elt = pool_get(&uao_swhash_elt_pool, PR_WAITOK); + LIST_INSERT_HEAD(swhash, elt, list); + elt->tag = page_tag; + elt->count = 0; + bzero(elt->slots, sizeof(elt->slots)); + + return(elt); +} + +/* + * uao_find_swslot: find the swap slot number for an aobj/pageidx + * + * => object must be locked by caller + */ +__inline static int +uao_find_swslot(aobj, pageidx) + struct uvm_aobj *aobj; + int pageidx; +{ + + /* + * if noswap flag is set, then we never return a slot + */ + + if (aobj->u_flags & UAO_FLAG_NOSWAP) + return(0); + + /* + * if hashing, look in hash table. + */ + + if (UAO_USES_SWHASH(aobj)) { + struct uao_swhash_elt *elt = + uao_find_swhash_elt(aobj, pageidx, FALSE); + + if (elt) + return(UAO_SWHASH_ELT_PAGESLOT(elt, pageidx)); + else + return(NULL); + } + + /* + * otherwise, look in the array + */ + return(aobj->u_swslots[pageidx]); +} + +/* + * uao_set_swslot: set the swap slot for a page in an aobj. + * + * => setting a slot to zero frees the slot + * => object must be locked by caller + */ +int +uao_set_swslot(uobj, pageidx, slot) + struct uvm_object *uobj; + int pageidx, slot; +{ + struct uvm_aobj *aobj = (struct uvm_aobj *)uobj; + int oldslot; + UVMHIST_FUNC("uao_set_swslot"); UVMHIST_CALLED(pdhist); + UVMHIST_LOG(pdhist, "aobj %p pageidx %d slot %d", + aobj, pageidx, slot, 0); + + /* + * if noswap flag is set, then we can't set a slot + */ + + if (aobj->u_flags & UAO_FLAG_NOSWAP) { + + if (slot == 0) + return(0); /* a clear is ok */ + + /* but a set is not */ + printf("uao_set_swslot: uobj = %p\n", uobj); + panic("uao_set_swslot: attempt to set a slot on a NOSWAP object"); + } + + /* + * are we using a hash table? if so, add it in the hash. + */ + + if (UAO_USES_SWHASH(aobj)) { + /* + * Avoid allocating an entry just to free it again if + * the page had not swap slot in the first place, and + * we are freeing. + */ + struct uao_swhash_elt *elt = + uao_find_swhash_elt(aobj, pageidx, slot ? 
TRUE : FALSE); + if (elt == NULL) { +#ifdef DIAGNOSTIC + if (slot) + panic("uao_set_swslot: didn't create elt"); +#endif + return (0); + } + + oldslot = UAO_SWHASH_ELT_PAGESLOT(elt, pageidx); + UAO_SWHASH_ELT_PAGESLOT(elt, pageidx) = slot; + + /* + * now adjust the elt's reference counter and free it if we've + * dropped it to zero. + */ + + /* an allocation? */ + if (slot) { + if (oldslot == 0) + elt->count++; + } else { /* freeing slot ... */ + if (oldslot) /* to be safe */ + elt->count--; + + if (elt->count == 0) { + LIST_REMOVE(elt, list); + pool_put(&uao_swhash_elt_pool, elt); + } + } + + } else { + /* we are using an array */ + oldslot = aobj->u_swslots[pageidx]; + aobj->u_swslots[pageidx] = slot; + } + return (oldslot); +} + +/* + * end of hash/array functions + */ + +/* + * uao_free: free all resources held by an aobj, and then free the aobj + * + * => the aobj should be dead + */ +static void +uao_free(aobj) + struct uvm_aobj *aobj; +{ + + if (UAO_USES_SWHASH(aobj)) { + int i, hashbuckets = aobj->u_swhashmask + 1; + + /* + * free the swslots from each hash bucket, + * then the hash bucket, and finally the hash table itself. + */ + for (i = 0; i < hashbuckets; i++) { + struct uao_swhash_elt *elt, *next; + + for (elt = aobj->u_swhash[i].lh_first; elt != NULL; + elt = next) { + int j; + + for (j = 0; j < UAO_SWHASH_CLUSTER_SIZE; j++) + { + int slot = elt->slots[j]; + + if (slot) + uvm_swap_free(slot, 1); + } + + next = elt->list.le_next; + pool_put(&uao_swhash_elt_pool, elt); + } + } + FREE(aobj->u_swhash, M_UVMAOBJ); + } else { + int i; + + /* + * free the array + */ + + for (i = 0; i < aobj->u_pages; i++) + { + int slot = aobj->u_swslots[i]; + + if (slot) + uvm_swap_free(slot, 1); + } + FREE(aobj->u_swslots, M_UVMAOBJ); + } + + /* + * finally free the aobj itself + */ + pool_put(&uvm_aobj_pool, aobj); +} + +/* + * pager functions + */ + +/* + * uao_create: create an aobj of the given size and return its uvm_object. + * + * => for normal use, flags are always zero + * => for the kernel object, the flags are: + * UAO_FLAG_KERNOBJ - allocate the kernel object (can only happen once) + * UAO_FLAG_KERNSWAP - enable swapping of kernel object (" ") + */ +struct uvm_object * +uao_create(size, flags) + vsize_t size; + int flags; +{ + static struct uvm_aobj kernel_object_store; /* home of kernel_object */ + static int kobj_alloced = 0; /* not allocated yet */ + int pages = round_page(size) >> PAGE_SHIFT; + struct uvm_aobj *aobj; + + /* + * malloc a new aobj unless we are asked for the kernel object + */ + if (flags & UAO_FLAG_KERNOBJ) { /* want kernel object? */ + if (kobj_alloced) + panic("uao_create: kernel object already allocated"); + + /* + * XXXTHORPEJ: Need to call this now, so the pool gets + * initialized! 
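/*
 * uao_set_swslot above keeps a per-cluster count of live (non-zero)
 * slots so that a hash element can be freed as soon as its last slot is
 * cleared.  A minimal sketch of that bookkeeping on a single cluster
 * element; the element lookup and the kernel's pool allocator are left
 * out, and the names are illustrative:
 */
#include <stdio.h>

#define CLUSTER_SIZE 16

struct cluster_elt {
    int count;                     /* number of non-zero slots */
    int slots[CLUSTER_SIZE];
};

/* store "slot" at index idx (0 means free) and return the old value */
static int
set_slot(struct cluster_elt *elt, int idx, int slot)
{
    int oldslot = elt->slots[idx];

    elt->slots[idx] = slot;
    if (slot != 0) {
        if (oldslot == 0)
            elt->count++;          /* a new live slot */
    } else if (oldslot != 0) {
        elt->count--;              /* a live slot was cleared */
        if (elt->count == 0) {
            /* last live slot gone: the element could now be freed */
            printf("element is now empty\n");
        }
    }
    return oldslot;
}

int
main(void)
{
    struct cluster_elt elt = { 0 };

    set_slot(&elt, 3, 42);         /* assign a swap slot */
    set_slot(&elt, 3, 0);          /* clear it again -> element empty */
    return 0;
}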
+ */ + uao_init(); + + aobj = &kernel_object_store; + aobj->u_pages = pages; + aobj->u_flags = UAO_FLAG_NOSWAP; /* no swap to start */ + /* we are special, we never die */ + aobj->u_obj.uo_refs = UVM_OBJ_KERN; + kobj_alloced = UAO_FLAG_KERNOBJ; + } else if (flags & UAO_FLAG_KERNSWAP) { + aobj = &kernel_object_store; + if (kobj_alloced != UAO_FLAG_KERNOBJ) + panic("uao_create: asked to enable swap on kernel object"); + kobj_alloced = UAO_FLAG_KERNSWAP; + } else { /* normal object */ + aobj = pool_get(&uvm_aobj_pool, PR_WAITOK); + aobj->u_pages = pages; + aobj->u_flags = 0; /* normal object */ + aobj->u_obj.uo_refs = 1; /* start with 1 reference */ + } + + /* + * allocate hash/array if necessary + * + * note: in the KERNSWAP case no need to worry about locking since + * we are still booting we should be the only thread around. + */ + if (flags == 0 || (flags & UAO_FLAG_KERNSWAP) != 0) { + int mflags = (flags & UAO_FLAG_KERNSWAP) != 0 ? + M_NOWAIT : M_WAITOK; + + /* allocate hash table or array depending on object size */ + if (UAO_USES_SWHASH(aobj)) { + aobj->u_swhash = newhashinit(UAO_SWHASH_BUCKETS(aobj), + M_UVMAOBJ, mflags, &aobj->u_swhashmask); + if (aobj->u_swhash == NULL) + panic("uao_create: hashinit swhash failed"); + } else { + MALLOC(aobj->u_swslots, int *, pages * sizeof(int), + M_UVMAOBJ, mflags); + if (aobj->u_swslots == NULL) + panic("uao_create: malloc swslots failed"); + bzero(aobj->u_swslots, pages * sizeof(int)); + } + + if (flags) { + aobj->u_flags &= ~UAO_FLAG_NOSWAP; /* clear noswap */ + return(&aobj->u_obj); + /* done! */ + } + } + + /* + * init aobj fields + */ + simple_lock_init(&aobj->u_obj.vmobjlock); + aobj->u_obj.pgops = &aobj_pager; + TAILQ_INIT(&aobj->u_obj.memq); + aobj->u_obj.uo_npages = 0; + + /* + * now that aobj is ready, add it to the global list + * XXXCHS: uao_init hasn't been called'd in the KERNOBJ case, + * do we really need the kernel object on this list anyway? + */ + simple_lock(&uao_list_lock); + LIST_INSERT_HEAD(&uao_list, aobj, u_list); + simple_unlock(&uao_list_lock); + + /* + * done! + */ + return(&aobj->u_obj); +} + + + +/* + * uao_init: set up aobj pager subsystem + * + * => called at boot time from uvm_pager_init() + */ +static void +uao_init() +{ + static int uao_initialized; + + if (uao_initialized) + return; + uao_initialized = TRUE; + + LIST_INIT(&uao_list); + simple_lock_init(&uao_list_lock); + + /* + * NOTE: Pages fror this pool must not come from a pageable + * kernel map! + */ + pool_init(&uao_swhash_elt_pool, sizeof(struct uao_swhash_elt), + 0, 0, 0, "uaoeltpl", 0, NULL, NULL, M_UVMAOBJ); + + pool_init(&uvm_aobj_pool, sizeof(struct uvm_aobj), 0, 0, 0, + "aobjpl", 0, + pool_page_alloc_nointr, pool_page_free_nointr, M_UVMAOBJ); +} + +/* + * uao_reference: add a ref to an aobj + * + * => aobj must be unlocked (we will lock it) + */ +void +uao_reference(uobj) + struct uvm_object *uobj; +{ + UVMHIST_FUNC("uao_reference"); UVMHIST_CALLED(maphist); + + /* + * kernel_object already has plenty of references, leave it alone. + */ + + if (uobj->uo_refs == UVM_OBJ_KERN) + return; + + simple_lock(&uobj->vmobjlock); + uobj->uo_refs++; /* bump! 
*/ + UVMHIST_LOG(maphist, "<- done (uobj=0x%x, ref = %d)", + uobj, uobj->uo_refs,0,0); + simple_unlock(&uobj->vmobjlock); +} + +/* + * uao_detach: drop a reference to an aobj + * + * => aobj must be unlocked, we will lock it + */ +void +uao_detach(uobj) + struct uvm_object *uobj; +{ + struct uvm_aobj *aobj = (struct uvm_aobj *)uobj; + struct vm_page *pg; + boolean_t busybody; + UVMHIST_FUNC("uao_detach"); UVMHIST_CALLED(maphist); + + /* + * detaching from kernel_object is a noop. + */ + if (uobj->uo_refs == UVM_OBJ_KERN) + return; + + simple_lock(&uobj->vmobjlock); + + UVMHIST_LOG(maphist," (uobj=0x%x) ref=%d", uobj,uobj->uo_refs,0,0); + uobj->uo_refs--; /* drop ref! */ + if (uobj->uo_refs) { /* still more refs? */ + simple_unlock(&uobj->vmobjlock); + UVMHIST_LOG(maphist, "<- done (rc>0)", 0,0,0,0); + return; + } + + /* + * remove the aobj from the global list. + */ + simple_lock(&uao_list_lock); + LIST_REMOVE(aobj, u_list); + simple_unlock(&uao_list_lock); + + /* + * free all the pages that aren't PG_BUSY, mark for release any that are. + */ + + busybody = FALSE; + for (pg = uobj->memq.tqh_first ; pg != NULL ; pg = pg->listq.tqe_next) { + int swslot; + + if (pg->flags & PG_BUSY) { + pg->flags |= PG_RELEASED; + busybody = TRUE; + continue; + } + + + /* zap the mappings, free the swap slot, free the page */ + pmap_page_protect(PMAP_PGARG(pg), VM_PROT_NONE); + + swslot = uao_set_swslot(&aobj->u_obj, + pg->offset >> PAGE_SHIFT, 0); + if (swslot) { + uvm_swap_free(swslot, 1); + } + + uvm_lock_pageq(); + uvm_pagefree(pg); + uvm_unlock_pageq(); + } + + /* + * if we found any busy pages, we're done for now. + * mark the aobj for death, releasepg will finish up for us. + */ + if (busybody) { + aobj->u_flags |= UAO_FLAG_KILLME; + simple_unlock(&aobj->u_obj.vmobjlock); + return; + } + + /* + * finally, free the rest. + */ + uao_free(aobj); +} + +/* + * uao_flush: uh, yea, sure it's flushed. really! + */ +boolean_t +uao_flush(uobj, start, end, flags) + struct uvm_object *uobj; + vaddr_t start, end; + int flags; +{ + + /* + * anonymous memory doesn't "flush" + */ + /* + * XXX + * deal with PGO_DEACTIVATE (for madvise(MADV_SEQUENTIAL)) + * and PGO_FREE (for msync(MSINVALIDATE)) + */ + return TRUE; +} + +/* + * uao_get: fetch me a page + * + * we have three cases: + * 1: page is resident -> just return the page. + * 2: page is zero-fill -> allocate a new page and zero it. + * 3: page is swapped out -> fetch the page from swap. + * + * cases 1 and 2 can be handled with PGO_LOCKED, case 3 cannot. + * so, if the "center" page hits case 3 (or any page, with PGO_ALLPAGES), + * then we will need to return VM_PAGER_UNLOCK. + * + * => prefer map unlocked (not required) + * => object must be locked! we will _unlock_ it before starting any I/O. + * => flags: PGO_ALLPAGES: get all of the pages + * PGO_LOCKED: fault data structures are locked + * => NOTE: offset is the offset of pps[0], _NOT_ pps[centeridx] + * => NOTE: caller must check for released pages!! 
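/*
 * The three cases listed above reduce to a small decision: a resident
 * page is returned as-is, a page with no swap slot can be allocated and
 * zero-filled on the spot, and only a swapped-out page forces the
 * PGO_LOCKED fast path to give up and request an unlocked retry that
 * can do I/O.  A schematic sketch of that decision; the "resident" and
 * "swslot" inputs stand in for uvm_pagelookup()/uao_find_swslot():
 */
#include <stdio.h>

enum get_result { GET_OK, GET_ZERO_FILL, GET_NEEDS_IO };

static enum get_result
classify_page(int resident, int swslot)
{
    if (resident)
        return GET_OK;            /* case 1: already in memory */
    if (swslot == 0)
        return GET_ZERO_FILL;     /* case 2: fresh page, just zero it */
    return GET_NEEDS_IO;          /* case 3: must page in from swap */
}

int
main(void)
{
    /* under PGO_LOCKED, only GET_NEEDS_IO forces an unlocked retry */
    printf("%d %d %d\n",
        classify_page(1, 0),      /* resident */
        classify_page(0, 0),      /* zero fill */
        classify_page(0, 7));     /* swapped out */
    return 0;
}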
+ */ +static int +uao_get(uobj, offset, pps, npagesp, centeridx, access_type, advice, flags) + struct uvm_object *uobj; + vaddr_t offset; + struct vm_page **pps; + int *npagesp; + int centeridx, advice, flags; + vm_prot_t access_type; +{ + struct uvm_aobj *aobj = (struct uvm_aobj *)uobj; + vaddr_t current_offset; + vm_page_t ptmp; + int lcv, gotpages, maxpages, swslot, rv; + boolean_t done; + UVMHIST_FUNC("uao_get"); UVMHIST_CALLED(pdhist); + + UVMHIST_LOG(pdhist, "aobj=%p offset=%d, flags=%d", aobj, offset, flags,0); + + /* + * get number of pages + */ + + maxpages = *npagesp; + + /* + * step 1: handled the case where fault data structures are locked. + */ + + if (flags & PGO_LOCKED) { + + /* + * step 1a: get pages that are already resident. only do + * this if the data structures are locked (i.e. the first + * time through). + */ + + done = TRUE; /* be optimistic */ + gotpages = 0; /* # of pages we got so far */ + + for (lcv = 0, current_offset = offset ; lcv < maxpages ; + lcv++, current_offset += PAGE_SIZE) { + /* do we care about this page? if not, skip it */ + if (pps[lcv] == PGO_DONTCARE) + continue; + + ptmp = uvm_pagelookup(uobj, current_offset); + + /* + * if page is new, attempt to allocate the page, then + * zero-fill it. + */ + if (ptmp == NULL && uao_find_swslot(aobj, + current_offset >> PAGE_SHIFT) == 0) { + ptmp = uvm_pagealloc(uobj, current_offset, + NULL); + if (ptmp) { + /* new page */ + ptmp->flags &= ~(PG_BUSY|PG_FAKE); + ptmp->pqflags |= PQ_AOBJ; + UVM_PAGE_OWN(ptmp, NULL); + uvm_pagezero(ptmp); + } + } + + /* + * to be useful must get a non-busy, non-released page + */ + if (ptmp == NULL || + (ptmp->flags & (PG_BUSY|PG_RELEASED)) != 0) { + if (lcv == centeridx || + (flags & PGO_ALLPAGES) != 0) + /* need to do a wait or I/O! */ + done = FALSE; + continue; + } + + /* + * useful page: busy/lock it and plug it in our + * result array + */ + /* caller must un-busy this page */ + ptmp->flags |= PG_BUSY; + UVM_PAGE_OWN(ptmp, "uao_get1"); + pps[lcv] = ptmp; + gotpages++; + + } /* "for" lcv loop */ + + /* + * step 1b: now we've either done everything needed or we + * to unlock and do some waiting or I/O. + */ + + UVMHIST_LOG(pdhist, "<- done (done=%d)", done, 0,0,0); + + *npagesp = gotpages; + if (done) + /* bingo! */ + return(VM_PAGER_OK); + else + /* EEK! Need to unlock and I/O */ + return(VM_PAGER_UNLOCK); + } + + /* + * step 2: get non-resident or busy pages. + * object is locked. data structures are unlocked. + */ + + for (lcv = 0, current_offset = offset ; lcv < maxpages ; + lcv++, current_offset += PAGE_SIZE) { + /* + * - skip over pages we've already gotten or don't want + * - skip over pages we don't _have_ to get + */ + if (pps[lcv] != NULL || + (lcv != centeridx && (flags & PGO_ALLPAGES) == 0)) + continue; + + /* + * we have yet to locate the current page (pps[lcv]). we + * first look for a page that is already at the current offset. + * if we find a page, we check to see if it is busy or + * released. if that is the case, then we sleep on the page + * until it is no longer busy or released and repeat the lookup. + * if the page we found is neither busy nor released, then we + * busy it (so we own it) and plug it into pps[lcv]. this + * 'break's the following while loop and indicates we are + * ready to move on to the next page in the "lcv" loop above. + * + * if we exit the while loop with pps[lcv] still set to NULL, + * then it means that we allocated a new busy/fake/clean page + * ptmp in the object and we need to do I/O to fill in the data. 
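/*
 * The retry loop described above (and implemented just below) is a
 * lookup-or-allocate loop: look the page up; if it is missing, allocate
 * one, sleeping and retrying when allocation fails; if it exists but is
 * busy, sleep until it is released and re-check from the top; otherwise
 * claim it.  A standalone skeleton of that control flow, with the VM
 * calls, locking and PG_WANTED handling replaced by illustrative stubs:
 */
#include <stdio.h>

struct page { int busy; };

static struct page the_page = { 1 };   /* simulated page, starts busy */
static int resident = 1;               /* simulated residency */

static struct page *page_lookup(void) { return resident ? &the_page : NULL; }

static struct page *
page_alloc(void)
{
    resident = 1;                      /* pretend the allocation worked */
    the_page.busy = 0;
    return &the_page;
}

static void wait_for_memory(void)  { printf("out of memory: sleeping\n"); }
static void
wait_until_unbusy(struct page *pg)
{
    printf("page busy: sleeping\n");
    pg->busy = 0;                      /* pretend the owner released it */
}

static struct page *
get_or_alloc_page(void)
{
    struct page *pg;

    for (;;) {
        pg = page_lookup();
        if (pg == NULL) {
            pg = page_alloc();
            if (pg == NULL) {          /* out of RAM: wait and retry */
                wait_for_memory();
                continue;
            }
            return pg;                 /* fresh page: caller does the I/O */
        }
        if (pg->busy) {                /* someone else owns it */
            wait_until_unbusy(pg);
            continue;                  /* it may have changed: retry */
        }
        pg->busy = 1;                  /* claim it for ourselves */
        return pg;
    }
}

int
main(void)
{
    return get_or_alloc_page() == &the_page ? 0 : 1;
}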
+ */ + + /* top of "pps" while loop */ + while (pps[lcv] == NULL) { + /* look for a resident page */ + ptmp = uvm_pagelookup(uobj, current_offset); + + /* not resident? allocate one now (if we can) */ + if (ptmp == NULL) { + + ptmp = uvm_pagealloc(uobj, current_offset, + NULL); /* alloc */ + + /* out of RAM? */ + if (ptmp == NULL) { + simple_unlock(&uobj->vmobjlock); + UVMHIST_LOG(pdhist, + "sleeping, ptmp == NULL\n",0,0,0,0); + uvm_wait("uao_getpage"); + simple_lock(&uobj->vmobjlock); + /* goto top of pps while loop */ + continue; + } + + /* + * safe with PQ's unlocked: because we just + * alloc'd the page + */ + ptmp->pqflags |= PQ_AOBJ; + + /* + * got new page ready for I/O. break pps while + * loop. pps[lcv] is still NULL. + */ + break; + } + + /* page is there, see if we need to wait on it */ + if ((ptmp->flags & (PG_BUSY|PG_RELEASED)) != 0) { + ptmp->flags |= PG_WANTED; + UVMHIST_LOG(pdhist, + "sleeping, ptmp->flags 0x%x\n", + ptmp->flags,0,0,0); + UVM_UNLOCK_AND_WAIT(ptmp, &uobj->vmobjlock, 0, + "uao_get", 0); + simple_lock(&uobj->vmobjlock); + continue; /* goto top of pps while loop */ + } + + /* + * if we get here then the page has become resident and + * unbusy between steps 1 and 2. we busy it now (so we + * own it) and set pps[lcv] (so that we exit the while + * loop). + */ + /* we own it, caller must un-busy */ + ptmp->flags |= PG_BUSY; + UVM_PAGE_OWN(ptmp, "uao_get2"); + pps[lcv] = ptmp; + } + + /* + * if we own the valid page at the correct offset, pps[lcv] will + * point to it. nothing more to do except go to the next page. + */ + if (pps[lcv]) + continue; /* next lcv */ + + /* + * we have a "fake/busy/clean" page that we just allocated. + * do the needed "i/o", either reading from swap or zeroing. + */ + swslot = uao_find_swslot(aobj, current_offset >> PAGE_SHIFT); + + /* + * just zero the page if there's nothing in swap. + */ + if (swslot == 0) + { + /* + * page hasn't existed before, just zero it. + */ + uvm_pagezero(ptmp); + } + else + { + UVMHIST_LOG(pdhist, "pagein from swslot %d", + swslot, 0,0,0); + + /* + * page in the swapped-out page. + * unlock object for i/o, relock when done. + */ + simple_unlock(&uobj->vmobjlock); + rv = uvm_swap_get(ptmp, swslot, PGO_SYNCIO); + simple_lock(&uobj->vmobjlock); + + /* + * I/O done. check for errors. + */ + if (rv != VM_PAGER_OK) + { + UVMHIST_LOG(pdhist, "<- done (error=%d)", + rv,0,0,0); + if (ptmp->flags & PG_WANTED) + /* object lock still held */ + thread_wakeup(ptmp); + ptmp->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(ptmp, NULL); + uvm_lock_pageq(); + uvm_pagefree(ptmp); + uvm_unlock_pageq(); + simple_unlock(&uobj->vmobjlock); + return (rv); + } + } + + /* + * we got the page! clear the fake flag (indicates valid + * data now in page) and plug into our result array. note + * that page is still busy. + * + * it is the callers job to: + * => check if the page is released + * => unbusy the page + * => activate the page + */ + + ptmp->flags &= ~PG_FAKE; /* data is valid ... */ + pmap_clear_modify(PMAP_PGARG(ptmp)); /* ... and clean */ + pps[lcv] = ptmp; + + } /* lcv loop */ + + /* + * finally, unlock object and return. + */ + + simple_unlock(&uobj->vmobjlock); + UVMHIST_LOG(pdhist, "<- done (OK)",0,0,0,0); + return(VM_PAGER_OK); +} + +/* + * uao_releasepg: handle released page in an aobj + * + * => "pg" is a PG_BUSY [caller owns it], PG_RELEASED page that we need + * to dispose of. 
+ * => caller must handle PG_WANTED case + * => called with page's object locked, pageq's unlocked + * => returns TRUE if page's object is still alive, FALSE if we + * killed the page's object. if we return TRUE, then we + * return with the object locked. + * => if (nextpgp != NULL) => we return pageq.tqe_next here, and return + * with the page queues locked [for pagedaemon] + * => if (nextpgp == NULL) => we return with page queues unlocked [normal case] + * => we kill the aobj if it is not referenced and we are suppose to + * kill it ("KILLME"). + */ +static boolean_t uao_releasepg(pg, nextpgp) + struct vm_page *pg; + struct vm_page **nextpgp; /* OUT */ +{ + struct uvm_aobj *aobj = (struct uvm_aobj *) pg->uobject; + int slot; + +#ifdef DIAGNOSTIC + if ((pg->flags & PG_RELEASED) == 0) + panic("uao_releasepg: page not released!"); +#endif + + /* + * dispose of the page [caller handles PG_WANTED] and swap slot. + */ + pmap_page_protect(PMAP_PGARG(pg), VM_PROT_NONE); + slot = uao_set_swslot(&aobj->u_obj, pg->offset >> PAGE_SHIFT, 0); + if (slot) + uvm_swap_free(slot, 1); + uvm_lock_pageq(); + if (nextpgp) + *nextpgp = pg->pageq.tqe_next; /* next page for daemon */ + uvm_pagefree(pg); + if (!nextpgp) + uvm_unlock_pageq(); /* keep locked for daemon */ + + /* + * if we're not killing the object, we're done. + */ + if ((aobj->u_flags & UAO_FLAG_KILLME) == 0) + return TRUE; + +#ifdef DIAGNOSTIC + if (aobj->u_obj.uo_refs) + panic("uvm_km_releasepg: kill flag set on referenced object!"); +#endif + + /* + * if there are still pages in the object, we're done for now. + */ + if (aobj->u_obj.uo_npages != 0) + return TRUE; + +#ifdef DIAGNOSTIC + if (aobj->u_obj.memq.tqh_first) + panic("uvn_releasepg: pages in object with npages == 0"); +#endif + + /* + * finally, free the rest. + */ + uao_free(aobj); + + return FALSE; +} diff --git a/sys/uvm/uvm_aobj.h b/sys/uvm/uvm_aobj.h new file mode 100644 index 00000000000..61beadb2157 --- /dev/null +++ b/sys/uvm/uvm_aobj.h @@ -0,0 +1,77 @@ +/* $NetBSD: uvm_aobj.h,v 1.6 1998/02/12 07:36:45 chs Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1998 Chuck Silvers, Charles D. Cranor and + * Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_aobj.h,v 1.1.2.4 1998/02/06 05:19:28 chs Exp + */ +/* + * uvm_aobj.h: anonymous memory uvm_object pager + * + * author: Chuck Silvers <chuq@chuq.com> + * started: Jan-1998 + * + * - design mostly from Chuck Cranor + */ + +#ifndef _UVM_UVM_AOBJ_H_ +#define _UVM_UVM_AOBJ_H_ + +/* + * flags + */ + +/* flags for uao_create: can only be used one time (at bootup) */ +#define UAO_FLAG_KERNOBJ 0x1 /* create kernel object */ +#define UAO_FLAG_KERNSWAP 0x2 /* enable kernel swap */ + +/* internal flags */ +#define UAO_FLAG_KILLME 0x4 /* aobj should die when last released + * page is no longer PG_BUSY ... */ +#define UAO_FLAG_NOSWAP 0x8 /* aobj can't swap (kernel obj only!) */ + +/* + * prototypes + */ + +int uao_set_swslot __P((struct uvm_object *, int, int)); + +/* + * globals + */ + +extern struct uvm_pagerops aobj_pager; + +#endif /* _UVM_UVM_AOBJ_H_ */ diff --git a/sys/uvm/uvm_ddb.h b/sys/uvm/uvm_ddb.h new file mode 100644 index 00000000000..7c82bdf0dd6 --- /dev/null +++ b/sys/uvm/uvm_ddb.h @@ -0,0 +1,56 @@ +/* $NetBSD: uvm_ddb.h,v 1.1 1998/07/04 22:18:53 jonathan Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * from: Id: uvm_extern.h,v 1.1.2.21 1998/02/07 01:16:53 chs Exp + */ + +#ifndef _UVM_UVM_DDB_H_ +#define _UVM_UVM_DDB_H_ + +#if defined(DDB) +void uvm_map_print __P((vm_map_t, boolean_t)); +void uvm_map_printit __P((vm_map_t, boolean_t, + int (*) __P((const char *, ...)))); + +void uvm_object_print __P((struct uvm_object *, boolean_t)); +void uvm_object_printit __P((struct uvm_object *, boolean_t, + int (*) __P((const char *, ...)))); +void uvm_page_print __P((struct vm_page *, boolean_t)); +void uvm_page_printit __P((struct vm_page *, boolean_t, + int (*) __P((const char *, ...)))); +#endif +#endif _UVM_UVM_DDB_H_ diff --git a/sys/uvm/uvm_device.c b/sys/uvm/uvm_device.c new file mode 100644 index 00000000000..6c249c42877 --- /dev/null +++ b/sys/uvm/uvm_device.c @@ -0,0 +1,507 @@ +/* $NetBSD: uvm_device.c,v 1.11 1998/11/19 05:23:26 mrg Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ + +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_device.c,v 1.1.2.9 1998/02/06 05:11:47 chs Exp + */ + +/* + * uvm_device.c: the device pager. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#include <uvm/uvm.h> +#include <uvm/uvm_device.h> + +/* + * private global data structure + * + * we keep a list of active device objects in the system. 
+ */ + +LIST_HEAD(udv_list_struct, uvm_device); +static struct udv_list_struct udv_list; +static simple_lock_data_t udv_lock; + +/* + * functions + */ + +static void udv_init __P((void)); +struct uvm_object *udv_attach __P((void *, vm_prot_t)); +static void udv_reference __P((struct uvm_object *)); +static void udv_detach __P((struct uvm_object *)); +static int udv_fault __P((struct uvm_faultinfo *, vaddr_t, + vm_page_t *, int, int, vm_fault_t, + vm_prot_t, int)); +static boolean_t udv_flush __P((struct uvm_object *, vaddr_t, + vaddr_t, int)); +static int udv_asyncget __P((struct uvm_object *, vaddr_t, + int)); +static int udv_put __P((struct uvm_object *, vm_page_t *, + int, boolean_t)); + +/* + * master pager structure + */ + +struct uvm_pagerops uvm_deviceops = { + udv_init, + udv_attach, + udv_reference, + udv_detach, + udv_fault, + udv_flush, + NULL, /* no get function since we have udv_fault */ + udv_asyncget, + udv_put, + NULL, /* no cluster function */ + NULL, /* no put cluster function */ + NULL, /* no share protect. no share maps for us */ + NULL, /* no AIO-DONE function since no async i/o */ + NULL, /* no releasepg function since no normal pages */ +}; + +/* + * the ops! + */ + +/* + * udv_init + * + * init pager private data structures. + */ + +void +udv_init() +{ + + LIST_INIT(&udv_list); + simple_lock_init(&udv_lock); +} + +/* + * udv_attach + * + * get a VM object that is associated with a device. allocate a new + * one if needed. + * + * => caller must _not_ already be holding the lock on the uvm_object. + * => in fact, nothing should be locked so that we can sleep here. + */ +struct uvm_object * +udv_attach(arg, accessprot) + void *arg; + vm_prot_t accessprot; +{ + dev_t device = *((dev_t *) arg); + struct uvm_device *udv, *lcv; + int (*mapfn) __P((dev_t, int, int)); + UVMHIST_FUNC("udv_attach"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist, "(device=0x%x)", device,0,0,0); + + /* + * before we do anything, ensure this device supports mmap + */ + + mapfn = cdevsw[major(device)].d_mmap; + if (mapfn == NULL || + mapfn == (int (*) __P((dev_t, int, int))) enodev || + mapfn == (int (*) __P((dev_t, int, int))) nullop) + return(NULL); + + /* + * keep looping until we get it + */ + + while (1) { + + /* + * first, attempt to find it on the main list + */ + + simple_lock(&udv_lock); + for (lcv = udv_list.lh_first ; lcv != NULL ; lcv = lcv->u_list.le_next) { + if (device == lcv->u_device) + break; + } + + /* + * got it on main list. put a hold on it and unlock udv_lock. + */ + + if (lcv) { + + /* + * if someone else has a hold on it, sleep and start + * over again. + */ + + if (lcv->u_flags & UVM_DEVICE_HOLD) { + lcv->u_flags |= UVM_DEVICE_WANTED; + UVM_UNLOCK_AND_WAIT(lcv, &udv_lock, FALSE, + "udv_attach",0); + continue; + } + + /* we are now holding it */ + lcv->u_flags |= UVM_DEVICE_HOLD; + simple_unlock(&udv_lock); + + /* + * bump reference count, unhold, return. + */ + + simple_lock(&lcv->u_obj.vmobjlock); + lcv->u_obj.uo_refs++; + simple_unlock(&lcv->u_obj.vmobjlock); + + simple_lock(&udv_lock); + if (lcv->u_flags & UVM_DEVICE_WANTED) + wakeup(lcv); + lcv->u_flags &= ~(UVM_DEVICE_WANTED|UVM_DEVICE_HOLD); + simple_unlock(&udv_lock); + return(&lcv->u_obj); + } + + /* + * did not find it on main list. need to malloc a new one. 
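/*
 * udv_attach cannot allocate while holding udv_lock (the malloc may
 * sleep), so the code below drops the lock, allocates, re-takes the
 * lock and then re-checks the list: if another thread attached the same
 * device in the meantime, the fresh node is discarded.  A pthread-based
 * sketch of that drop-lock/allocate/re-check pattern; the list, the
 * integer key and the error handling are illustrative simplifications:
 */
#include <pthread.h>
#include <stdlib.h>

struct node {
    int key;
    struct node *next;
};

static struct node *list_head;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct node *
list_find(int key)
{
    struct node *n;

    for (n = list_head; n != NULL; n = n->next)
        if (n->key == key)
            return n;
    return NULL;
}

/* find the node for "key", creating it if needed */
static struct node *
attach(int key)
{
    struct node *n, *fresh;

    pthread_mutex_lock(&list_lock);
    n = list_find(key);
    if (n != NULL) {
        pthread_mutex_unlock(&list_lock);
        return n;                          /* already there */
    }
    /* not found: the allocation may sleep, so drop the lock first */
    pthread_mutex_unlock(&list_lock);
    fresh = malloc(sizeof(*fresh));
    if (fresh == NULL)
        return NULL;
    pthread_mutex_lock(&list_lock);

    /* someone may have added it while we slept: re-check */
    n = list_find(key);
    if (n != NULL) {
        pthread_mutex_unlock(&list_lock);
        free(fresh);                       /* lost the race */
        return n;
    }
    fresh->key = key;
    fresh->next = list_head;
    list_head = fresh;
    pthread_mutex_unlock(&list_lock);
    return fresh;
}

int
main(void)
{
    return attach(7) == attach(7) ? 0 : 1; /* same node both times */
}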
+ */ + + simple_unlock(&udv_lock); + /* NOTE: we could sleep in the following malloc() */ + MALLOC(udv, struct uvm_device *, sizeof(*udv), M_TEMP, M_WAITOK); + simple_lock(&udv_lock); + + /* + * now we have to double check to make sure no one added it + * to the list while we were sleeping... + */ + + for (lcv = udv_list.lh_first ; lcv != NULL ; + lcv = lcv->u_list.le_next) { + if (device == lcv->u_device) + break; + } + + /* + * did we lose a race to someone else? free our memory and retry. + */ + + if (lcv) { + simple_unlock(&udv_lock); + FREE(udv, M_TEMP); + continue; + } + + /* + * we have it! init the data structures, add to list + * and return. + */ + + simple_lock_init(&udv->u_obj.vmobjlock); + udv->u_obj.pgops = &uvm_deviceops; + TAILQ_INIT(&udv->u_obj.memq); /* not used, but be safe */ + udv->u_obj.uo_npages = 0; + udv->u_obj.uo_refs = 1; + udv->u_flags = 0; + udv->u_device = device; + LIST_INSERT_HEAD(&udv_list, udv, u_list); + simple_unlock(&udv_lock); + + return(&udv->u_obj); + + } /* while(1) loop */ + + /*NOTREACHED*/ +} + +/* + * udv_reference + * + * add a reference to a VM object. Note that the reference count must + * already be one (the passed in reference) so there is no chance of the + * udv being released or locked out here. + * + * => caller must call with object unlocked. + */ + +static void +udv_reference(uobj) + struct uvm_object *uobj; +{ + UVMHIST_FUNC("udv_reference"); UVMHIST_CALLED(maphist); + + simple_lock(&uobj->vmobjlock); + uobj->uo_refs++; + UVMHIST_LOG(maphist, "<- done (uobj=0x%x, ref = %d)", + uobj, uobj->uo_refs,0,0); + simple_unlock(&uobj->vmobjlock); +} + +/* + * udv_detach + * + * remove a reference to a VM object. + * + * => caller must call with object unlocked and map locked. + */ + +static void +udv_detach(uobj) + struct uvm_object *uobj; +{ + struct uvm_device *udv = (struct uvm_device *) uobj; + UVMHIST_FUNC("udv_detach"); UVMHIST_CALLED(maphist); + + /* + * loop until done + */ + + while (1) { + simple_lock(&uobj->vmobjlock); + + if (uobj->uo_refs > 1) { + uobj->uo_refs--; /* drop ref! */ + simple_unlock(&uobj->vmobjlock); + UVMHIST_LOG(maphist," <- done, uobj=0x%x, ref=%d", + uobj,uobj->uo_refs,0,0); + return; + } + +#ifdef DIAGNOSTIC + if (uobj->uo_npages || uobj->memq.tqh_first) + panic("udv_detach: pages in a device object?"); +#endif + + /* + * now lock udv_lock + */ + simple_lock(&udv_lock); + + /* + * is it being held? if so, wait until others are done. + */ + if (udv->u_flags & UVM_DEVICE_HOLD) { + + /* + * want it + */ + udv->u_flags |= UVM_DEVICE_WANTED; + simple_unlock(&uobj->vmobjlock); + UVM_UNLOCK_AND_WAIT(udv, &udv_lock, FALSE, "udv_detach",0); + continue; + } + + /* + * got it! nuke it now. + */ + + LIST_REMOVE(udv, u_list); + if (udv->u_flags & UVM_DEVICE_WANTED) + wakeup(udv); + FREE(udv, M_TEMP); + break; /* DONE! */ + + } /* while (1) loop */ + + UVMHIST_LOG(maphist," <- done, freed uobj=0x%x", uobj,0,0,0); + return; +} + + +/* + * udv_flush + * + * flush pages out of a uvm object. a no-op for devices. + */ + +static boolean_t udv_flush(uobj, start, stop, flags) + struct uvm_object *uobj; + vaddr_t start, stop; + int flags; +{ + + return(TRUE); +} + +/* + * udv_fault: non-standard fault routine for device "pages" + * + * => rather than having a "get" function, we have a fault routine + * since we don't return vm_pages we need full control over the + * pmap_enter map in + * => all the usual fault data structured are locked by the caller + * (i.e. 
maps(read), amap (if any), uobj) + * => on return, we unlock all fault data structures + * => flags: PGO_ALLPAGES: get all of the pages + * PGO_LOCKED: fault data structures are locked + * XXX: currently PGO_LOCKED is always required ... consider removing + * it as a flag + * => NOTE: vaddr is the VA of pps[0] in ufi->entry, _NOT_ pps[centeridx] + */ + +static int +udv_fault(ufi, vaddr, pps, npages, centeridx, fault_type, access_type, flags) + struct uvm_faultinfo *ufi; + vaddr_t vaddr; + vm_page_t *pps; + int npages, centeridx, flags; + vm_fault_t fault_type; + vm_prot_t access_type; +{ + struct vm_map_entry *entry = ufi->entry; + struct uvm_object *uobj = entry->object.uvm_obj; + struct uvm_device *udv = (struct uvm_device *)uobj; + vaddr_t curr_offset, curr_va; + paddr_t paddr; + int lcv, retval, mdpgno; + dev_t device; + int (*mapfn) __P((dev_t, int, int)); + UVMHIST_FUNC("udv_fault"); UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist," flags=%d", flags,0,0,0); + + /* + * XXX: !PGO_LOCKED calls are currently not allowed (or used) + */ + + if ((flags & PGO_LOCKED) == 0) + panic("udv_fault: !PGO_LOCKED fault"); + + /* + * we do not allow device mappings to be mapped copy-on-write + * so we kill any attempt to do so here. + */ + + if (UVM_ET_ISCOPYONWRITE(entry)) { + UVMHIST_LOG(maphist, "<- failed -- COW entry (etype=0x%x)", + entry->etype, 0,0,0); + uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj, NULL); + return(VM_PAGER_ERROR); + } + + /* + * get device map function. + */ + device = udv->u_device; + mapfn = cdevsw[major(device)].d_mmap; + + /* + * now we must determine the offset in udv to use and the VA to + * use for pmap_enter. note that we always use orig_map's pmap + * for pmap_enter (even if we have a submap). since virtual + * addresses in a submap must match the main map, this is ok. + */ + /* udv offset = (offset from start of entry) + entry's offset */ + curr_offset = (vaddr - entry->start) + entry->offset; + /* pmap va = vaddr (virtual address of pps[0]) */ + curr_va = vaddr; + + /* + * loop over the page range entering in as needed + */ + + retval = VM_PAGER_OK; + for (lcv = 0 ; lcv < npages ; lcv++, curr_offset += PAGE_SIZE, + curr_va += PAGE_SIZE) { + if ((flags & PGO_ALLPAGES) == 0 && lcv != centeridx) + continue; + + if (pps[lcv] == PGO_DONTCARE) + continue; + + mdpgno = (*mapfn)(device, (int)curr_offset, access_type); + if (mdpgno == -1) { + retval = VM_PAGER_ERROR; + break; + } + paddr = pmap_phys_address(mdpgno); + UVMHIST_LOG(maphist, + " MAPPING: device: pm=0x%x, va=0x%x, pa=0x%x, at=%d", + ufi->orig_map->pmap, curr_va, (int)paddr, access_type); + pmap_enter(ufi->orig_map->pmap, curr_va, paddr, access_type, 0); + + } + + uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj, NULL); + return(retval); +} + +/* + * udv_asyncget: start async I/O to bring pages into ram + * + * => caller must lock object(???XXX: see if this is best) + * => a no-op for devices + */ + +static int +udv_asyncget(uobj, offset, npages) + struct uvm_object *uobj; + vaddr_t offset; + int npages; +{ + + return(KERN_SUCCESS); +} + +/* + * udv_put: flush page data to backing store. 
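/*
 * The fault loop in udv_fault above walks the faulting range one page
 * at a time: the device offset starts at (vaddr - entry->start) +
 * entry->offset, and both the offset and the mapping VA advance by one
 * page per iteration while the driver's d_mmap hook is asked for each
 * physical page.  An arithmetic-only sketch of that walk; PAGE_SIZE and
 * the entry values are example numbers:
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL

int
main(void)
{
    unsigned long entry_start  = 0x20000000UL;  /* VA where the entry begins */
    unsigned long entry_offset = 0x8000UL;      /* device offset of entry_start */
    unsigned long vaddr        = 0x20003000UL;  /* VA of the faulting page */
    int npages = 3, lcv;

    unsigned long curr_offset = (vaddr - entry_start) + entry_offset;
    unsigned long curr_va     = vaddr;

    for (lcv = 0; lcv < npages; lcv++,
        curr_offset += PAGE_SIZE, curr_va += PAGE_SIZE)
        printf("map VA 0x%lx to device offset 0x%lx\n",
            curr_va, curr_offset);
    return 0;
}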
+ * + * => this function should never be called (since we never have any + * page structures to "put") + */ + +static int +udv_put(uobj, pps, npages, flags) + struct uvm_object *uobj; + struct vm_page **pps; + int npages, flags; +{ + + panic("udv_put: trying to page out to a device!"); +} diff --git a/sys/uvm/uvm_device.h b/sys/uvm/uvm_device.h new file mode 100644 index 00000000000..347e4cb1dac --- /dev/null +++ b/sys/uvm/uvm_device.h @@ -0,0 +1,76 @@ +/* $NetBSD: uvm_device.h,v 1.5 1998/03/09 00:58:56 mrg Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_device.h,v 1.1.2.2 1997/10/03 17:39:44 chuck Exp + */ + +#ifndef _UVM_UVM_DEVICE_H_ +#define _UVM_UVM_DEVICE_H_ + +/* + * uvm_device.h + * + * device handle into the VM system. + */ + +/* + * the uvm_device structure. object is put at the top of the data structure. + * this allows: + * (struct uvm_device *) == (struct uvm_object *) + */ + +struct uvm_device { + struct uvm_object u_obj; /* the actual VM object */ + int u_flags; /* flags [LOCKED BY UDV_LOCK!] */ + dev_t u_device; /* our device */ + LIST_ENTRY(uvm_device) u_list; /* list of device objects */ +}; + +/* + * u_flags values + */ + +#define UVM_DEVICE_HOLD 0x1 /* someone has a "hold" on it */ +#define UVM_DEVICE_WANTED 0x2 /* someone wants to put a "hold" on */ + +/* + * prototypes + */ + +struct uvm_object *udv_attach __P((void *, vm_prot_t)); + +#endif /* _UVM_UVM_DEVICE_H_ */ diff --git a/sys/uvm/uvm_extern.h b/sys/uvm/uvm_extern.h new file mode 100644 index 00000000000..bcec521f665 --- /dev/null +++ b/sys/uvm/uvm_extern.h @@ -0,0 +1,386 @@ +/* $NetBSD: uvm_extern.h,v 1.21 1998/09/08 23:44:21 thorpej Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! 
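/*
 * Both struct uvm_device above and struct uvm_aobj in uvm_aobj.c place
 * their struct uvm_object first, so a pointer to the specialised
 * structure and a pointer to its embedded object refer to the same
 * address and the pager can cast between them.  A minimal sketch of
 * that embed-the-base-first idiom; the types here are illustrative:
 */
#include <stdio.h>

struct object {                   /* the generic part, always first */
    int refs;
};

struct device_object {
    struct object obj;            /* must remain the first member */
    int unit;
};

/* generic code only sees struct object ... */
static void
object_ref(struct object *o)
{
    o->refs++;
}

int
main(void)
{
    struct device_object d = { { 0 }, 3 };
    struct device_object *back;

    object_ref(&d.obj);                           /* pass the embedded base */
    back = (struct device_object *)&d.obj;        /* same address as &d */
    printf("unit %d has %d ref(s)\n", back->unit, back->obj.refs);
    return 0;
}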
+ * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_extern.h,v 1.1.2.21 1998/02/07 01:16:53 chs Exp + */ + +#ifndef _UVM_UVM_EXTERN_H_ +#define _UVM_UVM_EXTERN_H_ + +/* + * uvm_extern.h: this file defines the external interface to the VM system. + * + * this should be the only file included by non-VM parts of the kernel + * which need access to VM services. if you want to know the interface + * to the MI VM layer without knowing the details, this is the file to + * learn. + * + * NOTE: vm system calls are prototyped in syscallargs.h + */ + +/* + * defines + */ + +/* + * the following defines are for uvm_map and functions which call it. 
+ */ + +/* protections bits */ +#define UVM_PROT_MASK 0x07 /* protection mask */ +#define UVM_PROT_NONE 0x00 /* protection none */ +#define UVM_PROT_ALL 0x07 /* everything */ +#define UVM_PROT_READ 0x01 /* read */ +#define UVM_PROT_WRITE 0x02 /* write */ +#define UVM_PROT_EXEC 0x04 /* exec */ + +/* protection short codes */ +#define UVM_PROT_R 0x01 /* read */ +#define UVM_PROT_W 0x02 /* write */ +#define UVM_PROT_RW 0x03 /* read-write */ +#define UVM_PROT_X 0x04 /* exec */ +#define UVM_PROT_RX 0x05 /* read-exec */ +#define UVM_PROT_WX 0x06 /* write-exec */ +#define UVM_PROT_RWX 0x07 /* read-write-exec */ + +/* 0x08: not used */ + +/* inherit codes */ +#define UVM_INH_MASK 0x30 /* inherit mask */ +#define UVM_INH_SHARE 0x00 /* "share" */ +#define UVM_INH_COPY 0x10 /* "copy" */ +#define UVM_INH_NONE 0x20 /* "none" */ +#define UVM_INH_DONATE 0x30 /* "donate" << not used */ + +/* 0x40, 0x80: not used */ + +/* bits 0x700: max protection, 0x800: not used */ + +/* bits 0x7000: advice, 0x8000: not used */ +/* advice: matches MADV_* from sys/mman.h */ +#define UVM_ADV_NORMAL 0x0 /* 'normal' */ +#define UVM_ADV_RANDOM 0x1 /* 'random' */ +#define UVM_ADV_SEQUENTIAL 0x2 /* 'sequential' */ +/* 0x3: will need, 0x4: dontneed */ +#define UVM_ADV_MASK 0x7 /* mask */ + +/* mapping flags */ +#define UVM_FLAG_FIXED 0x010000 /* find space */ +#define UVM_FLAG_OVERLAY 0x020000 /* establish overlay */ +#define UVM_FLAG_NOMERGE 0x040000 /* don't merge map entries */ +#define UVM_FLAG_COPYONW 0x080000 /* set copy_on_write flag */ +#define UVM_FLAG_AMAPPAD 0x100000 /* for bss: pad amap to reduce malloc() */ +#define UVM_FLAG_TRYLOCK 0x200000 /* fail if we can not lock map */ + +/* macros to extract info */ +#define UVM_PROTECTION(X) ((X) & UVM_PROT_MASK) +#define UVM_INHERIT(X) (((X) & UVM_INH_MASK) >> 4) +#define UVM_MAXPROTECTION(X) (((X) >> 8) & UVM_PROT_MASK) +#define UVM_ADVICE(X) (((X) >> 12) & UVM_ADV_MASK) + +#define UVM_MAPFLAG(PROT,MAXPROT,INH,ADVICE,FLAGS) \ + ((MAXPROT << 8)|(PROT)|(INH)|((ADVICE) << 12)|(FLAGS)) + +/* magic offset value */ +#define UVM_UNKNOWN_OFFSET ((vaddr_t) -1) + /* offset not known(obj) or don't care(!obj) */ + +/* + * the following defines are for uvm_km_kmemalloc's flags + */ + +#define UVM_KMF_NOWAIT 0x1 /* matches M_NOWAIT */ +#define UVM_KMF_VALLOC 0x2 /* allocate VA only */ +#define UVM_KMF_TRYLOCK UVM_FLAG_TRYLOCK /* try locking only */ + +/* + * the following defines the strategies for uvm_pagealloc_strat() + */ +#define UVM_PGA_STRAT_NORMAL 0 /* high -> low free list walk */ +#define UVM_PGA_STRAT_ONLY 1 /* only specified free list */ +#define UVM_PGA_STRAT_FALLBACK 2 /* ONLY falls back on NORMAL */ + +/* + * structures + */ + +struct core; +struct mount; +struct pglist; +struct proc; +struct ucred; +struct uio; +struct uvm_object; +struct vm_anon; +struct vmspace; +struct pmap; +struct vnode; + +/* + * uvmexp: global data structures that are exported to parts of the kernel + * other than the vm system. 
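Editor's aside on the flag encoding above (not part of the imported diff): UVM_MAPFLAG packs protection, maximum protection, inheritance, advice and the mapping flags into a single word, and the extraction macros recover each field. The following userspace sketch copies just those macros from the header above to show the round trip; the chosen values are arbitrary.

/*
 * Editor's sketch: packing and unpacking a uvm_map() flag word.
 * Macro definitions are copied from uvm_extern.h above; main() and
 * the example values are illustrative only.
 */
#include <stdio.h>

#define UVM_PROT_MASK	0x07
#define UVM_PROT_RW	0x03
#define UVM_PROT_RWX	0x07
#define UVM_INH_COPY	0x10
#define UVM_INH_MASK	0x30
#define UVM_ADV_RANDOM	0x1
#define UVM_ADV_MASK	0x7
#define UVM_FLAG_COPYONW 0x080000

#define UVM_MAPFLAG(PROT,MAXPROT,INH,ADVICE,FLAGS) \
	((MAXPROT << 8)|(PROT)|(INH)|((ADVICE) << 12)|(FLAGS))
#define UVM_PROTECTION(X)	((X) & UVM_PROT_MASK)
#define UVM_INHERIT(X)		(((X) & UVM_INH_MASK) >> 4)
#define UVM_MAXPROTECTION(X)	(((X) >> 8) & UVM_PROT_MASK)
#define UVM_ADVICE(X)		(((X) >> 12) & UVM_ADV_MASK)

int
main(void)
{
	unsigned f = UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RWX, UVM_INH_COPY,
	    UVM_ADV_RANDOM, UVM_FLAG_COPYONW);

	printf("flags     = 0x%x\n", f);			/* 0x81713 */
	printf("prot      = 0x%x\n", UVM_PROTECTION(f));	/* 0x3 (RW) */
	printf("maxprot   = 0x%x\n", UVM_MAXPROTECTION(f));	/* 0x7 (RWX) */
	printf("inherit   = 0x%x\n", UVM_INHERIT(f));		/* 0x1 (copy) */
	printf("advice    = 0x%x\n", UVM_ADVICE(f));		/* 0x1 (random) */
	printf("copyonw   = %s\n", (f & UVM_FLAG_COPYONW) ? "yes" : "no");
	return 0;
}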
+ */ + +struct uvmexp { + /* vm_page constants */ + int pagesize; /* size of a page (PAGE_SIZE): must be power of 2 */ + int pagemask; /* page mask */ + int pageshift; /* page shift */ + + /* vm_page counters */ + int npages; /* number of pages we manage */ + int free; /* number of free pages */ + int active; /* number of active pages */ + int inactive; /* number of pages that we free'd but may want back */ + int paging; /* number of pages in the process of being paged out */ + int wired; /* number of wired pages */ + int reserve_pagedaemon; /* number of pages reserved for pagedaemon */ + int reserve_kernel; /* number of pages reserved for kernel */ + + /* pageout params */ + int freemin; /* min number of free pages */ + int freetarg; /* target number of free pages */ + int inactarg; /* target number of inactive pages */ + int wiredmax; /* max number of wired pages */ + + /* swap */ + int nswapdev; /* number of configured swap devices in system */ + int swpages; /* number of PAGE_SIZE'ed swap pages */ + int swpginuse; /* number of swap pages in use */ + int nswget; /* number of times fault calls uvm_swap_get() */ + int nanon; /* number total of anon's in system */ + int nfreeanon; /* number of free anon's */ + + /* stat counters */ + int faults; /* page fault count */ + int traps; /* trap count */ + int intrs; /* interrupt count */ + int swtch; /* context switch count */ + int softs; /* software interrupt count */ + int syscalls; /* system calls */ + int pageins; /* pagein operation count */ + /* pageouts are in pdpageouts below */ + int swapins; /* swapins */ + int swapouts; /* swapouts */ + int pgswapin; /* pages swapped in */ + int pgswapout; /* pages swapped out */ + int forks; /* forks */ + int forks_ppwait; /* forks where parent waits */ + int forks_sharevm; /* forks where vmspace is shared */ + + /* fault subcounters */ + int fltnoram; /* number of times fault was out of ram */ + int fltnoanon; /* number of times fault was out of anons */ + int fltpgwait; /* number of times fault had to wait on a page */ + int fltpgrele; /* number of times fault found a released page */ + int fltrelck; /* number of times fault relock called */ + int fltrelckok; /* number of times fault relock is a success */ + int fltanget; /* number of times fault gets anon page */ + int fltanretry; /* number of times fault retrys an anon get */ + int fltamcopy; /* number of times fault clears "needs copy" */ + int fltnamap; /* number of times fault maps a neighbor anon page */ + int fltnomap; /* number of times fault maps a neighbor obj page */ + int fltlget; /* number of times fault does a locked pgo_get */ + int fltget; /* number of times fault does an unlocked get */ + int flt_anon; /* number of times fault anon (case 1a) */ + int flt_acow; /* number of times fault anon cow (case 1b) */ + int flt_obj; /* number of times fault is on object page (2a) */ + int flt_prcopy; /* number of times fault promotes with copy (2b) */ + int flt_przero; /* number of times fault promotes with zerofill (2b) */ + + /* daemon counters */ + int pdwoke; /* number of times daemon woke up */ + int pdrevs; /* number of times daemon rev'd clock hand */ + int pdswout; /* number of times daemon called for swapout */ + int pdfreed; /* number of pages daemon freed since boot */ + int pdscans; /* number of pages daemon scaned since boot */ + int pdanscan; /* number of anonymous pages scanned by daemon */ + int pdobscan; /* number of object pages scanned by daemon */ + int pdreact; /* number of pages daemon reactivated since boot */ + int 
pdbusy; /* number of times daemon found a busy page */ + int pdpageouts; /* number of times daemon started a pageout */ + int pdpending; /* number of times daemon got a pending pagout */ + int pddeact; /* number of pages daemon deactivates */ + + /* kernel memory objects: managed by uvm_km_kmemalloc() only! */ + struct uvm_object *kmem_object; + struct uvm_object *mb_object; +}; + + +extern struct uvmexp uvmexp; + +/* + * macros + */ + +/* zalloc zeros memory, alloc does not */ +#define uvm_km_zalloc(MAP,SIZE) uvm_km_alloc1(MAP,SIZE,TRUE) +#define uvm_km_alloc(MAP,SIZE) uvm_km_alloc1(MAP,SIZE,FALSE) + +/* + * typedefs + */ + +typedef unsigned int uvm_flag_t; +typedef int vm_fault_t; + +/* uvm_aobj.c */ +struct uvm_object *uao_create __P((vsize_t, int)); +void uao_detach __P((struct uvm_object *)); +void uao_reference __P((struct uvm_object *)); + +/* uvm_fault.c */ +int uvm_fault __P((vm_map_t, vaddr_t, + vm_fault_t, vm_prot_t)); + /* handle a page fault */ + +/* uvm_glue.c */ +#if defined(KGDB) +void uvm_chgkprot __P((caddr_t, size_t, int)); +#endif +void uvm_fork __P((struct proc *, struct proc *, boolean_t)); +void uvm_exit __P((struct proc *)); +void uvm_init_limits __P((struct proc *)); +boolean_t uvm_kernacc __P((caddr_t, size_t, int)); +__dead void uvm_scheduler __P((void)) __attribute__((noreturn)); +void uvm_swapin __P((struct proc *)); +boolean_t uvm_useracc __P((caddr_t, size_t, int)); +void uvm_vslock __P((struct proc *, caddr_t, size_t)); +void uvm_vsunlock __P((struct proc *, caddr_t, size_t)); + + +/* uvm_init.c */ +void uvm_init __P((void)); + /* init the uvm system */ + +/* uvm_io.c */ +int uvm_io __P((vm_map_t, struct uio *)); + +/* uvm_km.c */ +vaddr_t uvm_km_alloc1 __P((vm_map_t, vsize_t, boolean_t)); +void uvm_km_free __P((vm_map_t, vaddr_t, vsize_t)); +void uvm_km_free_wakeup __P((vm_map_t, vaddr_t, + vsize_t)); +vaddr_t uvm_km_kmemalloc __P((vm_map_t, struct uvm_object *, + vsize_t, int)); +struct vm_map *uvm_km_suballoc __P((vm_map_t, vaddr_t *, + vaddr_t *, vsize_t, boolean_t, + boolean_t, vm_map_t)); +vaddr_t uvm_km_valloc __P((vm_map_t, vsize_t)); +vaddr_t uvm_km_valloc_wait __P((vm_map_t, vsize_t)); +vaddr_t uvm_km_alloc_poolpage1 __P((vm_map_t, + struct uvm_object *, boolean_t)); +void uvm_km_free_poolpage1 __P((vm_map_t, vaddr_t)); + +#define uvm_km_alloc_poolpage(waitok) uvm_km_alloc_poolpage1(kmem_map, \ + uvmexp.kmem_object, (waitok)) +#define uvm_km_free_poolpage(addr) uvm_km_free_poolpage1(kmem_map, (addr)) + +/* uvm_map.c */ +int uvm_map __P((vm_map_t, vaddr_t *, vsize_t, + struct uvm_object *, vaddr_t, uvm_flag_t)); +int uvm_map_pageable __P((vm_map_t, vaddr_t, + vaddr_t, boolean_t)); +boolean_t uvm_map_checkprot __P((vm_map_t, vaddr_t, + vaddr_t, vm_prot_t)); +int uvm_map_protect __P((vm_map_t, vaddr_t, + vaddr_t, vm_prot_t, boolean_t)); +struct vmspace *uvmspace_alloc __P((vaddr_t, vaddr_t, + boolean_t)); +void uvmspace_init __P((struct vmspace *, struct pmap *, + vaddr_t, vaddr_t, boolean_t)); +void uvmspace_exec __P((struct proc *)); +struct vmspace *uvmspace_fork __P((struct vmspace *)); +void uvmspace_free __P((struct vmspace *)); +void uvmspace_share __P((struct proc *, struct proc *)); +void uvmspace_unshare __P((struct proc *)); + + +/* uvm_meter.c */ +void uvm_meter __P((void)); +int uvm_sysctl __P((int *, u_int, void *, size_t *, + void *, size_t, struct proc *)); +void uvm_total __P((struct vmtotal *)); + +/* uvm_mmap.c */ +int uvm_mmap __P((vm_map_t, vaddr_t *, vsize_t, + vm_prot_t, vm_prot_t, int, + caddr_t, vaddr_t)); + +/* uvm_page.c */ 
+struct vm_page *uvm_pagealloc_strat __P((struct uvm_object *, + vaddr_t, struct vm_anon *, int, int)); +#define uvm_pagealloc(obj, off, anon) \ + uvm_pagealloc_strat((obj), (off), (anon), UVM_PGA_STRAT_NORMAL, 0) +void uvm_pagerealloc __P((struct vm_page *, + struct uvm_object *, vaddr_t)); +/* Actually, uvm_page_physload takes PF#s which need their own type */ +void uvm_page_physload __P((vaddr_t, vaddr_t, + vaddr_t, vaddr_t, int)); +void uvm_setpagesize __P((void)); + +/* uvm_pdaemon.c */ +void uvm_pageout __P((void)); + +/* uvm_pglist.c */ +int uvm_pglistalloc __P((psize_t, paddr_t, + paddr_t, paddr_t, paddr_t, + struct pglist *, int, int)); +void uvm_pglistfree __P((struct pglist *)); + +/* uvm_swap.c */ +void uvm_swap_init __P((void)); + +/* uvm_unix.c */ +int uvm_coredump __P((struct proc *, struct vnode *, + struct ucred *, struct core *)); +int uvm_grow __P((struct proc *, vaddr_t)); + +/* uvm_user.c */ +int uvm_deallocate __P((vm_map_t, vaddr_t, vsize_t)); + +/* uvm_vnode.c */ +void uvm_vnp_setsize __P((struct vnode *, u_quad_t)); +void uvm_vnp_sync __P((struct mount *)); +void uvm_vnp_terminate __P((struct vnode *)); + /* terminate a uvm/uvn object */ +boolean_t uvm_vnp_uncache __P((struct vnode *)); +struct uvm_object *uvn_attach __P((void *, vm_prot_t)); + +#endif /* _UVM_UVM_EXTERN_H_ */ + diff --git a/sys/uvm/uvm_fault.c b/sys/uvm/uvm_fault.c new file mode 100644 index 00000000000..10978e8c14e --- /dev/null +++ b/sys/uvm/uvm_fault.c @@ -0,0 +1,1747 @@ +/* $NetBSD: uvm_fault.c,v 1.19 1999/01/24 23:53:15 chuck Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * from: Id: uvm_fault.c,v 1.1.2.23 1998/02/06 05:29:05 chs Exp + */ + +/* + * uvm_fault.c: fault handler + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/user.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#include <uvm/uvm.h> + +/* + * + * a word on page faults: + * + * types of page faults we handle: + * + * CASE 1: upper layer faults CASE 2: lower layer faults + * + * CASE 1A CASE 1B CASE 2A CASE 2B + * read/write1 write>1 read/write +-cow_write/zero + * | | | | + * +--|--+ +--|--+ +-----+ + | + | +-----+ + * amap | V | | ----------->new| | | | ^ | + * +-----+ +-----+ +-----+ + | + | +--|--+ + * | | | + * +-----+ +-----+ +--|--+ | +--|--+ + * uobj | d/c | | d/c | | V | +----| | + * +-----+ +-----+ +-----+ +-----+ + * + * d/c = don't care + * + * case [0]: layerless fault + * no amap or uobj is present. this is an error. + * + * case [1]: upper layer fault [anon active] + * 1A: [read] or [write with anon->an_ref == 1] + * I/O takes place in top level anon and uobj is not touched. + * 1B: [write with anon->an_ref > 1] + * new anon is alloc'd and data is copied off ["COW"] + * + * case [2]: lower layer fault [uobj] + * 2A: [read on non-NULL uobj] or [write to non-copy_on_write area] + * I/O takes place directly in object. + * 2B: [write to copy_on_write] or [read on NULL uobj] + * data is "promoted" from uobj to a new anon. + * if uobj is null, then we zero fill. + * + * we follow the standard UVM locking protocol ordering: + * + * MAPS => AMAP => UOBJ => ANON => PAGE QUEUES (PQ) + * we hold a PG_BUSY page if we unlock for I/O + * + * + * the code is structured as follows: + * + * - init the "IN" params in the ufi structure + * ReFault: + * - do lookups [locks maps], check protection, handle needs_copy + * - check for case 0 fault (error) + * - establish "range" of fault + * - if we have an amap lock it and extract the anons + * - if sequential advice deactivate pages behind us + * - at the same time check pmap for unmapped areas and anon for pages + * that we could map in (and do map it if found) + * - check object for resident pages that we could map in + * - if (case 2) goto Case2 + * - >>> handle case 1 + * - ensure source anon is resident in RAM + * - if case 1B alloc new anon and copy from source + * - map the correct page in + * Case2: + * - >>> handle case 2 + * - ensure source page is resident (if uobj) + * - if case 2B alloc new anon and copy from source (could be zero + * fill if uobj == NULL) + * - map the correct page in + * - done! + * + * note on paging: + * if we have to do I/O we place a PG_BUSY page in the correct object, + * unlock everything, and do the I/O. when I/O is done we must reverify + * the state of the world before assuming that our data structures are + * valid. [because mappings could change while the map is unlocked] + * + * alternative 1: unbusy the page in question and restart the page fault + * from the top (ReFault). this is easy but does not take advantage + * of the information that we already have from our previous lookup, + * although it is possible that the "hints" in the vm_map will help here. + * + * alternative 2: the system already keeps track of a "version" number of + * a map. [i.e. every time you write-lock a map (e.g. to change a + * mapping) you bump the version number up by one...] so, we can save + * the version number of the map before we release the lock and start I/O. 
+ * then when I/O is done we can relock and check the version numbers + * to see if anything changed. this might save us some over 1 because + * we don't have to unbusy the page and may be less compares(?). + * + * alternative 3: put in backpointers or a way to "hold" part of a map + * in place while I/O is in progress. this could be complex to + * implement (especially with structures like amap that can be referenced + * by multiple map entries, and figuring out what should wait could be + * complex as well...). + * + * given that we are not currently multiprocessor or multithreaded we might + * as well choose alternative 2 now. maybe alternative 3 would be useful + * in the future. XXX keep in mind for future consideration//rechecking. + */ + +/* + * local data structures + */ + +struct uvm_advice { + int advice; + int nback; + int nforw; +}; + +/* + * page range array: + * note: index in array must match "advice" value + * XXX: borrowed numbers from freebsd. do they work well for us? + */ + +static struct uvm_advice uvmadvice[] = { + { MADV_NORMAL, 3, 4 }, + { MADV_RANDOM, 0, 0 }, + { MADV_SEQUENTIAL, 8, 7}, +}; + +#define UVM_MAXRANGE 16 /* must be max() of nback+nforw+1 */ + +/* + * private prototypes + */ + +static void uvmfault_amapcopy __P((struct uvm_faultinfo *)); +static __inline void uvmfault_anonflush __P((struct vm_anon **, int)); + +/* + * inline functions + */ + +/* + * uvmfault_anonflush: try and deactivate pages in specified anons + * + * => does not have to deactivate page if it is busy + */ + +static __inline void +uvmfault_anonflush(anons, n) + struct vm_anon **anons; + int n; +{ + int lcv; + struct vm_page *pg; + + for (lcv = 0 ; lcv < n ; lcv++) { + if (anons[lcv] == NULL) + continue; + simple_lock(&anons[lcv]->an_lock); + pg = anons[lcv]->u.an_page; + if (pg && (pg->flags & PG_BUSY) == 0 && pg->loan_count == 0) { + uvm_lock_pageq(); + if (pg->wire_count == 0) { + pmap_page_protect(PMAP_PGARG(pg), VM_PROT_NONE); + uvm_pagedeactivate(pg); + } + uvm_unlock_pageq(); + } + simple_unlock(&anons[lcv]->an_lock); + } +} + +/* + * normal functions + */ + +/* + * uvmfault_amapcopy: clear "needs_copy" in a map. + * + * => called with VM data structures unlocked (usually, see below) + * => we get a write lock on the maps and clear needs_copy for a VA + * => if we are out of RAM we sleep (waiting for more) + */ + +static void +uvmfault_amapcopy(ufi) + struct uvm_faultinfo *ufi; +{ + + /* + * while we haven't done the job + */ + + while (1) { + + /* + * no mapping? give up. + */ + + if (uvmfault_lookup(ufi, TRUE) == FALSE) + return; + + /* + * copy if needed. + */ + + if (UVM_ET_ISNEEDSCOPY(ufi->entry)) + amap_copy(ufi->map, ufi->entry, M_NOWAIT, TRUE, + ufi->orig_rvaddr, ufi->orig_rvaddr + 1); + + /* + * didn't work? must be out of RAM. unlock and sleep. + */ + + if (UVM_ET_ISNEEDSCOPY(ufi->entry)) { + uvmfault_unlockmaps(ufi, TRUE); + uvm_wait("fltamapcopy"); + continue; + } + + /* + * got it! unlock and return. + */ + + uvmfault_unlockmaps(ufi, TRUE); + return; + } + /*NOTREACHED*/ +} + +/* + * uvmfault_anonget: get data in an anon into a non-busy, non-released + * page in that anon. + * + * => maps, amap, and anon locked by caller. + * => if we fail (result != VM_PAGER_OK) we unlock everything. + * => if we are successful, we return with everything still locked. + * => we don't move the page on the queues [gets moved later] + * => if we allocate a new page [we_own], it gets put on the queues. 
+ * either way, the result is that the page is on the queues at return time + * => for pages which are on loan from a uvm_object (and thus are not + * owned by the anon): if successful, we return with the owning object + * locked. the caller must unlock this object when it unlocks everything + * else. + */ + +int uvmfault_anonget(ufi, amap, anon) + struct uvm_faultinfo *ufi; + struct vm_amap *amap; + struct vm_anon *anon; +{ + boolean_t we_own; /* we own anon's page? */ + boolean_t locked; /* did we relock? */ + struct vm_page *pg; + int result; + UVMHIST_FUNC("uvmfault_anonget"); UVMHIST_CALLED(maphist); + + result = 0; /* XXX shut up gcc */ + uvmexp.fltanget++; + /* bump rusage counters */ + if (anon->u.an_page) + curproc->p_addr->u_stats.p_ru.ru_minflt++; + else + curproc->p_addr->u_stats.p_ru.ru_majflt++; + + /* + * loop until we get it, or fail. + */ + + while (1) { + + we_own = FALSE; /* TRUE if we set PG_BUSY on a page */ + pg = anon->u.an_page; + + /* + * if there is a resident page and it is loaned, then anon + * may not own it. call out to uvm_anon_lockpage() to ensure + * the real owner of the page has been identified and locked. + */ + + if (pg && pg->loan_count) + pg = uvm_anon_lockloanpg(anon); + + /* + * page there? make sure it is not busy/released. + */ + + if (pg) { + + /* + * at this point, if the page has a uobject [meaning + * we have it on loan], then that uobject is locked + * by us! if the page is busy, we drop all the + * locks (including uobject) and try again. + */ + + if ((pg->flags & (PG_BUSY|PG_RELEASED)) == 0) { + UVMHIST_LOG(maphist, "<- OK",0,0,0,0); + return (VM_PAGER_OK); + } + pg->flags |= PG_WANTED; + uvmexp.fltpgwait++; + + /* + * the last unlock must be an atomic unlock+wait on + * the owner of page + */ + if (pg->uobject) { /* owner is uobject ? */ + uvmfault_unlockall(ufi, amap, NULL, anon); + UVMHIST_LOG(maphist, " unlock+wait on uobj",0, + 0,0,0); + UVM_UNLOCK_AND_WAIT(pg, + &pg->uobject->vmobjlock, + FALSE, "anonget1",0); + } else { + /* anon owns page */ + uvmfault_unlockall(ufi, amap, NULL, NULL); + UVMHIST_LOG(maphist, " unlock+wait on anon",0, + 0,0,0); + UVM_UNLOCK_AND_WAIT(pg,&anon->an_lock,0, + "anonget2",0); + } + /* ready to relock and try again */ + + } else { + + /* + * no page, we must try and bring it in. + */ + pg = uvm_pagealloc(NULL, 0, anon); + + if (pg == NULL) { /* out of RAM. */ + + uvmfault_unlockall(ufi, amap, NULL, anon); + uvmexp.fltnoram++; + UVMHIST_LOG(maphist, " noram -- UVM_WAIT",0, + 0,0,0); + uvm_wait("flt_noram1"); + /* ready to relock and try again */ + + } else { + + /* we set the PG_BUSY bit */ + we_own = TRUE; + uvmfault_unlockall(ufi, amap, NULL, anon); + + /* + * we are passing a PG_BUSY+PG_FAKE+PG_CLEAN + * page into the uvm_swap_get function with + * all data structures unlocked. note that + * it is ok to read an_swslot here because + * we hold PG_BUSY on the page. + */ + uvmexp.pageins++; + result = uvm_swap_get(pg, anon->an_swslot, + PGO_SYNCIO); + + /* + * we clean up after the i/o below in the + * "we_own" case + */ + /* ready to relock and try again */ + } + } + + /* + * now relock and try again + */ + + locked = uvmfault_relock(ufi); + if (locked) { + amap_lock(amap); + } + if (locked || we_own) + simple_lock(&anon->an_lock); + + /* + * if we own the page (i.e. we set PG_BUSY), then we need + * to clean up after the I/O. there are three cases to + * consider: + * [1] page released during I/O: free anon and ReFault. + * [2] I/O not OK. free the page and cause the fault + * to fail. + * [3] I/O OK! 
activate the page and sync with the + * non-we_own case (i.e. drop anon lock if not locked). + */ + + if (we_own) { + + if (pg->flags & PG_WANTED) { + /* still holding object lock */ + thread_wakeup(pg); + } + /* un-busy! */ + pg->flags &= ~(PG_WANTED|PG_BUSY|PG_FAKE); + UVM_PAGE_OWN(pg, NULL); + + /* + * if we were RELEASED during I/O, then our anon is + * no longer part of an amap. we need to free the + * anon and try again. + */ + if (pg->flags & PG_RELEASED) { + pmap_page_protect(PMAP_PGARG(pg), + VM_PROT_NONE); /* to be safe */ + simple_unlock(&anon->an_lock); + uvm_anfree(anon); /* frees page for us */ + if (locked) + uvmfault_unlockall(ufi, amap, NULL, NULL); + uvmexp.fltpgrele++; + UVMHIST_LOG(maphist, "<- REFAULT", 0,0,0,0); + return (VM_PAGER_REFAULT); /* refault! */ + } + + if (result != VM_PAGER_OK) { +#ifdef DIAGNOSTIC + if (result == VM_PAGER_PEND) + panic("uvmfault_anonget: got PENDING for non-async I/O"); +#endif + /* remove page from anon */ + anon->u.an_page = NULL; + + /* + * note: page was never !PG_BUSY, so it + * can't be mapped and thus no need to + * pmap_page_protect it... + */ + uvm_lock_pageq(); + uvm_pagefree(pg); + uvm_unlock_pageq(); + + if (locked) + uvmfault_unlockall(ufi, amap, NULL, + anon); + else + simple_unlock(&anon->an_lock); + UVMHIST_LOG(maphist, "<- ERROR", 0,0,0,0); + return (VM_PAGER_ERROR); + } + + /* + * must be OK, clear modify (already PG_CLEAN) + * and activate + */ + pmap_clear_modify(PMAP_PGARG(pg)); + uvm_lock_pageq(); + uvm_pageactivate(pg); + uvm_unlock_pageq(); + if (!locked) + simple_unlock(&anon->an_lock); + } + + /* + * we were not able to relock. restart fault. + */ + + if (!locked) { + UVMHIST_LOG(maphist, "<- REFAULT", 0,0,0,0); + return (VM_PAGER_REFAULT); + } + + /* + * verify no one has touched the amap and moved the anon on us. + */ + + if (amap_lookup(&ufi->entry->aref, + ufi->orig_rvaddr - ufi->entry->start) != anon) { + + uvmfault_unlockall(ufi, amap, NULL, anon); + UVMHIST_LOG(maphist, "<- REFAULT", 0,0,0,0); + return (VM_PAGER_REFAULT); + } + + /* + * try it again! + */ + + uvmexp.fltanretry++; + continue; + + } /* while (1) */ + + /*NOTREACHED*/ +} + +/* + * F A U L T - m a i n e n t r y p o i n t + */ + +/* + * uvm_fault: page fault handler + * + * => called from MD code to resolve a page fault + * => VM data structures usually should be unlocked. however, it is + * possible to call here with the main map locked if the caller + * gets a write lock, sets it recusive, and then calls us (c.f. + * uvm_map_pageable). this should be avoided because it keeps + * the map locked off during I/O. + */ + +int +uvm_fault(orig_map, vaddr, fault_type, access_type) + vm_map_t orig_map; + vaddr_t vaddr; + vm_fault_t fault_type; + vm_prot_t access_type; +{ + struct uvm_faultinfo ufi; + vm_prot_t enter_prot; + boolean_t wired, narrow, promote, locked, shadowed; + int npages, nback, nforw, centeridx, result, lcv, gotpages; + vaddr_t startva, objaddr, currva, offset; + paddr_t pa; + struct vm_amap *amap; + struct uvm_object *uobj; + struct vm_anon *anons_store[UVM_MAXRANGE], **anons, *anon, *oanon; + struct vm_page *pages[UVM_MAXRANGE], *pg, *uobjpage; + UVMHIST_FUNC("uvm_fault"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist, "(map=0x%x, vaddr=0x%x, ft=%d, at=%d)", + orig_map, vaddr, fault_type, access_type); + + anon = NULL; /* XXX: shut up gcc */ + + uvmexp.faults++; /* XXX: locking? 
*/ + + /* + * init the IN parameters in the ufi + */ + + ufi.orig_map = orig_map; + ufi.orig_rvaddr = trunc_page(vaddr); + ufi.orig_size = PAGE_SIZE; /* can't get any smaller than this */ + if (fault_type == VM_FAULT_WIRE) + narrow = TRUE; /* don't look for neighborhood + * pages on wire */ + else + narrow = FALSE; /* normal fault */ + + /* + * "goto ReFault" means restart the page fault from ground zero. + */ +ReFault: + + /* + * lookup and lock the maps + */ + + if (uvmfault_lookup(&ufi, FALSE) == FALSE) { + UVMHIST_LOG(maphist, "<- no mapping @ 0x%x", vaddr, 0,0,0); + return (KERN_INVALID_ADDRESS); + } + /* locked: maps(read) */ + + /* + * check protection + */ + + if ((ufi.entry->protection & access_type) != access_type) { + UVMHIST_LOG(maphist, + "<- protection failure (prot=0x%x, access=0x%x)", + ufi.entry->protection, access_type, 0, 0); + uvmfault_unlockmaps(&ufi, FALSE); + return (KERN_PROTECTION_FAILURE); + } + + /* + * "enter_prot" is the protection we want to enter the page in at. + * for certain pages (e.g. copy-on-write pages) this protection can + * be more strict than ufi.entry->protection. "wired" means either + * the entry is wired or we are fault-wiring the pg. + */ + + enter_prot = ufi.entry->protection; + wired = (ufi.entry->wired_count != 0) || (fault_type == VM_FAULT_WIRE); + if (wired) + access_type = enter_prot; /* full access for wired */ + + /* + * handle "needs_copy" case. if we need to copy the amap we will + * have to drop our readlock and relock it with a write lock. (we + * need a write lock to change anything in a map entry [e.g. + * needs_copy]). + */ + + if (UVM_ET_ISNEEDSCOPY(ufi.entry)) { + if ((access_type & VM_PROT_WRITE) || + (ufi.entry->object.uvm_obj == NULL)) { + /* need to clear */ + UVMHIST_LOG(maphist, + " need to clear needs_copy and refault",0,0,0,0); + uvmfault_unlockmaps(&ufi, FALSE); + uvmfault_amapcopy(&ufi); + uvmexp.fltamcopy++; + goto ReFault; + + } else { + + /* + * ensure that we pmap_enter page R/O since + * needs_copy is still true + */ + enter_prot = enter_prot & ~VM_PROT_WRITE; + + } + } + + /* + * identify the players + */ + + amap = ufi.entry->aref.ar_amap; /* top layer */ + uobj = ufi.entry->object.uvm_obj; /* bottom layer */ + + /* + * check for a case 0 fault. if nothing backing the entry then + * error now. + */ + + if (amap == NULL && uobj == NULL) { + uvmfault_unlockmaps(&ufi, FALSE); + UVMHIST_LOG(maphist,"<- no backing store, no overlay",0,0,0,0); + return (KERN_INVALID_ADDRESS); + } + + /* + * establish range of interest based on advice from mapper + * and then clip to fit map entry. note that we only want + * to do this the first time through the fault. if we + * ReFault we will disable this by setting "narrow" to true. + */ + + if (narrow == FALSE) { + + /* wide fault (!narrow) */ +#ifdef DIAGNOSTIC + if (uvmadvice[ufi.entry->advice].advice != ufi.entry->advice) + panic("fault: advice mismatch!"); +#endif + nback = min(uvmadvice[ufi.entry->advice].nback, + (ufi.orig_rvaddr - ufi.entry->start) >> PAGE_SHIFT); + startva = ufi.orig_rvaddr - (nback << PAGE_SHIFT); + nforw = min(uvmadvice[ufi.entry->advice].nforw, + ((ufi.entry->end - ufi.orig_rvaddr) >> + PAGE_SHIFT) - 1); + /* + * note: "-1" because we don't want to count the + * faulting page as forw + */ + npages = nback + nforw + 1; + centeridx = nback; + + narrow = FALSE; /* ensure only once per-fault */ + + } else { + + /* narrow fault! 
*/ + nback = nforw = 0; + startva = ufi.orig_rvaddr; + npages = 1; + centeridx = 0; + + } + + /* locked: maps(read) */ + UVMHIST_LOG(maphist, " narrow=%d, back=%d, forw=%d, startva=0x%x", + narrow, nback, nforw, startva); + UVMHIST_LOG(maphist, " entry=0x%x, amap=0x%x, obj=0x%x", ufi.entry, + amap, uobj, 0); + + /* + * if we've got an amap, lock it and extract current anons. + */ + + if (amap) { + amap_lock(amap); + anons = anons_store; + amap_lookups(&ufi.entry->aref, startva - ufi.entry->start, + anons, npages); + } else { + anons = NULL; /* to be safe */ + } + + /* locked: maps(read), amap(if there) */ + + /* + * for MADV_SEQUENTIAL mappings we want to deactivate the back pages + * now and then forget about them (for the rest of the fault). + */ + + if (ufi.entry->advice == MADV_SEQUENTIAL) { + + UVMHIST_LOG(maphist, " MADV_SEQUENTIAL: flushing backpages", + 0,0,0,0); + /* flush back-page anons? */ + if (amap) + uvmfault_anonflush(anons, nback); + + /* flush object? */ + if (uobj) { + objaddr = + (startva - ufi.entry->start) + ufi.entry->offset; + simple_lock(&uobj->vmobjlock); + (void) uobj->pgops->pgo_flush(uobj, objaddr, objaddr + + (nback << PAGE_SHIFT), PGO_DEACTIVATE); + simple_unlock(&uobj->vmobjlock); + } + + /* now forget about the backpages */ + if (amap) + anons += nback; + startva = startva + (nback << PAGE_SHIFT); + npages -= nback; + nback = centeridx = 0; + } + + /* locked: maps(read), amap(if there) */ + + /* + * map in the backpages and frontpages we found in the amap in hopes + * of preventing future faults. we also init the pages[] array as + * we go. + */ + + currva = startva; + shadowed = FALSE; + for (lcv = 0 ; lcv < npages ; lcv++, currva += PAGE_SIZE) { + + /* + * dont play with VAs that are already mapped + * except for center) + * XXX: return value of pmap_extract disallows PA 0 + */ + if (lcv != centeridx) { + pa = pmap_extract(ufi.orig_map->pmap, currva); + if (pa != NULL) { + pages[lcv] = PGO_DONTCARE; + continue; + } + } + + /* + * unmapped or center page. check if any anon at this level. + */ + if (amap == NULL || anons[lcv] == NULL) { + pages[lcv] = NULL; + continue; + } + + /* + * check for present page and map if possible. re-activate it. + */ + + pages[lcv] = PGO_DONTCARE; + if (lcv == centeridx) { /* save center for later! */ + shadowed = TRUE; + continue; + } + anon = anons[lcv]; + simple_lock(&anon->an_lock); + /* ignore loaned pages */ + if (anon->u.an_page && anon->u.an_page->loan_count == 0 && + (anon->u.an_page->flags & (PG_RELEASED|PG_BUSY)) == 0) { + uvm_lock_pageq(); + uvm_pageactivate(anon->u.an_page); /* reactivate */ + uvm_unlock_pageq(); + UVMHIST_LOG(maphist, + " MAPPING: n anon: pm=0x%x, va=0x%x, pg=0x%x", + ufi.orig_map->pmap, currva, anon->u.an_page, 0); + uvmexp.fltnamap++; + pmap_enter(ufi.orig_map->pmap, currva, + VM_PAGE_TO_PHYS(anon->u.an_page), + (anon->an_ref > 1) ? VM_PROT_READ : enter_prot, + (ufi.entry->wired_count != 0)); + } + simple_unlock(&anon->an_lock); + } + + /* locked: maps(read), amap(if there) */ + /* (shadowed == TRUE) if there is an anon at the faulting address */ + UVMHIST_LOG(maphist, " shadowed=%d, will_get=%d", shadowed, + (uobj && shadowed == FALSE),0,0); + + /* + * note that if we are really short of RAM we could sleep in the above + * call to pmap_enter with everything locked. bad? 
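Editor's aside on the "wide fault" range computation above (not part of the imported diff): the per-advice lookbehind/lookahead from uvmadvice[] is clipped against the map entry so the fault never reaches outside it, and the faulting page itself is not counted as a forward page. The sketch below redoes that arithmetic with made-up addresses.

/*
 * Editor's sketch: computing nback/nforw/startva/npages/centeridx as in
 * the wide-fault branch of uvm_fault() above.  Addresses, the MIN macro
 * and main() are illustrative only.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define MIN(a, b)  ((a) < (b) ? (a) : (b))

struct uvm_advice { int advice, nback, nforw; };

/* index must match the MADV_* value, as in uvmadvice[] above */
static const struct uvm_advice uvmadvice[] = {
	{ 0 /* MADV_NORMAL */,     3, 4 },
	{ 1 /* MADV_RANDOM */,     0, 0 },
	{ 2 /* MADV_SEQUENTIAL */, 8, 7 },
};

int
main(void)
{
	unsigned long entry_start = 0x20000, entry_end = 0x26000;
	unsigned long rvaddr = 0x21000;		/* faulting page, rounded */
	int advice = 0;				/* MADV_NORMAL */

	long nback = MIN(uvmadvice[advice].nback,
	    (long)((rvaddr - entry_start) >> PAGE_SHIFT));
	long nforw = MIN(uvmadvice[advice].nforw,
	    (long)(((entry_end - rvaddr) >> PAGE_SHIFT) - 1));
	unsigned long startva = rvaddr - ((unsigned long)nback << PAGE_SHIFT);
	long npages = nback + nforw + 1;
	long centeridx = nback;

	/* prints: startva=0x20000 nback=1 nforw=4 npages=6 centeridx=1 */
	printf("startva=0x%lx nback=%ld nforw=%ld npages=%ld centeridx=%ld\n",
	    startva, nback, nforw, npages, centeridx);
	return 0;
}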
+ * XXXCDC: this is fixed in PMAP_NEW (no sleep alloc's in pmap) + */ + + /* + * if the desired page is not shadowed by the amap and we have a + * backing object, then we check to see if the backing object would + * prefer to handle the fault itself (rather than letting us do it + * with the usual pgo_get hook). the backing object signals this by + * providing a pgo_fault routine. + */ + + if (uobj && shadowed == FALSE && uobj->pgops->pgo_fault != NULL) { + + simple_lock(&uobj->vmobjlock); + + /* locked: maps(read), amap (if there), uobj */ + result = uobj->pgops->pgo_fault(&ufi, startva, pages, npages, + centeridx, fault_type, access_type, + PGO_LOCKED); + /* locked: nothing, pgo_fault has unlocked everything */ + + if (result == VM_PAGER_OK) + return (KERN_SUCCESS); /* pgo_fault did pmap enter */ + else if (result == VM_PAGER_REFAULT) + goto ReFault; /* try again! */ + else + return (KERN_PROTECTION_FAILURE); + } + + /* + * now, if the desired page is not shadowed by the amap and we have + * a backing object that does not have a special fault routine, then + * we ask (with pgo_get) the object for resident pages that we care + * about and attempt to map them in. we do not let pgo_get block + * (PGO_LOCKED). + * + * ("get" has the option of doing a pmap_enter for us) + */ + + if (uobj && shadowed == FALSE) { + simple_lock(&uobj->vmobjlock); + + /* locked (!shadowed): maps(read), amap (if there), uobj */ + /* + * the following call to pgo_get does _not_ change locking state + */ + + uvmexp.fltlget++; + gotpages = npages; + result = uobj->pgops->pgo_get(uobj, ufi.entry->offset + + (startva - ufi.entry->start), + pages, &gotpages, centeridx, + UVM_ET_ISCOPYONWRITE(ufi.entry) ? + VM_PROT_READ : access_type, + ufi.entry->advice, PGO_LOCKED); + + /* + * check for pages to map, if we got any + */ + + uobjpage = NULL; + + if (gotpages) { + currva = startva; + for (lcv = 0 ; lcv < npages ; + lcv++, currva += PAGE_SIZE) { + + if (pages[lcv] == NULL || + pages[lcv] == PGO_DONTCARE) + continue; + +#ifdef DIAGNOSTIC + /* + * pager sanity check: pgo_get with + * PGO_LOCKED should never return a + * released page to us. + */ + if (pages[lcv]->flags & PG_RELEASED) + panic("uvm_fault: pgo_get PGO_LOCKED gave us a RELEASED page"); +#endif + + /* + * if center page is resident and not + * PG_BUSY|PG_RELEASED then pgo_get + * made it PG_BUSY for us and gave + * us a handle to it. remember this + * page as "uobjpage." (for later use). + */ + + if (lcv == centeridx) { + uobjpage = pages[lcv]; + UVMHIST_LOG(maphist, " got uobjpage (0x%x) with locked get", + uobjpage, 0,0,0); + continue; + } + + /* + * note: calling pgo_get with locked data + * structures returns us pages which are + * neither busy nor released, so we don't + * need to check for this. we can just + * directly enter the page (after moving it + * to the head of the active queue [useful?]). + */ + + uvm_lock_pageq(); + uvm_pageactivate(pages[lcv]); /* reactivate */ + uvm_unlock_pageq(); + UVMHIST_LOG(maphist, + " MAPPING: n obj: pm=0x%x, va=0x%x, pg=0x%x", + ufi.orig_map->pmap, currva, pages[lcv], 0); + uvmexp.fltnomap++; + pmap_enter(ufi.orig_map->pmap, currva, + VM_PAGE_TO_PHYS(pages[lcv]), + UVM_ET_ISCOPYONWRITE(ufi.entry) ? + VM_PROT_READ : enter_prot, wired); + + /* + * NOTE: page can't be PG_WANTED or PG_RELEASED + * because we've held the lock the whole time + * we've had the handle. + */ + pages[lcv]->flags &= ~(PG_BUSY); /* un-busy! */ + UVM_PAGE_OWN(pages[lcv], NULL); + + /* done! 
*/ + } /* for "lcv" loop */ + } /* "gotpages" != 0 */ + + /* note: object still _locked_ */ + } else { + + uobjpage = NULL; + + } + + /* locked (shadowed): maps(read), amap */ + /* locked (!shadowed): maps(read), amap(if there), + uobj(if !null), uobjpage(if !null) */ + + /* + * note that at this point we are done with any front or back pages. + * we are now going to focus on the center page (i.e. the one we've + * faulted on). if we have faulted on the top (anon) layer + * [i.e. case 1], then the anon we want is anons[centeridx] (we have + * not touched it yet). if we have faulted on the bottom (uobj) + * layer [i.e. case 2] and the page was both present and available, + * then we've got a pointer to it as "uobjpage" and we've already + * made it BUSY. + */ + + /* + * there are four possible cases we must address: 1A, 1B, 2A, and 2B + */ + + /* + * redirect case 2: if we are not shadowed, go to case 2. + */ + + if (shadowed == FALSE) + goto Case2; + + /* locked: maps(read), amap */ + + /* + * handle case 1: fault on an anon in our amap + */ + + anon = anons[centeridx]; + UVMHIST_LOG(maphist, " case 1 fault: anon=0x%x", anon, 0,0,0); + simple_lock(&anon->an_lock); + + /* locked: maps(read), amap, anon */ + + /* + * no matter if we have case 1A or case 1B we are going to need to + * have the anon's memory resident. ensure that now. + */ + + /* + * let uvmfault_anonget do the dirty work. if it fails (!OK) it will + * unlock for us. if it is OK, locks are still valid and locked. + * also, if it is OK, then the anon's page is on the queues. + * if the page is on loan from a uvm_object, then anonget will + * lock that object for us if it does not fail. + */ + + result = uvmfault_anonget(&ufi, amap, anon); + + if (result == VM_PAGER_REFAULT) + goto ReFault; + + if (result == VM_PAGER_AGAIN) { + tsleep((caddr_t)&lbolt, PVM, "fltagain1", 0); + goto ReFault; + } + + if (result != VM_PAGER_OK) + return (KERN_PROTECTION_FAILURE); /* XXX??? */ + + /* + * uobj is non null if the page is on loan from an object (i.e. uobj) + */ + + uobj = anon->u.an_page->uobject; /* locked by anonget if !NULL */ + + /* locked: maps(read), amap, anon, uobj(if one) */ + + /* + * special handling for loaned pages + */ + if (anon->u.an_page->loan_count) { + + if ((access_type & VM_PROT_WRITE) == 0) { + + /* + * for read faults on loaned pages we just cap the + * protection at read-only. + */ + + enter_prot = enter_prot & ~VM_PROT_WRITE; + + } else { + /* + * note that we can't allow writes into a loaned page! + * + * if we have a write fault on a loaned page in an + * anon then we need to look at the anon's ref count. + * if it is greater than one then we are going to do + * a normal copy-on-write fault into a new anon (this + * is not a problem). however, if the reference count + * is one (a case where we would normally allow a + * write directly to the page) then we need to kill + * the loan before we continue. 
+ */ + + /* >1 case is already ok */ + if (anon->an_ref == 1) { + + /* get new un-owned replacement page */ + pg = uvm_pagealloc(NULL, 0, NULL); + if (pg == NULL) { + uvmfault_unlockall(&ufi, amap, uobj, + anon); + uvm_wait("flt_noram2"); + goto ReFault; + } + + /* + * copy data, kill loan, and drop uobj lock + * (if any) + */ + /* copy old -> new */ + uvm_pagecopy(anon->u.an_page, pg); + + /* force reload */ + pmap_page_protect(PMAP_PGARG(anon->u.an_page), + VM_PROT_NONE); + uvm_lock_pageq(); /* KILL loan */ + if (uobj) + /* if we were loaning */ + anon->u.an_page->loan_count--; + anon->u.an_page->uanon = NULL; + /* in case we owned */ + anon->u.an_page->pqflags &= ~PQ_ANON; + uvm_unlock_pageq(); + if (uobj) { + simple_unlock(&uobj->vmobjlock); + uobj = NULL; + } + + /* install new page in anon */ + anon->u.an_page = pg; + pg->uanon = anon; + pg->pqflags |= PQ_ANON; + pg->flags &= ~(PG_BUSY|PG_FAKE); + UVM_PAGE_OWN(pg, NULL); + + /* done! */ + } /* ref == 1 */ + } /* write fault */ + } /* loan count */ + + /* + * if we are case 1B then we will need to allocate a new blank + * anon to transfer the data into. note that we have a lock + * on anon, so no one can busy or release the page until we are done. + * also note that the ref count can't drop to zero here because + * it is > 1 and we are only dropping one ref. + * + * in the (hopefully very rare) case that we are out of RAM we + * will unlock, wait for more RAM, and refault. + * + * if we are out of anon VM we kill the process (XXX: could wait?). + */ + + if ((access_type & VM_PROT_WRITE) != 0 && anon->an_ref > 1) { + + UVMHIST_LOG(maphist, " case 1B: COW fault",0,0,0,0); + uvmexp.flt_acow++; + oanon = anon; /* oanon = old, locked anon */ + anon = uvm_analloc(); + if (anon) + pg = uvm_pagealloc(NULL, 0, anon); +#ifdef __GNUC__ + else + pg = NULL; /* XXX: gcc */ +#endif + + /* check for out of RAM */ + if (anon == NULL || pg == NULL) { + if (anon) + uvm_anfree(anon); + uvmfault_unlockall(&ufi, amap, uobj, oanon); + if (anon == NULL) { + UVMHIST_LOG(maphist, + "<- failed. out of VM",0,0,0,0); + uvmexp.fltnoanon++; + /* XXX: OUT OF VM, ??? */ + return (KERN_RESOURCE_SHORTAGE); + } + uvmexp.fltnoram++; + uvm_wait("flt_noram3"); /* out of RAM, wait for more */ + goto ReFault; + } + + /* got all resources, replace anon with nanon */ + + uvm_pagecopy(oanon->u.an_page, pg); /* pg now !PG_CLEAN */ + pg->flags &= ~(PG_BUSY|PG_FAKE); /* un-busy! new page */ + UVM_PAGE_OWN(pg, NULL); + amap_add(&ufi.entry->aref, ufi.orig_rvaddr - ufi.entry->start, + anon, 1); + + /* deref: can not drop to zero here by defn! */ + oanon->an_ref--; + + /* + * note: oanon still locked. anon is _not_ locked, but we + * have the sole references to in from amap which _is_ locked. + * thus, no one can get at it until we are done with it. + */ + + } else { + + uvmexp.flt_anon++; + oanon = anon; /* old, locked anon is same as anon */ + pg = anon->u.an_page; + if (anon->an_ref > 1) /* disallow writes to ref > 1 anons */ + enter_prot = enter_prot & ~VM_PROT_WRITE; + + } + + /* locked: maps(read), amap, anon */ + + /* + * now map the page in ... + * XXX: old fault unlocks object before pmap_enter. this seems + * suspect since some other thread could blast the page out from + * under us between the unlock and the pmap_enter. + */ + + UVMHIST_LOG(maphist, " MAPPING: anon: pm=0x%x, va=0x%x, pg=0x%x", + ufi.orig_map->pmap, ufi.orig_rvaddr, pg, 0); + pmap_enter(ufi.orig_map->pmap, ufi.orig_rvaddr, VM_PAGE_TO_PHYS(pg), + enter_prot, wired); + + /* + * ... and update the page queues. 
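Editor's aside on the case 1 handling above (not part of the imported diff): a write fault on an anon with more than one reference takes the 1B path (allocate a fresh anon, copy the page, drop one reference), while everything else maps the existing anon page, read-only when the anon is still shared. The sketch below captures just that decision; the structs and protection bits are simplified stand-ins.

/*
 * Editor's sketch: the case 1A / case 1B decision in uvm_fault().
 * struct anon, the PROT_* bits and resolve_case1() are illustrative
 * stand-ins, not the kernel's types.
 */
#include <stdio.h>

#define PROT_READ  0x1
#define PROT_WRITE 0x2

struct anon { int an_ref; };

static const char *
resolve_case1(struct anon *anon, int access_type, int *enter_prot)
{
	*enter_prot = PROT_READ | PROT_WRITE;	/* say the entry allows RW */

	if ((access_type & PROT_WRITE) && anon->an_ref > 1)
		return "1B: alloc new anon, uvm_pagecopy, oanon->an_ref--";

	if (anon->an_ref > 1)			/* shared anon, not writing */
		*enter_prot &= ~PROT_WRITE;	/* map it read-only */
	return "1A: map the existing anon page";
}

int
main(void)
{
	struct anon shared = { 2 }, lone = { 1 };
	int prot;

	printf("write, ref=2: %s\n", resolve_case1(&shared, PROT_WRITE, &prot));
	printf("read,  ref=2: %s (prot=0x%x)\n",
	    resolve_case1(&shared, PROT_READ, &prot), prot);
	printf("write, ref=1: %s (prot=0x%x)\n",
	    resolve_case1(&lone, PROT_WRITE, &prot), prot);
	return 0;
}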
+ */ + + uvm_lock_pageq(); + + if (fault_type == VM_FAULT_WIRE) { + uvm_pagewire(pg); + } else { + /* activate it */ + uvm_pageactivate(pg); + + } + + uvm_unlock_pageq(); + + /* + * done case 1! finish up by unlocking everything and returning success + */ + + uvmfault_unlockall(&ufi, amap, uobj, oanon); + return (KERN_SUCCESS); + + +Case2: + /* + * handle case 2: faulting on backing object or zero fill + */ + + /* + * locked: + * maps(read), amap(if there), uobj(if !null), uobjpage(if !null) + */ + + /* + * note that uobjpage can not be PGO_DONTCARE at this point. we now + * set uobjpage to PGO_DONTCARE if we are doing a zero fill. if we + * have a backing object, check and see if we are going to promote + * the data up to an anon during the fault. + */ + + if (uobj == NULL) { + uobjpage = PGO_DONTCARE; + promote = TRUE; /* always need anon here */ + } else { + /* assert(uobjpage != PGO_DONTCARE) */ + promote = (access_type & VM_PROT_WRITE) && + UVM_ET_ISCOPYONWRITE(ufi.entry); + } + UVMHIST_LOG(maphist, " case 2 fault: promote=%d, zfill=%d", + promote, (uobj == NULL), 0,0); + + /* + * if uobjpage is not null then we do not need to do I/O to get the + * uobjpage. + * + * if uobjpage is null, then we need to unlock and ask the pager to + * get the data for us. once we have the data, we need to reverify + * the state the world. we are currently not holding any resources. + */ + + if (uobjpage) { + /* update rusage counters */ + curproc->p_addr->u_stats.p_ru.ru_minflt++; + } else { + /* update rusage counters */ + curproc->p_addr->u_stats.p_ru.ru_majflt++; + + /* locked: maps(read), amap(if there), uobj */ + uvmfault_unlockall(&ufi, amap, NULL, NULL); + /* locked: uobj */ + + uvmexp.fltget++; + gotpages = 1; + result = uobj->pgops->pgo_get(uobj, + (ufi.orig_rvaddr - ufi.entry->start) + ufi.entry->offset, + &uobjpage, &gotpages, 0, + UVM_ET_ISCOPYONWRITE(ufi.entry) ? + VM_PROT_READ : access_type, + ufi.entry->advice, 0); + + /* locked: uobjpage(if result OK) */ + + /* + * recover from I/O + */ + + if (result != VM_PAGER_OK) { + +#ifdef DIAGNOSTIC + if (result == VM_PAGER_PEND) + panic("uvm_fault: pgo_get got PENDing on non-async I/O"); +#endif + + if (result == VM_PAGER_AGAIN) { + UVMHIST_LOG(maphist, " pgo_get says TRY AGAIN!",0,0,0,0); + tsleep((caddr_t)&lbolt, PVM, "fltagain2", 0); + goto ReFault; + } + + UVMHIST_LOG(maphist, "<- pgo_get failed (code %d)", + result, 0,0,0); + return (KERN_PROTECTION_FAILURE); /* XXX i/o error */ + } + + /* locked: uobjpage */ + + /* + * re-verify the state of the world by first trying to relock + * the maps. always relock the object. + */ + + locked = uvmfault_relock(&ufi); + if (locked && amap) + amap_lock(amap); + simple_lock(&uobj->vmobjlock); + + /* locked(locked): maps(read), amap(if !null), uobj, uobjpage */ + /* locked(!locked): uobj, uobjpage */ + + /* + * verify that the page has not be released and re-verify + * that amap slot is still free. if there is a problem, + * we unlock and clean up. + */ + + if ((uobjpage->flags & PG_RELEASED) != 0 || + (locked && amap && + amap_lookup(&ufi.entry->aref, + ufi.orig_rvaddr - ufi.entry->start))) { + if (locked) + uvmfault_unlockall(&ufi, amap, NULL, NULL); + locked = FALSE; + } + + /* + * didn't get the lock? release the page and retry. 
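Editor's aside on the case 2 setup above (not part of the imported diff): data is "promoted" into a new anon either when there is no backing object at all (zero fill) or when a write goes through a copy-on-write entry; a plain read from an object, or a write to a shared mapping, is satisfied directly from the object page. A minimal sketch of that predicate:

/*
 * Editor's sketch: the "promote" decision for case 2 faults.
 * case2_promote() and its arguments are illustrative only.
 */
#include <stdio.h>

#define PROT_WRITE 0x2

static int
case2_promote(int have_uobj, int copy_on_write, int access_type)
{
	if (!have_uobj)
		return 1;	/* no backing object: zero fill, case 2B */
	/* COW write also promotes (case 2B); everything else is case 2A */
	return (access_type & PROT_WRITE) && copy_on_write;
}

int
main(void)
{
	printf("no uobj, read       -> promote=%d\n", case2_promote(0, 0, 0));
	printf("uobj, COW, write    -> promote=%d\n", case2_promote(1, 1, PROT_WRITE));
	printf("uobj, COW, read     -> promote=%d\n", case2_promote(1, 1, 0));
	printf("uobj, shared, write -> promote=%d\n", case2_promote(1, 0, PROT_WRITE));
	return 0;
}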
+ */ + + if (locked == FALSE) { + + UVMHIST_LOG(maphist, + " wasn't able to relock after fault: retry", + 0,0,0,0); + if (uobjpage->flags & PG_WANTED) + /* still holding object lock */ + thread_wakeup(uobjpage); + + if (uobjpage->flags & PG_RELEASED) { + uvmexp.fltpgrele++; +#ifdef DIAGNOSTIC + if (uobj->pgops->pgo_releasepg == NULL) + panic("uvm_fault: object has no releasepg function"); +#endif + /* frees page */ + if (uobj->pgops->pgo_releasepg(uobjpage,NULL)) + /* unlock if still alive */ + simple_unlock(&uobj->vmobjlock); + goto ReFault; + } + + uvm_lock_pageq(); + /* make sure it is in queues */ + uvm_pageactivate(uobjpage); + + uvm_unlock_pageq(); + uobjpage->flags &= ~(PG_BUSY|PG_WANTED); + UVM_PAGE_OWN(uobjpage, NULL); + simple_unlock(&uobj->vmobjlock); + goto ReFault; + + } + + /* + * we have the data in uobjpage which is PG_BUSY and + * !PG_RELEASED. we are holding object lock (so the page + * can't be released on us). + */ + + /* locked: maps(read), amap(if !null), uobj, uobjpage */ + + } + + /* + * locked: + * maps(read), amap(if !null), uobj(if !null), uobjpage(if uobj) + */ + + /* + * notes: + * - at this point uobjpage can not be NULL + * - at this point uobjpage can not be PG_RELEASED (since we checked + * for it above) + * - at this point uobjpage could be PG_WANTED (handle later) + */ + + if (promote == FALSE) { + + /* + * we are not promoting. if the mapping is COW ensure that we + * don't give more access than we should (e.g. when doing a read + * fault on a COPYONWRITE mapping we want to map the COW page in + * R/O even though the entry protection could be R/W). + * + * set "pg" to the page we want to map in (uobjpage, usually) + */ + + uvmexp.flt_obj++; + if (UVM_ET_ISCOPYONWRITE(ufi.entry)) + enter_prot = enter_prot & ~VM_PROT_WRITE; + pg = uobjpage; /* map in the actual object */ + + /* assert(uobjpage != PGO_DONTCARE) */ + + /* + * we are faulting directly on the page. be careful + * about writing to loaned pages... + */ + if (uobjpage->loan_count) { + + if ((access_type & VM_PROT_WRITE) == 0) { + /* read fault: cap the protection at readonly */ + /* cap! */ + enter_prot = enter_prot & ~VM_PROT_WRITE; + } else { + /* write fault: must break the loan here */ + + /* alloc new un-owned page */ + pg = uvm_pagealloc(NULL, 0, NULL); + + if (pg == NULL) { + /* + * drop ownership of page, it can't + * be released + * */ + if (uobjpage->flags & PG_WANTED) + thread_wakeup(uobjpage); + uobjpage->flags &= ~(PG_BUSY|PG_WANTED); + UVM_PAGE_OWN(uobjpage, NULL); + + uvm_lock_pageq(); + /* activate: we will need it later */ + uvm_pageactivate(uobjpage); + + uvm_unlock_pageq(); + uvmfault_unlockall(&ufi, amap, uobj, + NULL); + UVMHIST_LOG(maphist, + " out of RAM breaking loan, waiting", 0,0,0,0); + uvmexp.fltnoram++; + uvm_wait("flt_noram4"); + goto ReFault; + } + + /* + * copy the data from the old page to the new + * one and clear the fake/clean flags on the + * new page (keep it busy). force a reload + * of the old page by clearing it from all + * pmaps. then lock the page queues to + * rename the pages. 
+ */ + uvm_pagecopy(uobjpage, pg); /* old -> new */ + pg->flags &= ~(PG_FAKE|PG_CLEAN); + pmap_page_protect(PMAP_PGARG(uobjpage), + VM_PROT_NONE); + if (uobjpage->flags & PG_WANTED) + thread_wakeup(uobjpage); + /* uobj still locked */ + uobjpage->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(uobjpage, NULL); + + uvm_lock_pageq(); + offset = uobjpage->offset; + /* remove old page */ + uvm_pagerealloc(uobjpage, NULL, 0); + + /* + * at this point we have absolutely no + * control over uobjpage + */ + /* install new page */ + uvm_pagerealloc(pg, uobj, offset); + uvm_unlock_pageq(); + + /* + * done! loan is broken and "pg" is + * PG_BUSY. it can now replace uobjpage. + */ + + uobjpage = pg; + + } /* write fault case */ + } /* if loan_count */ + + } else { + + /* + * if we are going to promote the data to an anon we + * allocate a blank anon here and plug it into our amap. + */ +#if DIAGNOSTIC + if (amap == NULL) + panic("uvm_fault: want to promote data, but no anon"); +#endif + + anon = uvm_analloc(); + if (anon) + pg = uvm_pagealloc(NULL, 0, anon); /* BUSY+CLEAN+FAKE */ +#ifdef __GNUC__ + else + pg = NULL; /* XXX: gcc */ +#endif + + /* + * out of memory resources? + */ + if (anon == NULL || pg == NULL) { + + /* + * arg! must unbusy our page and fail or sleep. + */ + if (uobjpage != PGO_DONTCARE) { + if (uobjpage->flags & PG_WANTED) + /* still holding object lock */ + thread_wakeup(uobjpage); + + uvm_lock_pageq(); + /* make sure it is in queues */ + uvm_pageactivate(uobjpage); + uvm_unlock_pageq(); + /* un-busy! (still locked) */ + uobjpage->flags &= ~(PG_BUSY|PG_WANTED); + UVM_PAGE_OWN(uobjpage, NULL); + } + + /* unlock and fail ... */ + uvmfault_unlockall(&ufi, amap, uobj, NULL); + if (anon == NULL) { + UVMHIST_LOG(maphist, " promote: out of VM", + 0,0,0,0); + uvmexp.fltnoanon++; + /* XXX: out of VM */ + return (KERN_RESOURCE_SHORTAGE); + } + UVMHIST_LOG(maphist, " out of RAM, waiting for more", + 0,0,0,0); + uvm_anfree(anon); + uvmexp.fltnoram++; + uvm_wait("flt_noram5"); + goto ReFault; + } + + /* + * fill in the data + */ + + if (uobjpage != PGO_DONTCARE) { + uvmexp.flt_prcopy++; + /* copy page [pg now dirty] */ + uvm_pagecopy(uobjpage, pg); + + /* + * promote to shared amap? make sure all sharing + * procs see it + */ + if ((amap_flags(amap) & AMAP_SHARED) != 0) { + pmap_page_protect(PMAP_PGARG(uobjpage), + VM_PROT_NONE); + } + + /* + * dispose of uobjpage. it can't be PG_RELEASED + * since we still hold the object lock. drop + * handle to uobj as well. + */ + + if (uobjpage->flags & PG_WANTED) + /* still have the obj lock */ + thread_wakeup(uobjpage); + uobjpage->flags &= ~(PG_BUSY|PG_WANTED); + UVM_PAGE_OWN(uobjpage, NULL); + uvm_lock_pageq(); + uvm_pageactivate(uobjpage); /* put it back */ + uvm_unlock_pageq(); + simple_unlock(&uobj->vmobjlock); + uobj = NULL; + UVMHIST_LOG(maphist, + " promote uobjpage 0x%x to anon/page 0x%x/0x%x", + uobjpage, anon, pg, 0); + + } else { + uvmexp.flt_przero++; + uvm_pagezero(pg); /* zero page [pg now dirty] */ + UVMHIST_LOG(maphist," zero fill anon/page 0x%x/0%x", + anon, pg, 0, 0); + } + + amap_add(&ufi.entry->aref, ufi.orig_rvaddr - ufi.entry->start, + anon, 0); + + } + + /* + * locked: + * maps(read), amap(if !null), uobj(if !null), uobjpage(if uobj) + * + * note: pg is either the uobjpage or the new page in the new anon + */ + + /* + * all resources are present. we can now map it in and free our + * resources. 
+ */ + + UVMHIST_LOG(maphist, + " MAPPING: case2: pm=0x%x, va=0x%x, pg=0x%x, promote=%d", + ufi.orig_map->pmap, ufi.orig_rvaddr, pg, promote); + pmap_enter(ufi.orig_map->pmap, ufi.orig_rvaddr, VM_PAGE_TO_PHYS(pg), + enter_prot, wired); + + uvm_lock_pageq(); + + if (fault_type == VM_FAULT_WIRE) { + uvm_pagewire(pg); + } else { + + /* activate it */ + uvm_pageactivate(pg); + + } + + uvm_unlock_pageq(); + + if (pg->flags & PG_WANTED) + thread_wakeup(pg); /* lock still held */ + + /* + * note that pg can't be PG_RELEASED since we did not drop the object + * lock since the last time we checked. + */ + + pg->flags &= ~(PG_BUSY|PG_FAKE|PG_WANTED); + UVM_PAGE_OWN(pg, NULL); + uvmfault_unlockall(&ufi, amap, uobj, NULL); + + UVMHIST_LOG(maphist, "<- done (SUCCESS!)",0,0,0,0); + return (KERN_SUCCESS); +} + + +/* + * uvm_fault_wire: wire down a range of virtual addresses in a map. + * + * => map should be locked by caller? If so how can we call + * uvm_fault? WRONG. + * => XXXCDC: locking here is all screwed up!!! start with + * uvm_map_pageable and fix it. + */ + +int +uvm_fault_wire(map, start, end) + vm_map_t map; + vaddr_t start, end; +{ + vaddr_t va; + pmap_t pmap; + int rv; + + pmap = vm_map_pmap(map); + + /* + * call pmap pageable: this tells the pmap layer to lock down these + * page tables. + */ + + pmap_pageable(pmap, start, end, FALSE); + + /* + * now fault it in page at a time. if the fault fails then we have + * to undo what we have done. note that in uvm_fault VM_PROT_NONE + * is replaced with the max protection if fault_type is VM_FAULT_WIRE. + */ + + for (va = start ; va < end ; va += PAGE_SIZE) { + rv = uvm_fault(map, va, VM_FAULT_WIRE, VM_PROT_NONE); + if (rv) { + if (va != start) { + uvm_fault_unwire(map->pmap, start, va); + } + return (rv); + } + } + + return (KERN_SUCCESS); +} + +/* + * uvm_fault_unwire(): unwire range of virtual space. + * + * => caller holds reference to pmap (via its map) + */ + +void +uvm_fault_unwire(pmap, start, end) + struct pmap *pmap; + vaddr_t start, end; +{ + vaddr_t va; + paddr_t pa; + struct vm_page *pg; + + /* + * we assume that the area we are unwiring has actually been wired + * in the first place. this means that we should be able to extract + * the PAs from the pmap. we also lock out the page daemon so that + * we can call uvm_pageunwire. + */ + + uvm_lock_pageq(); + + for (va = start; va < end ; va += PAGE_SIZE) { + pa = pmap_extract(pmap, va); + + /* XXX: assumes PA 0 cannot be in map */ + if (pa == (paddr_t) 0) { + panic("uvm_fault_unwire: unwiring non-wired memory"); + } + pmap_change_wiring(pmap, va, FALSE); /* tell the pmap */ + pg = PHYS_TO_VM_PAGE(pa); + if (pg) + uvm_pageunwire(pg); + } + + uvm_unlock_pageq(); + + /* + * now we call pmap_pageable to let the pmap know that the page tables + * in this space no longer need to be wired. + */ + + pmap_pageable(pmap, start, end, TRUE); + +} diff --git a/sys/uvm/uvm_fault.h b/sys/uvm/uvm_fault.h new file mode 100644 index 00000000000..650543ea669 --- /dev/null +++ b/sys/uvm/uvm_fault.h @@ -0,0 +1,88 @@ +/* $NetBSD: uvm_fault.h,v 1.7 1998/10/11 23:07:42 chuck Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_fault.h,v 1.1.2.2 1997/12/08 16:07:12 chuck Exp + */ + +#ifndef _UVM_UVM_FAULT_H_ +#define _UVM_UVM_FAULT_H_ + +/* + * fault types + */ + +#define VM_FAULT_INVALID ((vm_fault_t) 0x0) /* invalid mapping */ +#define VM_FAULT_PROTECT ((vm_fault_t) 0x1) /* protection */ +#define VM_FAULT_WIRE ((vm_fault_t) 0x2) /* wire mapping */ + +/* + * fault data structures + */ + +/* + * uvm_faultinfo: to load one of these fill in all orig_* fields and + * then call uvmfault_lookup on it. + */ + + +struct uvm_faultinfo { + vm_map_t orig_map; /* IN: original map */ + vaddr_t orig_rvaddr; /* IN: original rounded VA */ + vsize_t orig_size; /* IN: original size of interest */ + vm_map_t map; /* map (could be a submap) */ + unsigned int mapv; /* map's version number */ + vm_map_entry_t entry; /* map entry (from 'map') */ + vsize_t size; /* size of interest */ +}; + +/* + * fault prototypes + */ + + +int uvmfault_anonget __P((struct uvm_faultinfo *, struct vm_amap *, + struct vm_anon *)); +static boolean_t uvmfault_lookup __P((struct uvm_faultinfo *, boolean_t)); +static boolean_t uvmfault_relock __P((struct uvm_faultinfo *)); +static void uvmfault_unlockall __P((struct uvm_faultinfo *, struct vm_amap *, + struct uvm_object *, struct vm_anon *)); +static void uvmfault_unlockmaps __P((struct uvm_faultinfo *, boolean_t)); + +int uvm_fault_wire __P((vm_map_t, vaddr_t, vaddr_t)); +void uvm_fault_unwire __P((struct pmap *, vaddr_t, vaddr_t)); + +#endif /* _UVM_UVM_FAULT_H_ */ diff --git a/sys/uvm/uvm_fault_i.h b/sys/uvm/uvm_fault_i.h new file mode 100644 index 00000000000..40c5cddcef8 --- /dev/null +++ b/sys/uvm/uvm_fault_i.h @@ -0,0 +1,203 @@ +/* $NetBSD: uvm_fault_i.h,v 1.7 1999/01/24 23:53:15 chuck Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. 
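/*
 * illustrative sketch (editorial, not from this import): loading a
 * uvm_faultinfo as the comment above describes -- fill in the orig_*
 * fields, hand it to uvmfault_lookup(), and drop everything with
 * uvmfault_unlockall() when done.  "map" and "vaddr" stand in for the
 * caller's map and fault address; success path only.
 */
	struct uvm_faultinfo ufi;

	ufi.orig_map = map;			/* map to look up in */
	ufi.orig_rvaddr = trunc_page(vaddr);	/* rounded fault address */
	ufi.orig_size = PAGE_SIZE;		/* size of interest */

	if (uvmfault_lookup(&ufi, FALSE) == FALSE)	/* read-lock maps */
		return (KERN_INVALID_ADDRESS);	/* no entry maps the VA */

	/*
	 * maps are now locked; ufi.map, ufi.entry and ufi.size describe
	 * the entry (possibly in a submap) covering orig_rvaddr.
	 */
	uvmfault_unlockall(&ufi, NULL, NULL, NULL);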
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_fault_i.h,v 1.1.6.1 1997/12/08 16:07:12 chuck Exp + */ + +#ifndef _UVM_UVM_FAULT_I_H_ +#define _UVM_UVM_FAULT_I_H_ + +/* + * uvm_fault_i.h: fault inline functions + */ + +/* + * uvmfault_unlockmaps: unlock the maps + */ + +static __inline void +uvmfault_unlockmaps(ufi, write_locked) + struct uvm_faultinfo *ufi; + boolean_t write_locked; +{ + + if (write_locked) { + vm_map_unlock(ufi->map); + } else { + vm_map_unlock_read(ufi->map); + } +} + +/* + * uvmfault_unlockall: unlock everything passed in. + * + * => maps must be read-locked (not write-locked). + */ + +static __inline void +uvmfault_unlockall(ufi, amap, uobj, anon) + struct uvm_faultinfo *ufi; + struct vm_amap *amap; + struct uvm_object *uobj; + struct vm_anon *anon; +{ + + if (anon) + simple_unlock(&anon->an_lock); + if (uobj) + simple_unlock(&uobj->vmobjlock); + if (amap) + amap_unlock(amap); + uvmfault_unlockmaps(ufi, FALSE); +} + +/* + * uvmfault_lookup: lookup a virtual address in a map + * + * => caller must provide a uvm_faultinfo structure with the IN + * params properly filled in + * => we will lookup the map entry (handling submaps) as we go + * => if the lookup is a success we will return with the maps locked + * => if "write_lock" is TRUE, we write_lock the map, otherwise we only + * get a read lock. + * => note that submaps can only appear in the kernel and they are + * required to use the same virtual addresses as the map they + * are referenced by (thus address translation between the main + * map and the submap is unnecessary). + */ + +static __inline boolean_t +uvmfault_lookup(ufi, write_lock) + struct uvm_faultinfo *ufi; + boolean_t write_lock; +{ + vm_map_t tmpmap; + + /* + * init ufi values for lookup. + */ + + ufi->map = ufi->orig_map; + ufi->size = ufi->orig_size; + + /* + * keep going down levels until we are done. note that there can + * only be two levels so we won't loop very long. 
+ */ + + while (1) { + + /* + * lock map + */ + if (write_lock) { + vm_map_lock(ufi->map); + } else { + vm_map_lock_read(ufi->map); + } + + /* + * lookup + */ + if (!uvm_map_lookup_entry(ufi->map, ufi->orig_rvaddr, + &ufi->entry)) { + uvmfault_unlockmaps(ufi, write_lock); + return(FALSE); + } + + /* + * reduce size if necessary + */ + if (ufi->entry->end - ufi->orig_rvaddr < ufi->size) + ufi->size = ufi->entry->end - ufi->orig_rvaddr; + + /* + * submap? replace map with the submap and lookup again. + * note: VAs in submaps must match VAs in main map. + */ + if (UVM_ET_ISSUBMAP(ufi->entry)) { + tmpmap = ufi->entry->object.sub_map; + if (write_lock) { + vm_map_unlock(ufi->map); + } else { + vm_map_unlock_read(ufi->map); + } + ufi->map = tmpmap; + continue; + } + + /* + * got it! + */ + + ufi->mapv = ufi->map->timestamp; + return(TRUE); + + } /* while loop */ + + /*NOTREACHED*/ +} + +/* + * uvmfault_relock: attempt to relock the same version of the map + * + * => fault data structures should be unlocked before calling. + * => if a success (TRUE) maps will be locked after call. + */ + +static __inline boolean_t +uvmfault_relock(ufi) + struct uvm_faultinfo *ufi; +{ + + uvmexp.fltrelck++; + /* + * relock map. fail if version mismatch (in which case nothing + * gets locked). + */ + + vm_map_lock_read(ufi->map); + if (ufi->mapv != ufi->map->timestamp) { + vm_map_unlock_read(ufi->map); + return(FALSE); + } + + uvmexp.fltrelckok++; + return(TRUE); /* got it! */ +} + +#endif /* _UVM_UVM_FAULT_I_H_ */ diff --git a/sys/uvm/uvm_glue.c b/sys/uvm/uvm_glue.c new file mode 100644 index 00000000000..b46fd012c16 --- /dev/null +++ b/sys/uvm/uvm_glue.c @@ -0,0 +1,605 @@ +/* $NetBSD: uvm_glue.c,v 1.15 1998/10/19 22:21:19 tron Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * Copyright (c) 1991, 1993, The Regents of the University of California. + * + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor, + * Washington University, the University of California, Berkeley and + * its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
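/*
 * illustrative sketch (editorial, not from this import): the usual
 * unlock / sleep-or-I/O / relock dance that uvmfault_relock() is
 * built for, assuming a prior successful uvmfault_lookup().
 */
	/* drop all fault data structure locks before blocking */
	uvmfault_unlockall(&ufi, amap, uobj, anon);

	/* ... sleep, or wait for pager I/O to finish ... */

	if (uvmfault_relock(&ufi) == FALSE) {
		/*
		 * map version changed while we slept: nothing is locked,
		 * so restart the fault from the top (cf. "goto ReFault"
		 * in uvm_fault.c above).
		 */
		goto ReFault;
	}
	/* maps read-locked again; re-lock amap/object and carry on */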
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 + * from: Id: uvm_glue.c,v 1.1.2.8 1998/02/07 01:16:54 chs Exp + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * uvm_glue.c: glue functions + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/buf.h> +#include <sys/user.h> +#ifdef SYSVSHM +#include <sys/shm.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#include <uvm/uvm.h> + +#include <machine/cpu.h> + +/* + * local prototypes + */ + +static void uvm_swapout __P((struct proc *)); + +/* + * XXXCDC: do these really belong here? + */ + +unsigned maxdmap = MAXDSIZ; /* kern_resource.c: RLIMIT_DATA max */ +unsigned maxsmap = MAXSSIZ; /* kern_resource.c: RLIMIT_STACK max */ + +int readbuffers = 0; /* allow KGDB to read kern buffer pool */ + /* XXX: see uvm_kernacc */ + + +/* + * uvm_kernacc: can the kernel access a region of memory + * + * - called from malloc [DIAGNOSTIC], and /dev/kmem driver (mem.c) + */ + +boolean_t +uvm_kernacc(addr, len, rw) + caddr_t addr; + size_t len; + int rw; +{ + boolean_t rv; + vaddr_t saddr, eaddr; + vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; + + saddr = trunc_page(addr); + eaddr = round_page(addr+len); + vm_map_lock_read(kernel_map); + rv = uvm_map_checkprot(kernel_map, saddr, eaddr, prot); + vm_map_unlock_read(kernel_map); + + /* + * XXX there are still some things (e.g. the buffer cache) that + * are managed behind the VM system's back so even though an + * address is accessible in the mind of the VM system, there may + * not be physical pages where the VM thinks there is. This can + * lead to bogus allocation of pages in the kernel address space + * or worse, inconsistencies at the pmap level. We only worry + * about the buffer cache for now. + */ + if (!readbuffers && rv && (eaddr > (vaddr_t)buffers && + saddr < (vaddr_t)buffers + MAXBSIZE * nbuf)) + rv = FALSE; + return(rv); +} + +/* + * uvm_useracc: can the user access it? 
+ * + * - called from physio() and sys___sysctl(). + */ + +boolean_t +uvm_useracc(addr, len, rw) + caddr_t addr; + size_t len; + int rw; +{ + boolean_t rv; + vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; + +#if defined(i386) || defined(pc532) + /* + * XXX - specially disallow access to user page tables - they are + * in the map. This is here until i386 & pc532 pmaps are fixed... + */ + if ((vaddr_t) addr >= VM_MAXUSER_ADDRESS + || (vaddr_t) addr + len > VM_MAXUSER_ADDRESS + || (vaddr_t) addr + len <= (vaddr_t) addr) + return (FALSE); +#endif + + rv = uvm_map_checkprot(&curproc->p_vmspace->vm_map, + trunc_page(addr), round_page(addr+len), prot); + return(rv); +} + +#ifdef KGDB +/* + * Change protections on kernel pages from addr to addr+len + * (presumably so debugger can plant a breakpoint). + * + * We force the protection change at the pmap level. If we were + * to use vm_map_protect a change to allow writing would be lazily- + * applied meaning we would still take a protection fault, something + * we really don't want to do. It would also fragment the kernel + * map unnecessarily. We cannot use pmap_protect since it also won't + * enforce a write-enable request. Using pmap_enter is the only way + * we can ensure the change takes place properly. + */ +void +uvm_chgkprot(addr, len, rw) + register caddr_t addr; + size_t len; + int rw; +{ + vm_prot_t prot; + paddr_t pa; + vaddr_t sva, eva; + + prot = rw == B_READ ? VM_PROT_READ : VM_PROT_READ|VM_PROT_WRITE; + eva = round_page(addr + len); + for (sva = trunc_page(addr); sva < eva; sva += PAGE_SIZE) { + /* + * Extract physical address for the page. + * We use a cheezy hack to differentiate physical + * page 0 from an invalid mapping, not that it + * really matters... + */ + pa = pmap_extract(pmap_kernel(), sva|1); + if (pa == 0) + panic("chgkprot: invalid page"); + pmap_enter(pmap_kernel(), sva, pa&~1, prot, TRUE); + } +} +#endif + +/* + * vslock: wire user memory for I/O + * + * - called from physio and sys___sysctl + * - XXXCDC: consider nuking this (or making it a macro?) + */ + +void +uvm_vslock(p, addr, len) + struct proc *p; + caddr_t addr; + size_t len; +{ + uvm_fault_wire(&p->p_vmspace->vm_map, trunc_page(addr), + round_page(addr+len)); +} + +/* + * vslock: wire user memory for I/O + * + * - called from physio and sys___sysctl + * - XXXCDC: consider nuking this (or making it a macro?) + */ + +void +uvm_vsunlock(p, addr, len) + struct proc *p; + caddr_t addr; + size_t len; +{ + uvm_fault_unwire(p->p_vmspace->vm_map.pmap, trunc_page(addr), + round_page(addr+len)); +} + +/* + * uvm_fork: fork a virtual address space + * + * - the address space is copied as per parent map's inherit values + * - a new "user" structure is allocated for the child process + * [filled in by MD layer...] + * - NOTE: the kernel stack may be at a different location in the child + * process, and thus addresses of automatic variables may be invalid + * after cpu_fork returns in the child process. We do nothing here + * after cpu_fork returns. + * - XXXCDC: we need a way for this to return a failure value rather + * than just hang + */ +void +uvm_fork(p1, p2, shared) + struct proc *p1, *p2; + boolean_t shared; +{ + struct user *up = p2->p_addr; + int rv; + + if (shared == TRUE) + uvmspace_share(p1, p2); /* share vmspace */ + else + p2->p_vmspace = uvmspace_fork(p1->p_vmspace); /* fork vmspace */ + + /* + * Wire down the U-area for the process, which contains the PCB + * and the kernel stack. 
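/*
 * illustrative sketch (editorial, not from this import): the
 * wire-around-I/O pattern that uvm_vslock()/uvm_vsunlock() above
 * exist for, e.g. around a physio()-style transfer on a user buffer.
 * "p", "buf" and "len" stand in for the caller's process, buffer
 * address and length.
 */
	uvm_vslock(p, buf, len);	/* fault in and wire the user pages */

	/* ... perform the raw I/O on the now-resident buffer ... */

	uvm_vsunlock(p, buf, len);	/* unwire when the I/O is done */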
Wired state is stored in p->p_flag's + * P_INMEM bit rather than in the vm_map_entry's wired count + * to prevent kernel_map fragmentation. + */ + rv = uvm_fault_wire(kernel_map, (vaddr_t)up, + (vaddr_t)up + USPACE); + if (rv != KERN_SUCCESS) + panic("uvm_fork: uvm_fault_wire failed: %d", rv); + + /* + * p_stats and p_sigacts currently point at fields in the user + * struct but not at &u, instead at p_addr. Copy p_sigacts and + * parts of p_stats; zero the rest of p_stats (statistics). + */ + p2->p_stats = &up->u_stats; + p2->p_sigacts = &up->u_sigacts; + up->u_sigacts = *p1->p_sigacts; + bzero(&up->u_stats.pstat_startzero, + (unsigned) ((caddr_t)&up->u_stats.pstat_endzero - + (caddr_t)&up->u_stats.pstat_startzero)); + bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy, + ((caddr_t)&up->u_stats.pstat_endcopy - + (caddr_t)&up->u_stats.pstat_startcopy)); + +/* + * cpu_fork will copy and update the kernel stack and pcb, and make + * the child ready to run. The child will exit directly to user + * mode on its first time slice, and will not return here. + */ + cpu_fork(p1, p2); +} + +/* + * uvm_exit: exit a virtual address space + * + * - the process passed to us is a dead (pre-zombie) process; we + * are running on a different context now (the reaper). + * - we must run in a separate thread because freeing the vmspace + * of the dead process may block. + */ +void +uvm_exit(p) + struct proc *p; +{ + + uvmspace_free(p->p_vmspace); + uvm_km_free(kernel_map, (vaddr_t)p->p_addr, USPACE); +} + +/* + * uvm_init_limit: init per-process VM limits + * + * - called for process 0 and then inherited by all others. + */ +void +uvm_init_limits(p) + struct proc *p; +{ + + /* + * Set up the initial limits on process VM. Set the maximum + * resident set size to be all of (reasonably) available memory. + * This causes any single, large process to start random page + * replacement once it fills memory. + */ + + p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ; + p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ; + p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ; + p->p_rlimit[RLIMIT_DATA].rlim_max = MAXDSIZ; + p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(uvmexp.free); +} + +#ifdef DEBUG +int enableswap = 1; +int swapdebug = 0; +#define SDB_FOLLOW 1 +#define SDB_SWAPIN 2 +#define SDB_SWAPOUT 4 +#endif + +/* + * uvm_swapin: swap in a process's u-area. + */ + +void +uvm_swapin(p) + struct proc *p; +{ + vaddr_t addr; + int s; + + addr = (vaddr_t)p->p_addr; + /* make P_INMEM true */ + uvm_fault_wire(kernel_map, addr, addr + USPACE); + + /* + * Some architectures need to be notified when the user area has + * moved to new physical page(s) (e.g. see mips/mips/vm_machdep.c). + */ + cpu_swapin(p); + s = splstatclock(); + if (p->p_stat == SRUN) + setrunqueue(p); + p->p_flag |= P_INMEM; + splx(s); + p->p_swtime = 0; + ++uvmexp.swapins; +} + +/* + * uvm_scheduler: process zero main loop + * + * - attempt to swapin every swaped-out, runnable process in order of + * priority. + * - if not enough memory, wake the pagedaemon and let it clear space. + */ + +void +uvm_scheduler() +{ + register struct proc *p; + register int pri; + struct proc *pp; + int ppri; + UVMHIST_FUNC("uvm_scheduler"); UVMHIST_CALLED(maphist); + +loop: +#ifdef DEBUG + while (!enableswap) + tsleep((caddr_t)&proc0, PVM, "noswap", 0); +#endif + pp = NULL; /* process to choose */ + ppri = INT_MIN; /* its priority */ + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + + /* is it a runnable swapped out process? 
*/ + if (p->p_stat == SRUN && (p->p_flag & P_INMEM) == 0) { + pri = p->p_swtime + p->p_slptime - + (p->p_nice - NZERO) * 8; + if (pri > ppri) { /* higher priority? remember it. */ + pp = p; + ppri = pri; + } + } + } + +#ifdef DEBUG + if (swapdebug & SDB_FOLLOW) + printf("scheduler: running, procp %p pri %d\n", pp, ppri); +#endif + /* + * Nothing to do, back to sleep + */ + if ((p = pp) == NULL) { + tsleep((caddr_t)&proc0, PVM, "scheduler", 0); + goto loop; + } + + /* + * we have found swapped out process which we would like to bring + * back in. + * + * XXX: this part is really bogus cuz we could deadlock on memory + * despite our feeble check + */ + if (uvmexp.free > atop(USPACE)) { +#ifdef DEBUG + if (swapdebug & SDB_SWAPIN) + printf("swapin: pid %d(%s)@%p, pri %d free %d\n", + p->p_pid, p->p_comm, p->p_addr, ppri, uvmexp.free); +#endif + uvm_swapin(p); + goto loop; + } + /* + * not enough memory, jab the pageout daemon and wait til the coast + * is clear + */ +#ifdef DEBUG + if (swapdebug & SDB_FOLLOW) + printf("scheduler: no room for pid %d(%s), free %d\n", + p->p_pid, p->p_comm, uvmexp.free); +#endif + printf("scheduler: no room for pid %d(%s), free %d\n", + p->p_pid, p->p_comm, uvmexp.free);/*XXXCDC: HIGHLY BOGUS */ + (void) splhigh(); + uvm_wait("schedpwait"); + (void) spl0(); +#ifdef DEBUG + if (swapdebug & SDB_FOLLOW) + printf("scheduler: room again, free %d\n", uvmexp.free); +#endif + goto loop; +} + +/* + * swappable: is process "p" swappable? + */ + +#define swappable(p) \ + (((p)->p_flag & (P_SYSTEM | P_INMEM | P_WEXIT)) == P_INMEM && \ + (p)->p_holdcnt == 0) + +/* + * swapout_threads: find threads that can be swapped and unwire their + * u-areas. + * + * - called by the pagedaemon + * - try and swap at least one processs + * - processes that are sleeping or stopped for maxslp or more seconds + * are swapped... otherwise the longest-sleeping or stopped process + * is swapped, otherwise the longest resident process... + */ +void +uvm_swapout_threads() +{ + register struct proc *p; + struct proc *outp, *outp2; + int outpri, outpri2; + int didswap = 0; + extern int maxslp; + /* XXXCDC: should move off to uvmexp. or uvm., also in uvm_meter */ + +#ifdef DEBUG + if (!enableswap) + return; +#endif + + /* + * outp/outpri : stop/sleep process with largest sleeptime < maxslp + * outp2/outpri2: the longest resident process (its swap time) + */ + outp = outp2 = NULL; + outpri = outpri2 = 0; + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + if (!swappable(p)) + continue; + switch (p->p_stat) { + case SRUN: + if (p->p_swtime > outpri2) { + outp2 = p; + outpri2 = p->p_swtime; + } + continue; + + case SSLEEP: + case SSTOP: + if (p->p_slptime >= maxslp) { + uvm_swapout(p); /* zap! */ + didswap++; + } else if (p->p_slptime > outpri) { + outp = p; + outpri = p->p_slptime; + } + continue; + } + } + + /* + * If we didn't get rid of any real duds, toss out the next most + * likely sleeping/stopped or running candidate. We only do this + * if we are real low on memory since we don't gain much by doing + * it (USPACE bytes). + */ + if (didswap == 0 && uvmexp.free <= atop(round_page(USPACE))) { + if ((p = outp) == NULL) + p = outp2; +#ifdef DEBUG + if (swapdebug & SDB_SWAPOUT) + printf("swapout_threads: no duds, try procp %p\n", p); +#endif + if (p) + uvm_swapout(p); + } +} + +/* + * uvm_swapout: swap out process "p" + * + * - currently "swapout" means "unwire U-area" and "pmap_collect()" + * the pmap. 
+ * - XXXCDC: should deactivate all process' private anonymous memory + */ + +static void +uvm_swapout(p) + register struct proc *p; +{ + vaddr_t addr; + int s; + +#ifdef DEBUG + if (swapdebug & SDB_SWAPOUT) + printf("swapout: pid %d(%s)@%p, stat %x pri %d free %d\n", + p->p_pid, p->p_comm, p->p_addr, p->p_stat, + p->p_slptime, uvmexp.free); +#endif + + /* + * Do any machine-specific actions necessary before swapout. + * This can include saving floating point state, etc. + */ + cpu_swapout(p); + + /* + * Unwire the to-be-swapped process's user struct and kernel stack. + */ + addr = (vaddr_t)p->p_addr; + uvm_fault_unwire(kernel_map->pmap, addr, addr + USPACE); /* !P_INMEM */ + pmap_collect(vm_map_pmap(&p->p_vmspace->vm_map)); + + /* + * Mark it as (potentially) swapped out. + */ + s = splstatclock(); + p->p_flag &= ~P_INMEM; + if (p->p_stat == SRUN) + remrunqueue(p); + splx(s); + p->p_swtime = 0; + ++uvmexp.swapouts; +} + diff --git a/sys/uvm/uvm_glue.h b/sys/uvm/uvm_glue.h new file mode 100644 index 00000000000..8a137800fcd --- /dev/null +++ b/sys/uvm/uvm_glue.h @@ -0,0 +1,50 @@ +/* $NetBSD: uvm_glue.h,v 1.4 1998/02/10 02:34:37 perry Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_glue.h,v 1.1.2.1 1997/08/14 19:10:48 chuck Exp + */ + +#ifndef _UVM_UVM_GLUE_H_ +#define _UVM_UVM_GLUE_H_ + +/* + * uvm_glue.h + */ + +void uvm_swapout_threads __P((void)); + +#endif /* _UVM_UVM_GLUE_H_ */ diff --git a/sys/uvm/uvm_init.c b/sys/uvm/uvm_init.c new file mode 100644 index 00000000000..95406c95b0c --- /dev/null +++ b/sys/uvm/uvm_init.c @@ -0,0 +1,167 @@ +/* $NetBSD: uvm_init.c,v 1.10 1999/01/24 23:53:15 chuck Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! 
+ * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_init.c,v 1.1.2.3 1998/02/06 05:15:27 chs Exp + */ + +/* + * uvm_init.c: init the vm system. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/resourcevar.h> +#include <sys/mman.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/vnode.h> +#include <sys/conf.h> + + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#include <uvm/uvm.h> + +/* + * struct uvm: we store all global vars in this structure to make them + * easier to spot... + */ + +struct uvm uvm; /* decl */ +struct uvmexp uvmexp; /* decl */ + +/* + * local prototypes + */ + +/* + * uvm_init: init the VM system. called from kern/init_main.c. + */ + +void +uvm_init() +{ + vaddr_t kvm_start, kvm_end; + + /* + * step 0: ensure that the hardware set the page size + */ + + if (uvmexp.pagesize == 0) { + panic("uvm_init: page size not set"); + } + + /* + * step 1: zero the uvm structure + */ + + bzero(&uvm, sizeof(uvm)); + averunnable.fscale = FSCALE; + + /* + * step 2: init the page sub-system. this includes allocating the + * vm_page structures, and setting up all the page queues (and + * locks). available memory will be put in the "free" queue. + * kvm_start and kvm_end will be set to the area of kernel virtual + * memory which is available for general use. + */ + + uvm_page_init(&kvm_start, &kvm_end); + + /* + * step 3: init the map sub-system. allocates the static pool of + * vm_map_entry structures that are used for "special" kernel maps + * (e.g. kernel_map, kmem_map, etc...). + */ + + uvm_map_init(); + + /* + * step 4: setup the kernel's virtual memory data structures. 
this + * includes setting up the kernel_map/kernel_object and the kmem_map/ + * kmem_object. + */ + + uvm_km_init(kvm_start, kvm_end); + + /* + * step 5: init the pmap module. the pmap module is free to allocate + * memory for its private use (e.g. pvlists). + */ + + pmap_init(); + + /* + * step 6: init the kernel memory allocator. after this call the + * kernel memory allocator (malloc) can be used. + */ + + kmeminit(); + + /* + * step 7: init all pagers and the pager_map. + */ + + uvm_pager_init(); + + /* + * step 8: init anonymous memory systems (both amap and anons) + */ + + amap_init(); /* init amap module */ + uvm_anon_init(); /* allocate initial anons */ + + /* + * the VM system is now up! now that malloc is up we can resize the + * <obj,off> => <page> hash table for general use and enable paging + * of kernel objects. + */ + + uvm_page_rehash(); + uao_create(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS, + UAO_FLAG_KERNSWAP); + + /* + * done! + */ + + return; +} diff --git a/sys/uvm/uvm_io.c b/sys/uvm/uvm_io.c new file mode 100644 index 00000000000..603e04b26d9 --- /dev/null +++ b/sys/uvm/uvm_io.c @@ -0,0 +1,163 @@ +/* $NetBSD: uvm_io.c,v 1.7 1998/10/11 23:18:20 chuck Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * from: Id: uvm_io.c,v 1.1.2.2 1997/12/30 12:02:00 mrg Exp + */ + +/* + * uvm_io.c: uvm i/o ops + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mman.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/uio.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#include <uvm/uvm.h> + +/* + * functions + */ + +/* + * uvm_io: perform I/O on a map + * + * => caller must have a reference to "map" so that it doesn't go away + * while we are working. + */ + +int +uvm_io(map, uio) + vm_map_t map; + struct uio *uio; +{ + vaddr_t baseva, endva, pageoffset, kva; + vsize_t chunksz, togo, sz; + vm_map_entry_t dead_entries; + int error; + + /* + * step 0: sanity checks and set up for copy loop. start with a + * large chunk size. if we have trouble finding vm space we will + * reduce it. + */ + + if (uio->uio_resid == 0) + return(0); + togo = uio->uio_resid; + + baseva = (vaddr_t) uio->uio_offset; + endva = baseva + (togo - 1); + + if (endva < baseva) /* wrap around? */ + return(EIO); + + if (baseva >= VM_MAXUSER_ADDRESS) + return(0); + if (endva >= VM_MAXUSER_ADDRESS) + /* EOF truncate */ + togo = togo - (endva - VM_MAXUSER_ADDRESS + 1); + pageoffset = baseva & PAGE_MASK; + baseva = trunc_page(baseva); + chunksz = min(round_page(togo + pageoffset), MAXBSIZE); + error = 0; + + /* + * step 1: main loop... while we've got data to move + */ + + for (/*null*/; togo > 0 ; pageoffset = 0) { + + /* + * step 2: extract mappings from the map into kernel_map + */ + + error = uvm_map_extract(map, baseva, chunksz, kernel_map, &kva, + UVM_EXTRACT_QREF | UVM_EXTRACT_CONTIG | + UVM_EXTRACT_FIXPROT); + if (error) { + + /* retry with a smaller chunk... */ + if (error == ENOMEM && chunksz > PAGE_SIZE) { + chunksz = trunc_page(chunksz / 2); + if (chunksz < PAGE_SIZE) + chunksz = PAGE_SIZE; + continue; + } + + break; + } + + /* + * step 3: move a chunk of data + */ + + sz = chunksz - pageoffset; + if (sz > togo) + sz = togo; + error = uiomove((caddr_t) (kva + pageoffset), sz, uio); + if (error) + break; + togo -= sz; + baseva += chunksz; + + + /* + * step 4: unmap the area of kernel memory + */ + + vm_map_lock(kernel_map); + (void)uvm_unmap_remove(kernel_map, kva, kva+chunksz, + &dead_entries); + vm_map_unlock(kernel_map); + + if (dead_entries != NULL) + uvm_unmap_detach(dead_entries, AMAP_REFALL); + } + + /* + * done + */ + + return (error); +} diff --git a/sys/uvm/uvm_km.c b/sys/uvm/uvm_km.c new file mode 100644 index 00000000000..49e9e5191bc --- /dev/null +++ b/sys/uvm/uvm_km.c @@ -0,0 +1,1081 @@ +/* $NetBSD: uvm_km.c,v 1.18 1998/10/18 23:49:59 chs Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * Copyright (c) 1991, 1993, The Regents of the University of California. + * + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor, + * Washington University, the University of California, Berkeley and + * its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_kern.c 8.3 (Berkeley) 1/12/94 + * from: Id: uvm_km.c,v 1.1.2.14 1998/02/06 05:19:27 chs Exp + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * uvm_km.c: handle kernel memory allocation and management + */ + +/* + * overview of kernel memory management: + * + * the kernel virtual address space is mapped by "kernel_map." kernel_map + * starts at VM_MIN_KERNEL_ADDRESS and goes to VM_MAX_KERNEL_ADDRESS. + * note that VM_MIN_KERNEL_ADDRESS is equal to vm_map_min(kernel_map). + * + * the kernel_map has several "submaps." submaps can only appear in + * the kernel_map (user processes can't use them). submaps "take over" + * the management of a sub-range of the kernel's address space. submaps + * are typically allocated at boot time and are never released. kernel + * virtual address space that is mapped by a submap is locked by the + * submap's lock -- not the kernel_map's lock. + * + * thus, the useful feature of submaps is that they allow us to break + * up the locking and protection of the kernel address space into smaller + * chunks. 
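/*
 * illustrative sketch (editorial, not from this import): carving a
 * wired-memory submap out of kernel_map at boot time with
 * uvm_km_suballoc() (defined later in this file).  the size and the
 * variable names here are made up for illustration.
 */
	vaddr_t minaddr, maxaddr;
	struct vm_map *my_submap;

	/*
	 * reserve a chunk of kernel VA and hand its management to a new
	 * submap.  from here on, mappings in [minaddr, maxaddr) are
	 * serialized by my_submap's lock instead of kernel_map's lock.
	 */
	my_submap = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
	    16 * PAGE_SIZE, FALSE, FALSE, NULL);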
+ * + * the vm system has several standard kernel submaps, including: + * kmem_map => contains only wired kernel memory for the kernel + * malloc. *** access to kmem_map must be protected + * by splimp() because we are allowed to call malloc() + * at interrupt time *** + * mb_map => memory for large mbufs, *** protected by splimp *** + * pager_map => used to map "buf" structures into kernel space + * exec_map => used during exec to handle exec args + * etc... + * + * the kernel allocates its private memory out of special uvm_objects whose + * reference count is set to UVM_OBJ_KERN (thus indicating that the objects + * are "special" and never die). all kernel objects should be thought of + * as large, fixed-sized, sparsely populated uvm_objects. each kernel + * object is equal to the size of kernel virtual address space (i.e. the + * value "VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS"). + * + * most kernel private memory lives in kernel_object. the only exception + * to this is for memory that belongs to submaps that must be protected + * by splimp(). each of these submaps has their own private kernel + * object (e.g. kmem_object, mb_object). + * + * note that just because a kernel object spans the entire kernel virutal + * address space doesn't mean that it has to be mapped into the entire space. + * large chunks of a kernel object's space go unused either because + * that area of kernel VM is unmapped, or there is some other type of + * object mapped into that range (e.g. a vnode). for submap's kernel + * objects, the only part of the object that can ever be populated is the + * offsets that are managed by the submap. + * + * note that the "offset" in a kernel object is always the kernel virtual + * address minus the VM_MIN_KERNEL_ADDRESS (aka vm_map_min(kernel_map)). + * example: + * suppose VM_MIN_KERNEL_ADDRESS is 0xf8000000 and the kernel does a + * uvm_km_alloc(kernel_map, PAGE_SIZE) [allocate 1 wired down page in the + * kernel map]. if uvm_km_alloc returns virtual address 0xf8235000, + * then that means that the page at offset 0x235000 in kernel_object is + * mapped at 0xf8235000. + * + * note that the offsets in kmem_object and mb_object also follow this + * rule. this means that the offsets for kmem_object must fall in the + * range of [vm_map_min(kmem_object) - vm_map_min(kernel_map)] to + * [vm_map_max(kmem_object) - vm_map_min(kernel_map)], so the offsets + * in those objects will typically not start at zero. + * + * kernel object have one other special property: when the kernel virtual + * memory mapping them is unmapped, the backing memory in the object is + * freed right away. this is done with the uvm_km_pgremove() function. + * this has to be done because there is no backing store for kernel pages + * and no need to save them after they are no longer referenced. 
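/*
 * illustrative sketch (editorial, not from this import): the offset
 * rule described above as a tiny helper.  e.g. with
 * VM_MIN_KERNEL_ADDRESS == 0xf8000000, a page mapped at 0xf8235000
 * lives at offset 0x235000 in kernel_object.
 */
static __inline vaddr_t
kva_to_kernel_object_offset(kva)
	vaddr_t kva;
{
	/* kernel object offset == KVA - vm_map_min(kernel_map) */
	return (kva - vm_map_min(kernel_map));
}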
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#include <uvm/uvm.h> + +/* + * global data structures + */ + +vm_map_t kernel_map = NULL; + +/* + * local functions + */ + +static int uvm_km_get __P((struct uvm_object *, vaddr_t, + vm_page_t *, int *, int, vm_prot_t, int, int)); +/* + * local data structues + */ + +static struct vm_map kernel_map_store; +static struct uvm_object kmem_object_store; +static struct uvm_object mb_object_store; + +static struct uvm_pagerops km_pager = { + NULL, /* init */ + NULL, /* attach */ + NULL, /* reference */ + NULL, /* detach */ + NULL, /* fault */ + NULL, /* flush */ + uvm_km_get, /* get */ + /* ... rest are NULL */ +}; + +/* + * uvm_km_get: pager get function for kernel objects + * + * => currently we do not support pageout to the swap area, so this + * pager is very simple. eventually we may want an anonymous + * object pager which will do paging. + * => XXXCDC: this pager should be phased out in favor of the aobj pager + */ + + +static int +uvm_km_get(uobj, offset, pps, npagesp, centeridx, access_type, advice, flags) + struct uvm_object *uobj; + vaddr_t offset; + struct vm_page **pps; + int *npagesp; + int centeridx, advice, flags; + vm_prot_t access_type; +{ + vaddr_t current_offset; + vm_page_t ptmp; + int lcv, gotpages, maxpages; + boolean_t done; + UVMHIST_FUNC("uvm_km_get"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist, "flags=%d", flags,0,0,0); + + /* + * get number of pages + */ + + maxpages = *npagesp; + + /* + * step 1: handled the case where fault data structures are locked. + */ + + if (flags & PGO_LOCKED) { + + /* + * step 1a: get pages that are already resident. only do + * this if the data structures are locked (i.e. the first time + * through). + */ + + done = TRUE; /* be optimistic */ + gotpages = 0; /* # of pages we got so far */ + + for (lcv = 0, current_offset = offset ; + lcv < maxpages ; lcv++, current_offset += PAGE_SIZE) { + + /* do we care about this page? if not, skip it */ + if (pps[lcv] == PGO_DONTCARE) + continue; + + /* lookup page */ + ptmp = uvm_pagelookup(uobj, current_offset); + + /* null? attempt to allocate the page */ + if (ptmp == NULL) { + ptmp = uvm_pagealloc(uobj, current_offset, + NULL); + if (ptmp) { + /* new page */ + ptmp->flags &= ~(PG_BUSY|PG_FAKE); + UVM_PAGE_OWN(ptmp, NULL); + uvm_pagezero(ptmp); + } + } + + /* + * to be useful must get a non-busy, non-released page + */ + if (ptmp == NULL || + (ptmp->flags & (PG_BUSY|PG_RELEASED)) != 0) { + if (lcv == centeridx || + (flags & PGO_ALLPAGES) != 0) + /* need to do a wait or I/O! */ + done = FALSE; + continue; + } + + /* + * useful page: busy/lock it and plug it in our + * result array + */ + + /* caller must un-busy this page */ + ptmp->flags |= PG_BUSY; + UVM_PAGE_OWN(ptmp, "uvm_km_get1"); + pps[lcv] = ptmp; + gotpages++; + + } /* "for" lcv loop */ + + /* + * step 1b: now we've either done everything needed or we + * to unlock and do some waiting or I/O. + */ + + UVMHIST_LOG(maphist, "<- done (done=%d)", done, 0,0,0); + + *npagesp = gotpages; + if (done) + return(VM_PAGER_OK); /* bingo! */ + else + return(VM_PAGER_UNLOCK); /* EEK! Need to + * unlock and I/O */ + } + + /* + * step 2: get non-resident or busy pages. + * object is locked. data structures are unlocked. 
+ */ + + for (lcv = 0, current_offset = offset ; + lcv < maxpages ; lcv++, current_offset += PAGE_SIZE) { + + /* skip over pages we've already gotten or don't want */ + /* skip over pages we don't _have_ to get */ + if (pps[lcv] != NULL || + (lcv != centeridx && (flags & PGO_ALLPAGES) == 0)) + continue; + + /* + * we have yet to locate the current page (pps[lcv]). we + * first look for a page that is already at the current offset. + * if we find a page, we check to see if it is busy or + * released. if that is the case, then we sleep on the page + * until it is no longer busy or released and repeat the + * lookup. if the page we found is neither busy nor + * released, then we busy it (so we own it) and plug it into + * pps[lcv]. this 'break's the following while loop and + * indicates we are ready to move on to the next page in the + * "lcv" loop above. + * + * if we exit the while loop with pps[lcv] still set to NULL, + * then it means that we allocated a new busy/fake/clean page + * ptmp in the object and we need to do I/O to fill in the + * data. + */ + + while (pps[lcv] == NULL) { /* top of "pps" while loop */ + + /* look for a current page */ + ptmp = uvm_pagelookup(uobj, current_offset); + + /* nope? allocate one now (if we can) */ + if (ptmp == NULL) { + + ptmp = uvm_pagealloc(uobj, current_offset, + NULL); /* alloc */ + + /* out of RAM? */ + if (ptmp == NULL) { + simple_unlock(&uobj->vmobjlock); + uvm_wait("kmgetwait1"); + simple_lock(&uobj->vmobjlock); + /* goto top of pps while loop */ + continue; + } + + /* + * got new page ready for I/O. break pps + * while loop. pps[lcv] is still NULL. + */ + break; + } + + /* page is there, see if we need to wait on it */ + if ((ptmp->flags & (PG_BUSY|PG_RELEASED)) != 0) { + ptmp->flags |= PG_WANTED; + UVM_UNLOCK_AND_WAIT(ptmp,&uobj->vmobjlock, 0, + "uvn_get",0); + simple_lock(&uobj->vmobjlock); + continue; /* goto top of pps while loop */ + } + + /* + * if we get here then the page has become resident + * and unbusy between steps 1 and 2. we busy it now + * (so we own it) and set pps[lcv] (so that we exit + * the while loop). caller must un-busy. + */ + ptmp->flags |= PG_BUSY; + UVM_PAGE_OWN(ptmp, "uvm_km_get2"); + pps[lcv] = ptmp; + } + + /* + * if we own the a valid page at the correct offset, pps[lcv] + * will point to it. nothing more to do except go to the + * next page. + */ + + if (pps[lcv]) + continue; /* next lcv */ + + /* + * we have a "fake/busy/clean" page that we just allocated. + * do the needed "i/o" (in this case that means zero it). + */ + + uvm_pagezero(ptmp); + ptmp->flags &= ~(PG_FAKE); + pps[lcv] = ptmp; + + } /* lcv loop */ + + /* + * finally, unlock object and return. + */ + + simple_unlock(&uobj->vmobjlock); + UVMHIST_LOG(maphist, "<- done (OK)",0,0,0,0); + return(VM_PAGER_OK); +} + +/* + * uvm_km_init: init kernel maps and objects to reflect reality (i.e. + * KVM already allocated for text, data, bss, and static data structures). + * + * => KVM is defined by VM_MIN_KERNEL_ADDRESS/VM_MAX_KERNEL_ADDRESS. + * we assume that [min -> start] has already been allocated and that + * "end" is the end. + */ + +void +uvm_km_init(start, end) + vaddr_t start, end; +{ + vaddr_t base = VM_MIN_KERNEL_ADDRESS; + + /* + * first, init kernel memory objects. 
+ */ + + /* kernel_object: for pageable anonymous kernel memory */ + uvm.kernel_object = uao_create(VM_MAX_KERNEL_ADDRESS - + VM_MIN_KERNEL_ADDRESS, UAO_FLAG_KERNOBJ); + + /* kmem_object: for malloc'd memory (wired, protected by splimp) */ + simple_lock_init(&kmem_object_store.vmobjlock); + kmem_object_store.pgops = &km_pager; + TAILQ_INIT(&kmem_object_store.memq); + kmem_object_store.uo_npages = 0; + /* we are special. we never die */ + kmem_object_store.uo_refs = UVM_OBJ_KERN; + uvmexp.kmem_object = &kmem_object_store; + + /* mb_object: for mbuf memory (always wired, protected by splimp) */ + simple_lock_init(&mb_object_store.vmobjlock); + mb_object_store.pgops = &km_pager; + TAILQ_INIT(&mb_object_store.memq); + mb_object_store.uo_npages = 0; + /* we are special. we never die */ + mb_object_store.uo_refs = UVM_OBJ_KERN; + uvmexp.mb_object = &mb_object_store; + + /* + * init the map and reserve allready allocated kernel space + * before installing. + */ + + uvm_map_setup(&kernel_map_store, base, end, FALSE); + kernel_map_store.pmap = pmap_kernel(); + if (uvm_map(&kernel_map_store, &base, start - base, NULL, + UVM_UNKNOWN_OFFSET, UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, + UVM_INH_NONE, UVM_ADV_RANDOM,UVM_FLAG_FIXED)) != KERN_SUCCESS) + panic("uvm_km_init: could not reserve space for kernel"); + + /* + * install! + */ + + kernel_map = &kernel_map_store; +} + +/* + * uvm_km_suballoc: allocate a submap in the kernel map. once a submap + * is allocated all references to that area of VM must go through it. this + * allows the locking of VAs in kernel_map to be broken up into regions. + * + * => if `fixed' is true, *min specifies where the region described + * by the submap must start + * => if submap is non NULL we use that as the submap, otherwise we + * alloc a new map + */ +struct vm_map * +uvm_km_suballoc(map, min, max, size, pageable, fixed, submap) + struct vm_map *map; + vaddr_t *min, *max; /* OUT, OUT */ + vsize_t size; + boolean_t pageable; + boolean_t fixed; + struct vm_map *submap; +{ + int mapflags = UVM_FLAG_NOMERGE | (fixed ? UVM_FLAG_FIXED : 0); + + size = round_page(size); /* round up to pagesize */ + + /* + * first allocate a blank spot in the parent map + */ + + if (uvm_map(map, min, size, NULL, UVM_UNKNOWN_OFFSET, + UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE, + UVM_ADV_RANDOM, mapflags)) != KERN_SUCCESS) { + panic("uvm_km_suballoc: unable to allocate space in parent map"); + } + + /* + * set VM bounds (min is filled in by uvm_map) + */ + + *max = *min + size; + + /* + * add references to pmap and create or init the submap + */ + + pmap_reference(vm_map_pmap(map)); + if (submap == NULL) { + submap = uvm_map_create(vm_map_pmap(map), *min, *max, pageable); + if (submap == NULL) + panic("uvm_km_suballoc: unable to create submap"); + } else { + uvm_map_setup(submap, *min, *max, pageable); + submap->pmap = vm_map_pmap(map); + } + + /* + * now let uvm_map_submap plug in it... + */ + + if (uvm_map_submap(map, *min, *max, submap) != KERN_SUCCESS) + panic("uvm_km_suballoc: submap allocation failed"); + + return(submap); +} + +/* + * uvm_km_pgremove: remove pages from a kernel uvm_object. + * + * => when you unmap a part of anonymous kernel memory you want to toss + * the pages right away. (this gets called from uvm_unmap_...). 
+ */ + +#define UKM_HASH_PENALTY 4 /* a guess */ + +void +uvm_km_pgremove(uobj, start, end) + struct uvm_object *uobj; + vaddr_t start, end; +{ + boolean_t by_list, is_aobj; + struct vm_page *pp, *ppnext; + vaddr_t curoff; + UVMHIST_FUNC("uvm_km_pgremove"); UVMHIST_CALLED(maphist); + + simple_lock(&uobj->vmobjlock); /* lock object */ + + /* is uobj an aobj? */ + is_aobj = uobj->pgops == &aobj_pager; + + /* choose cheapest traversal */ + by_list = (uobj->uo_npages <= + ((end - start) >> PAGE_SHIFT) * UKM_HASH_PENALTY); + + if (by_list) + goto loop_by_list; + + /* by hash */ + + for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) { + pp = uvm_pagelookup(uobj, curoff); + if (pp == NULL) + continue; + + UVMHIST_LOG(maphist," page 0x%x, busy=%d", pp, + pp->flags & PG_BUSY, 0, 0); + /* now do the actual work */ + if (pp->flags & PG_BUSY) + /* owner must check for this when done */ + pp->flags |= PG_RELEASED; + else { + pmap_page_protect(PMAP_PGARG(pp), VM_PROT_NONE); + + /* + * if this kernel object is an aobj, free the swap slot. + */ + if (is_aobj) { + int slot = uao_set_swslot(uobj, + curoff >> PAGE_SHIFT, + 0); + + if (slot) + uvm_swap_free(slot, 1); + } + + uvm_lock_pageq(); + uvm_pagefree(pp); + uvm_unlock_pageq(); + } + /* done */ + + } + simple_unlock(&uobj->vmobjlock); + return; + +loop_by_list: + + for (pp = uobj->memq.tqh_first ; pp != NULL ; pp = ppnext) { + + ppnext = pp->listq.tqe_next; + if (pp->offset < start || pp->offset >= end) { + continue; + } + + UVMHIST_LOG(maphist," page 0x%x, busy=%d", pp, + pp->flags & PG_BUSY, 0, 0); + /* now do the actual work */ + if (pp->flags & PG_BUSY) + /* owner must check for this when done */ + pp->flags |= PG_RELEASED; + else { + pmap_page_protect(PMAP_PGARG(pp), VM_PROT_NONE); + + /* + * if this kernel object is an aobj, free the swap slot. + */ + if (is_aobj) { + int slot = uao_set_swslot(uobj, + pp->offset >> PAGE_SHIFT, 0); + + if (slot) + uvm_swap_free(slot, 1); + } + + uvm_lock_pageq(); + uvm_pagefree(pp); + uvm_unlock_pageq(); + } + /* done */ + + } + simple_unlock(&uobj->vmobjlock); + return; +} + + +/* + * uvm_km_kmemalloc: lower level kernel memory allocator for malloc() + * + * => we map wired memory into the specified map using the obj passed in + * => NOTE: we can return NULL even if we can wait if there is not enough + * free VM space in the map... caller should be prepared to handle + * this case. 
+ * => we return KVA of memory allocated + * => flags: NOWAIT, VALLOC - just allocate VA, TRYLOCK - fail if we can't + * lock the map + */ + +vaddr_t +uvm_km_kmemalloc(map, obj, size, flags) + vm_map_t map; + struct uvm_object *obj; + vsize_t size; + int flags; +{ + vaddr_t kva, loopva; + vaddr_t offset; + struct vm_page *pg; + UVMHIST_FUNC("uvm_km_kmemalloc"); UVMHIST_CALLED(maphist); + + + UVMHIST_LOG(maphist," (map=0x%x, obj=0x%x, size=0x%x, flags=%d)", + map, obj, size, flags); +#ifdef DIAGNOSTIC + /* sanity check */ + if (vm_map_pmap(map) != pmap_kernel()) + panic("uvm_km_kmemalloc: invalid map"); +#endif + + /* + * setup for call + */ + + size = round_page(size); + kva = vm_map_min(map); /* hint */ + + /* + * allocate some virtual space + */ + + if (uvm_map(map, &kva, size, obj, UVM_UNKNOWN_OFFSET, + UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE, + UVM_ADV_RANDOM, (flags & UVM_KMF_TRYLOCK))) + != KERN_SUCCESS) { + UVMHIST_LOG(maphist, "<- done (no VM)",0,0,0,0); + return(0); + } + + /* + * if all we wanted was VA, return now + */ + + if (flags & UVM_KMF_VALLOC) { + UVMHIST_LOG(maphist,"<- done valloc (kva=0x%x)", kva,0,0,0); + return(kva); + } + /* + * recover object offset from virtual address + */ + + offset = kva - vm_map_min(kernel_map); + UVMHIST_LOG(maphist, " kva=0x%x, offset=0x%x", kva, offset,0,0); + + /* + * now allocate and map in the memory... note that we are the only ones + * whom should ever get a handle on this area of VM. + */ + + loopva = kva; + while (size) { + simple_lock(&obj->vmobjlock); + pg = uvm_pagealloc(obj, offset, NULL); + if (pg) { + pg->flags &= ~PG_BUSY; /* new page */ + UVM_PAGE_OWN(pg, NULL); + } + simple_unlock(&obj->vmobjlock); + + /* + * out of memory? + */ + + if (pg == NULL) { + if (flags & UVM_KMF_NOWAIT) { + /* free everything! */ + uvm_unmap(map, kva, kva + size); + return(0); + } else { + uvm_wait("km_getwait2"); /* sleep here */ + continue; + } + } + + /* + * map it in: note that we call pmap_enter with the map and + * object unlocked in case we are kmem_map/kmem_object + * (because if pmap_enter wants to allocate out of kmem_object + * it will need to lock it itself!) + */ +#if defined(PMAP_NEW) + pmap_kenter_pa(loopva, VM_PAGE_TO_PHYS(pg), VM_PROT_ALL); +#else + pmap_enter(map->pmap, loopva, VM_PAGE_TO_PHYS(pg), + UVM_PROT_ALL, TRUE); +#endif + loopva += PAGE_SIZE; + offset += PAGE_SIZE; + size -= PAGE_SIZE; + } + + UVMHIST_LOG(maphist,"<- done (kva=0x%x)", kva,0,0,0); + return(kva); +} + +/* + * uvm_km_free: free an area of kernel memory + */ + +void +uvm_km_free(map, addr, size) + vm_map_t map; + vaddr_t addr; + vsize_t size; +{ + + uvm_unmap(map, trunc_page(addr), round_page(addr+size)); +} + +/* + * uvm_km_free_wakeup: free an area of kernel memory and wake up + * anyone waiting for vm space. + * + * => XXX: "wanted" bit + unlock&wait on other end? + */ + +void +uvm_km_free_wakeup(map, addr, size) + vm_map_t map; + vaddr_t addr; + vsize_t size; +{ + vm_map_entry_t dead_entries; + + vm_map_lock(map); + (void)uvm_unmap_remove(map, trunc_page(addr), round_page(addr+size), + &dead_entries); + thread_wakeup(map); + vm_map_unlock(map); + + if (dead_entries != NULL) + uvm_unmap_detach(dead_entries, 0); +} + +/* + * uvm_km_alloc1: allocate wired down memory in the kernel map. 
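/*
 * illustrative sketch (editorial, not from this import): a typical
 * malloc()-style use of uvm_km_kmemalloc() on kmem_map, which must be
 * protected by splimp() as the overview at the top of this file notes
 * (compare uvm_km_alloc_poolpage1() further below).  "canwait" is an
 * illustrative flag for whether the caller may sleep.
 */
	vaddr_t va;
	int s;

	s = splimp();			/* kmem_map is used at interrupt time */
	va = uvm_km_kmemalloc(kmem_map, uvmexp.kmem_object, PAGE_SIZE,
	    canwait ? 0 : UVM_KMF_NOWAIT);
	splx(s);

	if (va == 0)
		return (NULL);		/* no VM space (or no memory w/ NOWAIT) */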
+ * + * => we can sleep if needed + */ + +vaddr_t +uvm_km_alloc1(map, size, zeroit) + vm_map_t map; + vsize_t size; + boolean_t zeroit; +{ + vaddr_t kva, loopva, offset; + struct vm_page *pg; + UVMHIST_FUNC("uvm_km_alloc1"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist,"(map=0x%x, size=0x%x)", map, size,0,0); + +#ifdef DIAGNOSTIC + if (vm_map_pmap(map) != pmap_kernel()) + panic("uvm_km_alloc1"); +#endif + + size = round_page(size); + kva = vm_map_min(map); /* hint */ + + /* + * allocate some virtual space + */ + + if (uvm_map(map, &kva, size, uvm.kernel_object, UVM_UNKNOWN_OFFSET, + UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE, + UVM_ADV_RANDOM, 0)) != KERN_SUCCESS) { + UVMHIST_LOG(maphist,"<- done (no VM)",0,0,0,0); + return(0); + } + + /* + * recover object offset from virtual address + */ + + offset = kva - vm_map_min(kernel_map); + UVMHIST_LOG(maphist," kva=0x%x, offset=0x%x", kva, offset,0,0); + + /* + * now allocate the memory. we must be careful about released pages. + */ + + loopva = kva; + while (size) { + simple_lock(&uvm.kernel_object->vmobjlock); + pg = uvm_pagelookup(uvm.kernel_object, offset); + + /* + * if we found a page in an unallocated region, it must be + * released + */ + if (pg) { + if ((pg->flags & PG_RELEASED) == 0) + panic("uvm_km_alloc1: non-released page"); + pg->flags |= PG_WANTED; + UVM_UNLOCK_AND_WAIT(pg, &uvm.kernel_object->vmobjlock, + 0, "km_alloc", 0); + continue; /* retry */ + } + + /* allocate ram */ + pg = uvm_pagealloc(uvm.kernel_object, offset, NULL); + if (pg) { + pg->flags &= ~PG_BUSY; /* new page */ + UVM_PAGE_OWN(pg, NULL); + } + simple_unlock(&uvm.kernel_object->vmobjlock); + if (pg == NULL) { + uvm_wait("km_alloc1w"); /* wait for memory */ + continue; + } + + /* map it in */ +#if defined(PMAP_NEW) + pmap_kenter_pa(loopva, VM_PAGE_TO_PHYS(pg), UVM_PROT_ALL); +#else + pmap_enter(map->pmap, loopva, VM_PAGE_TO_PHYS(pg), + UVM_PROT_ALL, TRUE); +#endif + loopva += PAGE_SIZE; + offset += PAGE_SIZE; + size -= PAGE_SIZE; + } + + /* + * zero on request (note that "size" is now zero due to the above loop + * so we need to subtract kva from loopva to reconstruct the size). + */ + + if (zeroit) + bzero((caddr_t)kva, loopva - kva); + + UVMHIST_LOG(maphist,"<- done (kva=0x%x)", kva,0,0,0); + return(kva); +} + +/* + * uvm_km_valloc: allocate zero-fill memory in the kernel's address space + * + * => memory is not allocated until fault time + */ + +vaddr_t +uvm_km_valloc(map, size) + vm_map_t map; + vsize_t size; +{ + vaddr_t kva; + UVMHIST_FUNC("uvm_km_valloc"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist, "(map=0x%x, size=0x%x)", map, size, 0,0); + +#ifdef DIAGNOSTIC + if (vm_map_pmap(map) != pmap_kernel()) + panic("uvm_km_valloc"); +#endif + + size = round_page(size); + kva = vm_map_min(map); /* hint */ + + /* + * allocate some virtual space. will be demand filled by kernel_object. 
+ */ + + if (uvm_map(map, &kva, size, uvm.kernel_object, UVM_UNKNOWN_OFFSET, + UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE, + UVM_ADV_RANDOM, 0)) != KERN_SUCCESS) { + UVMHIST_LOG(maphist, "<- done (no VM)", 0,0,0,0); + return(0); + } + + UVMHIST_LOG(maphist, "<- done (kva=0x%x)", kva,0,0,0); + return(kva); +} + +/* + * uvm_km_valloc_wait: allocate zero-fill memory in the kernel's address space + * + * => memory is not allocated until fault time + * => if no room in map, wait for space to free, unless requested size + * is larger than map (in which case we return 0) + */ + +vaddr_t +uvm_km_valloc_wait(map, size) + vm_map_t map; + vsize_t size; +{ + vaddr_t kva; + UVMHIST_FUNC("uvm_km_valloc_wait"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist, "(map=0x%x, size=0x%x)", map, size, 0,0); + +#ifdef DIAGNOSTIC + if (vm_map_pmap(map) != pmap_kernel()) + panic("uvm_km_valloc_wait"); +#endif + + size = round_page(size); + if (size > vm_map_max(map) - vm_map_min(map)) + return(0); + + while (1) { + kva = vm_map_min(map); /* hint */ + + /* + * allocate some virtual space. will be demand filled + * by kernel_object. + */ + + if (uvm_map(map, &kva, size, uvm.kernel_object, + UVM_UNKNOWN_OFFSET, UVM_MAPFLAG(UVM_PROT_ALL, + UVM_PROT_ALL, UVM_INH_NONE, UVM_ADV_RANDOM, 0)) + == KERN_SUCCESS) { + UVMHIST_LOG(maphist,"<- done (kva=0x%x)", kva,0,0,0); + return(kva); + } + + /* + * failed. sleep for a while (on map) + */ + + UVMHIST_LOG(maphist,"<<<sleeping>>>",0,0,0,0); + tsleep((caddr_t)map, PVM, "vallocwait", 0); + } + /*NOTREACHED*/ +} + +/* Sanity; must specify both or none. */ +#if (defined(PMAP_MAP_POOLPAGE) || defined(PMAP_UNMAP_POOLPAGE)) && \ + (!defined(PMAP_MAP_POOLPAGE) || !defined(PMAP_UNMAP_POOLPAGE)) +#error Must specify MAP and UNMAP together. +#endif + +/* + * uvm_km_alloc_poolpage: allocate a page for the pool allocator + * + * => if the pmap specifies an alternate mapping method, we use it. + */ + +/* ARGSUSED */ +vaddr_t +uvm_km_alloc_poolpage1(map, obj, waitok) + vm_map_t map; + struct uvm_object *obj; + boolean_t waitok; +{ +#if defined(PMAP_MAP_POOLPAGE) + struct vm_page *pg; + vaddr_t va; + + again: + pg = uvm_pagealloc(NULL, 0, NULL); + if (pg == NULL) { + if (waitok) { + uvm_wait("plpg"); + goto again; + } else + return (0); + } + va = PMAP_MAP_POOLPAGE(VM_PAGE_TO_PHYS(pg)); + if (va == 0) + uvm_pagefree(pg); + return (va); +#else + vaddr_t va; + int s; + + /* + * NOTE: We may be called with a map that doens't require splimp + * protection (e.g. kernel_map). However, it does not hurt to + * go to splimp in this case (since unprocted maps will never be + * accessed in interrupt context). + * + * XXX We may want to consider changing the interface to this + * XXX function. + */ + + s = splimp(); + va = uvm_km_kmemalloc(map, obj, PAGE_SIZE, waitok ? 0 : UVM_KMF_NOWAIT); + splx(s); + return (va); +#endif /* PMAP_MAP_POOLPAGE */ +} + +/* + * uvm_km_free_poolpage: free a previously allocated pool page + * + * => if the pmap specifies an alternate unmapping method, we use it. + */ + +/* ARGSUSED */ +void +uvm_km_free_poolpage1(map, addr) + vm_map_t map; + vaddr_t addr; +{ +#if defined(PMAP_UNMAP_POOLPAGE) + paddr_t pa; + + pa = PMAP_UNMAP_POOLPAGE(addr); + uvm_pagefree(PHYS_TO_VM_PAGE(pa)); +#else + int s; + + /* + * NOTE: We may be called with a map that doens't require splimp + * protection (e.g. kernel_map). However, it does not hurt to + * go to splimp in this case (since unprocted maps will never be + * accessed in interrupt context). 
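[Illustrative annotation, not part of the imported source.] Worth spelling out: uvm_km_valloc_wait() above tsleep()s on the map itself when the submap is full, and uvm_km_free_wakeup() earlier in this file is the matching release that issues the thread_wakeup() on that map. A hedged pairing sketch, where "example_submap" and "npages" are hypothetical:

	vaddr_t va;
	vsize_t len = npages << PAGE_SHIFT;

	va = uvm_km_valloc_wait(example_submap, len);	/* may tsleep on the map */
	if (va == 0)
		return;		/* request larger than the whole submap */
	/* ... use the zero-fill, demand-paged range ... */
	uvm_km_free_wakeup(example_submap, va, len);	/* wakes any sleeper on the map */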
+ * + * XXX We may want to consider changing the interface to this + * XXX function. + */ + + s = splimp(); + uvm_km_free(map, addr, PAGE_SIZE); + splx(s); +#endif /* PMAP_UNMAP_POOLPAGE */ +} diff --git a/sys/uvm/uvm_km.h b/sys/uvm/uvm_km.h new file mode 100644 index 00000000000..ba941255020 --- /dev/null +++ b/sys/uvm/uvm_km.h @@ -0,0 +1,55 @@ +/* $NetBSD: uvm_km.h,v 1.6 1998/08/13 02:11:01 eeh Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_km.h,v 1.1.2.2 1997/12/30 12:03:15 mrg Exp + */ + +#ifndef _UVM_UVM_KM_H_ +#define _UVM_UVM_KM_H_ + +/* + * uvm_km.h + */ + +/* + * prototypes + */ + +void uvm_km_init __P((vaddr_t, vaddr_t)); +void uvm_km_pgremove __P((struct uvm_object *, vaddr_t, vaddr_t)); + +#endif /* _UVM_UVM_KM_H_ */ diff --git a/sys/uvm/uvm_loan.c b/sys/uvm/uvm_loan.c new file mode 100644 index 00000000000..d8716b46f52 --- /dev/null +++ b/sys/uvm/uvm_loan.c @@ -0,0 +1,755 @@ +/* $NetBSD: uvm_loan.c,v 1.13 1999/01/24 23:53:15 chuck Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_loan.c,v 1.1.6.4 1998/02/06 05:08:43 chs Exp + */ + +/* + * uvm_loan.c: page loanout handler + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mman.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#include <uvm/uvm.h> + +/* + * "loaned" pages are pages which are (read-only, copy-on-write) loaned + * from the VM system to other parts of the kernel. this allows page + * copying to be avoided (e.g. you can loan pages from objs/anons to + * the mbuf system). + * + * there are 3 types of loans possible: + * O->K uvm_object page to wired kernel page (e.g. mbuf data area) + * A->K anon page to kernel wired kernel page (e.g. mbuf data area) + * O->A uvm_object to anon loan (e.g. vnode page to an anon) + * note that it possible to have an O page loaned to both an A and K + * at the same time. + * + * loans are tracked by pg->loan_count. an O->A page will have both + * a uvm_object and a vm_anon, but PQ_ANON will not be set. this sort + * of page is considered "owned" by the uvm_object (not the anon). + * + * each loan of a page to a wired kernel page bumps the pg->wire_count. + * wired kernel mappings should be entered with pmap_kenter functions + * so that pmap_page_protect() will not affect the kernel mappings. + * (this requires the PMAP_NEW interface...). + * + * owners that want to free their pages and discover that they are + * loaned out simply "disown" them (the page becomes an orphan). these + * pages should be freed when the last loan is dropped. in some cases + * an anon may "adopt" an orphaned page. + * + * locking: to read pg->loan_count either the owner or the page queues + * must be locked. to modify pg->loan_count, both the owner of the page + * and the PQs must be locked. pg->flags is (as always) locked by + * the owner of the page. + * + * note that locking from the "loaned" side is tricky since the object + * getting the loaned page has no reference to the page's owner and thus + * the owner could "die" at any time. in order to prevent the owner + * from dying the page queues should be locked. this forces us to sometimes + * use "try" locking. + * + * loans are typically broken by the following events: + * 1. write fault to a loaned page + * 2. pageout of clean+inactive O->A loaned page + * 3. owner frees page (e.g. 
pager flush) + * + * note that loaning a page causes all mappings of the page to become + * read-only (via pmap_page_protect). this could have an unexpected + * effect on normal "wired" pages if one is not careful. + */ + +/* + * local prototypes + */ + +static int uvm_loananon __P((struct uvm_faultinfo *, void ***, + int, struct vm_anon *)); +static int uvm_loanentry __P((struct uvm_faultinfo *, void ***, int)); +static int uvm_loanuobj __P((struct uvm_faultinfo *, void ***, + int, vaddr_t)); +static int uvm_loanzero __P((struct uvm_faultinfo *, void ***, int)); + +/* + * inlines + */ + +/* + * uvm_loanentry: loan out pages in a map entry (helper fn for uvm_loan()) + * + * => "ufi" is the result of a successful map lookup (meaning that + * the maps are locked by the caller) + * => we may unlock the maps if needed (for I/O) + * => we put our output result in "output" + * => we return the number of pages we loaned, or -1 if we had an error + */ + +static __inline int +uvm_loanentry(ufi, output, flags) + struct uvm_faultinfo *ufi; + void ***output; + int flags; +{ + vaddr_t curaddr = ufi->orig_rvaddr; + vsize_t togo = ufi->size; + struct vm_aref *aref = &ufi->entry->aref; + struct uvm_object *uobj = ufi->entry->object.uvm_obj; + struct vm_anon *anon; + int rv, result = 0; + + /* + * lock us the rest of the way down + */ + if (aref->ar_amap) + amap_lock(aref->ar_amap); + if (uobj) + simple_lock(&uobj->vmobjlock); + + /* + * loop until done + */ + while (togo) { + + /* + * find the page we want. check the anon layer first. + */ + + if (aref->ar_amap) { + anon = amap_lookup(aref, curaddr - ufi->entry->start); + } else { + anon = NULL; + } + + if (anon) { + rv = uvm_loananon(ufi, output, flags, anon); + } else if (uobj) { + rv = uvm_loanuobj(ufi, output, flags, curaddr); + } else if (UVM_ET_ISCOPYONWRITE(ufi->entry)) { + rv = uvm_loanzero(ufi, output, flags); + } else { + rv = -1; /* null map entry... fail now */ + } + + /* total failure */ + if (rv < 0) + return(-1); + + /* relock failed, need to do another lookup */ + if (rv == 0) + return(result); + + /* + * got it... advance to next page + */ + result++; + togo -= PAGE_SIZE; + curaddr += PAGE_SIZE; + } + + /* + * unlock everything and return + */ + uvmfault_unlockall(ufi, aref->ar_amap, uobj, NULL); + return(result); +} + +/* + * normal functions + */ + +/* + * uvm_loan: loan pages out to anons or to the kernel + * + * => map should be unlocked + * => start and len should be multiples of PAGE_SIZE + * => result is either an array of anon's or vm_pages (depending on flags) + * => flag values: UVM_LOAN_TOANON - loan to anons + * UVM_LOAN_TOPAGE - loan to wired kernel page + * one and only one of these flags must be set! + */ + +int +uvm_loan(map, start, len, result, flags) + struct vm_map *map; + vaddr_t start; + vsize_t len; + void **result; + int flags; +{ + struct uvm_faultinfo ufi; + void **output; + int rv; + + /* + * ensure that one and only one of the flags is set + */ + + if ((flags & (UVM_LOAN_TOANON|UVM_LOAN_TOPAGE)) == + (UVM_LOAN_TOANON|UVM_LOAN_TOPAGE) || + (flags & (UVM_LOAN_TOANON|UVM_LOAN_TOPAGE)) == 0) + return(KERN_FAILURE); + + /* + * "output" is a pointer to the current place to put the loaned + * page... + */ + + output = &result[0]; /* start at the beginning ... 
*/ + + /* + * while we've got pages to do + */ + + while (len > 0) { + + /* + * fill in params for a call to uvmfault_lookup + */ + + ufi.orig_map = map; + ufi.orig_rvaddr = start; + ufi.orig_size = len; + + /* + * do the lookup, the only time this will fail is if we hit on + * an unmapped region (an error) + */ + + if (!uvmfault_lookup(&ufi, FALSE)) + goto fail; + + /* + * now do the loanout + */ + rv = uvm_loanentry(&ufi, &output, flags); + if (rv < 0) + goto fail; + + /* + * done! advance pointers and unlock. + */ + rv <<= PAGE_SHIFT; + len -= rv; + start += rv; + uvmfault_unlockmaps(&ufi, FALSE); + } + + /* + * got it! return success. + */ + + return(KERN_SUCCESS); + +fail: + /* + * fail: failed to do it. drop our loans and return failure code. + */ + if (output - result) { + if (flags & UVM_LOAN_TOANON) + uvm_unloananon((struct vm_anon **)result, + output - result); + else + uvm_unloanpage((struct vm_page **)result, + output - result); + } + return(KERN_FAILURE); +} + +/* + * uvm_loananon: loan a page from an anon out + * + * => return value: + * -1 = fatal error, everything is unlocked, abort. + * 0 = lookup in ufi went stale, everything unlocked, relookup and + * try again + * 1 = got it, everything still locked + */ + +int +uvm_loananon(ufi, output, flags, anon) + struct uvm_faultinfo *ufi; + void ***output; + int flags; + struct vm_anon *anon; +{ + struct vm_page *pg; + int result; + + /* + * if we are loaning to another anon then it is easy, we just + * bump the reference count on the current anon and return a + * pointer to it. + */ + if (flags & UVM_LOAN_TOANON) { + simple_lock(&anon->an_lock); + pg = anon->u.an_page; + if (pg && (pg->pqflags & PQ_ANON) != 0 && anon->an_ref == 1) + /* read protect it */ + pmap_page_protect(PMAP_PGARG(pg), VM_PROT_READ); + anon->an_ref++; + **output = anon; + *output = (*output) + 1; + simple_unlock(&anon->an_lock); + return(1); + } + + /* + * we are loaning to a kernel-page. we need to get the page + * resident so we can wire it. uvmfault_anonget will handle + * this for us. + */ + + simple_lock(&anon->an_lock); + result = uvmfault_anonget(ufi, ufi->entry->aref.ar_amap, anon); + + /* + * if we were unable to get the anon, then uvmfault_anonget has + * unlocked everything and returned an error code. + */ + + if (result != VM_PAGER_OK) { + + /* need to refault (i.e. refresh our lookup) ? */ + if (result == VM_PAGER_REFAULT) + return(0); + + /* "try again"? sleep a bit and retry ... */ + if (result == VM_PAGER_AGAIN) { + tsleep((caddr_t)&lbolt, PVM, "loanagain", 0); + return(0); + } + + /* otherwise flag it as an error */ + return(-1); + } + + /* + * we have the page and its owner locked: do the loan now. + */ + + pg = anon->u.an_page; + uvm_lock_pageq(); + if (pg->loan_count == 0) + pmap_page_protect(PMAP_PGARG(pg), VM_PROT_READ); + pg->loan_count++; + uvm_pagewire(pg); /* always wire it */ + uvm_unlock_pageq(); + **output = pg; + *output = (*output) + 1; + + /* unlock anon and return success */ + if (pg->uobject) + simple_unlock(&pg->uobject->vmobjlock); + simple_unlock(&anon->an_lock); + return(1); +} + +/* + * uvm_loanuobj: loan a page from a uobj out + * + * => return value: + * -1 = fatal error, everything is unlocked, abort. 
+ * 0 = lookup in ufi went stale, everything unlocked, relookup and + * try again + * 1 = got it, everything still locked + */ + +int +uvm_loanuobj(ufi, output, flags, va) + struct uvm_faultinfo *ufi; + void ***output; + int flags; + vaddr_t va; +{ + struct vm_amap *amap = ufi->entry->aref.ar_amap; + struct uvm_object *uobj = ufi->entry->object.uvm_obj; + struct vm_page *pg; + struct vm_anon *anon; + int result, npages; + boolean_t locked; + + /* + * first we must make sure the page is resident. + * + * XXXCDC: duplicate code with uvm_fault(). + */ + + if (uobj->pgops->pgo_get) { + npages = 1; + pg = NULL; + result = uobj->pgops->pgo_get(uobj, va - ufi->entry->start, + &pg, &npages, 0, VM_PROT_READ, MADV_NORMAL, PGO_LOCKED); + } else { + result = VM_PAGER_ERROR; + } + + /* + * check the result of the locked pgo_get. if there is a problem, + * then we fail the loan. + */ + + if (result != VM_PAGER_OK && result != VM_PAGER_UNLOCK) { + uvmfault_unlockall(ufi, amap, uobj, NULL); + return(-1); + } + + /* + * if we need to unlock for I/O, do so now. + */ + + if (result == VM_PAGER_UNLOCK) { + uvmfault_unlockall(ufi, amap, NULL, NULL); + + npages = 1; + /* locked: uobj */ + result = uobj->pgops->pgo_get(uobj, va - ufi->entry->start, + &pg, &npages, 0, VM_PROT_READ, MADV_NORMAL, 0); + /* locked: <nothing> */ + + /* + * check for errors + */ + + if (result != VM_PAGER_OK) { + if (result == VM_PAGER_AGAIN) { + tsleep((caddr_t)&lbolt, PVM, "fltagain2", 0); + return(0); /* redo the lookup and try again */ + } + return(-1); /* total failure */ + } + + /* + * pgo_get was a success. attempt to relock everything. + */ + + locked = uvmfault_relock(ufi); + if (locked && amap) + amap_lock(amap); + simple_lock(&uobj->vmobjlock); + + /* + * verify that the page has not be released and re-verify + * that amap slot is still free. if there is a problem we + * drop our lock (thus force a lookup refresh/retry). + */ + + if ((pg->flags & PG_RELEASED) != 0 || + (locked && amap && amap_lookup(&ufi->entry->aref, + ufi->orig_rvaddr - ufi->entry->start))) { + + if (locked) + uvmfault_unlockall(ufi, amap, NULL, NULL); + locked = FALSE; + } + + /* + * didn't get the lock? release the page and retry. + */ + + if (locked == FALSE) { + + if (pg->flags & PG_WANTED) + /* still holding object lock */ + thread_wakeup(pg); + + if (pg->flags & PG_RELEASED) { +#ifdef DIAGNOSTIC + if (uobj->pgops->pgo_releasepg == NULL) + panic("uvm_loanuobj: object has no releasepg function"); +#endif + /* frees page */ + if (uobj->pgops->pgo_releasepg(pg, NULL)) + simple_unlock(&uobj->vmobjlock); + return (0); + } + + uvm_lock_pageq(); + uvm_pageactivate(pg); /* make sure it is in queues */ + uvm_unlock_pageq(); + pg->flags &= ~(PG_BUSY|PG_WANTED); + UVM_PAGE_OWN(pg, NULL); + simple_unlock(&uobj->vmobjlock); + return (0); + } + } + + /* + * at this point we have the page we want ("pg") marked PG_BUSY for us + * and we have all data structures locked. do the loanout. page can + * not be PG_RELEASED (we caught this above). + */ + + if ((flags & UVM_LOAN_TOANON) == 0) { /* loan to wired-kernel page? */ + uvm_lock_pageq(); + if (pg->loan_count == 0) + pmap_page_protect(PMAP_PGARG(pg), VM_PROT_READ); + pg->loan_count++; + uvm_pagewire(pg); + uvm_unlock_pageq(); + **output = pg; + *output = (*output) + 1; + if (pg->flags & PG_WANTED) + thread_wakeup(pg); + pg->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pg, NULL); + return(1); /* got it! */ + } + + /* + * must be a loan to an anon. check to see if there is already + * an anon associated with this page. 
if so, then just return + * a reference to this object. the page should already be + * mapped read-only because it is already on loan. + */ + + if (pg->uanon) { + anon = pg->uanon; + simple_lock(&anon->an_lock); + anon->an_ref++; + simple_unlock(&anon->an_lock); + **output = anon; + *output = (*output) + 1; + uvm_lock_pageq(); + uvm_pageactivate(pg); /* reactivate */ + uvm_unlock_pageq(); + if (pg->flags & PG_WANTED) + thread_wakeup(pg); + pg->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pg, NULL); + return(1); + } + + /* + * need to allocate a new anon + */ + + anon = uvm_analloc(); + if (anon == NULL) { /* out of VM! */ + if (pg->flags & PG_WANTED) + thread_wakeup(pg); + pg->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pg, NULL); + uvmfault_unlockall(ufi, amap, uobj, NULL); + return(-1); + } + anon->u.an_page = pg; + pg->uanon = anon; + uvm_lock_pageq(); + if (pg->loan_count == 0) + pmap_page_protect(PMAP_PGARG(pg), VM_PROT_READ); + pg->loan_count++; + uvm_pageactivate(pg); + uvm_unlock_pageq(); + **output = anon; + *output = (*output) + 1; + if (pg->flags & PG_WANTED) + thread_wakeup(pg); + pg->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pg, NULL); + return(1); +} + +/* + * uvm_loanzero: "loan" a zero-fill page out + * + * => return value: + * -1 = fatal error, everything is unlocked, abort. + * 0 = lookup in ufi went stale, everything unlocked, relookup and + * try again + * 1 = got it, everything still locked + */ + +int +uvm_loanzero(ufi, output, flags) + struct uvm_faultinfo *ufi; + void ***output; + int flags; +{ + struct vm_anon *anon; + struct vm_page *pg; + + if ((flags & UVM_LOAN_TOANON) == 0) { /* loaning to kernel-page */ + + while ((pg = uvm_pagealloc(NULL, 0, NULL)) == NULL) { + uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, + ufi->entry->object.uvm_obj, NULL); + uvm_wait("loanzero1"); + if (!uvmfault_relock(ufi)) + return(0); + if (ufi->entry->aref.ar_amap) + amap_lock(ufi->entry->aref.ar_amap); + if (ufi->entry->object.uvm_obj) + simple_lock( + &ufi->entry->object.uvm_obj->vmobjlock); + /* ... and try again */ + } + + /* got a page, zero it and return */ + uvm_pagezero(pg); /* clears PG_CLEAN */ + pg->flags &= ~(PG_BUSY|PG_FAKE); + UVM_PAGE_OWN(pg, NULL); + **output = pg; + *output = (*output) + 1; + uvm_lock_pageq(); + /* wire it as we are loaning to kernel-page */ + uvm_pagewire(pg); + pg->loan_count = 1; + uvm_unlock_pageq(); + return(1); + } + + /* loaning to an anon */ + while ((anon = uvm_analloc()) == NULL || + (pg = uvm_pagealloc(NULL, 0, anon)) == NULL) { + + /* unlock everything */ + uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, + ufi->entry->object.uvm_obj, NULL); + + /* out of swap causes us to fail */ + if (anon == NULL) + return(-1); + + uvm_anfree(anon); + uvm_wait("loanzero2"); /* wait for pagedaemon */ + + if (!uvmfault_relock(ufi)) + /* map changed while unlocked, need relookup */ + return (0); + + /* relock everything else */ + if (ufi->entry->aref.ar_amap) + amap_lock(ufi->entry->aref.ar_amap); + if (ufi->entry->object.uvm_obj) + simple_lock(&ufi->entry->object.uvm_obj->vmobjlock); + /* ... 
and try again */ + } + + /* got a page, zero it and return */ + uvm_pagezero(pg); /* clears PG_CLEAN */ + pg->flags &= ~(PG_BUSY|PG_FAKE); + UVM_PAGE_OWN(pg, NULL); + uvm_lock_pageq(); + uvm_pageactivate(pg); + uvm_unlock_pageq(); + **output = anon; + *output = (*output) + 1; + return(1); +} + + +/* + * uvm_unloananon: kill loans on anons (basically a normal ref drop) + * + * => we expect all our resources to be unlocked + */ + +void +uvm_unloananon(aloans, nanons) + struct vm_anon **aloans; + int nanons; +{ + struct vm_anon *anon; + + while (nanons-- > 0) { + int refs; + + anon = *aloans++; + simple_lock(&anon->an_lock); + refs = --anon->an_ref; + simple_unlock(&anon->an_lock); + + if (refs == 0) { + uvm_anfree(anon); /* last reference: kill anon */ + } + } +} + +/* + * uvm_unloanpage: kill loans on pages loaned out to the kernel + * + * => we expect all our resources to be unlocked + */ + +void +uvm_unloanpage(ploans, npages) + struct vm_page **ploans; + int npages; +{ + struct vm_page *pg; + + uvm_lock_pageq(); + + while (npages-- > 0) { + pg = *ploans++; + + if (pg->loan_count < 1) + panic("uvm_unloanpage: page %p isn't loaned", pg); + + pg->loan_count--; /* drop loan */ + uvm_pageunwire(pg); /* and wire */ + + /* + * if page is unowned and we killed last loan, then we can + * free it + */ + if (pg->loan_count == 0 && pg->uobject == NULL && + pg->uanon == NULL) { + + if (pg->flags & PG_BUSY) + panic("uvm_unloanpage: page %p unowned but PG_BUSY!", pg); + + /* be safe */ + pmap_page_protect(PMAP_PGARG(pg), VM_PROT_NONE); + uvm_pagefree(pg); /* pageq locked above */ + + } + } + + uvm_unlock_pageq(); +} + diff --git a/sys/uvm/uvm_loan.h b/sys/uvm/uvm_loan.h new file mode 100644 index 00000000000..af99b357cf5 --- /dev/null +++ b/sys/uvm/uvm_loan.h @@ -0,0 +1,59 @@ +/* $NetBSD: uvm_loan.h,v 1.5 1998/08/13 02:11:01 eeh Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_loan.h,v 1.1.4.1 1997/12/08 16:07:14 chuck Exp + */ + +#ifndef _UVM_UVM_LOAN_H_ +#define _UVM_UVM_LOAN_H_ + +/* + * flags for uvm_loan + */ + +#define UVM_LOAN_TOANON 0x1 /* loan to anon */ +#define UVM_LOAN_TOPAGE 0x2 /* loan to page */ + +/* + * loan prototypes + */ + +int uvm_loan __P((struct vm_map *, vaddr_t, vsize_t, void **, int)); +void uvm_unloananon __P((struct vm_anon **, int)); +void uvm_unloanpage __P((struct vm_page **, int)); + +#endif /* _UVM_UVM_LOAN_H_ */ diff --git a/sys/uvm/uvm_map.c b/sys/uvm/uvm_map.c new file mode 100644 index 00000000000..a5b337db99d --- /dev/null +++ b/sys/uvm/uvm_map.c @@ -0,0 +1,2972 @@ +/* $NetBSD: uvm_map.c,v 1.34 1999/01/24 23:53:15 chuck Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * Copyright (c) 1991, 1993, The Regents of the University of California. + * + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor, + * Washington University, the University of California, Berkeley and + * its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vm_map.c 8.3 (Berkeley) 1/12/94 + * from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * uvm_map.c: uvm map operations + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mman.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/pool.h> + +#include <sys/user.h> +#include <machine/pcb.h> + +#ifdef SYSVSHM +#include <sys/shm.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#define UVM_MAP +#include <uvm/uvm.h> + +#ifdef DDB +#include <uvm/uvm_ddb.h> +#endif + + +struct uvm_cnt uvm_map_call, map_backmerge, map_forwmerge; +struct uvm_cnt uvm_mlk_call, uvm_mlk_hint; + +/* + * pool for vmspace structures. + */ + +struct pool uvm_vmspace_pool; + +/* + * pool for dynamically-allocated map entries. + */ + +struct pool uvm_map_entry_pool; + +/* + * macros + */ + +/* + * uvm_map_entry_link: insert entry into a map + * + * => map must be locked + */ +#define uvm_map_entry_link(map, after_where, entry) do { \ + (map)->nentries++; \ + (entry)->prev = (after_where); \ + (entry)->next = (after_where)->next; \ + (entry)->prev->next = (entry); \ + (entry)->next->prev = (entry); \ +} while (0) + +/* + * uvm_map_entry_unlink: remove entry from a map + * + * => map must be locked + */ +#define uvm_map_entry_unlink(map, entry) do { \ + (map)->nentries--; \ + (entry)->next->prev = (entry)->prev; \ + (entry)->prev->next = (entry)->next; \ +} while (0) + +/* + * SAVE_HINT: saves the specified entry as the hint for future lookups. + * + * => map need not be locked (protected by hint_lock). 
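[Illustrative annotation, not part of the imported source.] The link/unlink macros above maintain a circular doubly-linked list whose sentinel is &map->header; the lookup and unmap code later in this file walks entries the same way. A minimal traversal sketch, assuming the map is at least read-locked:

	vm_map_entry_t ent;

	for (ent = map->header.next; ent != &map->header; ent = ent->next)
		printf("entry 0x%lx-0x%lx\n", ent->start, ent->end);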
+ */ +#define SAVE_HINT(map,value) do { \ + simple_lock(&(map)->hint_lock); \ + (map)->hint = (value); \ + simple_unlock(&(map)->hint_lock); \ +} while (0) + +/* + * VM_MAP_RANGE_CHECK: check and correct range + * + * => map must at least be read locked + */ + +#define VM_MAP_RANGE_CHECK(map, start, end) do { \ + if (start < vm_map_min(map)) \ + start = vm_map_min(map); \ + if (end > vm_map_max(map)) \ + end = vm_map_max(map); \ + if (start > end) \ + start = end; \ +} while (0) + +/* + * local prototypes + */ + +static vm_map_entry_t uvm_mapent_alloc __P((vm_map_t)); +static void uvm_mapent_copy __P((vm_map_entry_t,vm_map_entry_t)); +static void uvm_mapent_free __P((vm_map_entry_t)); +static void uvm_map_entry_unwire __P((vm_map_t, vm_map_entry_t)); + +/* + * local inlines + */ + +#undef UVM_MAP_INLINES + +#ifdef UVM_MAP_INLINES +#define UVM_INLINE __inline +#else +#define UVM_INLINE +#endif + +/* + * uvm_mapent_alloc: allocate a map entry + * + * => XXX: static pool for kernel map? + */ + +static UVM_INLINE vm_map_entry_t +uvm_mapent_alloc(map) + vm_map_t map; +{ + vm_map_entry_t me; + int s; + UVMHIST_FUNC("uvm_mapent_alloc"); + UVMHIST_CALLED(maphist); + + if (map->entries_pageable) { + me = pool_get(&uvm_map_entry_pool, PR_WAITOK); + me->flags = 0; + /* me can't be null, wait ok */ + + } else { + s = splimp(); /* protect kentry_free list with splimp */ + simple_lock(&uvm.kentry_lock); + me = uvm.kentry_free; + if (me) uvm.kentry_free = me->next; + simple_unlock(&uvm.kentry_lock); + splx(s); + if (!me) + panic("mapent_alloc: out of kernel map entries, check MAX_KMAPENT"); + me->flags = UVM_MAP_STATIC; + } + + UVMHIST_LOG(maphist, "<- new entry=0x%x [pageable=%d]", + me, map->entries_pageable, 0, 0); + return(me); + +} + +/* + * uvm_mapent_free: free map entry + * + * => XXX: static pool for kernel map? + */ + +static UVM_INLINE void +uvm_mapent_free(me) + vm_map_entry_t me; +{ + int s; + UVMHIST_FUNC("uvm_mapent_free"); + UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist,"<- freeing map entry=0x%x [flags=%d]", + me, me->flags, 0, 0); + if ((me->flags & UVM_MAP_STATIC) == 0) { + pool_put(&uvm_map_entry_pool, me); + } else { + s = splimp(); /* protect kentry_free list with splimp */ + simple_lock(&uvm.kentry_lock); + me->next = uvm.kentry_free; + uvm.kentry_free = me; + simple_unlock(&uvm.kentry_lock); + splx(s); + } +} + +/* + * uvm_mapent_copy: copy a map entry, preserving flags + */ + +static UVM_INLINE void +uvm_mapent_copy(src, dst) + vm_map_entry_t src; + vm_map_entry_t dst; +{ + + bcopy(src, dst, ((char *)&src->uvm_map_entry_stop_copy) - ((char*)src)); +} + +/* + * uvm_map_entry_unwire: unwire a map entry + * + * => map should be locked by caller + */ + +static UVM_INLINE void +uvm_map_entry_unwire(map, entry) + vm_map_t map; + vm_map_entry_t entry; +{ + + uvm_fault_unwire(map->pmap, entry->start, entry->end); + entry->wired_count = 0; +} + +/* + * uvm_map_init: init mapping system at boot time. note that we allocate + * and init the static pool of vm_map_entry_t's for the kernel here. + */ + +void +uvm_map_init() +{ + static struct vm_map_entry kernel_map_entry[MAX_KMAPENT]; +#if defined(UVMHIST) + static struct uvm_history_ent maphistbuf[100]; + static struct uvm_history_ent pdhistbuf[100]; +#endif + int lcv; + + /* + * first, init logging system. 
+ */ + + UVMHIST_FUNC("uvm_map_init"); + UVMHIST_INIT_STATIC(maphist, maphistbuf); + UVMHIST_INIT_STATIC(pdhist, pdhistbuf); + UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist,"<starting uvm map system>", 0, 0, 0, 0); + UVMCNT_INIT(uvm_map_call, UVMCNT_CNT, 0, + "# uvm_map() successful calls", 0); + UVMCNT_INIT(map_backmerge, UVMCNT_CNT, 0, "# uvm_map() back merges", 0); + UVMCNT_INIT(map_forwmerge, UVMCNT_CNT, 0, "# uvm_map() missed forward", + 0); + UVMCNT_INIT(uvm_mlk_call, UVMCNT_CNT, 0, "# map lookup calls", 0); + UVMCNT_INIT(uvm_mlk_hint, UVMCNT_CNT, 0, "# map lookup hint hits", 0); + + /* + * now set up static pool of kernel map entrys ... + */ + + simple_lock_init(&uvm.kentry_lock); + uvm.kentry_free = NULL; + for (lcv = 0 ; lcv < MAX_KMAPENT ; lcv++) { + kernel_map_entry[lcv].next = uvm.kentry_free; + uvm.kentry_free = &kernel_map_entry[lcv]; + } + + /* + * initialize the map-related pools. + */ + pool_init(&uvm_vmspace_pool, sizeof(struct vmspace), + 0, 0, 0, "vmsppl", 0, + pool_page_alloc_nointr, pool_page_free_nointr, M_VMMAP); + pool_init(&uvm_map_entry_pool, sizeof(struct vm_map_entry), + 0, 0, 0, "vmmpepl", 0, + pool_page_alloc_nointr, pool_page_free_nointr, M_VMMAP); +} + +/* + * clippers + */ + +/* + * uvm_map_clip_start: ensure that the entry begins at or after + * the starting address, if it doesn't we split the entry. + * + * => caller should use UVM_MAP_CLIP_START macro rather than calling + * this directly + * => map must be locked by caller + */ + +void uvm_map_clip_start(map, entry, start) + vm_map_t map; + vm_map_entry_t entry; + vaddr_t start; +{ + vm_map_entry_t new_entry; + vaddr_t new_adj; + + /* uvm_map_simplify_entry(map, entry); */ /* XXX */ + + /* + * Split off the front portion. note that we must insert the new + * entry BEFORE this one, so that this entry has the specified + * starting address. + */ + + new_entry = uvm_mapent_alloc(map); + uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */ + + new_entry->end = start; + new_adj = start - new_entry->start; + if (entry->object.uvm_obj) + entry->offset += new_adj; /* shift start over */ + entry->start = start; + + if (new_entry->aref.ar_amap) { + amap_splitref(&new_entry->aref, &entry->aref, new_adj); + } + + uvm_map_entry_link(map, entry->prev, new_entry); + + if (UVM_ET_ISSUBMAP(entry)) { + /* ... unlikely to happen, but play it safe */ + uvm_map_reference(new_entry->object.sub_map); + } else { + if (UVM_ET_ISOBJ(entry) && + entry->object.uvm_obj->pgops && + entry->object.uvm_obj->pgops->pgo_reference) + entry->object.uvm_obj->pgops->pgo_reference( + entry->object.uvm_obj); + } +} + +/* + * uvm_map_clip_end: ensure that the entry ends at or before + * the ending address, if it does't we split the reference + * + * => caller should use UVM_MAP_CLIP_END macro rather than calling + * this directly + * => map must be locked by caller + */ + +void +uvm_map_clip_end(map, entry, end) + vm_map_t map; + vm_map_entry_t entry; + vaddr_t end; +{ + vm_map_entry_t new_entry; + vaddr_t new_adj; /* #bytes we move start forward */ + + /* + * Create a new entry and insert it + * AFTER the specified entry + */ + + new_entry = uvm_mapent_alloc(map); + uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */ + + new_entry->start = entry->end = end; + new_adj = end - entry->start; + if (new_entry->object.uvm_obj) + new_entry->offset += new_adj; + + if (entry->aref.ar_amap) + amap_splitref(&entry->aref, &new_entry->aref, new_adj); + + uvm_map_entry_link(map, entry, new_entry); + + if (UVM_ET_ISSUBMAP(entry)) { + /* ... 
unlikely to happen, but play it safe */ + uvm_map_reference(new_entry->object.sub_map); + } else { + if (UVM_ET_ISOBJ(entry) && + entry->object.uvm_obj->pgops && + entry->object.uvm_obj->pgops->pgo_reference) + entry->object.uvm_obj->pgops->pgo_reference( + entry->object.uvm_obj); + } +} + + +/* + * M A P - m a i n e n t r y p o i n t + */ +/* + * uvm_map: establish a valid mapping in a map + * + * => assume startp is page aligned. + * => assume size is a multiple of PAGE_SIZE. + * => assume sys_mmap provides enough of a "hint" to have us skip + * over text/data/bss area. + * => map must be unlocked (we will lock it) + * => <uobj,uoffset> value meanings (4 cases): + * [1] <NULL,uoffset> == uoffset is a hint for PMAP_PREFER + * [2] <NULL,UVM_UNKNOWN_OFFSET> == don't PMAP_PREFER + * [3] <uobj,uoffset> == normal mapping + * [4] <uobj,UVM_UNKNOWN_OFFSET> == uvm_map finds offset based on VA + * + * case [4] is for kernel mappings where we don't know the offset until + * we've found a virtual address. note that kernel object offsets are + * always relative to vm_map_min(kernel_map). + * => XXXCDC: need way to map in external amap? + */ + +int +uvm_map(map, startp, size, uobj, uoffset, flags) + vm_map_t map; + vaddr_t *startp; /* IN/OUT */ + vsize_t size; + struct uvm_object *uobj; + vaddr_t uoffset; + uvm_flag_t flags; +{ + vm_map_entry_t prev_entry, new_entry; + vm_prot_t prot = UVM_PROTECTION(flags), maxprot = + UVM_MAXPROTECTION(flags); + vm_inherit_t inherit = UVM_INHERIT(flags); + int advice = UVM_ADVICE(flags); + UVMHIST_FUNC("uvm_map"); + UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist, "(map=0x%x, *startp=0x%x, size=%d, flags=0x%x)", + map, *startp, size, flags); + UVMHIST_LOG(maphist, " uobj/offset 0x%x/%d", uobj, uoffset,0,0); + + /* + * step 0: sanity check of protection code + */ + + if ((prot & maxprot) != prot) { + UVMHIST_LOG(maphist, "<- prot. failure: prot=0x%x, max=0x%x", + prot, maxprot,0,0); + return(KERN_PROTECTION_FAILURE); + } + + /* + * step 1: figure out where to put new VM range + */ + + if (vm_map_lock_try(map) == FALSE) { + if (flags & UVM_FLAG_TRYLOCK) + return(KERN_FAILURE); + vm_map_lock(map); /* could sleep here */ + } + if ((prev_entry = uvm_map_findspace(map, *startp, size, startp, + uobj, uoffset, flags & UVM_FLAG_FIXED)) == NULL) { + UVMHIST_LOG(maphist,"<- uvm_map_findspace failed!",0,0,0,0); + vm_map_unlock(map); + return (KERN_NO_SPACE); + } + +#if defined(PMAP_GROWKERNEL) /* hack */ + { + /* locked by kernel_map lock */ + static vaddr_t maxkaddr = 0; + + /* + * hack: grow kernel PTPs in advance. + */ + if (map == kernel_map && maxkaddr < (*startp + size)) { + pmap_growkernel(*startp + size); + maxkaddr = *startp + size; + } + } +#endif + + UVMCNT_INCR(uvm_map_call); + + /* + * if uobj is null, then uoffset is either a VAC hint for PMAP_PREFER + * [typically from uvm_map_reserve] or it is UVM_UNKNOWN_OFFSET. in + * either case we want to zero it before storing it in the map entry + * (because it looks strange and confusing when debugging...) + * + * if uobj is not null + * if uoffset is not UVM_UNKNOWN_OFFSET then we have a normal mapping + * and we do not need to change uoffset. + * if uoffset is UVM_UNKNOWN_OFFSET then we need to find the offset + * now (based on the starting address of the map). this case is + * for kernel object mappings where we don't know the offset until + * the virtual address is found (with uvm_map_findspace). the + * offset is the distance we are from the start of the map. 
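[Illustrative annotation, not part of the imported source.] As a concrete example of the kernel-object convention with hypothetical addresses: if vm_map_min(kernel_map) is 0xf0000000 and uvm_map_findspace() settles on *startp == 0xf0123000, then uoffset becomes 0xf0123000 - 0xf0000000 = 0x123000, i.e. the page at kernel-object offset 0x123000 backs that virtual address. The same relation is applied in reverse by uvm_km_kmemalloc() and by the uvm_unmap_remove()/uvm_km_pgremove() path elsewhere in this import.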
+ */ + + if (uobj == NULL) { + uoffset = 0; + } else { + if (uoffset == UVM_UNKNOWN_OFFSET) { +#ifdef DIAGNOSTIC + if (uobj->uo_refs != UVM_OBJ_KERN) + panic("uvm_map: unknown offset with non-kernel object"); +#endif + uoffset = *startp - vm_map_min(kernel_map); + } + } + + /* + * step 2: try and insert in map by extending previous entry, if + * possible + * XXX: we don't try and pull back the next entry. might be useful + * for a stack, but we are currently allocating our stack in advance. + */ + + if ((flags & UVM_FLAG_NOMERGE) == 0 && + prev_entry->end == *startp && prev_entry != &map->header && + prev_entry->object.uvm_obj == uobj) { + + if (uobj && prev_entry->offset + + (prev_entry->end - prev_entry->start) != uoffset) + goto step3; + + if (UVM_ET_ISSUBMAP(prev_entry)) + goto step3; + + if (prev_entry->protection != prot || + prev_entry->max_protection != maxprot) + goto step3; + + if (prev_entry->inheritance != inherit || + prev_entry->advice != advice) + goto step3; + + /* wired_count's must match (new area is unwired) */ + if (prev_entry->wired_count) + goto step3; + + /* + * can't extend a shared amap. note: no need to lock amap to + * look at refs since we don't care about its exact value. + * if it is one (i.e. we have only reference) it will stay there + */ + + if (prev_entry->aref.ar_amap && + amap_refs(prev_entry->aref.ar_amap) != 1) { + goto step3; + } + + /* got it! */ + + UVMCNT_INCR(map_backmerge); + UVMHIST_LOG(maphist," starting back merge", 0, 0, 0, 0); + + /* + * drop our reference to uobj since we are extending a reference + * that we already have (the ref count can not drop to zero). + */ + if (uobj && uobj->pgops->pgo_detach) + uobj->pgops->pgo_detach(uobj); + + if (prev_entry->aref.ar_amap) { + amap_extend(prev_entry, size); + } + + prev_entry->end += size; + map->size += size; + + UVMHIST_LOG(maphist,"<- done (via backmerge)!", 0, 0, 0, 0); + vm_map_unlock(map); + return (KERN_SUCCESS); + + } +step3: + UVMHIST_LOG(maphist," allocating new map entry", 0, 0, 0, 0); + + /* + * check for possible forward merge (which we don't do) and count + * the number of times we missed a *possible* chance to merge more + */ + + if ((flags & UVM_FLAG_NOMERGE) == 0 && + prev_entry->next != &map->header && + prev_entry->next->start == (*startp + size)) + UVMCNT_INCR(map_forwmerge); + + /* + * step 3: allocate new entry and link it in + */ + + new_entry = uvm_mapent_alloc(map); + new_entry->start = *startp; + new_entry->end = new_entry->start + size; + new_entry->object.uvm_obj = uobj; + new_entry->offset = uoffset; + + if (uobj) + new_entry->etype = UVM_ET_OBJ; + else + new_entry->etype = 0; + + if (flags & UVM_FLAG_COPYONW) { + new_entry->etype |= UVM_ET_COPYONWRITE; + if ((flags & UVM_FLAG_OVERLAY) == 0) + new_entry->etype |= UVM_ET_NEEDSCOPY; + } + + new_entry->protection = prot; + new_entry->max_protection = maxprot; + new_entry->inheritance = inherit; + new_entry->wired_count = 0; + new_entry->advice = advice; + if (flags & UVM_FLAG_OVERLAY) { + /* + * to_add: for BSS we overallocate a little since we + * are likely to extend + */ + vaddr_t to_add = (flags & UVM_FLAG_AMAPPAD) ? 
+ UVM_AMAP_CHUNK << PAGE_SHIFT : 0; + struct vm_amap *amap = amap_alloc(size, to_add, M_WAITOK); + new_entry->aref.ar_pageoff = 0; + new_entry->aref.ar_amap = amap; + } else { + new_entry->aref.ar_amap = NULL; + } + + uvm_map_entry_link(map, prev_entry, new_entry); + + map->size += size; + + /* + * Update the free space hint + */ + + if ((map->first_free == prev_entry) && + (prev_entry->end >= new_entry->start)) + map->first_free = new_entry; + + UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0); + vm_map_unlock(map); + return(KERN_SUCCESS); +} + +/* + * uvm_map_lookup_entry: find map entry at or before an address + * + * => map must at least be read-locked by caller + * => entry is returned in "entry" + * => return value is true if address is in the returned entry + */ + +boolean_t +uvm_map_lookup_entry(map, address, entry) + vm_map_t map; + vaddr_t address; + vm_map_entry_t *entry; /* OUT */ +{ + vm_map_entry_t cur; + vm_map_entry_t last; + UVMHIST_FUNC("uvm_map_lookup_entry"); + UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist,"(map=0x%x,addr=0x%x,ent=0x%x)", + map, address, entry, 0); + + /* + * start looking either from the head of the + * list, or from the hint. + */ + + simple_lock(&map->hint_lock); + cur = map->hint; + simple_unlock(&map->hint_lock); + + if (cur == &map->header) + cur = cur->next; + + UVMCNT_INCR(uvm_mlk_call); + if (address >= cur->start) { + /* + * go from hint to end of list. + * + * but first, make a quick check to see if + * we are already looking at the entry we + * want (which is usually the case). + * note also that we don't need to save the hint + * here... it is the same hint (unless we are + * at the header, in which case the hint didn't + * buy us anything anyway). + */ + last = &map->header; + if ((cur != last) && (cur->end > address)) { + UVMCNT_INCR(uvm_mlk_hint); + *entry = cur; + UVMHIST_LOG(maphist,"<- got it via hint (0x%x)", + cur, 0, 0, 0); + return (TRUE); + } + } else { + /* + * go from start to hint, *inclusively* + */ + last = cur->next; + cur = map->header.next; + } + + /* + * search linearly + */ + + while (cur != last) { + if (cur->end > address) { + if (address >= cur->start) { + /* + * save this lookup for future + * hints, and return + */ + + *entry = cur; + SAVE_HINT(map, cur); + UVMHIST_LOG(maphist,"<- search got it (0x%x)", + cur, 0, 0, 0); + return (TRUE); + } + break; + } + cur = cur->next; + } + *entry = cur->prev; + SAVE_HINT(map, *entry); + UVMHIST_LOG(maphist,"<- failed!",0,0,0,0); + return (FALSE); +} + + +/* + * uvm_map_findspace: find "length" sized space in "map". + * + * => "hint" is a hint about where we want it, unless fixed is true + * (in which case we insist on using "hint"). + * => "result" is VA returned + * => uobj/uoffset are to be used to handle VAC alignment, if required + * => caller must at least have read-locked map + * => returns NULL on failure, or pointer to prev. map entry if success + * => note this is a cross between the old vm_map_findspace and vm_map_find + */ + +vm_map_entry_t +uvm_map_findspace(map, hint, length, result, uobj, uoffset, fixed) + vm_map_t map; + vaddr_t hint; + vsize_t length; + vaddr_t *result; /* OUT */ + struct uvm_object *uobj; + vaddr_t uoffset; + boolean_t fixed; +{ + vm_map_entry_t entry, next, tmp; + vaddr_t end; + UVMHIST_FUNC("uvm_map_findspace"); + UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist, "(map=0x%x, hint=0x%x, len=%d, fixed=%d)", + map, hint, length, fixed); + + if (hint < map->min_offset) { /* check ranges ... 
*/ + if (fixed) { + UVMHIST_LOG(maphist,"<- VA below map range",0,0,0,0); + return(NULL); + } + hint = map->min_offset; + } + if (hint > map->max_offset) { + UVMHIST_LOG(maphist,"<- VA 0x%x > range [0x%x->0x%x]", + hint, map->min_offset, map->max_offset, 0); + return(NULL); + } + + /* + * Look for the first possible address; if there's already + * something at this address, we have to start after it. + */ + + if (!fixed && hint == map->min_offset) { + if ((entry = map->first_free) != &map->header) + hint = entry->end; + } else { + if (uvm_map_lookup_entry(map, hint, &tmp)) { + /* "hint" address already in use ... */ + if (fixed) { + UVMHIST_LOG(maphist,"<- fixed & VA in use", + 0, 0, 0, 0); + return(NULL); + } + hint = tmp->end; + } + entry = tmp; + } + + /* + * Look through the rest of the map, trying to fit a new region in + * the gap between existing regions, or after the very last region. + * note: entry->end = base VA of current gap, + * next->start = VA of end of current gap + */ + for (;; hint = (entry = next)->end) { + /* + * Find the end of the proposed new region. Be sure we didn't + * go beyond the end of the map, or wrap around the address; + * if so, we lose. Otherwise, if this is the last entry, or + * if the proposed new region fits before the next entry, we + * win. + */ + +#ifdef PMAP_PREFER + /* + * push hint forward as needed to avoid VAC alias problems. + * we only do this if a valid offset is specified. + */ + if (!fixed && uoffset != UVM_UNKNOWN_OFFSET) + PMAP_PREFER(uoffset, &hint); +#endif + end = hint + length; + if (end > map->max_offset || end < hint) { + UVMHIST_LOG(maphist,"<- failed (off end)", 0,0,0,0); + return (NULL); + } + next = entry->next; + if (next == &map->header || next->start >= end) + break; + if (fixed) { + UVMHIST_LOG(maphist,"<- fixed mapping failed", 0,0,0,0); + return(NULL); /* only one shot at it ... */ + } + } + SAVE_HINT(map, entry); + *result = hint; + UVMHIST_LOG(maphist,"<- got it! (result=0x%x)", hint, 0,0,0); + return (entry); +} + +/* + * U N M A P - m a i n h e l p e r f u n c t i o n s + */ + +/* + * uvm_unmap_remove: remove mappings from a vm_map (from "start" up to "stop") + * + * => caller must check alignment and size + * => map must be locked by caller + * => we return a list of map entries that we've remove from the map + * in "entry_list" + */ + +int +uvm_unmap_remove(map, start, end, entry_list) + vm_map_t map; + vaddr_t start,end; + vm_map_entry_t *entry_list; /* OUT */ +{ + vm_map_entry_t entry, first_entry, next; + vaddr_t len; + UVMHIST_FUNC("uvm_unmap_remove"); + UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist,"(map=0x%x, start=0x%x, end=0x%x)", + map, start, end, 0); + + VM_MAP_RANGE_CHECK(map, start, end); + + /* + * find first entry + */ + if (uvm_map_lookup_entry(map, start, &first_entry) == TRUE) { + /* clip and go... */ + entry = first_entry; + UVM_MAP_CLIP_START(map, entry, start); + /* critical! prevents stale hint */ + SAVE_HINT(map, entry->prev); + + } else { + entry = first_entry->next; + } + + /* + * Save the free space hint + */ + + if (map->first_free->start >= start) + map->first_free = entry->prev; + + /* + * note: we now re-use first_entry for a different task. we remove + * a number of map entries from the map and save them in a linked + * list headed by "first_entry". once we remove them from the map + * the caller should unlock the map and drop the references to the + * backing objects [c.f. uvm_unmap_detach]. the object is to + * seperate unmapping from reference dropping. why? 
+ * [1] the map has to be locked for unmapping + * [2] the map need not be locked for reference dropping + * [3] dropping references may trigger pager I/O, and if we hit + * a pager that does synchronous I/O we may have to wait for it. + * [4] we would like all waiting for I/O to occur with maps unlocked + * so that we don't block other threads. + */ + first_entry = NULL; + *entry_list = NULL; /* to be safe */ + + /* + * break up the area into map entry sized regions and unmap. note + * that all mappings have to be removed before we can even consider + * dropping references to amaps or VM objects (otherwise we could end + * up with a mapping to a page on the free list which would be very bad) + */ + + while ((entry != &map->header) && (entry->start < end)) { + + UVM_MAP_CLIP_END(map, entry, end); + next = entry->next; + len = entry->end - entry->start; + + /* + * unwire before removing addresses from the pmap; otherwise + * unwiring will put the entries back into the pmap (XXX). + */ + + if (entry->wired_count) + uvm_map_entry_unwire(map, entry); + + /* + * special case: handle mappings to anonymous kernel objects. + * we want to free these pages right away... + */ + if (UVM_ET_ISOBJ(entry) && + entry->object.uvm_obj->uo_refs == UVM_OBJ_KERN) { + +#ifdef DIAGNOSTIC + if (vm_map_pmap(map) != pmap_kernel()) + panic("uvm_unmap_remove: kernel object mapped by non-kernel map"); +#endif + + /* + * note: kernel object mappings are currently used in + * two ways: + * [1] "normal" mappings of pages in the kernel object + * [2] uvm_km_valloc'd allocations in which we + * pmap_enter in some non-kernel-object page + * (e.g. vmapbuf). + * + * for case [1], we need to remove the mapping from + * the pmap and then remove the page from the kernel + * object (because, once pages in a kernel object are + * unmapped they are no longer needed, unlike, say, + * a vnode where you might want the data to persist + * until flushed out of a queue). + * + * for case [2], we need to remove the mapping from + * the pmap. there shouldn't be any pages at the + * specified offset in the kernel object [but it + * doesn't hurt to call uvm_km_pgremove just to be + * safe?] + * + * uvm_km_pgremove currently does the following: + * for pages in the kernel object in range: + * - pmap_page_protect them out of all pmaps + * - uvm_pagefree the page + * + * note that in case [1] the pmap_page_protect call + * in uvm_km_pgremove may very well be redundant + * because we have already removed the mappings + * beforehand with pmap_remove (or pmap_kremove). + * in the PMAP_NEW case, the pmap_page_protect call + * may not do anything, since PMAP_NEW allows the + * kernel to enter/remove kernel mappings without + * bothing to keep track of the mappings (e.g. via + * pv_entry lists). XXX: because of this, in the + * future we should consider removing the + * pmap_page_protect from uvm_km_pgremove some time + * in the future. + */ + + /* + * remove mappings from pmap + */ +#if defined(PMAP_NEW) + pmap_kremove(entry->start, len); +#else + pmap_remove(pmap_kernel(), entry->start, + entry->start+len); +#endif + + /* + * remove pages from a kernel object (offsets are + * always relative to vm_map_min(kernel_map)). 
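[Illustrative annotation, not part of the imported source.] Case [2] above is the vmapbuf-style pattern: only kernel VA comes from the kernel object, and the pages entered there belong to some other object, so uvm_km_pgremove() should find nothing at those offsets. A hedged sketch of that pattern, where "len" and "foreign_pa" are hypothetical:

	vaddr_t kva;

	kva = uvm_km_valloc(kernel_map, len);		/* VA only, backed by kernel_object */
	pmap_enter(pmap_kernel(), kva, foreign_pa, VM_PROT_ALL, TRUE);
	/* ... I/O through kva ... */
	uvm_unmap(kernel_map, kva, kva + len);		/* pmap removal + (empty) pgremove */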
+ */ + uvm_km_pgremove(entry->object.uvm_obj, + entry->start - vm_map_min(kernel_map), + entry->end - vm_map_min(kernel_map)); + + /* + * null out kernel_object reference, we've just + * dropped it + */ + entry->etype &= ~UVM_ET_OBJ; + entry->object.uvm_obj = NULL; /* to be safe */ + + } else { + /* + * remove mappings the standard way. + */ + pmap_remove(map->pmap, entry->start, entry->end); + } + + /* + * remove entry from map and put it on our list of entries + * that we've nuked. then go do next entry. + */ + UVMHIST_LOG(maphist, " removed map entry 0x%x", entry, 0, 0,0); + uvm_map_entry_unlink(map, entry); + map->size -= len; + entry->next = first_entry; + first_entry = entry; + entry = next; /* next entry, please */ + } + + /* + * now we've cleaned up the map and are ready for the caller to drop + * references to the mapped objects. + */ + + *entry_list = first_entry; + UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0); + return(KERN_SUCCESS); +} + +/* + * uvm_unmap_detach: drop references in a chain of map entries + * + * => we will free the map entries as we traverse the list. + */ + +void +uvm_unmap_detach(first_entry, amap_unref_flags) + vm_map_entry_t first_entry; + int amap_unref_flags; +{ + vm_map_entry_t next_entry; + UVMHIST_FUNC("uvm_unmap_detach"); UVMHIST_CALLED(maphist); + + while (first_entry) { + +#ifdef DIAGNOSTIC + /* + * sanity check + */ + /* was part of vm_map_entry_delete() */ + if (first_entry->wired_count) + panic("unmap: still wired!"); +#endif + + UVMHIST_LOG(maphist, + " detach 0x%x: amap=0x%x, obj=0x%x, submap?=%d", + first_entry, first_entry->aref.ar_amap, + first_entry->object.uvm_obj, + UVM_ET_ISSUBMAP(first_entry)); + + /* + * drop reference to amap, if we've got one + */ + + if (first_entry->aref.ar_amap) + amap_unref(first_entry, amap_unref_flags); + + /* + * drop reference to our backing object, if we've got one + */ + + if (UVM_ET_ISSUBMAP(first_entry)) { + /* ... unlikely to happen, but play it safe */ + uvm_map_deallocate(first_entry->object.sub_map); + } else { + if (UVM_ET_ISOBJ(first_entry) && + first_entry->object.uvm_obj->pgops->pgo_detach) + first_entry->object.uvm_obj->pgops-> + pgo_detach(first_entry->object.uvm_obj); + } + + /* + * next entry + */ + next_entry = first_entry->next; + uvm_mapent_free(first_entry); + first_entry = next_entry; + } + + /* + * done! + */ + UVMHIST_LOG(maphist, "<- done", 0,0,0,0); + return; +} + +/* + * E X T R A C T I O N F U N C T I O N S + */ + +/* + * uvm_map_reserve: reserve space in a vm_map for future use. + * + * => we reserve space in a map by putting a dummy map entry in the + * map (dummy means obj=NULL, amap=NULL, prot=VM_PROT_NONE) + * => map should be unlocked (we will write lock it) + * => we return true if we were able to reserve space + * => XXXCDC: should be inline? + */ + +int +uvm_map_reserve(map, size, offset, raddr) + vm_map_t map; + vsize_t size; + vaddr_t offset; /* hint for pmap_prefer */ + vaddr_t *raddr; /* OUT: reserved VA */ +{ + UVMHIST_FUNC("uvm_map_reserve"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist, "(map=0x%x, size=0x%x, offset=0x%x,addr=0x%x)", + map,size,offset,raddr); + + size = round_page(size); + if (*raddr < vm_map_min(map)) + *raddr = vm_map_min(map); /* hint */ + + /* + * reserve some virtual space. 
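The unmap path above is split in two on purpose: uvm_unmap_remove unlinks entries while the map is locked, and uvm_unmap_detach drops the amap and object references only after the caller has unlocked, so any pager I/O runs without the map lock held. A minimal userland sketch of that collect-then-detach shape, with hypothetical names and a pthread mutex standing in for the map lock (not the UVM code itself):

#include <pthread.h>
#include <stdlib.h>

struct ref_entry {
    struct ref_entry *next;
    int *refcnt;                     /* reference to drop later */
};

pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
struct ref_entry *map_head;          /* "mapped" entries, lock protected */

/* phase 1: unlink everything while the lock is held (cheap, no I/O) */
struct ref_entry *
collect_dead_entries(void)
{
    struct ref_entry *dead;

    pthread_mutex_lock(&map_lock);
    dead = map_head;                 /* whole list becomes the dead list */
    map_head = NULL;
    pthread_mutex_unlock(&map_lock);
    return dead;
}

/* phase 2: drop references with the lock released (may block) */
void
detach_dead_entries(struct ref_entry *dead)
{
    struct ref_entry *next;

    while (dead != NULL) {
        next = dead->next;
        (*dead->refcnt)--;           /* in UVM this step can start pager I/O */
        free(dead);
        dead = next;
    }
}

Nothing that can sleep or start I/O runs while the lock is held; handing the dead list back to the caller is what makes that split possible.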
+ */ + + if (uvm_map(map, raddr, size, NULL, offset, + UVM_MAPFLAG(UVM_PROT_NONE, UVM_PROT_NONE, UVM_INH_NONE, + UVM_ADV_RANDOM, UVM_FLAG_NOMERGE)) != KERN_SUCCESS) { + UVMHIST_LOG(maphist, "<- done (no VM)", 0,0,0,0); + return (FALSE); + } + + UVMHIST_LOG(maphist, "<- done (*raddr=0x%x)", *raddr,0,0,0); + return (TRUE); +} + +/* + * uvm_map_replace: replace a reserved (blank) area of memory with + * real mappings. + * + * => caller must WRITE-LOCK the map + * => we return TRUE if replacement was a success + * => we expect the newents chain to have nnewents entrys on it and + * we expect newents->prev to point to the last entry on the list + * => note newents is allowed to be NULL + */ + +int +uvm_map_replace(map, start, end, newents, nnewents) + struct vm_map *map; + vaddr_t start, end; + vm_map_entry_t newents; + int nnewents; +{ + vm_map_entry_t oldent, last; + UVMHIST_FUNC("uvm_map_replace"); + UVMHIST_CALLED(maphist); + + /* + * first find the blank map entry at the specified address + */ + + if (!uvm_map_lookup_entry(map, start, &oldent)) { + return(FALSE); + } + + /* + * check to make sure we have a proper blank entry + */ + + if (oldent->start != start || oldent->end != end || + oldent->object.uvm_obj != NULL || oldent->aref.ar_amap != NULL) { + return (FALSE); + } + +#ifdef DIAGNOSTIC + /* + * sanity check the newents chain + */ + { + vm_map_entry_t tmpent = newents; + int nent = 0; + vaddr_t cur = start; + + while (tmpent) { + nent++; + if (tmpent->start < cur) + panic("uvm_map_replace1"); + if (tmpent->start > tmpent->end || tmpent->end > end) { + printf("tmpent->start=0x%lx, tmpent->end=0x%lx, end=0x%lx\n", + tmpent->start, tmpent->end, end); + panic("uvm_map_replace2"); + } + cur = tmpent->end; + if (tmpent->next) { + if (tmpent->next->prev != tmpent) + panic("uvm_map_replace3"); + } else { + if (newents->prev != tmpent) + panic("uvm_map_replace4"); + } + tmpent = tmpent->next; + } + if (nent != nnewents) + panic("uvm_map_replace5"); + } +#endif + + /* + * map entry is a valid blank! replace it. (this does all the + * work of map entry link/unlink...). + */ + + if (newents) { + + last = newents->prev; /* we expect this */ + + /* critical: flush stale hints out of map */ + SAVE_HINT(map, newents); + if (map->first_free == oldent) + map->first_free = last; + + last->next = oldent->next; + last->next->prev = last; + newents->prev = oldent->prev; + newents->prev->next = newents; + map->nentries = map->nentries + (nnewents - 1); + + } else { + + /* critical: flush stale hints out of map */ + SAVE_HINT(map, oldent->prev); + if (map->first_free == oldent) + map->first_free = oldent->prev; + + /* NULL list of new entries: just remove the old one */ + uvm_map_entry_unlink(map, oldent); + } + + + /* + * now we can free the old blank entry, unlock the map and return. 
+ */ + + uvm_mapent_free(oldent); + return(TRUE); +} + +/* + * uvm_map_extract: extract a mapping from a map and put it somewhere + * (maybe removing the old mapping) + * + * => maps should be unlocked (we will write lock them) + * => returns 0 on success, error code otherwise + * => start must be page aligned + * => len must be page sized + * => flags: + * UVM_EXTRACT_REMOVE: remove mappings from srcmap + * UVM_EXTRACT_CONTIG: abort if unmapped area (advisory only) + * UVM_EXTRACT_QREF: for a temporary extraction do quick obj refs + * UVM_EXTRACT_FIXPROT: set prot to maxprot as we go + * >>>NOTE: if you set REMOVE, you are not allowed to use CONTIG or QREF!<<< + * >>>NOTE: QREF's must be unmapped via the QREF path, thus should only + * be used from within the kernel in a kernel level map <<< + */ + +int +uvm_map_extract(srcmap, start, len, dstmap, dstaddrp, flags) + vm_map_t srcmap, dstmap; + vaddr_t start, *dstaddrp; + vsize_t len; + int flags; +{ + vaddr_t dstaddr, end, newend, oldoffset, fudge, orig_fudge, + oldstart; + vm_map_entry_t chain, endchain, entry, orig_entry, newentry, deadentry; + vm_map_entry_t oldentry; + vsize_t elen; + int nchain, error, copy_ok; + UVMHIST_FUNC("uvm_map_extract"); UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist,"(srcmap=0x%x,start=0x%x, len=0x%x", srcmap, start, + len,0); + UVMHIST_LOG(maphist," ...,dstmap=0x%x, flags=0x%x)", dstmap,flags,0,0); + +#ifdef DIAGNOSTIC + /* + * step 0: sanity check: start must be on a page boundary, length + * must be page sized. can't ask for CONTIG/QREF if you asked for + * REMOVE. + */ + if ((start & PAGE_MASK) || (len & PAGE_MASK)) + panic("uvm_map_extract1"); + if (flags & UVM_EXTRACT_REMOVE) + if (flags & (UVM_EXTRACT_CONTIG|UVM_EXTRACT_QREF)) + panic("uvm_map_extract2"); +#endif + + + /* + * step 1: reserve space in the target map for the extracted area + */ + + dstaddr = *dstaddrp; + if (uvm_map_reserve(dstmap, len, start, &dstaddr) == FALSE) + return(ENOMEM); + *dstaddrp = dstaddr; /* pass address back to caller */ + UVMHIST_LOG(maphist, " dstaddr=0x%x", dstaddr,0,0,0); + + + /* + * step 2: setup for the extraction process loop by init'ing the + * map entry chain, locking src map, and looking up the first useful + * entry in the map. + */ + + end = start + len; + newend = dstaddr + len; + chain = endchain = NULL; + nchain = 0; + vm_map_lock(srcmap); + + if (uvm_map_lookup_entry(srcmap, start, &entry)) { + + /* "start" is within an entry */ + if (flags & UVM_EXTRACT_QREF) { + /* + * for quick references we don't clip the entry, so + * the entry may map space "before" the starting + * virtual address... this is the "fudge" factor + * (which can be non-zero only the first time + * through the "while" loop in step 3). + */ + fudge = start - entry->start; + } else { + /* + * normal reference: we clip the map to fit (thus + * fudge is zero) + */ + UVM_MAP_CLIP_START(srcmap, entry, start); + SAVE_HINT(srcmap, entry->prev); + fudge = 0; + } + + } else { + + /* "start" is not within an entry ... skip to next entry */ + if (flags & UVM_EXTRACT_CONTIG) { + error = EINVAL; + goto bad; /* definite hole here ... */ + } + + entry = entry->next; + fudge = 0; + } + /* save values from srcmap for step 6 */ + orig_entry = entry; + orig_fudge = fudge; + + + /* + * step 3: now start looping through the map entries, extracting + * as we go. 
+ */ + + while (entry->start < end && entry != &srcmap->header) { + + /* if we are not doing a quick reference, clip it */ + if ((flags & UVM_EXTRACT_QREF) == 0) + UVM_MAP_CLIP_END(srcmap, entry, end); + + /* clear needs_copy (allow chunking) */ + if (UVM_ET_ISNEEDSCOPY(entry)) { + if (fudge) + oldstart = entry->start; + else + oldstart = 0; /* XXX: gcc */ + amap_copy(srcmap, entry, M_NOWAIT, TRUE, start, end); + if (UVM_ET_ISNEEDSCOPY(entry)) { /* failed? */ + error = ENOMEM; + goto bad; + } + /* amap_copy could clip (during chunk)! update fudge */ + if (fudge) { + fudge = fudge - (entry->start - oldstart); + orig_fudge = fudge; + } + } + + /* calculate the offset of this from "start" */ + oldoffset = (entry->start + fudge) - start; + + /* allocate a new map entry */ + newentry = uvm_mapent_alloc(dstmap); + if (newentry == NULL) { + error = ENOMEM; + goto bad; + } + + /* set up new map entry */ + newentry->next = NULL; + newentry->prev = endchain; + newentry->start = dstaddr + oldoffset; + newentry->end = + newentry->start + (entry->end - (entry->start + fudge)); + if (newentry->end > newend) + newentry->end = newend; + newentry->object.uvm_obj = entry->object.uvm_obj; + if (newentry->object.uvm_obj) { + if (newentry->object.uvm_obj->pgops->pgo_reference) + newentry->object.uvm_obj->pgops-> + pgo_reference(newentry->object.uvm_obj); + newentry->offset = entry->offset + fudge; + } else { + newentry->offset = 0; + } + newentry->etype = entry->etype; + newentry->protection = (flags & UVM_EXTRACT_FIXPROT) ? + entry->max_protection : entry->protection; + newentry->max_protection = entry->max_protection; + newentry->inheritance = entry->inheritance; + newentry->wired_count = 0; + newentry->aref.ar_amap = entry->aref.ar_amap; + if (newentry->aref.ar_amap) { + newentry->aref.ar_pageoff = + entry->aref.ar_pageoff + (fudge >> PAGE_SHIFT); + amap_ref(newentry, AMAP_SHARED | + ((flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0)); + } else { + newentry->aref.ar_pageoff = 0; + } + newentry->advice = entry->advice; + + /* now link it on the chain */ + nchain++; + if (endchain == NULL) { + chain = endchain = newentry; + } else { + endchain->next = newentry; + endchain = newentry; + } + + /* end of 'while' loop! */ + if ((flags & UVM_EXTRACT_CONTIG) && entry->end < end && + (entry->next == &srcmap->header || + entry->next->start != entry->end)) { + error = EINVAL; + goto bad; + } + entry = entry->next; + fudge = 0; + } + + + /* + * step 4: close off chain (in format expected by uvm_map_replace) + */ + + if (chain) + chain->prev = endchain; + + + /* + * step 5: attempt to lock the dest map so we can pmap_copy. 
+ * note usage of copy_ok: + * 1 => dstmap locked, pmap_copy ok, and we "replace" here (step 5) + * 0 => dstmap unlocked, NO pmap_copy, and we will "replace" in step 7 + */ + + if (srcmap == dstmap || vm_map_lock_try(dstmap) == TRUE) { + + copy_ok = 1; + if (!uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain, + nchain)) { + if (srcmap != dstmap) + vm_map_unlock(dstmap); + error = EIO; + goto bad; + } + + } else { + + copy_ok = 0; + /* replace defered until step 7 */ + + } + + + /* + * step 6: traverse the srcmap a second time to do the following: + * - if we got a lock on the dstmap do pmap_copy + * - if UVM_EXTRACT_REMOVE remove the entries + * we make use of orig_entry and orig_fudge (saved in step 2) + */ + + if (copy_ok || (flags & UVM_EXTRACT_REMOVE)) { + + /* purge possible stale hints from srcmap */ + if (flags & UVM_EXTRACT_REMOVE) { + SAVE_HINT(srcmap, orig_entry->prev); + if (srcmap->first_free->start >= start) + srcmap->first_free = orig_entry->prev; + } + + entry = orig_entry; + fudge = orig_fudge; + deadentry = NULL; /* for UVM_EXTRACT_REMOVE */ + + while (entry->start < end && entry != &srcmap->header) { + + if (copy_ok) { + oldoffset = (entry->start + fudge) - start; + elen = min(end, entry->end) - (entry->start + fudge); + pmap_copy(dstmap->pmap, srcmap->pmap, dstaddr + oldoffset, + elen, entry->start + fudge); + } + + /* we advance "entry" in the following if statement */ + if (flags & UVM_EXTRACT_REMOVE) { + pmap_remove(srcmap->pmap, entry->start, + entry->end); + oldentry = entry; /* save entry */ + entry = entry->next; /* advance */ + uvm_map_entry_unlink(srcmap, oldentry); + /* add to dead list */ + oldentry->next = deadentry; + deadentry = oldentry; + } else { + entry = entry->next; /* advance */ + } + + /* end of 'while' loop */ + fudge = 0; + } + + /* + * unlock dstmap. we will dispose of deadentry in + * step 7 if needed + */ + if (copy_ok && srcmap != dstmap) + vm_map_unlock(dstmap); + + } + else + deadentry = NULL; /* XXX: gcc */ + + /* + * step 7: we are done with the source map, unlock. if copy_ok + * is 0 then we have not replaced the dummy mapping in dstmap yet + * and we need to do so now. + */ + + vm_map_unlock(srcmap); + if ((flags & UVM_EXTRACT_REMOVE) && deadentry) + uvm_unmap_detach(deadentry, 0); /* dispose of old entries */ + + /* now do the replacement if we didn't do it in step 5 */ + if (copy_ok == 0) { + vm_map_lock(dstmap); + error = uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain, + nchain); + vm_map_unlock(dstmap); + + if (error == FALSE) { + error = EIO; + goto bad2; + } + } + + /* + * done! + */ + return(0); + + /* + * bad: failure recovery + */ +bad: + vm_map_unlock(srcmap); +bad2: /* src already unlocked */ + if (chain) + uvm_unmap_detach(chain, + (flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0); + uvm_unmap(dstmap, dstaddr, dstaddr+len); /* ??? */ + return(error); +} + +/* end of extraction functions */ + +/* + * uvm_map_submap: punch down part of a map into a submap + * + * => only the kernel_map is allowed to be submapped + * => the purpose of submapping is to break up the locking granularity + * of a larger map + * => the range specified must have been mapped previously with a uvm_map() + * call [with uobj==NULL] to create a blank map entry in the main map. + * [And it had better still be blank!] + * => maps which contain submaps should never be copied or forked. + * => to remove a submap, use uvm_unmap() on the main map + * and then uvm_map_deallocate() the submap. + * => main map must be unlocked. 
+ * => submap must have been init'd and have a zero reference count. + * [need not be locked as we don't actually reference it] + */ + +int +uvm_map_submap(map, start, end, submap) + vm_map_t map, submap; + vaddr_t start, end; +{ + vm_map_entry_t entry; + int result; + UVMHIST_FUNC("uvm_map_submap"); UVMHIST_CALLED(maphist); + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + if (uvm_map_lookup_entry(map, start, &entry)) { + UVM_MAP_CLIP_START(map, entry, start); + UVM_MAP_CLIP_END(map, entry, end); /* to be safe */ + } + else { + entry = NULL; + } + + if (entry != NULL && + entry->start == start && entry->end == end && + entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL && + !UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) { + + /* + * doit! + */ + entry->etype |= UVM_ET_SUBMAP; + entry->object.sub_map = submap; + entry->offset = 0; + uvm_map_reference(submap); + result = KERN_SUCCESS; + } else { + result = KERN_INVALID_ARGUMENT; + } + vm_map_unlock(map); + + return(result); +} + + +/* + * uvm_map_protect: change map protection + * + * => set_max means set max_protection. + * => map must be unlocked. + * => XXXCDC: does not work properly with share maps. rethink. + */ + +#define MASK(entry) ( UVM_ET_ISCOPYONWRITE(entry) ? \ + ~VM_PROT_WRITE : VM_PROT_ALL) +#define max(a,b) ((a) > (b) ? (a) : (b)) + +int +uvm_map_protect(map, start, end, new_prot, set_max) + vm_map_t map; + vaddr_t start, end; + vm_prot_t new_prot; + boolean_t set_max; +{ + vm_map_entry_t current, entry; + UVMHIST_FUNC("uvm_map_protect"); UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist,"(map=0x%x,start=0x%x,end=0x%x,new_prot=0x%x)", + map, start, end, new_prot); + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + if (uvm_map_lookup_entry(map, start, &entry)) { + UVM_MAP_CLIP_START(map, entry, start); + } else { + entry = entry->next; + } + + /* + * make a first pass to check for protection violations. + */ + + current = entry; + while ((current != &map->header) && (current->start < end)) { + if (UVM_ET_ISSUBMAP(current)) + return(KERN_INVALID_ARGUMENT); + if ((new_prot & current->max_protection) != new_prot) { + vm_map_unlock(map); + return(KERN_PROTECTION_FAILURE); + } + current = current->next; + } + + /* go back and fix up protections (no need to clip this time). */ + + current = entry; + + while ((current != &map->header) && (current->start < end)) { + vm_prot_t old_prot; + + UVM_MAP_CLIP_END(map, current, end); + + old_prot = current->protection; + if (set_max) + current->protection = + (current->max_protection = new_prot) & old_prot; + else + current->protection = new_prot; + + /* + * update physical map if necessary. worry about copy-on-write + * here -- CHECK THIS XXX + */ + + if (current->protection != old_prot) { + + /* update pmap! */ + pmap_protect(map->pmap, current->start, current->end, + current->protection & MASK(entry)); + + } + current = current->next; + } + + vm_map_unlock(map); + UVMHIST_LOG(maphist, "<- done",0,0,0,0); + return(KERN_SUCCESS); +} + +#undef max +#undef MASK + +/* + * uvm_map_inherit: set inheritance code for range of addrs in map. + * + * => map must be unlocked + * => note that the inherit code is used during a "fork". see fork + * code for details. + * => XXXCDC: currently only works in main map. what about share map? 
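uvm_map_protect above never hands write permission to the pmap for a copy-on-write entry: the protection actually installed is masked with ~VM_PROT_WRITE, so the first store still faults and the fault handler gets a chance to copy. A tiny standalone illustration of that masking, using made-up protection constants rather than the kernel's:

#define PROT_R   0x1
#define PROT_W   0x2
#define PROT_X   0x4
#define PROT_ALL (PROT_R | PROT_W | PROT_X)

/*
 * Protection to install in the MMU layer: a copy-on-write mapping must
 * not become writable yet, even if the map-level protection allows it.
 */
unsigned
hw_protection(unsigned map_prot, int copy_on_write)
{
    unsigned mask = copy_on_write ? (unsigned)~PROT_W : PROT_ALL;

    return map_prot & mask;
}

So a copy-on-write region whose map protection is read/write is still entered read-only until a write fault resolves the copy.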
+ */ + +int +uvm_map_inherit(map, start, end, new_inheritance) + vm_map_t map; + vaddr_t start; + vaddr_t end; + vm_inherit_t new_inheritance; +{ + vm_map_entry_t entry, temp_entry; + UVMHIST_FUNC("uvm_map_inherit"); UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist,"(map=0x%x,start=0x%x,end=0x%x,new_inh=0x%x)", + map, start, end, new_inheritance); + + switch (new_inheritance) { + case VM_INHERIT_NONE: + case VM_INHERIT_COPY: + case VM_INHERIT_SHARE: + break; + default: + UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0); + return(KERN_INVALID_ARGUMENT); + } + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + if (uvm_map_lookup_entry(map, start, &temp_entry)) { + entry = temp_entry; + UVM_MAP_CLIP_START(map, entry, start); + } else { + entry = temp_entry->next; + } + + while ((entry != &map->header) && (entry->start < end)) { + UVM_MAP_CLIP_END(map, entry, end); + + entry->inheritance = new_inheritance; + + entry = entry->next; + } + + vm_map_unlock(map); + UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0); + return(KERN_SUCCESS); +} + +/* + * uvm_map_pageable: sets the pageability of a range in a map. + * + * => regions sepcified as not pageable require lock-down (wired) memory + * and page tables. + * => map must not be locked. + * => XXXCDC: check this and try and clean it up. + */ + +int +uvm_map_pageable(map, start, end, new_pageable) + vm_map_t map; + vaddr_t start, end; + boolean_t new_pageable; +{ + vm_map_entry_t entry, start_entry; + vaddr_t failed = 0; + int rv; + UVMHIST_FUNC("uvm_map_pageable"); UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist,"(map=0x%x,start=0x%x,end=0x%x,new_pageable=0x%x)", + map, start, end, new_pageable); + + vm_map_lock(map); + VM_MAP_RANGE_CHECK(map, start, end); + + /* + * only one pageability change may take place at one time, since + * uvm_fault_wire assumes it will be called only once for each + * wiring/unwiring. therefore, we have to make sure we're actually + * changing the pageability for the entire region. we do so before + * making any changes. + */ + + if (uvm_map_lookup_entry(map, start, &start_entry) == FALSE) { + vm_map_unlock(map); + + UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0); + return (KERN_INVALID_ADDRESS); + } + entry = start_entry; + + /* + * handle wiring and unwiring seperately. + */ + + if (new_pageable) { /* unwire */ + + UVM_MAP_CLIP_START(map, entry, start); + + /* + * unwiring. first ensure that the range to be unwired is + * really wired down and that there are no holes. + */ + while ((entry != &map->header) && (entry->start < end)) { + + if (entry->wired_count == 0 || + (entry->end < end && + (entry->next == &map->header || + entry->next->start > entry->end))) { + vm_map_unlock(map); + UVMHIST_LOG(maphist, + "<- done (INVALID UNWIRE ARG)",0,0,0,0); + return (KERN_INVALID_ARGUMENT); + } + entry = entry->next; + } + + /* + * now decrement the wiring count for each region. if a region + * becomes completely unwired, unwire its physical pages and + * mappings. 
+ */ +#if 0 /* not necessary: uvm_fault_unwire does not lock */ + lock_set_recursive(&map->lock); +#endif /* XXXCDC */ + + entry = start_entry; + while ((entry != &map->header) && (entry->start < end)) { + UVM_MAP_CLIP_END(map, entry, end); + + entry->wired_count--; + if (entry->wired_count == 0) + uvm_map_entry_unwire(map, entry); + + entry = entry->next; + } +#if 0 /* XXXCDC: not necessary, see above */ + lock_clear_recursive(&map->lock); +#endif + vm_map_unlock(map); + UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0); + return(KERN_SUCCESS); + + /* + * end of unwire case! + */ + } + + /* + * wire case: in two passes [XXXCDC: ugly block of code here] + * + * 1: holding the write lock, we create any anonymous maps that need + * to be created. then we clip each map entry to the region to + * be wired and increment its wiring count. + * + * 2: we downgrade to a read lock, and call uvm_fault_wire to fault + * in the pages for any newly wired area (wired_count is 1). + * + * downgrading to a read lock for uvm_fault_wire avoids a possible + * deadlock with another thread that may have faulted on one of + * the pages to be wired (it would mark the page busy, blocking + * us, then in turn block on the map lock that we hold). because + * of problems in the recursive lock package, we cannot upgrade + * to a write lock in vm_map_lookup. thus, any actions that + * require the write lock must be done beforehand. because we + * keep the read lock on the map, the copy-on-write status of the + * entries we modify here cannot change. + */ + + while ((entry != &map->header) && (entry->start < end)) { + + if (entry->wired_count == 0) { /* not already wired? */ + + /* + * perform actions of vm_map_lookup that need the + * write lock on the map: create an anonymous map + * for a copy-on-write region, or an anonymous map + * for a zero-fill region. (XXXCDC: submap case + * ok?) + */ + + if (!UVM_ET_ISSUBMAP(entry)) { /* not submap */ + /* + * XXXCDC: protection vs. max_protection?? + * (wirefault uses max?) + * XXXCDC: used to do it always if + * uvm_obj == NULL (wrong?) + */ + if ( UVM_ET_ISNEEDSCOPY(entry) && + (entry->protection & VM_PROT_WRITE) != 0) { + amap_copy(map, entry, M_WAITOK, TRUE, + start, end); + /* XXXCDC: wait OK? */ + } + } + } /* wired_count == 0 */ + UVM_MAP_CLIP_START(map, entry, start); + UVM_MAP_CLIP_END(map, entry, end); + entry->wired_count++; + + /* + * Check for holes + */ + if (entry->end < end && (entry->next == &map->header || + entry->next->start > entry->end)) { + /* + * found one. amap creation actions do not need to + * be undone, but the wired counts need to be restored. + */ + while (entry != &map->header && entry->end > start) { + entry->wired_count--; + entry = entry->prev; + } + vm_map_unlock(map); + UVMHIST_LOG(maphist,"<- done (INVALID WIRE)",0,0,0,0); + return(KERN_INVALID_ARGUMENT); + } + entry = entry->next; + } + + /* + * Pass 2. + */ + /* + * HACK HACK HACK HACK + * + * if we are wiring in the kernel map or a submap of it, unlock the + * map to avoid deadlocks. we trust that the kernel threads are + * well-behaved, and therefore will not do anything destructive to + * this region of the map while we have it unlocked. we cannot + * trust user threads to do the same. + * + * HACK HACK HACK HACK + */ + if (vm_map_pmap(map) == pmap_kernel()) { + vm_map_unlock(map); /* trust me ... 
*/ + } else { + vm_map_set_recursive(&map->lock); + lockmgr(&map->lock, LK_DOWNGRADE, (void *)0, curproc /*XXX*/); + } + + rv = 0; + entry = start_entry; + while (entry != &map->header && entry->start < end) { + /* + * if uvm_fault_wire fails for any page we need to undo what has + * been done. we decrement the wiring count for those pages + * which have not yet been wired (now) and unwire those that + * have * (later). + * + * XXX this violates the locking protocol on the map, needs to + * be fixed. [because we only have a read lock on map we + * shouldn't be changing wired_count?] + */ + if (rv) { + entry->wired_count--; + } else if (entry->wired_count == 1) { + rv = uvm_fault_wire(map, entry->start, entry->end); + if (rv) { + failed = entry->start; + entry->wired_count--; + } + } + entry = entry->next; + } + + if (vm_map_pmap(map) == pmap_kernel()) { + vm_map_lock(map); /* relock */ + } else { + vm_map_clear_recursive(&map->lock); + } + + if (rv) { /* failed? */ + vm_map_unlock(map); + (void) uvm_map_pageable(map, start, failed, TRUE); + UVMHIST_LOG(maphist, "<- done (RV=%d)", rv,0,0,0); + return(rv); + } + vm_map_unlock(map); + + UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0); + return(KERN_SUCCESS); +} + +/* + * uvm_map_clean: push dirty pages off to backing store. + * + * => valid flags: + * if (flags & PGO_SYNCIO): dirty pages are written synchronously + * if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean + * if (flags & PGO_FREE): any cached pages are freed after clean + * => returns an error if any part of the specified range isn't mapped + * => never a need to flush amap layer since the anonymous memory has + * no permanent home... + * => called from sys_msync() + * => caller must not write-lock map (read OK). + * => we may sleep while cleaning if SYNCIO [with map read-locked] + * => XXX: does this handle share maps properly? + */ + +int +uvm_map_clean(map, start, end, flags) + vm_map_t map; + vaddr_t start, end; + int flags; +{ + vm_map_entry_t current; + vm_map_entry_t entry; + vsize_t size; + struct uvm_object *object; + vaddr_t offset; + UVMHIST_FUNC("uvm_map_clean"); UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist,"(map=0x%x,start=0x%x,end=0x%x,flags=0x%x)", + map, start, end, flags); + + vm_map_lock_read(map); + VM_MAP_RANGE_CHECK(map, start, end); + if (!uvm_map_lookup_entry(map, start, &entry)) { + vm_map_unlock_read(map); + return(KERN_INVALID_ADDRESS); + } + + /* + * Make a first pass to check for holes. + */ + for (current = entry; current->start < end; current = current->next) { + if (UVM_ET_ISSUBMAP(current)) { + vm_map_unlock_read(map); + return(KERN_INVALID_ARGUMENT); + } + if (end > current->end && (current->next == &map->header || + current->end != current->next->start)) { + vm_map_unlock_read(map); + return(KERN_INVALID_ADDRESS); + } + } + + /* + * add "cleanit" flag to flags (for generic flush routine). + * then make a second pass, cleaning/uncaching pages from + * the indicated objects as we go. + */ + flags = flags | PGO_CLEANIT; + for (current = entry; current->start < end; current = current->next) { + offset = current->offset + (start - current->start); + size = (end <= current->end ? end : current->end) - start; + + /* + * get object/offset. can't be submap (checked above). + */ + object = current->object.uvm_obj; + simple_lock(&object->vmobjlock); + + /* + * flush pages if we've got a valid backing object. + * note that object is locked. + * XXX should we continue on an error? 
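uvm_map_pageable above treats wiring as a counted property of each entry: the first pass bumps wired_count across the whole range (rejecting holes), and only entries whose count went from 0 to 1 are faulted in during the second pass; unwiring does real work only when the count drops back to 0. A small sketch of that transition-only pattern, with hypothetical names and no locking shown:

struct region {
    int wired_count;                 /* how many wirings cover this region */
};

/* stand-ins for the expensive parts (faulting pages in, releasing them) */
void fault_in_pages(struct region *r) { (void)r; }
void release_pages(struct region *r)  { (void)r; }

void
wire_region(struct region *r)
{
    if (r->wired_count++ == 0)       /* 0 -> 1: actually wire the pages */
        fault_in_pages(r);
}

void
unwire_region(struct region *r)
{
    if (--r->wired_count == 0)       /* 1 -> 0: actually unwire */
        release_pages(r);
}

Counting first and faulting second is what lets the routine roll back cleanly when it finds a hole part-way through the range.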
+ */ + + if (object && object->pgops) { + if (!object->pgops->pgo_flush(object, offset, + offset+size, flags)) { + simple_unlock(&object->vmobjlock); + vm_map_unlock_read(map); + return (KERN_FAILURE); + } + } + simple_unlock(&object->vmobjlock); + start += size; + } + vm_map_unlock_read(map); + return(KERN_SUCCESS); +} + + +/* + * uvm_map_checkprot: check protection in map + * + * => must allow specified protection in a fully allocated region. + * => map must be read or write locked by caller. + */ + +boolean_t +uvm_map_checkprot(map, start, end, protection) + vm_map_t map; + vaddr_t start, end; + vm_prot_t protection; +{ + vm_map_entry_t entry; + vm_map_entry_t tmp_entry; + + if (!uvm_map_lookup_entry(map, start, &tmp_entry)) { + return(FALSE); + } + + entry = tmp_entry; + + while (start < end) { + if (entry == &map->header) { + return(FALSE); + } + + /* + * no holes allowed + */ + + if (start < entry->start) { + return(FALSE); + } + + /* + * check protection associated with entry + */ + + if ((entry->protection & protection) != protection) { + return(FALSE); + } + + /* go to next entry */ + + start = entry->end; + entry = entry->next; + } + return(TRUE); +} + +/* + * uvmspace_alloc: allocate a vmspace structure. + * + * - structure includes vm_map and pmap + * - XXX: no locking on this structure + * - refcnt set to 1, rest must be init'd by caller + */ +struct vmspace * +uvmspace_alloc(min, max, pageable) + vaddr_t min, max; + int pageable; +{ + struct vmspace *vm; + UVMHIST_FUNC("uvmspace_alloc"); UVMHIST_CALLED(maphist); + + vm = pool_get(&uvm_vmspace_pool, PR_WAITOK); + uvmspace_init(vm, NULL, min, max, pageable); + UVMHIST_LOG(maphist,"<- done (vm=0x%x)", vm,0,0,0); + return (vm); +} + +/* + * uvmspace_init: initialize a vmspace structure. + * + * - XXX: no locking on this structure + * - refcnt set to 1, rest must me init'd by caller + */ +void +uvmspace_init(vm, pmap, min, max, pageable) + struct vmspace *vm; + struct pmap *pmap; + vaddr_t min, max; + boolean_t pageable; +{ + UVMHIST_FUNC("uvmspace_init"); UVMHIST_CALLED(maphist); + + bzero(vm, sizeof(*vm)); + + uvm_map_setup(&vm->vm_map, min, max, pageable); + + if (pmap) + pmap_reference(pmap); + else +#if defined(PMAP_NEW) + pmap = pmap_create(); +#else + pmap = pmap_create(0); +#endif + vm->vm_map.pmap = pmap; + + vm->vm_refcnt = 1; + UVMHIST_LOG(maphist,"<- done",0,0,0,0); +} + +/* + * uvmspace_share: share a vmspace between two proceses + * + * - XXX: no locking on vmspace + * - used for vfork, threads(?) 
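uvm_map_checkprot above walks the sorted entry list, failing on any hole in [start, end) and on any entry that lacks the requested protection bits. The same walk over a plain sorted array of extents, as a standalone sketch with made-up names:

#include <stddef.h>

struct extent {
    unsigned long start, end;        /* half-open range [start, end) */
    unsigned prot;                   /* protection bits granted */
};

/*
 * Return 1 if [start, end) is fully covered by the sorted, non-overlapping
 * extents and every covering extent grants all bits in "prot".
 */
int
range_has_prot(const struct extent *tab, size_t n,
    unsigned long start, unsigned long end, unsigned prot)
{
    size_t i;

    for (i = 0; i < n && start < end; i++) {
        if (tab[i].end <= start)
            continue;                /* entirely before the range */
        if (tab[i].start > start)
            return 0;                /* hole at "start" */
        if ((tab[i].prot & prot) != prot)
            return 0;                /* insufficient protection */
        start = tab[i].end;          /* advance past this extent */
    }
    return start >= end;             /* covered the whole range? */
}

The cursor only ever advances to the end of whichever extent covered it, so a single pass over the list is enough.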
+ */ + +void +uvmspace_share(p1, p2) + struct proc *p1, *p2; +{ + p2->p_vmspace = p1->p_vmspace; + p1->p_vmspace->vm_refcnt++; +} + +/* + * uvmspace_unshare: ensure that process "p" has its own, unshared, vmspace + * + * - XXX: no locking on vmspace + */ + +void +uvmspace_unshare(p) + struct proc *p; +{ + struct vmspace *nvm, *ovm = p->p_vmspace; + int s; + + if (ovm->vm_refcnt == 1) + /* nothing to do: vmspace isn't shared in the first place */ + return; + + /* make a new vmspace, still holding old one */ + nvm = uvmspace_fork(ovm); + + s = splhigh(); /* make this `atomic' */ + pmap_deactivate(p); + /* unbind old vmspace */ + p->p_vmspace = nvm; + pmap_activate(p); + /* switch to new vmspace */ + splx(s); /* end of critical section */ + + uvmspace_free(ovm); /* drop reference to old vmspace */ +} + +/* + * uvmspace_exec: the process wants to exec a new program + * + * - XXX: no locking on vmspace + */ + +void +uvmspace_exec(p) + struct proc *p; +{ + struct vmspace *nvm, *ovm = p->p_vmspace; + vm_map_t map = &ovm->vm_map; + int s; + +#ifdef sparc + /* XXX cgd 960926: the sparc #ifdef should be a MD hook */ + kill_user_windows(p); /* before stack addresses go away */ +#endif + + /* + * see if more than one process is using this vmspace... + */ + + if (ovm->vm_refcnt == 1) { + + /* + * if p is the only process using its vmspace then we can safely + * recycle that vmspace for the program that is being exec'd. + */ + +#ifdef SYSVSHM + /* + * SYSV SHM semantics require us to kill all segments on an exec + */ + if (ovm->vm_shm) + shmexit(ovm); +#endif + + /* + * now unmap the old program + */ + uvm_unmap(map, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); + + } else { + + /* + * p's vmspace is being shared, so we can't reuse it for p since + * it is still being used for others. allocate a new vmspace + * for p + */ + nvm = uvmspace_alloc(map->min_offset, map->max_offset, + map->entries_pageable); + +#if (defined(i386) || defined(pc532)) && !defined(PMAP_NEW) + /* + * allocate zero fill area in the new vmspace's map for user + * page tables for ports that have old style pmaps that keep + * user page tables in the top part of the process' address + * space. + * + * XXXCDC: this should go away once all pmaps are fixed + */ + { + vaddr_t addr = VM_MAXUSER_ADDRESS; + if (uvm_map(&nvm->vm_map, &addr, VM_MAX_ADDRESS - addr, + NULL, UVM_UNKNOWN_OFFSET, UVM_MAPFLAG(UVM_PROT_ALL, + UVM_PROT_ALL, UVM_INH_NONE, UVM_ADV_NORMAL, + UVM_FLAG_FIXED|UVM_FLAG_COPYONW)) != KERN_SUCCESS) + panic("vm_allocate of PT page area failed"); + } +#endif + + /* + * install new vmspace and drop our ref to the old one. + */ + + s = splhigh(); + pmap_deactivate(p); + p->p_vmspace = nvm; + pmap_activate(p); + splx(s); + + uvmspace_free(ovm); + } +} + +/* + * uvmspace_free: free a vmspace data structure + * + * - XXX: no locking on vmspace + */ + +void +uvmspace_free(vm) + struct vmspace *vm; +{ + vm_map_entry_t dead_entries; + UVMHIST_FUNC("uvmspace_free"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist,"(vm=0x%x) ref=%d", vm, vm->vm_refcnt,0,0); + if (--vm->vm_refcnt == 0) { + /* + * lock the map, to wait out all other references to it. delete + * all of the mappings and pages they hold, then call the pmap + * module to reclaim anything left. 
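The vmspace routines above hang off a plain reference count: uvmspace_share bumps vm_refcnt, uvmspace_unshare and uvmspace_exec only build a private or fresh space when the count shows other users, and uvmspace_free tears everything down when the last reference goes. A compact userland sketch of the exec-style "recycle or replace" decision, with hypothetical as_* names and malloc standing in for the pool allocator:

#include <stdlib.h>

struct addrspace {
    int refcnt;                      /* processes using this address space */
};

struct addrspace *
as_create(void)
{
    struct addrspace *as = calloc(1, sizeof(*as));

    if (as != NULL)
        as->refcnt = 1;
    return as;
}

void
as_drop(struct addrspace *as)
{
    if (--as->refcnt == 0)
        free(as);                    /* last user: tear it down */
}

/*
 * exec-style replacement: reuse the space when we are its only user,
 * otherwise switch to a fresh one and drop our reference to the old.
 */
struct addrspace *
as_exec(struct addrspace *old)
{
    struct addrspace *fresh;

    if (old->refcnt == 1)
        return old;                  /* recycle in place (just unmap it) */
    fresh = as_create();
    if (fresh == NULL)
        return old;                  /* allocation failed: keep the old one */
    as_drop(old);
    return fresh;
}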
+ */ + vm_map_lock(&vm->vm_map); + if (vm->vm_map.nentries) { + (void)uvm_unmap_remove(&vm->vm_map, + vm->vm_map.min_offset, vm->vm_map.max_offset, + &dead_entries); + if (dead_entries != NULL) + uvm_unmap_detach(dead_entries, 0); + } + pmap_destroy(vm->vm_map.pmap); + vm->vm_map.pmap = NULL; + pool_put(&uvm_vmspace_pool, vm); + } + UVMHIST_LOG(maphist,"<- done", 0,0,0,0); +} + +/* + * F O R K - m a i n e n t r y p o i n t + */ +/* + * uvmspace_fork: fork a process' main map + * + * => create a new vmspace for child process from parent. + * => parent's map must not be locked. + */ + +struct vmspace * +uvmspace_fork(vm1) + struct vmspace *vm1; +{ + struct vmspace *vm2; + vm_map_t old_map = &vm1->vm_map; + vm_map_t new_map; + vm_map_entry_t old_entry; + vm_map_entry_t new_entry; + pmap_t new_pmap; + boolean_t protect_child; + UVMHIST_FUNC("uvmspace_fork"); UVMHIST_CALLED(maphist); + +#if (defined(i386) || defined(pc532)) && !defined(PMAP_NEW) + /* + * avoid copying any of the parent's pagetables or other per-process + * objects that reside in the map by marking all of them non-inheritable + * XXXCDC: should go away + */ + (void) uvm_map_inherit(old_map, VM_MAXUSER_ADDRESS, VM_MAX_ADDRESS, + VM_INHERIT_NONE); +#endif + + vm_map_lock(old_map); + + vm2 = uvmspace_alloc(old_map->min_offset, old_map->max_offset, + old_map->entries_pageable); + bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy, + (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy); + new_map = &vm2->vm_map; /* XXX */ + new_pmap = new_map->pmap; + + old_entry = old_map->header.next; + + /* + * go entry-by-entry + */ + + while (old_entry != &old_map->header) { + + /* + * first, some sanity checks on the old entry + */ + if (UVM_ET_ISSUBMAP(old_entry)) + panic("fork: encountered a submap during fork (illegal)"); + + if (!UVM_ET_ISCOPYONWRITE(old_entry) && + UVM_ET_ISNEEDSCOPY(old_entry)) + panic("fork: non-copy_on_write map entry marked needs_copy (illegal)"); + + + switch (old_entry->inheritance) { + case VM_INHERIT_NONE: + /* + * drop the mapping + */ + break; + + case VM_INHERIT_SHARE: + /* + * share the mapping: this means we want the old and + * new entries to share amaps and backing objects. + */ + + /* + * if the old_entry needs a new amap (due to prev fork) + * then we need to allocate it now so that we have + * something we own to share with the new_entry. [in + * other words, we need to clear needs_copy] + */ + + if (UVM_ET_ISNEEDSCOPY(old_entry)) { + /* get our own amap, clears needs_copy */ + amap_copy(old_map, old_entry, M_WAITOK, FALSE, + 0, 0); + /* XXXCDC: WAITOK??? */ + } + + new_entry = uvm_mapent_alloc(new_map); + /* old_entry -> new_entry */ + uvm_mapent_copy(old_entry, new_entry); + + /* new pmap has nothing wired in it */ + new_entry->wired_count = 0; + + /* + * gain reference to object backing the map (can't + * be a submap, already checked this case). + */ + if (new_entry->aref.ar_amap) + /* share reference */ + amap_ref(new_entry, AMAP_SHARED); + + if (new_entry->object.uvm_obj && + new_entry->object.uvm_obj->pgops->pgo_reference) + new_entry->object.uvm_obj-> + pgops->pgo_reference( + new_entry->object.uvm_obj); + + /* insert entry at end of new_map's entry list */ + uvm_map_entry_link(new_map, new_map->header.prev, + new_entry); + + /* + * pmap_copy the mappings: this routine is optional + * but if it is there it will reduce the number of + * page faults in the new proc. 
+ */ + + pmap_copy(new_pmap, old_map->pmap, new_entry->start, + (old_entry->end - old_entry->start), + old_entry->start); + + break; + + case VM_INHERIT_COPY: + + /* + * copy-on-write the mapping (using mmap's + * MAP_PRIVATE semantics) + * + * allocate new_entry, adjust reference counts. + * (note that new references are read-only). + */ + + new_entry = uvm_mapent_alloc(new_map); + /* old_entry -> new_entry */ + uvm_mapent_copy(old_entry, new_entry); + + if (new_entry->aref.ar_amap) + amap_ref(new_entry, 0); + + if (new_entry->object.uvm_obj && + new_entry->object.uvm_obj->pgops->pgo_reference) + new_entry->object.uvm_obj->pgops->pgo_reference + (new_entry->object.uvm_obj); + + /* new pmap has nothing wired in it */ + new_entry->wired_count = 0; + + new_entry->etype |= + (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY); + uvm_map_entry_link(new_map, new_map->header.prev, + new_entry); + + /* + * the new entry will need an amap. it will either + * need to be copied from the old entry or created + * from scratch (if the old entry does not have an + * amap). can we defer this process until later + * (by setting "needs_copy") or do we need to copy + * the amap now? + * + * we must copy the amap now if any of the following + * conditions hold: + * 1. the old entry has an amap and that amap is + * being shared. this means that the old (parent) + * process is sharing the amap with another + * process. if we do not clear needs_copy here + * we will end up in a situation where both the + * parent and child process are refering to the + * same amap with "needs_copy" set. if the + * parent write-faults, the fault routine will + * clear "needs_copy" in the parent by allocating + * a new amap. this is wrong because the + * parent is supposed to be sharing the old amap + * and the new amap will break that. + * + * 2. if the old entry has an amap and a non-zero + * wire count then we are going to have to call + * amap_cow_now to avoid page faults in the + * parent process. since amap_cow_now requires + * "needs_copy" to be clear we might as well + * clear it here as well. + * + */ + + if (old_entry->aref.ar_amap != NULL) { + + if ((amap_flags(old_entry->aref.ar_amap) & + AMAP_SHARED) != 0 || + old_entry->wired_count != 0) { + + amap_copy(new_map, new_entry, M_WAITOK, FALSE, + 0, 0); + /* XXXCDC: M_WAITOK ... ok? */ + } + } + + /* + * if the parent's entry is wired down, then the + * parent process does not want page faults on + * access to that memory. this means that we + * cannot do copy-on-write because we can't write + * protect the old entry. in this case we + * resolve all copy-on-write faults now, using + * amap_cow_now. note that we have already + * allocated any needed amap (above). + */ + + if (old_entry->wired_count != 0) { + + /* + * resolve all copy-on-write faults now + * (note that there is nothing to do if + * the old mapping does not have an amap). + * XXX: is it worthwhile to bother with pmap_copy + * in this case? + */ + if (old_entry->aref.ar_amap) + amap_cow_now(new_map, new_entry); + + } else { + + /* + * setup mappings to trigger copy-on-write faults + * we must write-protect the parent if it has + * an amap and it is not already "needs_copy"... + * if it is already "needs_copy" then the parent + * has already been write-protected by a previous + * fork operation. + * + * if we do not write-protect the parent, then + * we must be sure to write-protect the child + * after the pmap_copy() operation. 
+ * + * XXX: pmap_copy should have some way of telling + * us that it didn't do anything so we can avoid + * calling pmap_protect needlessly. + */ + + if (old_entry->aref.ar_amap) { + + if (!UVM_ET_ISNEEDSCOPY(old_entry)) { + if (old_entry->max_protection & VM_PROT_WRITE) { + pmap_protect(old_map->pmap, + old_entry->start, + old_entry->end, + old_entry->protection & + ~VM_PROT_WRITE); + } + old_entry->etype |= UVM_ET_NEEDSCOPY; + } + + /* + * parent must now be write-protected + */ + protect_child = FALSE; + } else { + + /* + * we only need to protect the child if the + * parent has write access. + */ + if (old_entry->max_protection & VM_PROT_WRITE) + protect_child = TRUE; + else + protect_child = FALSE; + + } + + /* + * copy the mappings + * XXX: need a way to tell if this does anything + */ + + pmap_copy(new_pmap, old_map->pmap, + new_entry->start, + (old_entry->end - old_entry->start), + old_entry->start); + + /* + * protect the child's mappings if necessary + */ + if (protect_child) { + pmap_protect(new_pmap, new_entry->start, + new_entry->end, + new_entry->protection & + ~VM_PROT_WRITE); + } + + } + break; + } /* end of switch statement */ + old_entry = old_entry->next; + } + + new_map->size = old_map->size; + vm_map_unlock(old_map); + +#if (defined(i386) || defined(pc532)) && !defined(PMAP_NEW) + /* + * allocate zero fill area in the new vmspace's map for user + * page tables for ports that have old style pmaps that keep + * user page tables in the top part of the process' address + * space. + * + * XXXCDC: this should go away once all pmaps are fixed + */ + { + vaddr_t addr = VM_MAXUSER_ADDRESS; + if (uvm_map(new_map, &addr, VM_MAX_ADDRESS - addr, NULL, + UVM_UNKNOWN_OFFSET, UVM_MAPFLAG(UVM_PROT_ALL, + UVM_PROT_ALL, UVM_INH_NONE, UVM_ADV_NORMAL, + UVM_FLAG_FIXED|UVM_FLAG_COPYONW)) != KERN_SUCCESS) + panic("vm_allocate of PT page area failed"); + } +#endif + +#ifdef SYSVSHM + if (vm1->vm_shm) + shmfork(vm1, vm2); +#endif + + UVMHIST_LOG(maphist,"<- done",0,0,0,0); + return(vm2); +} + + +#if defined(DDB) + +/* + * DDB hooks + */ + +/* + * uvm_map_print: print out a map + */ + +void +uvm_map_print(map, full) + vm_map_t map; + boolean_t full; +{ + + uvm_map_printit(map, full, printf); +} + +/* + * uvm_map_printit: actually prints the map + */ + +void +uvm_map_printit(map, full, pr) + vm_map_t map; + boolean_t full; + int (*pr) __P((const char *, ...)); +{ + vm_map_entry_t entry; + + (*pr)("MAP %p: [0x%lx->0x%lx]\n", map, map->min_offset,map->max_offset); + (*pr)("\t#ent=%d, sz=%d, ref=%d, version=%d\n", + map->nentries, map->size, map->ref_count, map->timestamp); +#ifdef pmap_resident_count + (*pr)("\tpmap=%p(resident=%d)\n", map->pmap, + pmap_resident_count(map->pmap)); +#else + /* XXXCDC: this should be required ... */ + (*pr)("\tpmap=%p(resident=<<NOT SUPPORTED!!!>>)\n", map->pmap); +#endif + if (!full) + return; + for (entry = map->header.next; entry != &map->header; + entry = entry->next) { + (*pr)(" - %p: 0x%lx->0x%lx: obj=%p/0x%x, amap=%p/%d\n", + entry, entry->start, entry->end, entry->object.uvm_obj, + entry->offset, entry->aref.ar_amap, entry->aref.ar_pageoff); + (*pr)( +"\tsubmap=%c, cow=%c, nc=%c, prot(max)=%d/%d, inh=%d, wc=%d, adv=%d\n", + (entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F', + (entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F', + (entry->etype & UVM_ET_NEEDSCOPY) ? 
'T' : 'F', + entry->protection, entry->max_protection, + entry->inheritance, entry->wired_count, entry->advice); + } +} + +/* + * uvm_object_print: print out an object + */ + +void +uvm_object_print(uobj, full) + struct uvm_object *uobj; + boolean_t full; +{ + + uvm_object_printit(uobj, full, printf); +} + +/* + * uvm_object_printit: actually prints the object + */ + +void +uvm_object_printit(uobj, full, pr) + struct uvm_object *uobj; + boolean_t full; + int (*pr) __P((const char *, ...)); +{ + struct vm_page *pg; + int cnt = 0; + + (*pr)("OBJECT %p: pgops=%p, npages=%d, ", uobj, uobj->pgops, + uobj->uo_npages); + if (uobj->uo_refs == UVM_OBJ_KERN) + (*pr)("refs=<SYSTEM>\n"); + else + (*pr)("refs=%d\n", uobj->uo_refs); + + if (!full) return; + (*pr)(" PAGES <pg,offset>:\n "); + for (pg = uobj->memq.tqh_first ; pg ; pg = pg->listq.tqe_next, cnt++) { + (*pr)("<%p,0x%lx> ", pg, pg->offset); + if ((cnt % 3) == 2) (*pr)("\n "); + } + if ((cnt % 3) != 2) (*pr)("\n"); +} + +/* + * uvm_page_print: print out a page + */ + +void +uvm_page_print(pg, full) + struct vm_page *pg; + boolean_t full; +{ + + uvm_page_printit(pg, full, printf); +} + +/* + * uvm_page_printit: actually print the page + */ + +void +uvm_page_printit(pg, full, pr) + struct vm_page *pg; + boolean_t full; + int (*pr) __P((const char *, ...)); +{ + struct vm_page *lcv; + struct uvm_object *uobj; + struct pglist *pgl; + + (*pr)("PAGE %p:\n", pg); + (*pr)(" flags=0x%x, pqflags=0x%x, vers=%d, wire_count=%d, pa=0x%lx\n", + pg->flags, pg->pqflags, pg->version, pg->wire_count, (long)pg->phys_addr); + (*pr)(" uobject=%p, uanon=%p, offset=0x%lx loan_count=%d\n", + pg->uobject, pg->uanon, pg->offset, pg->loan_count); +#if defined(UVM_PAGE_TRKOWN) + if (pg->flags & PG_BUSY) + (*pr)(" owning process = %d, tag=%s\n", + pg->owner, pg->owner_tag); + else + (*pr)(" page not busy, no owner\n"); +#else + (*pr)(" [page ownership tracking disabled]\n"); +#endif + + if (!full) + return; + + /* cross-verify object/anon */ + if ((pg->pqflags & PQ_FREE) == 0) { + if (pg->pqflags & PQ_ANON) { + if (pg->uanon == NULL || pg->uanon->u.an_page != pg) + (*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n", + (pg->uanon) ? pg->uanon->u.an_page : NULL); + else + (*pr)(" anon backpointer is OK\n"); + } else { + uobj = pg->uobject; + if (uobj) { + (*pr)(" checking object list\n"); + for (lcv = uobj->memq.tqh_first ; lcv ; + lcv = lcv->listq.tqe_next) { + if (lcv == pg) break; + } + if (lcv) + (*pr)(" page found on object list\n"); + else + (*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n"); + } + } + } + + /* cross-verify page queue */ + if (pg->pqflags & PQ_FREE) + pgl = &uvm.page_free[uvm_page_lookup_freelist(pg)]; + else if (pg->pqflags & PQ_INACTIVE) + pgl = (pg->pqflags & PQ_SWAPBACKED) ? + &uvm.page_inactive_swp : &uvm.page_inactive_obj; + else if (pg->pqflags & PQ_ACTIVE) + pgl = &uvm.page_active; + else + pgl = NULL; + + if (pgl) { + (*pr)(" checking pageq list\n"); + for (lcv = pgl->tqh_first ; lcv ; lcv = lcv->pageq.tqe_next) { + if (lcv == pg) break; + } + if (lcv) + (*pr)(" page found on pageq list\n"); + else + (*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n"); + } +} +#endif diff --git a/sys/uvm/uvm_map.h b/sys/uvm/uvm_map.h new file mode 100644 index 00000000000..4c10b5222d1 --- /dev/null +++ b/sys/uvm/uvm_map.h @@ -0,0 +1,166 @@ +/* $NetBSD: uvm_map.h,v 1.10 1998/10/11 23:14:48 chuck Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1997 Charles D. 
Cranor and Washington University. + * Copyright (c) 1991, 1993, The Regents of the University of California. + * + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor, + * Washington University, the University of California, Berkeley and + * its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_map.h 8.3 (Berkeley) 3/15/94 + * from: Id: uvm_map.h,v 1.1.2.3 1998/02/07 01:16:55 chs Exp + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +#ifndef _UVM_UVM_MAP_H_ +#define _UVM_UVM_MAP_H_ + +/* + * uvm_map.h + */ + +/* + * macros + */ + +/* + * UVM_MAP_CLIP_START: ensure that the entry begins at or after + * the starting address, if it doesn't we split the entry. 
+ * + * => map must be locked by caller + */ + +#define UVM_MAP_CLIP_START(MAP,ENTRY,VA) { \ + if ((VA) > (ENTRY)->start) uvm_map_clip_start(MAP,ENTRY,VA); } + +/* + * UVM_MAP_CLIP_END: ensure that the entry ends at or before + * the ending address, if it does't we split the entry. + * + * => map must be locked by caller + */ + +#define UVM_MAP_CLIP_END(MAP,ENTRY,VA) { \ + if ((VA) < (ENTRY)->end) uvm_map_clip_end(MAP,ENTRY,VA); } + +/* + * extract flags + */ +#define UVM_EXTRACT_REMOVE 0x1 /* remove mapping from old map */ +#define UVM_EXTRACT_CONTIG 0x2 /* try to keep it contig */ +#define UVM_EXTRACT_QREF 0x4 /* use quick refs */ +#define UVM_EXTRACT_FIXPROT 0x8 /* set prot to maxprot as we go */ + + +/* + * handle inline options + */ + +#ifdef UVM_MAP_INLINE +#define MAP_INLINE static __inline +#else +#define MAP_INLINE /* nothing */ +#endif /* UVM_MAP_INLINE */ + +/* + * protos: the following prototypes define the interface to vm_map + */ + +MAP_INLINE +void uvm_map_deallocate __P((vm_map_t)); + +int uvm_map_clean __P((vm_map_t, vaddr_t, vaddr_t, int)); +void uvm_map_clip_start __P((vm_map_t, + vm_map_entry_t, vaddr_t)); +void uvm_map_clip_end __P((vm_map_t, vm_map_entry_t, + vaddr_t)); +MAP_INLINE +vm_map_t uvm_map_create __P((pmap_t, vaddr_t, + vaddr_t, boolean_t)); +int uvm_map_extract __P((vm_map_t, vaddr_t, vsize_t, + vm_map_t, vaddr_t *, int)); +vm_map_entry_t uvm_map_findspace __P((vm_map_t, vaddr_t, vsize_t, + vaddr_t *, struct uvm_object *, vaddr_t, + boolean_t)); +int uvm_map_inherit __P((vm_map_t, vaddr_t, vaddr_t, + vm_inherit_t)); +void uvm_map_init __P((void)); +boolean_t uvm_map_lookup_entry __P((vm_map_t, vaddr_t, + vm_map_entry_t *)); +MAP_INLINE +void uvm_map_reference __P((vm_map_t)); +int uvm_map_replace __P((vm_map_t, vaddr_t, vaddr_t, + vm_map_entry_t, int)); +int uvm_map_reserve __P((vm_map_t, vsize_t, vaddr_t, + vaddr_t *)); +void uvm_map_setup __P((vm_map_t, vaddr_t, + vaddr_t, boolean_t)); +int uvm_map_submap __P((vm_map_t, vaddr_t, + vaddr_t, vm_map_t)); +MAP_INLINE +int uvm_unmap __P((vm_map_t, vaddr_t, vaddr_t)); +void uvm_unmap_detach __P((vm_map_entry_t,int)); +int uvm_unmap_remove __P((vm_map_t, vaddr_t, vaddr_t, + vm_map_entry_t *)); + +struct vmspace *uvmspace_fork __P((struct vmspace *)); + +#endif /* _UVM_UVM_MAP_H_ */ diff --git a/sys/uvm/uvm_map_i.h b/sys/uvm/uvm_map_i.h new file mode 100644 index 00000000000..56842e191b6 --- /dev/null +++ b/sys/uvm/uvm_map_i.h @@ -0,0 +1,243 @@ +/* $NetBSD: uvm_map_i.h,v 1.10 1998/10/11 23:14:48 chuck Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * Copyright (c) 1991, 1993, The Regents of the University of California. + * + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor, + * Washington University, the University of California, Berkeley and + * its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_map.c 8.3 (Berkeley) 1/12/94 + * from: Id: uvm_map_i.h,v 1.1.2.1 1997/08/14 19:10:50 chuck Exp + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +#ifndef _UVM_UVM_MAP_I_H_ +#define _UVM_UVM_MAP_I_H_ + +/* + * uvm_map_i.h + */ + +/* + * inline functions [maybe] + */ + +#if defined(UVM_MAP_INLINE) || defined(UVM_MAP) + +/* + * uvm_map_create: create map + */ + +MAP_INLINE vm_map_t +uvm_map_create(pmap, min, max, pageable) + pmap_t pmap; + vaddr_t min, max; + boolean_t pageable; +{ + vm_map_t result; + + MALLOC(result, vm_map_t, sizeof(struct vm_map), M_VMMAP, M_WAITOK); + uvm_map_setup(result, min, max, pageable); + result->pmap = pmap; + return(result); +} + +/* + * uvm_map_setup: init map + * + * => map must not be in service yet. 
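uvm_map_setup, whose body follows, starts the entry list as a header that points at itself (map->header.next = map->header.prev = &map->header), the standard empty circular doubly-linked list; first_free and hint begin at the header as well. A minimal standalone version of that sentinel idiom, with generic names rather than the UVM types:

struct entry {
    struct entry *prev, *next;
};

/* an empty list is just the sentinel pointing at itself */
void
list_init(struct entry *header)
{
    header->next = header->prev = header;
}

int
list_empty(const struct entry *header)
{
    return header->next == header;
}

/* link "e" right after "after"; the sentinel keeps this branch-free */
void
list_insert_after(struct entry *after, struct entry *e)
{
    e->prev = after;
    e->next = after->next;
    after->next->prev = e;
    after->next = e;
}

Because the sentinel is always there, insertion and removal need no NULL checks, and the map code can use &map->header as the "one past the last entry" marker it tests against everywhere.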
+ */ + +MAP_INLINE void +uvm_map_setup(map, min, max, pageable) + vm_map_t map; + vaddr_t min, max; + boolean_t pageable; +{ + + map->header.next = map->header.prev = &map->header; + map->nentries = 0; + map->size = 0; + map->ref_count = 1; + map->min_offset = min; + map->max_offset = max; + map->entries_pageable = pageable; + map->first_free = &map->header; + map->hint = &map->header; + map->timestamp = 0; + lockinit(&map->lock, PVM, "thrd_sleep", 0, 0); + simple_lock_init(&map->ref_lock); + simple_lock_init(&map->hint_lock); +} + + +/* + * U N M A P - m a i n e n t r y p o i n t + */ + +/* + * uvm_unmap: remove mappings from a vm_map (from "start" up to "stop") + * + * => caller must check alignment and size + * => map must be unlocked (we will lock it) + * => if the "start"/"stop" range lie within a mapping of a share map, + * then the unmap takes place within the context of that share map + * rather than in the main map, unless the "mainonly" flag is set. + * (e.g. the "exit" system call would want to set "mainonly"). + */ + +MAP_INLINE int +uvm_unmap(map, start, end) + vm_map_t map; + vaddr_t start,end; +{ + int result; + vm_map_entry_t dead_entries; + UVMHIST_FUNC("uvm_unmap"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist, " (map=0x%x, start=0x%x, end=0x%x)", + map, start, end, 0); + /* + * work now done by helper functions. wipe the pmap's and then + * detach from the dead entries... + */ + vm_map_lock(map); + result = uvm_unmap_remove(map, start, end, &dead_entries); + vm_map_unlock(map); + + if (dead_entries != NULL) + uvm_unmap_detach(dead_entries, 0); + + UVMHIST_LOG(maphist, "<- done", 0,0,0,0); + return(result); +} + + +/* + * uvm_map_reference: add reference to a map + * + * => map need not be locked (we use ref_lock). + */ + +MAP_INLINE void +uvm_map_reference(map) + vm_map_t map; +{ + if (map == NULL) { +#ifdef DIAGNOSTIC + printf("uvm_map_reference: reference to NULL map\n"); +#ifdef DDB + Debugger(); +#endif +#endif + return; + } + + simple_lock(&map->ref_lock); + map->ref_count++; + simple_unlock(&map->ref_lock); +} + +/* + * uvm_map_deallocate: drop reference to a map + * + * => caller must not lock map + * => we will zap map if ref count goes to zero + */ + +MAP_INLINE void +uvm_map_deallocate(map) + vm_map_t map; +{ + int c; + + if (map == NULL) { +#ifdef DIAGNOSTIC + printf("uvm_map_deallocate: reference to NULL map\n"); +#ifdef DDB + Debugger(); +#endif +#endif + return; + } + + simple_lock(&map->ref_lock); + c = --map->ref_count; + simple_unlock(&map->ref_lock); + + if (c > 0) { + return; + } + + /* + * all references gone. unmap and free. + */ + + uvm_unmap(map, map->min_offset, map->max_offset); + pmap_destroy(map->pmap); + + FREE(map, M_VMMAP); +} + +#endif /* defined(UVM_MAP_INLINE) || defined(UVM_MAP) */ + +#endif /* _UVM_UVM_MAP_I_H_ */ diff --git a/sys/uvm/uvm_meter.c b/sys/uvm/uvm_meter.c new file mode 100644 index 00000000000..e064a087e64 --- /dev/null +++ b/sys/uvm/uvm_meter.c @@ -0,0 +1,246 @@ +/* $NetBSD: uvm_meter.c,v 1.7 1998/08/09 22:36:39 perry Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor, + * Washington University, and the University of California, Berkeley + * and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94 + * from: Id: uvm_meter.c,v 1.1.2.1 1997/08/14 19:10:35 chuck Exp + */ + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <vm/vm.h> +#include <sys/sysctl.h> +#include <sys/exec.h> + +/* + * maxslp: ???? XXXCDC + */ + +int maxslp = MAXSLP; /* patchable ... */ +struct loadavg averunnable; /* decl. */ + +/* + * constants for averages over 1, 5, and 15 minutes when sampling at + * 5 second intervals. + */ + +static fixpt_t cexp[3] = { + 0.9200444146293232 * FSCALE, /* exp(-1/12) */ + 0.9834714538216174 * FSCALE, /* exp(-1/60) */ + 0.9944598480048967 * FSCALE, /* exp(-1/180) */ +}; + +/* + * prototypes + */ + +static void uvm_loadav __P((struct loadavg *)); + +/* + * uvm_meter: calculate load average and wake up the swapper (if needed) + */ +void +uvm_meter() +{ + if ((time.tv_sec % 5) == 0) + uvm_loadav(&averunnable); + if (proc0.p_slptime > (maxslp / 2)) + wakeup((caddr_t)&proc0); +} + +/* + * uvm_loadav: compute a tenex style load average of a quantity on + * 1, 5, and 15 minute internvals. + */ +static void +uvm_loadav(avg) + struct loadavg *avg; +{ + int i, nrun; + struct proc *p; + + for (nrun = 0, p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + switch (p->p_stat) { + case SSLEEP: + if (p->p_priority > PZERO || p->p_slptime > 1) + continue; + /* fall through */ + case SRUN: + case SIDL: + nrun++; + } + } + for (i = 0; i < 3; i++) + avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; +} + +/* + * uvm_sysctl: sysctl hook into UVM system. 
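Before the sysctl hook, the recurrence used by uvm_loadav() above deserves a standalone illustration: each average decays by a precomputed factor of exp(-interval/window) and the current run-queue length is blended in, all in fixed point. The sketch below assumes FSHIFT is 11 (the customary BSD value; param.h is not part of this hunk) and reuses the three scaled constants from this file.

#include <stdio.h>

#define FSHIFT	11			/* assumed; see sys/param.h */
#define FSCALE	(1 << FSHIFT)

typedef long fixpt_t;

/* exp(-1/12), exp(-1/60), exp(-1/180), scaled, as in uvm_meter.c */
static const fixpt_t cexp[3] = {
	(fixpt_t)(0.9200444146293232 * FSCALE),
	(fixpt_t)(0.9834714538216174 * FSCALE),
	(fixpt_t)(0.9944598480048967 * FSCALE),
};

static void
loadav_update(fixpt_t ldavg[3], int nrun)
{
	int i;

	/* avg = decay * avg + (1 - decay) * nrun, all scaled by FSCALE */
	for (i = 0; i < 3; i++)
		ldavg[i] = (cexp[i] * ldavg[i] +
		    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
}

int
main(void)
{
	fixpt_t ldavg[3] = { 0, 0, 0 };
	int tick;

	/* one minute of 5-second samples with three runnable processes */
	for (tick = 0; tick < 12; tick++)
		loadav_update(ldavg, 3);
	printf("1min %.2f  5min %.2f  15min %.2f\n",
	    (double)ldavg[0] / FSCALE,
	    (double)ldavg[1] / FSCALE,
	    (double)ldavg[2] / FSCALE);
	return 0;
}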
+ */ +int +uvm_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) + int *name; + u_int namelen; + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + struct proc *p; +{ + struct vmtotal vmtotals; + struct _ps_strings _ps = { PS_STRINGS }; + + /* all sysctl names at this level are terminal */ + if (namelen != 1) + return (ENOTDIR); /* overloaded */ + + switch (name[0]) { + case VM_LOADAVG: + return (sysctl_rdstruct(oldp, oldlenp, newp, &averunnable, + sizeof(averunnable))); + + case VM_METER: + uvm_total(&vmtotals); + return (sysctl_rdstruct(oldp, oldlenp, newp, &vmtotals, + sizeof(vmtotals))); + + case VM_UVMEXP: + return (sysctl_rdstruct(oldp, oldlenp, newp, &uvmexp, + sizeof(uvmexp))); + + case VM_PSSTRINGS: + return (sysctl_rdstruct(oldp, oldlenp, newp, &_ps, + sizeof(_ps))); + + default: + return (EOPNOTSUPP); + } + /* NOTREACHED */ +} + +/* + * uvm_total: calculate the current state of the system. + */ +void +uvm_total(totalp) + struct vmtotal *totalp; +{ + struct proc *p; +#if 0 + vm_map_entry_t entry; + vm_map_t map; + int paging; +#endif + + bzero(totalp, sizeof *totalp); + + /* + * calculate process statistics + */ + + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + if (p->p_flag & P_SYSTEM) + continue; + switch (p->p_stat) { + case 0: + continue; + + case SSLEEP: + case SSTOP: + if (p->p_flag & P_INMEM) { + if (p->p_priority <= PZERO) + totalp->t_dw++; + else if (p->p_slptime < maxslp) + totalp->t_sl++; + } else if (p->p_slptime < maxslp) + totalp->t_sw++; + if (p->p_slptime >= maxslp) + continue; + break; + + case SRUN: + case SIDL: + if (p->p_flag & P_INMEM) + totalp->t_rq++; + else + totalp->t_sw++; + if (p->p_stat == SIDL) + continue; + break; + } + /* + * note active objects + */ +#if 0 + /* + * XXXCDC: BOGUS! rethink this. in the mean time + * don't do it. + */ + paging = 0; + vm_map_lock(map); + for (map = &p->p_vmspace->vm_map, entry = map->header.next; + entry != &map->header; entry = entry->next) { + if (entry->is_a_map || entry->is_sub_map || + entry->object.uvm_obj == NULL) + continue; + /* XXX how to do this with uvm */ + } + vm_map_unlock(map); + if (paging) + totalp->t_pw++; +#endif + } + /* + * Calculate object memory usage statistics. + */ + totalp->t_free = uvmexp.free; + totalp->t_vm = uvmexp.npages - uvmexp.free + uvmexp.swpginuse; + totalp->t_avm = uvmexp.active + uvmexp.swpginuse; /* XXX */ + totalp->t_rm = uvmexp.npages - uvmexp.free; + totalp->t_arm = uvmexp.active; + totalp->t_vmshr = 0; /* XXX */ + totalp->t_avmshr = 0; /* XXX */ + totalp->t_rmshr = 0; /* XXX */ + totalp->t_armshr = 0; /* XXX */ +} diff --git a/sys/uvm/uvm_mmap.c b/sys/uvm/uvm_mmap.c new file mode 100644 index 00000000000..66724213c55 --- /dev/null +++ b/sys/uvm/uvm_mmap.c @@ -0,0 +1,963 @@ +/* $NetBSD: uvm_mmap.c,v 1.15 1998/10/11 23:18:20 chuck Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * Copyright (c) 1991, 1993 The Regents of the University of California. + * Copyright (c) 1988 University of Utah. + * + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Charles D. Cranor, + * Washington University, University of California, Berkeley and + * its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ + * @(#)vm_mmap.c 8.5 (Berkeley) 5/19/94 + * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp + */ + +/* + * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap + * function. + */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/resourcevar.h> +#include <sys/mman.h> +#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/vnode.h> +#include <sys/conf.h> +#include <sys/stat.h> + +#include <miscfs/specfs/specdev.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#include <sys/syscallargs.h> + +#include <uvm/uvm.h> +#include <uvm/uvm_device.h> +#include <uvm/uvm_vnode.h> + + +/* + * unimplemented VM system calls: + */ + +/* + * sys_sbrk: sbrk system call. + */ + +/* ARGSUSED */ +int +sys_sbrk(p, v, retval) + struct proc *p; + void *v; + register_t *retval; +{ +#if 0 + struct sys_sbrk_args /* { + syscallarg(int) incr; + } */ *uap = v; +#endif + + return (EOPNOTSUPP); +} + +/* + * sys_sstk: sstk system call. + */ + +/* ARGSUSED */ +int +sys_sstk(p, v, retval) + struct proc *p; + void *v; + register_t *retval; +{ +#if 0 + struct sys_sstk_args /* { + syscallarg(int) incr; + } */ *uap = v; +#endif + + return (EOPNOTSUPP); +} + +/* + * sys_madvise: give advice about memory usage. + */ + +/* ARGSUSED */ +int +sys_madvise(p, v, retval) + struct proc *p; + void *v; + register_t *retval; +{ +#if 0 + struct sys_madvise_args /* { + syscallarg(caddr_t) addr; + syscallarg(size_t) len; + syscallarg(int) behav; + } */ *uap = v; +#endif + + return (EOPNOTSUPP); +} + +/* + * sys_mincore: determine if pages are in core or not. 
+ */ + +/* ARGSUSED */ +int +sys_mincore(p, v, retval) + struct proc *p; + void *v; + register_t *retval; +{ +#if 0 + struct sys_mincore_args /* { + syscallarg(caddr_t) addr; + syscallarg(size_t) len; + syscallarg(char *) vec; + } */ *uap = v; +#endif + + return (EOPNOTSUPP); +} + +#if 0 +/* + * munmapfd: unmap file descriptor + * + * XXX: is this acutally a useful function? could it be useful? + */ + +void +munmapfd(p, fd) + struct proc *p; + int fd; +{ + + /* + * XXX should vm_deallocate any regions mapped to this file + */ + p->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED; +} +#endif + +/* + * sys_mmap: mmap system call. + * + * => file offest and address may not be page aligned + * - if MAP_FIXED, offset and address must have remainder mod PAGE_SIZE + * - if address isn't page aligned the mapping starts at trunc_page(addr) + * and the return value is adjusted up by the page offset. + */ + +int +sys_mmap(p, v, retval) + struct proc *p; + void *v; + register_t *retval; +{ + register struct sys_mmap_args /* { + syscallarg(caddr_t) addr; + syscallarg(size_t) len; + syscallarg(int) prot; + syscallarg(int) flags; + syscallarg(int) fd; + syscallarg(long) pad; + syscallarg(off_t) pos; + } */ *uap = v; + vaddr_t addr; + struct vattr va; + off_t pos; + vsize_t size, pageoff; + vm_prot_t prot, maxprot; + int flags, fd; + vaddr_t vm_min_address = VM_MIN_ADDRESS; + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct vnode *vp; + caddr_t handle; + int error; + + /* + * first, extract syscall args from the uap. + */ + + addr = (vaddr_t) SCARG(uap, addr); + size = (vsize_t) SCARG(uap, len); + prot = SCARG(uap, prot) & VM_PROT_ALL; + flags = SCARG(uap, flags); + fd = SCARG(uap, fd); + pos = SCARG(uap, pos); + + /* + * make sure that the newsize fits within a vaddr_t + * XXX: need to revise addressing data types + */ + if (pos + size > (vaddr_t)-PAGE_SIZE) { +#ifdef DEBUG + printf("mmap: pos=%qx, size=%x too big\n", pos, (int)size); +#endif + return (EINVAL); + } + + /* + * align file position and save offset. adjust size. + */ + + pageoff = (pos & PAGE_MASK); + pos -= pageoff; + size += pageoff; /* add offset */ + size = (vsize_t) round_page(size); /* round up */ + if ((ssize_t) size < 0) + return (EINVAL); /* don't allow wrap */ + + /* + * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr" + */ + + if (flags & MAP_FIXED) { + + /* ensure address and file offset are aligned properly */ + addr -= pageoff; + if (addr & PAGE_MASK) + return (EINVAL); + + if (VM_MAXUSER_ADDRESS > 0 && + (addr + size) > VM_MAXUSER_ADDRESS) + return (EINVAL); + if (vm_min_address > 0 && addr < vm_min_address) + return (EINVAL); + if (addr > addr + size) + return (EINVAL); /* no wrapping! */ + + } else { + + /* + * not fixed: make sure we skip over the largest possible heap. + * we will refine our guess later (e.g. to account for VAC, etc) + */ + if (addr < round_page(p->p_vmspace->vm_daddr + MAXDSIZ)) + addr = round_page(p->p_vmspace->vm_daddr + MAXDSIZ); + } + + /* + * check for file mappings (i.e. not anonymous) and verify file. + */ + + if ((flags & MAP_ANON) == 0) { + + if (fd < 0 || fd >= fdp->fd_nfiles) + return(EBADF); /* failed range check? */ + fp = fdp->fd_ofiles[fd]; /* convert to file pointer */ + if (fp == NULL) + return(EBADF); + + if (fp->f_type != DTYPE_VNODE) + return (ENODEV); /* only mmap vnodes! 
*/ + vp = (struct vnode *)fp->f_data; /* convert to vnode */ + + if (vp->v_type != VREG && vp->v_type != VCHR && + vp->v_type != VBLK) + return (ENODEV); /* only REG/CHR/BLK support mmap */ + + /* special case: catch SunOS style /dev/zero */ + if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) { + flags |= MAP_ANON; + goto is_anon; + } + + /* + * Old programs may not select a specific sharing type, so + * default to an appropriate one. + * + * XXX: how does MAP_ANON fit in the picture? + */ + if ((flags & (MAP_SHARED|MAP_PRIVATE|MAP_COPY)) == 0) { +#if defined(DEBUG) + printf("WARNING: defaulted mmap() share type to " + "%s (pid %d comm %s)\n", vp->v_type == VCHR ? + "MAP_SHARED" : "MAP_PRIVATE", p->p_pid, + p->p_comm); +#endif + if (vp->v_type == VCHR) + flags |= MAP_SHARED; /* for a device */ + else + flags |= MAP_PRIVATE; /* for a file */ + } + + /* + * MAP_PRIVATE device mappings don't make sense (and aren't + * supported anyway). However, some programs rely on this, + * so just change it to MAP_SHARED. + */ + if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) { +#if defined(DIAGNOSTIC) + printf("WARNING: converted MAP_PRIVATE device mapping " + "to MAP_SHARED (pid %d comm %s)\n", p->p_pid, + p->p_comm); +#endif + flags = (flags & ~MAP_PRIVATE) | MAP_SHARED; + } + + /* + * now check protection + */ + + maxprot = VM_PROT_EXECUTE; + + /* check read access */ + if (fp->f_flag & FREAD) + maxprot |= VM_PROT_READ; + else if (prot & PROT_READ) + return (EACCES); + + /* check write access, shared case first */ + if (flags & MAP_SHARED) { + /* + * if the file is writable, only add PROT_WRITE to + * maxprot if the file is not immutable, append-only. + * otherwise, if we have asked for PROT_WRITE, return + * EPERM. + */ + if (fp->f_flag & FWRITE) { + if ((error = + VOP_GETATTR(vp, &va, p->p_ucred, p))) + return (error); + if ((va.va_flags & (IMMUTABLE|APPEND)) == 0) + maxprot |= VM_PROT_WRITE; + else if (prot & PROT_WRITE) + return (EPERM); + } + else if (prot & PROT_WRITE) + return (EACCES); + } else { + /* MAP_PRIVATE mappings can always write to */ + maxprot |= VM_PROT_WRITE; + } + + /* + * set handle to vnode + */ + + handle = (caddr_t)vp; + + } else { /* MAP_ANON case */ + + if (fd != -1) + return (EINVAL); + +is_anon: /* label for SunOS style /dev/zero */ + handle = NULL; + maxprot = VM_PROT_ALL; + pos = 0; + } + + /* + * now let kernel internal function uvm_mmap do the work. + */ + + error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot, + flags, handle, pos); + + if (error == 0) + /* remember to add offset */ + *retval = (register_t)(addr + pageoff); + + return (error); +} + +/* + * XXX + * XXX + * XXX + */ +int +sys_omsync(p, v, retval) + struct proc *p; + void *v; + register_t *retval; +{ + return EOPNOTSUPP; +} + +/* + * sys___msync13: the msync system call (a front-end for flush) + */ + +int +sys_msync(p, v, retval) /* ART_UVM_XXX - is this correct msync? 
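Stepping back to the protection handling in sys_mmap() above: the maximum protection of a file mapping is derived from how the descriptor was opened, and for MAP_SHARED also from the immutable/append-only attributes, while MAP_PRIVATE mappings may always gain write access because writes land in anonymous copies. A hedged userland restatement of that decision follows; PROT_* stands in for the kernel's VM_PROT_* values, and the boolean arguments stand in for the FREAD/FWRITE flags and the vattr bits.

#include <errno.h>
#include <sys/mman.h>

/*
 * Illustrative only: compute the widest protection a mapping of an open
 * file may ever be upgraded to.  Returns 0 and fills *maxprot, or an errno.
 */
static int
mmap_maxprot(int prot, int shared, int can_read, int can_write,
    int immutable_or_append, int *maxprot)
{
	int mp = PROT_EXEC;

	if (can_read)
		mp |= PROT_READ;
	else if (prot & PROT_READ)
		return EACCES;			/* fd not open for reading */

	if (shared) {
		if (can_write) {
			if (!immutable_or_append)
				mp |= PROT_WRITE;
			else if (prot & PROT_WRITE)
				return EPERM;	/* immutable or append-only */
		} else if (prot & PROT_WRITE)
			return EACCES;		/* fd not open for writing */
	} else {
		/* MAP_PRIVATE: copy-on-write, so writing is always allowed */
		mp |= PROT_WRITE;
	}

	*maxprot = mp;
	return 0;
}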
*/ + struct proc *p; + void *v; + register_t *retval; +{ + struct sys_msync_args /* { + syscallarg(caddr_t) addr; + syscallarg(size_t) len; + syscallarg(int) flags; + } */ *uap = v; + vaddr_t addr; + vsize_t size, pageoff; + vm_map_t map; + int rv, flags, uvmflags; + + /* + * extract syscall args from the uap + */ + + addr = (vaddr_t)SCARG(uap, addr); + size = (vsize_t)SCARG(uap, len); + flags = SCARG(uap, flags); + + /* sanity check flags */ + if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 || + (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 || + (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC)) + return (EINVAL); + if ((flags & (MS_ASYNC | MS_SYNC)) == 0) + flags |= MS_SYNC; + + /* + * align the address to a page boundary, and adjust the size accordingly + */ + + pageoff = (addr & PAGE_MASK); + addr -= pageoff; + size += pageoff; + size = (vsize_t) round_page(size); + + /* disallow wrap-around. */ + if (addr + size < addr) + return (EINVAL); + + /* + * get map + */ + + map = &p->p_vmspace->vm_map; + + /* + * XXXCDC: do we really need this semantic? + * + * XXX Gak! If size is zero we are supposed to sync "all modified + * pages with the region containing addr". Unfortunately, we + * don't really keep track of individual mmaps so we approximate + * by flushing the range of the map entry containing addr. + * This can be incorrect if the region splits or is coalesced + * with a neighbor. + */ + if (size == 0) { + vm_map_entry_t entry; + + vm_map_lock_read(map); + rv = uvm_map_lookup_entry(map, addr, &entry); + if (rv == TRUE) { + addr = entry->start; + size = entry->end - entry->start; + } + vm_map_unlock_read(map); + if (rv == FALSE) + return (EINVAL); + } + + /* + * translate MS_ flags into PGO_ flags + */ + uvmflags = (flags & MS_INVALIDATE) ? PGO_FREE : 0; + if (flags & MS_SYNC) + uvmflags |= PGO_SYNCIO; + else + uvmflags |= PGO_SYNCIO; /* XXXCDC: force sync for now! */ + + /* + * doit! + */ + rv = uvm_map_clean(map, addr, addr+size, uvmflags); + + /* + * and return... + */ + switch (rv) { + case KERN_SUCCESS: + return(0); + case KERN_INVALID_ADDRESS: + return (ENOMEM); + case KERN_FAILURE: + return (EIO); + case KERN_PAGES_LOCKED: /* XXXCDC: uvm doesn't return this */ + return (EBUSY); + default: + return (EINVAL); + } + /*NOTREACHED*/ +} + +/* + * sys_munmap: unmap a users memory + */ + +int +sys_munmap(p, v, retval) + register struct proc *p; + void *v; + register_t *retval; +{ + register struct sys_munmap_args /* { + syscallarg(caddr_t) addr; + syscallarg(size_t) len; + } */ *uap = v; + vaddr_t addr; + vsize_t size, pageoff; + vm_map_t map; + vaddr_t vm_min_address = VM_MIN_ADDRESS; + struct vm_map_entry *dead_entries; + + /* + * get syscall args... + */ + + addr = (vaddr_t) SCARG(uap, addr); + size = (vsize_t) SCARG(uap, len); + + /* + * align the address to a page boundary, and adjust the size accordingly + */ + + pageoff = (addr & PAGE_MASK); + addr -= pageoff; + size += pageoff; + size = (vsize_t) round_page(size); + + if ((int)size < 0) + return (EINVAL); + if (size == 0) + return (0); + + /* + * Check for illegal addresses. Watch out for address wrap... + * Note that VM_*_ADDRESS are not constants due to casts (argh). 
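The same preamble recurs in msync, munmap, mprotect, minherit, mlock and munlock: strip the page offset from the address, fold it into the length, round the length up to whole pages, and reject ranges that wrap or leave the user portion of the address space. A compact sketch of that idiom; PAGE_SIZE and the upper bound are illustrative constants rather than the machine-dependent PAGE_MASK and VM_MAXUSER_ADDRESS, and the low-address check is omitted.

#include <errno.h>

#define PAGE_SIZE	4096UL		/* illustrative */
#define PAGE_MASK	(PAGE_SIZE - 1)
#define MAX_USER_ADDR	0xbfff0000UL	/* stand-in for VM_MAXUSER_ADDRESS */

#define round_page(x)	(((x) + PAGE_MASK) & ~PAGE_MASK)

static int
align_user_range(unsigned long addr, unsigned long len,
    unsigned long *start, unsigned long *size)
{
	unsigned long pageoff = addr & PAGE_MASK;

	addr -= pageoff;			/* back up to a page boundary */
	len = round_page(len + pageoff);	/* cover the spilled bytes too */

	if (addr + len < addr)			/* wrap-around */
		return EINVAL;
	if (addr + len > MAX_USER_ADDR)		/* past the user VA range */
		return EINVAL;

	*start = addr;
	*size = len;
	return 0;
}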
+ */ + if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) + return (EINVAL); + if (vm_min_address > 0 && addr < vm_min_address) + return (EINVAL); + if (addr > addr + size) + return (EINVAL); + map = &p->p_vmspace->vm_map; + + + vm_map_lock(map); /* lock map so we can checkprot */ + + /* + * interesting system call semantic: make sure entire range is + * allocated before allowing an unmap. + */ + + if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) { + vm_map_unlock(map); + return (EINVAL); + } + + /* + * doit! + */ + (void) uvm_unmap_remove(map, addr, addr + size, &dead_entries); + + vm_map_unlock(map); /* and unlock */ + + if (dead_entries != NULL) + uvm_unmap_detach(dead_entries, 0); + + return (0); +} + +/* + * sys_mprotect: the mprotect system call + */ + +int +sys_mprotect(p, v, retval) + struct proc *p; + void *v; + register_t *retval; +{ + struct sys_mprotect_args /* { + syscallarg(caddr_t) addr; + syscallarg(int) len; + syscallarg(int) prot; + } */ *uap = v; + vaddr_t addr; + vsize_t size, pageoff; + vm_prot_t prot; + int rv; + + /* + * extract syscall args from uap + */ + + addr = (vaddr_t)SCARG(uap, addr); + size = (vsize_t)SCARG(uap, len); + prot = SCARG(uap, prot) & VM_PROT_ALL; + + /* + * align the address to a page boundary, and adjust the size accordingly + */ + pageoff = (addr & PAGE_MASK); + addr -= pageoff; + size += pageoff; + size = (vsize_t) round_page(size); + if ((int)size < 0) + return (EINVAL); + + /* + * doit + */ + + rv = uvm_map_protect(&p->p_vmspace->vm_map, + addr, addr+size, prot, FALSE); + + if (rv == KERN_SUCCESS) + return (0); + if (rv == KERN_PROTECTION_FAILURE) + return (EACCES); + return (EINVAL); +} + +/* + * sys_minherit: the minherit system call + */ + +int +sys_minherit(p, v, retval) + struct proc *p; + void *v; + register_t *retval; +{ + struct sys_minherit_args /* { + syscallarg(caddr_t) addr; + syscallarg(int) len; + syscallarg(int) inherit; + } */ *uap = v; + vaddr_t addr; + vsize_t size, pageoff; + register vm_inherit_t inherit; + + addr = (vaddr_t)SCARG(uap, addr); + size = (vsize_t)SCARG(uap, len); + inherit = SCARG(uap, inherit); + /* + * align the address to a page boundary, and adjust the size accordingly + */ + + pageoff = (addr & PAGE_MASK); + addr -= pageoff; + size += pageoff; + size = (vsize_t) round_page(size); + + if ((int)size < 0) + return (EINVAL); + + switch (uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size, + inherit)) { + case KERN_SUCCESS: + return (0); + case KERN_PROTECTION_FAILURE: + return (EACCES); + } + return (EINVAL); +} + +/* + * sys_mlock: memory lock + */ + +int +sys_mlock(p, v, retval) + struct proc *p; + void *v; + register_t *retval; +{ + struct sys_mlock_args /* { + syscallarg(const void *) addr; + syscallarg(size_t) len; + } */ *uap = v; + vaddr_t addr; + vsize_t size, pageoff; + int error; + + /* + * extract syscall args from uap + */ + addr = (vaddr_t)SCARG(uap, addr); + size = (vsize_t)SCARG(uap, len); + + /* + * align the address to a page boundary and adjust the size accordingly + */ + pageoff = (addr & PAGE_MASK); + addr -= pageoff; + size += pageoff; + size = (vsize_t) round_page(size); + + /* disallow wrap-around. 
*/ + if (addr + (int)size < addr) + return (EINVAL); + + if (atop(size) + uvmexp.wired > uvmexp.wiredmax) + return (EAGAIN); + +#ifdef pmap_wired_count + if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) > + p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) + return (EAGAIN); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + + error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE); + return (error == KERN_SUCCESS ? 0 : ENOMEM); +} + +/* + * sys_munlock: unlock wired pages + */ + +int +sys_munlock(p, v, retval) + struct proc *p; + void *v; + register_t *retval; +{ + struct sys_munlock_args /* { + syscallarg(const void *) addr; + syscallarg(size_t) len; + } */ *uap = v; + vaddr_t addr; + vsize_t size, pageoff; + int error; + + /* + * extract syscall args from uap + */ + + addr = (vaddr_t)SCARG(uap, addr); + size = (vsize_t)SCARG(uap, len); + + /* + * align the address to a page boundary, and adjust the size accordingly + */ + pageoff = (addr & PAGE_MASK); + addr -= pageoff; + size += pageoff; + size = (vsize_t) round_page(size); + + /* disallow wrap-around. */ + if (addr + (int)size < addr) + return (EINVAL); + +#ifndef pmap_wired_count + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + + error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE); + return (error == KERN_SUCCESS ? 0 : ENOMEM); +} + +/* + * uvm_mmap: internal version of mmap + * + * - used by sys_mmap, exec, and sysv shm + * - handle is a vnode pointer or NULL for MAP_ANON (XXX: not true, + * sysv shm uses "named anonymous memory") + * - caller must page-align the file offset + */ + +int +uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff) + vm_map_t map; + vaddr_t *addr; + vsize_t size; + vm_prot_t prot, maxprot; + int flags; + caddr_t handle; /* XXX: VNODE? */ + vaddr_t foff; +{ + struct uvm_object *uobj; + struct vnode *vp; + int retval; + int advice = UVM_ADV_NORMAL; + uvm_flag_t uvmflag = 0; + + /* + * check params + */ + + if (size == 0) + return(0); + if (foff & PAGE_MASK) + return(EINVAL); + if ((prot & maxprot) != prot) + return(EINVAL); + + /* + * for non-fixed mappings, round off the suggested address. + * for fixed mappings, check alignment and zap old mappings. + */ + + if ((flags & MAP_FIXED) == 0) { + *addr = round_page(*addr); /* round */ + } else { + + if (*addr & PAGE_MASK) + return(EINVAL); + uvmflag |= UVM_FLAG_FIXED; + (void) uvm_unmap(map, *addr, *addr + size); /* zap! */ + } + + /* + * handle anon vs. non-anon mappings. for non-anon mappings attach + * to underlying vm object. + */ + + if (flags & MAP_ANON) { + + foff = UVM_UNKNOWN_OFFSET; + uobj = NULL; + if ((flags & MAP_SHARED) == 0) + /* XXX: defer amap create */ + uvmflag |= UVM_FLAG_COPYONW; + else + /* shared: create amap now */ + uvmflag |= UVM_FLAG_OVERLAY; + + } else { + + vp = (struct vnode *) handle; /* get vnode */ + if (vp->v_type != VCHR) { + uobj = uvn_attach((void *) vp, (flags & MAP_SHARED) ? + maxprot : (maxprot & ~VM_PROT_WRITE)); + + /* + * XXXCDC: hack from old code + * don't allow vnodes which have been mapped + * shared-writeable to persist [forces them to be + * flushed out when last reference goes]. + * XXXCDC: interesting side effect: avoids a bug. + * note that in WRITE [ufs_readwrite.c] that we + * allocate buffer, uncache, and then do the write. + * the problem with this is that if the uncache causes + * VM data to be flushed to the same area of the file + * we are writing to... 
in that case we've got the + * buffer locked and our process goes to sleep forever. + * + * XXXCDC: checking maxprot protects us from the + * "persistbug" program but this is not a long term + * solution. + * + * XXXCDC: we don't bother calling uncache with the vp + * VOP_LOCKed since we know that we are already + * holding a valid reference to the uvn (from the + * uvn_attach above), and thus it is impossible for + * the uncache to kill the uvn and trigger I/O. + */ + if (flags & MAP_SHARED) { + if ((prot & VM_PROT_WRITE) || + (maxprot & VM_PROT_WRITE)) { + uvm_vnp_uncache(vp); + } + } + + } else { + uobj = udv_attach((void *) &vp->v_rdev, + (flags & MAP_SHARED) ? + maxprot : (maxprot & ~VM_PROT_WRITE)); + advice = UVM_ADV_RANDOM; + } + + if (uobj == NULL) + return((vp->v_type == VREG) ? ENOMEM : EINVAL); + + if ((flags & MAP_SHARED) == 0) + uvmflag |= UVM_FLAG_COPYONW; + } + + /* + * set up mapping flags + */ + + uvmflag = UVM_MAPFLAG(prot, maxprot, + (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY, + advice, uvmflag); + + /* + * do it! + */ + + retval = uvm_map(map, addr, size, uobj, foff, uvmflag); + + if (retval == KERN_SUCCESS) + return(0); + + /* + * errors: first detach from the uobj, if any. + */ + + if (uobj) + uobj->pgops->pgo_detach(uobj); + + switch (retval) { + case KERN_INVALID_ADDRESS: + case KERN_NO_SPACE: + return(ENOMEM); + case KERN_PROTECTION_FAILURE: + return(EACCES); + } + return(EINVAL); +} diff --git a/sys/uvm/uvm_object.h b/sys/uvm/uvm_object.h new file mode 100644 index 00000000000..10e00d1535a --- /dev/null +++ b/sys/uvm/uvm_object.h @@ -0,0 +1,74 @@ +/* $NetBSD: uvm_object.h,v 1.5 1998/03/09 00:58:58 mrg Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
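To close out uvm_mmap() above: the caller's BSD flags are folded into UVM's own mapping flags, with anonymous shared memory getting its amap immediately (UVM_FLAG_OVERLAY), private mappings of either kind deferring the copy until a write (UVM_FLAG_COPYONW), and character-device objects mapped with random-access advice. The sketch below tabulates that translation; the flag values and the advice enum are symbolic stand-ins, not the real UVM encodings.

#include <sys/mman.h>

/* symbolic stand-ins for the UVM flag bits used by uvm_mmap() above */
#define UVM_FLAG_COPYONW	0x1	/* copy-on-write; amap created lazily */
#define UVM_FLAG_OVERLAY	0x2	/* create the amap up front */
#define UVM_FLAG_FIXED		0x4	/* caller chose the address */

enum advice { ADV_NORMAL, ADV_RANDOM };

struct mapping_plan {
	int		uvmflags;
	enum advice	advice;
	int		anonymous;	/* no backing uvm_object */
};

static struct mapping_plan
plan_mapping(int flags, int is_char_device)
{
	struct mapping_plan p = { 0, ADV_NORMAL, 0 };

	if (flags & MAP_FIXED)
		p.uvmflags |= UVM_FLAG_FIXED;

	if (flags & MAP_ANON) {
		p.anonymous = 1;
		if (flags & MAP_SHARED)
			p.uvmflags |= UVM_FLAG_OVERLAY;	/* shared: amap now */
		else
			p.uvmflags |= UVM_FLAG_COPYONW;	/* private: defer */
	} else {
		if (is_char_device)
			p.advice = ADV_RANDOM;		/* udv_attach() path */
		if ((flags & MAP_SHARED) == 0)
			p.uvmflags |= UVM_FLAG_COPYONW;
	}
	return p;
}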
+ * + * from: Id: uvm_object.h,v 1.1.2.2 1998/01/04 22:44:51 chuck Exp + */ + +#ifndef _UVM_UVM_OBJECT_H_ +#define _UVM_UVM_OBJECT_H_ + +/* + * uvm_object.h + */ + +/* + * uvm_object: all that is left of mach objects. + */ + +struct uvm_object { + simple_lock_data_t vmobjlock; /* lock on memq */ + struct uvm_pagerops *pgops; /* pager ops */ + struct pglist memq; /* pages in this object */ + int uo_npages; /* # of pages in memq */ + int uo_refs; /* reference count */ +}; + +/* + * UVM_OBJ_KERN is a 'special' uo_refs value which indicates that the + * object is a kernel memory object rather than a normal one (kernel + * memory objects don't have reference counts -- they never die). + * + * this value is used to detected kernel object mappings at uvm_unmap() + * time. normally when an object is unmapped its pages eventaully become + * deactivated and then paged out and/or freed. this is not useful + * for kernel objects... when a kernel object is unmapped we always want + * to free the resources associated with the mapping. UVM_OBJ_KERN + * allows us to decide which type of unmapping we want to do. + */ +#define UVM_OBJ_KERN (-2) + +#endif /* _UVM_UVM_OBJECT_H_ */ diff --git a/sys/uvm/uvm_page.c b/sys/uvm/uvm_page.c new file mode 100644 index 00000000000..15ad5ce99aa --- /dev/null +++ b/sys/uvm/uvm_page.c @@ -0,0 +1,1122 @@ +/* $NetBSD: uvm_page.c,v 1.15 1998/10/18 23:50:00 chs Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * Copyright (c) 1991, 1993, The Regents of the University of California. + * + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor, + * Washington University, the University of California, Berkeley and + * its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
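The UVM_OBJ_KERN comment above is the whole trick: kernel objects are not reference counted, so a sentinel value in uo_refs both marks them and guarantees they are never torn down, and uvm_unmap() can key off it to release mapping resources eagerly. A minimal sketch of dropping a reference against such a sentinel; the detach hook here is a placeholder, not the real pgo_detach pager operation.

#define UVM_OBJ_KERN	(-2)	/* sentinel: kernel object, never dies */

struct object {
	int	uo_refs;			/* count, or UVM_OBJ_KERN */
	void	(*detach)(struct object *);	/* placeholder teardown hook */
};

static void
object_release(struct object *obj)
{
	if (obj->uo_refs == UVM_OBJ_KERN)
		return;			/* kernel objects are not counted */
	if (--obj->uo_refs == 0)
		obj->detach(obj);	/* last reference: tear it down */
}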
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_page.c 8.3 (Berkeley) 3/21/94 + * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * uvm_page.c: page ops. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#define UVM_PAGE /* pull in uvm_page.h functions */ +#include <uvm/uvm.h> + +/* + * global vars... XXXCDC: move to uvm. structure. + */ + +/* + * physical memory config is stored in vm_physmem. + */ + +struct vm_physseg vm_physmem[VM_PHYSSEG_MAX]; /* XXXCDC: uvm.physmem */ +int vm_nphysseg = 0; /* XXXCDC: uvm.nphysseg */ + +/* + * local variables + */ + +/* + * these variables record the values returned by vm_page_bootstrap, + * for debugging purposes. The implementation of uvm_pageboot_alloc + * and pmap_startup here also uses them internally. + */ + +static vaddr_t virtual_space_start; +static vaddr_t virtual_space_end; + +/* + * we use a hash table with only one bucket during bootup. we will + * later rehash (resize) the hash table once malloc() is ready. + * we static allocate the bootstrap bucket below... 
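The bootstrap arrangement described above is worth seeing concretely: the <object, offset> hash starts life as one statically allocated bucket with a zero mask, and because the bucket count is always a power of two the bucket is selected by masking the hash value. The sketch below mirrors that shape; the mixing in hash_bucket() is invented for illustration, since uvm_pagehash() itself is not part of this hunk.

#include <sys/queue.h>

struct page {
	TAILQ_ENTRY(page) hashq;	/* linkage within one bucket */
	void		*object;	/* owning object */
	unsigned long	offset;		/* offset within that object */
};
TAILQ_HEAD(bucket, page);

static struct bucket bootbucket = TAILQ_HEAD_INITIALIZER(bootbucket);
static struct bucket *page_hash = &bootbucket;	/* one bucket at boot */
static unsigned long page_hashmask;		/* nbuckets - 1, so 0 here */

/* invented mixing; the real uvm_pagehash() lives elsewhere */
static struct bucket *
hash_bucket(void *object, unsigned long offset)
{
	unsigned long h = (unsigned long)object ^ (offset >> 12);

	return &page_hash[h & page_hashmask];
}

static void
hash_insert(struct page *pg)
{
	TAILQ_INSERT_TAIL(hash_bucket(pg->object, pg->offset), pg, hashq);
}

static void
hash_remove(struct page *pg)
{
	TAILQ_REMOVE(hash_bucket(pg->object, pg->offset), pg, hashq);
}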
+ */ + +static struct pglist uvm_bootbucket; + +/* + * local prototypes + */ + +static void uvm_pageinsert __P((struct vm_page *)); + + +/* + * inline functions + */ + +/* + * uvm_pageinsert: insert a page in the object and the hash table + * + * => caller must lock object + * => caller must lock page queues + * => call should have already set pg's object and offset pointers + * and bumped the version counter + */ + +__inline static void +uvm_pageinsert(pg) + struct vm_page *pg; +{ + struct pglist *buck; + int s; + +#ifdef DIAGNOSTIC + if (pg->flags & PG_TABLED) + panic("uvm_pageinsert: already inserted"); +#endif + + buck = &uvm.page_hash[uvm_pagehash(pg->uobject,pg->offset)]; + s = splimp(); + simple_lock(&uvm.hashlock); + TAILQ_INSERT_TAIL(buck, pg, hashq); /* put in hash */ + simple_unlock(&uvm.hashlock); + splx(s); + + TAILQ_INSERT_TAIL(&pg->uobject->memq, pg, listq); /* put in object */ + pg->flags |= PG_TABLED; + pg->uobject->uo_npages++; + +} + +/* + * uvm_page_remove: remove page from object and hash + * + * => caller must lock object + * => caller must lock page queues + */ + +void __inline +uvm_pageremove(pg) + struct vm_page *pg; +{ + struct pglist *buck; + int s; + +#ifdef DIAGNOSTIC + if ((pg->flags & (PG_FAULTING)) != 0) + panic("uvm_pageremove: page is faulting"); +#endif + + if ((pg->flags & PG_TABLED) == 0) + return; /* XXX: log */ + + buck = &uvm.page_hash[uvm_pagehash(pg->uobject,pg->offset)]; + s = splimp(); + simple_lock(&uvm.hashlock); + TAILQ_REMOVE(buck, pg, hashq); + simple_unlock(&uvm.hashlock); + splx(s); + + /* object should be locked */ + TAILQ_REMOVE(&pg->uobject->memq, pg, listq); + + pg->flags &= ~PG_TABLED; + pg->uobject->uo_npages--; + pg->uobject = NULL; + pg->version++; + +} + +/* + * uvm_page_init: init the page system. called from uvm_init(). + * + * => we return the range of kernel virtual memory in kvm_startp/kvm_endp + */ + +void +uvm_page_init(kvm_startp, kvm_endp) + vaddr_t *kvm_startp, *kvm_endp; +{ + int freepages, pagecount; + vm_page_t pagearray; + int lcv, n, i; + paddr_t paddr; + + + /* + * step 1: init the page queues and page queue locks + */ + for (lcv = 0; lcv < VM_NFREELIST; lcv++) + TAILQ_INIT(&uvm.page_free[lcv]); + TAILQ_INIT(&uvm.page_active); + TAILQ_INIT(&uvm.page_inactive_swp); + TAILQ_INIT(&uvm.page_inactive_obj); + simple_lock_init(&uvm.pageqlock); + simple_lock_init(&uvm.fpageqlock); + + /* + * step 2: init the <obj,offset> => <page> hash table. for now + * we just have one bucket (the bootstrap bucket). later on we + * will malloc() new buckets as we dynamically resize the hash table. + */ + + uvm.page_nhash = 1; /* 1 bucket */ + uvm.page_hashmask = 0; /* mask for hash function */ + uvm.page_hash = &uvm_bootbucket; /* install bootstrap bucket */ + TAILQ_INIT(uvm.page_hash); /* init hash table */ + simple_lock_init(&uvm.hashlock); /* init hash table lock */ + + /* + * step 3: allocate vm_page structures. + */ + + /* + * sanity check: + * before calling this function the MD code is expected to register + * some free RAM with the uvm_page_physload() function. our job + * now is to allocate vm_page structures for this memory. + */ + + if (vm_nphysseg == 0) + panic("vm_page_bootstrap: no memory pre-allocated"); + + /* + * first calculate the number of free pages... + * + * note that we use start/end rather than avail_start/avail_end. + * this allows us to allocate extra vm_page structures in case we + * want to return some memory to the pool after booting. 
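The computation that follows this comment splits the registered RAM between the vm_page array itself and the pages that array will describe, since the array is carved out of the same memory: pagecount = ((freepages + 1) << PAGE_SHIFT) / (PAGE_SIZE + sizeof(struct vm_page)) charges each usable page one PAGE_SIZE plus one structure. A standalone check of that arithmetic, with illustrative page and structure sizes:

#include <stdio.h>

#define PAGE_SHIFT	12		/* illustrative: 4 KB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

struct vm_page_stub { char pad[96]; };	/* stand-in for struct vm_page */

int
main(void)
{
	unsigned long freepages = 32768;	/* 128 MB of registered RAM */
	unsigned long pagecount;

	/* the "+ 1" is the fudge against truncation noted in the source */
	pagecount = ((freepages + 1) << PAGE_SHIFT) /
	    (PAGE_SIZE + sizeof(struct vm_page_stub));

	printf("%lu raw pages -> %lu usable pages, %lu bytes of vm_page[]\n",
	    freepages, pagecount,
	    pagecount * sizeof(struct vm_page_stub));
	return 0;
}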
+ */ + + freepages = 0; + for (lcv = 0 ; lcv < vm_nphysseg ; lcv++) + freepages += (vm_physmem[lcv].end - vm_physmem[lcv].start); + + /* + * we now know we have (PAGE_SIZE * freepages) bytes of memory we can + * use. for each page of memory we use we need a vm_page structure. + * thus, the total number of pages we can use is the total size of + * the memory divided by the PAGE_SIZE plus the size of the vm_page + * structure. we add one to freepages as a fudge factor to avoid + * truncation errors (since we can only allocate in terms of whole + * pages). + */ + + pagecount = ((freepages + 1) << PAGE_SHIFT) / + (PAGE_SIZE + sizeof(struct vm_page)); + pagearray = (vm_page_t)uvm_pageboot_alloc(pagecount * + sizeof(struct vm_page)); + bzero(pagearray, pagecount * sizeof(struct vm_page)); + + /* + * step 4: init the vm_page structures and put them in the correct + * place... + */ + + for (lcv = 0 ; lcv < vm_nphysseg ; lcv++) { + + n = vm_physmem[lcv].end - vm_physmem[lcv].start; + if (n > pagecount) { + printf("uvm_page_init: lost %d page(s) in init\n", + n - pagecount); + panic("uvm_page_init"); /* XXXCDC: shouldn't happen? */ + /* n = pagecount; */ + } + /* set up page array pointers */ + vm_physmem[lcv].pgs = pagearray; + pagearray += n; + pagecount -= n; + vm_physmem[lcv].lastpg = vm_physmem[lcv].pgs + (n - 1); + + /* init and free vm_pages (we've already zeroed them) */ + paddr = ptoa(vm_physmem[lcv].start); + for (i = 0 ; i < n ; i++, paddr += PAGE_SIZE) { + vm_physmem[lcv].pgs[i].phys_addr = paddr; + if (atop(paddr) >= vm_physmem[lcv].avail_start && + atop(paddr) <= vm_physmem[lcv].avail_end) { + uvmexp.npages++; + /* add page to free pool */ + uvm_pagefree(&vm_physmem[lcv].pgs[i]); + } + } + } + /* + * step 5: pass up the values of virtual_space_start and + * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper + * layers of the VM. + */ + + *kvm_startp = round_page(virtual_space_start); + *kvm_endp = trunc_page(virtual_space_end); + + /* + * step 6: init pagedaemon lock + */ + + simple_lock_init(&uvm.pagedaemon_lock); + + /* + * step 7: init reserve thresholds + * XXXCDC - values may need adjusting + */ + uvmexp.reserve_pagedaemon = 1; + uvmexp.reserve_kernel = 5; + + /* + * done! + */ + +} + +/* + * uvm_setpagesize: set the page size + * + * => sets page_shift and page_mask from uvmexp.pagesize. + * => XXXCDC: move global vars. + */ + +void +uvm_setpagesize() +{ + if (uvmexp.pagesize == 0) + uvmexp.pagesize = DEFAULT_PAGE_SIZE; + uvmexp.pagemask = uvmexp.pagesize - 1; + if ((uvmexp.pagemask & uvmexp.pagesize) != 0) + panic("uvm_setpagesize: page size not a power of two"); + for (uvmexp.pageshift = 0; ; uvmexp.pageshift++) + if ((1 << uvmexp.pageshift) == uvmexp.pagesize) + break; +} + +/* + * uvm_pageboot_alloc: steal memory from physmem for bootstrapping + */ + +vaddr_t +uvm_pageboot_alloc(size) + vsize_t size; +{ +#if defined(PMAP_STEAL_MEMORY) + vaddr_t addr; + + /* + * defer bootstrap allocation to MD code (it may want to allocate + * from a direct-mapped segment). pmap_steal_memory should round + * off virtual_space_start/virtual_space_end. + */ + + addr = pmap_steal_memory(size, &virtual_space_start, + &virtual_space_end); + + return(addr); + +#else /* !PMAP_STEAL_MEMORY */ + + vaddr_t addr, vaddr; + paddr_t paddr; + + /* round to page size */ + size = round_page(size); + + /* + * on first call to this function init ourselves. we detect this + * by checking virtual_space_start/end which are in the zero'd BSS area. 
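Without PMAP_STEAL_MEMORY, the rest of uvm_pageboot_alloc() below is a plain bump allocator over the kernel virtual range reported by pmap_virtual_space(): the first call notices that both cursors are still zero (courtesy of the BSS), fetches and page-aligns the range, and every call then returns the current cursor, advances it by the rounded size, and backs the new pages with physical memory. The sketch below keeps only the cursor logic; the virtual range is made up and the physical backing step is reduced to a comment.

#include <stdio.h>

#define PAGE_SIZE	4096UL		/* illustrative */
#define PAGE_MASK	(PAGE_SIZE - 1)
#define round_page(x)	(((x) + PAGE_MASK) & ~PAGE_MASK)
#define trunc_page(x)	((x) & ~PAGE_MASK)

static unsigned long virtual_space_start;	/* zeroed "BSS", as in the code */
static unsigned long virtual_space_end;

static unsigned long
pageboot_alloc(unsigned long size)
{
	unsigned long addr;

	/* first call: both cursors still zero, so fetch the usable VA range */
	if (virtual_space_start == virtual_space_end) {
		virtual_space_start = round_page(0xc0100000UL);	/* made up */
		virtual_space_end = trunc_page(0xc8000000UL);	/* made up */
	}

	size = round_page(size);
	addr = virtual_space_start;
	virtual_space_start += size;

	/*
	 * The real routine would now pull physical pages off vm_physmem
	 * with uvm_page_physget() and map them at addr.
	 */
	return addr;
}

int
main(void)
{
	printf("first:  0x%lx\n", pageboot_alloc(10000));
	printf("second: 0x%lx\n", pageboot_alloc(1));
	return 0;
}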
+ */ + + if (virtual_space_start == virtual_space_end) { + pmap_virtual_space(&virtual_space_start, &virtual_space_end); + + /* round it the way we like it */ + virtual_space_start = round_page(virtual_space_start); + virtual_space_end = trunc_page(virtual_space_end); + } + + /* + * allocate virtual memory for this request + */ + + addr = virtual_space_start; + virtual_space_start += size; + + /* + * allocate and mapin physical pages to back new virtual pages + */ + + for (vaddr = round_page(addr) ; vaddr < addr + size ; + vaddr += PAGE_SIZE) { + + if (!uvm_page_physget(&paddr)) + panic("uvm_pageboot_alloc: out of memory"); + + /* XXX: should be wired, but some pmaps don't like that ... */ +#if defined(PMAP_NEW) + pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE); +#else + pmap_enter(pmap_kernel(), vaddr, paddr, + VM_PROT_READ|VM_PROT_WRITE, FALSE); +#endif + + } + return(addr); +#endif /* PMAP_STEAL_MEMORY */ +} + +#if !defined(PMAP_STEAL_MEMORY) +/* + * uvm_page_physget: "steal" one page from the vm_physmem structure. + * + * => attempt to allocate it off the end of a segment in which the "avail" + * values match the start/end values. if we can't do that, then we + * will advance both values (making them equal, and removing some + * vm_page structures from the non-avail area). + * => return false if out of memory. + */ + +boolean_t +uvm_page_physget(paddrp) + paddr_t *paddrp; +{ + int lcv, x; + + /* pass 1: try allocating from a matching end */ +#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST) + for (lcv = vm_nphysseg - 1 ; lcv >= 0 ; lcv--) +#else + for (lcv = 0 ; lcv < vm_nphysseg ; lcv++) +#endif + { + + if (vm_physmem[lcv].pgs) + panic("vm_page_physget: called _after_ bootstrap"); + + /* try from front */ + if (vm_physmem[lcv].avail_start == vm_physmem[lcv].start && + vm_physmem[lcv].avail_start < vm_physmem[lcv].avail_end) { + *paddrp = ptoa(vm_physmem[lcv].avail_start); + vm_physmem[lcv].avail_start++; + vm_physmem[lcv].start++; + /* nothing left? nuke it */ + if (vm_physmem[lcv].avail_start == + vm_physmem[lcv].end) { + if (vm_nphysseg == 1) + panic("vm_page_physget: out of memory!"); + vm_nphysseg--; + for (x = lcv ; x < vm_nphysseg ; x++) + /* structure copy */ + vm_physmem[x] = vm_physmem[x+1]; + } + return (TRUE); + } + + /* try from rear */ + if (vm_physmem[lcv].avail_end == vm_physmem[lcv].end && + vm_physmem[lcv].avail_start < vm_physmem[lcv].avail_end) { + *paddrp = ptoa(vm_physmem[lcv].avail_end - 1); + vm_physmem[lcv].avail_end--; + vm_physmem[lcv].end--; + /* nothing left? nuke it */ + if (vm_physmem[lcv].avail_end == + vm_physmem[lcv].start) { + if (vm_nphysseg == 1) + panic("vm_page_physget: out of memory!"); + vm_nphysseg--; + for (x = lcv ; x < vm_nphysseg ; x++) + /* structure copy */ + vm_physmem[x] = vm_physmem[x+1]; + } + return (TRUE); + } + } + + /* pass2: forget about matching ends, just allocate something */ +#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST) + for (lcv = vm_nphysseg - 1 ; lcv >= 0 ; lcv--) +#else + for (lcv = 0 ; lcv < vm_nphysseg ; lcv++) +#endif + { + + /* any room in this bank? */ + if (vm_physmem[lcv].avail_start >= vm_physmem[lcv].avail_end) + continue; /* nope */ + + *paddrp = ptoa(vm_physmem[lcv].avail_start); + vm_physmem[lcv].avail_start++; + /* truncate! */ + vm_physmem[lcv].start = vm_physmem[lcv].avail_start; + + /* nothing left? 
nuke it */ + if (vm_physmem[lcv].avail_start == vm_physmem[lcv].end) { + if (vm_nphysseg == 1) + panic("vm_page_physget: out of memory!"); + vm_nphysseg--; + for (x = lcv ; x < vm_nphysseg ; x++) + /* structure copy */ + vm_physmem[x] = vm_physmem[x+1]; + } + return (TRUE); + } + + return (FALSE); /* whoops! */ +} +#endif /* PMAP_STEAL_MEMORY */ + +/* + * uvm_page_physload: load physical memory into VM system + * + * => all args are PFs + * => all pages in start/end get vm_page structures + * => areas marked by avail_start/avail_end get added to the free page pool + * => we are limited to VM_PHYSSEG_MAX physical memory segments + */ + +void +uvm_page_physload(start, end, avail_start, avail_end, free_list) + vaddr_t start, end, avail_start, avail_end; + int free_list; +{ + int preload, lcv; + psize_t npages; + struct vm_page *pgs; + struct vm_physseg *ps; + + if (uvmexp.pagesize == 0) + panic("vm_page_physload: page size not set!"); + + if (free_list >= VM_NFREELIST || free_list < VM_FREELIST_DEFAULT) + panic("uvm_page_physload: bad free list %d\n", free_list); + + /* + * do we have room? + */ + if (vm_nphysseg == VM_PHYSSEG_MAX) { + printf("vm_page_physload: unable to load physical memory " + "segment\n"); + printf("\t%d segments allocated, ignoring 0x%lx -> 0x%lx\n", + VM_PHYSSEG_MAX, start, end); + return; + } + + /* + * check to see if this is a "preload" (i.e. uvm_mem_init hasn't been + * called yet, so malloc is not available). + */ + for (lcv = 0 ; lcv < vm_nphysseg ; lcv++) { + if (vm_physmem[lcv].pgs) + break; + } + preload = (lcv == vm_nphysseg); + + /* + * if VM is already running, attempt to malloc() vm_page structures + */ + if (!preload) { +#if defined(VM_PHYSSEG_NOADD) + panic("vm_page_physload: tried to add RAM after vm_mem_init"); +#else + /* XXXCDC: need some sort of lockout for this case */ + paddr_t paddr; + npages = end - start; /* # of pages */ + MALLOC(pgs, struct vm_page *, sizeof(struct vm_page) * npages, + M_VMPAGE, M_NOWAIT); + if (pgs == NULL) { + printf("vm_page_physload: can not malloc vm_page " + "structs for segment\n"); + printf("\tignoring 0x%lx -> 0x%lx\n", start, end); + return; + } + /* zero data, init phys_addr and free_list, and free pages */ + bzero(pgs, sizeof(struct vm_page) * npages); + for (lcv = 0, paddr = ptoa(start) ; + lcv < npages ; lcv++, paddr += PAGE_SIZE) { + pgs[lcv].phys_addr = paddr; + pgs[lcv].free_list = free_list; + if (atop(paddr) >= avail_start && + atop(paddr) <= avail_end) + uvm_pagefree(&pgs[lcv]); + } + /* XXXCDC: incomplete: need to update uvmexp.free, what else? */ + /* XXXCDC: need hook to tell pmap to rebuild pv_list, etc... */ +#endif + } else { + + /* gcc complains if these don't get init'd */ + pgs = NULL; + npages = 0; + + } + + /* + * now insert us in the proper place in vm_physmem[] + */ + +#if (VM_PHYSSEG_STRAT == VM_PSTRAT_RANDOM) + + /* random: put it at the end (easy!) */ + ps = &vm_physmem[vm_nphysseg]; + +#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH) + + { + int x; + /* sort by address for binary search */ + for (lcv = 0 ; lcv < vm_nphysseg ; lcv++) + if (start < vm_physmem[lcv].start) + break; + ps = &vm_physmem[lcv]; + /* move back other entries, if necessary ... 
*/ + for (x = vm_nphysseg ; x > lcv ; x--) + /* structure copy */ + vm_physmem[x] = vm_physmem[x - 1]; + } + +#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST) + + { + int x; + /* sort by largest segment first */ + for (lcv = 0 ; lcv < vm_nphysseg ; lcv++) + if ((end - start) > + (vm_physmem[lcv].end - vm_physmem[lcv].start)) + break; + ps = &vm_physmem[lcv]; + /* move back other entries, if necessary ... */ + for (x = vm_nphysseg ; x > lcv ; x--) + /* structure copy */ + vm_physmem[x] = vm_physmem[x - 1]; + } + +#else + + panic("vm_page_physload: unknown physseg strategy selected!"); + +#endif + + ps->start = start; + ps->end = end; + ps->avail_start = avail_start; + ps->avail_end = avail_end; + if (preload) { + ps->pgs = NULL; + } else { + ps->pgs = pgs; + ps->lastpg = pgs + npages - 1; + } + ps->free_list = free_list; + vm_nphysseg++; + + /* + * done! + */ + + if (!preload) + uvm_page_rehash(); + + return; +} + +/* + * uvm_page_rehash: reallocate hash table based on number of free pages. + */ + +void +uvm_page_rehash() +{ + int freepages, lcv, bucketcount, s, oldcount; + struct pglist *newbuckets, *oldbuckets; + struct vm_page *pg; + + /* + * compute number of pages that can go in the free pool + */ + + freepages = 0; + for (lcv = 0 ; lcv < vm_nphysseg ; lcv++) + freepages += + (vm_physmem[lcv].avail_end - vm_physmem[lcv].avail_start); + + /* + * compute number of buckets needed for this number of pages + */ + + bucketcount = 1; + while (bucketcount < freepages) + bucketcount = bucketcount * 2; + + /* + * malloc new buckets + */ + + MALLOC(newbuckets, struct pglist *, sizeof(struct pglist) * bucketcount, + M_VMPBUCKET, M_NOWAIT); + if (newbuckets == NULL) { + printf("vm_page_physrehash: WARNING: could not grow page " + "hash table\n"); + return; + } + for (lcv = 0 ; lcv < bucketcount ; lcv++) + TAILQ_INIT(&newbuckets[lcv]); + + /* + * now replace the old buckets with the new ones and rehash everything + */ + + s = splimp(); + simple_lock(&uvm.hashlock); + /* swap old for new ... */ + oldbuckets = uvm.page_hash; + oldcount = uvm.page_nhash; + uvm.page_hash = newbuckets; + uvm.page_nhash = bucketcount; + uvm.page_hashmask = bucketcount - 1; /* power of 2 */ + + /* ... and rehash */ + for (lcv = 0 ; lcv < oldcount ; lcv++) { + while ((pg = oldbuckets[lcv].tqh_first) != NULL) { + TAILQ_REMOVE(&oldbuckets[lcv], pg, hashq); + TAILQ_INSERT_TAIL( + &uvm.page_hash[uvm_pagehash(pg->uobject, pg->offset)], + pg, hashq); + } + } + simple_unlock(&uvm.hashlock); + splx(s); + + /* + * free old bucket array if we malloc'd it previously + */ + + if (oldbuckets != &uvm_bootbucket) + FREE(oldbuckets, M_VMPBUCKET); + + /* + * done + */ + return; +} + + +#if 1 /* XXXCDC: TMP TMP TMP DEBUG DEBUG DEBUG */ + +void uvm_page_physdump __P((void)); /* SHUT UP GCC */ + +/* call from DDB */ +void +uvm_page_physdump() +{ + int lcv; + + printf("rehash: physical memory config [segs=%d of %d]:\n", + vm_nphysseg, VM_PHYSSEG_MAX); + for (lcv = 0 ; lcv < vm_nphysseg ; lcv++) + printf("0x%lx->0x%lx [0x%lx->0x%lx]\n", vm_physmem[lcv].start, + vm_physmem[lcv].end, vm_physmem[lcv].avail_start, + vm_physmem[lcv].avail_end); + printf("STRATEGY = "); + switch (VM_PHYSSEG_STRAT) { + case VM_PSTRAT_RANDOM: printf("RANDOM\n"); break; + case VM_PSTRAT_BSEARCH: printf("BSEARCH\n"); break; + case VM_PSTRAT_BIGFIRST: printf("BIGFIRST\n"); break; + default: printf("<<UNKNOWN>>!!!!\n"); + } + printf("number of buckets = %d\n", uvm.page_nhash); +} +#endif + +/* + * uvm_pagealloc_strat: allocate vm_page from a particular free list. 
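One step back before the allocator: uvm_page_rehash() above grows the bucket array to the smallest power of two that covers the free-page count, swaps it in under the hash lock, and re-files every page under the new mask. A sketch of that grow-and-rehash step, minus the locking; it assumes the old array was heap allocated (the real code takes care not to free the static bootstrap bucket) and uses a precomputed key field in place of uvm_pagehash(obj, offset).

#include <stdlib.h>
#include <sys/queue.h>

struct page {
	TAILQ_ENTRY(page) hashq;
	unsigned long	  key;		/* stands in for hash(obj, offset) */
};
TAILQ_HEAD(bucket, page);

/*
 * Grow a power-of-two bucket array to at least "want" buckets and re-file
 * every page under the new mask.  Returns the new array, or the old one
 * unchanged if the allocation fails.
 */
static struct bucket *
rehash(struct bucket *old, unsigned long *nhash, unsigned long *hashmask,
    unsigned long want)
{
	unsigned long count = 1, i;
	struct bucket *new;
	struct page *pg;

	while (count < want)
		count *= 2;		/* power of two: mask is count - 1 */

	new = malloc(count * sizeof(*new));
	if (new == NULL)
		return old;		/* keep the smaller table */
	for (i = 0; i < count; i++)
		TAILQ_INIT(&new[i]);

	for (i = 0; i < *nhash; i++) {
		while ((pg = TAILQ_FIRST(&old[i])) != NULL) {
			TAILQ_REMOVE(&old[i], pg, hashq);
			TAILQ_INSERT_TAIL(&new[pg->key & (count - 1)],
			    pg, hashq);
		}
	}
	free(old);			/* assumes old was heap allocated */

	*nhash = count;
	*hashmask = count - 1;
	return new;
}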
+ * + * => return null if no pages free + * => wake up pagedaemon if number of free pages drops below low water mark + * => if obj != NULL, obj must be locked (to put in hash) + * => if anon != NULL, anon must be locked (to put in anon) + * => only one of obj or anon can be non-null + * => caller must activate/deactivate page if it is not wired. + * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL. + */ + +struct vm_page * +uvm_pagealloc_strat(obj, off, anon, strat, free_list) + struct uvm_object *obj; + vaddr_t off; + struct vm_anon *anon; + int strat, free_list; +{ + int lcv, s; + struct vm_page *pg; + struct pglist *freeq; + +#ifdef DIAGNOSTIC + /* sanity check */ + if (obj && anon) + panic("uvm_pagealloc: obj and anon != NULL"); +#endif + + s = splimp(); + + uvm_lock_fpageq(); /* lock free page queue */ + + /* + * check to see if we need to generate some free pages waking + * the pagedaemon. + */ + + if (uvmexp.free < uvmexp.freemin || (uvmexp.free < uvmexp.freetarg && + uvmexp.inactive < uvmexp.inactarg)) + thread_wakeup(&uvm.pagedaemon); + + /* + * fail if any of these conditions is true: + * [1] there really are no free pages, or + * [2] only kernel "reserved" pages remain and + * the page isn't being allocated to a kernel object. + * [3] only pagedaemon "reserved" pages remain and + * the requestor isn't the pagedaemon. + */ + + if ((uvmexp.free <= uvmexp.reserve_kernel && + !(obj && obj->uo_refs == UVM_OBJ_KERN)) || + (uvmexp.free <= uvmexp.reserve_pagedaemon && + !(obj == uvmexp.kmem_object && curproc == uvm.pagedaemon_proc))) + goto fail; + + again: + switch (strat) { + case UVM_PGA_STRAT_NORMAL: + /* Check all freelists in descending priority order. */ + for (lcv = 0; lcv < VM_NFREELIST; lcv++) { + freeq = &uvm.page_free[lcv]; + if ((pg = freeq->tqh_first) != NULL) + goto gotit; + } + + /* No pages free! */ + goto fail; + + case UVM_PGA_STRAT_ONLY: + case UVM_PGA_STRAT_FALLBACK: + /* Attempt to allocate from the specified free list. */ +#ifdef DIAGNOSTIC + if (free_list >= VM_NFREELIST || free_list < 0) + panic("uvm_pagealloc_strat: bad free list %d", + free_list); +#endif + freeq = &uvm.page_free[free_list]; + if ((pg = freeq->tqh_first) != NULL) + goto gotit; + + /* Fall back, if possible. */ + if (strat == UVM_PGA_STRAT_FALLBACK) { + strat = UVM_PGA_STRAT_NORMAL; + goto again; + } + + /* No pages free! 
*/ + goto fail; + + default: + panic("uvm_pagealloc_strat: bad strat %d", strat); + /* NOTREACHED */ + } + + gotit: + TAILQ_REMOVE(freeq, pg, pageq); + uvmexp.free--; + + uvm_unlock_fpageq(); /* unlock free page queue */ + splx(s); + + pg->offset = off; + pg->uobject = obj; + pg->uanon = anon; + pg->flags = PG_BUSY|PG_CLEAN|PG_FAKE; + pg->version++; + pg->wire_count = 0; + pg->loan_count = 0; + if (anon) { + anon->u.an_page = pg; + pg->pqflags = PQ_ANON; + } else { + if (obj) + uvm_pageinsert(pg); + pg->pqflags = 0; + } +#if defined(UVM_PAGE_TRKOWN) + pg->owner_tag = NULL; +#endif + UVM_PAGE_OWN(pg, "new alloc"); + + return(pg); + + fail: + uvm_unlock_fpageq(); + splx(s); + return (NULL); +} + +/* + * uvm_pagerealloc: reallocate a page from one object to another + * + * => both objects must be locked + */ + +void +uvm_pagerealloc(pg, newobj, newoff) + struct vm_page *pg; + struct uvm_object *newobj; + vaddr_t newoff; +{ + /* + * remove it from the old object + */ + + if (pg->uobject) { + uvm_pageremove(pg); + } + + /* + * put it in the new object + */ + + if (newobj) { + pg->uobject = newobj; + pg->offset = newoff; + pg->version++; + uvm_pageinsert(pg); + } + + return; +} + + +/* + * uvm_pagefree: free page + * + * => erase page's identity (i.e. remove from hash/object) + * => put page on free list + * => caller must lock owning object (either anon or uvm_object) + * => caller must lock page queues + * => assumes all valid mappings of pg are gone + */ + +void uvm_pagefree(pg) + +struct vm_page *pg; + +{ + int s; + int saved_loan_count = pg->loan_count; + + /* + * if the page was an object page (and thus "TABLED"), remove it + * from the object. + */ + + if (pg->flags & PG_TABLED) { + + /* + * if the object page is on loan we are going to drop ownership. + * it is possible that an anon will take over as owner for this + * page later on. the anon will want a !PG_CLEAN page so that + * it knows it needs to allocate swap if it wants to page the + * page out. + */ + + if (saved_loan_count) + pg->flags &= ~PG_CLEAN; /* in case an anon takes over */ + + uvm_pageremove(pg); + + /* + * if our page was on loan, then we just lost control over it + * (in fact, if it was loaned to an anon, the anon may have + * already taken over ownership of the page by now and thus + * changed the loan_count [e.g. in uvmfault_anonget()]) we just + * return (when the last loan is dropped, then the page can be + * freed by whatever was holding the last loan). + */ + if (saved_loan_count) + return; + + } else if (saved_loan_count && (pg->pqflags & PQ_ANON)) { + + /* + * if our page is owned by an anon and is loaned out to the + * kernel then we just want to drop ownership and return. + * the kernel must free the page when all its loans clear ... + * note that the kernel can't change the loan status of our + * page as long as we are holding PQ lock. 
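+		 *
+		 * (e.g.: an anon is being torn down while the kernel still
+		 * holds a loan on its page for an i/o in progress.  we clear
+		 * PQ_ANON and pg->uanon below and return without touching
+		 * the free list; the page only gets freed later, when the
+		 * last loan on it is dropped.)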
+ */ + pg->pqflags &= ~PQ_ANON; + pg->uanon = NULL; + return; + } + +#ifdef DIAGNOSTIC + if (saved_loan_count) { + printf("uvm_pagefree: warning: freeing page with a loan " + "count of %d\n", saved_loan_count); + panic("uvm_pagefree: loan count"); + } +#endif + + + /* + * now remove the page from the queues + */ + + if (pg->pqflags & PQ_ACTIVE) { + TAILQ_REMOVE(&uvm.page_active, pg, pageq); + pg->pqflags &= ~PQ_ACTIVE; + uvmexp.active--; + } + if (pg->pqflags & PQ_INACTIVE) { + if (pg->pqflags & PQ_SWAPBACKED) + TAILQ_REMOVE(&uvm.page_inactive_swp, pg, pageq); + else + TAILQ_REMOVE(&uvm.page_inactive_obj, pg, pageq); + pg->pqflags &= ~PQ_INACTIVE; + uvmexp.inactive--; + } + + /* + * if the page was wired, unwire it now. + */ + if (pg->wire_count) + { + pg->wire_count = 0; + uvmexp.wired--; + } + + /* + * and put on free queue + */ + + s = splimp(); + uvm_lock_fpageq(); + TAILQ_INSERT_TAIL(&uvm.page_free[uvm_page_lookup_freelist(pg)], + pg, pageq); + pg->pqflags = PQ_FREE; +#ifdef DEBUG + pg->uobject = (void *)0xdeadbeef; + pg->offset = 0xdeadbeef; + pg->uanon = (void *)0xdeadbeef; +#endif + uvmexp.free++; + uvm_unlock_fpageq(); + splx(s); +} + +#if defined(UVM_PAGE_TRKOWN) +/* + * uvm_page_own: set or release page ownership + * + * => this is a debugging function that keeps track of who sets PG_BUSY + * and where they do it. it can be used to track down problems + * such a process setting "PG_BUSY" and never releasing it. + * => page's object [if any] must be locked + * => if "tag" is NULL then we are releasing page ownership + */ +void +uvm_page_own(pg, tag) + struct vm_page *pg; + char *tag; +{ + /* gain ownership? */ + if (tag) { + if (pg->owner_tag) { + printf("uvm_page_own: page %p already owned " + "by proc %d [%s]\n", pg, + pg->owner, pg->owner_tag); + panic("uvm_page_own"); + } + pg->owner = (curproc) ? curproc->p_pid : (pid_t) -1; + pg->owner_tag = tag; + return; + } + + /* drop ownership */ + if (pg->owner_tag == NULL) { + printf("uvm_page_own: dropping ownership of an non-owned " + "page (%p)\n", pg); + panic("uvm_page_own"); + } + pg->owner_tag = NULL; + return; +} +#endif diff --git a/sys/uvm/uvm_page.h b/sys/uvm/uvm_page.h new file mode 100644 index 00000000000..dd40fc5bee1 --- /dev/null +++ b/sys/uvm/uvm_page.h @@ -0,0 +1,132 @@ +/* $NetBSD: uvm_page.h,v 1.10 1998/08/13 02:11:02 eeh Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * Copyright (c) 1991, 1993, The Regents of the University of California. + * + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. 
Cranor, + * Washington University, the University of California, Berkeley and + * its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_page.h 7.3 (Berkeley) 4/21/91 + * from: Id: uvm_page.h,v 1.1.2.6 1998/02/04 02:31:42 chuck Exp + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +#ifndef _UVM_UVM_PAGE_H_ +#define _UVM_UVM_PAGE_H_ + +/* + * uvm_page.h + */ + +/* + * macros + */ + +#define uvm_lock_pageq() simple_lock(&uvm.pageqlock) +#define uvm_unlock_pageq() simple_unlock(&uvm.pageqlock) +#define uvm_lock_fpageq() simple_lock(&uvm.fpageqlock) +#define uvm_unlock_fpageq() simple_unlock(&uvm.fpageqlock) + +#define uvm_pagehash(obj,off) \ + (((unsigned long)obj+(unsigned long)atop(off)) & uvm.page_hashmask) + +/* + * handle inline options + */ + +#ifdef UVM_PAGE_INLINE +#define PAGE_INLINE static __inline +#else +#define PAGE_INLINE /* nothing */ +#endif /* UVM_PAGE_INLINE */ + +/* + * prototypes: the following prototypes define the interface to pages + */ + +void uvm_page_init __P((vaddr_t *, vaddr_t *)); +#if defined(UVM_PAGE_TRKOWN) +void uvm_page_own __P((struct vm_page *, char *)); +#endif +#if !defined(PMAP_STEAL_MEMORY) +boolean_t uvm_page_physget __P((paddr_t *)); +#endif +void uvm_page_rehash __P((void)); + +PAGE_INLINE void uvm_pageactivate __P((struct vm_page *)); +vaddr_t uvm_pageboot_alloc __P((vsize_t)); +PAGE_INLINE void uvm_pagecopy __P((struct vm_page *, struct vm_page *)); +PAGE_INLINE void uvm_pagedeactivate __P((struct vm_page *)); +void uvm_pagefree __P((struct vm_page *)); +PAGE_INLINE struct vm_page *uvm_pagelookup + __P((struct uvm_object *, vaddr_t)); +void uvm_pageremove __P((struct vm_page *)); +/* uvm_pagerename: not needed */ +PAGE_INLINE void uvm_pageunwire __P((struct vm_page *)); +PAGE_INLINE void uvm_pagewait __P((struct vm_page *, int)); +PAGE_INLINE void uvm_pagewake __P((struct vm_page *)); +PAGE_INLINE void uvm_pagewire __P((struct vm_page *)); +PAGE_INLINE void uvm_pagezero __P((struct vm_page *)); + +PAGE_INLINE int uvm_page_lookup_freelist __P((struct vm_page *)); + +#endif /* _UVM_UVM_PAGE_H_ */ diff --git a/sys/uvm/uvm_page_i.h b/sys/uvm/uvm_page_i.h new file mode 100644 index 00000000000..5a5671a3f4e --- /dev/null +++ b/sys/uvm/uvm_page_i.h @@ -0,0 +1,292 @@ +/* $NetBSD: uvm_page_i.h,v 1.8 1998/08/13 02:11:02 eeh Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * Copyright (c) 1991, 1993, The Regents of the University of California. + * + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor, + * Washington University, the University of California, Berkeley and + * its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_page.c 8.3 (Berkeley) 3/21/94 + * from: Id: uvm_page_i.h,v 1.1.2.7 1998/01/05 00:26:02 chuck Exp + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +#ifndef _UVM_UVM_PAGE_I_H_ +#define _UVM_UVM_PAGE_I_H_ + +/* + * uvm_page_i.h + */ + +/* + * inline functions [maybe] + */ + +#if defined(UVM_PAGE_INLINE) || defined(UVM_PAGE) + +/* + * uvm_pagelookup: look up a page + * + * => caller should lock object to keep someone from pulling the page + * out from under it + */ + +struct vm_page * +uvm_pagelookup(obj, off) + struct uvm_object *obj; + vaddr_t off; +{ + struct vm_page *pg; + struct pglist *buck; + int s; + + buck = &uvm.page_hash[uvm_pagehash(obj,off)]; + + s = splimp(); + simple_lock(&uvm.hashlock); + for (pg = buck->tqh_first ; pg != NULL ; pg = pg->hashq.tqe_next) { + if (pg->uobject == obj && pg->offset == off) { + simple_unlock(&uvm.hashlock); + splx(s); + return(pg); + } + } + simple_unlock(&uvm.hashlock); + splx(s); + return(NULL); +} + +/* + * uvm_pagewire: wire the page, thus removing it from the daemon's grasp + * + * => caller must lock page queues + */ + +PAGE_INLINE void +uvm_pagewire(pg) + struct vm_page *pg; +{ + + if (pg->wire_count == 0) { + if (pg->pqflags & PQ_ACTIVE) { + TAILQ_REMOVE(&uvm.page_active, pg, pageq); + pg->pqflags &= ~PQ_ACTIVE; + uvmexp.active--; + } + if (pg->pqflags & PQ_INACTIVE) { + if (pg->pqflags & PQ_SWAPBACKED) + TAILQ_REMOVE(&uvm.page_inactive_swp, pg, pageq); + else + TAILQ_REMOVE(&uvm.page_inactive_obj, pg, pageq); + pg->pqflags &= ~PQ_INACTIVE; + uvmexp.inactive--; + } + uvmexp.wired++; + } + pg->wire_count++; +} + +/* + * uvm_pageunwire: unwire the page. + * + * => activate if wire count goes to zero. 
+ * => caller must lock page queues + */ + +PAGE_INLINE void +uvm_pageunwire(pg) + struct vm_page *pg; +{ + + pg->wire_count--; + if (pg->wire_count == 0) { + TAILQ_INSERT_TAIL(&uvm.page_active, pg, pageq); + uvmexp.active++; + pg->pqflags |= PQ_ACTIVE; + uvmexp.wired--; + } +} + +/* + * uvm_pagedeactivate: deactivate page -- no pmaps have access to page + * + * => caller must lock page queues + * => caller must check to make sure page is not wired + * => object that page belongs to must be locked (so we can adjust pg->flags) + */ + +PAGE_INLINE void +uvm_pagedeactivate(pg) + struct vm_page *pg; +{ + if (pg->pqflags & PQ_ACTIVE) { + TAILQ_REMOVE(&uvm.page_active, pg, pageq); + pg->pqflags &= ~PQ_ACTIVE; + uvmexp.active--; + } + if ((pg->pqflags & PQ_INACTIVE) == 0) { +#ifdef DIAGNOSTIC + if (pg->wire_count) + panic("uvm_pagedeactivate: caller did not check " + "wire count"); +#endif + if (pg->pqflags & PQ_SWAPBACKED) + TAILQ_INSERT_TAIL(&uvm.page_inactive_swp, pg, pageq); + else + TAILQ_INSERT_TAIL(&uvm.page_inactive_obj, pg, pageq); + pg->pqflags |= PQ_INACTIVE; + uvmexp.inactive++; + pmap_clear_reference(PMAP_PGARG(pg)); + if (pmap_is_modified(PMAP_PGARG(pg))) + pg->flags &= ~PG_CLEAN; + } +} + +/* + * uvm_pageactivate: activate page + * + * => caller must lock page queues + */ + +PAGE_INLINE void +uvm_pageactivate(pg) + struct vm_page *pg; +{ + if (pg->pqflags & PQ_INACTIVE) { + if (pg->pqflags & PQ_SWAPBACKED) + TAILQ_REMOVE(&uvm.page_inactive_swp, pg, pageq); + else + TAILQ_REMOVE(&uvm.page_inactive_obj, pg, pageq); + pg->pqflags &= ~PQ_INACTIVE; + uvmexp.inactive--; + } + if (pg->wire_count == 0) { + + /* + * if page is already active, remove it from list so we + * can put it at tail. if it wasn't active, then mark + * it active and bump active count + */ + if (pg->pqflags & PQ_ACTIVE) + TAILQ_REMOVE(&uvm.page_active, pg, pageq); + else { + pg->pqflags |= PQ_ACTIVE; + uvmexp.active++; + } + + TAILQ_INSERT_TAIL(&uvm.page_active, pg, pageq); + } +} + +/* + * uvm_pagezero: zero fill a page + * + * => if page is part of an object then the object should be locked + * to protect pg->flags. + */ + +PAGE_INLINE void +uvm_pagezero(pg) + struct vm_page *pg; +{ + + pg->flags &= ~PG_CLEAN; + pmap_zero_page(VM_PAGE_TO_PHYS(pg)); +} + +/* + * uvm_pagecopy: copy a page + * + * => if page is part of an object then the object should be locked + * to protect pg->flags. + */ + +PAGE_INLINE void +uvm_pagecopy(src, dst) + struct vm_page *src, *dst; +{ + + dst->flags &= ~PG_CLEAN; + pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst)); +} + +/* + * uvm_page_lookup_freelist: look up the free list for the specified page + */ + +PAGE_INLINE int +uvm_page_lookup_freelist(pg) + struct vm_page *pg; +{ + int lcv; + + lcv = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL); +#ifdef DIAGNOSTIC + if (lcv == -1) + panic("uvm_page_lookup_freelist: unable to locate physseg"); +#endif + return (vm_physmem[lcv].free_list); +} + +#endif /* defined(UVM_PAGE_INLINE) || defined(UVM_PAGE) */ + +#endif /* _UVM_UVM_PAGE_I_H_ */ diff --git a/sys/uvm/uvm_pager.c b/sys/uvm/uvm_pager.c new file mode 100644 index 00000000000..1b8c8a36d3e --- /dev/null +++ b/sys/uvm/uvm_pager.c @@ -0,0 +1,762 @@ +/* $NetBSD: uvm_pager.c,v 1.14 1999/01/22 08:00:35 chs Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_pager.c,v 1.1.2.23 1998/02/02 20:38:06 chuck Exp + */ + +/* + * uvm_pager.c: generic functions used to assist the pagers. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#define UVM_PAGER +#include <uvm/uvm.h> + +/* + * list of uvm pagers in the system + */ + +extern struct uvm_pagerops aobj_pager; +extern struct uvm_pagerops uvm_deviceops; +extern struct uvm_pagerops uvm_vnodeops; + +struct uvm_pagerops *uvmpagerops[] = { + &aobj_pager, + &uvm_deviceops, + &uvm_vnodeops, +}; + +/* + * the pager map: provides KVA for I/O + */ + +#define PAGER_MAP_SIZE (4 * 1024 * 1024) +vm_map_t pager_map; /* XXX */ +simple_lock_data_t pager_map_wanted_lock; +boolean_t pager_map_wanted; /* locked by pager map */ + + +/* + * uvm_pager_init: init pagers (at boot time) + */ + +void +uvm_pager_init() +{ + int lcv; + + /* + * init pager map + */ + + pager_map = uvm_km_suballoc(kernel_map, &uvm.pager_sva, &uvm.pager_eva, + PAGER_MAP_SIZE, FALSE, FALSE, NULL); + simple_lock_init(&pager_map_wanted_lock); + pager_map_wanted = FALSE; + + /* + * init ASYNC I/O queue + */ + + TAILQ_INIT(&uvm.aio_done); + + /* + * call pager init functions + */ + for (lcv = 0 ; lcv < sizeof(uvmpagerops)/sizeof(struct uvm_pagerops *); + lcv++) { + if (uvmpagerops[lcv]->pgo_init) + uvmpagerops[lcv]->pgo_init(); + } +} + +/* + * uvm_pagermapin: map pages into KVA (pager_map) for I/O that needs mappings + * + * we basically just map in a blank map entry to reserve the space in the + * map and then use pmap_enter() to put the mappings in by hand. 
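+ *
+ * an illustrative (not prescriptive) call sequence from a pager's
+ * synchronous i/o path looks like:
+ *
+ *	kva = uvm_pagermapin(pps, npages, NULL, M_WAITOK);
+ *	... do device i/o on [kva, kva + (npages << PAGE_SHIFT)) ...
+ *	uvm_pagermapout(kva, npages);
+ *
+ * passing a non-NULL "aiop" instead allocates a uvm_aiodesc that the
+ * caller can use to track the async i/o.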
+ */ + +vaddr_t +uvm_pagermapin(pps, npages, aiop, waitf) + struct vm_page **pps; + int npages; + struct uvm_aiodesc **aiop; /* OUT */ + int waitf; +{ + vsize_t size; + vaddr_t kva; + struct uvm_aiodesc *aio; +#if !defined(PMAP_NEW) + vaddr_t cva; + struct vm_page *pp; +#endif + UVMHIST_FUNC("uvm_pagermapin"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist,"(pps=0x%x, npages=%d, aiop=0x%x, waitf=%d)", + pps, npages, aiop, waitf); + +ReStart: + if (aiop) { + MALLOC(aio, struct uvm_aiodesc *, sizeof(*aio), M_TEMP, waitf); + if (aio == NULL) + return(0); + *aiop = aio; + } else { + aio = NULL; + } + + size = npages << PAGE_SHIFT; + kva = NULL; /* let system choose VA */ + + if (uvm_map(pager_map, &kva, size, NULL, + UVM_UNKNOWN_OFFSET, UVM_FLAG_NOMERGE) != KERN_SUCCESS) { + if (waitf == M_NOWAIT) { + if (aio) + FREE(aio, M_TEMP); + UVMHIST_LOG(maphist,"<- NOWAIT failed", 0,0,0,0); + return(NULL); + } + simple_lock(&pager_map_wanted_lock); + pager_map_wanted = TRUE; + UVMHIST_LOG(maphist, " SLEEPING on pager_map",0,0,0,0); + UVM_UNLOCK_AND_WAIT(pager_map, &pager_map_wanted_lock, FALSE, + "pager_map",0); + goto ReStart; + } + +#if defined(PMAP_NEW) + /* + * XXX: (ab)using the pmap module to store state info for us. + * (pmap stores the PAs... we fetch them back later and convert back + * to pages with PHYS_TO_VM_PAGE). + */ + pmap_kenter_pgs(kva, pps, npages); + +#else /* PMAP_NEW */ + + /* got it */ + for (cva = kva ; size != 0 ; size -= PAGE_SIZE, cva += PAGE_SIZE) { + pp = *pps++; +#ifdef DEBUG + if ((pp->flags & PG_BUSY) == 0) + panic("uvm_pagermapin: page not busy"); +#endif + + pmap_enter(vm_map_pmap(pager_map), cva, VM_PAGE_TO_PHYS(pp), + VM_PROT_DEFAULT, TRUE); + } + +#endif /* PMAP_NEW */ + + UVMHIST_LOG(maphist, "<- done (KVA=0x%x)", kva,0,0,0); + return(kva); +} + +/* + * uvm_pagermapout: remove pager_map mapping + * + * we remove our mappings by hand and then remove the mapping (waking + * up anyone wanting space). + */ + +void +uvm_pagermapout(kva, npages) + vaddr_t kva; + int npages; +{ + vsize_t size = npages << PAGE_SHIFT; + vm_map_entry_t entries; + UVMHIST_FUNC("uvm_pagermapout"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist, " (kva=0x%x, npages=%d)", kva, npages,0,0); + + /* + * duplicate uvm_unmap, but add in pager_map_wanted handling. + */ + + vm_map_lock(pager_map); + (void) uvm_unmap_remove(pager_map, kva, kva + size, &entries); + simple_lock(&pager_map_wanted_lock); + if (pager_map_wanted) { + pager_map_wanted = FALSE; + wakeup(pager_map); + } + simple_unlock(&pager_map_wanted_lock); + vm_map_unlock(pager_map); + if (entries) + uvm_unmap_detach(entries, 0); + + UVMHIST_LOG(maphist,"<- done",0,0,0,0); +} + +/* + * uvm_mk_pcluster + * + * generic "make 'pager put' cluster" function. a pager can either + * [1] set pgo_mk_pcluster to NULL (never cluster), [2] set it to this + * generic function, or [3] set it to a pager specific function. + * + * => caller must lock object _and_ pagequeues (since we need to look + * at active vs. inactive bits, etc.) + * => caller must make center page busy and write-protect it + * => we mark all cluster pages busy for the caller + * => the caller must unbusy all pages (and check wanted/released + * status if it drops the object lock) + * => flags: + * PGO_ALLPAGES: all pages in object are valid targets + * !PGO_ALLPAGES: use "lo" and "hi" to limit range of cluster + * PGO_DOACTCLUST: include active pages in cluster. + * NOTE: the caller should clear PG_CLEANCHK bits if PGO_DOACTCLUST. 
+ * PG_CLEANCHK is only a hint, but clearing will help reduce + * the number of calls we make to the pmap layer. + */ + +struct vm_page ** +uvm_mk_pcluster(uobj, pps, npages, center, flags, mlo, mhi) + struct uvm_object *uobj; /* IN */ + struct vm_page **pps, *center; /* IN/OUT, IN */ + int *npages, flags; /* IN/OUT, IN */ + vaddr_t mlo, mhi; /* IN (if !PGO_ALLPAGES) */ +{ + struct vm_page **ppsp, *pclust; + vaddr_t lo, hi, curoff; + int center_idx, forward; + UVMHIST_FUNC("uvm_mk_pcluster"); UVMHIST_CALLED(maphist); + + /* + * center page should already be busy and write protected. XXX: + * suppose page is wired? if we lock, then a process could + * fault/block on it. if we don't lock, a process could write the + * pages in the middle of an I/O. (consider an msync()). let's + * lock it for now (better to delay than corrupt data?). + */ + + /* + * get cluster boundaries, check sanity, and apply our limits as well. + */ + + uobj->pgops->pgo_cluster(uobj, center->offset, &lo, &hi); + if ((flags & PGO_ALLPAGES) == 0) { + if (lo < mlo) + lo = mlo; + if (hi > mhi) + hi = mhi; + } + if ((hi - lo) >> PAGE_SHIFT > *npages) { /* pps too small, bail out! */ +#ifdef DIAGNOSTIC + printf("uvm_mk_pcluster: provided page array too small (fixed)\n"); +#endif + pps[0] = center; + *npages = 1; + return(pps); + } + + /* + * now determine the center and attempt to cluster around the + * edges + */ + + center_idx = (center->offset - lo) >> PAGE_SHIFT; + pps[center_idx] = center; /* plug in the center page */ + ppsp = &pps[center_idx]; + *npages = 1; + + /* + * attempt to cluster around the left [backward], and then + * the right side [forward]. + * + * note that for inactive pages (pages that have been deactivated) + * there are no valid mappings and PG_CLEAN should be up to date. + * [i.e. there is no need to query the pmap with pmap_is_modified + * since there are no mappings]. + */ + + for (forward = 0 ; forward <= 1 ; forward++) { + + curoff = center->offset + (forward ? PAGE_SIZE : -PAGE_SIZE); + for ( ;(forward == 0 && curoff >= lo) || + (forward && curoff < hi); + curoff += (forward ? 1 : -1) << PAGE_SHIFT) { + + pclust = uvm_pagelookup(uobj, curoff); /* lookup page */ + if (pclust == NULL) + break; /* no page */ + /* handle active pages */ + /* NOTE: inactive pages don't have pmap mappings */ + if ((pclust->pqflags & PQ_INACTIVE) == 0) { + if ((flags & PGO_DOACTCLUST) == 0) + /* dont want mapped pages at all */ + break; + + /* make sure "clean" bit is sync'd */ + if ((pclust->flags & PG_CLEANCHK) == 0) { + if ((pclust->flags & (PG_CLEAN|PG_BUSY)) + == PG_CLEAN && + pmap_is_modified(PMAP_PGARG(pclust))) + pclust->flags &= ~PG_CLEAN; + /* now checked */ + pclust->flags |= PG_CLEANCHK; + } + } + /* is page available for cleaning and does it need it */ + if ((pclust->flags & (PG_CLEAN|PG_BUSY)) != 0) + break; /* page is already clean or is busy */ + + /* yes! enroll the page in our array */ + pclust->flags |= PG_BUSY; /* busy! */ + UVM_PAGE_OWN(pclust, "uvm_mk_pcluster"); + /* XXX: protect wired page? see above comment. */ + pmap_page_protect(PMAP_PGARG(pclust), VM_PROT_READ); + if (!forward) { + ppsp--; /* back up one page */ + *ppsp = pclust; + } else { + /* move forward one page */ + ppsp[*npages] = pclust; + } + *npages = *npages + 1; + } + } + + /* + * done! return the cluster array to the caller!!! 
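+	 *
+	 * (note: the pointer we return may point into the middle of the
+	 * caller's "pps" array.  ppsp[0] is the lowest-offset page in the
+	 * cluster, ppsp[*npages - 1] the highest, and the original center
+	 * page sits somewhere in that range.)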
+ */ + + UVMHIST_LOG(maphist, "<- done",0,0,0,0); + return(ppsp); +} + + +/* + * uvm_shareprot: generic share protect routine + * + * => caller must lock map entry's map + * => caller must lock object pointed to by map entry + */ + +void +uvm_shareprot(entry, prot) + vm_map_entry_t entry; + vm_prot_t prot; +{ + struct uvm_object *uobj = entry->object.uvm_obj; + struct vm_page *pp; + vaddr_t start, stop; + UVMHIST_FUNC("uvm_shareprot"); UVMHIST_CALLED(maphist); + + if (UVM_ET_ISSUBMAP(entry)) + panic("uvm_shareprot: non-object attached"); + + start = entry->offset; + stop = start + (entry->end - entry->start); + + /* + * traverse list of pages in object. if page in range, pmap_prot it + */ + + for (pp = uobj->memq.tqh_first ; pp != NULL ; pp = pp->listq.tqe_next) { + if (pp->offset >= start && pp->offset < stop) + pmap_page_protect(PMAP_PGARG(pp), prot); + } + UVMHIST_LOG(maphist, "<- done",0,0,0,0); +} + +/* + * uvm_pager_put: high level pageout routine + * + * we want to pageout page "pg" to backing store, clustering if + * possible. + * + * => page queues must be locked by caller + * => if page is not swap-backed, then "uobj" points to the object + * backing it. this object should be locked by the caller. + * => if page is swap-backed, then "uobj" should be NULL. + * => "pg" should be PG_BUSY (by caller), and !PG_CLEAN + * for swap-backed memory, "pg" can be NULL if there is no page + * of interest [sometimes the case for the pagedaemon] + * => "ppsp_ptr" should point to an array of npages vm_page pointers + * for possible cluster building + * => flags (first two for non-swap-backed pages) + * PGO_ALLPAGES: all pages in uobj are valid targets + * PGO_DOACTCLUST: include "PQ_ACTIVE" pages as valid targets + * PGO_SYNCIO: do SYNC I/O (no async) + * PGO_PDFREECLUST: pagedaemon: drop cluster on successful I/O + * => start/stop: if (uobj && !PGO_ALLPAGES) limit targets to this range + * if (!uobj) start is the (daddr_t) of the starting swapblk + * => return state: + * 1. we return the VM_PAGER status code of the pageout + * 2. we return with the page queues unlocked + * 3. if (uobj != NULL) [!swap_backed] we return with + * uobj locked _only_ if PGO_PDFREECLUST is set + * AND result != VM_PAGER_PEND. in all other cases + * we return with uobj unlocked. [this is a hack + * that allows the pagedaemon to save one lock/unlock + * pair in the !swap_backed case since we have to + * lock the uobj to drop the cluster anyway] + * 4. on errors we always drop the cluster. thus, if we return + * !PEND, !OK, then the caller only has to worry about + * un-busying the main page (not the cluster pages). + * 5. on success, if !PGO_PDFREECLUST, we return the cluster + * with all pages busy (caller must un-busy and check + * wanted/released flags). + */ + +int +uvm_pager_put(uobj, pg, ppsp_ptr, npages, flags, start, stop) + struct uvm_object *uobj; /* IN */ + struct vm_page *pg, ***ppsp_ptr;/* IN, IN/OUT */ + int *npages; /* IN/OUT */ + int flags; /* IN */ + vaddr_t start, stop; /* IN, IN */ +{ + int result; + daddr_t swblk; + struct vm_page **ppsp = *ppsp_ptr; + + /* + * note that uobj is null if we are doing a swap-backed pageout. + * note that uobj is !null if we are doing normal object pageout. + * note that the page queues must be locked to cluster. + */ + + if (uobj) { /* if !swap-backed */ + + /* + * attempt to build a cluster for pageout using its + * make-put-cluster function (if it has one). 
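+		 *
+		 * (if the pager has no pgo_mk_pcluster hook we fall through
+		 * below with a one page "cluster" holding just pg.)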
+ */ + + if (uobj->pgops->pgo_mk_pcluster) { + ppsp = uobj->pgops->pgo_mk_pcluster(uobj, ppsp, + npages, pg, flags, start, stop); + *ppsp_ptr = ppsp; /* update caller's pointer */ + } else { + ppsp[0] = pg; + *npages = 1; + } + + swblk = 0; /* XXX: keep gcc happy */ + + } else { + + /* + * for swap-backed pageout, the caller (the pagedaemon) has + * already built the cluster for us. the starting swap + * block we are writing to has been passed in as "start." + * "pg" could be NULL if there is no page we are especially + * interested in (in which case the whole cluster gets dropped + * in the event of an error or a sync "done"). + */ + swblk = (daddr_t) start; + /* ppsp and npages should be ok */ + } + + /* now that we've clustered we can unlock the page queues */ + uvm_unlock_pageq(); + + /* + * now attempt the I/O. if we have a failure and we are + * clustered, we will drop the cluster and try again. + */ + +ReTry: + if (uobj) { + /* object is locked */ + result = uobj->pgops->pgo_put(uobj, ppsp, *npages, + flags & PGO_SYNCIO); + /* object is now unlocked */ + } else { + /* nothing locked */ + result = uvm_swap_put(swblk, ppsp, *npages, flags & PGO_SYNCIO); + /* nothing locked */ + } + + /* + * we have attempted the I/O. + * + * if the I/O was a success then: + * if !PGO_PDFREECLUST, we return the cluster to the + * caller (who must un-busy all pages) + * else we un-busy cluster pages for the pagedaemon + * + * if I/O is pending (async i/o) then we return the pending code. + * [in this case the async i/o done function must clean up when + * i/o is done...] + */ + + if (result == VM_PAGER_PEND || result == VM_PAGER_OK) { + if (result == VM_PAGER_OK && (flags & PGO_PDFREECLUST)) { + /* + * drop cluster and relock object (only if I/O is + * not pending) + */ + if (uobj) + /* required for dropcluster */ + simple_lock(&uobj->vmobjlock); + if (*npages > 1 || pg == NULL) + uvm_pager_dropcluster(uobj, pg, ppsp, npages, + PGO_PDFREECLUST, 0); + /* if (uobj): object still locked, as per + * return-state item #3 */ + } + return (result); + } + + /* + * a pager error occured. if we have clustered, we drop the + * cluster and try again. + */ + + if (*npages > 1 || pg == NULL) { + if (uobj) + simple_lock(&uobj->vmobjlock); + uvm_pager_dropcluster(uobj, pg, ppsp, npages, PGO_REALLOCSWAP, + swblk); + if (pg != NULL) + goto ReTry; + } + + /* + * a pager error occured (even after dropping the cluster, if there + * was one). give up! the caller only has one page ("pg") + * to worry about. + */ + + if (uobj && (flags & PGO_PDFREECLUST) != 0) + simple_lock(&uobj->vmobjlock); + return(result); +} + +/* + * uvm_pager_dropcluster: drop a cluster we have built (because we + * got an error, or, if PGO_PDFREECLUST we are un-busying the + * cluster pages on behalf of the pagedaemon). + * + * => uobj, if non-null, is a non-swap-backed object that is + * locked by the caller. we return with this object still + * locked. + * => page queues are not locked + * => pg is our page of interest (the one we clustered around, can be null) + * => ppsp/npages is our current cluster + * => flags: PGO_PDFREECLUST: pageout was a success: un-busy cluster + * pages on behalf of the pagedaemon. + * PGO_REALLOCSWAP: drop previously allocated swap slots for + * clustered swap-backed pages (except for "pg" if !NULL) + * "swblk" is the start of swap alloc (e.g. 
for ppsp[0]) + * [only meaningful if swap-backed (uobj == NULL)] + */ + + +void uvm_pager_dropcluster(uobj, pg, ppsp, npages, flags, swblk) + +struct uvm_object *uobj; /* IN */ +struct vm_page *pg, **ppsp; /* IN, IN/OUT */ +int *npages; /* IN/OUT */ +int flags; +int swblk; /* valid if (uobj == NULL && PGO_REALLOCSWAP) */ + +{ + int lcv; + boolean_t obj_is_alive; + struct uvm_object *saved_uobj; + + /* + * if we need to reallocate swap space for the cluster we are dropping + * (true if swap-backed and PGO_REALLOCSWAP) then free the old + * allocation now. save a block for "pg" if it is non-NULL. + * + * note that we will zap the object's pointer to swap in the "for" loop + * below... + */ + + if (uobj == NULL && (flags & PGO_REALLOCSWAP)) { + if (pg) + uvm_swap_free(swblk + 1, *npages - 1); + else + uvm_swap_free(swblk, *npages); + } + + /* + * drop all pages but "pg" + */ + + for (lcv = 0 ; lcv < *npages ; lcv++) { + + if (ppsp[lcv] == pg) /* skip "pg" */ + continue; + + /* + * if swap-backed, gain lock on object that owns page. note + * that PQ_ANON bit can't change as long as we are holding + * the PG_BUSY bit (so there is no need to lock the page + * queues to test it). + * + * once we have the lock, dispose of the pointer to swap, if + * requested + */ + if (!uobj) { + if (ppsp[lcv]->pqflags & PQ_ANON) { + simple_lock(&ppsp[lcv]->uanon->an_lock); + if (flags & PGO_REALLOCSWAP) + /* zap swap block */ + ppsp[lcv]->uanon->an_swslot = 0; + } else { + simple_lock(&ppsp[lcv]->uobject->vmobjlock); + if (flags & PGO_REALLOCSWAP) + uao_set_swslot(ppsp[lcv]->uobject, + ppsp[lcv]->offset >> PAGE_SHIFT, 0); + } + } + + /* did someone want the page while we had it busy-locked? */ + if (ppsp[lcv]->flags & PG_WANTED) + /* still holding obj lock */ + thread_wakeup(ppsp[lcv]); + + /* if page was released, release it. otherwise un-busy it */ + if (ppsp[lcv]->flags & PG_RELEASED) { + + if (ppsp[lcv]->pqflags & PQ_ANON) { + /* so that anfree will free */ + ppsp[lcv]->flags &= ~(PG_BUSY); + UVM_PAGE_OWN(ppsp[lcv], NULL); + + pmap_page_protect(PMAP_PGARG(ppsp[lcv]), + VM_PROT_NONE); /* be safe */ + simple_unlock(&ppsp[lcv]->uanon->an_lock); + /* kills anon and frees pg */ + uvm_anfree(ppsp[lcv]->uanon); + + continue; + } + + /* + * pgo_releasepg will dump the page for us + */ + +#ifdef DIAGNOSTIC + if (ppsp[lcv]->uobject->pgops->pgo_releasepg == NULL) + panic("uvm_pager_dropcluster: no releasepg " + "function"); +#endif + saved_uobj = ppsp[lcv]->uobject; + obj_is_alive = + saved_uobj->pgops->pgo_releasepg(ppsp[lcv], NULL); + +#ifdef DIAGNOSTIC + /* for normal objects, "pg" is still PG_BUSY by us, + * so obj can't die */ + if (uobj && !obj_is_alive) + panic("uvm_pager_dropcluster: object died " + "with active page"); +#endif + /* only unlock the object if it is still alive... */ + if (obj_is_alive && saved_uobj != uobj) + simple_unlock(&saved_uobj->vmobjlock); + + /* + * XXXCDC: suppose uobj died in the pgo_releasepg? + * how pass that + * info up to caller. we are currently ignoring it... + */ + + continue; /* next page */ + + } else { + ppsp[lcv]->flags &= ~(PG_BUSY|PG_WANTED); + UVM_PAGE_OWN(ppsp[lcv], NULL); + } + + /* + * if we are operating on behalf of the pagedaemon and we + * had a successful pageout update the page! + */ + if (flags & PGO_PDFREECLUST) { + /* XXX: with PMAP_NEW ref should already be clear, + * but don't trust! 
*/ + pmap_clear_reference(PMAP_PGARG(ppsp[lcv])); + pmap_clear_modify(PMAP_PGARG(ppsp[lcv])); + ppsp[lcv]->flags |= PG_CLEAN; + } + + /* if anonymous cluster, unlock object and move on */ + if (!uobj) { + if (ppsp[lcv]->pqflags & PQ_ANON) + simple_unlock(&ppsp[lcv]->uanon->an_lock); + else + simple_unlock(&ppsp[lcv]->uobject->vmobjlock); + } + + } + + /* + * drop to a cluster of 1 page ("pg") if requested + */ + + if (pg && (flags & PGO_PDFREECLUST) == 0) { + /* + * if we are not a successful pageout, we make a 1 page cluster. + */ + ppsp[0] = pg; + *npages = 1; + + /* + * assign new swap block to new cluster, if anon backed + */ + if (uobj == NULL && (flags & PGO_REALLOCSWAP)) { + if (pg->pqflags & PQ_ANON) { + simple_lock(&pg->uanon->an_lock); + pg->uanon->an_swslot = swblk; /* reassign */ + simple_unlock(&pg->uanon->an_lock); + } else { + simple_lock(&pg->uobject->vmobjlock); + uao_set_swslot(pg->uobject, + pg->offset >> PAGE_SHIFT, swblk); + simple_unlock(&pg->uobject->vmobjlock); + } + } + } +} diff --git a/sys/uvm/uvm_pager.h b/sys/uvm/uvm_pager.h new file mode 100644 index 00000000000..f48082e4b44 --- /dev/null +++ b/sys/uvm/uvm_pager.h @@ -0,0 +1,158 @@ +/* $NetBSD: uvm_pager.h,v 1.7 1998/08/13 02:11:03 eeh Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * from: Id: uvm_pager.h,v 1.1.2.14 1998/01/13 19:00:50 chuck Exp + */ + +#ifndef _UVM_UVM_PAGER_H_ +#define _UVM_UVM_PAGER_H_ + +/* + * uvm_pager.h + */ + +/* + * async pager i/o descriptor structure + */ + +TAILQ_HEAD(uvm_aiohead, uvm_aiodesc); + +struct uvm_aiodesc { + void (*aiodone) __P((struct uvm_aiodesc *)); + /* aio done function */ + vaddr_t kva; /* KVA of mapped page(s) */ + int npages; /* # of pages in I/O req */ + void *pd_ptr; /* pager-dependent pointer */ + TAILQ_ENTRY(uvm_aiodesc) aioq; /* linked list of aio's */ +}; + +/* + * pager ops + */ + +struct uvm_pagerops { + void (*pgo_init) __P((void));/* init pager */ + struct uvm_object * (*pgo_attach) /* get uvm_object */ + __P((void *, vm_prot_t)); + void (*pgo_reference) /* add reference to obj */ + __P((struct uvm_object *)); + void (*pgo_detach) /* drop reference to obj */ + __P((struct uvm_object *)); + int (*pgo_fault) /* special nonstd fault fn */ + __P((struct uvm_faultinfo *, vaddr_t, + vm_page_t *, int, int, vm_fault_t, + vm_prot_t, int)); + boolean_t (*pgo_flush) /* flush pages out of obj */ + __P((struct uvm_object *, vaddr_t, + vaddr_t, int)); + int (*pgo_get) /* get/read page */ + __P((struct uvm_object *, vaddr_t, + vm_page_t *, int *, int, vm_prot_t, int, int)); + int (*pgo_asyncget) /* start async get */ + __P((struct uvm_object *, vaddr_t, int)); + int (*pgo_put) /* put/write page */ + __P((struct uvm_object *, vm_page_t *, + int, boolean_t)); + void (*pgo_cluster) /* return range of cluster */ + __P((struct uvm_object *, vaddr_t, vaddr_t *, + vaddr_t *)); + struct vm_page ** (*pgo_mk_pcluster) /* make "put" cluster */ + __P((struct uvm_object *, struct vm_page **, + int *, struct vm_page *, int, vaddr_t, + vaddr_t)); + void (*pgo_shareprot) /* share protect */ + __P((vm_map_entry_t, vm_prot_t)); + void (*pgo_aiodone) /* async iodone */ + __P((struct uvm_aiodesc *)); + boolean_t (*pgo_releasepg) /* release page */ + __P((struct vm_page *, struct vm_page **)); +}; + +/* pager flags [mostly for flush] */ + +#define PGO_CLEANIT 0x001 /* write dirty pages to backing store */ +#define PGO_SYNCIO 0x002 /* if PGO_CLEAN: use sync I/O? */ +/* + * obviously if neither PGO_INVALIDATE or PGO_FREE are set then the pages + * stay where they are. 
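+ *
+ * (rough examples of how these combine: PGO_CLEANIT|PGO_SYNCIO writes
+ * dirty pages to backing store and waits for the i/o to finish;
+ * PGO_CLEANIT|PGO_FREE cleans and then frees the flushed pages;
+ * PGO_DEACTIVATE by itself just moves the flushed pages to the
+ * inactive queue.)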
+ */ +#define PGO_DEACTIVATE 0x004 /* deactivate flushed pages */ +#define PGO_FREE 0x008 /* free flushed pages */ + +#define PGO_ALLPAGES 0x010 /* flush whole object/get all pages */ +#define PGO_DOACTCLUST 0x020 /* flag to mk_pcluster to include active */ +#define PGO_LOCKED 0x040 /* fault data structures are locked [get] */ +#define PGO_PDFREECLUST 0x080 /* daemon's free cluster flag [uvm_pager_put] */ +#define PGO_REALLOCSWAP 0x100 /* reallocate swap area [pager_dropcluster] */ + +/* page we are not interested in getting */ +#define PGO_DONTCARE ((struct vm_page *) -1) /* [get only] */ + +/* + * handle inline options + */ + +#ifdef UVM_PAGER_INLINE +#define PAGER_INLINE static __inline +#else +#define PAGER_INLINE /* nothing */ +#endif /* UVM_PAGER_INLINE */ + +/* + * prototypes + */ + +void uvm_pager_dropcluster __P((struct uvm_object *, + struct vm_page *, struct vm_page **, + int *, int, int)); +void uvm_pager_init __P((void)); +int uvm_pager_put __P((struct uvm_object *, struct vm_page *, + struct vm_page ***, int *, int, + vaddr_t, vaddr_t)); + +PAGER_INLINE struct vm_page *uvm_pageratop __P((vaddr_t)); + +vaddr_t uvm_pagermapin __P((struct vm_page **, int, + struct uvm_aiodesc **, int)); +void uvm_pagermapout __P((vaddr_t, int)); +struct vm_page **uvm_mk_pcluster __P((struct uvm_object *, struct vm_page **, + int *, struct vm_page *, int, + vaddr_t, vaddr_t)); +void uvm_shareprot __P((vm_map_entry_t, vm_prot_t)); + + +#endif /* _UVM_UVM_PAGER_H_ */ diff --git a/sys/uvm/uvm_pager_i.h b/sys/uvm/uvm_pager_i.h new file mode 100644 index 00000000000..7e8e8675df7 --- /dev/null +++ b/sys/uvm/uvm_pager_i.h @@ -0,0 +1,73 @@ +/* $NetBSD: uvm_pager_i.h,v 1.6 1998/08/13 02:11:03 eeh Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_pager_i.h,v 1.1.2.2 1997/10/09 23:05:46 chuck Exp + */ + +#ifndef _UVM_UVM_PAGER_I_H_ +#define _UVM_UVM_PAGER_I_H_ + +/* + * uvm_pager_i.h + */ + +/* + * inline functions [maybe] + */ + +#if defined(UVM_PAGER_INLINE) || defined(UVM_PAGER) + +/* + * uvm_pageratop: convert KVAs in the pager map back to their page + * structures. + */ + +PAGER_INLINE struct vm_page * +uvm_pageratop(kva) + vaddr_t kva; +{ + paddr_t pa; + + pa = pmap_extract(pmap_kernel(), kva); + if (pa == 0) + panic("uvm_pageratop"); + return (PHYS_TO_VM_PAGE(pa)); +} + +#endif /* defined(UVM_PAGER_INLINE) || defined(UVM_PAGER) */ + +#endif /* _UVM_UVM_PAGER_I_H_ */ diff --git a/sys/uvm/uvm_pdaemon.c b/sys/uvm/uvm_pdaemon.c new file mode 100644 index 00000000000..f1b0fcc327d --- /dev/null +++ b/sys/uvm/uvm_pdaemon.c @@ -0,0 +1,1012 @@ +/* $NetBSD: uvm_pdaemon.c,v 1.12 1998/11/04 07:06:05 chs Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * Copyright (c) 1991, 1993, The Regents of the University of California. + * + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor, + * Washington University, the University of California, Berkeley and + * its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_pageout.c 8.5 (Berkeley) 2/14/94 + * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * uvm_pdaemon.c: the page daemon + */ + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/pool.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#include <uvm/uvm.h> + +/* + * local prototypes + */ + +static void uvmpd_scan __P((void)); +static boolean_t uvmpd_scan_inactive __P((struct pglist *)); +static void uvmpd_tune __P((void)); + + +/* + * uvm_wait: wait (sleep) for the page daemon to free some pages + * + * => should be called with all locks released + * => should _not_ be called by the page daemon (to avoid deadlock) + */ + +void uvm_wait(wmsg) + char *wmsg; +{ + int timo = 0; + int s = splbio(); + + /* + * check for page daemon going to sleep (waiting for itself) + */ + + if (curproc == uvm.pagedaemon_proc) { + /* + * now we have a problem: the pagedaemon wants to go to + * sleep until it frees more memory. but how can it + * free more memory if it is asleep? that is a deadlock. + * we have two options: + * [1] panic now + * [2] put a timeout on the sleep, thus causing the + * pagedaemon to only pause (rather than sleep forever) + * + * note that option [2] will only help us if we get lucky + * and some other process on the system breaks the deadlock + * by exiting or freeing memory (thus allowing the pagedaemon + * to continue). for now we panic if DEBUG is defined, + * otherwise we hope for the best with option [2] (better + * yet, this should never happen in the first place!). + */ + + printf("pagedaemon: deadlock detected!\n"); + timo = hz >> 3; /* set timeout */ +#if defined(DEBUG) + /* DEBUG: panic so we can debug it */ + panic("pagedaemon deadlock"); +#endif + } + + simple_lock(&uvm.pagedaemon_lock); + thread_wakeup(&uvm.pagedaemon); /* wake the daemon! 
*/ + UVM_UNLOCK_AND_WAIT(&uvmexp.free, &uvm.pagedaemon_lock, FALSE, wmsg, + timo); + + splx(s); +} + + +/* + * uvmpd_tune: tune paging parameters + * + * => called when ever memory is added (or removed?) to the system + * => caller must call with page queues locked + */ + +static void +uvmpd_tune() +{ + UVMHIST_FUNC("uvmpd_tune"); UVMHIST_CALLED(pdhist); + + uvmexp.freemin = uvmexp.npages / 20; + + /* between 16k and 256k */ + /* XXX: what are these values good for? */ + uvmexp.freemin = max(uvmexp.freemin, (16*1024) >> PAGE_SHIFT); + uvmexp.freemin = min(uvmexp.freemin, (256*1024) >> PAGE_SHIFT); + + uvmexp.freetarg = (uvmexp.freemin * 4) / 3; + if (uvmexp.freetarg <= uvmexp.freemin) + uvmexp.freetarg = uvmexp.freemin + 1; + + /* uvmexp.inactarg: computed in main daemon loop */ + + uvmexp.wiredmax = uvmexp.npages / 3; + UVMHIST_LOG(pdhist, "<- done, freemin=%d, freetarg=%d, wiredmax=%d", + uvmexp.freemin, uvmexp.freetarg, uvmexp.wiredmax, 0); +} + +/* + * uvm_pageout: the main loop for the pagedaemon + */ + +void +uvm_pageout() +{ + int npages = 0; + int s; + struct uvm_aiodesc *aio, *nextaio; + UVMHIST_FUNC("uvm_pageout"); UVMHIST_CALLED(pdhist); + + UVMHIST_LOG(pdhist,"<starting uvm pagedaemon>", 0, 0, 0, 0); + + /* + * ensure correct priority and set paging parameters... + */ + + uvm.pagedaemon_proc = curproc; + (void) spl0(); + uvm_lock_pageq(); + npages = uvmexp.npages; + uvmpd_tune(); + uvm_unlock_pageq(); + + /* + * main loop + */ + while (TRUE) { + + /* + * carefully attempt to go to sleep (without losing "wakeups"!). + * we need splbio because we want to make sure the aio_done list + * is totally empty before we go to sleep. + */ + + s = splbio(); + simple_lock(&uvm.pagedaemon_lock); + + /* + * if we've got done aio's, then bypass the sleep + */ + + if (uvm.aio_done.tqh_first == NULL) { + UVMHIST_LOG(maphist," <<SLEEPING>>",0,0,0,0); + UVM_UNLOCK_AND_WAIT(&uvm.pagedaemon, + &uvm.pagedaemon_lock, FALSE, "daemon_slp", 0); + uvmexp.pdwoke++; + UVMHIST_LOG(pdhist," <<WOKE UP>>",0,0,0,0); + + /* relock pagedaemon_lock, still at splbio */ + simple_lock(&uvm.pagedaemon_lock); + } + + /* + * check for done aio structures + */ + + aio = uvm.aio_done.tqh_first; /* save current list (if any)*/ + if (aio) { + TAILQ_INIT(&uvm.aio_done); /* zero global list */ + } + + simple_unlock(&uvm.pagedaemon_lock); /* unlock */ + splx(s); /* drop splbio */ + + /* + * first clear out any pending aios (to free space in case we + * want to pageout more stuff). + */ + + for (/*null*/; aio != NULL ; aio = nextaio) { + + uvmexp.paging -= aio->npages; + nextaio = aio->aioq.tqe_next; + aio->aiodone(aio); + + } + + /* Next, drain pool resources */ + pool_drain(0); + + /* + * now lock page queues and recompute inactive count + */ + uvm_lock_pageq(); + + if (npages != uvmexp.npages) { /* check for new pages? */ + npages = uvmexp.npages; + uvmpd_tune(); + } + + uvmexp.inactarg = (uvmexp.active + uvmexp.inactive) / 3; + if (uvmexp.inactarg <= uvmexp.freetarg) + uvmexp.inactarg = uvmexp.freetarg + 1; + + UVMHIST_LOG(pdhist," free/ftarg=%d/%d, inact/itarg=%d/%d", + uvmexp.free, uvmexp.freetarg, uvmexp.inactive, + uvmexp.inactarg); + + /* + * scan if needed + * [XXX: note we are reading uvm.free without locking] + */ + if (uvmexp.free < uvmexp.freetarg || + uvmexp.inactive < uvmexp.inactarg) + uvmpd_scan(); + + /* + * done scan. unlock page queues (the only lock we are holding) + */ + uvm_unlock_pageq(); + + /* + * done! restart loop. 
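+		 *
+		 * (the wakeup below pairs with the
+		 * UVM_UNLOCK_AND_WAIT(&uvmexp.free, ...) in uvm_wait():
+		 * processes stalled waiting for free memory get to retry
+		 * now that we have finished a scan.)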
+ */ + thread_wakeup(&uvmexp.free); + } + /*NOTREACHED*/ +} + +/* + * uvmpd_scan_inactive: the first loop of uvmpd_scan broken out into + * its own function for ease of reading. + * + * => called with page queues locked + * => we work on meeting our free target by converting inactive pages + * into free pages. + * => we handle the building of swap-backed clusters + * => we return TRUE if we are exiting because we met our target + */ + +static boolean_t +uvmpd_scan_inactive(pglst) + struct pglist *pglst; +{ + boolean_t retval = FALSE; /* assume we haven't hit target */ + int s, free, result; + struct vm_page *p, *nextpg; + struct uvm_object *uobj; + struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp; + int npages; + struct vm_page *swpps[MAXBSIZE >> PAGE_SHIFT]; /* XXX: see below */ + int swnpages, swcpages; /* XXX: see below */ + int swslot, oldslot; + struct vm_anon *anon; + boolean_t swap_backed; + vaddr_t start; + UVMHIST_FUNC("uvmpd_scan_inactive"); UVMHIST_CALLED(pdhist); + + /* + * note: we currently keep swap-backed pages on a seperate inactive + * list from object-backed pages. however, merging the two lists + * back together again hasn't been ruled out. thus, we keep our + * swap cluster in "swpps" rather than in pps (allows us to mix + * clustering types in the event of a mixed inactive queue). + */ + + /* + * swslot is non-zero if we are building a swap cluster. we want + * to stay in the loop while we have a page to scan or we have + * a swap-cluster to build. + */ + swslot = 0; + swnpages = swcpages = 0; + free = 0; + + for (p = pglst->tqh_first ; p != NULL || swslot != 0 ; p = nextpg) { + + /* + * note that p can be NULL iff we have traversed the whole + * list and need to do one final swap-backed clustered pageout. + */ + if (p) { + /* + * update our copy of "free" and see if we've met + * our target + */ + s = splimp(); + uvm_lock_fpageq(); + free = uvmexp.free; + uvm_unlock_fpageq(); + splx(s); + + if (free >= uvmexp.freetarg) { + UVMHIST_LOG(pdhist," met free target: " + "exit loop", 0, 0, 0, 0); + retval = TRUE; /* hit the target! */ + + if (swslot == 0) + /* exit now if no swap-i/o pending */ + break; + + /* set p to null to signal final swap i/o */ + p = NULL; + } + } + + uobj = NULL; /* be safe and shut gcc up */ + anon = NULL; /* be safe and shut gcc up */ + + if (p) { /* if (we have a new page to consider) */ + /* + * we are below target and have a new page to consider. + */ + uvmexp.pdscans++; + nextpg = p->pageq.tqe_next; + + /* + * move referenced pages back to active queue and + * skip to next page (unlikely to happen since + * inactive pages shouldn't have any valid mappings + * and we cleared reference before deactivating). + */ + if (pmap_is_referenced(PMAP_PGARG(p))) { + uvm_pageactivate(p); + uvmexp.pdreact++; + continue; + } + + /* + * first we attempt to lock the object that this page + * belongs to. if our attempt fails we skip on to + * the next page (no harm done). it is important to + * "try" locking the object as we are locking in the + * wrong order (pageq -> object) and we don't want to + * get deadlocked. + * + * the only time we exepct to see an ownerless page + * (i.e. a page with no uobject and !PQ_ANON) is if an + * anon has loaned a page from a uvm_object and the + * uvm_object has dropped the ownership. in that + * case, the anon can "take over" the loaned page + * and make it its own. + */ + + /* is page part of an anon or ownerless ? 
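 *
 * (a minimal sketch, not from the imported source; the helper name
 * is hypothetical) the ownership test just below boils down to:
 *
 *      static __inline boolean_t
 *      page_is_anon_or_ownerless(struct vm_page *pg)
 *      {
 *              return ((pg->pqflags & PQ_ANON) != 0 ||
 *                  pg->uobject == NULL);
 *      }
 *
 * i.e. either an anon already owns the page, or no uvm_object does
 * (the loaned-page case described above) and the anon may claim it.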
*/ + if ((p->pqflags & PQ_ANON) || p->uobject == NULL) { + + anon = p->uanon; + +#ifdef DIAGNOSTIC + /* to be on inactive q, page must be part + * of _something_ */ + if (anon == NULL) + panic("pagedaemon: page with no anon " + "or object detected - loop 1"); +#endif + + if (!simple_lock_try(&anon->an_lock)) + /* lock failed, skip this page */ + continue; + + /* + * if the page is ownerless, claim it in the + * name of "anon"! + */ + if ((p->pqflags & PQ_ANON) == 0) { +#ifdef DIAGNOSTIC + if (p->loan_count < 1) + panic("pagedaemon: non-loaned " + "ownerless page detected -" + " loop 1"); +#endif + p->loan_count--; + p->pqflags |= PQ_ANON; /* anon now owns it */ + } + + if (p->flags & PG_BUSY) { + simple_unlock(&anon->an_lock); + uvmexp.pdbusy++; + /* someone else owns page, skip it */ + continue; + } + + uvmexp.pdanscan++; + + } else { + + uobj = p->uobject; + + if (!simple_lock_try(&uobj->vmobjlock)) + /* lock failed, skip this page */ + continue; + + if (p->flags & PG_BUSY) { + simple_unlock(&uobj->vmobjlock); + uvmexp.pdbusy++; + /* someone else owns page, skip it */ + continue; + } + + uvmexp.pdobscan++; + } + + /* + * we now have the object and the page queues locked. + * the page is not busy. if the page is clean we + * can free it now and continue. + */ + + if (p->flags & PG_CLEAN) { + /* zap all mappings with pmap_page_protect... */ + pmap_page_protect(PMAP_PGARG(p), VM_PROT_NONE); + uvm_pagefree(p); + uvmexp.pdfreed++; + + if (anon) { +#ifdef DIAGNOSTIC + /* + * an anonymous page can only be clean + * if it has valid backing store. + */ + if (anon->an_swslot == 0) + panic("pagedaemon: clean anon " + "page without backing store?"); +#endif + /* remove from object */ + anon->u.an_page = NULL; + simple_unlock(&anon->an_lock); + } else { + /* pagefree has already removed the + * page from the object */ + simple_unlock(&uobj->vmobjlock); + } + continue; + } + + /* + * this page is dirty, skip it if we'll have met our + * free target when all the current pageouts complete. + */ + if (free + uvmexp.paging > uvmexp.freetarg) + { + if (anon) { + simple_unlock(&anon->an_lock); + } else { + simple_unlock(&uobj->vmobjlock); + } + continue; + } + + /* + * the page we are looking at is dirty. we must + * clean it before it can be freed. to do this we + * first mark the page busy so that no one else will + * touch the page. we write protect all the mappings + * of the page so that no one touches it while it is + * in I/O. + */ + + swap_backed = ((p->pqflags & PQ_SWAPBACKED) != 0); + p->flags |= PG_BUSY; /* now we own it */ + UVM_PAGE_OWN(p, "scan_inactive"); + pmap_page_protect(PMAP_PGARG(p), VM_PROT_READ); + uvmexp.pgswapout++; + + /* + * for swap-backed pages we need to (re)allocate + * swap space. + */ + if (swap_backed) { + + /* + * free old swap slot (if any) + */ + if (anon) { + if (anon->an_swslot) { + uvm_swap_free(anon->an_swslot, + 1); + anon->an_swslot = 0; + } + } else { + oldslot = uao_set_swslot(uobj, + p->offset >> PAGE_SHIFT, 0); + + if (oldslot) + uvm_swap_free(oldslot, 1); + } + + /* + * start new cluster (if necessary) + */ + if (swslot == 0) { + /* want this much */ + swnpages = MAXBSIZE >> PAGE_SHIFT; + + swslot = uvm_swap_alloc(&swnpages, + TRUE); + + if (swslot == 0) { + /* no swap? give up! 
*/ + p->flags &= ~PG_BUSY; + UVM_PAGE_OWN(p, NULL); + if (anon) + simple_unlock( + &anon->an_lock); + else + simple_unlock( + &uobj->vmobjlock); + continue; + } + swcpages = 0; /* cluster is empty */ + } + + /* + * add block to cluster + */ + swpps[swcpages] = p; + uvmexp.pgswapout++; + if (anon) + anon->an_swslot = swslot + swcpages; + else + uao_set_swslot(uobj, + p->offset >> PAGE_SHIFT, + swslot + swcpages); + swcpages++; + + /* done (swap-backed) */ + } + + /* end: if (p) ["if we have new page to consider"] */ + } else { + + /* if p == NULL we must be doing a last swap i/o */ + swap_backed = TRUE; + } + + /* + * now consider doing the pageout. + * + * for swap-backed pages, we do the pageout if we have either + * filled the cluster (in which case (swnpages == swcpages) or + * run out of pages (p == NULL). + * + * for object pages, we always do the pageout. + */ + if (swap_backed) { + + if (p) { /* if we just added a page to cluster */ + if (anon) + simple_unlock(&anon->an_lock); + else + simple_unlock(&uobj->vmobjlock); + + /* cluster not full yet? */ + if (swcpages < swnpages) + continue; + } + + /* starting I/O now... set up for it */ + npages = swcpages; + ppsp = swpps; + /* for swap-backed pages only */ + start = (vaddr_t) swslot; + + /* if this is final pageout we could have a few + * extra swap blocks */ + if (swcpages < swnpages) { + uvm_swap_free(swslot + swcpages, + (swnpages - swcpages)); + } + + } else { + + /* normal object pageout */ + ppsp = pps; + npages = sizeof(pps) / sizeof(struct vm_page *); + /* not looked at because PGO_ALLPAGES is set */ + start = 0; + + } + + /* + * now do the pageout. + * + * for swap_backed pages we have already built the cluster. + * for !swap_backed pages, uvm_pager_put will call the object's + * "make put cluster" function to build a cluster on our behalf. + * + * we pass the PGO_PDFREECLUST flag to uvm_pager_put to instruct + * it to free the cluster pages for us on a successful I/O (it + * always does this for un-successful I/O requests). this + * allows us to do clustered pageout without having to deal + * with cluster pages at this level. + * + * note locking semantics of uvm_pager_put with PGO_PDFREECLUST: + * IN: locked: uobj (if !swap_backed), page queues + * OUT: locked: uobj (if !swap_backed && result !=VM_PAGER_PEND) + * !locked: pageqs, uobj (if swap_backed || VM_PAGER_PEND) + * + * [the bit about VM_PAGER_PEND saves us one lock-unlock pair] + */ + + /* locked: uobj (if !swap_backed), page queues */ + uvmexp.pdpageouts++; + result = uvm_pager_put((swap_backed) ? NULL : uobj, p, + &ppsp, &npages, PGO_ALLPAGES|PGO_PDFREECLUST, start, 0); + /* locked: uobj (if !swap_backed && result != PEND) */ + /* unlocked: pageqs, object (if swap_backed ||result == PEND) */ + + /* + * if we did i/o to swap, zero swslot to indicate that we are + * no longer building a swap-backed cluster. + */ + + if (swap_backed) + swslot = 0; /* done with this cluster */ + + /* + * first, we check for VM_PAGER_PEND which means that the + * async I/O is in progress and the async I/O done routine + * will clean up after us. in this case we move on to the + * next page. + * + * there is a very remote chance that the pending async i/o can + * finish _before_ we get here. if that happens, our page "p" + * may no longer be on the inactive queue. so we verify this + * when determining the next page (starting over at the head if + * we've lost our inactive page). 
+ */ + + if (result == VM_PAGER_PEND) { + uvmexp.paging += npages; + uvm_lock_pageq(); /* relock page queues */ + uvmexp.pdpending++; + if (p) { + if (p->pqflags & PQ_INACTIVE) + /* reload! */ + nextpg = p->pageq.tqe_next; + else + /* reload! */ + nextpg = pglst->tqh_first; + } else { + nextpg = NULL; /* done list */ + } + continue; + } + + /* + * clean up "p" if we have one + */ + + if (p) { + /* + * the I/O request to "p" is done and uvm_pager_put + * has freed any cluster pages it may have allocated + * during I/O. all that is left for us to do is + * clean up page "p" (which is still PG_BUSY). + * + * our result could be one of the following: + * VM_PAGER_OK: successful pageout + * + * VM_PAGER_AGAIN: tmp resource shortage, we skip + * to next page + * VM_PAGER_{FAIL,ERROR,BAD}: an error. we + * "reactivate" page to get it out of the way (it + * will eventually drift back into the inactive + * queue for a retry). + * VM_PAGER_UNLOCK: should never see this as it is + * only valid for "get" operations + */ + + /* relock p's object: page queues not lock yet, so + * no need for "try" */ + + /* !swap_backed case: already locked... */ + if (swap_backed) { + if (anon) + simple_lock(&anon->an_lock); + else + simple_lock(&uobj->vmobjlock); + } + +#ifdef DIAGNOSTIC + if (result == VM_PAGER_UNLOCK) + panic("pagedaemon: pageout returned " + "invalid 'unlock' code"); +#endif + + /* handle PG_WANTED now */ + if (p->flags & PG_WANTED) + /* still holding object lock */ + thread_wakeup(p); + + p->flags &= ~(PG_BUSY|PG_WANTED); + UVM_PAGE_OWN(p, NULL); + + /* released during I/O? */ + if (p->flags & PG_RELEASED) { + if (anon) { + /* remove page so we can get nextpg */ + anon->u.an_page = NULL; + + simple_unlock(&anon->an_lock); + uvm_anfree(anon); /* kills anon */ + pmap_page_protect(PMAP_PGARG(p), + VM_PROT_NONE); + anon = NULL; + uvm_lock_pageq(); + nextpg = p->pageq.tqe_next; + /* free released page */ + uvm_pagefree(p); + + } else { + +#ifdef DIAGNOSTIC + if (uobj->pgops->pgo_releasepg == NULL) + panic("pagedaemon: no " + "pgo_releasepg function"); +#endif + + /* + * pgo_releasepg nukes the page and + * gets "nextpg" for us. it returns + * with the page queues locked (when + * given nextpg ptr). + */ + if (!uobj->pgops->pgo_releasepg(p, + &nextpg)) + /* uobj died after release */ + uobj = NULL; + + /* + * lock page queues here so that they're + * always locked at the end of the loop. + */ + uvm_lock_pageq(); + } + + } else { /* page was not released during I/O */ + + uvm_lock_pageq(); + nextpg = p->pageq.tqe_next; + + if (result != VM_PAGER_OK) { + + /* pageout was a failure... */ + if (result != VM_PAGER_AGAIN) + uvm_pageactivate(p); + pmap_clear_reference(PMAP_PGARG(p)); + /* XXXCDC: if (swap_backed) FREE p's + * swap block? */ + + } else { + + /* pageout was a success... */ + pmap_clear_reference(PMAP_PGARG(p)); + pmap_clear_modify(PMAP_PGARG(p)); + p->flags |= PG_CLEAN; + /* XXX: could free page here, but old + * pagedaemon does not */ + + } + } + + /* + * drop object lock (if there is an object left). do + * a safety check of nextpg to make sure it is on the + * inactive queue (it should be since PG_BUSY pages on + * the inactive queue can't be re-queued [note: not + * true for active queue]). 
+ */ + + if (anon) + simple_unlock(&anon->an_lock); + else if (uobj) + simple_unlock(&uobj->vmobjlock); + + } /* if (p) */ else { + + /* if p is null in this loop, make sure it stays null + * in next loop */ + nextpg = NULL; + + /* + * lock page queues here just so they're always locked + * at the end of the loop. + */ + uvm_lock_pageq(); + } + + if (nextpg && (nextpg->pqflags & PQ_INACTIVE) == 0) { + printf("pagedaemon: invalid nextpg! reverting to " + "queue head\n"); + nextpg = pglst->tqh_first; /* reload! */ + } + + } /* end of "inactive" 'for' loop */ + return (retval); +} + +/* + * uvmpd_scan: scan the page queues and attempt to meet our targets. + * + * => called with pageq's locked + */ + +void +uvmpd_scan() +{ + int s, free, pages_freed, page_shortage; + struct vm_page *p, *nextpg; + struct uvm_object *uobj; + boolean_t got_it; + UVMHIST_FUNC("uvmpd_scan"); UVMHIST_CALLED(pdhist); + + uvmexp.pdrevs++; /* counter */ + +#ifdef __GNUC__ + uobj = NULL; /* XXX gcc */ +#endif + /* + * get current "free" page count + */ + s = splimp(); + uvm_lock_fpageq(); + free = uvmexp.free; + uvm_unlock_fpageq(); + splx(s); + +#ifndef __SWAP_BROKEN + /* + * swap out some processes if we are below our free target. + * we need to unlock the page queues for this. + */ + if (free < uvmexp.freetarg) { + + uvmexp.pdswout++; + UVMHIST_LOG(pdhist," free %d < target %d: swapout", free, + uvmexp.freetarg, 0, 0); + uvm_unlock_pageq(); + uvm_swapout_threads(); + pmap_update(); /* update so we can scan inactive q */ + uvm_lock_pageq(); + + } +#endif + + /* + * now we want to work on meeting our targets. first we work on our + * free target by converting inactive pages into free pages. then + * we work on meeting our inactive target by converting active pages + * to inactive ones. + */ + + UVMHIST_LOG(pdhist, " starting 'free' loop",0,0,0,0); + pages_freed = uvmexp.pdfreed; /* so far... */ + + /* + * do loop #1! alternate starting queue between swap and object based + * on the low bit of uvmexp.pdrevs (which we bump by one each call). + */ + + got_it = FALSE; + if ((uvmexp.pdrevs & 1) != 0 && uvmexp.nswapdev != 0) + got_it = uvmpd_scan_inactive(&uvm.page_inactive_swp); + if (!got_it) + got_it = uvmpd_scan_inactive(&uvm.page_inactive_obj); + if (!got_it && (uvmexp.pdrevs & 1) == 0 && uvmexp.nswapdev != 0) + (void) uvmpd_scan_inactive(&uvm.page_inactive_swp); + + /* + * we have done the scan to get free pages. now we work on meeting + * our inactive target. + */ + + page_shortage = uvmexp.inactarg - uvmexp.inactive; + pages_freed = uvmexp.pdfreed - pages_freed; /* # pages freed in loop */ + if (page_shortage <= 0 && pages_freed == 0) + page_shortage = 1; + + UVMHIST_LOG(pdhist, " second loop: page_shortage=%d", page_shortage, + 0, 0, 0); + for (p = uvm.page_active.tqh_first ; + p != NULL && page_shortage > 0 ; p = nextpg) { + nextpg = p->pageq.tqe_next; + if (p->flags & PG_BUSY) + continue; /* quick check before trying to lock */ + + /* + * lock owner + */ + /* is page anon owned or ownerless? */ + if ((p->pqflags & PQ_ANON) || p->uobject == NULL) { + +#ifdef DIAGNOSTIC + if (p->uanon == NULL) + panic("pagedaemon: page with no anon or " + "object detected - loop 2"); +#endif + + if (!simple_lock_try(&p->uanon->an_lock)) + continue; + + /* take over the page? 
*/ + if ((p->pqflags & PQ_ANON) == 0) { + +#ifdef DIAGNOSTIC + if (p->loan_count < 1) + panic("pagedaemon: non-loaned " + "ownerless page detected - loop 2"); +#endif + + p->loan_count--; + p->pqflags |= PQ_ANON; + } + + } else { + + if (!simple_lock_try(&p->uobject->vmobjlock)) + continue; + + } + + if ((p->flags & PG_BUSY) == 0) { + pmap_page_protect(PMAP_PGARG(p), VM_PROT_NONE); + /* no need to check wire_count as pg is "active" */ + uvm_pagedeactivate(p); + uvmexp.pddeact++; + page_shortage--; + } + + if (p->pqflags & PQ_ANON) + simple_unlock(&p->uanon->an_lock); + else + simple_unlock(&p->uobject->vmobjlock); + } + + /* + * done scan + */ +} diff --git a/sys/uvm/uvm_pdaemon.h b/sys/uvm/uvm_pdaemon.h new file mode 100644 index 00000000000..c5aad80ef81 --- /dev/null +++ b/sys/uvm/uvm_pdaemon.h @@ -0,0 +1,86 @@ +/* $NetBSD: uvm_pdaemon.h,v 1.5 1998/02/10 14:12:28 mrg Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * Copyright (c) 1991, 1993, The Regents of the University of California. + * + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor, + * Washington University, the University of California, Berkeley and + * its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_pageout.h 8.2 (Berkeley) 1/12/94 + * from: Id: uvm_pdaemon.h,v 1.1.2.4 1998/02/02 20:07:20 chuck Exp + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +#ifndef _UVM_UVM_PDAEMON_H_ +#define _UVM_UVM_PDAEMON_H_ + +/* + * uvm_pdaemon.h: page daemon hooks + */ + +/* + * prototypes + */ + +void uvm_wait __P((char *)); + +#endif /* _UVM_UVM_PDAEMON_H_ */ diff --git a/sys/uvm/uvm_pglist.c b/sys/uvm/uvm_pglist.c new file mode 100644 index 00000000000..d7fe645ebeb --- /dev/null +++ b/sys/uvm/uvm_pglist.c @@ -0,0 +1,292 @@ +/* $NetBSD: uvm_pglist.c,v 1.6 1998/08/13 02:11:03 eeh Exp $ */ + +#define VM_PAGE_ALLOC_MEMORY_STATS + +/*- + * Copyright (c) 1997 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, + * NASA Ames Research Center. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * uvm_pglist.c: pglist functions + * + * XXX: was part of uvm_page but has an incompatable copyright so it + * gets its own file now. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#include <uvm/uvm.h> + +#ifdef VM_PAGE_ALLOC_MEMORY_STATS +#define STAT_INCR(v) (v)++ +#define STAT_DECR(v) do { \ + if ((v) == 0) \ + printf("%s:%d -- Already 0!\n", __FILE__, __LINE__); \ + else \ + (v)--; \ + } while (0) +u_long uvm_pglistalloc_npages; +#else +#define STAT_INCR(v) +#define STAT_DECR(v) +#endif + +/* + * uvm_pglistalloc: allocate a list of pages + * + * => allocated pages are placed at the tail of rlist. rlist is + * assumed to be properly initialized by caller. + * => returns 0 on success or errno on failure + * => XXX: implementation allocates only a single segment, also + * might be able to better advantage of vm_physeg[]. + * => doesn't take into account clean non-busy pages on inactive list + * that could be used(?) + * => params: + * size the size of the allocation, rounded to page size. + * low the low address of the allowed allocation range. + * high the high address of the allowed allocation range. + * alignment memory must be aligned to this power-of-two boundary. + * boundary no segment in the allocation may cross this + * power-of-two boundary (relative to zero). + */ + +int +uvm_pglistalloc(size, low, high, alignment, boundary, rlist, nsegs, waitok) + psize_t size; + paddr_t low, high, alignment, boundary; + struct pglist *rlist; + int nsegs, waitok; +{ + paddr_t try, idxpa, lastidxpa; + int psi; + struct vm_page *pgs; + int s, tryidx, idx, end, error, free_list; + vm_page_t m; + u_long pagemask; +#ifdef DEBUG + vm_page_t tp; +#endif + +#ifdef DIAGNOSTIC + if ((alignment & (alignment - 1)) != 0) + panic("vm_page_alloc_memory: alignment must be power of 2"); + + if ((boundary & (boundary - 1)) != 0) + panic("vm_page_alloc_memory: boundary must be power of 2"); +#endif + + /* + * Our allocations are always page granularity, so our alignment + * must be, too. + */ + if (alignment < PAGE_SIZE) + alignment = PAGE_SIZE; + + size = round_page(size); + try = roundup(low, alignment); + + if (boundary != 0 && boundary < size) + return (EINVAL); + + pagemask = ~(boundary - 1); + + /* Default to "lose". */ + error = ENOMEM; + + /* + * Block all memory allocation and lock the free list. + */ + s = splimp(); + uvm_lock_fpageq(); /* lock free page queue */ + + /* Are there even any free pages? */ + for (idx = 0; idx < VM_NFREELIST; idx++) + if (uvm.page_free[idx].tqh_first != NULL) + break; + if (idx == VM_NFREELIST) + goto out; + + for (;; try += alignment) { + if (try + size > high) { + /* + * We've run past the allowable range. + */ + goto out; + } + + /* + * Make sure this is a managed physical page. + */ + + if ((psi = vm_physseg_find(atop(try), &idx)) == -1) + continue; /* managed? */ + if (vm_physseg_find(atop(try + size), NULL) != psi) + continue; /* end must be in this segment */ + + tryidx = idx; + end = idx + (size / PAGE_SIZE); + pgs = vm_physmem[psi].pgs; + + /* + * Found a suitable starting page. See of the range is free. + */ + for (; idx < end; idx++) { + if (VM_PAGE_IS_FREE(&pgs[idx]) == 0) { + /* + * Page not available. + */ + break; + } + + idxpa = VM_PAGE_TO_PHYS(&pgs[idx]); + + if (idx > tryidx) { + lastidxpa = VM_PAGE_TO_PHYS(&pgs[idx - 1]); + + if ((lastidxpa + PAGE_SIZE) != idxpa) { + /* + * Region not contiguous. 
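 *
 * (worked example, not from the imported source; it assumes
 * PAGE_SIZE == 0x1000) with lastidxpa == 0x0fff000, the run only
 * continues if idxpa == 0x1000000; any other value leaves a hole
 * and we give up on this starting point.
 *
 * even a contiguous pair can still fail the boundary test just
 * below: with a requested boundary of 0x1000000 (16MB, so
 * pagemask == ~0xffffff),
 *
 *      (0x0fff000 ^ 0x1000000) & pagemask == 0x1000000 != 0
 *
 * so that pair straddles a 16MB boundary and is rejected there.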
+ */ + break; + } + if (boundary != 0 && + ((lastidxpa ^ idxpa) & pagemask) != 0) { + /* + * Region crosses boundary. + */ + break; + } + } + } + + if (idx == end) { + /* + * Woo hoo! Found one. + */ + break; + } + } + + /* + * we have a chunk of memory that conforms to the requested constraints. + */ + idx = tryidx; + while (idx < end) { + m = &pgs[idx]; + free_list = uvm_page_lookup_freelist(m); +#ifdef DEBUG + for (tp = uvm.page_free[free_list].tqh_first; + tp != NULL; tp = tp->pageq.tqe_next) { + if (tp == m) + break; + } + if (tp == NULL) + panic("uvm_pglistalloc: page not on freelist"); +#endif + TAILQ_REMOVE(&uvm.page_free[free_list], m, pageq); + uvmexp.free--; + m->flags = PG_CLEAN; + m->pqflags = 0; + m->uobject = NULL; + m->uanon = NULL; + m->wire_count = 0; + m->loan_count = 0; + TAILQ_INSERT_TAIL(rlist, m, pageq); + idx++; + STAT_INCR(uvm_pglistalloc_npages); + } + error = 0; + +out: + uvm_unlock_fpageq(); + splx(s); + + /* + * check to see if we need to generate some free pages waking + * the pagedaemon. + * XXX: we read uvm.free without locking + */ + + if (uvmexp.free < uvmexp.freemin || + (uvmexp.free < uvmexp.freetarg && + uvmexp.inactive < uvmexp.inactarg)) + thread_wakeup(&uvm.pagedaemon); + + return (error); +} + +/* + * uvm_pglistfree: free a list of pages + * + * => pages should already be unmapped + */ + +void +uvm_pglistfree(list) + struct pglist *list; +{ + vm_page_t m; + int s; + + /* + * Block all memory allocation and lock the free list. + */ + s = splimp(); + uvm_lock_fpageq(); + + while ((m = list->tqh_first) != NULL) { +#ifdef DIAGNOSTIC + if (m->pqflags & (PQ_ACTIVE|PQ_INACTIVE)) + panic("uvm_pglistfree: active/inactive page!"); +#endif + TAILQ_REMOVE(list, m, pageq); + m->pqflags = PQ_FREE; + TAILQ_INSERT_TAIL(&uvm.page_free[uvm_page_lookup_freelist(m)], + m, pageq); + uvmexp.free++; + STAT_DECR(uvm_pglistalloc_npages); + } + + uvm_unlock_fpageq(); + splx(s); +} diff --git a/sys/uvm/uvm_stat.c b/sys/uvm/uvm_stat.c new file mode 100644 index 00000000000..fbe3139c116 --- /dev/null +++ b/sys/uvm/uvm_stat.c @@ -0,0 +1,253 @@ +/* $NetBSD: uvm_stat.c,v 1.10 1998/06/20 13:19:00 mrg Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_stat.c,v 1.1.2.3 1997/12/19 15:01:00 mrg Exp + */ + +/* + * uvm_stat.c + */ + +#include <sys/param.h> +#include <sys/systm.h> + +#include <vm/vm.h> + +#include <uvm/uvm.h> + +/* + * globals + */ + +struct uvm_cnt *uvm_cnt_head = NULL; + +#ifdef UVMHIST +struct uvm_history_head uvm_histories; +#endif + +#ifdef UVMHIST_PRINT +int uvmhist_print_enabled = 1; +#endif + +/* + * prototypes + */ + +#ifdef UVMHIST +void uvmhist_dump __P((struct uvm_history *)); +void uvm_hist __P((u_int32_t)); +static void uvmhist_dump_histories __P((struct uvm_history *[])); +#endif +void uvmcnt_dump __P((void)); +void uvm_dump __P((void)); + + +#ifdef UVMHIST +/* call this from ddb */ +void +uvmhist_dump(l) + struct uvm_history *l; +{ + int lcv, s; + + s = splhigh(); + lcv = l->f; + do { + if (l->e[lcv].fmt) + uvmhist_print(&l->e[lcv]); + lcv = (lcv + 1) % l->n; + } while (lcv != l->f); + splx(s); +} + +/* + * print a merged list of uvm_history structures + */ +static void +uvmhist_dump_histories(hists) + struct uvm_history *hists[]; +{ + struct timeval tv; + int cur[MAXHISTS]; + int s, lcv, hi; + + /* so we don't get corrupted lists! */ + s = splhigh(); + + /* find the first of each list */ + for (lcv = 0; hists[lcv]; lcv++) + cur[lcv] = hists[lcv]->f; + + /* + * here we loop "forever", finding the next earliest + * history entry and printing it. cur[X] is the current + * entry to test for the history in hists[X]. if it is + * -1, then this history is finished. + */ + for (;;) { + hi = -1; + tv.tv_sec = tv.tv_usec = 0; + + /* loop over each history */ + for (lcv = 0; hists[lcv]; lcv++) { +restart: + if (cur[lcv] == -1) + continue; + + /* + * if the format is empty, go to the next entry + * and retry. + */ + if (hists[lcv]->e[cur[lcv]].fmt == NULL) { + cur[lcv] = (cur[lcv] + 1) % (hists[lcv]->n); + if (cur[lcv] == hists[lcv]->f) + cur[lcv] = -1; + goto restart; + } + + /* + * if the time hasn't been set yet, or this entry is + * earlier than the current tv, set the time and history + * index. + */ + if (tv.tv_sec == 0 || + timercmp(&hists[lcv]->e[cur[lcv]].tv, &tv, <)) { + tv = hists[lcv]->e[cur[lcv]].tv; + hi = lcv; + } + } + + /* if we didn't find any entries, we must be done */ + if (hi == -1) + break; + + /* print and move to the next entry */ + uvmhist_print(&hists[hi]->e[cur[hi]]); + cur[hi] = (cur[hi] + 1) % (hists[hi]->n); + if (cur[hi] == hists[hi]->f) + cur[hi] = -1; + } + + /* done! */ + splx(s); +} + +/* + * call this from ddb. `bitmask' is from <uvm/uvm_stat.h>. it + * merges the named histories. 
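 *
 * (usage sketch, not from the imported source) to merge both of the
 * histories defined so far from the ddb prompt:
 *
 *      uvm_hist(UVMHIST_MAPHIST | UVMHIST_PDHIST);
 *
 * a zero bitmask selects every history as well, since each test
 * below treats bitmask == 0 as a match.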
+ */ +void +uvm_hist(bitmask) + u_int32_t bitmask; /* XXX only support 32 hists */ +{ + struct uvm_history *hists[MAXHISTS + 1]; + int i = 0; + + if ((bitmask & UVMHIST_MAPHIST) || bitmask == 0) + hists[i++] = &maphist; + + if ((bitmask & UVMHIST_PDHIST) || bitmask == 0) + hists[i++] = &pdhist; + + hists[i] = NULL; + + uvmhist_dump_histories(hists); +} +#endif /* UVMHIST */ + +void +uvmcnt_dump() +{ + struct uvm_cnt *uvc = uvm_cnt_head; + + while (uvc) { + if ((uvc->t & UVMCNT_MASK) != UVMCNT_CNT) + continue; + printf("%s = %d\n", uvc->name, uvc->c); + uvc = uvc->next; + } +} + +/* + * uvm_dump: ddb hook to dump interesting uvm counters + */ +void +uvm_dump() +{ + + printf("Current UVM status:\n"); + printf(" pagesize=%d (0x%x), pagemask=0x%x, pageshift=%d\n", + uvmexp.pagesize, uvmexp.pagesize, uvmexp.pagemask, + uvmexp.pageshift); + printf(" %d VM pages: %d active, %d inactive, %d wired, %d free\n", + uvmexp.npages, uvmexp.active, uvmexp.inactive, uvmexp.wired, + uvmexp.free); + printf(" freemin=%d, free-target=%d, inactive-target=%d, " + "wired-max=%d\n", uvmexp.freemin, uvmexp.freetarg, uvmexp.inactarg, + uvmexp.wiredmax); + printf(" faults=%d, traps=%d, intrs=%d, ctxswitch=%d\n", + uvmexp.faults, uvmexp.traps, uvmexp.intrs, uvmexp.swtch); + printf(" softint=%d, syscalls=%d, swapins=%d, swapouts=%d\n", + uvmexp.softs, uvmexp.syscalls, uvmexp.swapins, uvmexp.swapouts); + + printf(" fault counts:\n"); + printf(" noram=%d, noanon=%d, pgwait=%d, pgrele=%d\n", + uvmexp.fltnoram, uvmexp.fltnoanon, uvmexp.fltpgwait, + uvmexp.fltpgrele); + printf(" ok relocks(total)=%d(%d), anget(retrys)=%d(%d), " + "amapcopy=%d\n", uvmexp.fltrelckok, uvmexp.fltrelck, + uvmexp.fltanget, uvmexp.fltanretry, uvmexp.fltamcopy); + printf(" neighbor anon/obj pg=%d/%d, gets(lock/unlock)=%d/%d\n", + uvmexp.fltnamap, uvmexp.fltnomap, uvmexp.fltlget, uvmexp.fltget); + printf(" cases: anon=%d, anoncow=%d, obj=%d, prcopy=%d, przero=%d\n", + uvmexp.flt_anon, uvmexp.flt_acow, uvmexp.flt_obj, uvmexp.flt_prcopy, + uvmexp.flt_przero); + + printf(" daemon and swap counts:\n"); + printf(" woke=%d, revs=%d, scans=%d, swout=%d\n", uvmexp.pdwoke, + uvmexp.pdrevs, uvmexp.pdscans, uvmexp.pdswout); + printf(" busy=%d, freed=%d, reactivate=%d, deactivate=%d\n", + uvmexp.pdbusy, uvmexp.pdfreed, uvmexp.pdreact, uvmexp.pddeact); + printf(" pageouts=%d, pending=%d, nswget=%d\n", uvmexp.pdpageouts, + uvmexp.pdpending, uvmexp.nswget); + printf(" nswapdev=%d, nanon=%d, nfreeanon=%d\n", uvmexp.nswapdev, + uvmexp.nanon, uvmexp.nfreeanon); + + printf(" kernel pointers:\n"); + printf(" objs(kern/kmem/mb)=%p/%p/%p\n", uvm.kernel_object, + uvmexp.kmem_object, uvmexp.mb_object); +} diff --git a/sys/uvm/uvm_stat.h b/sys/uvm/uvm_stat.h new file mode 100644 index 00000000000..62ce32fe46e --- /dev/null +++ b/sys/uvm/uvm_stat.h @@ -0,0 +1,245 @@ +/* $NetBSD: uvm_stat.h,v 1.13 1998/08/09 22:36:39 perry Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_stat.h,v 1.1.2.4 1998/02/07 01:16:56 chs Exp + */ + +#ifndef _UVM_UVM_STAT_H_ +#define _UVM_UVM_STAT_H_ + +#include <sys/queue.h> + +/* + * uvm_stat: monitor what is going on with uvm (or whatever) + */ + +/* + * counters [XXX: maybe replace event counters with this] + */ + +#define UVMCNT_MASK 0xf /* rest are private */ +#define UVMCNT_CNT 0 /* normal counter */ +#define UVMCNT_DEV 1 /* device event counter */ + +struct uvm_cnt { + int c; /* the value */ + int t; /* type */ + struct uvm_cnt *next; /* global list of cnts */ + char *name; /* counter name */ + void *p; /* private data */ +}; + +extern struct uvm_cnt *uvm_cnt_head; + +/* + * counter operations. assume spl is set ok. + */ + +#define UVMCNT_INIT(CNT,TYP,VAL,NAM,PRIV) \ +do { \ + CNT.c = VAL; \ + CNT.t = TYP; \ + CNT.next = uvm_cnt_head; \ + uvm_cnt_head = &CNT; \ + CNT.name = NAM; \ + CNT.p = PRIV; \ +} while (0) + +#define UVMCNT_SET(C,V) \ +do { \ + (C).c = (V); \ +} while (0) + +#define UVMCNT_ADD(C,V) \ +do { \ + (C).c += (V); \ +} while (0) + +#define UVMCNT_INCR(C) UVMCNT_ADD(C,1) +#define UVMCNT_DECR(C) UVMCNT_ADD(C,-1) + + +/* + * history/tracing + */ + +struct uvm_history_ent { + struct timeval tv; /* time stamp */ + char *fmt; /* printf format */ + size_t fmtlen; /* length of printf format */ + char *fn; /* function name */ + size_t fnlen; /* length of function name */ + u_long call; /* function call number */ + u_long v[4]; /* values */ +}; + +struct uvm_history { + const char *name; /* name of this this history */ + size_t namelen; /* length of name, not including null */ + LIST_ENTRY(uvm_history) list; /* link on list of all histories */ + int n; /* number of entries */ + int f; /* next free one */ + simple_lock_data_t l; /* lock on this history */ + struct uvm_history_ent *e; /* the malloc'd entries */ +}; + +LIST_HEAD(uvm_history_head, uvm_history); + +/* + * grovelling lists all at once. we currently do not allow more than + * 32 histories to exist, as the way to dump a number of them at once + * is by calling uvm_hist() with a bitmask. + */ + +/* this is used to set the size of some arrays */ +#define MAXHISTS 32 /* do not change this! 
*/ + +/* and these are the bit values of each history */ +#define UVMHIST_MAPHIST 0x00000001 /* maphist */ +#define UVMHIST_PDHIST 0x00000002 /* pdhist */ + +/* + * macros to use the history/tracing code. note that UVMHIST_LOG + * must take 4 arguments (even if they are ignored by the format). + */ +#ifndef UVMHIST +#define UVMHIST_DECL(NAME) +#define UVMHIST_INIT(NAME,N) +#define UVMHIST_INIT_STATIC(NAME,BUF) +#define UVMHIST_LOG(NAME,FMT,A,B,C,D) +#define UVMHIST_CALLED(NAME) +#define UVMHIST_FUNC(FNAME) +#define uvmhist_dump(NAME) +#else +extern struct uvm_history_head uvm_histories; + +#define UVMHIST_DECL(NAME) struct uvm_history NAME + +#define UVMHIST_INIT(NAME,N) \ +do { \ + (NAME).name = __STRING(NAME); \ + (NAME).namelen = strlen((NAME).name); \ + (NAME).n = (N); \ + (NAME).f = 0; \ + simple_lock_init(&(NAME).l); \ + (NAME).e = (struct uvm_history_ent *) \ + malloc(sizeof(struct uvm_history_ent) * (N), M_TEMP, \ + M_WAITOK); \ + bzero((NAME).e, sizeof(struct uvm_history_ent) * (N)); \ + LIST_INSERT_HEAD(&uvm_histories, &(NAME), list); \ +} while (0) + +#define UVMHIST_INIT_STATIC(NAME,BUF) \ +do { \ + (NAME).name = __STRING(NAME); \ + (NAME).namelen = strlen((NAME).name); \ + (NAME).n = sizeof(BUF) / sizeof(struct uvm_history_ent); \ + (NAME).f = 0; \ + simple_lock_init(&(NAME).l); \ + (NAME).e = (struct uvm_history_ent *) (BUF); \ + bzero((NAME).e, sizeof(struct uvm_history_ent) * (NAME).n); \ + LIST_INSERT_HEAD(&uvm_histories, &(NAME), list); \ +} while (0) + +extern int cold; + +#if defined(UVMHIST_PRINT) +extern int uvmhist_print_enabled; +#define UVMHIST_PRINTNOW(E) \ +do { \ + if (uvmhist_print_enabled) { \ + uvmhist_print(E); \ + DELAY(100000); \ + } \ +} while (0) +#else +#define UVMHIST_PRINTNOW(E) /* nothing */ +#endif + +#define UVMHIST_LOG(NAME,FMT,A,B,C,D) \ +do { \ + register int i, s = splhigh(); \ + simple_lock(&(NAME).l); \ + i = (NAME).f; \ + (NAME).f = (i + 1) % (NAME).n; \ + simple_unlock(&(NAME).l); \ + splx(s); \ + if (!cold) \ + microtime(&(NAME).e[i].tv); \ + (NAME).e[i].fmt = (FMT); \ + (NAME).e[i].fmtlen = strlen((NAME).e[i].fmt); \ + (NAME).e[i].fn = _uvmhist_name; \ + (NAME).e[i].fnlen = strlen((NAME).e[i].fn); \ + (NAME).e[i].call = _uvmhist_call; \ + (NAME).e[i].v[0] = (u_long)(A); \ + (NAME).e[i].v[1] = (u_long)(B); \ + (NAME).e[i].v[2] = (u_long)(C); \ + (NAME).e[i].v[3] = (u_long)(D); \ + UVMHIST_PRINTNOW(&((NAME).e[i])); \ +} while (0) + +#define UVMHIST_CALLED(NAME) \ +do { \ + { \ + int s = splhigh(); \ + simple_lock(&(NAME).l); \ + _uvmhist_call = _uvmhist_cnt++; \ + simple_unlock(&(NAME).l); \ + splx(s); \ + } \ + UVMHIST_LOG(NAME,"called!", 0, 0, 0, 0); \ +} while (0) + +#define UVMHIST_FUNC(FNAME) \ + static int _uvmhist_cnt = 0; \ + static char *_uvmhist_name = FNAME; \ + int _uvmhist_call; + +static __inline void uvmhist_print __P((struct uvm_history_ent *)); + +static __inline void +uvmhist_print(e) + struct uvm_history_ent *e; +{ + printf("%06ld.%06ld ", e->tv.tv_sec, e->tv.tv_usec); + printf("%s#%ld: ", e->fn, e->call); + printf(e->fmt, e->v[0], e->v[1], e->v[2], e->v[3]); + printf("\n"); +} +#endif /* UVMHIST */ + +#endif /* _UVM_UVM_STAT_H_ */ diff --git a/sys/uvm/uvm_swap.c b/sys/uvm/uvm_swap.c new file mode 100644 index 00000000000..9fb7611e7a5 --- /dev/null +++ b/sys/uvm/uvm_swap.c @@ -0,0 +1,1977 @@ +/* $NetBSD: uvm_swap.c,v 1.23 1998/12/26 06:25:59 marc Exp $ */ + +/* + * Copyright (c) 1995, 1996, 1997 Matthew R. Green + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp + * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/disklabel.h> +#include <sys/errno.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/extent.h> +#include <sys/mount.h> +#include <sys/pool.h> +#include <sys/syscallargs.h> +#include <sys/swap.h> + +#include <vm/vm.h> +#include <vm/vm_conf.h> + +#include <uvm/uvm.h> + +#include <miscfs/specfs/specdev.h> + +/* + * uvm_swap.c: manage configuration and i/o to swap space. + */ + +/* + * swap space is managed in the following way: + * + * each swap partition or file is described by a "swapdev" structure. + * each "swapdev" structure contains a "swapent" structure which contains + * information that is passed up to the user (via system calls). + * + * each swap partition is assigned a "priority" (int) which controls + * swap parition usage. + * + * the system maintains a global data structure describing all swap + * partitions/files. there is a sorted LIST of "swappri" structures + * which describe "swapdev"'s at that priority. this LIST is headed + * by the "swap_priority" global var. each "swappri" contains a + * CIRCLEQ of "swapdev" structures at that priority. + * + * the system maintains a fixed pool of "swapbuf" structures for use + * at swap i/o time. a swapbuf includes a "buf" structure and an + * "aiodone" [we want to avoid malloc()'ing anything at swapout time + * since memory may be low]. + * + * locking: + * - swap_syscall_lock (sleep lock): this lock serializes the swapctl + * system call and prevents the swap priority list from changing + * while we are in the middle of a system call (e.g. SWAP_STATS). + * - swap_data_lock (simple_lock): this lock protects all swap data + * structures including the priority list, the swapdev structures, + * and the swapmap extent. + * - swap_buf_lock (simple_lock): this lock protects the free swapbuf + * pool. 
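 *
 * (a rough sketch of the resulting order, not from the imported
 * source) a configuration request is expected to serialize as:
 *
 *      lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, (void *)0, p);
 *      simple_lock(&swap_data_lock);
 *      ... edit the priority list / swapdev entries ...
 *      simple_unlock(&swap_data_lock);
 *      lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0, p);
 *
 * the sleep lock is taken first and held across the whole system
 * call, while the simple lock only covers the short list updates.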
+ * + * each swap device has the following info: + * - swap device in use (could be disabled, preventing future use) + * - swap enabled (allows new allocations on swap) + * - map info in /dev/drum + * - vnode pointer + * for swap files only: + * - block size + * - max byte count in buffer + * - buffer + * - credentials to use when doing i/o to file + * + * userland controls and configures swap with the swapctl(2) system call. + * the sys_swapctl performs the following operations: + * [1] SWAP_NSWAP: returns the number of swap devices currently configured + * [2] SWAP_STATS: given a pointer to an array of swapent structures + * (passed in via "arg") of a size passed in via "misc" ... we load + * the current swap config into the array. + * [3] SWAP_ON: given a pathname in arg (could be device or file) and a + * priority in "misc", start swapping on it. + * [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device + * [5] SWAP_CTL: changes the priority of a swap device (new priority in + * "misc") + */ + +/* + * SWAP_TO_FILES: allows swapping to plain files. + */ + +#define SWAP_TO_FILES + +/* + * swapdev: describes a single swap partition/file + * + * note the following should be true: + * swd_inuse <= swd_nblks [number of blocks in use is <= total blocks] + * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel] + */ +struct swapdev { + struct oswapent swd_ose; +#define swd_dev swd_ose.ose_dev /* device id */ +#define swd_flags swd_ose.ose_flags /* flags:inuse/enable/fake */ +#define swd_priority swd_ose.ose_priority /* our priority */ + /* also: swd_ose.ose_nblks, swd_ose.ose_inuse */ + char *swd_path; /* saved pathname of device */ + int swd_pathlen; /* length of pathname */ + int swd_npages; /* #pages we can use */ + int swd_npginuse; /* #pages in use */ + int swd_drumoffset; /* page0 offset in drum */ + int swd_drumsize; /* #pages in drum */ + struct extent *swd_ex; /* extent for this swapdev */ + struct vnode *swd_vp; /* backing vnode */ + CIRCLEQ_ENTRY(swapdev) swd_next; /* priority circleq */ + +#ifdef SWAP_TO_FILES + int swd_bsize; /* blocksize (bytes) */ + int swd_maxactive; /* max active i/o reqs */ + struct buf swd_tab; /* buffer list */ + struct ucred *swd_cred; /* cred for file access */ +#endif +}; + +/* + * swap device priority entry; the list is kept sorted on `spi_priority'. + */ +struct swappri { + int spi_priority; /* priority */ + CIRCLEQ_HEAD(spi_swapdev, swapdev) spi_swapdev; + /* circleq of swapdevs at this priority */ + LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */ +}; + +/* + * swapbuf, swapbuffer plus async i/o info + */ +struct swapbuf { + struct buf sw_buf; /* a buffer structure */ + struct uvm_aiodesc sw_aio; /* aiodesc structure, used if ASYNC */ + SIMPLEQ_ENTRY(swapbuf) sw_sq; /* free list pointer */ +}; + +/* + * The following two structures are used to keep track of data transfers + * on swap devices associated with regular files. + * NOTE: this code is more or less a copy of vnd.c; we use the same + * structure names here to ease porting.. + */ +struct vndxfer { + struct buf *vx_bp; /* Pointer to parent buffer */ + struct swapdev *vx_sdp; + int vx_error; + int vx_pending; /* # of pending aux buffers */ + int vx_flags; +#define VX_BUSY 1 +#define VX_DEAD 2 +}; + +struct vndbuf { + struct buf vb_buf; + struct vndxfer *vb_xfer; +}; + + +/* + * We keep a of pool vndbuf's and vndxfer structures. 
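 *
 * (usage sketch, not from the imported source) the accessor macros
 * defined below hide the splbio/pool_get dance:
 *
 *      struct vndxfer *vnx;
 *
 *      getvndxfer(vnx);        allocates at splbio, may sleep
 *      ... fill in vnx, hand out the child vndbufs ...
 *      putvndxfer(vnx);        returns it to the pool
 *
 * vndbufs are handled the same way via getvndbuf/putvndbuf.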
+ */ +struct pool *vndxfer_pool; +struct pool *vndbuf_pool; + +#define getvndxfer(vnx) do { \ + int s = splbio(); \ + vnx = (struct vndxfer *) \ + pool_get(vndxfer_pool, PR_MALLOCOK|PR_WAITOK); \ + splx(s); \ +} while (0) + +#define putvndxfer(vnx) { \ + pool_put(vndxfer_pool, (void *)(vnx)); \ +} + +#define getvndbuf(vbp) do { \ + int s = splbio(); \ + vbp = (struct vndbuf *) \ + pool_get(vndbuf_pool, PR_MALLOCOK|PR_WAITOK); \ + splx(s); \ +} while (0) + +#define putvndbuf(vbp) { \ + pool_put(vndbuf_pool, (void *)(vbp)); \ +} + + +/* + * local variables + */ +static struct extent *swapmap; /* controls the mapping of /dev/drum */ +SIMPLEQ_HEAD(swapbufhead, swapbuf); +struct pool *swapbuf_pool; + +/* list of all active swap devices [by priority] */ +LIST_HEAD(swap_priority, swappri); +static struct swap_priority swap_priority; + +/* locks */ +lock_data_t swap_syscall_lock; +static simple_lock_data_t swap_data_lock; + +/* + * prototypes + */ +#ifdef notyet +static void swapdrum_add __P((struct swapdev *, int)); +#endif +static struct swapdev *swapdrum_getsdp __P((int)); + +#ifdef notyet /* swapctl */ +static struct swapdev *swaplist_find __P((struct vnode *, int)); +static void swaplist_insert __P((struct swapdev *, + struct swappri *, int)); +static void swaplist_trim __P((void)); + +static int swap_on __P((struct proc *, struct swapdev *)); +#endif +#ifdef SWAP_OFF_WORKS +static int swap_off __P((struct proc *, struct swapdev *)); +#endif + +#ifdef SWAP_TO_FILES +static void sw_reg_strategy __P((struct swapdev *, struct buf *, int)); +static void sw_reg_iodone __P((struct buf *)); +static void sw_reg_start __P((struct swapdev *)); +#endif + +static void uvm_swap_aiodone __P((struct uvm_aiodesc *)); +static void uvm_swap_bufdone __P((struct buf *)); +static int uvm_swap_io __P((struct vm_page **, int, int, int)); + +/* + * uvm_swap_init: init the swap system data structures and locks + * + * => called at boot time from init_main.c after the filesystems + * are brought up (which happens after uvm_init()) + */ +void +uvm_swap_init() +{ + UVMHIST_FUNC("uvm_swap_init"); + + UVMHIST_CALLED(pdhist); + /* + * first, init the swap list, its counter, and its lock. + * then get a handle on the vnode for /dev/drum by using + * the its dev_t number ("swapdev", from MD conf.c). + */ + + LIST_INIT(&swap_priority); + uvmexp.nswapdev = 0; + lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0); + simple_lock_init(&swap_data_lock); + + if (bdevvp(swapdev, &swapdev_vp)) + panic("uvm_swap_init: can't get vnode for swap device"); + + /* + * create swap block resource map to map /dev/drum. the range + * from 1 to INT_MAX allows 2 gigablocks of swap space. note + * that block 0 is reserved (used to indicate an allocation + * failure, or no allocation). + */ + swapmap = extent_create("swapmap", 1, INT_MAX, + M_VMSWAP, 0, 0, EX_NOWAIT); + if (swapmap == 0) + panic("uvm_swap_init: extent_create failed"); + + /* + * allocate our private pool of "swapbuf" structures (includes + * a "buf" structure). ["nswbuf" comes from param.c and can + * be adjusted by MD code before we get here]. + */ + + swapbuf_pool = + pool_create(sizeof(struct swapbuf), 0, 0, 0, "swp buf", 0, + NULL, NULL, 0); + if (swapbuf_pool == NULL) + panic("swapinit: pool_create failed"); + /* XXX - set a maximum on swapbuf_pool? 
*/ + + vndxfer_pool = + pool_create(sizeof(struct vndxfer), 0, 0, 0, "swp vnx", 0, + NULL, NULL, 0); + if (vndxfer_pool == NULL) + panic("swapinit: pool_create failed"); + + vndbuf_pool = + pool_create(sizeof(struct vndbuf), 0, 0, 0, "swp vnd", 0, + NULL, NULL, 0); + if (vndbuf_pool == NULL) + panic("swapinit: pool_create failed"); + /* + * done! + */ + UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0); +} + +/* + * swaplist functions: functions that operate on the list of swap + * devices on the system. + */ + +/* + * swaplist_insert: insert swap device "sdp" into the global list + * + * => caller must hold both swap_syscall_lock and swap_data_lock + * => caller must provide a newly malloc'd swappri structure (we will + * FREE it if we don't need it... this it to prevent malloc blocking + * here while adding swap) + */ +#ifdef notyet /* used by swapctl */ +static void +swaplist_insert(sdp, newspp, priority) + struct swapdev *sdp; + struct swappri *newspp; + int priority; +{ + struct swappri *spp, *pspp; + UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist); + + /* + * find entry at or after which to insert the new device. + */ + for (pspp = NULL, spp = swap_priority.lh_first; spp != NULL; + spp = spp->spi_swappri.le_next) { + if (priority <= spp->spi_priority) + break; + pspp = spp; + } + + /* + * new priority? + */ + if (spp == NULL || spp->spi_priority != priority) { + spp = newspp; /* use newspp! */ + UVMHIST_LOG(pdhist, "created new swappri = %d", priority, 0, 0, 0); + + spp->spi_priority = priority; + CIRCLEQ_INIT(&spp->spi_swapdev); + + if (pspp) + LIST_INSERT_AFTER(pspp, spp, spi_swappri); + else + LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri); + } else { + /* we don't need a new priority structure, free it */ + FREE(newspp, M_VMSWAP); + } + + /* + * priority found (or created). now insert on the priority's + * circleq list and bump the total number of swapdevs. + */ + sdp->swd_priority = priority; + CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); + uvmexp.nswapdev++; + + /* + * done! + */ +} +#endif + +#ifdef notyet /* used by swapctl */ +/* + * swaplist_find: find and optionally remove a swap device from the + * global list. + * + * => caller must hold both swap_syscall_lock and swap_data_lock + * => we return the swapdev we found (and removed) + */ +static struct swapdev * +swaplist_find(vp, remove) + struct vnode *vp; + boolean_t remove; +{ + struct swapdev *sdp; + struct swappri *spp; + + /* + * search the lists for the requested vp + */ + for (spp = swap_priority.lh_first; spp != NULL; + spp = spp->spi_swappri.le_next) { + for (sdp = spp->spi_swapdev.cqh_first; + sdp != (void *)&spp->spi_swapdev; + sdp = sdp->swd_next.cqe_next) + if (sdp->swd_vp == vp) { + if (remove) { + CIRCLEQ_REMOVE(&spp->spi_swapdev, + sdp, swd_next); + uvmexp.nswapdev--; + } + return(sdp); + } + } + return (NULL); +} + + +/* + * swaplist_trim: scan priority list for empty priority entries and kill + * them. + * + * => caller must hold both swap_syscall_lock and swap_data_lock + */ +static void +swaplist_trim() +{ + struct swappri *spp, *nextspp; + + for (spp = swap_priority.lh_first; spp != NULL; spp = nextspp) { + nextspp = spp->spi_swappri.le_next; + if (spp->spi_swapdev.cqh_first != (void *)&spp->spi_swapdev) + continue; + LIST_REMOVE(spp, spi_swappri); + free((caddr_t)spp, M_VMSWAP); + } +} + +/* + * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area. 
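 *
 * (illustration, not from the imported source; the exact offsets
 * are assumptions) with a freshly created swapmap extent, which
 * starts at block 1, two devices added in turn would typically end
 * up laid out as:
 *
 *      swapdev A (1000 pages): swd_drumoffset = 1,    swd_drumsize = 1000
 *      swapdev B ( 500 pages): swd_drumoffset = 1001, swd_drumsize = 500
 *
 * swapdrum_getsdp() later inverts the mapping by range-checking a
 * drum page number against each device's
 * [swd_drumoffset, swd_drumoffset + swd_drumsize) window.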
+ * + * => caller must hold swap_syscall_lock + * => swap_data_lock should be unlocked (we may sleep) + */ +static void +swapdrum_add(sdp, npages) + struct swapdev *sdp; + int npages; +{ + u_long result; + + if (extent_alloc(swapmap, npages, EX_NOALIGN, EX_NOBOUNDARY, + EX_WAITOK, &result)) + panic("swapdrum_add"); + + sdp->swd_drumoffset = result; + sdp->swd_drumsize = npages; +} +#endif + +/* + * swapdrum_getsdp: given a page offset in /dev/drum, convert it back + * to the "swapdev" that maps that section of the drum. + * + * => each swapdev takes one big contig chunk of the drum + * => caller must hold swap_data_lock + */ +static struct swapdev * +swapdrum_getsdp(pgno) + int pgno; +{ + struct swapdev *sdp; + struct swappri *spp; + + for (spp = swap_priority.lh_first; spp != NULL; + spp = spp->spi_swappri.le_next) + for (sdp = spp->spi_swapdev.cqh_first; + sdp != (void *)&spp->spi_swapdev; + sdp = sdp->swd_next.cqe_next) + if (pgno >= sdp->swd_drumoffset && + pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) { + return sdp; + } + return NULL; +} + + +/*XXX + *XXX + *XXX*/ +int +sys_swapon(p, v, retval) + struct proc *p; + void *v; + register_t *retval; +{ + return EINVAL; +} + +#ifdef notyet /* XXXXXXXXXXXXXXXX (it has other bugs beside the fact that I don't want to change syscalls.master) */ +/* + * sys_swapctl: main entry point for swapctl(2) system call + * [with two helper functions: swap_on and swap_off] + */ +int +sys_swapctl(p, v, retval) + struct proc *p; + void *v; + register_t *retval; +{ + struct sys_swapctl_args /* { + syscallarg(int) cmd; + syscallarg(void *) arg; + syscallarg(int) misc; + } */ *uap = (struct sys_swapctl_args *)v; + struct vnode *vp; + struct nameidata nd; + struct swappri *spp; + struct swapdev *sdp; + struct swapent *sep; + char userpath[PATH_MAX + 1]; + size_t len; + int count, error, misc; + int priority; + UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist); + + misc = SCARG(uap, misc); + + /* + * ensure serialized syscall access by grabbing the swap_syscall_lock + */ + lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, (void *)0, p); + + /* + * we handle the non-priv NSWAP and STATS request first. + * + * SWAP_NSWAP: return number of config'd swap devices + * [can also be obtained with uvmexp sysctl] + */ + if (SCARG(uap, cmd) == SWAP_NSWAP) { + UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev, + 0, 0, 0); + *retval = uvmexp.nswapdev; + error = 0; + goto out; + } + + /* + * SWAP_STATS: get stats on current # of configured swap devs + * + * note that the swap_priority list can't change as long + * as we are holding the swap_syscall_lock. we don't want + * to grab the swap_data_lock because we may fault&sleep during + * copyout() and we don't want to be holding that lock then! + */ + if (SCARG(uap, cmd) == SWAP_STATS +#if defined(COMPAT_13) + || SCARG(uap, cmd) == SWAP_OSTATS +#endif + ) { + sep = (struct swapent *)SCARG(uap, arg); + count = 0; + + for (spp = swap_priority.lh_first; spp != NULL; + spp = spp->spi_swappri.le_next) { + for (sdp = spp->spi_swapdev.cqh_first; + sdp != (void *)&spp->spi_swapdev && misc-- > 0; + sdp = sdp->swd_next.cqe_next) { + /* + * backwards compatibility for system call. + * note that we use 'struct oswapent' as an + * overlay into both 'struct swapdev' and + * the userland 'struct swapent', as we + * want to retain backwards compatibility + * with NetBSD 1.3. 
+ */ + sdp->swd_ose.ose_inuse = + btodb(sdp->swd_npginuse << PAGE_SHIFT); + error = copyout((caddr_t)&sdp->swd_ose, + (caddr_t)sep, sizeof(struct oswapent)); + + /* now copy out the path if necessary */ +#if defined(COMPAT_13) + if (error == 0 && SCARG(uap, cmd) == SWAP_STATS) +#else + if (error == 0) +#endif + error = copyout((caddr_t)sdp->swd_path, + (caddr_t)&sep->se_path, + sdp->swd_pathlen); + + if (error) + goto out; + count++; +#if defined(COMPAT_13) + if (SCARG(uap, cmd) == SWAP_OSTATS) + ((struct oswapent *)sep)++; + else +#endif + sep++; + } + } + + UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0); + + *retval = count; + error = 0; + goto out; + } + + /* + * all other requests require superuser privs. verify. + */ + if ((error = suser(p->p_ucred, &p->p_acflag))) + goto out; + + /* + * at this point we expect a path name in arg. we will + * use namei() to gain a vnode reference (vref), and lock + * the vnode (VOP_LOCK). + * + * XXX: a NULL arg means use the root vnode pointer (e.g. for + * miniroot) + */ + if (SCARG(uap, arg) == NULL) { + vp = rootvp; /* miniroot */ + if (vget(vp, LK_EXCLUSIVE)) { + error = EBUSY; + goto out; + } + if (SCARG(uap, cmd) == SWAP_ON && + copystr("miniroot", userpath, sizeof userpath, &len)) + panic("swapctl: miniroot copy failed"); + } else { + int space; + char *where; + + if (SCARG(uap, cmd) == SWAP_ON) { + if ((error = copyinstr(SCARG(uap, arg), userpath, + sizeof userpath, &len))) + goto out; + space = UIO_SYSSPACE; + where = userpath; + } else { + space = UIO_USERSPACE; + where = (char *)SCARG(uap, arg); + } + NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, p); + if ((error = namei(&nd))) + goto out; + vp = nd.ni_vp; + } + /* note: "vp" is referenced and locked */ + + error = 0; /* assume no error */ + switch(SCARG(uap, cmd)) { + case SWAP_CTL: + /* + * get new priority, remove old entry (if any) and then + * reinsert it in the correct place. finally, prune out + * any empty priority structures. + */ + priority = SCARG(uap, misc); + spp = (struct swappri *) + malloc(sizeof *spp, M_VMSWAP, M_WAITOK); + simple_lock(&swap_data_lock); + if ((sdp = swaplist_find(vp, 1)) == NULL) { + error = ENOENT; + } else { + swaplist_insert(sdp, spp, priority); + swaplist_trim(); + } + simple_unlock(&swap_data_lock); + if (error) + free(spp, M_VMSWAP); + break; + + case SWAP_ON: + /* + * check for duplicates. if none found, then insert a + * dummy entry on the list to prevent someone else from + * trying to enable this device while we are working on + * it. + */ + priority = SCARG(uap, misc); + simple_lock(&swap_data_lock); + if ((sdp = swaplist_find(vp, 0)) != NULL) { + error = EBUSY; + simple_unlock(&swap_data_lock); + break; + } + sdp = (struct swapdev *) + malloc(sizeof *sdp, M_VMSWAP, M_WAITOK); + spp = (struct swappri *) + malloc(sizeof *spp, M_VMSWAP, M_WAITOK); + bzero(sdp, sizeof(*sdp)); + sdp->swd_flags = SWF_FAKE; /* placeholder only */ + sdp->swd_vp = vp; + sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV; +#ifdef SWAP_TO_FILES + /* + * XXX Is NFS elaboration necessary? + */ + if (vp->v_type == VREG) + sdp->swd_cred = crdup(p->p_ucred); +#endif + swaplist_insert(sdp, spp, priority); + simple_unlock(&swap_data_lock); + + sdp->swd_pathlen = len; + sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK); + if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0) + panic("swapctl: copystr"); + /* + * we've now got a FAKE placeholder in the swap list. + * now attempt to enable swap on it. 
if we fail, undo + * what we've done and kill the fake entry we just inserted. + * if swap_on is a success, it will clear the SWF_FAKE flag + */ + if ((error = swap_on(p, sdp)) != 0) { + simple_lock(&swap_data_lock); + (void) swaplist_find(vp, 1); /* kill fake entry */ + swaplist_trim(); + simple_unlock(&swap_data_lock); +#ifdef SWAP_TO_FILES + if (vp->v_type == VREG) + crfree(sdp->swd_cred); +#endif + free(sdp->swd_path, M_VMSWAP); + free((caddr_t)sdp, M_VMSWAP); + break; + } + + /* + * got it! now add a second reference to vp so that + * we keep a reference to the vnode after we return. + */ + vref(vp); + break; + + case SWAP_OFF: + UVMHIST_LOG(pdhist, "someone is using SWAP_OFF...??", 0,0,0,0); +#ifdef SWAP_OFF_WORKS + /* + * find the entry of interest and ensure it is enabled. + */ + simple_lock(&swap_data_lock); + if ((sdp = swaplist_find(vp, 0)) == NULL) { + simple_unlock(&swap_data_lock); + error = ENXIO; + break; + } + /* + * If a device isn't in use or enabled, we + * can't stop swapping from it (again). + */ + if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) { + simple_unlock(&swap_data_lock); + error = EBUSY; + break; + } + /* XXXCDC: should we call with list locked or unlocked? */ + if ((error = swap_off(p, sdp)) != 0) + break; + /* XXXCDC: might need relock here */ + + /* + * now we can kill the entry. + */ + if ((sdp = swaplist_find(vp, 1)) == NULL) { + error = ENXIO; + break; + } + simple_unlock(&swap_data_lock); + free((caddr_t)sdp, M_VMSWAP); +#else + error = EINVAL; +#endif + break; + + default: + UVMHIST_LOG(pdhist, "unhandled command: %#x", + SCARG(uap, cmd), 0, 0, 0); + error = EINVAL; + } + + /* + * done! use vput to drop our reference and unlock + */ + vput(vp); +out: + lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0, p); + + UVMHIST_LOG(pdhist, "<- done! error=%d", error, 0, 0, 0); + return (error); +} +#endif + + +/* + * swap_on: attempt to enable a swapdev for swapping. note that the + * swapdev is already on the global list, but disabled (marked + * SWF_FAKE). + * + * => we avoid the start of the disk (to protect disk labels) + * => we also avoid the miniroot, if we are swapping to root. + * => caller should leave swap_data_lock unlocked, we may lock it + * if needed. + */ +#ifdef notyet /* used by swapctl */ +static int +swap_on(p, sdp) + struct proc *p; + struct swapdev *sdp; +{ + static int count = 0; /* static */ + struct vnode *vp; + int error, npages, nblocks, size; + long addr; +#ifdef SWAP_TO_FILES + struct vattr va; +#endif +#ifdef NFS + extern int (**nfsv2_vnodeop_p) __P((void *)); +#endif /* NFS */ + dev_t dev; + char *name; + UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist); + + /* + * we want to enable swapping on sdp. the swd_vp contains + * the vnode we want (locked and ref'd), and the swd_dev + * contains the dev_t of the file, if it a block device. + */ + + vp = sdp->swd_vp; + dev = sdp->swd_dev; + + /* + * open the swap file (mostly useful for block device files to + * let device driver know what is up). + * + * we skip the open/close for root on swap because the root + * has already been opened when root was mounted (mountroot). + */ + if (vp != rootvp) { + if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p))) + return (error); + } + + /* XXX this only works for block devices */ + UVMHIST_LOG(pdhist, " dev=%d, major(dev)=%d", dev, major(dev), 0,0); + + /* + * we now need to determine the size of the swap area. for + * block specials we can call the d_psize function. + * for normal files, we must stat [get attrs]. 
+ * + * we put the result in nblks. + * for normal files, we also want the filesystem block size + * (which we get with statfs). + */ + switch (vp->v_type) { + case VBLK: + if (bdevsw[major(dev)].d_psize == 0 || + (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) { + error = ENXIO; + goto bad; + } + break; + +#ifdef SWAP_TO_FILES + case VREG: + if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p))) + goto bad; + nblocks = (int)btodb(va.va_size); + if ((error = + VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0) + goto bad; + + sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize; + /* + * limit the max # of outstanding I/O requests we issue + * at any one time. take it easy on NFS servers. + */ +#ifdef NFS + if (vp->v_op == nfsv2_vnodeop_p) + sdp->swd_maxactive = 2; /* XXX */ + else +#endif /* NFS */ + sdp->swd_maxactive = 8; /* XXX */ + break; +#endif + + default: + error = ENXIO; + goto bad; + } + + /* + * save nblocks in a safe place and convert to pages. + */ + + sdp->swd_ose.ose_nblks = nblocks; + npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT; + + /* + * for block special files, we want to make sure that leave + * the disklabel and bootblocks alone, so we arrange to skip + * over them (randomly choosing to skip PAGE_SIZE bytes). + * note that because of this the "size" can be less than the + * actual number of blocks on the device. + */ + if (vp->v_type == VBLK) { + /* we use pages 1 to (size - 1) [inclusive] */ + size = npages - 1; + addr = 1; + } else { + /* we use pages 0 to (size - 1) [inclusive] */ + size = npages; + addr = 0; + } + + /* + * make sure we have enough blocks for a reasonable sized swap + * area. we want at least one page. + */ + + if (size < 1) { + UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0); + error = EINVAL; + goto bad; + } + + UVMHIST_LOG(pdhist, " dev=%x: size=%d addr=%ld\n", dev, size, addr, 0); + + /* + * now we need to allocate an extent to manage this swap device + */ + name = malloc(12, M_VMSWAP, M_WAITOK); + sprintf(name, "swap0x%04x", count++); + + /* note that extent_create's 3rd arg is inclusive, thus "- 1" */ + sdp->swd_ex = extent_create(name, 0, npages - 1, M_VMSWAP, + 0, 0, EX_WAITOK); + /* allocate the `saved' region from the extent so it won't be used */ + if (addr) { + if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK)) + panic("disklabel region"); + sdp->swd_npginuse += addr; + uvmexp.swpginuse += addr; + } + + + /* + * if the vnode we are swapping to is the root vnode + * (i.e. we are swapping to the miniroot) then we want + * to make sure we don't overwrite it. do a statfs to + * find its size and skip over it. + */ + if (vp == rootvp) { + struct mount *mp; + struct statfs *sp; + int rootblocks, rootpages; + + mp = rootvnode->v_mount; + sp = &mp->mnt_stat; + rootblocks = sp->f_blocks * btodb(sp->f_bsize); + rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT; + if (rootpages > npages) + panic("swap_on: miniroot larger than swap?"); + + if (extent_alloc_region(sdp->swd_ex, addr, + rootpages, EX_WAITOK)) + panic("swap_on: unable to preserve miniroot"); + + sdp->swd_npginuse += (rootpages - addr); + uvmexp.swpginuse += (rootpages - addr); + + printf("Preserved %d pages of miniroot ", rootpages); + printf("leaving %d pages of swap\n", size - rootpages); + } + + /* + * now add the new swapdev to the drum and enable. 
+ */ + simple_lock(&swap_data_lock); + swapdrum_add(sdp, npages); + sdp->swd_npages = npages; + sdp->swd_flags &= ~SWF_FAKE; /* going live */ + sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE); + simple_unlock(&swap_data_lock); + uvmexp.swpages += npages; + + /* + * add anon's to reflect the swap space we added + */ + uvm_anon_add(size); + +#if 0 + /* + * At this point we could arrange to reserve memory for the + * swap buffer pools. + * + * I don't think this is necessary, since swapping starts well + * ahead of serious memory deprivation and the memory resource + * pools hold on to actively used memory. This should ensure + * we always have some resources to continue operation. + */ + + int s = splbio(); + int n = 8 * sdp->swd_maxactive; + + (void)pool_prime(swapbuf_pool, n, 0); + + if (vp->v_type == VREG) { + /* Allocate additional vnx and vnd buffers */ + /* + * Allocation Policy: + * (8 * swd_maxactive) vnx headers per swap dev + * (16 * swd_maxactive) vnd buffers per swap dev + */ + + n = 8 * sdp->swd_maxactive; + (void)pool_prime(vndxfer_pool, n, 0); + + n = 16 * sdp->swd_maxactive; + (void)pool_prime(vndbuf_pool, n, 0); + } + splx(s); +#endif + + return (0); + +bad: + /* + * failure: close device if necessary and return error. + */ + if (vp != rootvp) + (void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p); + return (error); +} +#endif + +#ifdef SWAP_OFF_WORKS +/* + * swap_off: stop swapping on swapdev + * + * XXXCDC: what conditions go here? + */ +static int +swap_off(p, sdp) + struct proc *p; + struct swapdev *sdp; +{ + char *name; + UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist); + + /* turn off the enable flag */ + sdp->swd_flags &= ~SWF_ENABLE; + + UVMHIST_LOG(pdhist, " dev=%x", sdp->swd_dev); + + /* + * XXX write me + * + * the idea is to find out which processes are using this swap + * device, and page them all in. + * + * eventually, we should try to move them out to other swap areas + * if available. + * + * The alternative is to create a redirection map for this swap + * device. This should work by moving all the pages of data from + * the ex-swap device to another one, and making an entry in the + * redirection map for it. locking is going to be important for + * this! 
+ * + * XXXCDC: also need to shrink anon pool + */ + + /* until the above code is written, we must ENODEV */ + return ENODEV; + + extent_free(swapmap, sdp->swd_mapoffset, sdp->swd_mapsize, EX_WAITOK); + name = sdp->swd_ex->ex_name; + extent_destroy(sdp->swd_ex); + free(name, M_VMSWAP); + free((caddr_t)sdp->swd_ex, M_VMSWAP); + if (sdp->swp_vp != rootvp) + (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p); + if (sdp->swd_vp) + vrele(sdp->swd_vp); + free((caddr_t)sdp, M_VMSWAP); + return (0); +} +#endif + +/* + * /dev/drum interface and i/o functions + */ + +/* + * swread: the read function for the drum (just a call to physio) + */ +/*ARGSUSED*/ +int +swread(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist); + + UVMHIST_LOG(pdhist, " dev=%x offset=%qx", dev, uio->uio_offset, 0, 0); + return (physio(swstrategy, NULL, dev, B_READ, minphys, uio)); +} + +/* + * swwrite: the write function for the drum (just a call to physio) + */ +/*ARGSUSED*/ +int +swwrite(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist); + + UVMHIST_LOG(pdhist, " dev=%x offset=%qx", dev, uio->uio_offset, 0, 0); + return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio)); +} + +/* + * swstrategy: perform I/O on the drum + * + * => we must map the i/o request from the drum to the correct swapdev. + */ +void +swstrategy(bp) + struct buf *bp; +{ + struct swapdev *sdp; + struct vnode *vp; + int pageno; + int bn; + UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist); + + /* + * convert block number to swapdev. note that swapdev can't + * be yanked out from under us because we are holding resources + * in it (i.e. the blocks we are doing I/O on). + */ + pageno = dbtob(bp->b_blkno) >> PAGE_SHIFT; + simple_lock(&swap_data_lock); + sdp = swapdrum_getsdp(pageno); + simple_unlock(&swap_data_lock); + if (sdp == NULL) { + bp->b_error = EINVAL; + bp->b_flags |= B_ERROR; + biodone(bp); + UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0); + return; + } + + /* + * convert drum page number to block number on this swapdev. + */ + + pageno = pageno - sdp->swd_drumoffset; /* page # on swapdev */ + bn = btodb(pageno << PAGE_SHIFT); /* convert to diskblock */ + + UVMHIST_LOG(pdhist, " %s: mapoff=%x bn=%x bcount=%ld\n", + ((bp->b_flags & B_READ) == 0) ? "write" : "read", + sdp->swd_drumoffset, bn, bp->b_bcount); + + + /* + * for block devices we finish up here. + * for regular files we have to do more work which we deligate + * to sw_reg_strategy(). + */ + + switch (sdp->swd_vp->v_type) { + default: + panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type); + case VBLK: + + /* + * must convert "bp" from an I/O on /dev/drum to an I/O + * on the swapdev (sdp). + */ + bp->b_blkno = bn; /* swapdev block number */ + vp = sdp->swd_vp; /* swapdev vnode pointer */ + bp->b_dev = sdp->swd_dev; /* swapdev dev_t */ + VHOLD(vp); /* "hold" swapdev vp for i/o */ + + /* + * if we are doing a write, we have to redirect the i/o on + * drum's v_numoutput counter to the swapdevs. 
+ */ + if ((bp->b_flags & B_READ) == 0) { + int s = splbio(); + vwakeup(bp); /* kills one 'v_numoutput' on drum */ + vp->v_numoutput++; /* put it on swapdev */ + splx(s); + } + + /* + * disassociate buffer with /dev/drum vnode + * [could be null if buf was from physio] + */ + if (bp->b_vp != NULLVP) + brelvp(bp); + + /* + * finally plug in swapdev vnode and start I/O + */ + bp->b_vp = vp; + VOP_STRATEGY(bp); + return; +#ifdef SWAP_TO_FILES + case VREG: + /* + * delegate to sw_reg_strategy function. + */ + sw_reg_strategy(sdp, bp, bn); + return; +#endif + } + /* NOTREACHED */ +} + +#ifdef SWAP_TO_FILES +/* + * sw_reg_strategy: handle swap i/o to regular files + */ +static void +sw_reg_strategy(sdp, bp, bn) + struct swapdev *sdp; + struct buf *bp; + int bn; +{ + struct vnode *vp; + struct vndxfer *vnx; + daddr_t nbn, byteoff; + caddr_t addr; + int s, off, nra, error, sz, resid; + UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist); + + /* + * allocate a vndxfer head for this transfer and point it to + * our buffer. + */ + getvndxfer(vnx); + vnx->vx_flags = VX_BUSY; + vnx->vx_error = 0; + vnx->vx_pending = 0; + vnx->vx_bp = bp; + vnx->vx_sdp = sdp; + + /* + * setup for main loop where we read filesystem blocks into + * our buffer. + */ + error = 0; + bp->b_resid = bp->b_bcount; /* nothing transferred yet! */ + addr = bp->b_data; /* current position in buffer */ + byteoff = dbtob(bn); + + for (resid = bp->b_resid; resid; resid -= sz) { + struct vndbuf *nbp; + + /* + * translate byteoffset into block number. return values: + * vp = vnode of underlying device + * nbn = new block number (on underlying vnode dev) + * nra = num blocks we can read-ahead (excludes requested + * block) + */ + nra = 0; + error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize, + &vp, &nbn, &nra); + + if (error == 0 && (long)nbn == -1) { + /* + * this used to just set error, but that doesn't + * do the right thing. Instead, it causes random + * memory errors. The panic() should remain until + * this condition doesn't destabilize the system. + */ +#if 1 + panic("sw_reg_strategy: swap to sparse file"); +#else + error = EIO; /* failure */ +#endif + } + + /* + * punt if there was an error or a hole in the file. + * we must wait for any i/o ops we have already started + * to finish before returning. + * + * XXX we could deal with holes here but it would be + * a hassle (in the write case). + */ + if (error) { + s = splbio(); + vnx->vx_error = error; /* pass error up */ + goto out; + } + + /* + * compute the size ("sz") of this transfer (in bytes). + * XXXCDC: ignores read-ahead for non-zero offset + */ + if ((off = (byteoff % sdp->swd_bsize)) != 0) + sz = sdp->swd_bsize - off; + else + sz = (1 + nra) * sdp->swd_bsize; + + if (resid < sz) + sz = resid; + + UVMHIST_LOG(pdhist, "sw_reg_strategy: vp %p/%p offset 0x%x/0x%x", + sdp->swd_vp, vp, byteoff, nbn); + + /* + * now get a buf structure. note that the vb_buf is + * at the front of the nbp structure so that you can + * cast pointers between the two structures easily. + */ + getvndbuf(nbp); + nbp->vb_buf.b_flags = bp->b_flags | B_CALL; + nbp->vb_buf.b_bcount = sz; +#if 0 + nbp->vb_buf.b_bufsize = bp->b_bufsize; /* XXXCDC: really? 
*/ +#endif + nbp->vb_buf.b_bufsize = sz; + nbp->vb_buf.b_error = 0; + nbp->vb_buf.b_data = addr; + nbp->vb_buf.b_blkno = nbn + btodb(off); + nbp->vb_buf.b_proc = bp->b_proc; + nbp->vb_buf.b_iodone = sw_reg_iodone; + nbp->vb_buf.b_vp = NULLVP; + nbp->vb_buf.b_vnbufs.le_next = NOLIST; + nbp->vb_buf.b_rcred = sdp->swd_cred; + nbp->vb_buf.b_wcred = sdp->swd_cred; + + /* + * set b_dirtyoff/end and b_validoff/end. this is + * required by the NFS client code (otherwise it will + * just discard our I/O request). + */ + if (bp->b_dirtyend == 0) { + nbp->vb_buf.b_dirtyoff = 0; + nbp->vb_buf.b_dirtyend = sz; + } else { + nbp->vb_buf.b_dirtyoff = + max(0, bp->b_dirtyoff - (bp->b_bcount-resid)); + nbp->vb_buf.b_dirtyend = + min(sz, + max(0, bp->b_dirtyend - (bp->b_bcount-resid))); + } + if (bp->b_validend == 0) { + nbp->vb_buf.b_validoff = 0; + nbp->vb_buf.b_validend = sz; + } else { + nbp->vb_buf.b_validoff = + max(0, bp->b_validoff - (bp->b_bcount-resid)); + nbp->vb_buf.b_validend = + min(sz, + max(0, bp->b_validend - (bp->b_bcount-resid))); + } + + nbp->vb_xfer = vnx; /* patch it back in to vnx */ + + /* + * Just sort by block number + */ + nbp->vb_buf.b_cylinder = nbp->vb_buf.b_blkno; + s = splbio(); + if (vnx->vx_error != 0) { + putvndbuf(nbp); + goto out; + } + vnx->vx_pending++; + + /* assoc new buffer with underlying vnode */ + bgetvp(vp, &nbp->vb_buf); + + /* sort it in and start I/O if we are not over our limit */ + disksort(&sdp->swd_tab, &nbp->vb_buf); + sw_reg_start(sdp); + splx(s); + + /* + * advance to the next I/O + */ + byteoff += sz; + addr += sz; + } + + s = splbio(); + +out: /* Arrive here at splbio */ + vnx->vx_flags &= ~VX_BUSY; + if (vnx->vx_pending == 0) { + if (vnx->vx_error != 0) { + bp->b_error = vnx->vx_error; + bp->b_flags |= B_ERROR; + } + putvndxfer(vnx); + biodone(bp); + } + splx(s); +} + +/* + * sw_reg_start: start an I/O request on the requested swapdev + * + * => reqs are sorted by disksort (above) + */ +static void +sw_reg_start(sdp) + struct swapdev *sdp; +{ + struct buf *bp; + UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist); + + /* recursion control */ + if ((sdp->swd_flags & SWF_BUSY) != 0) + return; + + sdp->swd_flags |= SWF_BUSY; + + while (sdp->swd_tab.b_active < sdp->swd_maxactive) { + bp = sdp->swd_tab.b_actf; + if (bp == NULL) + break; + sdp->swd_tab.b_actf = bp->b_actf; + sdp->swd_tab.b_active++; + + UVMHIST_LOG(pdhist, + "sw_reg_start: bp %p vp %p blkno %p cnt %lx", + bp, bp->b_vp, bp->b_blkno, bp->b_bcount); + if ((bp->b_flags & B_READ) == 0) + bp->b_vp->v_numoutput++; + VOP_STRATEGY(bp); + } + sdp->swd_flags &= ~SWF_BUSY; +} + +/* + * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup + * + * => note that we can recover the vndbuf struct by casting the buf ptr + */ +static void +sw_reg_iodone(bp) + struct buf *bp; +{ + struct vndbuf *vbp = (struct vndbuf *) bp; + struct vndxfer *vnx = vbp->vb_xfer; + struct buf *pbp = vnx->vx_bp; /* parent buffer */ + struct swapdev *sdp = vnx->vx_sdp; + int s, resid; + UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist); + + UVMHIST_LOG(pdhist, " vbp=%p vp=%p blkno=%x addr=%p", + vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data); + UVMHIST_LOG(pdhist, " cnt=%lx resid=%lx", + vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0); + + /* + * protect vbp at splbio and update. 
+ */ + + s = splbio(); + resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid; + pbp->b_resid -= resid; + vnx->vx_pending--; + + if (vbp->vb_buf.b_error) { + UVMHIST_LOG(pdhist, " got error=%d !", + vbp->vb_buf.b_error, 0, 0, 0); + + /* pass error upward */ + vnx->vx_error = vbp->vb_buf.b_error; + } + + /* + * drop "hold" reference to vnode (if one) + * XXXCDC: always set to NULLVP, this is useless, right? + */ + if (vbp->vb_buf.b_vp != NULLVP) + brelvp(&vbp->vb_buf); + + /* + * kill vbp structure + */ + putvndbuf(vbp); + + /* + * wrap up this transaction if it has run to completion or, in + * case of an error, when all auxiliary buffers have returned. + */ + if (vnx->vx_error != 0) { + /* pass error upward */ + pbp->b_flags |= B_ERROR; + pbp->b_error = vnx->vx_error; + if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) { + putvndxfer(vnx); + biodone(pbp); + } + } else if (pbp->b_resid == 0) { +#ifdef DIAGNOSTIC + if (vnx->vx_pending != 0) + panic("sw_reg_iodone: vnx pending: %d",vnx->vx_pending); +#endif + + if ((vnx->vx_flags & VX_BUSY) == 0) { + UVMHIST_LOG(pdhist, " iodone error=%d !", + pbp, vnx->vx_error, 0, 0); + putvndxfer(vnx); + biodone(pbp); + } + } + + /* + * done! start next swapdev I/O if one is pending + */ + sdp->swd_tab.b_active--; + sw_reg_start(sdp); + + splx(s); +} +#endif /* SWAP_TO_FILES */ + + +/* + * uvm_swap_alloc: allocate space on swap + * + * => allocation is done "round robin" down the priority list, as we + * allocate in a priority we "rotate" the circle queue. + * => space can be freed with uvm_swap_free + * => we return the page slot number in /dev/drum (0 == invalid slot) + * => we lock swap_data_lock + * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM + */ +int +uvm_swap_alloc(nslots, lessok) + int *nslots; /* IN/OUT */ + boolean_t lessok; +{ + struct swapdev *sdp; + struct swappri *spp; + u_long result; + UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist); + + /* + * no swap devices configured yet? definite failure. + */ + if (uvmexp.nswapdev < 1) + return 0; + + /* + * lock data lock, convert slots into blocks, and enter loop + */ + simple_lock(&swap_data_lock); + +ReTry: /* XXXMRG */ + for (spp = swap_priority.lh_first; spp != NULL; + spp = spp->spi_swappri.le_next) { + for (sdp = spp->spi_swapdev.cqh_first; + sdp != (void *)&spp->spi_swapdev; + sdp = sdp->swd_next.cqe_next) { + /* if it's not enabled, then we can't swap from it */ + if ((sdp->swd_flags & SWF_ENABLE) == 0) + continue; + if (sdp->swd_npginuse + *nslots > sdp->swd_npages) + continue; + if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN, + EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT, + &result) != 0) { + continue; + } + + /* + * successful allocation! now rotate the circleq. + */ + CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); + CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); + sdp->swd_npginuse += *nslots; + uvmexp.swpginuse += *nslots; + simple_unlock(&swap_data_lock); + /* done! return drum slot number */ + UVMHIST_LOG(pdhist, + "success! 
returning %d slots starting at %d", + *nslots, result + sdp->swd_drumoffset, 0, 0); +#if 0 +{ + struct swapdev *sdp2; + + sdp2 = swapdrum_getsdp(result + sdp->swd_drumoffset); + if (sdp2 == NULL) { +printf("uvm_swap_alloc: nslots=%d, dev=%x, drumoff=%d, result=%ld", + *nslots, sdp->swd_dev, sdp->swd_drumoffset, result); +panic("uvm_swap_alloc: allocating unmapped swap block!"); + } +} +#endif + return(result + sdp->swd_drumoffset); + } + } + + /* XXXMRG: BEGIN HACK */ + if (*nslots > 1 && lessok) { + *nslots = 1; + goto ReTry; /* XXXMRG: ugh! extent should support this for us */ + } + /* XXXMRG: END HACK */ + + simple_unlock(&swap_data_lock); + return 0; /* failed */ +} + +/* + * uvm_swap_free: free swap slots + * + * => this can be all or part of an allocation made by uvm_swap_alloc + * => we lock swap_data_lock + */ +void +uvm_swap_free(startslot, nslots) + int startslot; + int nslots; +{ + struct swapdev *sdp; + UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist); + + UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots, + startslot, 0, 0); + /* + * convert drum slot offset back to sdp, free the blocks + * in the extent, and return. must hold pri lock to do + * lookup and access the extent. + */ + simple_lock(&swap_data_lock); + sdp = swapdrum_getsdp(startslot); + +#ifdef DIAGNOSTIC + if (uvmexp.nswapdev < 1) + panic("uvm_swap_free: uvmexp.nswapdev < 1\n"); + if (sdp == NULL) { + printf("uvm_swap_free: startslot %d, nslots %d\n", startslot, + nslots); + panic("uvm_swap_free: unmapped address\n"); + } +#endif + if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots, + EX_MALLOCOK|EX_NOWAIT) != 0) + printf("warning: resource shortage: %d slots of swap lost\n", + nslots); + + sdp->swd_npginuse -= nslots; + uvmexp.swpginuse -= nslots; +#ifdef DIAGNOSTIC + if (sdp->swd_npginuse < 0) + panic("uvm_swap_free: inuse < 0"); +#endif + simple_unlock(&swap_data_lock); +} + +/* + * uvm_swap_put: put any number of pages into a contig place on swap + * + * => can be sync or async + * => XXXMRG: consider making it an inline or macro + */ +int +uvm_swap_put(swslot, ppsp, npages, flags) + int swslot; + struct vm_page **ppsp; + int npages; + int flags; +{ + int result; + +#if 0 + flags |= PGO_SYNCIO; /* XXXMRG: tmp, force sync */ +#endif + + result = uvm_swap_io(ppsp, swslot, npages, B_WRITE | + ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); + + return (result); +} + +/* + * uvm_swap_get: get a single page from swap + * + * => usually a sync op (from fault) + * => XXXMRG: consider making it an inline or macro + */ +int +uvm_swap_get(page, swslot, flags) + struct vm_page *page; + int swslot, flags; +{ + int result; + + uvmexp.nswget++; +#ifdef DIAGNOSTIC + if ((flags & PGO_SYNCIO) == 0) + printf("uvm_swap_get: ASYNC get requested?\n"); +#endif + + result = uvm_swap_io(&page, swslot, 1, B_READ | + ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); + + return (result); +} + +/* + * uvm_swap_io: do an i/o operation to swap + */ + +static int +uvm_swap_io(pps, startslot, npages, flags) + struct vm_page **pps; + int startslot, npages, flags; +{ + daddr_t startblk; + struct swapbuf *sbp; + struct buf *bp; + vaddr_t kva; + int result, s, waitf, pflag; + UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist); + + UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d", + startslot, npages, flags, 0); + /* + * convert starting drum slot to block number + */ + startblk = btodb(startslot << PAGE_SHIFT); + + /* + * first, map the pages into the kernel (XXX: currently required + * by buffer system). 
note that we don't let pagermapin alloc + * an aiodesc structure because we don't want to chance a malloc. + * we've got our own pool of aiodesc structures (in swapbuf). + */ + waitf = (flags & B_ASYNC) ? M_NOWAIT : M_WAITOK; + kva = uvm_pagermapin(pps, npages, NULL, waitf); + if (kva == NULL) + return (VM_PAGER_AGAIN); + + /* + * now allocate a swap buffer off of freesbufs + * [make sure we don't put the pagedaemon to sleep...] + */ + s = splbio(); + pflag = ((flags & B_ASYNC) != 0 || curproc == uvm.pagedaemon_proc) + ? 0 + : PR_WAITOK; + sbp = pool_get(swapbuf_pool, pflag); + splx(s); /* drop splbio */ + + /* + * if we failed to get a swapbuf, return "try again" + */ + if (sbp == NULL) + return (VM_PAGER_AGAIN); + + /* + * fill in the bp/sbp. we currently route our i/o through + * /dev/drum's vnode [swapdev_vp]. + */ + bp = &sbp->sw_buf; + bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC)); + bp->b_proc = &proc0; /* XXX */ + bp->b_rcred = bp->b_wcred = proc0.p_ucred; + bp->b_vnbufs.le_next = NOLIST; + bp->b_data = (caddr_t)kva; + bp->b_blkno = startblk; + VHOLD(swapdev_vp); + bp->b_vp = swapdev_vp; + /* XXXCDC: isn't swapdev_vp always a VCHR? */ + /* XXXMRG: probably -- this is obviously something inherited... */ + if (swapdev_vp->v_type == VBLK) + bp->b_dev = swapdev_vp->v_rdev; + bp->b_bcount = npages << PAGE_SHIFT; + + /* + * for pageouts we must set "dirtyoff" [NFS client code needs it]. + * and we bump v_numoutput (counter of number of active outputs). + */ + if ((bp->b_flags & B_READ) == 0) { + bp->b_dirtyoff = 0; + bp->b_dirtyend = npages << PAGE_SHIFT; + s = splbio(); + swapdev_vp->v_numoutput++; + splx(s); + } + + /* + * for async ops we must set up the aiodesc and setup the callback + * XXX: we expect no async-reads, but we don't prevent it here. + */ + if (flags & B_ASYNC) { + sbp->sw_aio.aiodone = uvm_swap_aiodone; + sbp->sw_aio.kva = kva; + sbp->sw_aio.npages = npages; + sbp->sw_aio.pd_ptr = sbp; /* backpointer */ + bp->b_flags |= B_CALL; /* set callback */ + bp->b_iodone = uvm_swap_bufdone;/* "buf" iodone function */ + UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0); + } + UVMHIST_LOG(pdhist, + "about to start io: data = 0x%p blkno = 0x%x, bcount = %ld", + bp->b_data, bp->b_blkno, bp->b_bcount, 0); + + /* + * now we start the I/O, and if async, return. + */ + VOP_STRATEGY(bp); + if (flags & B_ASYNC) + return (VM_PAGER_PEND); + + /* + * must be sync i/o. wait for it to finish + */ + bp->b_error = biowait(bp); + result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK; + + /* + * kill the pager mapping + */ + uvm_pagermapout(kva, npages); + + /* + * now dispose of the swap buffer + */ + s = splbio(); + bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY|B_NOCACHE); + if (bp->b_vp) + brelvp(bp); + + pool_put(swapbuf_pool, sbp); + splx(s); + + /* + * finally return. + */ + UVMHIST_LOG(pdhist, "<- done (sync) result=%d", result, 0, 0, 0); + return (result); +} + +/* + * uvm_swap_bufdone: called from the buffer system when the i/o is done + */ +static void +uvm_swap_bufdone(bp) + struct buf *bp; +{ + struct swapbuf *sbp = (struct swapbuf *) bp; + int s = splbio(); + UVMHIST_FUNC("uvm_swap_bufdone"); UVMHIST_CALLED(pdhist); + + UVMHIST_LOG(pdhist, "cleaning buf %p", buf, 0, 0, 0); +#ifdef DIAGNOSTIC + /* + * sanity check: swapbufs are private, so they shouldn't be wanted + */ + if (bp->b_flags & B_WANTED) + panic("uvm_swap_bufdone: private buf wanted"); +#endif + + /* + * drop buffers reference to the vnode and its flags. 
+ */ + bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY|B_NOCACHE); + if (bp->b_vp) + brelvp(bp); + + /* + * now put the aio on the uvm.aio_done list and wake the + * pagedaemon (which will finish up our job in its context). + */ + simple_lock(&uvm.pagedaemon_lock); /* locks uvm.aio_done */ + TAILQ_INSERT_TAIL(&uvm.aio_done, &sbp->sw_aio, aioq); + simple_unlock(&uvm.pagedaemon_lock); + + thread_wakeup(&uvm.pagedaemon); + splx(s); +} + +/* + * uvm_swap_aiodone: aiodone function for anonymous memory + * + * => this is called in the context of the pagedaemon (but with the + * page queues unlocked!) + * => our "aio" structure must be part of a "swapbuf" + */ +static void +uvm_swap_aiodone(aio) + struct uvm_aiodesc *aio; +{ + struct swapbuf *sbp = aio->pd_ptr; + struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT]; + int lcv, s; + vaddr_t addr; + UVMHIST_FUNC("uvm_swap_aiodone"); UVMHIST_CALLED(pdhist); + + UVMHIST_LOG(pdhist, "done with aio %p", aio, 0, 0, 0); +#ifdef DIAGNOSTIC + /* + * sanity check + */ + if (aio->npages > (MAXBSIZE >> PAGE_SHIFT)) + panic("uvm_swap_aiodone: aio too big!"); +#endif + + /* + * first, we have to recover the page pointers (pps) by poking in the + * kernel pmap (XXX: should be saved in the buf structure). + */ + for (addr = aio->kva, lcv = 0 ; lcv < aio->npages ; + addr += PAGE_SIZE, lcv++) { + pps[lcv] = uvm_pageratop(addr); + } + + /* + * now we can dispose of the kernel mappings of the buffer + */ + uvm_pagermapout(aio->kva, aio->npages); + + /* + * now we can dispose of the pages by using the dropcluster function + * [note that we have no "page of interest" so we pass in null] + */ + uvm_pager_dropcluster(NULL, NULL, pps, &aio->npages, + PGO_PDFREECLUST, 0); + + /* + * finally, we can dispose of the swapbuf + */ + s = splbio(); + pool_put(swapbuf_pool, sbp); + splx(s); + + /* + * done! + */ +} diff --git a/sys/uvm/uvm_swap.h b/sys/uvm/uvm_swap.h new file mode 100644 index 00000000000..008db98b241 --- /dev/null +++ b/sys/uvm/uvm_swap.h @@ -0,0 +1,42 @@ +/* $NetBSD: uvm_swap.h,v 1.3 1998/02/07 11:09:48 mrg Exp $ */ + +/* + * Copyright (c) 1997 Matthew R. Green + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Id: uvm_swap.h,v 1.1.2.6 1997/12/15 05:39:31 mrg Exp + */ + +#ifndef _UVM_UVM_SWAP_H_ +#define _UVM_UVM_SWAP_H_ + +int uvm_swap_get __P((struct vm_page *, int, int)); +int uvm_swap_put __P((int, struct vm_page **, int, + int)); +int uvm_swap_alloc __P((int *wanted, boolean_t lessok)); +void uvm_swap_free __P((int startslot, int nslots)); + +#endif /* _UVM_UVM_SWAP_H_ */ diff --git a/sys/uvm/uvm_unix.c b/sys/uvm/uvm_unix.c new file mode 100644 index 00000000000..ed1588491cc --- /dev/null +++ b/sys/uvm/uvm_unix.c @@ -0,0 +1,258 @@ +/* $NetBSD: uvm_unix.c,v 1.7 1998/10/11 23:18:21 chuck Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * Copyright (c) 1991, 1993 The Regents of the University of California. + * Copyright (c) 1988 University of Utah. + * + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor, + * Washington University, the University of California, Berkeley and + * its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: Utah $Hdr: vm_unix.c 1.1 89/11/07$ + * @(#)vm_unix.c 8.1 (Berkeley) 6/11/93 + * from: Id: uvm_unix.c,v 1.1.2.2 1997/08/25 18:52:30 chuck Exp + */ + +/* + * uvm_unix.c: traditional sbrk/grow interface to vm. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/vnode.h> +#include <sys/core.h> + +#include <sys/mount.h> +#include <sys/syscallargs.h> + +#include <vm/vm.h> +#include <uvm/uvm.h> + + +/* + * sys_obreak: set break + */ + +int +sys_obreak(p, v, retval) + struct proc *p; + void *v; + register_t *retval; +{ + struct sys_obreak_args /* { + syscallarg(char *) nsize; + } */ *uap = v; + register struct vmspace *vm = p->p_vmspace; + vaddr_t new, old; + int rv; + register int diff; + + old = (vaddr_t)vm->vm_daddr; + new = round_page(SCARG(uap, nsize)); + if ((int)(new - old) > p->p_rlimit[RLIMIT_DATA].rlim_cur) + return(ENOMEM); + + old = round_page(old + ctob(vm->vm_dsize)); + diff = new - old; + + /* + * grow or shrink? + */ + + if (diff > 0) { + + rv = uvm_map(&vm->vm_map, &old, diff, NULL, UVM_UNKNOWN_OFFSET, + UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_COPY, + UVM_ADV_NORMAL, UVM_FLAG_AMAPPAD|UVM_FLAG_FIXED| + UVM_FLAG_OVERLAY|UVM_FLAG_COPYONW)); + + if (rv != KERN_SUCCESS) { + uprintf("sbrk: grow failed, return = %d\n", rv); + return(ENOMEM); + } + vm->vm_dsize += btoc(diff); + + } else if (diff < 0) { + + diff = -diff; + rv = uvm_deallocate(&vm->vm_map, new, diff); + if (rv != KERN_SUCCESS) { + uprintf("sbrk: shrink failed, return = %d\n", rv); + return(ENOMEM); + } + vm->vm_dsize -= btoc(diff); + + } + return(0); +} + +/* + * uvm_grow: enlarge the "stack segment" to include sp. + */ + +int +uvm_grow(p, sp) + struct proc *p; + vaddr_t sp; +{ + register struct vmspace *vm = p->p_vmspace; + register int si; + + /* + * For user defined stacks (from sendsig). + */ + if (sp < (vaddr_t)vm->vm_maxsaddr) + return (0); + + /* + * For common case of already allocated (from trap). + */ + if (sp >= USRSTACK - ctob(vm->vm_ssize)) + return (1); + + /* + * Really need to check vs limit and increment stack size if ok. + */ + si = clrnd(btoc(USRSTACK-sp) - vm->vm_ssize); + if (vm->vm_ssize + si > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) + return (0); + vm->vm_ssize += si; + return (1); +} + +/* + * sys_oadvise: old advice system call + */ + +/* ARGSUSED */ +int +sys_ovadvise(p, v, retval) + struct proc *p; + void *v; + register_t *retval; +{ +#if 0 + struct sys_ovadvise_args /* { + syscallarg(int) anom; + } */ *uap = v; +#endif + + return (EINVAL); +} + +/* + * uvm_coredump: dump core! 
+ */ + +int +uvm_coredump(p, vp, cred, chdr) + struct proc *p; + struct vnode *vp; + struct ucred *cred; + struct core *chdr; +{ + register struct vmspace *vm = p->p_vmspace; + register vm_map_t map = &vm->vm_map; + register vm_map_entry_t entry; + vaddr_t start, end; + struct coreseg cseg; + off_t offset; + int flag, error = 0; + + offset = chdr->c_hdrsize + chdr->c_seghdrsize + chdr->c_cpusize; + + for (entry = map->header.next; entry != &map->header; + entry = entry->next) { + + /* should never happen for a user process */ + if (UVM_ET_ISSUBMAP(entry)) { + panic("uvm_coredump: user process with submap?"); + } + + if (!(entry->protection & VM_PROT_WRITE)) + continue; + + start = entry->start; + end = entry->end; + + if (start >= VM_MAXUSER_ADDRESS) + continue; + + if (end > VM_MAXUSER_ADDRESS) + end = VM_MAXUSER_ADDRESS; + + if (start >= (vaddr_t)vm->vm_maxsaddr) { + flag = CORE_STACK; + start = trunc_page(USRSTACK - ctob(vm->vm_ssize)); + if (start >= end) + continue; + } else + flag = CORE_DATA; + + /* + * Set up a new core file segment. + */ + CORE_SETMAGIC(cseg, CORESEGMAGIC, CORE_GETMID(*chdr), flag); + cseg.c_addr = start; + cseg.c_size = end - start; + + error = vn_rdwr(UIO_WRITE, vp, + (caddr_t)&cseg, chdr->c_seghdrsize, + offset, UIO_SYSSPACE, + IO_NODELOCKED|IO_UNIT, cred, NULL, p); + if (error) + break; + + offset += chdr->c_seghdrsize; + error = vn_rdwr(UIO_WRITE, vp, + (caddr_t)cseg.c_addr, (int)cseg.c_size, + offset, UIO_USERSPACE, + IO_NODELOCKED|IO_UNIT, cred, NULL, p); + if (error) + break; + + offset += cseg.c_size; + chdr->c_nseg++; + } + + return (error); +} + diff --git a/sys/uvm/uvm_user.c b/sys/uvm/uvm_user.c new file mode 100644 index 00000000000..e3c328298b7 --- /dev/null +++ b/sys/uvm/uvm_user.c @@ -0,0 +1,72 @@ +/* $NetBSD: uvm_user.c,v 1.6 1998/10/11 23:18:21 chuck Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_user.c,v 1.1.2.1 1997/08/14 19:10:41 chuck Exp + */ + +/* + * uvm_user.c: high level uvm_allocate/uvm_deallocate interface into vm. + */ + + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <uvm/uvm.h> + +/* + * uvm_deallocate: deallocate memory (unmap) + */ + +int +uvm_deallocate(map, start, size) + vm_map_t map; + vaddr_t start; + vsize_t size; +{ + + if (map == NULL) + panic("uvm_deallocate with null map"); + + if (size == (vaddr_t) 0) + return (KERN_SUCCESS); + + return(uvm_unmap(map, trunc_page(start), round_page(start+size))); + +} diff --git a/sys/uvm/uvm_vnode.c b/sys/uvm/uvm_vnode.c new file mode 100644 index 00000000000..154c009b2d0 --- /dev/null +++ b/sys/uvm/uvm_vnode.c @@ -0,0 +1,2067 @@ +/* $NetBSD: uvm_vnode.c,v 1.18 1999/01/29 12:56:17 bouyer Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * Copyright (c) 1991, 1993 + * The Regents of the University of California. + * Copyright (c) 1990 University of Utah. + * + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor, + * Washington University, the University of California, Berkeley and + * its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vnode_pager.c 8.8 (Berkeley) 2/13/94 + * from: Id: uvm_vnode.c,v 1.1.2.26 1998/02/02 20:38:07 chuck Exp + */ + +/* + * uvm_vnode.c: the vnode pager. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/vnode.h> +#include <sys/disklabel.h> +#include <sys/ioctl.h> +#include <sys/fcntl.h> +#include <sys/conf.h> + +#include <miscfs/specfs/specdev.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> + +#include <uvm/uvm.h> +#include <uvm/uvm_vnode.h> + +/* + * private global data structure + * + * we keep a list of writeable active vnode-backed VM objects for sync op. + * we keep a simpleq of vnodes that are currently being sync'd. + */ + +LIST_HEAD(uvn_list_struct, uvm_vnode); +static struct uvn_list_struct uvn_wlist; /* writeable uvns */ +static simple_lock_data_t uvn_wl_lock; /* locks uvn_wlist */ + +SIMPLEQ_HEAD(uvn_sq_struct, uvm_vnode); +static struct uvn_sq_struct uvn_sync_q; /* sync'ing uvns */ +lock_data_t uvn_sync_lock; /* locks sync operation */ + +/* + * functions + */ + +static int uvn_asyncget __P((struct uvm_object *, vaddr_t, + int)); +struct uvm_object *uvn_attach __P((void *, vm_prot_t)); +static void uvn_cluster __P((struct uvm_object *, vaddr_t, + vaddr_t *, vaddr_t *)); +static void uvn_detach __P((struct uvm_object *)); +static boolean_t uvn_flush __P((struct uvm_object *, vaddr_t, + vaddr_t, int)); +static int uvn_get __P((struct uvm_object *, vaddr_t, + vm_page_t *, int *, int, + vm_prot_t, int, int)); +static void uvn_init __P((void)); +static int uvn_io __P((struct uvm_vnode *, vm_page_t *, + int, int, int)); +static int uvn_put __P((struct uvm_object *, vm_page_t *, + int, boolean_t)); +static void uvn_reference __P((struct uvm_object *)); +static boolean_t uvn_releasepg __P((struct vm_page *, + struct vm_page **)); + +/* + * master pager structure + */ + +struct uvm_pagerops uvm_vnodeops = { + uvn_init, + uvn_attach, + uvn_reference, + uvn_detach, + NULL, /* no specialized fault routine required */ + uvn_flush, + uvn_get, + uvn_asyncget, + uvn_put, + uvn_cluster, + uvm_mk_pcluster, /* use generic version of this: see uvm_pager.c */ + uvm_shareprot, /* !NULL: allow us in share maps */ + NULL, /* AIO-DONE function (not until we have asyncio) */ + uvn_releasepg, +}; + +/* + * the ops! + */ + +/* + * uvn_init + * + * init pager private data structures. + */ + +static void +uvn_init() +{ + + LIST_INIT(&uvn_wlist); + simple_lock_init(&uvn_wl_lock); + /* note: uvn_sync_q init'd in uvm_vnp_sync() */ + lockinit(&uvn_sync_lock, PVM, "uvnsync", 0, 0); +} + +/* + * uvn_attach + * + * attach a vnode structure to a VM object. if the vnode is already + * attached, then just bump the reference count by one and return the + * VM object. if not already attached, attach and return the new VM obj. + * the "accessprot" tells the max access the attaching thread wants to + * our pages. + * + * => caller must _not_ already be holding the lock on the uvm_object. 
+ * => in fact, nothing should be locked so that we can sleep here. + * => note that uvm_object is first thing in vnode structure, so their + * pointers are equiv. + */ + +struct uvm_object * +uvn_attach(arg, accessprot) + void *arg; + vm_prot_t accessprot; +{ + struct vnode *vp = arg; + struct uvm_vnode *uvn = &vp->v_uvm; + struct vattr vattr; + int oldflags, result; + struct partinfo pi; + u_quad_t used_vnode_size; + UVMHIST_FUNC("uvn_attach"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist, "(vn=0x%x)", arg,0,0,0); + + used_vnode_size = (u_quad_t)0; /* XXX gcc -Wuninitialized */ + + /* + * first get a lock on the uvn. + */ + simple_lock(&uvn->u_obj.vmobjlock); + while (uvn->u_flags & UVM_VNODE_BLOCKED) { + uvn->u_flags |= UVM_VNODE_WANTED; + UVMHIST_LOG(maphist, " SLEEPING on blocked vn",0,0,0,0); + UVM_UNLOCK_AND_WAIT(uvn, &uvn->u_obj.vmobjlock, FALSE, + "uvn_attach", 0); + simple_lock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist," WOKE UP",0,0,0,0); + } + + /* + * if we're mapping a BLK device, make sure it is a disk. + */ + if (vp->v_type == VBLK && bdevsw[major(vp->v_rdev)].d_type != D_DISK) { + simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock */ + UVMHIST_LOG(maphist,"<- done (VBLK not D_DISK!)", 0,0,0,0); + return(NULL); + } + + /* + * now we have lock and uvn must not be in a blocked state. + * first check to see if it is already active, in which case + * we can bump the reference count, check to see if we need to + * add it to the writeable list, and then return. + */ + if (uvn->u_flags & UVM_VNODE_VALID) { /* already active? */ + + /* regain VREF if we were persisting */ + if (uvn->u_obj.uo_refs == 0) { + VREF(vp); + UVMHIST_LOG(maphist," VREF (reclaim persisting vnode)", + 0,0,0,0); + } + uvn->u_obj.uo_refs++; /* bump uvn ref! */ + + /* check for new writeable uvn */ + if ((accessprot & VM_PROT_WRITE) != 0 && + (uvn->u_flags & UVM_VNODE_WRITEABLE) == 0) { + simple_lock(&uvn_wl_lock); + LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist); + simple_unlock(&uvn_wl_lock); + /* we are now on wlist! */ + uvn->u_flags |= UVM_VNODE_WRITEABLE; + } + + /* unlock and return */ + simple_unlock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist,"<- done, refcnt=%d", uvn->u_obj.uo_refs, + 0, 0, 0); + return (&uvn->u_obj); + } + + /* + * need to call VOP_GETATTR() to get the attributes, but that could + * block (due to I/O), so we want to unlock the object before calling. + * however, we want to keep anyone else from playing with the object + * while it is unlocked. to do this we set UVM_VNODE_ALOCK which + * prevents anyone from attaching to the vnode until we are done with + * it. + */ + uvn->u_flags = UVM_VNODE_ALOCK; + simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock in case we sleep */ + /* XXX: curproc? */ + + if (vp->v_type == VBLK) { + /* + * We could implement this as a specfs getattr call, but: + * + * (1) VOP_GETATTR() would get the file system + * vnode operation, not the specfs operation. + * + * (2) All we want is the size, anyhow. 
+ */ + result = (*bdevsw[major(vp->v_rdev)].d_ioctl)(vp->v_rdev, + DIOCGPART, (caddr_t)&pi, FREAD, curproc); + if (result == 0) { + /* XXX should remember blocksize */ + used_vnode_size = (u_quad_t)pi.disklab->d_secsize * + (u_quad_t)pi.part->p_size; + } + } else { + result = VOP_GETATTR(vp, &vattr, curproc->p_ucred, curproc); + if (result == 0) + used_vnode_size = vattr.va_size; + } + + /* relock object */ + simple_lock(&uvn->u_obj.vmobjlock); + + if (result != 0) { + if (uvn->u_flags & UVM_VNODE_WANTED) + wakeup(uvn); + uvn->u_flags = 0; + simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock */ + UVMHIST_LOG(maphist,"<- done (VOP_GETATTR FAILED!)", 0,0,0,0); + return(NULL); + } + + /* + * make sure that the newsize fits within a vaddr_t + * XXX: need to revise addressing data types + */ +#ifdef DEBUG + if (vp->v_type == VBLK) + printf("used_vnode_size = %qu\n", used_vnode_size); +#endif + if (used_vnode_size > (vaddr_t) -PAGE_SIZE) { +#ifdef DEBUG + printf("uvn_attach: vn %p size truncated %qx->%x\n", vp, + used_vnode_size, -PAGE_SIZE); +#endif + used_vnode_size = (vaddr_t) -PAGE_SIZE; + } + + /* + * now set up the uvn. + */ + uvn->u_obj.pgops = &uvm_vnodeops; + TAILQ_INIT(&uvn->u_obj.memq); + uvn->u_obj.uo_npages = 0; + uvn->u_obj.uo_refs = 1; /* just us... */ + oldflags = uvn->u_flags; + uvn->u_flags = UVM_VNODE_VALID|UVM_VNODE_CANPERSIST; + uvn->u_nio = 0; + uvn->u_size = used_vnode_size; + + /* if write access, we need to add it to the wlist */ + if (accessprot & VM_PROT_WRITE) { + simple_lock(&uvn_wl_lock); + LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist); + simple_unlock(&uvn_wl_lock); + uvn->u_flags |= UVM_VNODE_WRITEABLE; /* we are on wlist! */ + } + + /* + * add a reference to the vnode. this reference will stay as long + * as there is a valid mapping of the vnode. dropped when the + * reference count goes to zero [and we either free or persist]. + */ + VREF(vp); + simple_unlock(&uvn->u_obj.vmobjlock); + if (oldflags & UVM_VNODE_WANTED) + wakeup(uvn); + + UVMHIST_LOG(maphist,"<- done/VREF, ret 0x%x", &uvn->u_obj,0,0,0); + return(&uvn->u_obj); +} + + +/* + * uvn_reference + * + * duplicate a reference to a VM object. Note that the reference + * count must already be at least one (the passed in reference) so + * there is no chance of the uvn being killed or locked out here. + * + * => caller must call with object unlocked. + * => caller must be using the same accessprot as was used at attach time + */ + + +static void +uvn_reference(uobj) + struct uvm_object *uobj; +{ +#ifdef DIAGNOSTIC + struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; +#endif + UVMHIST_FUNC("uvn_reference"); UVMHIST_CALLED(maphist); + + simple_lock(&uobj->vmobjlock); +#ifdef DIAGNOSTIC + if ((uvn->u_flags & UVM_VNODE_VALID) == 0) { + printf("uvn_reference: ref=%d, flags=0x%x\n", uvn->u_flags, + uobj->uo_refs); + panic("uvn_reference: invalid state"); + } +#endif + uobj->uo_refs++; + UVMHIST_LOG(maphist, "<- done (uobj=0x%x, ref = %d)", + uobj, uobj->uo_refs,0,0); + simple_unlock(&uobj->vmobjlock); +} + +/* + * uvn_detach + * + * remove a reference to a VM object. + * + * => caller must call with object unlocked and map locked. + * => this starts the detach process, but doesn't have to finish it + * (async i/o could still be pending). 
+ */ +static void +uvn_detach(uobj) + struct uvm_object *uobj; +{ + struct uvm_vnode *uvn; + struct vnode *vp; + int oldflags; + UVMHIST_FUNC("uvn_detach"); UVMHIST_CALLED(maphist); + + simple_lock(&uobj->vmobjlock); + + UVMHIST_LOG(maphist," (uobj=0x%x) ref=%d", uobj,uobj->uo_refs,0,0); + uobj->uo_refs--; /* drop ref! */ + if (uobj->uo_refs) { /* still more refs */ + simple_unlock(&uobj->vmobjlock); + UVMHIST_LOG(maphist, "<- done (rc>0)", 0,0,0,0); + return; + } + + /* + * get other pointers ... + */ + + uvn = (struct uvm_vnode *) uobj; + vp = (struct vnode *) uobj; + + /* + * clear VTEXT flag now that there are no mappings left (VTEXT is used + * to keep an active text file from being overwritten). + */ + vp->v_flag &= ~VTEXT; + + /* + * we just dropped the last reference to the uvn. see if we can + * let it "stick around". + */ + + if (uvn->u_flags & UVM_VNODE_CANPERSIST) { + /* won't block */ + uvn_flush(uobj, 0, 0, PGO_DEACTIVATE|PGO_ALLPAGES); + simple_unlock(&uobj->vmobjlock); + vrele(vp); /* drop vnode reference */ + UVMHIST_LOG(maphist,"<- done/vrele! (persist)", 0,0,0,0); + return; + } + + /* + * its a goner! + */ + + UVMHIST_LOG(maphist," its a goner (flushing)!", 0,0,0,0); + + uvn->u_flags |= UVM_VNODE_DYING; + + /* + * even though we may unlock in flush, no one can gain a reference + * to us until we clear the "dying" flag [because it blocks + * attaches]. we will not do that until after we've disposed of all + * the pages with uvn_flush(). note that before the flush the only + * pages that could be marked PG_BUSY are ones that are in async + * pageout by the daemon. (there can't be any pending "get"'s + * because there are no references to the object). + */ + + (void) uvn_flush(uobj, 0, 0, PGO_CLEANIT|PGO_FREE|PGO_ALLPAGES); + + UVMHIST_LOG(maphist," its a goner (done flush)!", 0,0,0,0); + + /* + * given the structure of this pager, the above flush request will + * create the following state: all the pages that were in the object + * have either been free'd or they are marked PG_BUSY|PG_RELEASED. + * the PG_BUSY bit was set either by us or the daemon for async I/O. + * in either case, if we have pages left we can't kill the object + * yet because i/o is pending. in this case we set the "relkill" + * flag which will cause pgo_releasepg to kill the object once all + * the I/O's are done [pgo_releasepg will be called from the aiodone + * routine or from the page daemon]. + */ + + if (uobj->uo_npages) { /* I/O pending. iodone will free */ +#ifdef DIAGNOSTIC + /* + * XXXCDC: very unlikely to happen until we have async i/o + * so print a little info message in case it does. + */ + printf("uvn_detach: vn %p has pages left after flush - " + "relkill mode\n", uobj); +#endif + uvn->u_flags |= UVM_VNODE_RELKILL; + simple_unlock(&uobj->vmobjlock); + UVMHIST_LOG(maphist,"<- done! (releasepg will kill obj)", 0, 0, + 0, 0); + return; + } + + /* + * kill object now. note that we can't be on the sync q because + * all references are gone. + */ + if (uvn->u_flags & UVM_VNODE_WRITEABLE) { + simple_lock(&uvn_wl_lock); /* protect uvn_wlist */ + LIST_REMOVE(uvn, u_wlist); + simple_unlock(&uvn_wl_lock); + } +#ifdef DIAGNOSTIC + if (uobj->memq.tqh_first != NULL) + panic("uvn_deref: vnode VM object still has pages afer " + "syncio/free flush"); +#endif + oldflags = uvn->u_flags; + uvn->u_flags = 0; + simple_unlock(&uobj->vmobjlock); + + /* wake up any sleepers */ + if (oldflags & UVM_VNODE_WANTED) + wakeup(uvn); + + /* + * drop our reference to the vnode. 
+ */ + vrele(vp); + UVMHIST_LOG(maphist,"<- done (vrele) final", 0,0,0,0); + + return; +} + +/* + * uvm_vnp_terminate: external hook to clear out a vnode's VM + * + * called in two cases: + * [1] when a persisting vnode vm object (i.e. one with a zero reference + * count) needs to be freed so that a vnode can be reused. this + * happens under "getnewvnode" in vfs_subr.c. if the vnode from + * the free list is still attached (i.e. not VBAD) then vgone is + * called. as part of the vgone trace this should get called to + * free the vm object. this is the common case. + * [2] when a filesystem is being unmounted by force (MNT_FORCE, + * "umount -f") the vgone() function is called on active vnodes + * on the mounted file systems to kill their data (the vnodes become + * "dead" ones [see src/sys/miscfs/deadfs/...]). that results in a + * call here (even if the uvn is still in use -- i.e. has a non-zero + * reference count). this case happens at "umount -f" and during a + * "reboot/halt" operation. + * + * => the caller must XLOCK and VOP_LOCK the vnode before calling us + * [protects us from getting a vnode that is already in the DYING + * state...] + * => unlike uvn_detach, this function must not return until all the + * uvn's pages are disposed of. + * => in case [2] the uvn is still alive after this call, but all I/O + * ops will fail (due to the backing vnode now being "dead"). this + * will prob. kill any process using the uvn due to pgo_get failing. + */ + +void +uvm_vnp_terminate(vp) + struct vnode *vp; +{ + struct uvm_vnode *uvn = &vp->v_uvm; + int oldflags; + UVMHIST_FUNC("uvm_vnp_terminate"); UVMHIST_CALLED(maphist); + + /* + * lock object and check if it is valid + */ + simple_lock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist, " vp=0x%x, ref=%d, flag=0x%x", vp, + uvn->u_obj.uo_refs, uvn->u_flags, 0); + if ((uvn->u_flags & UVM_VNODE_VALID) == 0) { + simple_unlock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist, "<- done (not active)", 0, 0, 0, 0); + return; + } + + /* + * must be a valid uvn that is not already dying (because XLOCK + * protects us from that). the uvn can't in the the ALOCK state + * because it is valid, and uvn's that are in the ALOCK state haven't + * been marked valid yet. + */ + +#ifdef DEBUG + /* + * debug check: are we yanking the vnode out from under our uvn? + */ + if (uvn->u_obj.uo_refs) { + printf("uvm_vnp_terminate(%p): terminating active vnode " + "(refs=%d)\n", uvn, uvn->u_obj.uo_refs); + } +#endif + + /* + * it is possible that the uvn was detached and is in the relkill + * state [i.e. waiting for async i/o to finish so that releasepg can + * kill object]. we take over the vnode now and cancel the relkill. + * we want to know when the i/o is done so we can recycle right + * away. note that a uvn can only be in the RELKILL state if it + * has a zero reference count. + */ + + if (uvn->u_flags & UVM_VNODE_RELKILL) + uvn->u_flags &= ~UVM_VNODE_RELKILL; /* cancel RELKILL */ + + /* + * block the uvn by setting the dying flag, and then flush the + * pages. (note that flush may unlock object while doing I/O, but + * it will re-lock it before it returns control here). + * + * also, note that we tell I/O that we are already VOP_LOCK'd so + * that uvn_io doesn't attempt to VOP_LOCK again. + * + * XXXCDC: setting VNISLOCKED on an active uvn which is being terminated + * due to a forceful unmount might not be a good idea. maybe we + * need a way to pass in this info to uvn_flush through a + * pager-defined PGO_ constant [currently there are none]. 
+ */ + uvn->u_flags |= UVM_VNODE_DYING|UVM_VNODE_VNISLOCKED; + + (void) uvn_flush(&uvn->u_obj, 0, 0, PGO_CLEANIT|PGO_FREE|PGO_ALLPAGES); + + /* + * as we just did a flush we expect all the pages to be gone or in + * the process of going. sleep to wait for the rest to go [via iosync]. + */ + + while (uvn->u_obj.uo_npages) { +#ifdef DIAGNOSTIC + struct vm_page *pp; + for (pp = uvn->u_obj.memq.tqh_first ; pp != NULL ; + pp = pp->listq.tqe_next) { + if ((pp->flags & PG_BUSY) == 0) + panic("uvm_vnp_terminate: detected unbusy pg"); + } + if (uvn->u_nio == 0) + panic("uvm_vnp_terminate: no I/O to wait for?"); + printf("uvm_vnp_terminate: waiting for I/O to fin.\n"); + /* + * XXXCDC: this is unlikely to happen without async i/o so we + * put a printf in just to keep an eye on it. + */ +#endif + uvn->u_flags |= UVM_VNODE_IOSYNC; + UVM_UNLOCK_AND_WAIT(&uvn->u_nio, &uvn->u_obj.vmobjlock, FALSE, + "uvn_term",0); + simple_lock(&uvn->u_obj.vmobjlock); + } + + /* + * done. now we free the uvn if its reference count is zero + * (true if we are zapping a persisting uvn). however, if we are + * terminating a uvn with active mappings we let it live ... future + * calls down to the vnode layer will fail. + */ + + oldflags = uvn->u_flags; + if (uvn->u_obj.uo_refs) { + + /* + * uvn must live on it is dead-vnode state until all references + * are gone. restore flags. clear CANPERSIST state. + */ + + uvn->u_flags &= ~(UVM_VNODE_DYING|UVM_VNODE_VNISLOCKED| + UVM_VNODE_WANTED|UVM_VNODE_CANPERSIST); + + } else { + + /* + * free the uvn now. note that the VREF reference is already + * gone [it is dropped when we enter the persist state]. + */ + if (uvn->u_flags & UVM_VNODE_IOSYNCWANTED) + panic("uvm_vnp_terminate: io sync wanted bit set"); + + if (uvn->u_flags & UVM_VNODE_WRITEABLE) { + simple_lock(&uvn_wl_lock); + LIST_REMOVE(uvn, u_wlist); + simple_unlock(&uvn_wl_lock); + } + uvn->u_flags = 0; /* uvn is history, clear all bits */ + } + + if (oldflags & UVM_VNODE_WANTED) + wakeup(uvn); /* object lock still held */ + + simple_unlock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist, "<- done", 0, 0, 0, 0); + +} + +/* + * uvn_releasepg: handled a released page in a uvn + * + * => "pg" is a PG_BUSY [caller owns it], PG_RELEASED page that we need + * to dispose of. + * => caller must handled PG_WANTED case + * => called with page's object locked, pageq's unlocked + * => returns TRUE if page's object is still alive, FALSE if we + * killed the page's object. if we return TRUE, then we + * return with the object locked. + * => if (nextpgp != NULL) => we return pageq.tqe_next here, and return + * with the page queues locked [for pagedaemon] + * => if (nextpgp == NULL) => we return with page queues unlocked [normal case] + * => we kill the uvn if it is not referenced and we are suppose to + * kill it ("relkill"). 
+ */ + +boolean_t +uvn_releasepg(pg, nextpgp) + struct vm_page *pg; + struct vm_page **nextpgp; /* OUT */ +{ + struct uvm_vnode *uvn = (struct uvm_vnode *) pg->uobject; +#ifdef DIAGNOSTIC + if ((pg->flags & PG_RELEASED) == 0) + panic("uvn_releasepg: page not released!"); +#endif + + /* + * dispose of the page [caller handles PG_WANTED] + */ + pmap_page_protect(PMAP_PGARG(pg), VM_PROT_NONE); + uvm_lock_pageq(); + if (nextpgp) + *nextpgp = pg->pageq.tqe_next; /* next page for daemon */ + uvm_pagefree(pg); + if (!nextpgp) + uvm_unlock_pageq(); + + /* + * now see if we need to kill the object + */ + if (uvn->u_flags & UVM_VNODE_RELKILL) { + if (uvn->u_obj.uo_refs) + panic("uvn_releasepg: kill flag set on referenced " + "object!"); + if (uvn->u_obj.uo_npages == 0) { + if (uvn->u_flags & UVM_VNODE_WRITEABLE) { + simple_lock(&uvn_wl_lock); + LIST_REMOVE(uvn, u_wlist); + simple_unlock(&uvn_wl_lock); + } +#ifdef DIAGNOSTIC + if (uvn->u_obj.memq.tqh_first) + panic("uvn_releasepg: pages in object with npages == 0"); +#endif + if (uvn->u_flags & UVM_VNODE_WANTED) + /* still holding object lock */ + wakeup(uvn); + + uvn->u_flags = 0; /* DEAD! */ + simple_unlock(&uvn->u_obj.vmobjlock); + return (FALSE); + } + } + return (TRUE); +} + +/* + * NOTE: currently we have to use VOP_READ/VOP_WRITE because they go + * through the buffer cache and allow I/O in any size. These VOPs use + * synchronous i/o. [vs. VOP_STRATEGY which can be async, but doesn't + * go through the buffer cache or allow I/O sizes larger than a + * block]. we will eventually want to change this. + * + * issues to consider: + * uvm provides the uvm_aiodesc structure for async i/o management. + * there are two tailq's in the uvm. structure... one for pending async + * i/o and one for "done" async i/o. to do an async i/o one puts + * an aiodesc on the "pending" list (protected by splbio()), starts the + * i/o and returns VM_PAGER_PEND. when the i/o is done, we expect + * some sort of "i/o done" function to be called (at splbio(), interrupt + * time). this function should remove the aiodesc from the pending list + * and place it on the "done" list and wakeup the daemon. the daemon + * will run at normal spl() and will remove all items from the "done" + * list and call the "aiodone" hook for each done request (see uvm_pager.c). + * [in the old vm code, this was done by calling the "put" routine with + * null arguments which made the code harder to read and understand because + * you had one function ("put") doing two things.] + * + * so the current pager needs: + * int uvn_aiodone(struct uvm_aiodesc *) + * + * => return KERN_SUCCESS (aio finished, free it). otherwise requeue for + * later collection. + * => called with pageq's locked by the daemon. + * + * general outline: + * - "try" to lock object. if fail, just return (will try again later) + * - drop "u_nio" (this req is done!) + * - if (object->iosync && u_naio == 0) { wakeup &uvn->u_naio } + * - get "page" structures (atop?). + * - handle "wanted" pages + * - handle "released" pages [using pgo_releasepg] + * >>> pgo_releasepg may kill the object + * dont forget to look at "object" wanted flag in all cases. + */ + + +/* + * uvn_flush: flush pages out of a uvm object. + * + * => object should be locked by caller. we may _unlock_ the object + * if (and only if) we need to clean a page (PGO_CLEANIT). + * we return with the object locked. + * => if PGO_CLEANIT is set, we may block (due to I/O). thus, a caller + * might want to unlock higher level resources (e.g. 
vm_map) + * before calling flush. + * => if PGO_CLEANIT is not set, then we will neither unlock the object + * or block. + * => if PGO_ALLPAGE is set, then all pages in the object are valid targets + * for flushing. + * => NOTE: we rely on the fact that the object's memq is a TAILQ and + * that new pages are inserted on the tail end of the list. thus, + * we can make a complete pass through the object in one go by starting + * at the head and working towards the tail (new pages are put in + * front of us). + * => NOTE: we are allowed to lock the page queues, so the caller + * must not be holding the lock on them [e.g. pagedaemon had + * better not call us with the queues locked] + * => we return TRUE unless we encountered some sort of I/O error + * + * comment on "cleaning" object and PG_BUSY pages: + * this routine is holding the lock on the object. the only time + * that it can run into a PG_BUSY page that it does not own is if + * some other process has started I/O on the page (e.g. either + * a pagein, or a pageout). if the PG_BUSY page is being paged + * in, then it can not be dirty (!PG_CLEAN) because no one has + * had a chance to modify it yet. if the PG_BUSY page is being + * paged out then it means that someone else has already started + * cleaning the page for us (how nice!). in this case, if we + * have syncio specified, then after we make our pass through the + * object we need to wait for the other PG_BUSY pages to clear + * off (i.e. we need to do an iosync). also note that once a + * page is PG_BUSY it must stay in its object until it is un-busyed. + * + * note on page traversal: + * we can traverse the pages in an object either by going down the + * linked list in "uobj->memq", or we can go over the address range + * by page doing hash table lookups for each address. depending + * on how many pages are in the object it may be cheaper to do one + * or the other. we set "by_list" to true if we are using memq. + * if the cost of a hash lookup was equal to the cost of the list + * traversal we could compare the number of pages in the start->stop + * range to the total number of pages in the object. however, it + * seems that a hash table lookup is more expensive than the linked + * list traversal, so we multiply the number of pages in the + * start->stop range by a penalty which we define below. 
+ */ + +#define UVN_HASH_PENALTY 4 /* XXX: a guess */ + +static boolean_t +uvn_flush(uobj, start, stop, flags) + struct uvm_object *uobj; + vaddr_t start, stop; + int flags; +{ + struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; + struct vm_page *pp, *ppnext, *ptmp; + struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp; + int npages, result, lcv; + boolean_t retval, need_iosync, by_list, needs_clean; + vaddr_t curoff; + u_short pp_version; + UVMHIST_FUNC("uvn_flush"); UVMHIST_CALLED(maphist); + + curoff = 0; /* XXX: shut up gcc */ + /* + * get init vals and determine how we are going to traverse object + */ + + need_iosync = FALSE; + retval = TRUE; /* return value */ + if (flags & PGO_ALLPAGES) { + start = 0; + stop = round_page(uvn->u_size); + by_list = TRUE; /* always go by the list */ + } else { + start = trunc_page(start); + stop = round_page(stop); + if (stop > round_page(uvn->u_size)) + printf("uvn_flush: strange, got an out of range " + "flush (fixed)\n"); + + by_list = (uobj->uo_npages <= + ((stop - start) >> PAGE_SHIFT) * UVN_HASH_PENALTY); + } + + UVMHIST_LOG(maphist, + " flush start=0x%x, stop=0x%x, by_list=%d, flags=0x%x", + start, stop, by_list, flags); + + /* + * PG_CLEANCHK: this bit is used by the pgo_mk_pcluster function as + * a _hint_ as to how up to date the PG_CLEAN bit is. if the hint + * is wrong it will only prevent us from clustering... it won't break + * anything. we clear all PG_CLEANCHK bits here, and pgo_mk_pcluster + * will set them as it syncs PG_CLEAN. This is only an issue if we + * are looking at non-inactive pages (because inactive page's PG_CLEAN + * bit is always up to date since there are no mappings). + * [borrowed PG_CLEANCHK idea from FreeBSD VM] + */ + + if ((flags & PGO_CLEANIT) != 0 && + uobj->pgops->pgo_mk_pcluster != NULL) { + if (by_list) { + for (pp = uobj->memq.tqh_first ; pp != NULL ; + pp = pp->listq.tqe_next) { + if (pp->offset < start || pp->offset >= stop) + continue; + pp->flags &= ~PG_CLEANCHK; + } + + } else { /* by hash */ + for (curoff = start ; curoff < stop; + curoff += PAGE_SIZE) { + pp = uvm_pagelookup(uobj, curoff); + if (pp) + pp->flags &= ~PG_CLEANCHK; + } + } + } + + /* + * now do it. note: we must update ppnext in body of loop or we + * will get stuck. we need to use ppnext because we may free "pp" + * before doing the next loop. + */ + + if (by_list) { + pp = uobj->memq.tqh_first; + } else { + curoff = start; + pp = uvm_pagelookup(uobj, curoff); + } + + ppnext = NULL; /* XXX: shut up gcc */ + ppsp = NULL; /* XXX: shut up gcc */ + uvm_lock_pageq(); /* page queues locked */ + + /* locked: both page queues and uobj */ + for ( ; (by_list && pp != NULL) || + (!by_list && curoff < stop) ; pp = ppnext) { + + if (by_list) { + + /* + * range check + */ + + if (pp->offset < start || pp->offset >= stop) { + ppnext = pp->listq.tqe_next; + continue; + } + + } else { + + /* + * null check + */ + + curoff += PAGE_SIZE; + if (pp == NULL) { + if (curoff < stop) + ppnext = uvm_pagelookup(uobj, curoff); + continue; + } + + } + + /* + * handle case where we do not need to clean page (either + * because we are not clean or because page is not dirty or + * is busy): + * + * NOTE: we are allowed to deactivate a non-wired active + * PG_BUSY page, but once a PG_BUSY page is on the inactive + * queue it must stay put until it is !PG_BUSY (so as not to + * confuse pagedaemon). 
+ */ + + if ((flags & PGO_CLEANIT) == 0 || (pp->flags & PG_BUSY) != 0) { + needs_clean = FALSE; + if ((pp->flags & PG_BUSY) != 0 && + (flags & (PGO_CLEANIT|PGO_SYNCIO)) == + (PGO_CLEANIT|PGO_SYNCIO)) + need_iosync = TRUE; + } else { + /* + * freeing: nuke all mappings so we can sync + * PG_CLEAN bit with no race + */ + if ((pp->flags & PG_CLEAN) != 0 && + (flags & PGO_FREE) != 0 && + (pp->pqflags & PQ_ACTIVE) != 0) + pmap_page_protect(PMAP_PGARG(pp), VM_PROT_NONE); + if ((pp->flags & PG_CLEAN) != 0 && + pmap_is_modified(PMAP_PGARG(pp))) + pp->flags &= ~(PG_CLEAN); + pp->flags |= PG_CLEANCHK; /* update "hint" */ + + needs_clean = ((pp->flags & PG_CLEAN) == 0); + } + + /* + * if we don't need a clean... load ppnext and dispose of pp + */ + if (!needs_clean) { + /* load ppnext */ + if (by_list) + ppnext = pp->listq.tqe_next; + else { + if (curoff < stop) + ppnext = uvm_pagelookup(uobj, curoff); + } + + /* now dispose of pp */ + if (flags & PGO_DEACTIVATE) { + if ((pp->pqflags & PQ_INACTIVE) == 0 && + pp->wire_count == 0) { + pmap_page_protect(PMAP_PGARG(pp), + VM_PROT_NONE); + uvm_pagedeactivate(pp); + } + + } else if (flags & PGO_FREE) { + if (pp->flags & PG_BUSY) { + /* release busy pages */ + pp->flags |= PG_RELEASED; + } else { + pmap_page_protect(PMAP_PGARG(pp), + VM_PROT_NONE); + /* removed page from object */ + uvm_pagefree(pp); + } + } + /* ppnext is valid so we can continue... */ + continue; + } + + /* + * pp points to a page in the locked object that we are + * working on. if it is !PG_CLEAN,!PG_BUSY and we asked + * for cleaning (PGO_CLEANIT). we clean it now. + * + * let uvm_pager_put attempted a clustered page out. + * note: locked: uobj and page queues. + */ + + pp->flags |= PG_BUSY; /* we 'own' page now */ + UVM_PAGE_OWN(pp, "uvn_flush"); + pmap_page_protect(PMAP_PGARG(pp), VM_PROT_READ); + pp_version = pp->version; +ReTry: + ppsp = pps; + npages = sizeof(pps) / sizeof(struct vm_page *); + + /* locked: page queues, uobj */ + result = uvm_pager_put(uobj, pp, &ppsp, &npages, + flags | PGO_DOACTCLUST, start, stop); + /* unlocked: page queues, uobj */ + + /* + * at this point nothing is locked. if we did an async I/O + * it is remotely possible for the async i/o to complete and + * the page "pp" be freed or what not before we get a chance + * to relock the object. in order to detect this, we have + * saved the version number of the page in "pp_version". + */ + + /* relock! */ + simple_lock(&uobj->vmobjlock); + uvm_lock_pageq(); + + /* + * VM_PAGER_AGAIN: given the structure of this pager, this + * can only happen when we are doing async I/O and can't + * map the pages into kernel memory (pager_map) due to lack + * of vm space. if this happens we drop back to sync I/O. + */ + + if (result == VM_PAGER_AGAIN) { + /* + * it is unlikely, but page could have been released + * while we had the object lock dropped. we ignore + * this now and retry the I/O. we will detect and + * handle the released page after the syncio I/O + * completes. + */ +#ifdef DIAGNOSTIC + if (flags & PGO_SYNCIO) + panic("uvn_flush: PGO_SYNCIO return 'try again' error (impossible)"); +#endif + flags |= PGO_SYNCIO; + goto ReTry; + } + + /* + * the cleaning operation is now done. finish up. note that + * on error (!OK, !PEND) uvm_pager_put drops the cluster for us. + * if success (OK, PEND) then uvm_pager_put returns the cluster + * to us in ppsp/npages. + */ + + /* + * for pending async i/o if we are not deactivating/freeing + * we can move on to the next page. 
+ */ + + if (result == VM_PAGER_PEND) { + + if ((flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) { + /* + * no per-page ops: refresh ppnext and continue + */ + if (by_list) { + if (pp->version == pp_version) + ppnext = pp->listq.tqe_next; + else + /* reset */ + ppnext = uobj->memq.tqh_first; + } else { + if (curoff < stop) + ppnext = uvm_pagelookup(uobj, + curoff); + } + continue; + } + + /* need to do anything here? */ + } + + /* + * need to look at each page of the I/O operation. we defer + * processing "pp" until the last trip through this "for" loop + * so that we can load "ppnext" for the main loop after we + * play with the cluster pages [thus the "npages + 1" in the + * loop below]. + */ + + for (lcv = 0 ; lcv < npages + 1 ; lcv++) { + + /* + * handle ppnext for outside loop, and saving pp + * until the end. + */ + if (lcv < npages) { + if (ppsp[lcv] == pp) + continue; /* skip pp until the end */ + ptmp = ppsp[lcv]; + } else { + ptmp = pp; + + /* set up next page for outer loop */ + if (by_list) { + if (pp->version == pp_version) + ppnext = pp->listq.tqe_next; + else + /* reset */ + ppnext = uobj->memq.tqh_first; + } else { + if (curoff < stop) + ppnext = uvm_pagelookup(uobj, curoff); + } + } + + /* + * verify the page didn't get moved while obj was + * unlocked + */ + if (result == VM_PAGER_PEND && ptmp->uobject != uobj) + continue; + + /* + * unbusy the page if I/O is done. note that for + * pending I/O it is possible that the I/O op + * finished before we relocked the object (in + * which case the page is no longer busy). + */ + + if (result != VM_PAGER_PEND) { + if (ptmp->flags & PG_WANTED) + /* still holding object lock */ + thread_wakeup(ptmp); + + ptmp->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(ptmp, NULL); + if (ptmp->flags & PG_RELEASED) { + + /* pgo_releasepg wants this */ + uvm_unlock_pageq(); + if (!uvn_releasepg(ptmp, NULL)) + return (TRUE); + + uvm_lock_pageq(); /* relock */ + continue; /* next page */ + + } else { + ptmp->flags |= (PG_CLEAN|PG_CLEANCHK); + if ((flags & PGO_FREE) == 0) + pmap_clear_modify( + PMAP_PGARG(ptmp)); + } + } + + /* + * dispose of page + */ + + if (flags & PGO_DEACTIVATE) { + if ((pp->pqflags & PQ_INACTIVE) == 0 && + pp->wire_count == 0) { + pmap_page_protect(PMAP_PGARG(ptmp), + VM_PROT_NONE); + uvm_pagedeactivate(ptmp); + } + + } else if (flags & PGO_FREE) { + if (result == VM_PAGER_PEND) { + if ((ptmp->flags & PG_BUSY) != 0) + /* signal for i/o done */ + ptmp->flags |= PG_RELEASED; + } else { + if (result != VM_PAGER_OK) { + printf("uvn_flush: obj=%p, " + "offset=0x%lx. error " + "during pageout.\n", + pp->uobject, pp->offset); + printf("uvn_flush: WARNING: " + "changes to page may be " + "lost!\n"); + retval = FALSE; + } + pmap_page_protect(PMAP_PGARG(ptmp), + VM_PROT_NONE); + uvm_pagefree(ptmp); + } + } + + } /* end of "lcv" for loop */ + + } /* end of "pp" for loop */ + + /* + * done with pagequeues: unlock + */ + uvm_unlock_pageq(); + + /* + * now wait for all I/O if required. + */ + if (need_iosync) { + + UVMHIST_LOG(maphist," <<DOING IOSYNC>>",0,0,0,0); + while (uvn->u_nio != 0) { + uvn->u_flags |= UVM_VNODE_IOSYNC; + UVM_UNLOCK_AND_WAIT(&uvn->u_nio, &uvn->u_obj.vmobjlock, + FALSE, "uvn_flush",0); + simple_lock(&uvn->u_obj.vmobjlock); + } + if (uvn->u_flags & UVM_VNODE_IOSYNCWANTED) + wakeup(&uvn->u_flags); + uvn->u_flags &= ~(UVM_VNODE_IOSYNC|UVM_VNODE_IOSYNCWANTED); + } + + /* return, with object locked! 
*/ + UVMHIST_LOG(maphist,"<- done (retval=0x%x)",retval,0,0,0); + return(retval); +} + +/* + * uvn_cluster + * + * we are about to do I/O in an object at offset. this function is called + * to establish a range of offsets around "offset" in which we can cluster + * I/O. + * + * - currently doesn't matter if obj locked or not. + */ + +static void +uvn_cluster(uobj, offset, loffset, hoffset) + struct uvm_object *uobj; + vaddr_t offset; + vaddr_t *loffset, *hoffset; /* OUT */ +{ + struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; + *loffset = offset; + + if (*loffset >= uvn->u_size) + panic("uvn_cluster: offset out of range"); + + /* + * XXX: old pager claims we could use VOP_BMAP to get maxcontig value. + */ + *hoffset = *loffset + MAXBSIZE; + if (*hoffset > round_page(uvn->u_size)) /* past end? */ + *hoffset = round_page(uvn->u_size); + + return; +} + +/* + * uvn_put: flush page data to backing store. + * + * => prefer map unlocked (not required) + * => object must be locked! we will _unlock_ it before starting I/O. + * => flags: PGO_SYNCIO -- use sync. I/O + * => note: caller must set PG_CLEAN and pmap_clear_modify (if needed) + * => XXX: currently we use VOP_READ/VOP_WRITE which are only sync. + * [thus we never do async i/o! see iodone comment] + */ + +static int +uvn_put(uobj, pps, npages, flags) + struct uvm_object *uobj; + struct vm_page **pps; + int npages, flags; +{ + int retval; + + /* note: object locked */ + retval = uvn_io((struct uvm_vnode*)uobj, pps, npages, flags, UIO_WRITE); + /* note: object unlocked */ + + return(retval); +} + + +/* + * uvn_get: get pages (synchronously) from backing store + * + * => prefer map unlocked (not required) + * => object must be locked! we will _unlock_ it before starting any I/O. + * => flags: PGO_ALLPAGES: get all of the pages + * PGO_LOCKED: fault data structures are locked + * => NOTE: offset is the offset of pps[0], _NOT_ pps[centeridx] + * => NOTE: caller must check for released pages!! + */ + +static int +uvn_get(uobj, offset, pps, npagesp, centeridx, access_type, advice, flags) + struct uvm_object *uobj; + vaddr_t offset; + struct vm_page **pps; /* IN/OUT */ + int *npagesp; /* IN (OUT if PGO_LOCKED) */ + int centeridx, advice, flags; + vm_prot_t access_type; +{ + vaddr_t current_offset; + struct vm_page *ptmp; + int lcv, result, gotpages; + boolean_t done; + UVMHIST_FUNC("uvn_get"); UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist, "flags=%d", flags,0,0,0); + + /* + * step 1: handled the case where fault data structures are locked. + */ + + if (flags & PGO_LOCKED) { + + /* + * gotpages is the current number of pages we've gotten (which + * we pass back up to caller via *npagesp. + */ + + gotpages = 0; + + /* + * step 1a: get pages that are already resident. only do this + * if the data structures are locked (i.e. the first time + * through). + */ + + done = TRUE; /* be optimistic */ + + for (lcv = 0, current_offset = offset ; lcv < *npagesp ; + lcv++, current_offset += PAGE_SIZE) { + + /* do we care about this page? if not, skip it */ + if (pps[lcv] == PGO_DONTCARE) + continue; + + /* lookup page */ + ptmp = uvm_pagelookup(uobj, current_offset); + + /* to be useful must get a non-busy, non-released pg */ + if (ptmp == NULL || + (ptmp->flags & (PG_BUSY|PG_RELEASED)) != 0) { + if (lcv == centeridx || (flags & PGO_ALLPAGES) + != 0) + done = FALSE; /* need to do a wait or I/O! 
*/ + continue; + } + + /* + * useful page: busy/lock it and plug it in our + * result array + */ + ptmp->flags |= PG_BUSY; /* loan up to caller */ + UVM_PAGE_OWN(ptmp, "uvn_get1"); + pps[lcv] = ptmp; + gotpages++; + + } /* "for" lcv loop */ + + /* + * XXX: given the "advice", should we consider async read-ahead? + * XXX: fault current does deactive of pages behind us. is + * this good (other callers might now). + */ + /* + * XXX: read-ahead currently handled by buffer cache (bread) + * level. + * XXX: no async i/o available. + * XXX: so we don't do anything now. + */ + + /* + * step 1c: now we've either done everything needed or we to + * unlock and do some waiting or I/O. + */ + + *npagesp = gotpages; /* let caller know */ + if (done) + return(VM_PAGER_OK); /* bingo! */ + else + /* EEK! Need to unlock and I/O */ + return(VM_PAGER_UNLOCK); + } + + /* + * step 2: get non-resident or busy pages. + * object is locked. data structures are unlocked. + * + * XXX: because we can't do async I/O at this level we get things + * page at a time (otherwise we'd chunk). the VOP_READ() will do + * async-read-ahead for us at a lower level. + */ + + for (lcv = 0, current_offset = offset ; + lcv < *npagesp ; lcv++, current_offset += PAGE_SIZE) { + + /* skip over pages we've already gotten or don't want */ + /* skip over pages we don't _have_ to get */ + if (pps[lcv] != NULL || (lcv != centeridx && + (flags & PGO_ALLPAGES) == 0)) + continue; + + /* + * we have yet to locate the current page (pps[lcv]). we first + * look for a page that is already at the current offset. if + * we fine a page, we check to see if it is busy or released. + * if that is the case, then we sleep on the page until it is + * no longer busy or released and repeat the lookup. if the + * page we found is neither busy nor released, then we busy it + * (so we own it) and plug it into pps[lcv]. this breaks the + * following while loop and indicates we are ready to move on + * to the next page in the "lcv" loop above. + * + * if we exit the while loop with pps[lcv] still set to NULL, + * then it means that we allocated a new busy/fake/clean page + * ptmp in the object and we need to do I/O to fill in the data. + */ + + while (pps[lcv] == NULL) { /* top of "pps" while loop */ + + /* look for a current page */ + ptmp = uvm_pagelookup(uobj, current_offset); + + /* nope? allocate one now (if we can) */ + if (ptmp == NULL) { + + ptmp = uvm_pagealloc(uobj, current_offset, + NULL); /* alloc */ + + /* out of RAM? */ + if (ptmp == NULL) { + simple_unlock(&uobj->vmobjlock); + uvm_wait("uvn_getpage"); + simple_lock(&uobj->vmobjlock); + + /* goto top of pps while loop */ + continue; + } + + /* + * got new page ready for I/O. break pps + * while loop. pps[lcv] is still NULL. + */ + break; + } + + /* page is there, see if we need to wait on it */ + if ((ptmp->flags & (PG_BUSY|PG_RELEASED)) != 0) { + ptmp->flags |= PG_WANTED; + UVM_UNLOCK_AND_WAIT(ptmp, + &uobj->vmobjlock, 0, "uvn_get",0); + simple_lock(&uobj->vmobjlock); + continue; /* goto top of pps while loop */ + } + + /* + * if we get here then the page has become resident + * and unbusy between steps 1 and 2. we busy it + * now (so we own it) and set pps[lcv] (so that we + * exit the while loop). + */ + ptmp->flags |= PG_BUSY; + UVM_PAGE_OWN(ptmp, "uvn_get2"); + pps[lcv] = ptmp; + } + + /* + * if we own the a valid page at the correct offset, pps[lcv] + * will point to it. nothing more to do except go to the + * next page. 
+ */ + + if (pps[lcv]) + continue; /* next lcv */ + + /* + * we have a "fake/busy/clean" page that we just allocated. do + * I/O to fill it with valid data. note that object must be + * locked going into uvn_io, but will be unlocked afterwards. + */ + + result = uvn_io((struct uvm_vnode *) uobj, &ptmp, 1, + PGO_SYNCIO, UIO_READ); + + /* + * I/O done. object is unlocked (by uvn_io). because we used + * syncio the result can not be PEND or AGAIN. we must relock + * and check for errors. + */ + + /* lock object. check for errors. */ + simple_lock(&uobj->vmobjlock); + if (result != VM_PAGER_OK) { + if (ptmp->flags & PG_WANTED) + /* object lock still held */ + thread_wakeup(ptmp); + + ptmp->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(ptmp, NULL); + uvm_lock_pageq(); + uvm_pagefree(ptmp); + uvm_unlock_pageq(); + simple_unlock(&uobj->vmobjlock); + return(result); + } + + /* + * we got the page! clear the fake flag (indicates valid + * data now in page) and plug into our result array. note + * that page is still busy. + * + * it is the callers job to: + * => check if the page is released + * => unbusy the page + * => activate the page + */ + + ptmp->flags &= ~PG_FAKE; /* data is valid ... */ + pmap_clear_modify(PMAP_PGARG(ptmp)); /* ... and clean */ + pps[lcv] = ptmp; + + } /* lcv loop */ + + /* + * finally, unlock object and return. + */ + + simple_unlock(&uobj->vmobjlock); + return (VM_PAGER_OK); +} + +/* + * uvn_asyncget: start async I/O to bring pages into ram + * + * => caller must lock object(???XXX: see if this is best) + * => could be called from uvn_get or a madvise() fault-ahead. + * => if it fails, it doesn't matter. + */ + +static int +uvn_asyncget(uobj, offset, npages) + struct uvm_object *uobj; + vaddr_t offset; + int npages; +{ + + /* + * XXXCDC: we can't do async I/O yet + */ + printf("uvn_asyncget called\n"); + return (KERN_SUCCESS); +} + +/* + * uvn_io: do I/O to a vnode + * + * => prefer map unlocked (not required) + * => object must be locked! we will _unlock_ it before starting I/O. + * => flags: PGO_SYNCIO -- use sync. I/O + * => XXX: currently we use VOP_READ/VOP_WRITE which are only sync. + * [thus we never do async i/o! see iodone comment] + */ + +static int +uvn_io(uvn, pps, npages, flags, rw) + struct uvm_vnode *uvn; + vm_page_t *pps; + int npages, flags, rw; +{ + struct vnode *vn; + struct uio uio; + struct iovec iov; + vaddr_t kva, file_offset; + int waitf, result, got, wanted; + UVMHIST_FUNC("uvn_io"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist, "rw=%d", rw,0,0,0); + + /* + * init values + */ + + waitf = (flags & PGO_SYNCIO) ? M_WAITOK : M_NOWAIT; + vn = (struct vnode *) uvn; + file_offset = pps[0]->offset; + + /* + * check for sync'ing I/O. 
+ */ + + while (uvn->u_flags & UVM_VNODE_IOSYNC) { + if (waitf == M_NOWAIT) { + simple_unlock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist,"<- try again (iosync)",0,0,0,0); + return(VM_PAGER_AGAIN); + } + uvn->u_flags |= UVM_VNODE_IOSYNCWANTED; + UVM_UNLOCK_AND_WAIT(&uvn->u_flags, &uvn->u_obj.vmobjlock, + FALSE, "uvn_iosync",0); + simple_lock(&uvn->u_obj.vmobjlock); + } + + /* + * check size + */ + + if (file_offset >= uvn->u_size) { + simple_unlock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist,"<- BAD (size check)",0,0,0,0); +#ifdef DIAGNOSTIC + printf("uvn_io: note: size check fired\n"); +#endif + return(VM_PAGER_BAD); + } + + /* + * first try and map the pages in (without waiting) + */ + + kva = uvm_pagermapin(pps, npages, NULL, M_NOWAIT); + if (kva == NULL && waitf == M_NOWAIT) { + simple_unlock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist,"<- mapin failed (try again)",0,0,0,0); + return(VM_PAGER_AGAIN); + } + + /* + * ok, now bump u_nio up. at this point we are done with uvn + * and can unlock it. if we still don't have a kva, try again + * (this time with sleep ok). + */ + + uvn->u_nio++; /* we have an I/O in progress! */ + simple_unlock(&uvn->u_obj.vmobjlock); + /* NOTE: object now unlocked */ + if (kva == NULL) { + kva = uvm_pagermapin(pps, npages, NULL, M_WAITOK); + } + + /* + * ok, mapped in. our pages are PG_BUSY so they are not going to + * get touched (so we can look at "offset" without having to lock + * the object). set up for I/O. + */ + + /* + * fill out uio/iov + */ + + iov.iov_base = (caddr_t) kva; + wanted = npages << PAGE_SHIFT; + if (file_offset + wanted > uvn->u_size) + wanted = uvn->u_size - file_offset; /* XXX: needed? */ + iov.iov_len = wanted; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = file_offset; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_rw = rw; + uio.uio_resid = wanted; + uio.uio_procp = NULL; + + /* + * do the I/O! (XXX: curproc?) + */ + + UVMHIST_LOG(maphist, "calling VOP",0,0,0,0); + + if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) + vn_lock(vn, LK_EXCLUSIVE | LK_RETRY, curproc /*XXX*/); + /* NOTE: vnode now locked! */ + + if (rw == UIO_READ) + result = VOP_READ(vn, &uio, 0, curproc->p_ucred); + else + result = VOP_WRITE(vn, &uio, 0, curproc->p_ucred); + + if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) + VOP_UNLOCK(vn, 0, curproc /*XXX*/); + /* NOTE: vnode now unlocked (unless vnislocked) */ + + UVMHIST_LOG(maphist, "done calling VOP",0,0,0,0); + + /* + * result == unix style errno (0 == OK!) + * + * zero out rest of buffer (if needed) + */ + + if (result == 0) { + got = wanted - uio.uio_resid; + + if (wanted && got == 0) { + result = EIO; /* XXX: error? */ + } else if (got < PAGE_SIZE * npages && rw == UIO_READ) { + bzero((void *) (kva + got), + (npages << PAGE_SHIFT) - got); + } + } + + /* + * now remove pager mapping + */ + uvm_pagermapout(kva, npages); + + /* + * now clean up the object (i.e. drop I/O count) + */ + + simple_lock(&uvn->u_obj.vmobjlock); + /* NOTE: object now locked! */ + + uvn->u_nio--; /* I/O DONE! */ + if ((uvn->u_flags & UVM_VNODE_IOSYNC) != 0 && uvn->u_nio == 0) { + wakeup(&uvn->u_nio); + } + simple_unlock(&uvn->u_obj.vmobjlock); + /* NOTE: object now unlocked! */ + + /* + * done! + */ + + UVMHIST_LOG(maphist, "<- done (result %d)", result,0,0,0); + if (result == 0) + return(VM_PAGER_OK); + else + return(VM_PAGER_ERROR); +} + +/* + * uvm_vnp_uncache: disable "persisting" in a vnode... when last reference + * is gone we will kill the object (flushing dirty pages back to the vnode + * if needed). 
+ * + * => returns TRUE if there was no uvm_object attached or if there was + * one and we killed it [i.e. if there is no active uvn] + * => called with the vnode VOP_LOCK'd [we will unlock it for I/O, if + * needed] + * + * => XXX: given that we now kill uvn's when a vnode is recycled (without + * having to hold a reference on the vnode) and given a working + * uvm_vnp_sync(), how does that effect the need for this function? + * [XXXCDC: seems like it can die?] + * + * => XXX: this function should DIE once we merge the VM and buffer + * cache. + * + * research shows that this is called in the following places: + * ext2fs_truncate, ffs_truncate, detrunc[msdosfs]: called when vnode + * changes sizes + * ext2fs_write, WRITE [ufs_readwrite], msdosfs_write: called when we + * are written to + * ex2fs_chmod, ufs_chmod: called if VTEXT vnode and the sticky bit + * is off + * ffs_realloccg: when we can't extend the current block and have + * to allocate a new one we call this [XXX: why?] + * nfsrv_rename, rename_files: called when the target filename is there + * and we want to remove it + * nfsrv_remove, sys_unlink: called on file we are removing + * nfsrv_access: if VTEXT and we want WRITE access and we don't uncache + * then return "text busy" + * nfs_open: seems to uncache any file opened with nfs + * vn_writechk: if VTEXT vnode and can't uncache return "text busy" + */ + +boolean_t +uvm_vnp_uncache(vp) + struct vnode *vp; +{ + struct uvm_vnode *uvn = &vp->v_uvm; + + /* + * lock uvn part of the vnode and check to see if we need to do anything + */ + + simple_lock(&uvn->u_obj.vmobjlock); + if ((uvn->u_flags & UVM_VNODE_VALID) == 0 || + (uvn->u_flags & UVM_VNODE_BLOCKED) != 0) { + simple_unlock(&uvn->u_obj.vmobjlock); + return(TRUE); + } + + /* + * we have a valid, non-blocked uvn. clear persist flag. + * if uvn is currently active we can return now. + */ + + uvn->u_flags &= ~UVM_VNODE_CANPERSIST; + if (uvn->u_obj.uo_refs) { + simple_unlock(&uvn->u_obj.vmobjlock); + return(FALSE); + } + + /* + * uvn is currently persisting! we have to gain a reference to + * it so that we can call uvn_detach to kill the uvn. + */ + + VREF(vp); /* seems ok, even with VOP_LOCK */ + uvn->u_obj.uo_refs++; /* value is now 1 */ + simple_unlock(&uvn->u_obj.vmobjlock); + + +#ifdef DEBUG + /* + * carry over sanity check from old vnode pager: the vnode should + * be VOP_LOCK'd, and we confirm it here. + */ + if (!VOP_ISLOCKED(vp)) { + boolean_t is_ok_anyway = FALSE; +#ifdef NFS + extern int (**nfsv2_vnodeop_p) __P((void *)); + extern int (**spec_nfsv2nodeop_p) __P((void *)); + extern int (**fifo_nfsv2nodeop_p) __P((void *)); + + /* vnode is NOT VOP_LOCKed: some vnode types _never_ lock */ + if (vp->v_op == nfsv2_vnodeop_p || + vp->v_op == spec_nfsv2nodeop_p) { + is_ok_anyway = TRUE; + } + if (vp->v_op == fifo_nfsv2nodeop_p) { + is_ok_anyway = TRUE; + } +#endif /* NFS */ + if (!is_ok_anyway) + panic("uvm_vnp_uncache: vnode not locked!"); + } +#endif /* DEBUG */ + + /* + * now drop our reference to the vnode. if we have the sole + * reference to the vnode then this will cause it to die [as we + * just cleared the persist flag]. we have to unlock the vnode + * while we are doing this as it may trigger I/O. + * + * XXX: it might be possible for uvn to get reclaimed while we are + * unlocked causing us to return TRUE when we should not. we ignore + * this as a false-positive return value doesn't hurt us. 
+ */ + VOP_UNLOCK(vp, 0, curproc /*XXX*/); + uvn_detach(&uvn->u_obj); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curproc/*XXX*/); + + /* + * and return... + */ + + return(TRUE); +} + +/* + * uvm_vnp_setsize: grow or shrink a vnode uvn + * + * grow => just update size value + * shrink => toss un-needed pages + * + * => we assume that the caller has a reference of some sort to the + * vnode in question so that it will not be yanked out from under + * us. + * + * called from: + * => truncate fns (ext2fs_truncate, ffs_truncate, detrunc[msdos]) + * => "write" fns (ext2fs_write, WRITE [ufs/ufs], msdosfs_write, nfs_write) + * => ffs_balloc [XXX: why? doesn't WRITE handle?] + * => NFS: nfs_loadattrcache, nfs_getattrcache, nfs_setattr + * => union fs: union_newsize + */ + +void +uvm_vnp_setsize(vp, newsize) + struct vnode *vp; + u_quad_t newsize; +{ + struct uvm_vnode *uvn = &vp->v_uvm; + + /* + * lock uvn and check for valid object, and if valid: do it! + */ + simple_lock(&uvn->u_obj.vmobjlock); + if (uvn->u_flags & UVM_VNODE_VALID) { + + /* + * make sure that the newsize fits within a vaddr_t + * XXX: need to revise addressing data types + */ + + if (newsize > (vaddr_t) -PAGE_SIZE) { +#ifdef DEBUG + printf("uvm_vnp_setsize: vn %p size truncated " + "%qx->%lx\n", vp, newsize, (vaddr_t)-PAGE_SIZE); +#endif + newsize = (vaddr_t)-PAGE_SIZE; + } + + /* + * now check if the size has changed: if we shrink we had better + * toss some pages... + */ + + if (uvn->u_size > newsize) { + (void)uvn_flush(&uvn->u_obj, (vaddr_t) newsize, + uvn->u_size, PGO_FREE); + } + uvn->u_size = (vaddr_t)newsize; + } + simple_unlock(&uvn->u_obj.vmobjlock); + + /* + * done + */ + return; +} + +/* + * uvm_vnp_sync: flush all dirty VM pages back to their backing vnodes. + * + * => called from sys_sync with no VM structures locked + * => only one process can do a sync at a time (because the uvn + * structure only has one queue for sync'ing). we ensure this + * by holding the uvn_sync_lock while the sync is in progress. + * other processes attempting a sync will sleep on this lock + * until we are done. + */ + +void +uvm_vnp_sync(mp) + struct mount *mp; +{ + struct uvm_vnode *uvn; + struct vnode *vp; + boolean_t got_lock; + + /* + * step 1: ensure we are only ones using the uvn_sync_q by locking + * our lock... + */ + lockmgr(&uvn_sync_lock, LK_EXCLUSIVE, (void *)0, curproc /*XXX*/); + + /* + * step 2: build up a simpleq of uvns of interest based on the + * write list. we gain a reference to uvns of interest. must + * be careful about locking uvn's since we will be holding uvn_wl_lock + * in the body of the loop. + */ + SIMPLEQ_INIT(&uvn_sync_q); + simple_lock(&uvn_wl_lock); + for (uvn = uvn_wlist.lh_first ; uvn != NULL ; + uvn = uvn->u_wlist.le_next) { + + vp = (struct vnode *) uvn; + if (mp && vp->v_mount != mp) + continue; + + /* attempt to gain reference */ + while ((got_lock = simple_lock_try(&uvn->u_obj.vmobjlock)) == + FALSE && + (uvn->u_flags & UVM_VNODE_BLOCKED) == 0) + /* spin */ ; + + /* + * we will exit the loop if either if the following are true: + * - we got the lock [always true if NCPU == 1] + * - we failed to get the lock but noticed the vnode was + * "blocked" -- in this case the vnode must be a dying + * vnode, and since dying vnodes are in the process of + * being flushed out, we can safely skip this one + * + * we want to skip over the vnode if we did not get the lock, + * or if the vnode is already dying (due to the above logic). 
+ * + * note that uvn must already be valid because we found it on + * the wlist (this also means it can't be ALOCK'd). + */ + if (!got_lock || (uvn->u_flags & UVM_VNODE_BLOCKED) != 0) { + if (got_lock) + simple_unlock(&uvn->u_obj.vmobjlock); + continue; /* skip it */ + } + + /* + * gain reference. watch out for persisting uvns (need to + * regain vnode REF). + */ + if (uvn->u_obj.uo_refs == 0) + VREF(vp); + uvn->u_obj.uo_refs++; + simple_unlock(&uvn->u_obj.vmobjlock); + + /* + * got it! + */ + SIMPLEQ_INSERT_HEAD(&uvn_sync_q, uvn, u_syncq); + } + simple_unlock(&uvn_wl_lock); + + /* + * step 3: we now have a list of uvn's that may need cleaning. + * we are holding the uvn_sync_lock, but have dropped the uvn_wl_lock + * (so we can now safely lock uvn's again). + */ + + for (uvn = uvn_sync_q.sqh_first ; uvn ; uvn = uvn->u_syncq.sqe_next) { + simple_lock(&uvn->u_obj.vmobjlock); +#ifdef DIAGNOSTIC + if (uvn->u_flags & UVM_VNODE_DYING) { + printf("uvm_vnp_sync: dying vnode on sync list\n"); + } +#endif + uvn_flush(&uvn->u_obj, 0, 0, + PGO_CLEANIT|PGO_ALLPAGES|PGO_DOACTCLUST); + + /* + * if we have the only reference and we just cleaned the uvn, + * then we can pull it out of the UVM_VNODE_WRITEABLE state + * thus allowing us to avoid thinking about flushing it again + * on later sync ops. + */ + if (uvn->u_obj.uo_refs == 1 && + (uvn->u_flags & UVM_VNODE_WRITEABLE)) { + LIST_REMOVE(uvn, u_wlist); + uvn->u_flags &= ~UVM_VNODE_WRITEABLE; + } + + simple_unlock(&uvn->u_obj.vmobjlock); + + /* now drop our reference to the uvn */ + uvn_detach(&uvn->u_obj); + } + + /* + * done! release sync lock + */ + lockmgr(&uvn_sync_lock, LK_RELEASE, (void *)0, curproc /*XXX*/); +} diff --git a/sys/uvm/uvm_vnode.h b/sys/uvm/uvm_vnode.h new file mode 100644 index 00000000000..edd4f7b698a --- /dev/null +++ b/sys/uvm/uvm_vnode.h @@ -0,0 +1,110 @@ +/* $NetBSD: uvm_vnode.h,v 1.6 1998/08/13 02:11:04 eeh Exp $ */ + +/* + * XXXCDC: "ROUGH DRAFT" QUALITY UVM PRE-RELEASE FILE! + * >>>USE AT YOUR OWN RISK, WORK IS NOT FINISHED<<< + */ +/* + * + * Copyright (c) 1997 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: Id: uvm_vnode.h,v 1.1.2.4 1997/10/03 21:18:24 chuck Exp + */ + +#ifndef _UVM_UVM_VNODE_H_ +#define _UVM_UVM_VNODE_H_ + +/* + * uvm_vnode.h + * + * vnode handle into the VM system. + */ + +/* + * the uvm_vnode structure. put at the top of the vnode data structure. + * this allows: + * (struct vnode *) == (struct uvm_vnode *) == (struct uvm_object *) + */ + +struct uvm_vnode { + struct uvm_object u_obj; /* the actual VM object */ + int u_flags; /* flags */ + int u_nio; /* number of running I/O requests */ + vsize_t u_size; /* size of object */ + + /* the following entry is locked by uvn_wl_lock */ + LIST_ENTRY(uvm_vnode) u_wlist; /* list of writeable vnode objects */ + + /* the following entry is locked by uvn_sync_lock */ + SIMPLEQ_ENTRY(uvm_vnode) u_syncq; /* vnode objects due for a "sync" */ +}; + +/* + * u_flags values + */ +#define UVM_VNODE_VALID 0x001 /* we are attached to the vnode */ +#define UVM_VNODE_CANPERSIST 0x002 /* we can persist after ref == 0 */ +#define UVM_VNODE_ALOCK 0x004 /* uvn_attach is locked out */ +#define UVM_VNODE_DYING 0x008 /* final detach/terminate in + progress */ +#define UVM_VNODE_RELKILL 0x010 /* uvn should be killed by releasepg + when final i/o is done */ +#define UVM_VNODE_WANTED 0x020 /* someone is waiting for alock, + dying, or relkill to clear */ +#define UVM_VNODE_VNISLOCKED 0x040 /* underlying vnode struct is locked + (valid when DYING is true) */ +#define UVM_VNODE_IOSYNC 0x080 /* I/O sync in progress ... setter + sleeps on &uvn->u_nio */ +#define UVM_VNODE_IOSYNCWANTED 0x100 /* a process is waiting for the + i/o sync to clear so it can do + i/o */ +#define UVM_VNODE_WRITEABLE 0x200 /* uvn has pages that are writeable */ + +/* + * UVM_VNODE_BLOCKED: any condition that should new processes from + * touching the vnode [set WANTED and sleep to wait for it to clear] + */ +#define UVM_VNODE_BLOCKED (UVM_VNODE_ALOCK|UVM_VNODE_DYING|UVM_VNODE_RELKILL) + + +/* + * prototypes + */ + +#if 0 +/* + * moved uvn_attach to uvm_extern.h because uvm_vnode.h is needed to + * include sys/vnode.h, and files that include sys/vnode.h don't know + * what a vm_prot_t is. + */ +struct uvm_object *uvn_attach __P((void *, vm_prot_t)); +#endif + +#endif /* _UVM_UVM_VNODE_H_ */ |
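
The UVM_VNODE_BLOCKED/UVM_VNODE_WANTED handshake used by uvn_attach() and uvn_io() (check the blocked bits, note that you are waiting, sleep, re-check after wakeup) follows a classic condition-variable shape. A minimal user-space sketch of that pattern, assuming pthreads and invented names (uvn_model, model_attach, model_unblock) rather than the kernel's thread_sleep_msg/wakeup primitives:

/*
 * Illustrative model only: a thread that finds the object "blocked"
 * records that it is waiting and sleeps; whoever clears the blocked
 * state wakes the waiters, which then re-check the flags.
 */
#include <pthread.h>

#define MODEL_ALOCK   0x004                /* attach locked out */
#define MODEL_DYING   0x008                /* being torn down */
#define MODEL_WANTED  0x020                /* someone sleeps on the flags */
#define MODEL_BLOCKED (MODEL_ALOCK | MODEL_DYING)

struct uvn_model {
	pthread_mutex_t lock;              /* stands in for vmobjlock */
	pthread_cond_t  wakeup;            /* stands in for wakeup(uvn) */
	int             flags;
	int             refs;
};

/* take a reference, sleeping while the object is blocked */
void
model_attach(struct uvn_model *uvn)
{
	pthread_mutex_lock(&uvn->lock);
	while (uvn->flags & MODEL_BLOCKED) {
		uvn->flags |= MODEL_WANTED;
		/*
		 * drops the lock and sleeps atomically, like
		 * UVM_UNLOCK_AND_WAIT (and reacquires it on return,
		 * where the kernel code relocks by hand)
		 */
		pthread_cond_wait(&uvn->wakeup, &uvn->lock);
	}
	uvn->refs++;
	pthread_mutex_unlock(&uvn->lock);
}

/* clear a blocking bit and wake waiters, like the oldflags/WANTED dance */
void
model_unblock(struct uvn_model *uvn, int bit)
{
	pthread_mutex_lock(&uvn->lock);
	int oldflags = uvn->flags;
	uvn->flags &= ~(bit | MODEL_WANTED);
	if (oldflags & MODEL_WANTED)
		pthread_cond_broadcast(&uvn->wakeup);
	pthread_mutex_unlock(&uvn->lock);
}

int
main(void)
{
	static struct uvn_model uvn = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0
	};

	model_attach(&uvn);                /* not blocked: just takes a ref */
	model_unblock(&uvn, MODEL_ALOCK);
	return uvn.refs == 1 ? 0 : 1;
}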
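
uvn_flush() decides between walking the object's memq list and doing per-offset hash lookups by charging each lookup a UVN_HASH_PENALTY relative to a list step. A minimal sketch of that decision, assuming a 4 KB page size and an invented helper name (choose_by_list):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT       12   /* assumed 4 KB pages for the example */
#define UVN_HASH_PENALTY 4    /* same guess the pager uses */

/*
 * Walk the list when the object holds no more pages than the number of
 * hash lookups the range would cost after applying the penalty factor,
 * i.e. a hash lookup is assumed roughly 4x as expensive as a list step.
 */
static bool
choose_by_list(int obj_npages, unsigned long start, unsigned long stop)
{
	unsigned long range_pages = (stop - start) >> PAGE_SHIFT;

	return obj_npages <= (int)(range_pages * UVN_HASH_PENALTY);
}

int
main(void)
{
	/* flushing 16 pages of a 100-page object: 100 > 16*4, go by hash */
	printf("by_list = %d\n", choose_by_list(100, 0, 16UL << PAGE_SHIFT));
	/* flushing 64 pages of the same object: 100 <= 64*4, go by list */
	printf("by_list = %d\n", choose_by_list(100, 0, 64UL << PAGE_SHIFT));
	return 0;
}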
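
uvn_io() clamps each transfer to the vnode's size and zero-fills whatever a short read leaves in the mapped window before the pages are handed back. A small stand-alone sketch of that arithmetic (tail_to_zero() and the 4 KB page size are assumptions made for illustration, not kernel interfaces):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL     /* assumed page size for the example */

/*
 * Mirror of the uvn_io() bookkeeping: clamp the request to the vnode
 * size, then report how much of the mapped window a short read leaves
 * to be zero-filled (the kernel bzero()s that tail so stale data never
 * reaches the fault handler).
 */
static uint64_t
tail_to_zero(unsigned npages, uint64_t file_offset, uint64_t vnode_size,
    uint64_t got)
{
	uint64_t window = npages * PAGE_SIZE;
	uint64_t wanted = window;

	if (file_offset + wanted > vnode_size)
		wanted = vnode_size - file_offset;   /* clamp to EOF */
	if (got > wanted)
		got = wanted;                        /* can't read past the clamp */

	return got < window ? window - got : 0;
}

int
main(void)
{
	/* last 2-page window of a 6000-byte file; the read returns 1904 bytes */
	printf("zero-fill %llu bytes\n",
	    (unsigned long long)tail_to_zero(2, 4096, 6000, 1904));
	return 0;
}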