| author    | 2001-12-19 08:58:05 +0000 |
|-----------|---------------------------|
| committer | 2001-12-19 08:58:05 +0000 |
| commit    | 1414b0faee630b2a2d49752ab9b1fe4209fe6bfd (patch) |
| tree      | 489b0b0b16606f48bda8c6e8ea28957b5601320d /sys/uvm/uvm_vnode.c |
| parent    | basic KNF done while i was looking for something else (diff) |
| download  | wireguard-openbsd-1414b0faee630b2a2d49752ab9b1fe4209fe6bfd.tar.xz, wireguard-openbsd-1414b0faee630b2a2d49752ab9b1fe4209fe6bfd.zip |
UBC was a disaster. It worked very well when it worked, but on some
machines, in some configurations, or in some phase of the moon (we actually
don't know when or why), files disappeared. Since we have not been able to
track down the problem in two weeks of intense debugging, and we need -current
to be stable, back out everything to the state it had before UBC.
We apologise for the inconvenience.
Diffstat (limited to 'sys/uvm/uvm_vnode.c')
-rw-r--r-- | sys/uvm/uvm_vnode.c | 1722 |
1 file changed, 1354 insertions, 368 deletions
diff --git a/sys/uvm/uvm_vnode.c b/sys/uvm/uvm_vnode.c index d58d0cf93f4..4783597df3d 100644 --- a/sys/uvm/uvm_vnode.c +++ b/sys/uvm/uvm_vnode.c @@ -1,10 +1,10 @@ -/* $OpenBSD: uvm_vnode.c,v 1.31 2001/12/10 02:19:34 art Exp $ */ -/* $NetBSD: uvm_vnode.c,v 1.51 2001/08/17 05:53:02 chs Exp $ */ +/* $OpenBSD: uvm_vnode.c,v 1.32 2001/12/19 08:58:07 art Exp $ */ +/* $NetBSD: uvm_vnode.c,v 1.36 2000/11/24 20:34:01 chs Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * Copyright (c) 1991, 1993 - * The Regents of the University of California. + * The Regents of the University of California. * Copyright (c) 1990 University of Utah. * * All rights reserved. @@ -24,7 +24,7 @@ * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Charles D. Cranor, - * Washington University, the University of California, Berkeley and + * Washington University, the University of California, Berkeley and * its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software @@ -52,7 +52,6 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/kernel.h> #include <sys/proc.h> #include <sys/malloc.h> #include <sys/vnode.h> @@ -60,47 +59,62 @@ #include <sys/ioctl.h> #include <sys/fcntl.h> #include <sys/conf.h> -#include <sys/pool.h> -#include <sys/mount.h> #include <miscfs/specfs/specdev.h> #include <uvm/uvm.h> +#include <uvm/uvm_vnode.h> + +/* + * private global data structure + * + * we keep a list of writeable active vnode-backed VM objects for sync op. + * we keep a simpleq of vnodes that are currently being sync'd. + */ + +LIST_HEAD(uvn_list_struct, uvm_vnode); +static struct uvn_list_struct uvn_wlist; /* writeable uvns */ +static simple_lock_data_t uvn_wl_lock; /* locks uvn_wlist */ + +SIMPLEQ_HEAD(uvn_sq_struct, uvm_vnode); +static struct uvn_sq_struct uvn_sync_q; /* sync'ing uvns */ +lock_data_t uvn_sync_lock; /* locks sync operation */ /* * functions */ -static void uvn_cluster __P((struct uvm_object *, voff_t, voff_t *, - voff_t *)); -static void uvn_detach __P((struct uvm_object *)); -static int uvn_findpage __P((struct uvm_object *, voff_t, - struct vm_page **, int)); -boolean_t uvn_flush __P((struct uvm_object *, voff_t, voff_t, - int)); -int uvn_get __P((struct uvm_object *, voff_t, - struct vm_page **, int *, int, vm_prot_t, - int, int)); -int uvn_put __P((struct uvm_object *, struct vm_page **, - int, boolean_t)); -static void uvn_reference __P((struct uvm_object *)); -static boolean_t uvn_releasepg __P((struct vm_page *, - struct vm_page **)); +static void uvn_cluster __P((struct uvm_object *, voff_t, + voff_t *, voff_t *)); +static void uvn_detach __P((struct uvm_object *)); +static boolean_t uvn_flush __P((struct uvm_object *, voff_t, + voff_t, int)); +static int uvn_get __P((struct uvm_object *, voff_t, + vm_page_t *, int *, int, + vm_prot_t, int, int)); +static void uvn_init __P((void)); +static int uvn_io __P((struct uvm_vnode *, vm_page_t *, + int, int, int)); +static int uvn_put __P((struct uvm_object *, vm_page_t *, + int, boolean_t)); +static void uvn_reference __P((struct uvm_object *)); +static boolean_t uvn_releasepg __P((struct vm_page *, + struct vm_page **)); /* * master pager structure */ struct uvm_pagerops uvm_vnodeops = { - NULL, + uvn_init, uvn_reference, uvn_detach, - NULL, + NULL, /* no specialized fault routine required */ uvn_flush, uvn_get, 
uvn_put, uvn_cluster, - uvm_mk_pcluster, + uvm_mk_pcluster, /* use generic version of this: see uvm_pager.c */ uvn_releasepg, }; @@ -109,6 +123,22 @@ struct uvm_pagerops uvm_vnodeops = { */ /* + * uvn_init + * + * init pager private data structures. + */ + +static void +uvn_init() +{ + + LIST_INIT(&uvn_wlist); + simple_lock_init(&uvn_wl_lock); + /* note: uvn_sync_q init'd in uvm_vnp_sync() */ + lockinit(&uvn_sync_lock, PVM, "uvnsync", 0, 0); +} + +/* * uvn_attach * * attach a vnode structure to a VM object. if the vnode is already @@ -129,26 +159,29 @@ uvn_attach(arg, accessprot) vm_prot_t accessprot; { struct vnode *vp = arg; - struct uvm_object *uobj = &vp->v_uobj; + struct uvm_vnode *uvn = &vp->v_uvm; struct vattr vattr; - int result; + int oldflags, result; struct partinfo pi; - voff_t used_vnode_size; + u_quad_t used_vnode_size; UVMHIST_FUNC("uvn_attach"); UVMHIST_CALLED(maphist); UVMHIST_LOG(maphist, "(vn=0x%x)", arg,0,0,0); - used_vnode_size = (voff_t)0; + + used_vnode_size = (u_quad_t)0; /* XXX gcc -Wuninitialized */ /* * first get a lock on the uvn. */ - simple_lock(uobj->vmobjlock); - while (vp->v_flag & VXLOCK) { - vp->v_flag |= VXWANT; + simple_lock(&uvn->u_obj.vmobjlock); + while (uvn->u_flags & UVM_VNODE_BLOCKED) { + printf("uvn_attach: blocked at 0x%p flags 0x%x\n", + uvn, uvn->u_flags); + uvn->u_flags |= UVM_VNODE_WANTED; UVMHIST_LOG(maphist, " SLEEPING on blocked vn",0,0,0,0); - UVM_UNLOCK_AND_WAIT(vp, &uobj->vmobjlock, FALSE, + UVM_UNLOCK_AND_WAIT(uvn, &uvn->u_obj.vmobjlock, FALSE, "uvn_attach", 0); - simple_lock(&uobj->vmobjlock); + simple_lock(&uvn->u_obj.vmobjlock); UVMHIST_LOG(maphist," WOKE UP",0,0,0,0); } @@ -156,21 +189,56 @@ uvn_attach(arg, accessprot) * if we're mapping a BLK device, make sure it is a disk. */ if (vp->v_type == VBLK && bdevsw[major(vp->v_rdev)].d_type != D_DISK) { - simple_unlock(&uobj->vmobjlock); + simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock */ UVMHIST_LOG(maphist,"<- done (VBLK not D_DISK!)", 0,0,0,0); return(NULL); } - KASSERT(vp->v_type == VREG || vp->v_type == VBLK); /* - * set up our idea of the size - * if this hasn't been done already. + * now we have lock and uvn must not be in a blocked state. + * first check to see if it is already active, in which case + * we can bump the reference count, check to see if we need to + * add it to the writeable list, and then return. */ - if (vp->v_size == VSIZENOTSET) { + if (uvn->u_flags & UVM_VNODE_VALID) { /* already active? */ + + /* regain VREF if we were persisting */ + if (uvn->u_obj.uo_refs == 0) { + VREF(vp); + UVMHIST_LOG(maphist," VREF (reclaim persisting vnode)", + 0,0,0,0); + } + uvn->u_obj.uo_refs++; /* bump uvn ref! */ + + /* check for new writeable uvn */ + if ((accessprot & VM_PROT_WRITE) != 0 && + (uvn->u_flags & UVM_VNODE_WRITEABLE) == 0) { + simple_lock(&uvn_wl_lock); + LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist); + simple_unlock(&uvn_wl_lock); + /* we are now on wlist! */ + uvn->u_flags |= UVM_VNODE_WRITEABLE; + } + + /* unlock and return */ + simple_unlock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist,"<- done, refcnt=%d", uvn->u_obj.uo_refs, + 0, 0, 0); + return (&uvn->u_obj); + } - vp->v_flag |= VXLOCK; - simple_unlock(&uobj->vmobjlock); /* drop lock in case we sleep */ + /* + * need to call VOP_GETATTR() to get the attributes, but that could + * block (due to I/O), so we want to unlock the object before calling. + * however, we want to keep anyone else from playing with the object + * while it is unlocked. 
to do this we set UVM_VNODE_ALOCK which + * prevents anyone from attaching to the vnode until we are done with + * it. + */ + uvn->u_flags = UVM_VNODE_ALOCK; + simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock in case we sleep */ /* XXX: curproc? */ + if (vp->v_type == VBLK) { /* * We could implement this as a specfs getattr call, but: @@ -184,8 +252,8 @@ uvn_attach(arg, accessprot) DIOCGPART, (caddr_t)&pi, FREAD, curproc); if (result == 0) { /* XXX should remember blocksize */ - used_vnode_size = (voff_t)pi.disklab->d_secsize * - (voff_t)pi.part->p_size; + used_vnode_size = (u_quad_t)pi.disklab->d_secsize * + (u_quad_t)pi.part->p_size; } } else { result = VOP_GETATTR(vp, &vattr, curproc->p_ucred, curproc); @@ -194,26 +262,58 @@ uvn_attach(arg, accessprot) } /* relock object */ - simple_lock(&uobj->vmobjlock); - - if (vp->v_flag & VXWANT) - wakeup(vp); - vp->v_flag &= ~(VXLOCK|VXWANT); + simple_lock(&uvn->u_obj.vmobjlock); if (result != 0) { - simple_unlock(&uobj->vmobjlock); /* drop lock */ + if (uvn->u_flags & UVM_VNODE_WANTED) + wakeup(uvn); + uvn->u_flags = 0; + simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock */ UVMHIST_LOG(maphist,"<- done (VOP_GETATTR FAILED!)", 0,0,0,0); return(NULL); } - vp->v_size = used_vnode_size; + /* + * make sure that the newsize fits within a vaddr_t + * XXX: need to revise addressing data types + */ +#ifdef DEBUG + if (vp->v_type == VBLK) + printf("used_vnode_size = %llu\n", (long long)used_vnode_size); +#endif + + /* + * now set up the uvn. + */ + uvn->u_obj.pgops = &uvm_vnodeops; + TAILQ_INIT(&uvn->u_obj.memq); + uvn->u_obj.uo_npages = 0; + uvn->u_obj.uo_refs = 1; /* just us... */ + oldflags = uvn->u_flags; + uvn->u_flags = UVM_VNODE_VALID|UVM_VNODE_CANPERSIST; + uvn->u_nio = 0; + uvn->u_size = used_vnode_size; + + /* if write access, we need to add it to the wlist */ + if (accessprot & VM_PROT_WRITE) { + simple_lock(&uvn_wl_lock); + LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist); + simple_unlock(&uvn_wl_lock); + uvn->u_flags |= UVM_VNODE_WRITEABLE; /* we are on wlist! */ } - /* unlock and return */ - simple_unlock(&uobj->vmobjlock); - UVMHIST_LOG(maphist,"<- done, refcnt=%d", uvn->u_obj.uo_refs, - 0, 0, 0); - return (uobj); + /* + * add a reference to the vnode. this reference will stay as long + * as there is a valid mapping of the vnode. dropped when the + * reference count goes to zero [and we either free or persist]. + */ + VREF(vp); + simple_unlock(&uvn->u_obj.vmobjlock); + if (oldflags & UVM_VNODE_WANTED) + wakeup(uvn); + + UVMHIST_LOG(maphist,"<- done/VREF, ret 0x%x", &uvn->u_obj,0,0,0); + return(&uvn->u_obj); } @@ -221,10 +321,10 @@ uvn_attach(arg, accessprot) * uvn_reference * * duplicate a reference to a VM object. Note that the reference - * count must already be at least one (the passed in reference) so + * count must already be at least one (the passed in reference) so * there is no chance of the uvn being killed or locked out here. * - * => caller must call with object unlocked. + * => caller must call with object unlocked. 
* => caller must be using the same accessprot as was used at attach time */ @@ -233,7 +333,23 @@ static void uvn_reference(uobj) struct uvm_object *uobj; { - VREF((struct vnode *)uobj); +#ifdef DEBUG + struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; +#endif + UVMHIST_FUNC("uvn_reference"); UVMHIST_CALLED(maphist); + + simple_lock(&uobj->vmobjlock); +#ifdef DEBUG + if ((uvn->u_flags & UVM_VNODE_VALID) == 0) { + printf("uvn_reference: ref=%d, flags=0x%x\n", uvn->u_flags, + uobj->uo_refs); + panic("uvn_reference: invalid state"); + } +#endif + uobj->uo_refs++; + UVMHIST_LOG(maphist, "<- done (uobj=0x%x, ref = %d)", + uobj, uobj->uo_refs,0,0); + simple_unlock(&uobj->vmobjlock); } /* @@ -242,12 +358,298 @@ uvn_reference(uobj) * remove a reference to a VM object. * * => caller must call with object unlocked and map locked. + * => this starts the detach process, but doesn't have to finish it + * (async i/o could still be pending). */ static void uvn_detach(uobj) struct uvm_object *uobj; { - vrele((struct vnode *)uobj); + struct uvm_vnode *uvn; + struct vnode *vp; + int oldflags; + UVMHIST_FUNC("uvn_detach"); UVMHIST_CALLED(maphist); + + simple_lock(&uobj->vmobjlock); + + UVMHIST_LOG(maphist," (uobj=0x%x) ref=%d", uobj,uobj->uo_refs,0,0); + uobj->uo_refs--; /* drop ref! */ + if (uobj->uo_refs) { /* still more refs */ + simple_unlock(&uobj->vmobjlock); + UVMHIST_LOG(maphist, "<- done (rc>0)", 0,0,0,0); + return; + } + + /* + * get other pointers ... + */ + + uvn = (struct uvm_vnode *) uobj; + vp = (struct vnode *) uobj; + + /* + * clear VTEXT flag now that there are no mappings left (VTEXT is used + * to keep an active text file from being overwritten). + */ + vp->v_flag &= ~VTEXT; + + /* + * we just dropped the last reference to the uvn. see if we can + * let it "stick around". + */ + + if (uvn->u_flags & UVM_VNODE_CANPERSIST) { + /* won't block */ + uvn_flush(uobj, 0, 0, PGO_DEACTIVATE|PGO_ALLPAGES); + simple_unlock(&uobj->vmobjlock); + vrele(vp); /* drop vnode reference */ + UVMHIST_LOG(maphist,"<- done/vrele! (persist)", 0,0,0,0); + return; + } + + /* + * its a goner! + */ + + UVMHIST_LOG(maphist," its a goner (flushing)!", 0,0,0,0); + + uvn->u_flags |= UVM_VNODE_DYING; + + /* + * even though we may unlock in flush, no one can gain a reference + * to us until we clear the "dying" flag [because it blocks + * attaches]. we will not do that until after we've disposed of all + * the pages with uvn_flush(). note that before the flush the only + * pages that could be marked PG_BUSY are ones that are in async + * pageout by the daemon. (there can't be any pending "get"'s + * because there are no references to the object). + */ + + (void) uvn_flush(uobj, 0, 0, PGO_CLEANIT|PGO_FREE|PGO_ALLPAGES); + + UVMHIST_LOG(maphist," its a goner (done flush)!", 0,0,0,0); + + /* + * given the structure of this pager, the above flush request will + * create the following state: all the pages that were in the object + * have either been free'd or they are marked PG_BUSY|PG_RELEASED. + * the PG_BUSY bit was set either by us or the daemon for async I/O. + * in either case, if we have pages left we can't kill the object + * yet because i/o is pending. in this case we set the "relkill" + * flag which will cause pgo_releasepg to kill the object once all + * the I/O's are done [pgo_releasepg will be called from the aiodone + * routine or from the page daemon]. + */ + + if (uobj->uo_npages) { /* I/O pending. 
iodone will free */ +#ifdef DEBUG + /* + * XXXCDC: very unlikely to happen until we have async i/o + * so print a little info message in case it does. + */ + printf("uvn_detach: vn %p has pages left after flush - " + "relkill mode\n", uobj); +#endif + uvn->u_flags |= UVM_VNODE_RELKILL; + simple_unlock(&uobj->vmobjlock); + UVMHIST_LOG(maphist,"<- done! (releasepg will kill obj)", 0, 0, + 0, 0); + return; + } + + /* + * kill object now. note that we can't be on the sync q because + * all references are gone. + */ + if (uvn->u_flags & UVM_VNODE_WRITEABLE) { + simple_lock(&uvn_wl_lock); /* protect uvn_wlist */ + LIST_REMOVE(uvn, u_wlist); + simple_unlock(&uvn_wl_lock); + } +#ifdef DIAGNOSTIC + if (uobj->memq.tqh_first != NULL) + panic("uvn_deref: vnode VM object still has pages afer " + "syncio/free flush"); +#endif + oldflags = uvn->u_flags; + uvn->u_flags = 0; + simple_unlock(&uobj->vmobjlock); + + /* wake up any sleepers */ + if (oldflags & UVM_VNODE_WANTED) + wakeup(uvn); + + /* + * drop our reference to the vnode. + */ + vrele(vp); + UVMHIST_LOG(maphist,"<- done (vrele) final", 0,0,0,0); + + return; +} + +/* + * uvm_vnp_terminate: external hook to clear out a vnode's VM + * + * called in two cases: + * [1] when a persisting vnode vm object (i.e. one with a zero reference + * count) needs to be freed so that a vnode can be reused. this + * happens under "getnewvnode" in vfs_subr.c. if the vnode from + * the free list is still attached (i.e. not VBAD) then vgone is + * called. as part of the vgone trace this should get called to + * free the vm object. this is the common case. + * [2] when a filesystem is being unmounted by force (MNT_FORCE, + * "umount -f") the vgone() function is called on active vnodes + * on the mounted file systems to kill their data (the vnodes become + * "dead" ones [see src/sys/miscfs/deadfs/...]). that results in a + * call here (even if the uvn is still in use -- i.e. has a non-zero + * reference count). this case happens at "umount -f" and during a + * "reboot/halt" operation. + * + * => the caller must XLOCK and VOP_LOCK the vnode before calling us + * [protects us from getting a vnode that is already in the DYING + * state...] + * => unlike uvn_detach, this function must not return until all the + * uvn's pages are disposed of. + * => in case [2] the uvn is still alive after this call, but all I/O + * ops will fail (due to the backing vnode now being "dead"). this + * will prob. kill any process using the uvn due to pgo_get failing. + */ + +void +uvm_vnp_terminate(vp) + struct vnode *vp; +{ + struct uvm_vnode *uvn = &vp->v_uvm; + int oldflags; + UVMHIST_FUNC("uvm_vnp_terminate"); UVMHIST_CALLED(maphist); + + /* + * lock object and check if it is valid + */ + simple_lock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist, " vp=0x%x, ref=%d, flag=0x%x", vp, + uvn->u_obj.uo_refs, uvn->u_flags, 0); + if ((uvn->u_flags & UVM_VNODE_VALID) == 0) { + simple_unlock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist, "<- done (not active)", 0, 0, 0, 0); + return; + } + + /* + * must be a valid uvn that is not already dying (because XLOCK + * protects us from that). the uvn can't in the ALOCK state + * because it is valid, and uvn's that are in the ALOCK state haven't + * been marked valid yet. + */ + +#ifdef DEBUG + /* + * debug check: are we yanking the vnode out from under our uvn? 
+ */ + if (uvn->u_obj.uo_refs) { + printf("uvm_vnp_terminate(%p): terminating active vnode " + "(refs=%d)\n", uvn, uvn->u_obj.uo_refs); + } +#endif + + /* + * it is possible that the uvn was detached and is in the relkill + * state [i.e. waiting for async i/o to finish so that releasepg can + * kill object]. we take over the vnode now and cancel the relkill. + * we want to know when the i/o is done so we can recycle right + * away. note that a uvn can only be in the RELKILL state if it + * has a zero reference count. + */ + + if (uvn->u_flags & UVM_VNODE_RELKILL) + uvn->u_flags &= ~UVM_VNODE_RELKILL; /* cancel RELKILL */ + + /* + * block the uvn by setting the dying flag, and then flush the + * pages. (note that flush may unlock object while doing I/O, but + * it will re-lock it before it returns control here). + * + * also, note that we tell I/O that we are already VOP_LOCK'd so + * that uvn_io doesn't attempt to VOP_LOCK again. + * + * XXXCDC: setting VNISLOCKED on an active uvn which is being terminated + * due to a forceful unmount might not be a good idea. maybe we + * need a way to pass in this info to uvn_flush through a + * pager-defined PGO_ constant [currently there are none]. + */ + uvn->u_flags |= UVM_VNODE_DYING|UVM_VNODE_VNISLOCKED; + + (void) uvn_flush(&uvn->u_obj, 0, 0, PGO_CLEANIT|PGO_FREE|PGO_ALLPAGES); + + /* + * as we just did a flush we expect all the pages to be gone or in + * the process of going. sleep to wait for the rest to go [via iosync]. + */ + + while (uvn->u_obj.uo_npages) { +#ifdef DEBUG + struct vm_page *pp; + for (pp = uvn->u_obj.memq.tqh_first ; pp != NULL ; + pp = pp->listq.tqe_next) { + if ((pp->flags & PG_BUSY) == 0) + panic("uvm_vnp_terminate: detected unbusy pg"); + } + if (uvn->u_nio == 0) + panic("uvm_vnp_terminate: no I/O to wait for?"); + printf("uvm_vnp_terminate: waiting for I/O to fin.\n"); + /* + * XXXCDC: this is unlikely to happen without async i/o so we + * put a printf in just to keep an eye on it. + */ +#endif + uvn->u_flags |= UVM_VNODE_IOSYNC; + UVM_UNLOCK_AND_WAIT(&uvn->u_nio, &uvn->u_obj.vmobjlock, FALSE, + "uvn_term",0); + simple_lock(&uvn->u_obj.vmobjlock); + } + + /* + * done. now we free the uvn if its reference count is zero + * (true if we are zapping a persisting uvn). however, if we are + * terminating a uvn with active mappings we let it live ... future + * calls down to the vnode layer will fail. + */ + + oldflags = uvn->u_flags; + if (uvn->u_obj.uo_refs) { + + /* + * uvn must live on it is dead-vnode state until all references + * are gone. restore flags. clear CANPERSIST state. + */ + + uvn->u_flags &= ~(UVM_VNODE_DYING|UVM_VNODE_VNISLOCKED| + UVM_VNODE_WANTED|UVM_VNODE_CANPERSIST); + + } else { + + /* + * free the uvn now. note that the VREF reference is already + * gone [it is dropped when we enter the persist state]. + */ + if (uvn->u_flags & UVM_VNODE_IOSYNCWANTED) + panic("uvm_vnp_terminate: io sync wanted bit set"); + + if (uvn->u_flags & UVM_VNODE_WRITEABLE) { + simple_lock(&uvn_wl_lock); + LIST_REMOVE(uvn, u_wlist); + simple_unlock(&uvn_wl_lock); + } + uvn->u_flags = 0; /* uvn is history, clear all bits */ + } + + if (oldflags & UVM_VNODE_WANTED) + wakeup(uvn); /* object lock still held */ + + simple_unlock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist, "<- done", 0, 0, 0, 0); + } /* @@ -260,7 +662,7 @@ uvn_detach(uobj) * => returns TRUE if page's object is still alive, FALSE if we * killed the page's object. if we return TRUE, then we * return with the object locked. 
- * => if (nextpgp != NULL) => we return the next page on the queue, and return + * => if (nextpgp != NULL) => we return pageq.tqe_next here, and return * with the page queues locked [for pagedaemon] * => if (nextpgp == NULL) => we return with page queues unlocked [normal case] * => we kill the uvn if it is not referenced and we are suppose to @@ -272,33 +674,76 @@ uvn_releasepg(pg, nextpgp) struct vm_page *pg; struct vm_page **nextpgp; /* OUT */ { - KASSERT(pg->flags & PG_RELEASED); - + struct uvm_vnode *uvn = (struct uvm_vnode *) pg->uobject; +#ifdef DIAGNOSTIC + if ((pg->flags & PG_RELEASED) == 0) + panic("uvn_releasepg: page not released!"); +#endif + /* * dispose of the page [caller handles PG_WANTED] */ pmap_page_protect(pg, VM_PROT_NONE); uvm_lock_pageq(); if (nextpgp) - *nextpgp = TAILQ_NEXT(pg, pageq); + *nextpgp = pg->pageq.tqe_next; /* next page for daemon */ uvm_pagefree(pg); if (!nextpgp) uvm_unlock_pageq(); + /* + * now see if we need to kill the object + */ + if (uvn->u_flags & UVM_VNODE_RELKILL) { + if (uvn->u_obj.uo_refs) + panic("uvn_releasepg: kill flag set on referenced " + "object!"); + if (uvn->u_obj.uo_npages == 0) { + if (uvn->u_flags & UVM_VNODE_WRITEABLE) { + simple_lock(&uvn_wl_lock); + LIST_REMOVE(uvn, u_wlist); + simple_unlock(&uvn_wl_lock); + } +#ifdef DIAGNOSTIC + if (uvn->u_obj.memq.tqh_first) + panic("uvn_releasepg: pages in object with npages == 0"); +#endif + if (uvn->u_flags & UVM_VNODE_WANTED) + /* still holding object lock */ + wakeup(uvn); + + uvn->u_flags = 0; /* DEAD! */ + simple_unlock(&uvn->u_obj.vmobjlock); + return (FALSE); + } + } return (TRUE); } /* + * NOTE: currently we have to use VOP_READ/VOP_WRITE because they go + * through the buffer cache and allow I/O in any size. These VOPs use + * synchronous i/o. [vs. VOP_STRATEGY which can be async, but doesn't + * go through the buffer cache or allow I/O sizes larger than a + * block]. we will eventually want to change this. + * * issues to consider: + * uvm provides the uvm_aiodesc structure for async i/o management. * there are two tailq's in the uvm. structure... one for pending async * i/o and one for "done" async i/o. to do an async i/o one puts - * a buf on the "pending" list (protected by splbio()), starts the - * i/o and returns 0. when the i/o is done, we expect + * an aiodesc on the "pending" list (protected by splbio()), starts the + * i/o and returns VM_PAGER_PEND. when the i/o is done, we expect * some sort of "i/o done" function to be called (at splbio(), interrupt - * time). this function should remove the buf from the pending list + * time). this function should remove the aiodesc from the pending list * and place it on the "done" list and wakeup the daemon. the daemon * will run at normal spl() and will remove all items from the "done" - * list and call the iodone hook for each done request (see uvm_pager.c). + * list and call the "aiodone" hook for each done request (see uvm_pager.c). + * [in the old vm code, this was done by calling the "put" routine with + * null arguments which made the code harder to read and understand because + * you had one function ("put") doing two things.] + * + * so the current pager needs: + * int uvn_aiodone(struct uvm_aiodesc *) * * => return KERN_SUCCESS (aio finished, free it). otherwise requeue for * later collection. @@ -319,17 +764,15 @@ uvn_releasepg(pg, nextpgp) /* * uvn_flush: flush pages out of a uvm object. * - * => "stop == 0" means flush all pages at or after "start". * => object should be locked by caller. 
we may _unlock_ the object - * if (and only if) we need to clean a page (PGO_CLEANIT), or - * if PGO_SYNCIO is set and there are pages busy. + * if (and only if) we need to clean a page (PGO_CLEANIT). * we return with the object locked. - * => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O). - * thus, a caller might want to unlock higher level resources - * (e.g. vm_map) before calling flush. - * => if neither PGO_CLEANIT nor PGO_SYNCIO is set, then we will neither - * unlock the object nor block. - * => if PGO_ALLPAGES is set, then all pages in the object are valid targets + * => if PGO_CLEANIT is set, we may block (due to I/O). thus, a caller + * might want to unlock higher level resources (e.g. vm_map) + * before calling flush. + * => if PGO_CLEANIT is not set, then we will neither unlock the object + * or block. + * => if PGO_ALLPAGE is set, then all pages in the object are valid targets * for flushing. * => NOTE: we rely on the fact that the object's memq is a TAILQ and * that new pages are inserted on the tail end of the list. thus, @@ -349,9 +792,9 @@ uvn_releasepg(pg, nextpgp) * in, then it can not be dirty (!PG_CLEAN) because no one has * had a chance to modify it yet. if the PG_BUSY page is being * paged out then it means that someone else has already started - * cleaning the page for us (how nice!). in this case, if we + * cleaning the page for us (how nice!). in this case, if we * have syncio specified, then after we make our pass through the - * object we need to wait for the other PG_BUSY pages to clear + * object we need to wait for the other PG_BUSY pages to clear * off (i.e. we need to do an iosync). also note that once a * page is PG_BUSY it must stay in its object until it is un-busyed. * @@ -359,76 +802,53 @@ uvn_releasepg(pg, nextpgp) * we can traverse the pages in an object either by going down the * linked list in "uobj->memq", or we can go over the address range * by page doing hash table lookups for each address. depending - * on how many pages are in the object it may be cheaper to do one + * on how many pages are in the object it may be cheaper to do one * or the other. we set "by_list" to true if we are using memq. * if the cost of a hash lookup was equal to the cost of the list * traversal we could compare the number of pages in the start->stop * range to the total number of pages in the object. however, it * seems that a hash table lookup is more expensive than the linked - * list traversal, so we multiply the number of pages in the + * list traversal, so we multiply the number of pages in the * start->stop range by a penalty which we define below. 
*/ #define UVN_HASH_PENALTY 4 /* XXX: a guess */ -boolean_t +static boolean_t uvn_flush(uobj, start, stop, flags) struct uvm_object *uobj; voff_t start, stop; int flags; { - struct vnode *vp = (struct vnode *)uobj; + struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; struct vm_page *pp, *ppnext, *ptmp; - struct vm_page *pps[256], **ppsp; - int s; + struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp; int npages, result, lcv; - boolean_t retval, need_iosync, by_list, needs_clean, all, wasclean; - boolean_t async = (flags & PGO_SYNCIO) == 0; + boolean_t retval, need_iosync, by_list, needs_clean, all; voff_t curoff; u_short pp_version; UVMHIST_FUNC("uvn_flush"); UVMHIST_CALLED(maphist); - UVMHIST_LOG(maphist, "uobj %p start 0x%x stop 0x%x flags 0x%x", - uobj, start, stop, flags); - KASSERT(flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)); - - if (uobj->uo_npages == 0) { - s = splbio(); - if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL && - (vp->v_bioflag & VBIOONSYNCLIST)) { - vp->v_bioflag &= ~VBIOONSYNCLIST; - LIST_REMOVE(vp, v_synclist); - } - splx(s); - return TRUE; - } - -#ifdef DIAGNOSTIC - if (vp->v_size == VSIZENOTSET) { - printf("uvn_flush: size not set vp %p\n", vp); - vprint("uvn_flush VSIZENOTSET", vp); - flags |= PGO_ALLPAGES; - } -#endif + curoff = 0; /* XXX: shut up gcc */ /* * get init vals and determine how we are going to traverse object */ - if (stop == 0) { - stop = trunc_page(LLONG_MAX); - } - curoff = 0; need_iosync = FALSE; - retval = TRUE; - wasclean = TRUE; + retval = TRUE; /* return value */ if (flags & PGO_ALLPAGES) { all = TRUE; - by_list = TRUE; + by_list = TRUE; /* always go by the list */ } else { start = trunc_page(start); stop = round_page(stop); +#ifdef DEBUG + if (stop > round_page(uvn->u_size)) + printf("uvn_flush: strange, got an out of range " + "flush (fixed)\n"); +#endif all = FALSE; - by_list = (uobj->uo_npages <= + by_list = (uobj->uo_npages <= ((stop - start) >> PAGE_SHIFT) * UVN_HASH_PENALTY); } @@ -450,7 +870,8 @@ uvn_flush(uobj, start, stop, flags) if ((flags & PGO_CLEANIT) != 0 && uobj->pgops->pgo_mk_pcluster != NULL) { if (by_list) { - TAILQ_FOREACH(pp, &uobj->memq, listq) { + for (pp = uobj->memq.tqh_first ; pp != NULL ; + pp = pp->listq.tqe_next) { if (!all && (pp->offset < start || pp->offset >= stop)) continue; @@ -474,39 +895,52 @@ uvn_flush(uobj, start, stop, flags) */ if (by_list) { - pp = TAILQ_FIRST(&uobj->memq); + pp = uobj->memq.tqh_first; } else { curoff = start; pp = uvm_pagelookup(uobj, curoff); } - ppnext = NULL; - ppsp = NULL; - uvm_lock_pageq(); + ppnext = NULL; /* XXX: shut up gcc */ + ppsp = NULL; /* XXX: shut up gcc */ + uvm_lock_pageq(); /* page queues locked */ /* locked: both page queues and uobj */ - for ( ; (by_list && pp != NULL) || - (!by_list && curoff < stop) ; pp = ppnext) { + for ( ; (by_list && pp != NULL) || + (!by_list && curoff < stop) ; pp = ppnext) { + if (by_list) { + + /* + * range check + */ + if (!all && (pp->offset < start || pp->offset >= stop)) { - ppnext = TAILQ_NEXT(pp, listq); + ppnext = pp->listq.tqe_next; continue; } + } else { + + /* + * null check + */ + curoff += PAGE_SIZE; if (pp == NULL) { if (curoff < stop) ppnext = uvm_pagelookup(uobj, curoff); continue; } + } /* * handle case where we do not need to clean page (either * because we are not clean or because page is not dirty or * is busy): - * + * * NOTE: we are allowed to deactivate a non-wired active * PG_BUSY page, but once a PG_BUSY page is on the inactive * queue it must stay put until it is !PG_BUSY (so as not to @@ -515,23 +949,24 @@ 
uvn_flush(uobj, start, stop, flags) if ((flags & PGO_CLEANIT) == 0 || (pp->flags & PG_BUSY) != 0) { needs_clean = FALSE; - if (!async) + if ((pp->flags & PG_BUSY) != 0 && + (flags & (PGO_CLEANIT|PGO_SYNCIO)) == + (PGO_CLEANIT|PGO_SYNCIO)) need_iosync = TRUE; } else { - /* * freeing: nuke all mappings so we can sync * PG_CLEAN bit with no race */ - if ((pp->flags & PG_CLEAN) != 0 && + if ((pp->flags & PG_CLEAN) != 0 && (flags & PGO_FREE) != 0 && - /* XXX ACTIVE|INACTIVE test unnecessary? */ - (pp->pqflags & (PQ_ACTIVE|PQ_INACTIVE)) != 0) + (pp->pqflags & PQ_ACTIVE) != 0) pmap_page_protect(pp, VM_PROT_NONE); if ((pp->flags & PG_CLEAN) != 0 && pmap_is_modified(pp)) pp->flags &= ~(PG_CLEAN); - pp->flags |= PG_CLEANCHK; + pp->flags |= PG_CLEANCHK; /* update "hint" */ + needs_clean = ((pp->flags & PG_CLEAN) == 0); } @@ -539,26 +974,29 @@ uvn_flush(uobj, start, stop, flags) * if we don't need a clean... load ppnext and dispose of pp */ if (!needs_clean) { + /* load ppnext */ if (by_list) - ppnext = TAILQ_NEXT(pp, listq); + ppnext = pp->listq.tqe_next; else { if (curoff < stop) ppnext = uvm_pagelookup(uobj, curoff); } + /* now dispose of pp */ if (flags & PGO_DEACTIVATE) { if ((pp->pqflags & PQ_INACTIVE) == 0 && - (pp->flags & PG_BUSY) == 0 && pp->wire_count == 0) { - pmap_clear_reference(pp); + pmap_page_protect(pp, VM_PROT_NONE); uvm_pagedeactivate(pp); } } else if (flags & PGO_FREE) { if (pp->flags & PG_BUSY) { + /* release busy pages */ pp->flags |= PG_RELEASED; } else { pmap_page_protect(pp, VM_PROT_NONE); + /* removed page from object */ uvm_pagefree(pp); } } @@ -575,23 +1013,23 @@ uvn_flush(uobj, start, stop, flags) * note: locked: uobj and page queues. */ - wasclean = FALSE; pp->flags |= PG_BUSY; /* we 'own' page now */ UVM_PAGE_OWN(pp, "uvn_flush"); pmap_page_protect(pp, VM_PROT_READ); pp_version = pp->version; +ReTry: ppsp = pps; npages = sizeof(pps) / sizeof(struct vm_page *); /* locked: page queues, uobj */ - result = uvm_pager_put(uobj, pp, &ppsp, &npages, - flags | PGO_DOACTCLUST, start, stop); + result = uvm_pager_put(uobj, pp, &ppsp, &npages, + flags | PGO_DOACTCLUST, start, stop); /* unlocked: page queues, uobj */ /* * at this point nothing is locked. if we did an async I/O - * it is remotely possible for the async i/o to complete and - * the page "pp" be freed or what not before we get a chance + * it is remotely possible for the async i/o to complete and + * the page "pp" be freed or what not before we get a chance * to relock the object. in order to detect this, we have * saved the version number of the page in "pp_version". */ @@ -601,10 +1039,33 @@ uvn_flush(uobj, start, stop, flags) uvm_lock_pageq(); /* - * the cleaning operation is now done. finish up. note that - * on error uvm_pager_put drops the cluster for us. - * on success uvm_pager_put returns the cluster to us in - * ppsp/npages. + * VM_PAGER_AGAIN: given the structure of this pager, this + * can only happen when we are doing async I/O and can't + * map the pages into kernel memory (pager_map) due to lack + * of vm space. if this happens we drop back to sync I/O. + */ + + if (result == VM_PAGER_AGAIN) { + /* + * it is unlikely, but page could have been released + * while we had the object lock dropped. we ignore + * this now and retry the I/O. we will detect and + * handle the released page after the syncio I/O + * completes. 
+ */ +#ifdef DIAGNOSTIC + if (flags & PGO_SYNCIO) + panic("uvn_flush: PGO_SYNCIO return 'try again' error (impossible)"); +#endif + flags |= PGO_SYNCIO; + goto ReTry; + } + + /* + * the cleaning operation is now done. finish up. note that + * on error (!OK, !PEND) uvm_pager_put drops the cluster for us. + * if success (OK, PEND) then uvm_pager_put returns the cluster + * to us in ppsp/npages. */ /* @@ -612,29 +1073,34 @@ uvn_flush(uobj, start, stop, flags) * we can move on to the next page. */ - if (result == 0 && async && - (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) { + if (result == VM_PAGER_PEND) { - /* - * no per-page ops: refresh ppnext and continue - */ - if (by_list) { - if (pp->version == pp_version) - ppnext = TAILQ_NEXT(pp, listq); - else - ppnext = TAILQ_FIRST(&uobj->memq); - } else { - if (curoff < stop) - ppnext = uvm_pagelookup(uobj, curoff); + if ((flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) { + /* + * no per-page ops: refresh ppnext and continue + */ + if (by_list) { + if (pp->version == pp_version) + ppnext = pp->listq.tqe_next; + else + /* reset */ + ppnext = uobj->memq.tqh_first; + } else { + if (curoff < stop) + ppnext = uvm_pagelookup(uobj, + curoff); + } + continue; } - continue; + + /* need to do anything here? */ } /* - * need to look at each page of the I/O operation. we defer - * processing "pp" until the last trip through this "for" loop + * need to look at each page of the I/O operation. we defer + * processing "pp" until the last trip through this "for" loop * so that we can load "ppnext" for the main loop after we - * play with the cluster pages [thus the "npages + 1" in the + * play with the cluster pages [thus the "npages + 1" in the * loop below]. */ @@ -654,84 +1120,77 @@ uvn_flush(uobj, start, stop, flags) /* set up next page for outer loop */ if (by_list) { if (pp->version == pp_version) - ppnext = TAILQ_NEXT(pp, listq); + ppnext = pp->listq.tqe_next; else - ppnext = TAILQ_FIRST( - &uobj->memq); + /* reset */ + ppnext = uobj->memq.tqh_first; } else { if (curoff < stop) - ppnext = uvm_pagelookup(uobj, - curoff); + ppnext = uvm_pagelookup(uobj, curoff); } } /* - * verify the page wasn't moved while obj was + * verify the page didn't get moved while obj was * unlocked */ - if (result == 0 && async && ptmp->uobject != uobj) + if (result == VM_PAGER_PEND && ptmp->uobject != uobj) continue; /* * unbusy the page if I/O is done. note that for - * async I/O it is possible that the I/O op + * pending I/O it is possible that the I/O op * finished before we relocked the object (in * which case the page is no longer busy). 
*/ - if (result != 0 || !async) { - if (ptmp->flags & PG_WANTED) { + if (result != VM_PAGER_PEND) { + if (ptmp->flags & PG_WANTED) /* still holding object lock */ wakeup(ptmp); - } + ptmp->flags &= ~(PG_WANTED|PG_BUSY); UVM_PAGE_OWN(ptmp, NULL); if (ptmp->flags & PG_RELEASED) { + + /* pgo_releasepg wants this */ uvm_unlock_pageq(); - if (!uvn_releasepg(ptmp, NULL)) { - UVMHIST_LOG(maphist, - "released %p", - ptmp, 0,0,0); + if (!uvn_releasepg(ptmp, NULL)) return (TRUE); - } - uvm_lock_pageq(); - continue; + + uvm_lock_pageq(); /* relock */ + continue; /* next page */ + } else { - if ((flags & PGO_WEAK) == 0 && - !(result == EIO && - curproc == uvm.pagedaemon_proc)) { - ptmp->flags |= - (PG_CLEAN|PG_CLEANCHK); - if ((flags & PGO_FREE) == 0) { - pmap_clear_modify(ptmp); - } - } + ptmp->flags |= (PG_CLEAN|PG_CLEANCHK); + if ((flags & PGO_FREE) == 0) + pmap_clear_modify(ptmp); } } - + /* * dispose of page */ if (flags & PGO_DEACTIVATE) { if ((pp->pqflags & PQ_INACTIVE) == 0 && - (pp->flags & PG_BUSY) == 0 && pp->wire_count == 0) { - pmap_clear_reference(ptmp); + pmap_page_protect(ptmp, VM_PROT_NONE); uvm_pagedeactivate(ptmp); } + } else if (flags & PGO_FREE) { - if (result == 0 && async) { + if (result == VM_PAGER_PEND) { if ((ptmp->flags & PG_BUSY) != 0) /* signal for i/o done */ ptmp->flags |= PG_RELEASED; } else { - if (result != 0) { + if (result != VM_PAGER_OK) { printf("uvn_flush: obj=%p, " - "offset=0x%llx. error %d\n", + "offset=0x%llx. error " + "during pageout.\n", pp->uobject, - (long long)pp->offset, - result); + (long long)pp->offset); printf("uvn_flush: WARNING: " "changes to page may be " "lost!\n"); @@ -741,38 +1200,31 @@ uvn_flush(uobj, start, stop, flags) uvm_pagefree(ptmp); } } + } /* end of "lcv" for loop */ + } /* end of "pp" for loop */ + /* + * done with pagequeues: unlock + */ uvm_unlock_pageq(); - s = splbio(); - if ((flags & PGO_CLEANIT) && all && wasclean && - LIST_FIRST(&vp->v_dirtyblkhd) == NULL && - (vp->v_bioflag & VBIOONSYNCLIST)) { - vp->v_bioflag &= ~VBIOONSYNCLIST; - LIST_REMOVE(vp, v_synclist); - } - splx(s); - if (need_iosync) { - UVMHIST_LOG(maphist," <<DOING IOSYNC>>",0,0,0,0); - - /* - * XXX this doesn't use the new two-flag scheme, - * but to use that, all i/o initiators will have to change. - */ - s = splbio(); - while (vp->v_numoutput != 0) { - UVMHIST_LOG(ubchist, "waiting for vp %p num %d", - vp, vp->v_numoutput,0,0); + /* + * now wait for all I/O if required. + */ + if (need_iosync) { - vp->v_bioflag |= VBIOWAIT; - UVM_UNLOCK_AND_WAIT(&vp->v_numoutput, - &uobj->vmobjlock, - FALSE, "uvn_flush",0); - simple_lock(&uobj->vmobjlock); + UVMHIST_LOG(maphist," <<DOING IOSYNC>>",0,0,0,0); + while (uvn->u_nio != 0) { + uvn->u_flags |= UVM_VNODE_IOSYNC; + UVM_UNLOCK_AND_WAIT(&uvn->u_nio, &uvn->u_obj.vmobjlock, + FALSE, "uvn_flush",0); + simple_lock(&uvn->u_obj.vmobjlock); } - splx(s); + if (uvn->u_flags & UVM_VNODE_IOSYNCWANTED) + wakeup(&uvn->u_flags); + uvn->u_flags &= ~(UVM_VNODE_IOSYNC|UVM_VNODE_IOSYNCWANTED); } /* return, with object locked! */ @@ -796,31 +1248,46 @@ uvn_cluster(uobj, offset, loffset, hoffset) voff_t offset; voff_t *loffset, *hoffset; /* OUT */ { - struct vnode *vp = (struct vnode *)uobj; - + struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; *loffset = offset; - *hoffset = MIN(offset + MAXBSIZE, round_page(vp->v_size)); + + if (*loffset >= uvn->u_size) + panic("uvn_cluster: offset out of range"); + + /* + * XXX: old pager claims we could use VOP_BMAP to get maxcontig value. 
+ */ + *hoffset = *loffset + MAXBSIZE; + if (*hoffset > round_page(uvn->u_size)) /* past end? */ + *hoffset = round_page(uvn->u_size); + + return; } /* * uvn_put: flush page data to backing store. * + * => prefer map unlocked (not required) * => object must be locked! we will _unlock_ it before starting I/O. * => flags: PGO_SYNCIO -- use sync. I/O * => note: caller must set PG_CLEAN and pmap_clear_modify (if needed) + * => XXX: currently we use VOP_READ/VOP_WRITE which are only sync. + * [thus we never do async i/o! see iodone comment] */ -int +static int uvn_put(uobj, pps, npages, flags) struct uvm_object *uobj; struct vm_page **pps; int npages, flags; { - struct vnode *vp = (struct vnode *)uobj; - int error; + int retval; + + /* note: object locked */ + retval = uvn_io((struct uvm_vnode*)uobj, pps, npages, flags, UIO_WRITE); + /* note: object unlocked */ - error = VOP_PUTPAGES(vp, pps, npages, flags, NULL); - return error; + return(retval); } @@ -834,140 +1301,558 @@ uvn_put(uobj, pps, npages, flags) * => NOTE: offset is the offset of pps[0], _NOT_ pps[centeridx] * => NOTE: caller must check for released pages!! */ - -int + +static int uvn_get(uobj, offset, pps, npagesp, centeridx, access_type, advice, flags) struct uvm_object *uobj; voff_t offset; struct vm_page **pps; /* IN/OUT */ int *npagesp; /* IN (OUT if PGO_LOCKED) */ - int centeridx; + int centeridx, advice, flags; vm_prot_t access_type; - int advice, flags; { - struct vnode *vp = (struct vnode *)uobj; - struct proc *p = curproc; - int error; - UVMHIST_FUNC("uvn_get"); UVMHIST_CALLED(ubchist); - - UVMHIST_LOG(ubchist, "vp %p off 0x%x", vp, (int)offset, 0,0); - error = vn_lock(vp, LK_EXCLUSIVE|LK_RECURSEFAIL|LK_NOWAIT, p); - if (error) { - if (error == EBUSY) - return EAGAIN; - return error; + voff_t current_offset; + struct vm_page *ptmp; + int lcv, result, gotpages; + boolean_t done; + UVMHIST_FUNC("uvn_get"); UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist, "flags=%d", flags,0,0,0); + + /* + * step 1: handled the case where fault data structures are locked. + */ + + if (flags & PGO_LOCKED) { + + /* + * gotpages is the current number of pages we've gotten (which + * we pass back up to caller via *npagesp. + */ + + gotpages = 0; + + /* + * step 1a: get pages that are already resident. only do this + * if the data structures are locked (i.e. the first time + * through). + */ + + done = TRUE; /* be optimistic */ + + for (lcv = 0, current_offset = offset ; lcv < *npagesp ; + lcv++, current_offset += PAGE_SIZE) { + + /* do we care about this page? if not, skip it */ + if (pps[lcv] == PGO_DONTCARE) + continue; + + /* lookup page */ + ptmp = uvm_pagelookup(uobj, current_offset); + + /* to be useful must get a non-busy, non-released pg */ + if (ptmp == NULL || + (ptmp->flags & (PG_BUSY|PG_RELEASED)) != 0) { + if (lcv == centeridx || (flags & PGO_ALLPAGES) + != 0) + done = FALSE; /* need to do a wait or I/O! */ + continue; + } + + /* + * useful page: busy/lock it and plug it in our + * result array + */ + ptmp->flags |= PG_BUSY; /* loan up to caller */ + UVM_PAGE_OWN(ptmp, "uvn_get1"); + pps[lcv] = ptmp; + gotpages++; + + } /* "for" lcv loop */ + + /* + * XXX: given the "advice", should we consider async read-ahead? + * XXX: fault current does deactive of pages behind us. is + * this good (other callers might now). + */ + /* + * XXX: read-ahead currently handled by buffer cache (bread) + * level. + * XXX: no async i/o available. + * XXX: so we don't do anything now. 
+ */ + + /* + * step 1c: now we've either done everything needed or we to + * unlock and do some waiting or I/O. + */ + + *npagesp = gotpages; /* let caller know */ + if (done) + return(VM_PAGER_OK); /* bingo! */ + else + /* EEK! Need to unlock and I/O */ + return(VM_PAGER_UNLOCK); } - error = VOP_GETPAGES(vp, offset, pps, npagesp, centeridx, - access_type, advice, flags); - VOP_UNLOCK(vp, LK_RELEASE, p); - return error; -} + /* + * step 2: get non-resident or busy pages. + * object is locked. data structures are unlocked. + * + * XXX: because we can't do async I/O at this level we get things + * page at a time (otherwise we'd chunk). the VOP_READ() will do + * async-read-ahead for us at a lower level. + */ + + for (lcv = 0, current_offset = offset ; + lcv < *npagesp ; lcv++, current_offset += PAGE_SIZE) { + + /* skip over pages we've already gotten or don't want */ + /* skip over pages we don't _have_ to get */ + if (pps[lcv] != NULL || (lcv != centeridx && + (flags & PGO_ALLPAGES) == 0)) + continue; + + /* + * we have yet to locate the current page (pps[lcv]). we first + * look for a page that is already at the current offset. if + * we fine a page, we check to see if it is busy or released. + * if that is the case, then we sleep on the page until it is + * no longer busy or released and repeat the lookup. if the + * page we found is neither busy nor released, then we busy it + * (so we own it) and plug it into pps[lcv]. this breaks the + * following while loop and indicates we are ready to move on + * to the next page in the "lcv" loop above. + * + * if we exit the while loop with pps[lcv] still set to NULL, + * then it means that we allocated a new busy/fake/clean page + * ptmp in the object and we need to do I/O to fill in the data. + */ + + while (pps[lcv] == NULL) { /* top of "pps" while loop */ + + /* look for a current page */ + ptmp = uvm_pagelookup(uobj, current_offset); + + /* nope? allocate one now (if we can) */ + if (ptmp == NULL) { + + ptmp = uvm_pagealloc(uobj, current_offset, + NULL, 0); + + /* out of RAM? */ + if (ptmp == NULL) { + simple_unlock(&uobj->vmobjlock); + uvm_wait("uvn_getpage"); + simple_lock(&uobj->vmobjlock); + + /* goto top of pps while loop */ + continue; + } + + /* + * got new page ready for I/O. break pps + * while loop. pps[lcv] is still NULL. + */ + break; + } + + /* page is there, see if we need to wait on it */ + if ((ptmp->flags & (PG_BUSY|PG_RELEASED)) != 0) { + ptmp->flags |= PG_WANTED; + UVM_UNLOCK_AND_WAIT(ptmp, + &uobj->vmobjlock, FALSE, "uvn_get",0); + simple_lock(&uobj->vmobjlock); + continue; /* goto top of pps while loop */ + } + + /* + * if we get here then the page has become resident + * and unbusy between steps 1 and 2. we busy it + * now (so we own it) and set pps[lcv] (so that we + * exit the while loop). + */ + ptmp->flags |= PG_BUSY; + UVM_PAGE_OWN(ptmp, "uvn_get2"); + pps[lcv] = ptmp; + } + + /* + * if we own the a valid page at the correct offset, pps[lcv] + * will point to it. nothing more to do except go to the + * next page. + */ + + if (pps[lcv]) + continue; /* next lcv */ + + /* + * we have a "fake/busy/clean" page that we just allocated. do + * I/O to fill it with valid data. note that object must be + * locked going into uvn_io, but will be unlocked afterwards. + */ + + result = uvn_io((struct uvm_vnode *) uobj, &ptmp, 1, + PGO_SYNCIO, UIO_READ); + + /* + * I/O done. object is unlocked (by uvn_io). because we used + * syncio the result can not be PEND or AGAIN. we must relock + * and check for errors. 
+ */ + + /* lock object. check for errors. */ + simple_lock(&uobj->vmobjlock); + if (result != VM_PAGER_OK) { + if (ptmp->flags & PG_WANTED) + /* object lock still held */ + wakeup(ptmp); + + ptmp->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(ptmp, NULL); + uvm_lock_pageq(); + uvm_pagefree(ptmp); + uvm_unlock_pageq(); + simple_unlock(&uobj->vmobjlock); + return(result); + } + + /* + * we got the page! clear the fake flag (indicates valid + * data now in page) and plug into our result array. note + * that page is still busy. + * + * it is the callers job to: + * => check if the page is released + * => unbusy the page + * => activate the page + */ + + ptmp->flags &= ~PG_FAKE; /* data is valid ... */ + pmap_clear_modify(ptmp); /* ... and clean */ + pps[lcv] = ptmp; + + } /* lcv loop */ + + /* + * finally, unlock object and return. + */ + + simple_unlock(&uobj->vmobjlock); + return (VM_PAGER_OK); +} /* - * uvn_findpages: - * return the page for the uobj and offset requested, allocating if needed. - * => uobj must be locked. - * => returned page will be BUSY. + * uvn_io: do I/O to a vnode + * + * => prefer map unlocked (not required) + * => object must be locked! we will _unlock_ it before starting I/O. + * => flags: PGO_SYNCIO -- use sync. I/O + * => XXX: currently we use VOP_READ/VOP_WRITE which are only sync. + * [thus we never do async i/o! see iodone comment] */ -void -uvn_findpages(uobj, offset, npagesp, pps, flags) - struct uvm_object *uobj; - voff_t offset; - int *npagesp; - struct vm_page **pps; - int flags; +static int +uvn_io(uvn, pps, npages, flags, rw) + struct uvm_vnode *uvn; + vm_page_t *pps; + int npages, flags, rw; { - int i, rv, npages; + struct vnode *vn; + struct uio uio; + struct iovec iov; + vaddr_t kva; + off_t file_offset; + int waitf, result, mapinflags; + size_t got, wanted; + UVMHIST_FUNC("uvn_io"); UVMHIST_CALLED(maphist); + + UVMHIST_LOG(maphist, "rw=%d", rw,0,0,0); + + /* + * init values + */ + + waitf = (flags & PGO_SYNCIO) ? M_WAITOK : M_NOWAIT; + vn = (struct vnode *) uvn; + file_offset = pps[0]->offset; + + /* + * check for sync'ing I/O. + */ + + while (uvn->u_flags & UVM_VNODE_IOSYNC) { + if (waitf == M_NOWAIT) { + simple_unlock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist,"<- try again (iosync)",0,0,0,0); + return(VM_PAGER_AGAIN); + } + uvn->u_flags |= UVM_VNODE_IOSYNCWANTED; + UVM_UNLOCK_AND_WAIT(&uvn->u_flags, &uvn->u_obj.vmobjlock, + FALSE, "uvn_iosync",0); + simple_lock(&uvn->u_obj.vmobjlock); + } + + /* + * check size + */ + + if (file_offset >= uvn->u_size) { + simple_unlock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist,"<- BAD (size check)",0,0,0,0); + return(VM_PAGER_BAD); + } + + /* + * first try and map the pages in (without waiting) + */ + + mapinflags = (rw == UIO_READ) ? + UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE; + + kva = uvm_pagermapin(pps, npages, mapinflags); + if (kva == 0 && waitf == M_NOWAIT) { + simple_unlock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(maphist,"<- mapin failed (try again)",0,0,0,0); + return(VM_PAGER_AGAIN); + } + + /* + * ok, now bump u_nio up. at this point we are done with uvn + * and can unlock it. if we still don't have a kva, try again + * (this time with sleep ok). + */ + + uvn->u_nio++; /* we have an I/O in progress! */ + simple_unlock(&uvn->u_obj.vmobjlock); + /* NOTE: object now unlocked */ + if (kva == 0) + kva = uvm_pagermapin(pps, npages, + mapinflags | UVMPAGER_MAPIN_WAITOK); + + /* + * ok, mapped in. 
our pages are PG_BUSY so they are not going to + * get touched (so we can look at "offset" without having to lock + * the object). set up for I/O. + */ + + /* + * fill out uio/iov + */ + + iov.iov_base = (caddr_t) kva; + wanted = npages << PAGE_SHIFT; + if (file_offset + wanted > uvn->u_size) + wanted = uvn->u_size - file_offset; /* XXX: needed? */ + iov.iov_len = wanted; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = file_offset; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_rw = rw; + uio.uio_resid = wanted; + uio.uio_procp = curproc; + + /* + * do the I/O! (XXX: curproc?) + */ + + UVMHIST_LOG(maphist, "calling VOP",0,0,0,0); + + /* + * This process may already have this vnode locked, if we faulted in + * copyin() or copyout() on a region backed by this vnode + * while doing I/O to the vnode. If this is the case, don't + * panic.. instead, return the error to the user. + * + * XXX this is a stopgap to prevent a panic. + * Ideally, this kind of operation *should* work. + */ + result = 0; + if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) + result = vn_lock(vn, LK_EXCLUSIVE | LK_RETRY | LK_RECURSEFAIL, curproc); + + if (result == 0) { + /* NOTE: vnode now locked! */ + + if (rw == UIO_READ) + result = VOP_READ(vn, &uio, 0, curproc->p_ucred); + else + result = VOP_WRITE(vn, &uio, 0, curproc->p_ucred); - rv = 0; - npages = *npagesp; - for (i = 0; i < npages; i++, offset += PAGE_SIZE) { - rv += uvn_findpage(uobj, offset, &pps[i], flags); + if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) + VOP_UNLOCK(vn, 0, curproc); + } + + /* NOTE: vnode now unlocked (unless vnislocked) */ + + UVMHIST_LOG(maphist, "done calling VOP",0,0,0,0); + + /* + * result == unix style errno (0 == OK!) + * + * zero out rest of buffer (if needed) + */ + + if (result == 0) { + got = wanted - uio.uio_resid; + + if (wanted && got == 0) { + result = EIO; /* XXX: error? */ + } else if (got < PAGE_SIZE * npages && rw == UIO_READ) { + memset((void *) (kva + got), 0, + (npages << PAGE_SHIFT) - got); + } } - *npagesp = rv; + + /* + * now remove pager mapping + */ + uvm_pagermapout(kva, npages); + + /* + * now clean up the object (i.e. drop I/O count) + */ + + simple_lock(&uvn->u_obj.vmobjlock); + /* NOTE: object now locked! */ + + uvn->u_nio--; /* I/O DONE! */ + if ((uvn->u_flags & UVM_VNODE_IOSYNC) != 0 && uvn->u_nio == 0) { + wakeup(&uvn->u_nio); + } + simple_unlock(&uvn->u_obj.vmobjlock); + /* NOTE: object now unlocked! */ + + /* + * done! + */ + + UVMHIST_LOG(maphist, "<- done (result %d)", result,0,0,0); + if (result == 0) + return(VM_PAGER_OK); + else + return(VM_PAGER_ERROR); } -static int -uvn_findpage(uobj, offset, pgp, flags) - struct uvm_object *uobj; - voff_t offset; - struct vm_page **pgp; - int flags; +/* + * uvm_vnp_uncache: disable "persisting" in a vnode... when last reference + * is gone we will kill the object (flushing dirty pages back to the vnode + * if needed). + * + * => returns TRUE if there was no uvm_object attached or if there was + * one and we killed it [i.e. if there is no active uvn] + * => called with the vnode VOP_LOCK'd [we will unlock it for I/O, if + * needed] + * + * => XXX: given that we now kill uvn's when a vnode is recycled (without + * having to hold a reference on the vnode) and given a working + * uvm_vnp_sync(), how does that effect the need for this function? + * [XXXCDC: seems like it can die?] + * + * => XXX: this function should DIE once we merge the VM and buffer + * cache. 
+ * + * research shows that this is called in the following places: + * ext2fs_truncate, ffs_truncate, detrunc[msdosfs]: called when vnode + * changes sizes + * ext2fs_write, WRITE [ufs_readwrite], msdosfs_write: called when we + * are written to + * ex2fs_chmod, ufs_chmod: called if VTEXT vnode and the sticky bit + * is off + * ffs_realloccg: when we can't extend the current block and have + * to allocate a new one we call this [XXX: why?] + * nfsrv_rename, rename_files: called when the target filename is there + * and we want to remove it + * nfsrv_remove, sys_unlink: called on file we are removing + * nfsrv_access: if VTEXT and we want WRITE access and we don't uncache + * then return "text busy" + * nfs_open: seems to uncache any file opened with nfs + * vn_writechk: if VTEXT vnode and can't uncache return "text busy" + */ + +boolean_t +uvm_vnp_uncache(vp) + struct vnode *vp; { - struct vm_page *pg; - int s; - UVMHIST_FUNC("uvn_findpage"); UVMHIST_CALLED(ubchist); - UVMHIST_LOG(ubchist, "vp %p off 0x%lx", uobj, offset,0,0); + struct uvm_vnode *uvn = &vp->v_uvm; - if (*pgp != NULL) { - UVMHIST_LOG(ubchist, "dontcare", 0,0,0,0); - return 0; + /* + * lock uvn part of the vnode and check to see if we need to do anything + */ + + simple_lock(&uvn->u_obj.vmobjlock); + if ((uvn->u_flags & UVM_VNODE_VALID) == 0 || + (uvn->u_flags & UVM_VNODE_BLOCKED) != 0) { + simple_unlock(&uvn->u_obj.vmobjlock); + return(TRUE); } - for (;;) { - /* look for an existing page */ - pg = uvm_pagelookup(uobj, offset); - - /* nope? allocate one now */ - if (pg == NULL) { - if (flags & UFP_NOALLOC) { - UVMHIST_LOG(ubchist, "noalloc", 0,0,0,0); - return 0; - } - pg = uvm_pagealloc(uobj, offset, NULL, 0); - if (pg == NULL) { - if (flags & UFP_NOWAIT) { - UVMHIST_LOG(ubchist, "nowait",0,0,0,0); - return 0; - } - simple_unlock(&uobj->vmobjlock); - uvm_wait("uvn_fp1"); - simple_lock(&uobj->vmobjlock); - continue; - } - if (UVM_OBJ_IS_VTEXT(uobj)) { - uvmexp.vtextpages++; - } else { - uvmexp.vnodepages++; - } - s = splbio(); - vhold((struct vnode *)uobj); - splx(s); - UVMHIST_LOG(ubchist, "alloced",0,0,0,0); - break; - } else if (flags & UFP_NOCACHE) { - UVMHIST_LOG(ubchist, "nocache",0,0,0,0); - return 0; - } - /* page is there, see if we need to wait on it */ - if ((pg->flags & (PG_BUSY|PG_RELEASED)) != 0) { - if (flags & UFP_NOWAIT) { - UVMHIST_LOG(ubchist, "nowait",0,0,0,0); - return 0; - } - pg->flags |= PG_WANTED; - UVM_UNLOCK_AND_WAIT(pg, &uobj->vmobjlock, 0, - "uvn_fp2", 0); - simple_lock(&uobj->vmobjlock); - continue; - } + /* + * we have a valid, non-blocked uvn. clear persist flag. + * if uvn is currently active we can return now. + */ - /* skip PG_RDONLY pages if requested */ - if ((flags & UFP_NORDONLY) && (pg->flags & PG_RDONLY)) { - UVMHIST_LOG(ubchist, "nordonly",0,0,0,0); - return 0; - } + uvn->u_flags &= ~UVM_VNODE_CANPERSIST; + if (uvn->u_obj.uo_refs) { + simple_unlock(&uvn->u_obj.vmobjlock); + return(FALSE); + } - /* mark the page BUSY and we're done. */ - pg->flags |= PG_BUSY; - UVM_PAGE_OWN(pg, "uvn_findpage"); - UVMHIST_LOG(ubchist, "found",0,0,0,0); - break; + /* + * uvn is currently persisting! we have to gain a reference to + * it so that we can call uvn_detach to kill the uvn. + */ + + VREF(vp); /* seems ok, even with VOP_LOCK */ + uvn->u_obj.uo_refs++; /* value is now 1 */ + simple_unlock(&uvn->u_obj.vmobjlock); + + +#ifdef DEBUG + /* + * carry over sanity check from old vnode pager: the vnode should + * be VOP_LOCK'd, and we confirm it here. 
+ */ + if (!VOP_ISLOCKED(vp)) { + boolean_t is_ok_anyway = FALSE; +#if defined(NFSCLIENT) + extern int (**nfsv2_vnodeop_p) __P((void *)); + extern int (**spec_nfsv2nodeop_p) __P((void *)); + extern int (**fifo_nfsv2nodeop_p) __P((void *)); + + /* vnode is NOT VOP_LOCKed: some vnode types _never_ lock */ + if (vp->v_op == nfsv2_vnodeop_p || + vp->v_op == spec_nfsv2nodeop_p) { + is_ok_anyway = TRUE; + } + if (vp->v_op == fifo_nfsv2nodeop_p) { + is_ok_anyway = TRUE; + } +#endif /* defined(NFSSERVER) || defined(NFSCLIENT) */ + if (!is_ok_anyway) + panic("uvm_vnp_uncache: vnode not locked!"); } - *pgp = pg; - return 1; +#endif /* DEBUG */ + + /* + * now drop our reference to the vnode. if we have the sole + * reference to the vnode then this will cause it to die [as we + * just cleared the persist flag]. we have to unlock the vnode + * while we are doing this as it may trigger I/O. + * + * XXX: it might be possible for uvn to get reclaimed while we are + * unlocked causing us to return TRUE when we should not. we ignore + * this as a false-positive return value doesn't hurt us. + */ + VOP_UNLOCK(vp, 0, curproc); + uvn_detach(&uvn->u_obj); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curproc); + + /* + * and return... + */ + + return(TRUE); } /* @@ -976,7 +1861,7 @@ uvn_findpage(uobj, offset, pgp, flags) * grow => just update size value * shrink => toss un-needed pages * - * => we assume that the caller has a reference of some sort to the + * => we assume that the caller has a reference of some sort to the * vnode in question so that it will not be yanked out from under * us. * @@ -993,50 +1878,151 @@ uvm_vnp_setsize(vp, newsize) struct vnode *vp; voff_t newsize; { - struct uvm_object *uobj = &vp->v_uobj; - voff_t pgend = round_page(newsize); - UVMHIST_FUNC("uvm_vnp_setsize"); UVMHIST_CALLED(ubchist); - - simple_lock(&uobj->vmobjlock); - - UVMHIST_LOG(ubchist, "old 0x%x new 0x%x", vp->v_size, newsize, 0,0); + struct uvm_vnode *uvn = &vp->v_uvm; /* - * now check if the size has changed: if we shrink we had better - * toss some pages... + * lock uvn and check for valid object, and if valid: do it! */ + simple_lock(&uvn->u_obj.vmobjlock); + if (uvn->u_flags & UVM_VNODE_VALID) { + + /* + * now check if the size has changed: if we shrink we had better + * toss some pages... + */ - if (vp->v_size > pgend && vp->v_size != VSIZENOTSET) { - (void) uvn_flush(uobj, pgend, 0, PGO_FREE); + if (uvn->u_size > newsize) { + (void)uvn_flush(&uvn->u_obj, newsize, + uvn->u_size, PGO_FREE); + } + uvn->u_size = newsize; } - vp->v_size = newsize; - simple_unlock(&uobj->vmobjlock); + simple_unlock(&uvn->u_obj.vmobjlock); + + /* + * done + */ + return; } /* - * uvm_vnp_zerorange: set a range of bytes in a file to zero. + * uvm_vnp_sync: flush all dirty VM pages back to their backing vnodes. + * + * => called from sys_sync with no VM structures locked + * => only one process can do a sync at a time (because the uvn + * structure only has one queue for sync'ing). we ensure this + * by holding the uvn_sync_lock while the sync is in progress. + * other processes attempting a sync will sleep on this lock + * until we are done. */ void -uvm_vnp_zerorange(vp, off, len) - struct vnode *vp; - off_t off; - size_t len; +uvm_vnp_sync(mp) + struct mount *mp; { - void *win; + struct uvm_vnode *uvn; + struct vnode *vp; + boolean_t got_lock; + + /* + * step 1: ensure we are only ones using the uvn_sync_q by locking + * our lock... 
+ */ + lockmgr(&uvn_sync_lock, LK_EXCLUSIVE, NULL, curproc); - /* - * XXXUBC invent kzero() and use it - */ + /* + * step 2: build up a simpleq of uvns of interest based on the + * write list. we gain a reference to uvns of interest. must + * be careful about locking uvn's since we will be holding uvn_wl_lock + * in the body of the loop. + */ + SIMPLEQ_INIT(&uvn_sync_q); + simple_lock(&uvn_wl_lock); + for (uvn = uvn_wlist.lh_first ; uvn != NULL ; + uvn = uvn->u_wlist.le_next) { + + vp = (struct vnode *) uvn; + if (mp && vp->v_mount != mp) + continue; + + /* attempt to gain reference */ + while ((got_lock = simple_lock_try(&uvn->u_obj.vmobjlock)) == + FALSE && + (uvn->u_flags & UVM_VNODE_BLOCKED) == 0) + /* spin */ ; + + /* + * we will exit the loop if either if the following are true: + * - we got the lock [always true if NCPU == 1] + * - we failed to get the lock but noticed the vnode was + * "blocked" -- in this case the vnode must be a dying + * vnode, and since dying vnodes are in the process of + * being flushed out, we can safely skip this one + * + * we want to skip over the vnode if we did not get the lock, + * or if the vnode is already dying (due to the above logic). + * + * note that uvn must already be valid because we found it on + * the wlist (this also means it can't be ALOCK'd). + */ + if (!got_lock || (uvn->u_flags & UVM_VNODE_BLOCKED) != 0) { + if (got_lock) + simple_unlock(&uvn->u_obj.vmobjlock); + continue; /* skip it */ + } + + /* + * gain reference. watch out for persisting uvns (need to + * regain vnode REF). + */ + if (uvn->u_obj.uo_refs == 0) + VREF(vp); + uvn->u_obj.uo_refs++; + simple_unlock(&uvn->u_obj.vmobjlock); + + /* + * got it! + */ + SIMPLEQ_INSERT_HEAD(&uvn_sync_q, uvn, u_syncq); + } + simple_unlock(&uvn_wl_lock); - while (len) { - vsize_t bytelen = len; + /* + * step 3: we now have a list of uvn's that may need cleaning. + * we are holding the uvn_sync_lock, but have dropped the uvn_wl_lock + * (so we can now safely lock uvn's again). + */ - win = ubc_alloc(&vp->v_uobj, off, &bytelen, UBC_WRITE); - memset(win, 0, bytelen); - ubc_release(win, 0); + for (uvn = uvn_sync_q.sqh_first ; uvn ; uvn = uvn->u_syncq.sqe_next) { + simple_lock(&uvn->u_obj.vmobjlock); +#ifdef DEBUG + if (uvn->u_flags & UVM_VNODE_DYING) { + printf("uvm_vnp_sync: dying vnode on sync list\n"); + } +#endif + uvn_flush(&uvn->u_obj, 0, 0, + PGO_CLEANIT|PGO_ALLPAGES|PGO_DOACTCLUST); - off += bytelen; - len -= bytelen; - } + /* + * if we have the only reference and we just cleaned the uvn, + * then we can pull it out of the UVM_VNODE_WRITEABLE state + * thus allowing us to avoid thinking about flushing it again + * on later sync ops. + */ + if (uvn->u_obj.uo_refs == 1 && + (uvn->u_flags & UVM_VNODE_WRITEABLE)) { + LIST_REMOVE(uvn, u_wlist); + uvn->u_flags &= ~UVM_VNODE_WRITEABLE; + } + + simple_unlock(&uvn->u_obj.vmobjlock); + + /* now drop our reference to the uvn */ + uvn_detach(&uvn->u_obj); + } + + /* + * done! release sync lock + */ + lockmgr(&uvn_sync_lock, LK_RELEASE, (void *)0, curproc); } |
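
The sketches below are illustrative userspace analogues of the mechanisms in the restored (pre-UBC) pager code above; none of them is the kernel code itself, and every struct name and callback in them is invented for illustration.

First, the uvn_io() read path: it describes the pager-mapped pages with a single iov/uio, clamps the transfer so it never runs past the vnode's recorded size, and zero-fills whatever a short VOP_READ() leaves uninitialized, so the caller always gets back whole, initialized pages. A minimal sketch of that clamp-and-zero-fill pattern, with an ordinary pread() standing in for VOP_READ() and an arbitrary file and page count:

#include <sys/types.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define PGSZ	4096			/* illustrative page size */

int
main(void)
{
	char *buf;
	off_t file_offset = 0, file_size;	/* start of the page run */
	size_t npages = 4, wanted;
	ssize_t got;
	int fd;

	if ((fd = open("/etc/services", O_RDONLY)) == -1)
		err(1, "open");
	if ((file_size = lseek(fd, 0, SEEK_END)) == -1)
		err(1, "lseek");

	if ((buf = malloc(npages * PGSZ)) == NULL)
		err(1, "malloc");

	/* clamp the transfer so it never runs past end-of-file */
	wanted = npages * PGSZ;
	if ((off_t)wanted > file_size - file_offset)
		wanted = file_size - file_offset;

	if ((got = pread(fd, buf, wanted, file_offset)) == -1)
		err(1, "pread");

	/* a short read must still hand back fully initialized pages */
	if ((size_t)got < npages * PGSZ)
		memset(buf + got, 0, npages * PGSZ - got);

	printf("read %zd bytes, zero-filled %zu\n",
	    got, npages * PGSZ - (size_t)got);

	free(buf);
	close(fd);
	return (0);
}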
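
uvm_vnp_uncache() clears the persist flag under the object lock; if the object is otherwise unreferenced it must first gain a reference of its own so that the teardown (which can sleep and do I/O) runs with the lock dropped. A sketch of that reference dance, assuming a hypothetical struct object with a detach callback and a pthread mutex in place of the kernel's simple_lock:

#include <pthread.h>
#include <stdbool.h>

struct object {
	pthread_mutex_t	 lock;
	int		 refs;		/* active references */
	bool		 persist;	/* keep cached at zero refs? */
	void		(*detach)(struct object *);	/* may sleep, do I/O */
};

/*
 * Returns true if the object is (or is now being) torn down,
 * false if other users still hold references to it.
 */
bool
uncache(struct object *o)
{
	pthread_mutex_lock(&o->lock);
	o->persist = false;
	if (o->refs > 0) {
		/* active users: it will die when the last one detaches */
		pthread_mutex_unlock(&o->lock);
		return (false);
	}

	/* persisting with no users: take a reference so we can kill it */
	o->refs = 1;
	pthread_mutex_unlock(&o->lock);

	/* teardown may block, so run it with the object lock dropped */
	o->detach(o);
	return (true);
}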
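
uvm_vnp_setsize() only has page work to do when a file shrinks: pages beyond the new end-of-file are flushed with PGO_FREE before the recorded size is updated, while growth simply records the new size. A sketch of that rule, with a hypothetical invalidate callback standing in for uvn_flush():

#include <sys/types.h>

struct vobject {
	off_t	 size;		/* cached object size */
	void	(*invalidate)(struct vobject *, off_t, off_t);	/* flush+free */
};

void
setsize(struct vobject *o, off_t newsize)
{
	if (o->size > newsize)		/* shrinking: toss the tail */
		o->invalidate(o, newsize, o->size);
	o->size = newsize;		/* growing needs no page work */
}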
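
Finally, uvm_vnp_sync() is a two-phase walk: it first builds a private queue of referenced objects while holding the writable-list lock (the real code also takes each object's own lock and skips blocked or dying vnodes), then drops that lock before doing the flushes, which may sleep, and afterwards drops the references it took. A sketch of the two-phase pattern using hand-rolled lists and a pthread mutex instead of the kernel's locks; struct vobj and its flush/put callbacks are stand-ins:

#include <pthread.h>
#include <stddef.h>

struct vobj {
	struct vobj	 *next;		/* link on the writable list */
	struct vobj	 *syncnext;	/* link on our private sync list */
	int		  refs;
	void		(*flush)(struct vobj *);	/* may sleep */
	void		(*put)(struct vobj *);		/* drop a reference */
};

static struct vobj *wlist;				/* writable objects */
static pthread_mutex_t wlist_lock = PTHREAD_MUTEX_INITIALIZER;

void
sync_all(void)
{
	struct vobj *v, *next, *syncq = NULL;

	/* phase 1: reference candidates while holding the list lock */
	pthread_mutex_lock(&wlist_lock);
	for (v = wlist; v != NULL; v = v->next) {
		v->refs++;			/* keep it alive for phase 2 */
		v->syncnext = syncq;
		syncq = v;
	}
	pthread_mutex_unlock(&wlist_lock);

	/*
	 * phase 2: the list lock is dropped, so the possibly sleeping
	 * flushes cannot deadlock against list insertions; the references
	 * taken above keep the objects from disappearing underneath us.
	 */
	for (v = syncq; v != NULL; v = next) {
		next = v->syncnext;
		v->flush(v);
		v->put(v);		/* may tear the object down */
	}
}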