author     beck <beck@openbsd.org>  2013-06-11 19:01:20 +0000
committer  beck <beck@openbsd.org>  2013-06-11 19:01:20 +0000
commit     1abf8f944ca0988d2f6a89ce315e27ae764e2a1b (patch)
tree       d5e1eca58e7a306ac14bbbf30d043a5a8878b9fd /sys
parent     Replace more ovbcopy with memmove; swap the src and dst arguments too (diff)
High memory page flipping for the buffer cache.
This change splits the buffer cache free lists into lists of dma reachable buffers and high memory buffers, based on the ranges returned by pmemrange. Buffers move from dma to high memory as they age, but are flipped back to dma reachable memory if IO is needed to/from a high mem buffer. The total number of buffers allocated is now bufcachepercent of both the dma and the high memory regions. This change allows the use of large buffer caches on amd64 machines with more than 4 GB of memory.
ok tedu@ krw@ - testing by many.
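To make the policy concrete, below is a minimal user-space sketch (not the committed kernel code; the names, types, and helpers are hypothetical) of the flipping behaviour described above: clean buffers age from DMA-reachable memory into high memory, and are flipped back down only when IO must be done on them.

    /* Hypothetical sketch of the page-flipping policy, in plain C. */
    #include <stdbool.h>
    #include <stdio.h>

    struct sketch_buf {
            bool dma;       /* backed by DMA-reachable (below 4 GB) pages */
    };

    /*
     * Aging: a clean buffer migrates to high memory to free up DMA pages,
     * standing in for buf_realloc_pages(bp, &high_constraint, ...).
     */
    static void
    sketch_age(struct sketch_buf *b, bool high_mem_present)
    {
            if (high_mem_present && b->dma)
                    b->dma = false;
    }

    /*
     * Before IO: the device needs DMA-reachable pages, so flip the buffer
     * back down, standing in for buf_dma() calling buf_realloc_pages()
     * with dma_constraint.
     */
    static void
    sketch_before_io(struct sketch_buf *b)
    {
            if (!b->dma)
                    b->dma = true;
    }

    int
    main(void)
    {
            struct sketch_buf b = { .dma = true };

            sketch_age(&b, true);           /* buffer ages into high memory */
            sketch_before_io(&b);           /* IO forces it back to the DMA range */
            printf("dma reachable: %s\n", b.dma ? "yes" : "no");
            return 0;
    }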
Diffstat (limited to 'sys')
-rw-r--r--  sys/kern/kern_sysctl.c  |  13
-rw-r--r--  sys/kern/spec_vnops.c   |   6
-rw-r--r--  sys/kern/vfs_bio.c      | 364
-rw-r--r--  sys/kern/vfs_biomem.c   |  82
-rw-r--r--  sys/kern/vfs_vops.c     |   7
-rw-r--r--  sys/sys/buf.h           |  10
-rw-r--r--  sys/sys/mount.h         |   3
-rw-r--r--  sys/uvm/uvm_extern.h    |   6
-rw-r--r--  sys/uvm/uvm_page.c      |  24
9 files changed, 384 insertions, 131 deletions
diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
index 9f1a51d8dad..094720abbea 100644
--- a/sys/kern/kern_sysctl.c
+++ b/sys/kern/kern_sysctl.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: kern_sysctl.c,v 1.236 2013/06/09 13:10:19 miod Exp $ */
+/* $OpenBSD: kern_sysctl.c,v 1.237 2013/06/11 19:01:20 beck Exp $ */
/* $NetBSD: kern_sysctl.c,v 1.17 1996/05/20 17:49:05 mrg Exp $ */
/*-
@@ -110,6 +110,7 @@ extern struct disklist_head disklist;
extern fixpt_t ccpu;
extern long numvnodes;
extern u_int mcllivelocks;
+extern psize_t b_dmapages_total, b_highpages_total, b_dmamaxpages;
extern void nmbclust_update(void);
@@ -566,8 +567,8 @@ kern_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
return (sysctl_cptime2(name + 1, namelen -1, oldp, oldlenp,
newp, newlen));
case KERN_CACHEPCT: {
- u_int64_t dmapages;
- int opct, pgs;
+ psize_t pgs;
+ int opct;
opct = bufcachepercent;
error = sysctl_int(oldp, oldlenp, newp, newlen,
&bufcachepercent);
@@ -577,9 +578,11 @@ kern_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
bufcachepercent = opct;
return (EINVAL);
}
- dmapages = uvm_pagecount(&dma_constraint);
if (bufcachepercent != opct) {
- pgs = bufcachepercent * dmapages / 100;
+ pgs = (b_highpages_total + b_dmapages_total)
+ * bufcachepercent / 100;
+ b_dmamaxpages = b_dmapages_total * bufcachepercent
+ / 100;
bufadjust(pgs); /* adjust bufpages */
bufhighpages = bufpages; /* set high water mark */
}
diff --git a/sys/kern/spec_vnops.c b/sys/kern/spec_vnops.c
index 2bc2e43711b..d2bc271cfb0 100644
--- a/sys/kern/spec_vnops.c
+++ b/sys/kern/spec_vnops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: spec_vnops.c,v 1.72 2013/06/11 16:42:16 deraadt Exp $ */
+/* $OpenBSD: spec_vnops.c,v 1.73 2013/06/11 19:01:20 beck Exp $ */
/* $NetBSD: spec_vnops.c,v 1.29 1996/04/22 01:42:38 christos Exp $ */
/*
@@ -457,7 +457,9 @@ spec_strategy(void *v)
struct vop_strategy_args *ap = v;
struct buf *bp = ap->a_bp;
int maj = major(bp->b_dev);
-
+
+ if (!ISSET(bp->b_flags, B_DMA) && ISSET(bp->b_flags, B_BC))
+ panic("bogus buf %p passed to spec_strategy", bp);
if (LIST_FIRST(&bp->b_dep) != NULL)
buf_start(bp);
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 943b6f4c80c..53c3ca0b495 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1,7 +1,8 @@
-/* $OpenBSD: vfs_bio.c,v 1.147 2013/06/11 16:42:16 deraadt Exp $ */
+/* $OpenBSD: vfs_bio.c,v 1.148 2013/06/11 19:01:20 beck Exp $ */
/* $NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $ */
/*
+ * Copyright (c) 2012,2013 Bob Beck <beck@openbsd.org>
* Copyright (c) 1994 Christopher G. Demetriou
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
@@ -63,12 +64,17 @@
/*
* Definitions for the buffer free lists.
*/
-#define BQUEUES 2 /* number of free buffer queues */
+#define BQUEUES 3 /* number of free buffer queues */
#define BQ_DIRTY 0 /* LRU queue with dirty buffers */
-#define BQ_CLEAN 1 /* LRU queue with clean buffers */
+#define BQ_CLEANL 1 /* LRU queue with clean low buffers */
+#define BQ_CLEANH 2 /* LRU queue with clean high buffers */
TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
+int bfreeclean(int, struct bqueues *);
+struct uvm_constraint_range high_constraint;
+psize_t b_dmapages_total, b_highpages_total, b_dmamaxpages;
+int needda;
int nobuffers;
int needbuffer;
struct bio_ops bioops;
@@ -110,30 +116,49 @@ bremfree(struct buf *bp)
struct bqueues *dp = NULL;
splassert(IPL_BIO);
+ KASSERT(ISSET(bp->b_flags, B_BC));
+ KASSERT(!ISSET(bp->b_flags, B_BUSY));
+ if (bp->b_freelist.tqe_next == NOLIST ||
+ bp->b_freelist.tqe_next == (void *)-1)
+ panic("bremfree: - buf %p not on a free list!", bp);
- /*
- * We only calculate the head of the freelist when removing
- * the last element of the list as that is the only time that
- * it is needed (e.g. to reset the tail pointer).
- *
- * NB: This makes an assumption about how tailq's are implemented.
- */
- if (TAILQ_NEXT(bp, b_freelist) == NULL) {
- for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
- if (dp->tqh_last == &TAILQ_NEXT(bp, b_freelist))
- break;
- if (dp == &bufqueues[BQUEUES])
- panic("bremfree: lost tail");
- }
if (!ISSET(bp->b_flags, B_DELWRI)) {
+ if (ISSET(bp->b_flags, B_DMA))
+ dp = &bufqueues[BQ_CLEANL];
+ else
+ dp = &bufqueues[BQ_CLEANH];
bcstats.numcleanpages -= atop(bp->b_bufsize);
} else {
+ dp = &bufqueues[BQ_DIRTY];
bcstats.numdirtypages -= atop(bp->b_bufsize);
bcstats.delwribufs--;
}
TAILQ_REMOVE(dp, bp, b_freelist);
}
+int
+bfreeclean(int npages, struct bqueues *dp)
+{
+ struct buf *bp;
+ int i = 0;
+
+ splassert(IPL_BIO);
+ while (i < npages) {
+ bp = TAILQ_FIRST(dp);
+ if (bp == NULL)
+ return(-1);
+ i += atop(bp->b_bufsize);
+ bremfree(bp);
+ if (bp->b_vp) {
+ RB_REMOVE(buf_rb_bufs,
+ &bp->b_vp->v_bufs_tree, bp);
+ brelvp(bp);
+ }
+ buf_put(bp);
+ }
+ return(0);
+}
+
void
buf_put(struct buf *bp)
{
@@ -158,7 +183,7 @@ buf_put(struct buf *bp)
bcstats.numbufs--;
if (buf_dealloc_mem(bp) != 0)
- return;
+ return;
pool_put(&bufpool, bp);
}
@@ -168,12 +193,21 @@ buf_put(struct buf *bp)
void
bufinit(void)
{
- u_int64_t dmapages;
struct bqueues *dp;
- dmapages = uvm_pagecount(&dma_constraint);
- /* take away a guess at how much of this the kernel will consume */
- dmapages -= (atop(physmem) - atop(uvmexp.free));
+ /* How much DMA accessible memory will we consider? */
+ b_dmapages_total = uvm_pagecount(&dma_constraint);
+ /* Take away a guess at how much of this the kernel will consume. */
+ b_dmapages_total -= (atop(physmem) - atop(uvmexp.free));
+
+ /* See if we have memory above the dma accessible region. */
+ high_constraint.ucr_low = dma_constraint.ucr_high;
+ high_constraint.ucr_high = no_constraint.ucr_high;
+ if (high_constraint.ucr_low != high_constraint.ucr_high) {
+ high_constraint.ucr_low++;
+ b_highpages_total = uvm_pagecount(&high_constraint);
+ } else
+ b_highpages_total = 0;
/*
* If MD code doesn't say otherwise, use up to 10% of DMA'able
@@ -189,18 +223,18 @@ bufinit(void)
KASSERT(bufcachepercent <= 90);
KASSERT(bufcachepercent >= 5);
if (bufpages == 0)
- bufpages = dmapages * bufcachepercent / 100;
+ bufpages = (b_dmapages_total + b_highpages_total)
+ * bufcachepercent / 100;
if (bufpages < BCACHE_MIN)
bufpages = BCACHE_MIN;
- KASSERT(bufpages < dmapages);
bufhighpages = bufpages;
-
+ b_dmamaxpages = b_dmapages_total * bufcachepercent / 100;
/*
* Set the base backoff level for the buffer cache. We will
* not allow uvm to steal back more than this number of pages.
*/
- buflowpages = dmapages * 5 / 100;
+ buflowpages = b_dmapages_total * 5 / 100;
if (buflowpages < BCACHE_MIN)
buflowpages = BCACHE_MIN;
@@ -267,7 +301,6 @@ bufinit(void)
void
bufadjust(int newbufpages)
{
- struct buf *bp;
int s, growing = 0;
if (newbufpages < buflowpages)
@@ -290,15 +323,11 @@ bufadjust(int newbufpages)
* If we have more buffers allocated than our new low water mark,
* immediately free them.
*/
- while (!growing && (bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) &&
- (bcstats.numbufpages > lopages)) {
- bremfree(bp);
- if (bp->b_vp) {
- RB_REMOVE(buf_rb_bufs,
- &bp->b_vp->v_bufs_tree, bp);
- brelvp(bp);
- }
- buf_put(bp);
+ if (!growing && (bcstats.numbufpages > lopages)) {
+ if (bfreeclean(bcstats.numbufpages - lopages,
+ &bufqueues[BQ_CLEANH]) != 0)
+ (void) bfreeclean(bcstats.numbufpages - lopages,
+ &bufqueues[BQ_CLEANL]);
}
/*
@@ -321,8 +350,10 @@ bufbackoff(struct uvm_constraint_range *range, long size)
/*
* Back off "size" buffer cache pages. Called by the page
* daemon to consume buffer cache pages rather than scanning.
+ * Also called by the buffer cache to back off if memory
+ * allocation in a particular range fails.
*
- * It returns 0 to the pagedaemon to indicate that it has
+ * It returns 0 to the caller to indicate that it has
* succeeded in freeing enough pages. It returns -1 to
* indicate that it could not and the pagedaemon should take
* other measures.
@@ -340,8 +371,23 @@ bufbackoff(struct uvm_constraint_range *range, long size)
return(-1);
if (bufpages - pdelta < buflowpages)
pdelta = bufpages - buflowpages;
+
oldbufpages = bufpages;
- bufadjust(bufpages - pdelta);
+ if (b_highpages_total
+ && (range->ucr_high <= dma_constraint.ucr_high)) {
+ /*
+ * Free up DMA accessible memory by moving pages to
+ * the high range.
+ */
+ if (bufhigh(pdelta) == 0)
+ return(0); /* we moved enough pages up high */
+ else {
+ bufadjust(bufpages - pdelta); /* shrink the cache. */
+ }
+ } else {
+ /* Free memory by shrinking the cache. */
+ bufadjust(bufpages - pdelta);
+ }
if (oldbufpages - bufpages < size)
return (-1); /* we did not free what we were asked */
else
@@ -526,12 +572,18 @@ bread_cluster(struct vnode *vp, daddr_t blkno, int size, struct buf **rbpp)
for (i = 1; i < howmany; i++) {
bcstats.pendingreads++;
bcstats.numreads++;
- SET(xbpp[i]->b_flags, B_READ | B_ASYNC);
+ /*
+ * We set B_DMA here because bp above will be B_DMA,
+ * and we are playing buffer slice-n-dice games from
+ * the memory allocated in bp.
+ */
+ SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC);
xbpp[i]->b_blkno = sblkno + (i * inc);
xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
xbpp[i]->b_data = NULL;
xbpp[i]->b_pobj = bp->b_pobj;
xbpp[i]->b_poffs = bp->b_poffs + (i * size);
+ buf_dma(xbpp[i]);
}
KASSERT(bp->b_lblkno == blkno + 1);
@@ -760,8 +812,11 @@ brelse(struct buf *bp)
if (ISSET(bp->b_flags, B_INVAL)) {
/*
- * If the buffer is invalid, place it in the clean queue, so it
- * can be reused.
+ * If the buffer is invalid, free it now rather than
+ * putting it on any queue and wasting cache space.
+ *
+ * XXX we could queue it here for a later TRIM operation.
+ *
*/
if (LIST_FIRST(&bp->b_dep) != NULL)
buf_deallocate(bp);
@@ -778,44 +833,35 @@ brelse(struct buf *bp)
bp->b_vp = NULL;
/*
- * If the buffer has no associated data, place it back in the
- * pool.
+ * Wake up any processes waiting for _this_ buffer to
+ * become free. They are not allowed to grab it
+ * since it will be freed. But the only sleeper is
+ * getblk and it's restarting the operation after
+ * sleep.
*/
- if (bp->b_data == NULL && bp->b_pobj == NULL) {
- /*
- * Wake up any processes waiting for _this_ buffer to
- * become free. They are not allowed to grab it
- * since it will be freed. But the only sleeper is
- * getblk and it's restarting the operation after
- * sleep.
- */
- if (ISSET(bp->b_flags, B_WANTED)) {
- CLR(bp->b_flags, B_WANTED);
- wakeup(bp);
- }
- if (bp->b_vp != NULL)
- RB_REMOVE(buf_rb_bufs,
- &bp->b_vp->v_bufs_tree, bp);
- buf_put(bp);
- splx(s);
- return;
+ if (ISSET(bp->b_flags, B_WANTED)) {
+ CLR(bp->b_flags, B_WANTED);
+ wakeup(bp);
}
-
- bcstats.numcleanpages += atop(bp->b_bufsize);
- binsheadfree(bp, &bufqueues[BQ_CLEAN]);
+ if (ISSET(bp->b_flags, B_DMA) && needda)
+ wakeup(&needda);
+ buf_put(bp);
} else {
/*
* It has valid data. Put it on the end of the appropriate
* queue, so that it'll stick around for as long as possible.
*/
- if (!ISSET(bp->b_flags, B_DELWRI)) {
- bcstats.numcleanpages += atop(bp->b_bufsize);
- bufq = &bufqueues[BQ_CLEAN];
- } else {
+ if (ISSET(bp->b_flags, B_DELWRI)) {
bcstats.numdirtypages += atop(bp->b_bufsize);
bcstats.delwribufs++;
bufq = &bufqueues[BQ_DIRTY];
+ } else {
+ bcstats.numcleanpages += atop(bp->b_bufsize);
+ if (ISSET(bp->b_flags, B_DMA))
+ bufq = &bufqueues[BQ_CLEANL];
+ else
+ bufq = &bufqueues[BQ_CLEANH];
}
if (ISSET(bp->b_flags, B_AGE)) {
binsheadfree(bp, bufq);
@@ -824,12 +870,20 @@ brelse(struct buf *bp)
binstailfree(bp, bufq);
bp->b_synctime = time_uptime + 300;
}
- }
-
- /* Unlock the buffer. */
- CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED));
- buf_release(bp);
+ /* Unlock the buffer. */
+ CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED));
+ buf_release(bp);
+ if (ISSET(bp->b_flags, B_DMA) && needda) {
+ wakeup(&needda);
+ }
+ /* Wake up any processes waiting for _this_ buffer to
+ * become free. */
+ if (ISSET(bp->b_flags, B_WANTED)) {
+ CLR(bp->b_flags, B_WANTED);
+ wakeup(bp);
+ }
+ }
/* Wake up syncer and cleaner processes waiting for buffers. */
if (nobuffers) {
nobuffers = 0;
@@ -843,12 +897,6 @@ brelse(struct buf *bp)
wakeup(&needbuffer);
}
- /* Wake up any processes waiting for _this_ buffer to become free. */
- if (ISSET(bp->b_flags, B_WANTED)) {
- CLR(bp->b_flags, B_WANTED);
- wakeup(bp);
- }
-
splx(s);
}
@@ -890,16 +938,6 @@ getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
struct buf b;
int s, error;
- /*
- * XXX
- * The following is an inlined version of 'incore()', but with
- * the 'invalid' test moved to after the 'busy' test. It's
- * necessary because there are some cases in which the NFS
- * code sets B_INVAL prior to writing data to the server, but
- * in which the buffers actually contain valid data. In this
- * case, we can't allow the system to allocate a new buffer for
- * the block until the write is finished.
- */
start:
s = splbio();
b.b_lblkno = blkno;
@@ -987,18 +1025,17 @@ buf_get(struct vnode *vp, daddr_t blkno, size_t size)
* free down to the low water mark.
*/
if (bcstats.numbufpages + npages > hipages) {
- while ((bcstats.numbufpages > lopages) &&
- (bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]))) {
- bremfree(bp);
- if (bp->b_vp) {
- RB_REMOVE(buf_rb_bufs,
- &bp->b_vp->v_bufs_tree, bp);
- brelvp(bp);
- }
- buf_put(bp);
- }
+ if (bfreeclean(bcstats.numbufpages - lopages,
+ &bufqueues[BQ_CLEANH]) != 0)
+ (void) bfreeclean(bcstats.numbufpages
+ - lopages, &bufqueues[BQ_CLEANL]);
}
+
+ if (b_highpages_total && bcstats.dmapages + npages >
+ b_dmamaxpages)
+ bufhigh(bcstats.dmapages + npages - b_dmamaxpages);
+
/*
* If we get here, we tried to free the world down
* above, and couldn't get down - Wake the cleaner
@@ -1029,6 +1066,8 @@ buf_get(struct vnode *vp, daddr_t blkno, size_t size)
return (NULL);
}
+ /* Mark buffer as the cache's */
+ SET(bp->b_flags, B_BC);
bp->b_freelist.tqe_next = NOLIST;
bp->b_synctime = time_uptime + 300;
bp->b_dev = NODEV;
@@ -1068,6 +1107,7 @@ buf_get(struct vnode *vp, daddr_t blkno, size_t size)
if (size) {
buf_alloc_pages(bp, round_page(size));
buf_map(bp);
+ buf_dma(bp);
}
splx(s);
@@ -1238,6 +1278,128 @@ biodone(struct buf *bp)
}
}
+/*
+ * Ensure buffer is DMA reachable
+ */
+void
+buf_dma(struct buf *buf)
+{
+ struct buf *b;
+ int s;
+
+start:
+ KASSERT(ISSET(buf->b_flags, B_BC));
+ KASSERT(ISSET(buf->b_flags, B_BUSY));
+ KASSERT(buf->b_pobj != NULL);
+ s = splbio();
+ /*
+ * If we are adding to the queue, and we are not the cleaner or
+ * the syncer, ensure we free down below the max
+ */
+ while (b_highpages_total &&
+ curproc != syncerproc && curproc != cleanerproc &&
+ (!ISSET(buf->b_flags, B_DMA)) &&
+ (bcstats.dmapages > (b_dmamaxpages - atop(buf->b_bufsize)))) {
+ b = TAILQ_FIRST(&bufqueues[BQ_CLEANL]);
+ KASSERT(!ISSET(b->b_flags, B_BUSY));
+ if (b == NULL) {
+ /* no non-busy buffers. */
+ needda++;
+ tsleep(&needda, PRIBIO, "needda", 0);
+ needda--;
+ splx(s);
+ goto start;
+ } else {
+ bremfree(b);
+ buf_acquire_nomap(b);
+ if (buf_realloc_pages(b, &high_constraint,
+ UVM_PLA_NOWAIT) == 0) {
+ /* move the buffer to high memory if we can */
+ if (ISSET(b->b_flags, B_DMA))
+ panic("B_DMA after high flip %p", b);
+ binstailfree(b, &bufqueues[BQ_CLEANH]);
+ buf_release(b);
+ } else {
+ /* otherwise just free the buffer */
+ buf_release(b);
+ if (b->b_vp) {
+ RB_REMOVE(buf_rb_bufs,
+ &b->b_vp->v_bufs_tree, b);
+ brelvp(b);
+ }
+ buf_put(b);
+ }
+ }
+ }
+ if (!ISSET(buf->b_flags, B_DMA)) {
+ /* move buf to dma reachable memory */
+ (void) buf_realloc_pages(buf, &dma_constraint, UVM_PLA_WAITOK);
+ if (!ISSET(buf->b_flags, B_DMA))
+ panic("non-dma buffer after dma move %p\n", buf);
+ }
+ splx(s);
+ return;
+}
+
+/*
+ * Attempt to flip "delta" dma reachable cache pages high. return 0 if we can,
+ * -1 otherwise.
+ */
+int
+bufhigh(int delta)
+{
+ psize_t newdmapages;
+ struct buf *b, *bn;
+ int s;
+ if (!b_highpages_total)
+ return(-1);
+ s = splbio();
+ newdmapages = bcstats.dmapages - delta;
+ b = TAILQ_FIRST(&bufqueues[BQ_CLEANL]);
+ while ((bcstats.dmapages > newdmapages) && (b != NULL)) {
+ while (ISSET(b->b_flags, B_BUSY)) {
+ b = TAILQ_NEXT(b, b_freelist);
+ }
+ if (b != NULL) {
+ bn = TAILQ_NEXT(b, b_freelist);
+ bremfree(b);
+ buf_acquire_nomap(b);
+ moveit:
+ if (buf_realloc_pages(b, &high_constraint,
+ UVM_PLA_NOWAIT) == 0) {
+ /* move the buffer to high memory if we can */
+ if (ISSET(b->b_flags, B_DMA))
+ panic("B_DMA after high flip %p", b);
+ binstailfree(b, &bufqueues[BQ_CLEANH]);
+ buf_release(b);
+ } else {
+ /* free up some high memory and try again. */
+ if (bfreeclean(delta, &bufqueues[BQ_CLEANH])
+ == 0)
+ goto moveit;
+ else {
+ /* otherwise just free the buffer */
+ buf_release(b);
+ if (b->b_vp) {
+ RB_REMOVE(buf_rb_bufs,
+ &b->b_vp->v_bufs_tree, b);
+ brelvp(b);
+ }
+ buf_put(b);
+ }
+ }
+ b = bn;
+ }
+ }
+ wakeup(&needda);
+ splx(s);
+ if (bcstats.dmapages > newdmapages)
+ return(-1);
+ else
+ return(0);
+}
+
+
#ifdef DDB
void bcstats_print(int (*)(const char *, ...) /* __attribute__((__format__(__kprintf__,1,2))) */);
/*
@@ -1252,8 +1414,8 @@ bcstats_print(
bcstats.numbufs, bcstats.busymapped, bcstats.delwribufs);
(*pr)("kvaslots %lld avail kva slots %lld\n",
bcstats.kvaslots, bcstats.kvaslots_avail);
- (*pr)("bufpages %lld, dirtypages %lld\n",
- bcstats.numbufpages, bcstats.numdirtypages);
+ (*pr)("total bufpages %lld, dmapages %lld, dirtypages %lld\n",
+ bcstats.numbufpages, bcstats.dmapages, bcstats.numdirtypages);
(*pr)("pendingreads %lld, pendingwrites %lld\n",
bcstats.pendingreads, bcstats.pendingwrites);
}
diff --git a/sys/kern/vfs_biomem.c b/sys/kern/vfs_biomem.c
index 363686b7ec7..1b68aed642d 100644
--- a/sys/kern/vfs_biomem.c
+++ b/sys/kern/vfs_biomem.c
@@ -1,6 +1,7 @@
-/* $OpenBSD: vfs_biomem.c,v 1.23 2013/01/18 10:07:37 beck Exp $ */
+/* $OpenBSD: vfs_biomem.c,v 1.24 2013/06/11 19:01:20 beck Exp $ */
/*
* Copyright (c) 2007 Artur Grabowski <art@openbsd.org>
+ * Copyright (c) 2012,2013 Bob Beck <beck@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
@@ -267,6 +268,7 @@ void
buf_alloc_pages(struct buf *bp, vsize_t size)
{
voff_t offs;
+ int i;
KASSERT(size == round_page(size));
KASSERT(bp->b_pobj == NULL);
@@ -278,8 +280,18 @@ buf_alloc_pages(struct buf *bp, vsize_t size)
KASSERT(buf_page_offset > 0);
- uvm_pagealloc_multi(buf_object, offs, size, UVM_PLA_WAITOK);
+ do {
+ i = uvm_pagealloc_multi(buf_object, offs, size,
+ UVM_PLA_NOWAIT);
+ if (i == 0)
+ break;
+ } while (bufbackoff(&dma_constraint, 100) == 0);
+ if (i != 0)
+ i = uvm_pagealloc_multi(buf_object, offs, size,
+ UVM_PLA_WAITOK);
bcstats.numbufpages += atop(size);
+ bcstats.dmapages += atop(size);
+ SET(bp->b_flags, B_DMA);
bp->b_pobj = buf_object;
bp->b_poffs = offs;
bp->b_bufsize = size;
@@ -307,10 +319,68 @@ buf_free_pages(struct buf *bp)
pg->wire_count = 0;
uvm_pagefree(pg);
bcstats.numbufpages--;
+ if (ISSET(bp->b_flags, B_DMA))
+ bcstats.dmapages--;
}
+ CLR(bp->b_flags, B_DMA);
}
-/*
- * XXX - it might make sense to make a buf_realloc_pages to avoid
- * bouncing through the free list all the time.
- */
+/* Reallocate a buf into a particular pmem range specified by "where". */
+int
+buf_realloc_pages(struct buf *bp, struct uvm_constraint_range *where,
+ int flags)
+{
+ vaddr_t va;
+ int dma;
+ int i, r;
+ KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
+
+ splassert(IPL_BIO);
+ KASSERT(ISSET(bp->b_flags, B_BUSY));
+ dma = ISSET(bp->b_flags, B_DMA);
+
+ /* if the original buf is mapped, unmap it */
+ if (bp->b_data != NULL) {
+ va = (vaddr_t)bp->b_data;
+ pmap_kremove(va, bp->b_bufsize);
+ pmap_update(pmap_kernel());
+ }
+
+ r = 0;
+ do {
+ r = uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs,
+ bp->b_bufsize, UVM_PLA_NOWAIT, where);
+ if (r == 0)
+ break;
+ } while ((bufbackoff(where, 100) == 0) && (flags & UVM_PLA_WAITOK));
+ if (r != 0 && (! flags & UVM_PLA_NOWAIT))
+ r = uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs,
+ bp->b_bufsize, flags, where);
+
+ /*
+ * do this now, and put it back later when we know where we are
+ */
+ if (dma)
+ bcstats.dmapages -= atop(bp->b_bufsize);
+
+ dma = 1;
+ /* if the original buf was mapped, re-map it */
+ for (i = 0; i < atop(bp->b_bufsize); i++) {
+ struct vm_page *pg = uvm_pagelookup(bp->b_pobj,
+ bp->b_poffs + ptoa(i));
+ KASSERT(pg != NULL);
+ if (!PADDR_IS_DMA_REACHABLE(VM_PAGE_TO_PHYS(pg)))
+ dma = 0;
+ if (bp->b_data != NULL) {
+ pmap_kenter_pa(va + ptoa(i), VM_PAGE_TO_PHYS(pg),
+ VM_PROT_READ|VM_PROT_WRITE);
+ pmap_update(pmap_kernel());
+ }
+ }
+ if (dma) {
+ SET(bp->b_flags, B_DMA);
+ bcstats.dmapages += atop(bp->b_bufsize);
+ } else
+ CLR(bp->b_flags, B_DMA);
+ return(r);
+}
diff --git a/sys/kern/vfs_vops.c b/sys/kern/vfs_vops.c
index 58daeeda5e8..16d3b2db6ce 100644
--- a/sys/kern/vfs_vops.c
+++ b/sys/kern/vfs_vops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vfs_vops.c,v 1.6 2013/06/11 16:42:16 deraadt Exp $ */
+/* $OpenBSD: vfs_vops.c,v 1.7 2013/06/11 19:01:20 beck Exp $ */
/*
* Copyright (c) 2010 Thordur I. Bjornsson <thib@openbsd.org>
*
@@ -633,6 +633,11 @@ VOP_STRATEGY(struct buf *bp)
if (bp->b_vp->v_op->vop_strategy == NULL)
return (EOPNOTSUPP);
+ /*
+ * Flip buffer to dma reachable memory if necessary.
+ */
+ if (ISSET(bp->b_flags, B_BC))
+ buf_dma(bp);
return ((bp->b_vp->v_op->vop_strategy)(&a));
}
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index ea25a47a078..a036927e742 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: buf.h,v 1.85 2013/06/11 16:42:17 deraadt Exp $ */
+/* $OpenBSD: buf.h,v 1.86 2013/06/11 19:01:20 beck Exp $ */
/* $NetBSD: buf.h,v 1.25 1997/04/09 21:12:17 mycroft Exp $ */
/*
@@ -234,12 +234,14 @@ struct buf {
#define B_SCANNED 0x00100000 /* Block already pushed during sync */
#define B_PDAEMON 0x00200000 /* I/O started by pagedaemon */
#define B_RELEASED 0x00400000 /* free this buffer after its kvm */
+#define B_BC 0x00800000 /* Managed by the Buffer Cache. */
+#define B_DMA 0x01000000 /* DMA reachable. */
#define B_BITS "\20\001AGE\002NEEDCOMMIT\003ASYNC\004BAD\005BUSY" \
"\006CACHE\007CALL\010DELWRI\011DONE\012EINTR\013ERROR" \
"\014INVAL\015NOCACHE\016PHYS\017RAW\020READ" \
"\021WANTED\022WRITEINPROG\023XXX(FORMAT)\024DEFERRED" \
- "\025SCANNED\026DAEMON\027RELEASED"
+ "\025SCANNED\026DAEMON\027RELEASED\030BC\031DMA"
/*
* This structure describes a clustered I/O. It is stored in the b_saveaddr
@@ -305,6 +307,7 @@ void bremfree(struct buf *);
void bufinit(void);
void buf_dirty(struct buf *);
void buf_undirty(struct buf *);
+void buf_dma(struct buf *);
int bwrite(struct buf *);
struct buf *getblk(struct vnode *, daddr_t, int, int, int);
struct buf *geteblk(int);
@@ -328,7 +331,8 @@ int buf_dealloc_mem(struct buf *);
void buf_fix_mapping(struct buf *, vsize_t);
void buf_alloc_pages(struct buf *, vsize_t);
void buf_free_pages(struct buf *);
-
+struct uvm_constraint_range;
+int buf_realloc_pages(struct buf *, struct uvm_constraint_range *, int);
void minphys(struct buf *bp);
int physio(void (*strategy)(struct buf *), dev_t dev, int flags,
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index 977a821e8f3..e8e02549873 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: mount.h,v 1.111 2013/06/03 15:56:01 tedu Exp $ */
+/* $OpenBSD: mount.h,v 1.112 2013/06/11 19:01:20 beck Exp $ */
/* $NetBSD: mount.h,v 1.48 1996/02/18 11:55:47 fvdl Exp $ */
/*
@@ -528,6 +528,7 @@ extern long buflowpages, bufhighpages, bufbackpages;
#define BUFPAGES_INACT (((bcstats.numcleanpages - buflowpages) < 0) ? 0 \
: bcstats.numcleanpages - buflowpages)
extern int bufcachepercent;
+extern int bufhigh(int);
extern void bufadjust(int);
struct uvm_constraint_range;
extern int bufbackoff(struct uvm_constraint_range*, long);
diff --git a/sys/uvm/uvm_extern.h b/sys/uvm/uvm_extern.h
index 55679c9175c..b2f0be0cb18 100644
--- a/sys/uvm/uvm_extern.h
+++ b/sys/uvm/uvm_extern.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: uvm_extern.h,v 1.107 2013/05/23 01:42:59 tedu Exp $ */
+/* $OpenBSD: uvm_extern.h,v 1.108 2013/06/11 19:01:20 beck Exp $ */
/* $NetBSD: uvm_extern.h,v 1.57 2001/03/09 01:02:12 chs Exp $ */
/*
@@ -680,11 +680,11 @@ struct vm_page *uvm_pagealloc(struct uvm_object *,
voff_t, struct vm_anon *, int);
vaddr_t uvm_pagealloc_contig(vaddr_t, vaddr_t,
vaddr_t, vaddr_t);
-void uvm_pagealloc_multi(struct uvm_object *, voff_t,
+int uvm_pagealloc_multi(struct uvm_object *, voff_t,
vsize_t, int);
void uvm_pagerealloc(struct vm_page *,
struct uvm_object *, voff_t);
-void uvm_pagerealloc_multi(struct uvm_object *, voff_t,
+int uvm_pagerealloc_multi(struct uvm_object *, voff_t,
vsize_t, int, struct uvm_constraint_range *);
/* Actually, uvm_page_physload takes PF#s which need their own type */
void uvm_page_physload(paddr_t, paddr_t, paddr_t,
diff --git a/sys/uvm/uvm_page.c b/sys/uvm/uvm_page.c
index 27cd5ad0b1c..b904577a544 100644
--- a/sys/uvm/uvm_page.c
+++ b/sys/uvm/uvm_page.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: uvm_page.c,v 1.125 2013/05/30 16:29:46 tedu Exp $ */
+/* $OpenBSD: uvm_page.c,v 1.126 2013/06/11 19:01:20 beck Exp $ */
/* $NetBSD: uvm_page.c,v 1.44 2000/11/27 08:40:04 chs Exp $ */
/*
@@ -876,19 +876,21 @@ uvm_pglistfree(struct pglist *list)
* interface used by the buffer cache to allocate a buffer at a time.
* The pages are allocated wired in DMA accessible memory
*/
-void
+int
uvm_pagealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size,
int flags)
{
struct pglist plist;
struct vm_page *pg;
- int i;
+ int i, r;
TAILQ_INIT(&plist);
- (void) uvm_pglistalloc(size, dma_constraint.ucr_low,
+ r = uvm_pglistalloc(size, dma_constraint.ucr_low,
dma_constraint.ucr_high, 0, 0, &plist, atop(round_page(size)),
- UVM_PLA_WAITOK);
+ flags);
+ if (r != 0)
+ return(r);
i = 0;
while ((pg = TAILQ_FIRST(&plist)) != NULL) {
pg->wire_count = 1;
@@ -897,6 +899,7 @@ uvm_pagealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size,
TAILQ_REMOVE(&plist, pg, pageq);
uvm_pagealloc_pg(pg, obj, off + ptoa(i++), NULL);
}
+ return(0);
}
/*
@@ -904,21 +907,23 @@ uvm_pagealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size,
* The pages are reallocated wired outside the DMA accessible region.
*
*/
-void
+int
uvm_pagerealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size,
int flags, struct uvm_constraint_range *where)
{
struct pglist plist;
struct vm_page *pg, *tpg;
- int i;
+ int i,r;
voff_t offset;
TAILQ_INIT(&plist);
if (size == 0)
panic("size 0 uvm_pagerealloc");
- (void) uvm_pglistalloc(size, where->ucr_low, where->ucr_high, 0,
- 0, &plist, atop(round_page(size)), UVM_PLA_WAITOK);
+ r = uvm_pglistalloc(size, where->ucr_low, where->ucr_high, 0,
+ 0, &plist, atop(round_page(size)), flags);
+ if (r != 0)
+ return(r);
i = 0;
while((pg = TAILQ_FIRST(&plist)) != NULL) {
offset = off + ptoa(i++);
@@ -931,6 +936,7 @@ uvm_pagerealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size,
uvm_pagefree(tpg);
uvm_pagealloc_pg(pg, obj, offset, NULL);
}
+ return(0);
}
/*