summaryrefslogtreecommitdiffstats
path: root/lib/libsqlite3/src/btree.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/libsqlite3/src/btree.c')
-rw-r--r--lib/libsqlite3/src/btree.c1120
1 files changed, 779 insertions, 341 deletions
diff --git a/lib/libsqlite3/src/btree.c b/lib/libsqlite3/src/btree.c
index 7ea66e0d3be..f9f76c2ebbc 100644
--- a/lib/libsqlite3/src/btree.c
+++ b/lib/libsqlite3/src/btree.c
@@ -1139,6 +1139,11 @@ static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
** end of the page and all free space is collected into one
** big FreeBlk that occurs in between the header and cell
** pointer array and the cell content area.
+**
+** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
+** b-tree page so that there are no freeblocks or fragment bytes, all
+** unused bytes are contained in the unallocated space region, and all
+** cells are packed tightly at the end of the page.
*/
static int defragmentPage(MemPage *pPage){
int i; /* Loop counter */
@@ -1151,6 +1156,7 @@ static int defragmentPage(MemPage *pPage){
int nCell; /* Number of cells on the page */
unsigned char *data; /* The page data */
unsigned char *temp; /* Temp area for cell content */
+ unsigned char *src; /* Source of content */
int iCellFirst; /* First allowable cell index */
int iCellLast; /* Last possible cell index */
@@ -1160,15 +1166,13 @@ static int defragmentPage(MemPage *pPage){
assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
assert( pPage->nOverflow==0 );
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
- temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
- data = pPage->aData;
+ temp = 0;
+ src = data = pPage->aData;
hdr = pPage->hdrOffset;
cellOffset = pPage->cellOffset;
nCell = pPage->nCell;
assert( nCell==get2byte(&data[hdr+3]) );
usableSize = pPage->pBt->usableSize;
- cbrk = get2byte(&data[hdr+5]);
- memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
cbrk = usableSize;
iCellFirst = cellOffset + 2*nCell;
iCellLast = usableSize - 4;
@@ -1187,7 +1191,7 @@ static int defragmentPage(MemPage *pPage){
}
#endif
assert( pc>=iCellFirst && pc<=iCellLast );
- size = cellSizePtr(pPage, &temp[pc]);
+ size = cellSizePtr(pPage, &src[pc]);
cbrk -= size;
#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
if( cbrk<iCellFirst ){
@@ -1201,8 +1205,16 @@ static int defragmentPage(MemPage *pPage){
assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
testcase( cbrk+size==usableSize );
testcase( pc+size==usableSize );
- memcpy(&data[cbrk], &temp[pc], size);
put2byte(pAddr, cbrk);
+ if( temp==0 ){
+ int x;
+ if( cbrk==pc ) continue;
+ temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
+ x = get2byte(&data[hdr+5]);
+ memcpy(&temp[x], &data[x], (cbrk+size) - x);
+ src = temp;
+ }
+ memcpy(&data[cbrk], &src[pc], size);
}
assert( cbrk>=iCellFirst );
put2byte(&data[hdr+5], cbrk);
@@ -1218,6 +1230,69 @@ static int defragmentPage(MemPage *pPage){
}
/*
+** Search the free-list on page pPg for space to store a cell nByte bytes in
+** size. If one can be found, return a pointer to the space and remove it
+** from the free-list.
+**
+** If no suitable space can be found on the free-list, return NULL.
+**
+** This function may detect corruption within pPg. If corruption is
+** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
+**
+** If a slot of at least nByte bytes is found but cannot be used because
+** there are already at least 60 fragmented bytes on the page, return NULL.
+** In this case, if pbDefrag parameter is not NULL, set *pbDefrag to true.
+*/
+static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc, int *pbDefrag){
+ const int hdr = pPg->hdrOffset;
+ u8 * const aData = pPg->aData;
+ int iAddr;
+ int pc;
+ int usableSize = pPg->pBt->usableSize;
+
+ for(iAddr=hdr+1; (pc = get2byte(&aData[iAddr]))>0; iAddr=pc){
+ int size; /* Size of the free slot */
+ /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
+ ** increasing offset. */
+ if( pc>usableSize-4 || pc<iAddr+4 ){
+ *pRc = SQLITE_CORRUPT_BKPT;
+ return 0;
+ }
+ /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
+ ** freeblock form a big-endian integer which is the size of the freeblock
+ ** in bytes, including the 4-byte header. */
+ size = get2byte(&aData[pc+2]);
+ if( size>=nByte ){
+ int x = size - nByte;
+ testcase( x==4 );
+ testcase( x==3 );
+ if( x<4 ){
+ /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
+ ** number of bytes in fragments may not exceed 60. */
+ if( aData[hdr+7]>=60 ){
+ if( pbDefrag ) *pbDefrag = 1;
+ return 0;
+ }
+ /* Remove the slot from the free-list. Update the number of
+ ** fragmented bytes within the page. */
+ memcpy(&aData[iAddr], &aData[pc], 2);
+ aData[hdr+7] += (u8)x;
+ }else if( size+pc > usableSize ){
+ *pRc = SQLITE_CORRUPT_BKPT;
+ return 0;
+ }else{
+ /* The slot remains on the free-list. Reduce its size to account
+ ** for the portion used by the new allocation. */
+ put2byte(&aData[pc+2], x);
+ }
+ return &aData[pc + x];
+ }
+ }
+
+ return 0;
+}
+
+/*
** Allocate nByte bytes of space from within the B-Tree page passed
** as the first argument. Write into *pIdx the index into pPage->aData[]
** of the first byte of allocated space. Return either SQLITE_OK or
@@ -1234,9 +1309,8 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */
u8 * const data = pPage->aData; /* Local cache of pPage->aData */
int top; /* First byte of cell content area */
+ int rc = SQLITE_OK; /* Integer return code */
int gap; /* First byte of gap between cell pointers and cell content */
- int rc; /* Integer return code */
- int usableSize; /* Usable size of the page */
assert( sqlite3PagerIswriteable(pPage->pDbPage) );
assert( pPage->pBt );
@@ -1244,20 +1318,18 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
assert( nByte>=0 ); /* Minimum cell size is 4 */
assert( pPage->nFree>=nByte );
assert( pPage->nOverflow==0 );
- usableSize = pPage->pBt->usableSize;
- assert( nByte < usableSize-8 );
+ assert( nByte < (int)(pPage->pBt->usableSize-8) );
assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
gap = pPage->cellOffset + 2*pPage->nCell;
assert( gap<=65536 );
- top = get2byte(&data[hdr+5]);
- if( gap>top ){
- if( top==0 ){
- top = 65536;
- }else{
- return SQLITE_CORRUPT_BKPT;
- }
- }
+ /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
+ ** and the reserved space is zero (the usual value for reserved space)
+ ** then the cell content offset of an empty page wants to be 65536.
+ ** However, that integer is too large to be stored in a 2-byte unsigned
+ ** integer, so a value of 0 is used in its place. */
+ top = get2byteNotZero(&data[hdr+5]);
+ if( gap>top ) return SQLITE_CORRUPT_BKPT;
/* If there is enough space between gap and top for one more cell pointer
** array entry offset, and if the freelist is not empty, then search the
@@ -1267,33 +1339,14 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
testcase( gap+1==top );
testcase( gap==top );
if( gap+2<=top && (data[hdr+1] || data[hdr+2]) ){
- int pc, addr;
- for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){
- int size; /* Size of the free slot */
- if( pc>usableSize-4 || pc<addr+4 ){
- return SQLITE_CORRUPT_BKPT;
- }
- size = get2byte(&data[pc+2]);
- if( size>=nByte ){
- int x = size - nByte;
- testcase( x==4 );
- testcase( x==3 );
- if( x<4 ){
- if( data[hdr+7]>=60 ) goto defragment_page;
- /* Remove the slot from the free-list. Update the number of
- ** fragmented bytes within the page. */
- memcpy(&data[addr], &data[pc], 2);
- data[hdr+7] += (u8)x;
- }else if( size+pc > usableSize ){
- return SQLITE_CORRUPT_BKPT;
- }else{
- /* The slot remains on the free-list. Reduce its size to account
- ** for the portion used by the new allocation. */
- put2byte(&data[pc+2], x);
- }
- *pIdx = pc + x;
- return SQLITE_OK;
- }
+ int bDefrag = 0;
+ u8 *pSpace = pageFindSlot(pPage, nByte, &rc, &bDefrag);
+ if( rc ) return rc;
+ if( bDefrag ) goto defragment_page;
+ if( pSpace ){
+ assert( pSpace>=data && (pSpace - data)<65536 );
+ *pIdx = (int)(pSpace - data);
+ return SQLITE_OK;
}
}
@@ -1302,8 +1355,8 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
*/
testcase( gap+2+nByte==top );
if( gap+2+nByte>top ){
-defragment_page:
- testcase( pPage->nCell==0 );
+ defragment_page:
+ assert( pPage->nCell>0 || CORRUPT_DB );
rc = defragmentPage(pPage);
if( rc ) return rc;
top = get2byteNotZero(&data[hdr+5]);
@@ -1350,7 +1403,7 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
assert( pPage->pBt!=0 );
assert( sqlite3PagerIswriteable(pPage->pDbPage) );
assert( iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
- assert( iEnd <= pPage->pBt->usableSize );
+ assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
assert( iSize>=4 ); /* Minimum cell size is 4 */
assert( iStart<=iLast );
@@ -1445,18 +1498,32 @@ static int decodeFlags(MemPage *pPage, int flagByte){
pPage->childPtrSize = 4-4*pPage->leaf;
pBt = pPage->pBt;
if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
+ /* EVIDENCE-OF: R-03640-13415 A value of 5 means the page is an interior
+ ** table b-tree page. */
+ assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
+ /* EVIDENCE-OF: R-20501-61796 A value of 13 means the page is a leaf
+ ** table b-tree page. */
+ assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
pPage->intKey = 1;
pPage->intKeyLeaf = pPage->leaf;
pPage->noPayload = !pPage->leaf;
pPage->maxLocal = pBt->maxLeaf;
pPage->minLocal = pBt->minLeaf;
}else if( flagByte==PTF_ZERODATA ){
+ /* EVIDENCE-OF: R-27225-53936 A value of 2 means the page is an interior
+ ** index b-tree page. */
+ assert( (PTF_ZERODATA)==2 );
+ /* EVIDENCE-OF: R-16571-11615 A value of 10 means the page is a leaf
+ ** index b-tree page. */
+ assert( (PTF_ZERODATA|PTF_LEAF)==10 );
pPage->intKey = 0;
pPage->intKeyLeaf = 0;
pPage->noPayload = 0;
pPage->maxLocal = pBt->maxLocal;
pPage->minLocal = pBt->minLocal;
}else{
+ /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
+ ** an error. */
return SQLITE_CORRUPT_BKPT;
}
pPage->max1bytePayload = pBt->max1bytePayload;
@@ -1496,21 +1563,33 @@ static int btreeInitPage(MemPage *pPage){
hdr = pPage->hdrOffset;
data = pPage->aData;
+ /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
+ ** the b-tree page type. */
if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
pPage->maskPage = (u16)(pBt->pageSize - 1);
pPage->nOverflow = 0;
usableSize = pBt->usableSize;
- pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
+ pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;
pPage->aDataEnd = &data[usableSize];
pPage->aCellIdx = &data[cellOffset];
+ /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
+ ** the start of the cell content area. A zero value for this integer is
+ ** interpreted as 65536. */
top = get2byteNotZero(&data[hdr+5]);
+ /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
+ ** number of cells on the page. */
pPage->nCell = get2byte(&data[hdr+3]);
if( pPage->nCell>MX_CELL(pBt) ){
/* To many cells for a single page. The page must be corrupt */
return SQLITE_CORRUPT_BKPT;
}
testcase( pPage->nCell==MX_CELL(pBt) );
+ /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
+ ** possible for a root page of a table that contains no rows) then the
+ ** offset to the cell content area will equal the page size minus the
+ ** bytes of reserved space. */
+ assert( pPage->nCell>0 || top==usableSize || CORRUPT_DB );
/* A malformed database page might cause us to read past the end
** of page when parsing a cell.
@@ -1544,13 +1623,20 @@ static int btreeInitPage(MemPage *pPage){
}
#endif
- /* Compute the total free space on the page */
+ /* Compute the total free space on the page
+ ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
+ ** start of the first freeblock on the page, or is zero if there are no
+ ** freeblocks. */
pc = get2byte(&data[hdr+1]);
- nFree = data[hdr+7] + top;
+ nFree = data[hdr+7] + top; /* Init nFree to non-freeblock free space */
while( pc>0 ){
u16 next, size;
if( pc<iCellFirst || pc>iCellLast ){
- /* Start of free block is off the page */
+ /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
+ ** always be at least one cell before the first freeblock.
+ **
+ ** Or, the freeblock is off the end of the page
+ */
return SQLITE_CORRUPT_BKPT;
}
next = get2byte(&data[pc]);
@@ -1956,6 +2042,9 @@ int sqlite3BtreeOpen(
#ifdef SQLITE_SECURE_DELETE
pBt->btsFlags |= BTS_SECURE_DELETE;
#endif
+ /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
+ ** determined by the 2-byte integer located at an offset of 16 bytes from
+ ** the beginning of the database file. */
pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
|| ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
@@ -1974,6 +2063,9 @@ int sqlite3BtreeOpen(
#endif
nReserve = 0;
}else{
+ /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
+ ** determined by the one-byte unsigned integer found at an offset of 20
+ ** into the database file header. */
nReserve = zDbHeader[20];
pBt->btsFlags |= BTS_PAGESIZE_FIXED;
#ifndef SQLITE_OMIT_AUTOVACUUM
@@ -2483,6 +2575,9 @@ static int lockBtree(BtShared *pBt){
u32 usableSize;
u8 *page1 = pPage1->aData;
rc = SQLITE_NOTADB;
+ /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
+ ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
+ ** 61 74 20 33 00. */
if( memcmp(page1, zMagicHeader, 16)!=0 ){
goto page1_init_failed;
}
@@ -2523,15 +2618,21 @@ static int lockBtree(BtShared *pBt){
}
#endif
- /* The maximum embedded fraction must be exactly 25%. And the minimum
- ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
+ /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
+ ** fractions and the leaf payload fraction values must be 64, 32, and 32.
+ **
** The original design allowed these amounts to vary, but as of
** version 3.6.0, we require them to be fixed.
*/
if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
goto page1_init_failed;
}
+ /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
+ ** determined by the 2-byte integer located at an offset of 16 bytes from
+ ** the beginning of the database file. */
pageSize = (page1[16]<<8) | (page1[17]<<16);
+ /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
+ ** between 512 and 65536 inclusive. */
if( ((pageSize-1)&pageSize)!=0
|| pageSize>SQLITE_MAX_PAGE_SIZE
|| pageSize<=256
@@ -2539,6 +2640,13 @@ static int lockBtree(BtShared *pBt){
goto page1_init_failed;
}
assert( (pageSize & 7)==0 );
+ /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
+ ** integer at offset 20 is the number of bytes of space at the end of
+ ** each page to reserve for extensions.
+ **
+ ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
+ ** determined by the one-byte unsigned integer found at an offset of 20
+ ** into the database file header. */
usableSize = pageSize - page1[20];
if( (u32)pageSize!=pBt->pageSize ){
/* After reading the first page of the database assuming a page size
@@ -2559,6 +2667,9 @@ static int lockBtree(BtShared *pBt){
rc = SQLITE_CORRUPT_BKPT;
goto page1_init_failed;
}
+ /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
+ ** be less than 480. In other words, if the page size is 512, then the
+ ** reserved space size cannot exceed 32. */
if( usableSize<480 ){
goto page1_init_failed;
}
@@ -3439,6 +3550,7 @@ int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
sqlite3BtreeLeave(p);
return rc;
}
+ p->iDataVersion--; /* Compensate for pPager->iDataVersion++; */
pBt->inTransaction = TRANS_READ;
btreeClearHasContent(pBt);
}
@@ -3802,7 +3914,7 @@ int sqlite3BtreeCloseCursor(BtCursor *pCur){
releasePage(pCur->apPage[i]);
}
unlockBtreeIfUnused(pBt);
- sqlite3DbFree(pBtree->db, pCur->aOverflow);
+ sqlite3_free(pCur->aOverflow);
/* sqlite3_free(pCur); */
sqlite3BtreeLeave(pBtree);
}
@@ -4097,6 +4209,7 @@ static int accessPayload(
offset -= pCur->info.nLocal;
}
+
if( rc==SQLITE_OK && amt>0 ){
const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
Pgno nextPage;
@@ -4114,8 +4227,8 @@ static int accessPayload(
if( eOp!=2 && (pCur->curFlags & BTCF_ValidOvfl)==0 ){
int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
if( nOvfl>pCur->nOvflAlloc ){
- Pgno *aNew = (Pgno*)sqlite3DbRealloc(
- pCur->pBtree->db, pCur->aOverflow, nOvfl*2*sizeof(Pgno)
+ Pgno *aNew = (Pgno*)sqlite3Realloc(
+ pCur->aOverflow, nOvfl*2*sizeof(Pgno)
);
if( aNew==0 ){
rc = SQLITE_NOMEM;
@@ -4162,6 +4275,7 @@ static int accessPayload(
*/
assert( eOp!=2 );
assert( pCur->curFlags & BTCF_ValidOvfl );
+ assert( pCur->pBtree->db==pBt->db );
if( pCur->aOverflow[iIdx+1] ){
nextPage = pCur->aOverflow[iIdx+1];
}else{
@@ -5136,6 +5250,8 @@ static int allocateBtreePage(
assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
pPage1 = pBt->pPage1;
mxPage = btreePagecount(pBt);
+ /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
+ ** stores stores the total number of pages on the freelist. */
n = get4byte(&pPage1->aData[36]);
testcase( n==mxPage-1 );
if( n>=mxPage ){
@@ -5182,8 +5298,14 @@ static int allocateBtreePage(
do {
pPrevTrunk = pTrunk;
if( pPrevTrunk ){
+ /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
+ ** is the page number of the next freelist trunk page in the list or
+ ** zero if this is the last freelist trunk page. */
iTrunk = get4byte(&pPrevTrunk->aData[0]);
}else{
+ /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
+ ** stores the page number of the first page of the freelist, or zero if
+ ** the freelist is empty. */
iTrunk = get4byte(&pPage1->aData[32]);
}
testcase( iTrunk==mxPage );
@@ -5198,8 +5320,9 @@ static int allocateBtreePage(
}
assert( pTrunk!=0 );
assert( pTrunk->aData!=0 );
-
- k = get4byte(&pTrunk->aData[4]); /* # of leaves on this trunk page */
+ /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
+ ** is the number of leaf page pointers to follow. */
+ k = get4byte(&pTrunk->aData[4]);
if( k==0 && !searchList ){
/* The trunk has no leaves and the list is not being searched.
** So extract the trunk page itself and use it as the newly
@@ -5517,6 +5640,11 @@ static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
** for now. At some point in the future (once everyone has upgraded
** to 3.6.0 or later) we should consider fixing the conditional above
** to read "usableSize/4-2" instead of "usableSize/4-8".
+ **
+ ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
+ ** avoid using the last six entries in the freelist trunk page array in
+ ** order that database files created by newer versions of SQLite can be
+ ** read by older versions of SQLite.
*/
rc = sqlite3PagerWrite(pTrunk->pDbPage);
if( rc==SQLITE_OK ){
@@ -5868,9 +5996,17 @@ static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
return;
}
pPage->nCell--;
- memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
- put2byte(&data[hdr+3], pPage->nCell);
- pPage->nFree += 2;
+ if( pPage->nCell==0 ){
+ memset(&data[hdr+1], 0, 4);
+ data[hdr+7] = 0;
+ put2byte(&data[hdr+5], pPage->pBt->usableSize);
+ pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
+ - pPage->childPtrSize - 8;
+ }else{
+ memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
+ put2byte(&data[hdr+3], pPage->nCell);
+ pPage->nFree += 2;
+ }
}
/*
@@ -5965,45 +6101,271 @@ static void insertCell(
}
/*
-** Add a list of cells to a page. The page should be initially empty.
-** The cells are guaranteed to fit on the page.
+** Array apCell[] contains pointers to nCell b-tree page cells. The
+** szCell[] array contains the size in bytes of each cell. This function
+** replaces the current contents of page pPg with the contents of the cell
+** array.
+**
+** Some of the cells in apCell[] may currently be stored in pPg. This
+** function works around problems caused by this by making a copy of any
+** such cells before overwriting the page data.
+**
+** The MemPage.nFree field is invalidated by this function. It is the
+** responsibility of the caller to set it correctly.
*/
-static void assemblePage(
- MemPage *pPage, /* The page to be assembled */
- int nCell, /* The number of cells to add to this page */
- u8 **apCell, /* Pointers to cell bodies */
- u16 *aSize /* Sizes of the cells */
+static void rebuildPage(
+ MemPage *pPg, /* Edit this page */
+ int nCell, /* Final number of cells on page */
+ u8 **apCell, /* Array of cells */
+ u16 *szCell /* Array of cell sizes */
){
- int i; /* Loop counter */
- u8 *pCellptr; /* Address of next cell pointer */
- int cellbody; /* Address of next cell body */
- u8 * const data = pPage->aData; /* Pointer to data for pPage */
- const int hdr = pPage->hdrOffset; /* Offset of header on pPage */
- const int nUsable = pPage->pBt->usableSize; /* Usable size of page */
+ const int hdr = pPg->hdrOffset; /* Offset of header on pPg */
+ u8 * const aData = pPg->aData; /* Pointer to data for pPg */
+ const int usableSize = pPg->pBt->usableSize;
+ u8 * const pEnd = &aData[usableSize];
+ int i;
+ u8 *pCellptr = pPg->aCellIdx;
+ u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
+ u8 *pData;
- assert( pPage->nOverflow==0 );
- assert( sqlite3_mutex_held(pPage->pBt->mutex) );
- assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt)
- && (int)MX_CELL(pPage->pBt)<=10921);
- assert( sqlite3PagerIswriteable(pPage->pDbPage) );
+ i = get2byte(&aData[hdr+5]);
+ memcpy(&pTmp[i], &aData[i], usableSize - i);
+
+ pData = pEnd;
+ for(i=0; i<nCell; i++){
+ u8 *pCell = apCell[i];
+ if( pCell>aData && pCell<pEnd ){
+ pCell = &pTmp[pCell - aData];
+ }
+ pData -= szCell[i];
+ memcpy(pData, pCell, szCell[i]);
+ put2byte(pCellptr, (pData - aData));
+ pCellptr += 2;
+ assert( szCell[i]==cellSizePtr(pPg, pCell) );
+ }
+
+ /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
+ pPg->nCell = nCell;
+ pPg->nOverflow = 0;
+
+ put2byte(&aData[hdr+1], 0);
+ put2byte(&aData[hdr+3], pPg->nCell);
+ put2byte(&aData[hdr+5], pData - aData);
+ aData[hdr+7] = 0x00;
+}
+
+/*
+** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
+** contains the size in bytes of each such cell. This function attempts to
+** add the cells stored in the array to page pPg. If it cannot (because
+** the page needs to be defragmented before the cells will fit), non-zero
+** is returned. Otherwise, if the cells are added successfully, zero is
+** returned.
+**
+** Argument pCellptr points to the first entry in the cell-pointer array
+** (part of page pPg) to populate. After cell apCell[0] is written to the
+** page body, a 16-bit offset is written to pCellptr. And so on, for each
+** cell in the array. It is the responsibility of the caller to ensure
+** that it is safe to overwrite this part of the cell-pointer array.
+**
+** When this function is called, *ppData points to the start of the
+** content area on page pPg. If the size of the content area is extended,
+** *ppData is updated to point to the new start of the content area
+** before returning.
+**
+** Finally, argument pBegin points to the byte immediately following the
+** end of the space required by this page for the cell-pointer area (for
+** all cells - not just those inserted by the current call). If the content
+** area must be extended to before this point in order to accomodate all
+** cells in apCell[], then the cells do not fit and non-zero is returned.
+*/
+static int pageInsertArray(
+ MemPage *pPg, /* Page to add cells to */
+ u8 *pBegin, /* End of cell-pointer array */
+ u8 **ppData, /* IN/OUT: Page content -area pointer */
+ u8 *pCellptr, /* Pointer to cell-pointer area */
+ int nCell, /* Number of cells to add to pPg */
+ u8 **apCell, /* Array of cells */
+ u16 *szCell /* Array of cell sizes */
+){
+ int i;
+ u8 *aData = pPg->aData;
+ u8 *pData = *ppData;
+ const int bFreelist = aData[1] || aData[2];
+ assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */
+ for(i=0; i<nCell; i++){
+ int sz = szCell[i];
+ int rc;
+ u8 *pSlot;
+ if( bFreelist==0 || (pSlot = pageFindSlot(pPg, sz, &rc, 0))==0 ){
+ pData -= sz;
+ if( pData<pBegin ) return 1;
+ pSlot = pData;
+ }
+ memcpy(pSlot, apCell[i], sz);
+ put2byte(pCellptr, (pSlot - aData));
+ pCellptr += 2;
+ }
+ *ppData = pData;
+ return 0;
+}
+
+/*
+** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
+** contains the size in bytes of each such cell. This function adds the
+** space associated with each cell in the array that is currently stored
+** within the body of pPg to the pPg free-list. The cell-pointers and other
+** fields of the page are not updated.
+**
+** This function returns the total number of cells added to the free-list.
+*/
+static int pageFreeArray(
+ MemPage *pPg, /* Page to edit */
+ int nCell, /* Cells to delete */
+ u8 **apCell, /* Array of cells */
+ u16 *szCell /* Array of cell sizes */
+){
+ u8 * const aData = pPg->aData;
+ u8 * const pEnd = &aData[pPg->pBt->usableSize];
+ u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
+ int nRet = 0;
+ int i;
+ u8 *pFree = 0;
+ int szFree = 0;
+
+ for(i=0; i<nCell; i++){
+ u8 *pCell = apCell[i];
+ if( pCell>=pStart && pCell<pEnd ){
+ int sz = szCell[i];
+ if( pFree!=(pCell + sz) ){
+ if( pFree ){
+ assert( pFree>aData && (pFree - aData)<65536 );
+ freeSpace(pPg, (u16)(pFree - aData), szFree);
+ }
+ pFree = pCell;
+ szFree = sz;
+ if( pFree+sz>pEnd ) return 0;
+ }else{
+ pFree = pCell;
+ szFree += sz;
+ }
+ nRet++;
+ }
+ }
+ if( pFree ){
+ assert( pFree>aData && (pFree - aData)<65536 );
+ freeSpace(pPg, (u16)(pFree - aData), szFree);
+ }
+ return nRet;
+}
+
+/*
+** apCell[] and szCell[] contains pointers to and sizes of all cells in the
+** pages being balanced. The current page, pPg, has pPg->nCell cells starting
+** with apCell[iOld]. After balancing, this page should hold nNew cells
+** starting at apCell[iNew].
+**
+** This routine makes the necessary adjustments to pPg so that it contains
+** the correct cells after being balanced.
+**
+** The pPg->nFree field is invalid when this function returns. It is the
+** responsibility of the caller to set it correctly.
+*/
+static void editPage(
+ MemPage *pPg, /* Edit this page */
+ int iOld, /* Index of first cell currently on page */
+ int iNew, /* Index of new first cell on page */
+ int nNew, /* Final number of cells on page */
+ u8 **apCell, /* Array of cells */
+ u16 *szCell /* Array of cell sizes */
+){
+ u8 * const aData = pPg->aData;
+ const int hdr = pPg->hdrOffset;
+ u8 *pBegin = &pPg->aCellIdx[nNew * 2];
+ int nCell = pPg->nCell; /* Cells stored on pPg */
+ u8 *pData;
+ u8 *pCellptr;
+ int i;
+ int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
+ int iNewEnd = iNew + nNew;
+
+#ifdef SQLITE_DEBUG
+ u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
+ memcpy(pTmp, aData, pPg->pBt->usableSize);
+#endif
+
+ /* Remove cells from the start and end of the page */
+ if( iOld<iNew ){
+ int nShift = pageFreeArray(
+ pPg, iNew-iOld, &apCell[iOld], &szCell[iOld]
+ );
+ memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
+ nCell -= nShift;
+ }
+ if( iNewEnd < iOldEnd ){
+ nCell -= pageFreeArray(
+ pPg, iOldEnd-iNewEnd, &apCell[iNewEnd], &szCell[iNewEnd]
+ );
+ }
+
+ pData = &aData[get2byteNotZero(&aData[hdr+5])];
+ if( pData<pBegin ) goto editpage_fail;
+
+ /* Add cells to the start of the page */
+ if( iNew<iOld ){
+ int nAdd = MIN(nNew,iOld-iNew);
+ assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
+ pCellptr = pPg->aCellIdx;
+ memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
+ if( pageInsertArray(
+ pPg, pBegin, &pData, pCellptr,
+ nAdd, &apCell[iNew], &szCell[iNew]
+ ) ) goto editpage_fail;
+ nCell += nAdd;
+ }
+
+ /* Add any overflow cells */
+ for(i=0; i<pPg->nOverflow; i++){
+ int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
+ if( iCell>=0 && iCell<nNew ){
+ pCellptr = &pPg->aCellIdx[iCell * 2];
+ memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
+ nCell++;
+ if( pageInsertArray(
+ pPg, pBegin, &pData, pCellptr,
+ 1, &apCell[iCell + iNew], &szCell[iCell + iNew]
+ ) ) goto editpage_fail;
+ }
+ }
+
+ /* Append cells to the end of the page */
+ pCellptr = &pPg->aCellIdx[nCell*2];
+ if( pageInsertArray(
+ pPg, pBegin, &pData, pCellptr,
+ nNew-nCell, &apCell[iNew+nCell], &szCell[iNew+nCell]
+ ) ) goto editpage_fail;
- /* Check that the page has just been zeroed by zeroPage() */
- assert( pPage->nCell==0 );
- assert( get2byteNotZero(&data[hdr+5])==nUsable );
+ pPg->nCell = nNew;
+ pPg->nOverflow = 0;
- pCellptr = &pPage->aCellIdx[nCell*2];
- cellbody = nUsable;
- for(i=nCell-1; i>=0; i--){
- u16 sz = aSize[i];
- pCellptr -= 2;
- cellbody -= sz;
- put2byte(pCellptr, cellbody);
- memcpy(&data[cellbody], apCell[i], sz);
+ put2byte(&aData[hdr+3], pPg->nCell);
+ put2byte(&aData[hdr+5], pData - aData);
+
+#ifdef SQLITE_DEBUG
+ for(i=0; i<nNew && !CORRUPT_DB; i++){
+ u8 *pCell = apCell[i+iNew];
+ int iOff = get2byte(&pPg->aCellIdx[i*2]);
+ if( pCell>=aData && pCell<&aData[pPg->pBt->usableSize] ){
+ pCell = &pTmp[pCell - aData];
+ }
+ assert( 0==memcmp(pCell, &aData[iOff], szCell[i+iNew]) );
}
- put2byte(&data[hdr+3], nCell);
- put2byte(&data[hdr+5], cellbody);
- pPage->nFree -= (nCell*2 + nUsable - cellbody);
- pPage->nCell = (u16)nCell;
+#endif
+
+ return;
+ editpage_fail:
+ /* Unable to edit this page. Rebuild it from scratch instead. */
+ rebuildPage(pPg, nNew, &apCell[iNew], &szCell[iNew]);
}
/*
@@ -6057,7 +6419,7 @@ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
assert( pPage->nOverflow==1 );
/* This error condition is now caught prior to reaching this function */
- if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT;
+ if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT;
/* Allocate a new page. This page will become the right-sibling of
** pPage. Make the parent page writable, so that the new divider cell
@@ -6075,7 +6437,8 @@ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
assert( sqlite3PagerIswriteable(pNew->pDbPage) );
assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
- assemblePage(pNew, 1, &pCell, &szCell);
+ rebuildPage(pNew, 1, &pCell, &szCell);
+ pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
/* If this is an auto-vacuum database, update the pointer map
** with entries for the new page, and any pointer from the
@@ -6294,17 +6657,22 @@ static int balance_nonroot(
int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */
int szScratch; /* Size of scratch memory requested */
MemPage *apOld[NB]; /* pPage and up to two siblings */
- MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
u8 *pRight; /* Location in parent of right-sibling pointer */
u8 *apDiv[NB-1]; /* Divider cells in pParent */
int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
- int szNew[NB+2]; /* Combined size of cells place on i-th page */
+ int cntOld[NB+2]; /* Old index in aCell[] after i-th page */
+ int szNew[NB+2]; /* Combined size of cells placed on i-th page */
u8 **apCell = 0; /* All cells begin balanced */
u16 *szCell; /* Local size of all cells in apCell[] */
u8 *aSpace1; /* Space for copies of dividers cells */
Pgno pgno; /* Temp var to store a page number in */
+ u8 abDone[NB+2]; /* True after i'th new page is populated */
+ Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */
+ Pgno aPgOrder[NB+2]; /* Copy of aPgno[] used for sorting pages */
+ u16 aPgFlags[NB+2]; /* flags field of new pages before shuffling */
+ memset(abDone, 0, sizeof(abDone));
pBt = pParent->pBt;
assert( sqlite3_mutex_held(pBt->mutex) );
assert( sqlite3PagerIswriteable(pParent->pDbPage) );
@@ -6413,12 +6781,14 @@ static int balance_nonroot(
/*
** Allocate space for memory structures
*/
- k = pBt->pageSize + ROUND8(sizeof(MemPage));
szScratch =
nMaxCells*sizeof(u8*) /* apCell */
+ nMaxCells*sizeof(u16) /* szCell */
- + pBt->pageSize /* aSpace1 */
- + k*nOld; /* Page copies (apCopy) */
+ + pBt->pageSize; /* aSpace1 */
+
+ /* EVIDENCE-OF: R-28375-38319 SQLite will never request a scratch buffer
+ ** that is more than 6 times the database page size. */
+ assert( szScratch<=6*(int)pBt->pageSize );
apCell = sqlite3ScratchMalloc( szScratch );
if( apCell==0 ){
rc = SQLITE_NOMEM;
@@ -6431,8 +6801,8 @@ static int balance_nonroot(
/*
** Load pointers to all cells on sibling pages and the divider cells
** into the local apCell[] array. Make copies of the divider cells
- ** into space obtained from aSpace1[] and remove the divider cells
- ** from pParent.
+ ** into space obtained from aSpace1[]. The divider cells have already
+ ** been removed from pParent.
**
** If the siblings are on leaf pages, then the child pointers of the
** divider cells are stripped from the cells before they are copied
@@ -6448,15 +6818,7 @@ static int balance_nonroot(
leafData = apOld[0]->intKeyLeaf;
for(i=0; i<nOld; i++){
int limit;
-
- /* Before doing anything else, take a copy of the i'th original sibling
- ** The rest of this function will use data from the copies rather
- ** that the original pages since the original pages will be in the
- ** process of being overwritten. */
- MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i];
- memcpy(pOld, apOld[i], sizeof(MemPage));
- pOld->aData = (void*)&pOld[1];
- memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize);
+ MemPage *pOld = apOld[i];
limit = pOld->nCell+pOld->nOverflow;
if( pOld->nOverflow>0 ){
@@ -6477,6 +6839,7 @@ static int balance_nonroot(
nCell++;
}
}
+ cntOld[i] = nCell;
if( i<nOld-1 && !leafData){
u16 sz = (u16)szNew[i];
u8 *pTemp;
@@ -6499,7 +6862,11 @@ static int balance_nonroot(
}else{
assert( leafCorrection==4 );
if( szCell[nCell]<4 ){
- /* Do not allow any cells smaller than 4 bytes. */
+ /* Do not allow any cells smaller than 4 bytes. If a smaller cell
+ ** does exist, pad it with 0x00 bytes. */
+ assert( szCell[nCell]==3 );
+ assert( apCell[nCell]==&aSpace1[iSpace1-3] );
+ aSpace1[iSpace1++] = 0x00;
szCell[nCell] = 4;
}
}
@@ -6528,7 +6895,7 @@ static int balance_nonroot(
assert( i<nMaxCells );
subtotal += szCell[i] + 2;
if( subtotal > usableSpace ){
- szNew[k] = subtotal - szCell[i];
+ szNew[k] = subtotal - szCell[i] - 2;
cntNew[k] = i;
if( leafData ){ i--; }
subtotal = 0;
@@ -6542,9 +6909,10 @@ static int balance_nonroot(
/*
** The packing computed by the previous block is biased toward the siblings
- ** on the left side. The left siblings are always nearly full, while the
- ** right-most sibling might be nearly empty. This block of code attempts
- ** to adjust the packing of siblings to get a better balance.
+ ** on the left side (siblings with smaller keys). The left siblings are
+ ** always nearly full, while the right-most sibling might be nearly empty.
+ ** The next block of code attempts to adjust the packing of siblings to
+ ** get a better balance.
**
** This adjustment is more than an optimization. The packing above might
** be so out of balance as to be illegal. For example, the right-most
@@ -6573,22 +6941,18 @@ static int balance_nonroot(
szNew[i-1] = szLeft;
}
- /* Either we found one or more cells (cntnew[0])>0) or pPage is
- ** a virtual root page. A virtual root page is when the real root
- ** page is page 1 and we are the only child of that page.
- **
- ** UPDATE: The assert() below is not necessarily true if the database
- ** file is corrupt. The corruption will be detected and reported later
- ** in this procedure so there is no need to act upon it now.
+ /* Sanity check: For a non-corrupt database file one of the follwing
+ ** must be true:
+ ** (1) We found one or more cells (cntNew[0])>0), or
+ ** (2) pPage is a virtual root page. A virtual root page is when
+ ** the real root page is page 1 and we are the only child of
+ ** that page.
*/
-#if 0
- assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
-#endif
-
- TRACE(("BALANCE: old: %d %d %d ",
- apOld[0]->pgno,
- nOld>=2 ? apOld[1]->pgno : 0,
- nOld>=3 ? apOld[2]->pgno : 0
+ assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
+ TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
+ apOld[0]->pgno, apOld[0]->nCell,
+ nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
+ nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
));
/*
@@ -6611,8 +6975,10 @@ static int balance_nonroot(
assert( i>0 );
rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
if( rc ) goto balance_cleanup;
+ zeroPage(pNew, pageFlags);
apNew[i] = pNew;
nNew++;
+ cntOld[i] = nCell;
/* Set the pointer-map entry for the new sibling page. */
if( ISAUTOVACUUM ){
@@ -6624,135 +6990,247 @@ static int balance_nonroot(
}
}
- /* Free any old pages that were not reused as new pages.
- */
- while( i<nOld ){
- freePage(apOld[i], &rc);
- if( rc ) goto balance_cleanup;
- releasePage(apOld[i]);
- apOld[i] = 0;
- i++;
- }
-
/*
- ** Put the new pages in ascending order. This helps to
- ** keep entries in the disk file in order so that a scan
- ** of the table is a linear scan through the file. That
- ** in turn helps the operating system to deliver pages
- ** from the disk more rapidly.
+ ** Reassign page numbers so that the new pages are in ascending order.
+ ** This helps to keep entries in the disk file in order so that a scan
+ ** of the table is closer to a linear scan through the file. That in turn
+ ** helps the operating system to deliver pages from the disk more rapidly.
**
- ** An O(n^2) insertion sort algorithm is used, but since
- ** n is never more than NB (a small constant), that should
- ** not be a problem.
+ ** An O(n^2) insertion sort algorithm is used, but since n is never more
+ ** than (NB+2) (a small constant), that should not be a problem.
**
- ** When NB==3, this one optimization makes the database
- ** about 25% faster for large insertions and deletions.
+ ** When NB==3, this one optimization makes the database about 25% faster
+ ** for large insertions and deletions.
*/
- for(i=0; i<k-1; i++){
- int minV = apNew[i]->pgno;
- int minI = i;
- for(j=i+1; j<k; j++){
- if( apNew[j]->pgno<(unsigned)minV ){
- minI = j;
- minV = apNew[j]->pgno;
+ for(i=0; i<nNew; i++){
+ aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
+ aPgFlags[i] = apNew[i]->pDbPage->flags;
+ for(j=0; j<i; j++){
+ if( aPgno[j]==aPgno[i] ){
+ /* This branch is taken if the set of sibling pages somehow contains
+ ** duplicate entries. This can happen if the database is corrupt.
+ ** It would be simpler to detect this as part of the loop below, but
+ ** we do the detection here in order to avoid populating the pager
+ ** cache with two separate objects associated with the same
+ ** page number. */
+ assert( CORRUPT_DB );
+ rc = SQLITE_CORRUPT_BKPT;
+ goto balance_cleanup;
}
}
- if( minI>i ){
- MemPage *pT;
- pT = apNew[i];
- apNew[i] = apNew[minI];
- apNew[minI] = pT;
+ }
+ for(i=0; i<nNew; i++){
+ int iBest = 0; /* aPgno[] index of page number to use */
+ for(j=1; j<nNew; j++){
+ if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
+ }
+ pgno = aPgOrder[iBest];
+ aPgOrder[iBest] = 0xffffffff;
+ if( iBest!=i ){
+ if( iBest>i ){
+ sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
+ }
+ sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
+ apNew[i]->pgno = pgno;
}
}
- TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
- apNew[0]->pgno, szNew[0],
+
+ TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
+ "%d(%d nc=%d) %d(%d nc=%d)\n",
+ apNew[0]->pgno, szNew[0], cntNew[0],
nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
+ nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
+ nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
- nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));
+ nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
+ nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
+ nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
+ ));
assert( sqlite3PagerIswriteable(pParent->pDbPage) );
put4byte(pRight, apNew[nNew-1]->pgno);
- /*
- ** Evenly distribute the data in apCell[] across the new pages.
- ** Insert divider cells into pParent as necessary.
+ /* If the sibling pages are not leaves, ensure that the right-child pointer
+ ** of the right-most new sibling page is set to the value that was
+ ** originally in the same field of the right-most old sibling page. */
+ if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
+ MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
+ memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
+ }
+
+ /* Make any required updates to pointer map entries associated with
+ ** cells stored on sibling pages following the balance operation. Pointer
+ ** map entries associated with divider cells are set by the insertCell()
+ ** routine. The associated pointer map entries are:
+ **
+ ** a) if the cell contains a reference to an overflow chain, the
+ ** entry associated with the first page in the overflow chain, and
+ **
+ ** b) if the sibling pages are not leaves, the child page associated
+ ** with the cell.
+ **
+ ** If the sibling pages are not leaves, then the pointer map entry
+ ** associated with the right-child of each sibling may also need to be
+ ** updated. This happens below, after the sibling pages have been
+ ** populated, not here.
*/
- j = 0;
- for(i=0; i<nNew; i++){
- /* Assemble the new sibling page. */
+ if( ISAUTOVACUUM ){
+ MemPage *pNew = apNew[0];
+ u8 *aOld = pNew->aData;
+ int cntOldNext = pNew->nCell + pNew->nOverflow;
+ int usableSize = pBt->usableSize;
+ int iNew = 0;
+ int iOld = 0;
+
+ for(i=0; i<nCell; i++){
+ u8 *pCell = apCell[i];
+ if( i==cntOldNext ){
+ MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];
+ cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
+ aOld = pOld->aData;
+ }
+ if( i==cntNew[iNew] ){
+ pNew = apNew[++iNew];
+ if( !leafData ) continue;
+ }
+
+ /* Cell pCell is destined for new sibling page pNew. Originally, it
+ ** was either part of sibling page iOld (possibly an overflow cell),
+ ** or else the divider cell to the left of sibling page iOld. So,
+ ** if sibling page iOld had the same page number as pNew, and if
+ ** pCell really was a part of sibling page iOld (not a divider or
+ ** overflow cell), we can skip updating the pointer map entries. */
+ if( iOld>=nNew
+ || pNew->pgno!=aPgno[iOld]
+ || pCell<aOld
+ || pCell>=&aOld[usableSize]
+ ){
+ if( !leafCorrection ){
+ ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
+ }
+ if( szCell[i]>pNew->minLocal ){
+ ptrmapPutOvflPtr(pNew, pCell, &rc);
+ }
+ }
+ }
+ }
+
+ /* Insert new divider cells into pParent. */
+ for(i=0; i<nNew-1; i++){
+ u8 *pCell;
+ u8 *pTemp;
+ int sz;
MemPage *pNew = apNew[i];
+ j = cntNew[i];
+
assert( j<nMaxCells );
- zeroPage(pNew, pageFlags);
- assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
- assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
- assert( pNew->nOverflow==0 );
+ pCell = apCell[j];
+ sz = szCell[j] + leafCorrection;
+ pTemp = &aOvflSpace[iOvflSpace];
+ if( !pNew->leaf ){
+ memcpy(&pNew->aData[8], pCell, 4);
+ }else if( leafData ){
+ /* If the tree is a leaf-data tree, and the siblings are leaves,
+ ** then there is no divider cell in apCell[]. Instead, the divider
+ ** cell consists of the integer key for the right-most cell of
+ ** the sibling-page assembled above only.
+ */
+ CellInfo info;
+ j--;
+ btreeParseCellPtr(pNew, apCell[j], &info);
+ pCell = pTemp;
+ sz = 4 + putVarint(&pCell[4], info.nKey);
+ pTemp = 0;
+ }else{
+ pCell -= 4;
+ /* Obscure case for non-leaf-data trees: If the cell at pCell was
+ ** previously stored on a leaf node, and its reported size was 4
+ ** bytes, then it may actually be smaller than this
+ ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
+ ** any cell). But it is important to pass the correct size to
+ ** insertCell(), so reparse the cell now.
+ **
+ ** Note that this can never happen in an SQLite data file, as all
+ ** cells are at least 4 bytes. It only happens in b-trees used
+ ** to evaluate "IN (SELECT ...)" and similar clauses.
+ */
+ if( szCell[j]==4 ){
+ assert(leafCorrection==4);
+ sz = cellSizePtr(pParent, pCell);
+ }
+ }
+ iOvflSpace += sz;
+ assert( sz<=pBt->maxLocal+23 );
+ assert( iOvflSpace <= (int)pBt->pageSize );
+ insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
+ if( rc!=SQLITE_OK ) goto balance_cleanup;
+ assert( sqlite3PagerIswriteable(pParent->pDbPage) );
+ }
- j = cntNew[i];
+ /* Now update the actual sibling pages. The order in which they are updated
+ ** is important, as this code needs to avoid disrupting any page from which
+ ** cells may still to be read. In practice, this means:
+ **
+ ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
+ ** then it is not safe to update page apNew[iPg] until after
+ ** the left-hand sibling apNew[iPg-1] has been updated.
+ **
+ ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
+ ** then it is not safe to update page apNew[iPg] until after
+ ** the right-hand sibling apNew[iPg+1] has been updated.
+ **
+ ** If neither of the above apply, the page is safe to update.
+ **
+ ** The iPg value in the following loop starts at nNew-1 goes down
+ ** to 0, then back up to nNew-1 again, thus making two passes over
+ ** the pages. On the initial downward pass, only condition (1) above
+ ** needs to be tested because (2) will always be true from the previous
+ ** step. On the upward pass, both conditions are always true, so the
+ ** upwards pass simply processes pages that were missed on the downward
+ ** pass.
+ */
+ for(i=1-nNew; i<nNew; i++){
+ int iPg = i<0 ? -i : i;
+ assert( iPg>=0 && iPg<nNew );
+ if( abDone[iPg] ) continue; /* Skip pages already processed */
+ if( i>=0 /* On the upwards pass, or... */
+ || cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */
+ ){
+ int iNew;
+ int iOld;
+ int nNewCell;
- /* If the sibling page assembled above was not the right-most sibling,
- ** insert a divider cell into the parent page.
- */
- assert( i<nNew-1 || j==nCell );
- if( j<nCell ){
- u8 *pCell;
- u8 *pTemp;
- int sz;
-
- assert( j<nMaxCells );
- pCell = apCell[j];
- sz = szCell[j] + leafCorrection;
- pTemp = &aOvflSpace[iOvflSpace];
- if( !pNew->leaf ){
- memcpy(&pNew->aData[8], pCell, 4);
- }else if( leafData ){
- /* If the tree is a leaf-data tree, and the siblings are leaves,
- ** then there is no divider cell in apCell[]. Instead, the divider
- ** cell consists of the integer key for the right-most cell of
- ** the sibling-page assembled above only.
- */
- CellInfo info;
- j--;
- btreeParseCellPtr(pNew, apCell[j], &info);
- pCell = pTemp;
- sz = 4 + putVarint(&pCell[4], info.nKey);
- pTemp = 0;
+ /* Verify condition (1): If cells are moving left, update iPg
+ ** only after iPg-1 has already been updated. */
+ assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
+
+ /* Verify condition (2): If cells are moving right, update iPg
+ ** only after iPg+1 has already been updated. */
+ assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
+
+ if( iPg==0 ){
+ iNew = iOld = 0;
+ nNewCell = cntNew[0];
}else{
- pCell -= 4;
- /* Obscure case for non-leaf-data trees: If the cell at pCell was
- ** previously stored on a leaf node, and its reported size was 4
- ** bytes, then it may actually be smaller than this
- ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
- ** any cell). But it is important to pass the correct size to
- ** insertCell(), so reparse the cell now.
- **
- ** Note that this can never happen in an SQLite data file, as all
- ** cells are at least 4 bytes. It only happens in b-trees used
- ** to evaluate "IN (SELECT ...)" and similar clauses.
- */
- if( szCell[j]==4 ){
- assert(leafCorrection==4);
- sz = cellSizePtr(pParent, pCell);
- }
+ iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : nCell;
+ iNew = cntNew[iPg-1] + !leafData;
+ nNewCell = cntNew[iPg] - iNew;
}
- iOvflSpace += sz;
- assert( sz<=pBt->maxLocal+23 );
- assert( iOvflSpace <= (int)pBt->pageSize );
- insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);
- if( rc!=SQLITE_OK ) goto balance_cleanup;
- assert( sqlite3PagerIswriteable(pParent->pDbPage) );
- j++;
- nxDiv++;
+ editPage(apNew[iPg], iOld, iNew, nNewCell, apCell, szCell);
+ abDone[iPg]++;
+ apNew[iPg]->nFree = usableSpace-szNew[iPg];
+ assert( apNew[iPg]->nOverflow==0 );
+ assert( apNew[iPg]->nCell==nNewCell );
}
}
- assert( j==nCell );
+
+ /* All pages have been processed exactly once */
+ assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
+
assert( nOld>0 );
assert( nNew>0 );
- if( (pageFlags & PTF_LEAF)==0 ){
- u8 *zChild = &apCopy[nOld-1]->aData[8];
- memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
- }
if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
/* The root page of the b-tree now contains no cells. The only sibling
@@ -6765,126 +7243,50 @@ static int balance_nonroot(
** sets all pointer-map entries corresponding to database image pages
** for which the pointer is stored within the content being copied.
**
- ** The second assert below verifies that the child page is defragmented
- ** (it must be, as it was just reconstructed using assemblePage()). This
- ** is important if the parent page happens to be page 1 of the database
- ** image. */
+ ** It is critical that the child page be defragmented before being
+ ** copied into the parent, because if the parent is page 1 then it will
+ ** by smaller than the child due to the database header, and so all the
+ ** free space needs to be up front.
+ */
assert( nNew==1 );
+ rc = defragmentPage(apNew[0]);
+ testcase( rc!=SQLITE_OK );
assert( apNew[0]->nFree ==
- (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
+ (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
+ || rc!=SQLITE_OK
);
copyNodeContent(apNew[0], pParent, &rc);
freePage(apNew[0], &rc);
- }else if( ISAUTOVACUUM ){
- /* Fix the pointer-map entries for all the cells that were shifted around.
- ** There are several different types of pointer-map entries that need to
- ** be dealt with by this routine. Some of these have been set already, but
- ** many have not. The following is a summary:
- **
- ** 1) The entries associated with new sibling pages that were not
- ** siblings when this function was called. These have already
- ** been set. We don't need to worry about old siblings that were
- ** moved to the free-list - the freePage() code has taken care
- ** of those.
- **
- ** 2) The pointer-map entries associated with the first overflow
- ** page in any overflow chains used by new divider cells. These
- ** have also already been taken care of by the insertCell() code.
- **
- ** 3) If the sibling pages are not leaves, then the child pages of
- ** cells stored on the sibling pages may need to be updated.
- **
- ** 4) If the sibling pages are not internal intkey nodes, then any
- ** overflow pages used by these cells may need to be updated
- ** (internal intkey nodes never contain pointers to overflow pages).
- **
- ** 5) If the sibling pages are not leaves, then the pointer-map
- ** entries for the right-child pages of each sibling may need
- ** to be updated.
- **
- ** Cases 1 and 2 are dealt with above by other code. The next
- ** block deals with cases 3 and 4 and the one after that, case 5. Since
- ** setting a pointer map entry is a relatively expensive operation, this
- ** code only sets pointer map entries for child or overflow pages that have
- ** actually moved between pages. */
- MemPage *pNew = apNew[0];
- MemPage *pOld = apCopy[0];
- int nOverflow = pOld->nOverflow;
- int iNextOld = pOld->nCell + nOverflow;
- int iOverflow = (nOverflow ? pOld->aiOvfl[0] : -1);
- j = 0; /* Current 'old' sibling page */
- k = 0; /* Current 'new' sibling page */
- for(i=0; i<nCell; i++){
- int isDivider = 0;
- while( i==iNextOld ){
- /* Cell i is the cell immediately following the last cell on old
- ** sibling page j. If the siblings are not leaf pages of an
- ** intkey b-tree, then cell i was a divider cell. */
- assert( j+1 < ArraySize(apCopy) );
- assert( j+1 < nOld );
- pOld = apCopy[++j];
- iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;
- if( pOld->nOverflow ){
- nOverflow = pOld->nOverflow;
- iOverflow = i + !leafData + pOld->aiOvfl[0];
- }
- isDivider = !leafData;
- }
-
- assert(nOverflow>0 || iOverflow<i );
- assert(nOverflow<2 || pOld->aiOvfl[0]==pOld->aiOvfl[1]-1);
- assert(nOverflow<3 || pOld->aiOvfl[1]==pOld->aiOvfl[2]-1);
- if( i==iOverflow ){
- isDivider = 1;
- if( (--nOverflow)>0 ){
- iOverflow++;
- }
- }
-
- if( i==cntNew[k] ){
- /* Cell i is the cell immediately following the last cell on new
- ** sibling page k. If the siblings are not leaf pages of an
- ** intkey b-tree, then cell i is a divider cell. */
- pNew = apNew[++k];
- if( !leafData ) continue;
- }
- assert( j<nOld );
- assert( k<nNew );
-
- /* If the cell was originally divider cell (and is not now) or
- ** an overflow cell, or if the cell was located on a different sibling
- ** page before the balancing, then the pointer map entries associated
- ** with any child or overflow pages need to be updated. */
- if( isDivider || pOld->pgno!=pNew->pgno ){
- if( !leafCorrection ){
- ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);
- }
- if( szCell[i]>pNew->minLocal ){
- ptrmapPutOvflPtr(pNew, apCell[i], &rc);
- }
- }
+ }else if( ISAUTOVACUUM && !leafCorrection ){
+ /* Fix the pointer map entries associated with the right-child of each
+ ** sibling page. All other pointer map entries have already been taken
+ ** care of. */
+ for(i=0; i<nNew; i++){
+ u32 key = get4byte(&apNew[i]->aData[8]);
+ ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
}
+ }
- if( !leafCorrection ){
- for(i=0; i<nNew; i++){
- u32 key = get4byte(&apNew[i]->aData[8]);
- ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
- }
- }
+ assert( pParent->isInit );
+ TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
+ nOld, nNew, nCell));
+
+ /* Free any old pages that were not reused as new pages.
+ */
+ for(i=nNew; i<nOld; i++){
+ freePage(apOld[i], &rc);
+ }
#if 0
+ if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
/* The ptrmapCheckPages() contains assert() statements that verify that
** all pointer map pages are set correctly. This is helpful while
** debugging. This is usually disabled because a corrupt database may
** cause an assert() statement to fail. */
ptrmapCheckPages(apNew, nNew);
ptrmapCheckPages(&pParent, 1);
-#endif
}
-
- assert( pParent->isInit );
- TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
- nOld, nNew, nCell));
+#endif
/*
** Cleanup before returning.
@@ -7776,6 +8178,13 @@ int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
** The schema layer numbers meta values differently. At the schema
** layer (and the SetCookie and ReadCookie opcodes) the number of
** free pages is not visible. So Cookie[0] is the same as Meta[1].
+**
+** This routine treats Meta[BTREE_DATA_VERSION] as a special case. Instead
+** of reading the value out of the header, it instead loads the "DataVersion"
+** from the pager. The BTREE_DATA_VERSION value is not actually stored in the
+** database file. It is a number computed by the pager. But its access
+** pattern is the same as header meta values, and so it is convenient to
+** read it from this routine.
*/
void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
BtShared *pBt = p->pBt;
@@ -7786,7 +8195,11 @@ void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
assert( pBt->pPage1 );
assert( idx>=0 && idx<=15 );
- *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
+ if( idx==BTREE_DATA_VERSION ){
+ *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;
+ }else{
+ *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
+ }
/* If auto-vacuum is disabled in this build and this is an auto-vacuum
** database, mark the database as read-only. */
@@ -7877,7 +8290,7 @@ int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
if( pCur->iPage==0 ){
/* All pages of the b-tree have been visited. Return successfully. */
*pnEntry = nEntry;
- return SQLITE_OK;
+ return moveToRoot(pCur);
}
moveToParent(pCur);
}while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
@@ -8269,8 +8682,14 @@ static int checkTreePage(
assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */
memset(hit+contentOffset, 0, usableSize-contentOffset);
memset(hit, 1, contentOffset);
+ /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
+ ** number of cells on the page. */
nCell = get2byte(&data[hdr+3]);
+ /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
+ ** immediately follows the b-tree page header. */
cellStart = hdr + 12 - 4*pPage->leaf;
+ /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
+ ** integer offsets to the cell contents. */
for(i=0; i<nCell; i++){
int pc = get2byte(&data[cellStart+i*2]);
u32 size = 65536;
@@ -8286,6 +8705,9 @@ static int checkTreePage(
for(j=pc+size-1; j>=pc; j--) hit[j]++;
}
}
+ /* EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
+ ** is the offset of the first freeblock, or zero if there are no
+ ** freeblocks on the page. */
i = get2byte(&data[hdr+1]);
while( i>0 ){
int size, j;
@@ -8293,7 +8715,13 @@ static int checkTreePage(
size = get2byte(&data[i+2]);
assert( i+size<=usableSize ); /* Enforced by btreeInitPage() */
for(j=i+size-1; j>=i; j--) hit[j]++;
+ /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
+ ** big-endian integer which is the offset in the b-tree page of the next
+ ** freeblock in the chain, or zero if the freeblock is the last on the
+ ** chain. */
j = get2byte(&data[i]);
+ /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
+ ** increasing offset. */
assert( j==0 || j>i+size ); /* Enforced by btreeInitPage() */
assert( j<=usableSize-4 ); /* Enforced by btreeInitPage() */
i = j;
@@ -8307,6 +8735,11 @@ static int checkTreePage(
break;
}
}
+ /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
+ ** is stored in the fifth field of the b-tree page header.
+ ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
+ ** number of fragmented free bytes within the cell content area.
+ */
if( cnt!=data[hdr+7] ){
checkAppendMsg(pCheck,
"Fragmentation of %d bytes reported as %d on page %d",
@@ -8709,3 +9142,8 @@ void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){
int sqlite3BtreeIsReadonly(Btree *p){
return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
}
+
+/*
+** Return the size of the header added to each page by this module.
+*/
+int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }