diff options
Diffstat (limited to 'lib/libsqlite3/src/btree.c')
-rw-r--r-- | lib/libsqlite3/src/btree.c | 1120 |
1 files changed, 779 insertions, 341 deletions
diff --git a/lib/libsqlite3/src/btree.c b/lib/libsqlite3/src/btree.c index 7ea66e0d3be..f9f76c2ebbc 100644 --- a/lib/libsqlite3/src/btree.c +++ b/lib/libsqlite3/src/btree.c @@ -1139,6 +1139,11 @@ static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){ ** end of the page and all free space is collected into one ** big FreeBlk that occurs in between the header and cell ** pointer array and the cell content area. +** +** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a +** b-tree page so that there are no freeblocks or fragment bytes, all +** unused bytes are contained in the unallocated space region, and all +** cells are packed tightly at the end of the page. */ static int defragmentPage(MemPage *pPage){ int i; /* Loop counter */ @@ -1151,6 +1156,7 @@ static int defragmentPage(MemPage *pPage){ int nCell; /* Number of cells on the page */ unsigned char *data; /* The page data */ unsigned char *temp; /* Temp area for cell content */ + unsigned char *src; /* Source of content */ int iCellFirst; /* First allowable cell index */ int iCellLast; /* Last possible cell index */ @@ -1160,15 +1166,13 @@ static int defragmentPage(MemPage *pPage){ assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); assert( pPage->nOverflow==0 ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - temp = sqlite3PagerTempSpace(pPage->pBt->pPager); - data = pPage->aData; + temp = 0; + src = data = pPage->aData; hdr = pPage->hdrOffset; cellOffset = pPage->cellOffset; nCell = pPage->nCell; assert( nCell==get2byte(&data[hdr+3]) ); usableSize = pPage->pBt->usableSize; - cbrk = get2byte(&data[hdr+5]); - memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk); cbrk = usableSize; iCellFirst = cellOffset + 2*nCell; iCellLast = usableSize - 4; @@ -1187,7 +1191,7 @@ static int defragmentPage(MemPage *pPage){ } #endif assert( pc>=iCellFirst && pc<=iCellLast ); - size = cellSizePtr(pPage, &temp[pc]); + size = cellSizePtr(pPage, &src[pc]); cbrk -= size; #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) if( cbrk<iCellFirst ){ @@ -1201,8 +1205,16 @@ static int defragmentPage(MemPage *pPage){ assert( cbrk+size<=usableSize && cbrk>=iCellFirst ); testcase( cbrk+size==usableSize ); testcase( pc+size==usableSize ); - memcpy(&data[cbrk], &temp[pc], size); put2byte(pAddr, cbrk); + if( temp==0 ){ + int x; + if( cbrk==pc ) continue; + temp = sqlite3PagerTempSpace(pPage->pBt->pPager); + x = get2byte(&data[hdr+5]); + memcpy(&temp[x], &data[x], (cbrk+size) - x); + src = temp; + } + memcpy(&data[cbrk], &src[pc], size); } assert( cbrk>=iCellFirst ); put2byte(&data[hdr+5], cbrk); @@ -1218,6 +1230,69 @@ static int defragmentPage(MemPage *pPage){ } /* +** Search the free-list on page pPg for space to store a cell nByte bytes in +** size. If one can be found, return a pointer to the space and remove it +** from the free-list. +** +** If no suitable space can be found on the free-list, return NULL. +** +** This function may detect corruption within pPg. If corruption is +** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned. +** +** If a slot of at least nByte bytes is found but cannot be used because +** there are already at least 60 fragmented bytes on the page, return NULL. +** In this case, if pbDefrag parameter is not NULL, set *pbDefrag to true. +*/ +static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc, int *pbDefrag){ + const int hdr = pPg->hdrOffset; + u8 * const aData = pPg->aData; + int iAddr; + int pc; + int usableSize = pPg->pBt->usableSize; + + for(iAddr=hdr+1; (pc = get2byte(&aData[iAddr]))>0; iAddr=pc){ + int size; /* Size of the free slot */ + /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of + ** increasing offset. */ + if( pc>usableSize-4 || pc<iAddr+4 ){ + *pRc = SQLITE_CORRUPT_BKPT; + return 0; + } + /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each + ** freeblock form a big-endian integer which is the size of the freeblock + ** in bytes, including the 4-byte header. */ + size = get2byte(&aData[pc+2]); + if( size>=nByte ){ + int x = size - nByte; + testcase( x==4 ); + testcase( x==3 ); + if( x<4 ){ + /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total + ** number of bytes in fragments may not exceed 60. */ + if( aData[hdr+7]>=60 ){ + if( pbDefrag ) *pbDefrag = 1; + return 0; + } + /* Remove the slot from the free-list. Update the number of + ** fragmented bytes within the page. */ + memcpy(&aData[iAddr], &aData[pc], 2); + aData[hdr+7] += (u8)x; + }else if( size+pc > usableSize ){ + *pRc = SQLITE_CORRUPT_BKPT; + return 0; + }else{ + /* The slot remains on the free-list. Reduce its size to account + ** for the portion used by the new allocation. */ + put2byte(&aData[pc+2], x); + } + return &aData[pc + x]; + } + } + + return 0; +} + +/* ** Allocate nByte bytes of space from within the B-Tree page passed ** as the first argument. Write into *pIdx the index into pPage->aData[] ** of the first byte of allocated space. Return either SQLITE_OK or @@ -1234,9 +1309,8 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */ u8 * const data = pPage->aData; /* Local cache of pPage->aData */ int top; /* First byte of cell content area */ + int rc = SQLITE_OK; /* Integer return code */ int gap; /* First byte of gap between cell pointers and cell content */ - int rc; /* Integer return code */ - int usableSize; /* Usable size of the page */ assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( pPage->pBt ); @@ -1244,20 +1318,18 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ assert( nByte>=0 ); /* Minimum cell size is 4 */ assert( pPage->nFree>=nByte ); assert( pPage->nOverflow==0 ); - usableSize = pPage->pBt->usableSize; - assert( nByte < usableSize-8 ); + assert( nByte < (int)(pPage->pBt->usableSize-8) ); assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf ); gap = pPage->cellOffset + 2*pPage->nCell; assert( gap<=65536 ); - top = get2byte(&data[hdr+5]); - if( gap>top ){ - if( top==0 ){ - top = 65536; - }else{ - return SQLITE_CORRUPT_BKPT; - } - } + /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size + ** and the reserved space is zero (the usual value for reserved space) + ** then the cell content offset of an empty page wants to be 65536. + ** However, that integer is too large to be stored in a 2-byte unsigned + ** integer, so a value of 0 is used in its place. */ + top = get2byteNotZero(&data[hdr+5]); + if( gap>top ) return SQLITE_CORRUPT_BKPT; /* If there is enough space between gap and top for one more cell pointer ** array entry offset, and if the freelist is not empty, then search the @@ -1267,33 +1339,14 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ testcase( gap+1==top ); testcase( gap==top ); if( gap+2<=top && (data[hdr+1] || data[hdr+2]) ){ - int pc, addr; - for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){ - int size; /* Size of the free slot */ - if( pc>usableSize-4 || pc<addr+4 ){ - return SQLITE_CORRUPT_BKPT; - } - size = get2byte(&data[pc+2]); - if( size>=nByte ){ - int x = size - nByte; - testcase( x==4 ); - testcase( x==3 ); - if( x<4 ){ - if( data[hdr+7]>=60 ) goto defragment_page; - /* Remove the slot from the free-list. Update the number of - ** fragmented bytes within the page. */ - memcpy(&data[addr], &data[pc], 2); - data[hdr+7] += (u8)x; - }else if( size+pc > usableSize ){ - return SQLITE_CORRUPT_BKPT; - }else{ - /* The slot remains on the free-list. Reduce its size to account - ** for the portion used by the new allocation. */ - put2byte(&data[pc+2], x); - } - *pIdx = pc + x; - return SQLITE_OK; - } + int bDefrag = 0; + u8 *pSpace = pageFindSlot(pPage, nByte, &rc, &bDefrag); + if( rc ) return rc; + if( bDefrag ) goto defragment_page; + if( pSpace ){ + assert( pSpace>=data && (pSpace - data)<65536 ); + *pIdx = (int)(pSpace - data); + return SQLITE_OK; } } @@ -1302,8 +1355,8 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ */ testcase( gap+2+nByte==top ); if( gap+2+nByte>top ){ -defragment_page: - testcase( pPage->nCell==0 ); + defragment_page: + assert( pPage->nCell>0 || CORRUPT_DB ); rc = defragmentPage(pPage); if( rc ) return rc; top = get2byteNotZero(&data[hdr+5]); @@ -1350,7 +1403,7 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ assert( pPage->pBt!=0 ); assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( iStart>=pPage->hdrOffset+6+pPage->childPtrSize ); - assert( iEnd <= pPage->pBt->usableSize ); + assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); assert( iSize>=4 ); /* Minimum cell size is 4 */ assert( iStart<=iLast ); @@ -1445,18 +1498,32 @@ static int decodeFlags(MemPage *pPage, int flagByte){ pPage->childPtrSize = 4-4*pPage->leaf; pBt = pPage->pBt; if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ + /* EVIDENCE-OF: R-03640-13415 A value of 5 means the page is an interior + ** table b-tree page. */ + assert( (PTF_LEAFDATA|PTF_INTKEY)==5 ); + /* EVIDENCE-OF: R-20501-61796 A value of 13 means the page is a leaf + ** table b-tree page. */ + assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 ); pPage->intKey = 1; pPage->intKeyLeaf = pPage->leaf; pPage->noPayload = !pPage->leaf; pPage->maxLocal = pBt->maxLeaf; pPage->minLocal = pBt->minLeaf; }else if( flagByte==PTF_ZERODATA ){ + /* EVIDENCE-OF: R-27225-53936 A value of 2 means the page is an interior + ** index b-tree page. */ + assert( (PTF_ZERODATA)==2 ); + /* EVIDENCE-OF: R-16571-11615 A value of 10 means the page is a leaf + ** index b-tree page. */ + assert( (PTF_ZERODATA|PTF_LEAF)==10 ); pPage->intKey = 0; pPage->intKeyLeaf = 0; pPage->noPayload = 0; pPage->maxLocal = pBt->maxLocal; pPage->minLocal = pBt->minLocal; }else{ + /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is + ** an error. */ return SQLITE_CORRUPT_BKPT; } pPage->max1bytePayload = pBt->max1bytePayload; @@ -1496,21 +1563,33 @@ static int btreeInitPage(MemPage *pPage){ hdr = pPage->hdrOffset; data = pPage->aData; + /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating + ** the b-tree page type. */ if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT; assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); pPage->maskPage = (u16)(pBt->pageSize - 1); pPage->nOverflow = 0; usableSize = pBt->usableSize; - pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf; + pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize; pPage->aDataEnd = &data[usableSize]; pPage->aCellIdx = &data[cellOffset]; + /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates + ** the start of the cell content area. A zero value for this integer is + ** interpreted as 65536. */ top = get2byteNotZero(&data[hdr+5]); + /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the + ** number of cells on the page. */ pPage->nCell = get2byte(&data[hdr+3]); if( pPage->nCell>MX_CELL(pBt) ){ /* To many cells for a single page. The page must be corrupt */ return SQLITE_CORRUPT_BKPT; } testcase( pPage->nCell==MX_CELL(pBt) ); + /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only + ** possible for a root page of a table that contains no rows) then the + ** offset to the cell content area will equal the page size minus the + ** bytes of reserved space. */ + assert( pPage->nCell>0 || top==usableSize || CORRUPT_DB ); /* A malformed database page might cause us to read past the end ** of page when parsing a cell. @@ -1544,13 +1623,20 @@ static int btreeInitPage(MemPage *pPage){ } #endif - /* Compute the total free space on the page */ + /* Compute the total free space on the page + ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the + ** start of the first freeblock on the page, or is zero if there are no + ** freeblocks. */ pc = get2byte(&data[hdr+1]); - nFree = data[hdr+7] + top; + nFree = data[hdr+7] + top; /* Init nFree to non-freeblock free space */ while( pc>0 ){ u16 next, size; if( pc<iCellFirst || pc>iCellLast ){ - /* Start of free block is off the page */ + /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will + ** always be at least one cell before the first freeblock. + ** + ** Or, the freeblock is off the end of the page + */ return SQLITE_CORRUPT_BKPT; } next = get2byte(&data[pc]); @@ -1956,6 +2042,9 @@ int sqlite3BtreeOpen( #ifdef SQLITE_SECURE_DELETE pBt->btsFlags |= BTS_SECURE_DELETE; #endif + /* EVIDENCE-OF: R-51873-39618 The page size for a database file is + ** determined by the 2-byte integer located at an offset of 16 bytes from + ** the beginning of the database file. */ pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16); if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ @@ -1974,6 +2063,9 @@ int sqlite3BtreeOpen( #endif nReserve = 0; }else{ + /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is + ** determined by the one-byte unsigned integer found at an offset of 20 + ** into the database file header. */ nReserve = zDbHeader[20]; pBt->btsFlags |= BTS_PAGESIZE_FIXED; #ifndef SQLITE_OMIT_AUTOVACUUM @@ -2483,6 +2575,9 @@ static int lockBtree(BtShared *pBt){ u32 usableSize; u8 *page1 = pPage1->aData; rc = SQLITE_NOTADB; + /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins + ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d + ** 61 74 20 33 00. */ if( memcmp(page1, zMagicHeader, 16)!=0 ){ goto page1_init_failed; } @@ -2523,15 +2618,21 @@ static int lockBtree(BtShared *pBt){ } #endif - /* The maximum embedded fraction must be exactly 25%. And the minimum - ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data. + /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload + ** fractions and the leaf payload fraction values must be 64, 32, and 32. + ** ** The original design allowed these amounts to vary, but as of ** version 3.6.0, we require them to be fixed. */ if( memcmp(&page1[21], "\100\040\040",3)!=0 ){ goto page1_init_failed; } + /* EVIDENCE-OF: R-51873-39618 The page size for a database file is + ** determined by the 2-byte integer located at an offset of 16 bytes from + ** the beginning of the database file. */ pageSize = (page1[16]<<8) | (page1[17]<<16); + /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two + ** between 512 and 65536 inclusive. */ if( ((pageSize-1)&pageSize)!=0 || pageSize>SQLITE_MAX_PAGE_SIZE || pageSize<=256 @@ -2539,6 +2640,13 @@ static int lockBtree(BtShared *pBt){ goto page1_init_failed; } assert( (pageSize & 7)==0 ); + /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte + ** integer at offset 20 is the number of bytes of space at the end of + ** each page to reserve for extensions. + ** + ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is + ** determined by the one-byte unsigned integer found at an offset of 20 + ** into the database file header. */ usableSize = pageSize - page1[20]; if( (u32)pageSize!=pBt->pageSize ){ /* After reading the first page of the database assuming a page size @@ -2559,6 +2667,9 @@ static int lockBtree(BtShared *pBt){ rc = SQLITE_CORRUPT_BKPT; goto page1_init_failed; } + /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to + ** be less than 480. In other words, if the page size is 512, then the + ** reserved space size cannot exceed 32. */ if( usableSize<480 ){ goto page1_init_failed; } @@ -3439,6 +3550,7 @@ int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){ sqlite3BtreeLeave(p); return rc; } + p->iDataVersion--; /* Compensate for pPager->iDataVersion++; */ pBt->inTransaction = TRANS_READ; btreeClearHasContent(pBt); } @@ -3802,7 +3914,7 @@ int sqlite3BtreeCloseCursor(BtCursor *pCur){ releasePage(pCur->apPage[i]); } unlockBtreeIfUnused(pBt); - sqlite3DbFree(pBtree->db, pCur->aOverflow); + sqlite3_free(pCur->aOverflow); /* sqlite3_free(pCur); */ sqlite3BtreeLeave(pBtree); } @@ -4097,6 +4209,7 @@ static int accessPayload( offset -= pCur->info.nLocal; } + if( rc==SQLITE_OK && amt>0 ){ const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ Pgno nextPage; @@ -4114,8 +4227,8 @@ static int accessPayload( if( eOp!=2 && (pCur->curFlags & BTCF_ValidOvfl)==0 ){ int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; if( nOvfl>pCur->nOvflAlloc ){ - Pgno *aNew = (Pgno*)sqlite3DbRealloc( - pCur->pBtree->db, pCur->aOverflow, nOvfl*2*sizeof(Pgno) + Pgno *aNew = (Pgno*)sqlite3Realloc( + pCur->aOverflow, nOvfl*2*sizeof(Pgno) ); if( aNew==0 ){ rc = SQLITE_NOMEM; @@ -4162,6 +4275,7 @@ static int accessPayload( */ assert( eOp!=2 ); assert( pCur->curFlags & BTCF_ValidOvfl ); + assert( pCur->pBtree->db==pBt->db ); if( pCur->aOverflow[iIdx+1] ){ nextPage = pCur->aOverflow[iIdx+1]; }else{ @@ -5136,6 +5250,8 @@ static int allocateBtreePage( assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) ); pPage1 = pBt->pPage1; mxPage = btreePagecount(pBt); + /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36 + ** stores stores the total number of pages on the freelist. */ n = get4byte(&pPage1->aData[36]); testcase( n==mxPage-1 ); if( n>=mxPage ){ @@ -5182,8 +5298,14 @@ static int allocateBtreePage( do { pPrevTrunk = pTrunk; if( pPrevTrunk ){ + /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page + ** is the page number of the next freelist trunk page in the list or + ** zero if this is the last freelist trunk page. */ iTrunk = get4byte(&pPrevTrunk->aData[0]); }else{ + /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32 + ** stores the page number of the first page of the freelist, or zero if + ** the freelist is empty. */ iTrunk = get4byte(&pPage1->aData[32]); } testcase( iTrunk==mxPage ); @@ -5198,8 +5320,9 @@ static int allocateBtreePage( } assert( pTrunk!=0 ); assert( pTrunk->aData!=0 ); - - k = get4byte(&pTrunk->aData[4]); /* # of leaves on this trunk page */ + /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page + ** is the number of leaf page pointers to follow. */ + k = get4byte(&pTrunk->aData[4]); if( k==0 && !searchList ){ /* The trunk has no leaves and the list is not being searched. ** So extract the trunk page itself and use it as the newly @@ -5517,6 +5640,11 @@ static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ ** for now. At some point in the future (once everyone has upgraded ** to 3.6.0 or later) we should consider fixing the conditional above ** to read "usableSize/4-2" instead of "usableSize/4-8". + ** + ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still + ** avoid using the last six entries in the freelist trunk page array in + ** order that database files created by newer versions of SQLite can be + ** read by older versions of SQLite. */ rc = sqlite3PagerWrite(pTrunk->pDbPage); if( rc==SQLITE_OK ){ @@ -5868,9 +5996,17 @@ static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ return; } pPage->nCell--; - memmove(ptr, ptr+2, 2*(pPage->nCell - idx)); - put2byte(&data[hdr+3], pPage->nCell); - pPage->nFree += 2; + if( pPage->nCell==0 ){ + memset(&data[hdr+1], 0, 4); + data[hdr+7] = 0; + put2byte(&data[hdr+5], pPage->pBt->usableSize); + pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset + - pPage->childPtrSize - 8; + }else{ + memmove(ptr, ptr+2, 2*(pPage->nCell - idx)); + put2byte(&data[hdr+3], pPage->nCell); + pPage->nFree += 2; + } } /* @@ -5965,45 +6101,271 @@ static void insertCell( } /* -** Add a list of cells to a page. The page should be initially empty. -** The cells are guaranteed to fit on the page. +** Array apCell[] contains pointers to nCell b-tree page cells. The +** szCell[] array contains the size in bytes of each cell. This function +** replaces the current contents of page pPg with the contents of the cell +** array. +** +** Some of the cells in apCell[] may currently be stored in pPg. This +** function works around problems caused by this by making a copy of any +** such cells before overwriting the page data. +** +** The MemPage.nFree field is invalidated by this function. It is the +** responsibility of the caller to set it correctly. */ -static void assemblePage( - MemPage *pPage, /* The page to be assembled */ - int nCell, /* The number of cells to add to this page */ - u8 **apCell, /* Pointers to cell bodies */ - u16 *aSize /* Sizes of the cells */ +static void rebuildPage( + MemPage *pPg, /* Edit this page */ + int nCell, /* Final number of cells on page */ + u8 **apCell, /* Array of cells */ + u16 *szCell /* Array of cell sizes */ ){ - int i; /* Loop counter */ - u8 *pCellptr; /* Address of next cell pointer */ - int cellbody; /* Address of next cell body */ - u8 * const data = pPage->aData; /* Pointer to data for pPage */ - const int hdr = pPage->hdrOffset; /* Offset of header on pPage */ - const int nUsable = pPage->pBt->usableSize; /* Usable size of page */ + const int hdr = pPg->hdrOffset; /* Offset of header on pPg */ + u8 * const aData = pPg->aData; /* Pointer to data for pPg */ + const int usableSize = pPg->pBt->usableSize; + u8 * const pEnd = &aData[usableSize]; + int i; + u8 *pCellptr = pPg->aCellIdx; + u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); + u8 *pData; - assert( pPage->nOverflow==0 ); - assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt) - && (int)MX_CELL(pPage->pBt)<=10921); - assert( sqlite3PagerIswriteable(pPage->pDbPage) ); + i = get2byte(&aData[hdr+5]); + memcpy(&pTmp[i], &aData[i], usableSize - i); + + pData = pEnd; + for(i=0; i<nCell; i++){ + u8 *pCell = apCell[i]; + if( pCell>aData && pCell<pEnd ){ + pCell = &pTmp[pCell - aData]; + } + pData -= szCell[i]; + memcpy(pData, pCell, szCell[i]); + put2byte(pCellptr, (pData - aData)); + pCellptr += 2; + assert( szCell[i]==cellSizePtr(pPg, pCell) ); + } + + /* The pPg->nFree field is now set incorrectly. The caller will fix it. */ + pPg->nCell = nCell; + pPg->nOverflow = 0; + + put2byte(&aData[hdr+1], 0); + put2byte(&aData[hdr+3], pPg->nCell); + put2byte(&aData[hdr+5], pData - aData); + aData[hdr+7] = 0x00; +} + +/* +** Array apCell[] contains nCell pointers to b-tree cells. Array szCell +** contains the size in bytes of each such cell. This function attempts to +** add the cells stored in the array to page pPg. If it cannot (because +** the page needs to be defragmented before the cells will fit), non-zero +** is returned. Otherwise, if the cells are added successfully, zero is +** returned. +** +** Argument pCellptr points to the first entry in the cell-pointer array +** (part of page pPg) to populate. After cell apCell[0] is written to the +** page body, a 16-bit offset is written to pCellptr. And so on, for each +** cell in the array. It is the responsibility of the caller to ensure +** that it is safe to overwrite this part of the cell-pointer array. +** +** When this function is called, *ppData points to the start of the +** content area on page pPg. If the size of the content area is extended, +** *ppData is updated to point to the new start of the content area +** before returning. +** +** Finally, argument pBegin points to the byte immediately following the +** end of the space required by this page for the cell-pointer area (for +** all cells - not just those inserted by the current call). If the content +** area must be extended to before this point in order to accomodate all +** cells in apCell[], then the cells do not fit and non-zero is returned. +*/ +static int pageInsertArray( + MemPage *pPg, /* Page to add cells to */ + u8 *pBegin, /* End of cell-pointer array */ + u8 **ppData, /* IN/OUT: Page content -area pointer */ + u8 *pCellptr, /* Pointer to cell-pointer area */ + int nCell, /* Number of cells to add to pPg */ + u8 **apCell, /* Array of cells */ + u16 *szCell /* Array of cell sizes */ +){ + int i; + u8 *aData = pPg->aData; + u8 *pData = *ppData; + const int bFreelist = aData[1] || aData[2]; + assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */ + for(i=0; i<nCell; i++){ + int sz = szCell[i]; + int rc; + u8 *pSlot; + if( bFreelist==0 || (pSlot = pageFindSlot(pPg, sz, &rc, 0))==0 ){ + pData -= sz; + if( pData<pBegin ) return 1; + pSlot = pData; + } + memcpy(pSlot, apCell[i], sz); + put2byte(pCellptr, (pSlot - aData)); + pCellptr += 2; + } + *ppData = pData; + return 0; +} + +/* +** Array apCell[] contains nCell pointers to b-tree cells. Array szCell +** contains the size in bytes of each such cell. This function adds the +** space associated with each cell in the array that is currently stored +** within the body of pPg to the pPg free-list. The cell-pointers and other +** fields of the page are not updated. +** +** This function returns the total number of cells added to the free-list. +*/ +static int pageFreeArray( + MemPage *pPg, /* Page to edit */ + int nCell, /* Cells to delete */ + u8 **apCell, /* Array of cells */ + u16 *szCell /* Array of cell sizes */ +){ + u8 * const aData = pPg->aData; + u8 * const pEnd = &aData[pPg->pBt->usableSize]; + u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize]; + int nRet = 0; + int i; + u8 *pFree = 0; + int szFree = 0; + + for(i=0; i<nCell; i++){ + u8 *pCell = apCell[i]; + if( pCell>=pStart && pCell<pEnd ){ + int sz = szCell[i]; + if( pFree!=(pCell + sz) ){ + if( pFree ){ + assert( pFree>aData && (pFree - aData)<65536 ); + freeSpace(pPg, (u16)(pFree - aData), szFree); + } + pFree = pCell; + szFree = sz; + if( pFree+sz>pEnd ) return 0; + }else{ + pFree = pCell; + szFree += sz; + } + nRet++; + } + } + if( pFree ){ + assert( pFree>aData && (pFree - aData)<65536 ); + freeSpace(pPg, (u16)(pFree - aData), szFree); + } + return nRet; +} + +/* +** apCell[] and szCell[] contains pointers to and sizes of all cells in the +** pages being balanced. The current page, pPg, has pPg->nCell cells starting +** with apCell[iOld]. After balancing, this page should hold nNew cells +** starting at apCell[iNew]. +** +** This routine makes the necessary adjustments to pPg so that it contains +** the correct cells after being balanced. +** +** The pPg->nFree field is invalid when this function returns. It is the +** responsibility of the caller to set it correctly. +*/ +static void editPage( + MemPage *pPg, /* Edit this page */ + int iOld, /* Index of first cell currently on page */ + int iNew, /* Index of new first cell on page */ + int nNew, /* Final number of cells on page */ + u8 **apCell, /* Array of cells */ + u16 *szCell /* Array of cell sizes */ +){ + u8 * const aData = pPg->aData; + const int hdr = pPg->hdrOffset; + u8 *pBegin = &pPg->aCellIdx[nNew * 2]; + int nCell = pPg->nCell; /* Cells stored on pPg */ + u8 *pData; + u8 *pCellptr; + int i; + int iOldEnd = iOld + pPg->nCell + pPg->nOverflow; + int iNewEnd = iNew + nNew; + +#ifdef SQLITE_DEBUG + u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); + memcpy(pTmp, aData, pPg->pBt->usableSize); +#endif + + /* Remove cells from the start and end of the page */ + if( iOld<iNew ){ + int nShift = pageFreeArray( + pPg, iNew-iOld, &apCell[iOld], &szCell[iOld] + ); + memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2); + nCell -= nShift; + } + if( iNewEnd < iOldEnd ){ + nCell -= pageFreeArray( + pPg, iOldEnd-iNewEnd, &apCell[iNewEnd], &szCell[iNewEnd] + ); + } + + pData = &aData[get2byteNotZero(&aData[hdr+5])]; + if( pData<pBegin ) goto editpage_fail; + + /* Add cells to the start of the page */ + if( iNew<iOld ){ + int nAdd = MIN(nNew,iOld-iNew); + assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB ); + pCellptr = pPg->aCellIdx; + memmove(&pCellptr[nAdd*2], pCellptr, nCell*2); + if( pageInsertArray( + pPg, pBegin, &pData, pCellptr, + nAdd, &apCell[iNew], &szCell[iNew] + ) ) goto editpage_fail; + nCell += nAdd; + } + + /* Add any overflow cells */ + for(i=0; i<pPg->nOverflow; i++){ + int iCell = (iOld + pPg->aiOvfl[i]) - iNew; + if( iCell>=0 && iCell<nNew ){ + pCellptr = &pPg->aCellIdx[iCell * 2]; + memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2); + nCell++; + if( pageInsertArray( + pPg, pBegin, &pData, pCellptr, + 1, &apCell[iCell + iNew], &szCell[iCell + iNew] + ) ) goto editpage_fail; + } + } + + /* Append cells to the end of the page */ + pCellptr = &pPg->aCellIdx[nCell*2]; + if( pageInsertArray( + pPg, pBegin, &pData, pCellptr, + nNew-nCell, &apCell[iNew+nCell], &szCell[iNew+nCell] + ) ) goto editpage_fail; - /* Check that the page has just been zeroed by zeroPage() */ - assert( pPage->nCell==0 ); - assert( get2byteNotZero(&data[hdr+5])==nUsable ); + pPg->nCell = nNew; + pPg->nOverflow = 0; - pCellptr = &pPage->aCellIdx[nCell*2]; - cellbody = nUsable; - for(i=nCell-1; i>=0; i--){ - u16 sz = aSize[i]; - pCellptr -= 2; - cellbody -= sz; - put2byte(pCellptr, cellbody); - memcpy(&data[cellbody], apCell[i], sz); + put2byte(&aData[hdr+3], pPg->nCell); + put2byte(&aData[hdr+5], pData - aData); + +#ifdef SQLITE_DEBUG + for(i=0; i<nNew && !CORRUPT_DB; i++){ + u8 *pCell = apCell[i+iNew]; + int iOff = get2byte(&pPg->aCellIdx[i*2]); + if( pCell>=aData && pCell<&aData[pPg->pBt->usableSize] ){ + pCell = &pTmp[pCell - aData]; + } + assert( 0==memcmp(pCell, &aData[iOff], szCell[i+iNew]) ); } - put2byte(&data[hdr+3], nCell); - put2byte(&data[hdr+5], cellbody); - pPage->nFree -= (nCell*2 + nUsable - cellbody); - pPage->nCell = (u16)nCell; +#endif + + return; + editpage_fail: + /* Unable to edit this page. Rebuild it from scratch instead. */ + rebuildPage(pPg, nNew, &apCell[iNew], &szCell[iNew]); } /* @@ -6057,7 +6419,7 @@ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ assert( pPage->nOverflow==1 ); /* This error condition is now caught prior to reaching this function */ - if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT; + if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT; /* Allocate a new page. This page will become the right-sibling of ** pPage. Make the parent page writable, so that the new divider cell @@ -6075,7 +6437,8 @@ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ assert( sqlite3PagerIswriteable(pNew->pDbPage) ); assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); - assemblePage(pNew, 1, &pCell, &szCell); + rebuildPage(pNew, 1, &pCell, &szCell); + pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell; /* If this is an auto-vacuum database, update the pointer map ** with entries for the new page, and any pointer from the @@ -6294,17 +6657,22 @@ static int balance_nonroot( int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ int szScratch; /* Size of scratch memory requested */ MemPage *apOld[NB]; /* pPage and up to two siblings */ - MemPage *apCopy[NB]; /* Private copies of apOld[] pages */ MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ u8 *pRight; /* Location in parent of right-sibling pointer */ u8 *apDiv[NB-1]; /* Divider cells in pParent */ int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ - int szNew[NB+2]; /* Combined size of cells place on i-th page */ + int cntOld[NB+2]; /* Old index in aCell[] after i-th page */ + int szNew[NB+2]; /* Combined size of cells placed on i-th page */ u8 **apCell = 0; /* All cells begin balanced */ u16 *szCell; /* Local size of all cells in apCell[] */ u8 *aSpace1; /* Space for copies of dividers cells */ Pgno pgno; /* Temp var to store a page number in */ + u8 abDone[NB+2]; /* True after i'th new page is populated */ + Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */ + Pgno aPgOrder[NB+2]; /* Copy of aPgno[] used for sorting pages */ + u16 aPgFlags[NB+2]; /* flags field of new pages before shuffling */ + memset(abDone, 0, sizeof(abDone)); pBt = pParent->pBt; assert( sqlite3_mutex_held(pBt->mutex) ); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); @@ -6413,12 +6781,14 @@ static int balance_nonroot( /* ** Allocate space for memory structures */ - k = pBt->pageSize + ROUND8(sizeof(MemPage)); szScratch = nMaxCells*sizeof(u8*) /* apCell */ + nMaxCells*sizeof(u16) /* szCell */ - + pBt->pageSize /* aSpace1 */ - + k*nOld; /* Page copies (apCopy) */ + + pBt->pageSize; /* aSpace1 */ + + /* EVIDENCE-OF: R-28375-38319 SQLite will never request a scratch buffer + ** that is more than 6 times the database page size. */ + assert( szScratch<=6*(int)pBt->pageSize ); apCell = sqlite3ScratchMalloc( szScratch ); if( apCell==0 ){ rc = SQLITE_NOMEM; @@ -6431,8 +6801,8 @@ static int balance_nonroot( /* ** Load pointers to all cells on sibling pages and the divider cells ** into the local apCell[] array. Make copies of the divider cells - ** into space obtained from aSpace1[] and remove the divider cells - ** from pParent. + ** into space obtained from aSpace1[]. The divider cells have already + ** been removed from pParent. ** ** If the siblings are on leaf pages, then the child pointers of the ** divider cells are stripped from the cells before they are copied @@ -6448,15 +6818,7 @@ static int balance_nonroot( leafData = apOld[0]->intKeyLeaf; for(i=0; i<nOld; i++){ int limit; - - /* Before doing anything else, take a copy of the i'th original sibling - ** The rest of this function will use data from the copies rather - ** that the original pages since the original pages will be in the - ** process of being overwritten. */ - MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i]; - memcpy(pOld, apOld[i], sizeof(MemPage)); - pOld->aData = (void*)&pOld[1]; - memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize); + MemPage *pOld = apOld[i]; limit = pOld->nCell+pOld->nOverflow; if( pOld->nOverflow>0 ){ @@ -6477,6 +6839,7 @@ static int balance_nonroot( nCell++; } } + cntOld[i] = nCell; if( i<nOld-1 && !leafData){ u16 sz = (u16)szNew[i]; u8 *pTemp; @@ -6499,7 +6862,11 @@ static int balance_nonroot( }else{ assert( leafCorrection==4 ); if( szCell[nCell]<4 ){ - /* Do not allow any cells smaller than 4 bytes. */ + /* Do not allow any cells smaller than 4 bytes. If a smaller cell + ** does exist, pad it with 0x00 bytes. */ + assert( szCell[nCell]==3 ); + assert( apCell[nCell]==&aSpace1[iSpace1-3] ); + aSpace1[iSpace1++] = 0x00; szCell[nCell] = 4; } } @@ -6528,7 +6895,7 @@ static int balance_nonroot( assert( i<nMaxCells ); subtotal += szCell[i] + 2; if( subtotal > usableSpace ){ - szNew[k] = subtotal - szCell[i]; + szNew[k] = subtotal - szCell[i] - 2; cntNew[k] = i; if( leafData ){ i--; } subtotal = 0; @@ -6542,9 +6909,10 @@ static int balance_nonroot( /* ** The packing computed by the previous block is biased toward the siblings - ** on the left side. The left siblings are always nearly full, while the - ** right-most sibling might be nearly empty. This block of code attempts - ** to adjust the packing of siblings to get a better balance. + ** on the left side (siblings with smaller keys). The left siblings are + ** always nearly full, while the right-most sibling might be nearly empty. + ** The next block of code attempts to adjust the packing of siblings to + ** get a better balance. ** ** This adjustment is more than an optimization. The packing above might ** be so out of balance as to be illegal. For example, the right-most @@ -6573,22 +6941,18 @@ static int balance_nonroot( szNew[i-1] = szLeft; } - /* Either we found one or more cells (cntnew[0])>0) or pPage is - ** a virtual root page. A virtual root page is when the real root - ** page is page 1 and we are the only child of that page. - ** - ** UPDATE: The assert() below is not necessarily true if the database - ** file is corrupt. The corruption will be detected and reported later - ** in this procedure so there is no need to act upon it now. + /* Sanity check: For a non-corrupt database file one of the follwing + ** must be true: + ** (1) We found one or more cells (cntNew[0])>0), or + ** (2) pPage is a virtual root page. A virtual root page is when + ** the real root page is page 1 and we are the only child of + ** that page. */ -#if 0 - assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) ); -#endif - - TRACE(("BALANCE: old: %d %d %d ", - apOld[0]->pgno, - nOld>=2 ? apOld[1]->pgno : 0, - nOld>=3 ? apOld[2]->pgno : 0 + assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB); + TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n", + apOld[0]->pgno, apOld[0]->nCell, + nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0, + nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0 )); /* @@ -6611,8 +6975,10 @@ static int balance_nonroot( assert( i>0 ); rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0); if( rc ) goto balance_cleanup; + zeroPage(pNew, pageFlags); apNew[i] = pNew; nNew++; + cntOld[i] = nCell; /* Set the pointer-map entry for the new sibling page. */ if( ISAUTOVACUUM ){ @@ -6624,135 +6990,247 @@ static int balance_nonroot( } } - /* Free any old pages that were not reused as new pages. - */ - while( i<nOld ){ - freePage(apOld[i], &rc); - if( rc ) goto balance_cleanup; - releasePage(apOld[i]); - apOld[i] = 0; - i++; - } - /* - ** Put the new pages in ascending order. This helps to - ** keep entries in the disk file in order so that a scan - ** of the table is a linear scan through the file. That - ** in turn helps the operating system to deliver pages - ** from the disk more rapidly. + ** Reassign page numbers so that the new pages are in ascending order. + ** This helps to keep entries in the disk file in order so that a scan + ** of the table is closer to a linear scan through the file. That in turn + ** helps the operating system to deliver pages from the disk more rapidly. ** - ** An O(n^2) insertion sort algorithm is used, but since - ** n is never more than NB (a small constant), that should - ** not be a problem. + ** An O(n^2) insertion sort algorithm is used, but since n is never more + ** than (NB+2) (a small constant), that should not be a problem. ** - ** When NB==3, this one optimization makes the database - ** about 25% faster for large insertions and deletions. + ** When NB==3, this one optimization makes the database about 25% faster + ** for large insertions and deletions. */ - for(i=0; i<k-1; i++){ - int minV = apNew[i]->pgno; - int minI = i; - for(j=i+1; j<k; j++){ - if( apNew[j]->pgno<(unsigned)minV ){ - minI = j; - minV = apNew[j]->pgno; + for(i=0; i<nNew; i++){ + aPgOrder[i] = aPgno[i] = apNew[i]->pgno; + aPgFlags[i] = apNew[i]->pDbPage->flags; + for(j=0; j<i; j++){ + if( aPgno[j]==aPgno[i] ){ + /* This branch is taken if the set of sibling pages somehow contains + ** duplicate entries. This can happen if the database is corrupt. + ** It would be simpler to detect this as part of the loop below, but + ** we do the detection here in order to avoid populating the pager + ** cache with two separate objects associated with the same + ** page number. */ + assert( CORRUPT_DB ); + rc = SQLITE_CORRUPT_BKPT; + goto balance_cleanup; } } - if( minI>i ){ - MemPage *pT; - pT = apNew[i]; - apNew[i] = apNew[minI]; - apNew[minI] = pT; + } + for(i=0; i<nNew; i++){ + int iBest = 0; /* aPgno[] index of page number to use */ + for(j=1; j<nNew; j++){ + if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j; + } + pgno = aPgOrder[iBest]; + aPgOrder[iBest] = 0xffffffff; + if( iBest!=i ){ + if( iBest>i ){ + sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0); + } + sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]); + apNew[i]->pgno = pgno; } } - TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n", - apNew[0]->pgno, szNew[0], + + TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) " + "%d(%d nc=%d) %d(%d nc=%d)\n", + apNew[0]->pgno, szNew[0], cntNew[0], nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, + nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0, nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, + nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0, nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, - nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0)); + nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0, + nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0, + nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0 + )); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); put4byte(pRight, apNew[nNew-1]->pgno); - /* - ** Evenly distribute the data in apCell[] across the new pages. - ** Insert divider cells into pParent as necessary. + /* If the sibling pages are not leaves, ensure that the right-child pointer + ** of the right-most new sibling page is set to the value that was + ** originally in the same field of the right-most old sibling page. */ + if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){ + MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1]; + memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4); + } + + /* Make any required updates to pointer map entries associated with + ** cells stored on sibling pages following the balance operation. Pointer + ** map entries associated with divider cells are set by the insertCell() + ** routine. The associated pointer map entries are: + ** + ** a) if the cell contains a reference to an overflow chain, the + ** entry associated with the first page in the overflow chain, and + ** + ** b) if the sibling pages are not leaves, the child page associated + ** with the cell. + ** + ** If the sibling pages are not leaves, then the pointer map entry + ** associated with the right-child of each sibling may also need to be + ** updated. This happens below, after the sibling pages have been + ** populated, not here. */ - j = 0; - for(i=0; i<nNew; i++){ - /* Assemble the new sibling page. */ + if( ISAUTOVACUUM ){ + MemPage *pNew = apNew[0]; + u8 *aOld = pNew->aData; + int cntOldNext = pNew->nCell + pNew->nOverflow; + int usableSize = pBt->usableSize; + int iNew = 0; + int iOld = 0; + + for(i=0; i<nCell; i++){ + u8 *pCell = apCell[i]; + if( i==cntOldNext ){ + MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld]; + cntOldNext += pOld->nCell + pOld->nOverflow + !leafData; + aOld = pOld->aData; + } + if( i==cntNew[iNew] ){ + pNew = apNew[++iNew]; + if( !leafData ) continue; + } + + /* Cell pCell is destined for new sibling page pNew. Originally, it + ** was either part of sibling page iOld (possibly an overflow cell), + ** or else the divider cell to the left of sibling page iOld. So, + ** if sibling page iOld had the same page number as pNew, and if + ** pCell really was a part of sibling page iOld (not a divider or + ** overflow cell), we can skip updating the pointer map entries. */ + if( iOld>=nNew + || pNew->pgno!=aPgno[iOld] + || pCell<aOld + || pCell>=&aOld[usableSize] + ){ + if( !leafCorrection ){ + ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc); + } + if( szCell[i]>pNew->minLocal ){ + ptrmapPutOvflPtr(pNew, pCell, &rc); + } + } + } + } + + /* Insert new divider cells into pParent. */ + for(i=0; i<nNew-1; i++){ + u8 *pCell; + u8 *pTemp; + int sz; MemPage *pNew = apNew[i]; + j = cntNew[i]; + assert( j<nMaxCells ); - zeroPage(pNew, pageFlags); - assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]); - assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) ); - assert( pNew->nOverflow==0 ); + pCell = apCell[j]; + sz = szCell[j] + leafCorrection; + pTemp = &aOvflSpace[iOvflSpace]; + if( !pNew->leaf ){ + memcpy(&pNew->aData[8], pCell, 4); + }else if( leafData ){ + /* If the tree is a leaf-data tree, and the siblings are leaves, + ** then there is no divider cell in apCell[]. Instead, the divider + ** cell consists of the integer key for the right-most cell of + ** the sibling-page assembled above only. + */ + CellInfo info; + j--; + btreeParseCellPtr(pNew, apCell[j], &info); + pCell = pTemp; + sz = 4 + putVarint(&pCell[4], info.nKey); + pTemp = 0; + }else{ + pCell -= 4; + /* Obscure case for non-leaf-data trees: If the cell at pCell was + ** previously stored on a leaf node, and its reported size was 4 + ** bytes, then it may actually be smaller than this + ** (see btreeParseCellPtr(), 4 bytes is the minimum size of + ** any cell). But it is important to pass the correct size to + ** insertCell(), so reparse the cell now. + ** + ** Note that this can never happen in an SQLite data file, as all + ** cells are at least 4 bytes. It only happens in b-trees used + ** to evaluate "IN (SELECT ...)" and similar clauses. + */ + if( szCell[j]==4 ){ + assert(leafCorrection==4); + sz = cellSizePtr(pParent, pCell); + } + } + iOvflSpace += sz; + assert( sz<=pBt->maxLocal+23 ); + assert( iOvflSpace <= (int)pBt->pageSize ); + insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc); + if( rc!=SQLITE_OK ) goto balance_cleanup; + assert( sqlite3PagerIswriteable(pParent->pDbPage) ); + } - j = cntNew[i]; + /* Now update the actual sibling pages. The order in which they are updated + ** is important, as this code needs to avoid disrupting any page from which + ** cells may still to be read. In practice, this means: + ** + ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1]) + ** then it is not safe to update page apNew[iPg] until after + ** the left-hand sibling apNew[iPg-1] has been updated. + ** + ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1]) + ** then it is not safe to update page apNew[iPg] until after + ** the right-hand sibling apNew[iPg+1] has been updated. + ** + ** If neither of the above apply, the page is safe to update. + ** + ** The iPg value in the following loop starts at nNew-1 goes down + ** to 0, then back up to nNew-1 again, thus making two passes over + ** the pages. On the initial downward pass, only condition (1) above + ** needs to be tested because (2) will always be true from the previous + ** step. On the upward pass, both conditions are always true, so the + ** upwards pass simply processes pages that were missed on the downward + ** pass. + */ + for(i=1-nNew; i<nNew; i++){ + int iPg = i<0 ? -i : i; + assert( iPg>=0 && iPg<nNew ); + if( abDone[iPg] ) continue; /* Skip pages already processed */ + if( i>=0 /* On the upwards pass, or... */ + || cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */ + ){ + int iNew; + int iOld; + int nNewCell; - /* If the sibling page assembled above was not the right-most sibling, - ** insert a divider cell into the parent page. - */ - assert( i<nNew-1 || j==nCell ); - if( j<nCell ){ - u8 *pCell; - u8 *pTemp; - int sz; - - assert( j<nMaxCells ); - pCell = apCell[j]; - sz = szCell[j] + leafCorrection; - pTemp = &aOvflSpace[iOvflSpace]; - if( !pNew->leaf ){ - memcpy(&pNew->aData[8], pCell, 4); - }else if( leafData ){ - /* If the tree is a leaf-data tree, and the siblings are leaves, - ** then there is no divider cell in apCell[]. Instead, the divider - ** cell consists of the integer key for the right-most cell of - ** the sibling-page assembled above only. - */ - CellInfo info; - j--; - btreeParseCellPtr(pNew, apCell[j], &info); - pCell = pTemp; - sz = 4 + putVarint(&pCell[4], info.nKey); - pTemp = 0; + /* Verify condition (1): If cells are moving left, update iPg + ** only after iPg-1 has already been updated. */ + assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] ); + + /* Verify condition (2): If cells are moving right, update iPg + ** only after iPg+1 has already been updated. */ + assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] ); + + if( iPg==0 ){ + iNew = iOld = 0; + nNewCell = cntNew[0]; }else{ - pCell -= 4; - /* Obscure case for non-leaf-data trees: If the cell at pCell was - ** previously stored on a leaf node, and its reported size was 4 - ** bytes, then it may actually be smaller than this - ** (see btreeParseCellPtr(), 4 bytes is the minimum size of - ** any cell). But it is important to pass the correct size to - ** insertCell(), so reparse the cell now. - ** - ** Note that this can never happen in an SQLite data file, as all - ** cells are at least 4 bytes. It only happens in b-trees used - ** to evaluate "IN (SELECT ...)" and similar clauses. - */ - if( szCell[j]==4 ){ - assert(leafCorrection==4); - sz = cellSizePtr(pParent, pCell); - } + iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : nCell; + iNew = cntNew[iPg-1] + !leafData; + nNewCell = cntNew[iPg] - iNew; } - iOvflSpace += sz; - assert( sz<=pBt->maxLocal+23 ); - assert( iOvflSpace <= (int)pBt->pageSize ); - insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc); - if( rc!=SQLITE_OK ) goto balance_cleanup; - assert( sqlite3PagerIswriteable(pParent->pDbPage) ); - j++; - nxDiv++; + editPage(apNew[iPg], iOld, iNew, nNewCell, apCell, szCell); + abDone[iPg]++; + apNew[iPg]->nFree = usableSpace-szNew[iPg]; + assert( apNew[iPg]->nOverflow==0 ); + assert( apNew[iPg]->nCell==nNewCell ); } } - assert( j==nCell ); + + /* All pages have been processed exactly once */ + assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 ); + assert( nOld>0 ); assert( nNew>0 ); - if( (pageFlags & PTF_LEAF)==0 ){ - u8 *zChild = &apCopy[nOld-1]->aData[8]; - memcpy(&apNew[nNew-1]->aData[8], zChild, 4); - } if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){ /* The root page of the b-tree now contains no cells. The only sibling @@ -6765,126 +7243,50 @@ static int balance_nonroot( ** sets all pointer-map entries corresponding to database image pages ** for which the pointer is stored within the content being copied. ** - ** The second assert below verifies that the child page is defragmented - ** (it must be, as it was just reconstructed using assemblePage()). This - ** is important if the parent page happens to be page 1 of the database - ** image. */ + ** It is critical that the child page be defragmented before being + ** copied into the parent, because if the parent is page 1 then it will + ** by smaller than the child due to the database header, and so all the + ** free space needs to be up front. + */ assert( nNew==1 ); + rc = defragmentPage(apNew[0]); + testcase( rc!=SQLITE_OK ); assert( apNew[0]->nFree == - (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2) + (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2) + || rc!=SQLITE_OK ); copyNodeContent(apNew[0], pParent, &rc); freePage(apNew[0], &rc); - }else if( ISAUTOVACUUM ){ - /* Fix the pointer-map entries for all the cells that were shifted around. - ** There are several different types of pointer-map entries that need to - ** be dealt with by this routine. Some of these have been set already, but - ** many have not. The following is a summary: - ** - ** 1) The entries associated with new sibling pages that were not - ** siblings when this function was called. These have already - ** been set. We don't need to worry about old siblings that were - ** moved to the free-list - the freePage() code has taken care - ** of those. - ** - ** 2) The pointer-map entries associated with the first overflow - ** page in any overflow chains used by new divider cells. These - ** have also already been taken care of by the insertCell() code. - ** - ** 3) If the sibling pages are not leaves, then the child pages of - ** cells stored on the sibling pages may need to be updated. - ** - ** 4) If the sibling pages are not internal intkey nodes, then any - ** overflow pages used by these cells may need to be updated - ** (internal intkey nodes never contain pointers to overflow pages). - ** - ** 5) If the sibling pages are not leaves, then the pointer-map - ** entries for the right-child pages of each sibling may need - ** to be updated. - ** - ** Cases 1 and 2 are dealt with above by other code. The next - ** block deals with cases 3 and 4 and the one after that, case 5. Since - ** setting a pointer map entry is a relatively expensive operation, this - ** code only sets pointer map entries for child or overflow pages that have - ** actually moved between pages. */ - MemPage *pNew = apNew[0]; - MemPage *pOld = apCopy[0]; - int nOverflow = pOld->nOverflow; - int iNextOld = pOld->nCell + nOverflow; - int iOverflow = (nOverflow ? pOld->aiOvfl[0] : -1); - j = 0; /* Current 'old' sibling page */ - k = 0; /* Current 'new' sibling page */ - for(i=0; i<nCell; i++){ - int isDivider = 0; - while( i==iNextOld ){ - /* Cell i is the cell immediately following the last cell on old - ** sibling page j. If the siblings are not leaf pages of an - ** intkey b-tree, then cell i was a divider cell. */ - assert( j+1 < ArraySize(apCopy) ); - assert( j+1 < nOld ); - pOld = apCopy[++j]; - iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow; - if( pOld->nOverflow ){ - nOverflow = pOld->nOverflow; - iOverflow = i + !leafData + pOld->aiOvfl[0]; - } - isDivider = !leafData; - } - - assert(nOverflow>0 || iOverflow<i ); - assert(nOverflow<2 || pOld->aiOvfl[0]==pOld->aiOvfl[1]-1); - assert(nOverflow<3 || pOld->aiOvfl[1]==pOld->aiOvfl[2]-1); - if( i==iOverflow ){ - isDivider = 1; - if( (--nOverflow)>0 ){ - iOverflow++; - } - } - - if( i==cntNew[k] ){ - /* Cell i is the cell immediately following the last cell on new - ** sibling page k. If the siblings are not leaf pages of an - ** intkey b-tree, then cell i is a divider cell. */ - pNew = apNew[++k]; - if( !leafData ) continue; - } - assert( j<nOld ); - assert( k<nNew ); - - /* If the cell was originally divider cell (and is not now) or - ** an overflow cell, or if the cell was located on a different sibling - ** page before the balancing, then the pointer map entries associated - ** with any child or overflow pages need to be updated. */ - if( isDivider || pOld->pgno!=pNew->pgno ){ - if( !leafCorrection ){ - ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc); - } - if( szCell[i]>pNew->minLocal ){ - ptrmapPutOvflPtr(pNew, apCell[i], &rc); - } - } + }else if( ISAUTOVACUUM && !leafCorrection ){ + /* Fix the pointer map entries associated with the right-child of each + ** sibling page. All other pointer map entries have already been taken + ** care of. */ + for(i=0; i<nNew; i++){ + u32 key = get4byte(&apNew[i]->aData[8]); + ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); } + } - if( !leafCorrection ){ - for(i=0; i<nNew; i++){ - u32 key = get4byte(&apNew[i]->aData[8]); - ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); - } - } + assert( pParent->isInit ); + TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", + nOld, nNew, nCell)); + + /* Free any old pages that were not reused as new pages. + */ + for(i=nNew; i<nOld; i++){ + freePage(apOld[i], &rc); + } #if 0 + if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){ /* The ptrmapCheckPages() contains assert() statements that verify that ** all pointer map pages are set correctly. This is helpful while ** debugging. This is usually disabled because a corrupt database may ** cause an assert() statement to fail. */ ptrmapCheckPages(apNew, nNew); ptrmapCheckPages(&pParent, 1); -#endif } - - assert( pParent->isInit ); - TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", - nOld, nNew, nCell)); +#endif /* ** Cleanup before returning. @@ -7776,6 +8178,13 @@ int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ ** The schema layer numbers meta values differently. At the schema ** layer (and the SetCookie and ReadCookie opcodes) the number of ** free pages is not visible. So Cookie[0] is the same as Meta[1]. +** +** This routine treats Meta[BTREE_DATA_VERSION] as a special case. Instead +** of reading the value out of the header, it instead loads the "DataVersion" +** from the pager. The BTREE_DATA_VERSION value is not actually stored in the +** database file. It is a number computed by the pager. But its access +** pattern is the same as header meta values, and so it is convenient to +** read it from this routine. */ void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ BtShared *pBt = p->pBt; @@ -7786,7 +8195,11 @@ void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ assert( pBt->pPage1 ); assert( idx>=0 && idx<=15 ); - *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); + if( idx==BTREE_DATA_VERSION ){ + *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion; + }else{ + *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); + } /* If auto-vacuum is disabled in this build and this is an auto-vacuum ** database, mark the database as read-only. */ @@ -7877,7 +8290,7 @@ int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){ if( pCur->iPage==0 ){ /* All pages of the b-tree have been visited. Return successfully. */ *pnEntry = nEntry; - return SQLITE_OK; + return moveToRoot(pCur); } moveToParent(pCur); }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell ); @@ -8269,8 +8682,14 @@ static int checkTreePage( assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */ memset(hit+contentOffset, 0, usableSize-contentOffset); memset(hit, 1, contentOffset); + /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the + ** number of cells on the page. */ nCell = get2byte(&data[hdr+3]); + /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page + ** immediately follows the b-tree page header. */ cellStart = hdr + 12 - 4*pPage->leaf; + /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte + ** integer offsets to the cell contents. */ for(i=0; i<nCell; i++){ int pc = get2byte(&data[cellStart+i*2]); u32 size = 65536; @@ -8286,6 +8705,9 @@ static int checkTreePage( for(j=pc+size-1; j>=pc; j--) hit[j]++; } } + /* EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header + ** is the offset of the first freeblock, or zero if there are no + ** freeblocks on the page. */ i = get2byte(&data[hdr+1]); while( i>0 ){ int size, j; @@ -8293,7 +8715,13 @@ static int checkTreePage( size = get2byte(&data[i+2]); assert( i+size<=usableSize ); /* Enforced by btreeInitPage() */ for(j=i+size-1; j>=i; j--) hit[j]++; + /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a + ** big-endian integer which is the offset in the b-tree page of the next + ** freeblock in the chain, or zero if the freeblock is the last on the + ** chain. */ j = get2byte(&data[i]); + /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of + ** increasing offset. */ assert( j==0 || j>i+size ); /* Enforced by btreeInitPage() */ assert( j<=usableSize-4 ); /* Enforced by btreeInitPage() */ i = j; @@ -8307,6 +8735,11 @@ static int checkTreePage( break; } } + /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments + ** is stored in the fifth field of the b-tree page header. + ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the + ** number of fragmented free bytes within the cell content area. + */ if( cnt!=data[hdr+7] ){ checkAppendMsg(pCheck, "Fragmentation of %d bytes reported as %d on page %d", @@ -8709,3 +9142,8 @@ void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){ int sqlite3BtreeIsReadonly(Btree *p){ return (p->pBt->btsFlags & BTS_READ_ONLY)!=0; } + +/* +** Return the size of the header added to each page by this module. +*/ +int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); } |