diff options
Diffstat (limited to 'lib/libsqlite3/src/btree.c')
-rw-r--r-- | lib/libsqlite3/src/btree.c | 1588 |
1 files changed, 991 insertions, 597 deletions
diff --git a/lib/libsqlite3/src/btree.c b/lib/libsqlite3/src/btree.c index 789796d55ae..2692ade8e48 100644 --- a/lib/libsqlite3/src/btree.c +++ b/lib/libsqlite3/src/btree.c @@ -490,13 +490,15 @@ static void invalidateIncrblobCursors( int isClearTable /* True if all rows are being deleted */ ){ BtCursor *p; - BtShared *pBt = pBtree->pBt; + if( pBtree->hasIncrblobCur==0 ) return; assert( sqlite3BtreeHoldsMutex(pBtree) ); - for(p=pBt->pCursor; p; p=p->pNext){ - if( (p->curFlags & BTCF_Incrblob)!=0 - && (isClearTable || p->info.nKey==iRow) - ){ - p->eState = CURSOR_INVALID; + pBtree->hasIncrblobCur = 0; + for(p=pBtree->pBt->pCursor; p; p=p->pNext){ + if( (p->curFlags & BTCF_Incrblob)!=0 ){ + pBtree->hasIncrblobCur = 1; + if( isClearTable || p->info.nKey==iRow ){ + p->eState = CURSOR_INVALID; + } } } } @@ -618,7 +620,7 @@ static int saveCursorPosition(BtCursor *pCur){ ** table, then malloc space for and store the pCur->nKey bytes of key ** data. */ - if( 0==pCur->apPage[0]->intKey ){ + if( 0==pCur->curIntKey ){ void *pKey = sqlite3Malloc( pCur->nKey ); if( pKey ){ rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey); @@ -631,7 +633,7 @@ static int saveCursorPosition(BtCursor *pCur){ rc = SQLITE_NOMEM; } } - assert( !pCur->apPage[0]->intKey || !pCur->pKey ); + assert( !pCur->curIntKey || !pCur->pKey ); if( rc==SQLITE_OK ){ btreeReleaseAllCursorPages(pCur); @@ -653,6 +655,15 @@ static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*); ** routine is called just before cursor pExcept is used to modify the ** table, for example in BtreeDelete() or BtreeInsert(). ** +** If there are two or more cursors on the same btree, then all such +** cursors should have their BTCF_Multiple flag set. The btreeCursor() +** routine enforces that rule. This routine only needs to be called in +** the uncommon case when pExpect has the BTCF_Multiple flag set. +** +** If pExpect!=NULL and if no other cursors are found on the same root-page, +** then the BTCF_Multiple flag on pExpect is cleared, to avoid another +** pointless call to this routine. +** ** Implementation note: This routine merely checks to see if any cursors ** need to be saved. It calls out to saveCursorsOnList() in the (unusual) ** event that cursors are in need to being saved. @@ -664,7 +675,9 @@ static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ for(p=pBt->pCursor; p; p=p->pNext){ if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break; } - return p ? saveCursorsOnList(p, iRoot, pExcept) : SQLITE_OK; + if( p ) return saveCursorsOnList(p, iRoot, pExcept); + if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple; + return SQLITE_OK; } /* This helper routine to saveAllCursors does the actual work of saving @@ -952,39 +965,88 @@ static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ ** the page, 1 means the second cell, and so forth) return a pointer ** to the cell content. ** +** findCellPastPtr() does the same except it skips past the initial +** 4-byte child pointer found on interior pages, if there is one. +** ** This routine works only for pages that do not contain overflow cells. */ #define findCell(P,I) \ - ((P)->aData + ((P)->maskPage & get2byte(&(P)->aCellIdx[2*(I)]))) -#define findCellv2(D,M,O,I) (D+(M&get2byte(D+(O+2*(I))))) + ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)]))) +#define findCellPastPtr(P,I) \ + ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)]))) /* -** This a more complex version of findCell() that works for -** pages that do contain overflow cells. +** This is common tail processing for btreeParseCellPtr() and +** btreeParseCellPtrIndex() for the case when the cell does not fit entirely +** on a single B-tree page. Make necessary adjustments to the CellInfo +** structure. */ -static u8 *findOverflowCell(MemPage *pPage, int iCell){ - int i; - assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - for(i=pPage->nOverflow-1; i>=0; i--){ - int k; - k = pPage->aiOvfl[i]; - if( k<=iCell ){ - if( k==iCell ){ - return pPage->apOvfl[i]; - } - iCell--; - } +static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow( + MemPage *pPage, /* Page containing the cell */ + u8 *pCell, /* Pointer to the cell text. */ + CellInfo *pInfo /* Fill in this structure */ +){ + /* If the payload will not fit completely on the local page, we have + ** to decide how much to store locally and how much to spill onto + ** overflow pages. The strategy is to minimize the amount of unused + ** space on overflow pages while keeping the amount of local storage + ** in between minLocal and maxLocal. + ** + ** Warning: changing the way overflow payload is distributed in any + ** way will result in an incompatible file format. + */ + int minLocal; /* Minimum amount of payload held locally */ + int maxLocal; /* Maximum amount of payload held locally */ + int surplus; /* Overflow payload available for local storage */ + + minLocal = pPage->minLocal; + maxLocal = pPage->maxLocal; + surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4); + testcase( surplus==maxLocal ); + testcase( surplus==maxLocal+1 ); + if( surplus <= maxLocal ){ + pInfo->nLocal = (u16)surplus; + }else{ + pInfo->nLocal = (u16)minLocal; } - return findCell(pPage, iCell); + pInfo->iOverflow = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell); + pInfo->nSize = pInfo->iOverflow + 4; } /* -** Parse a cell content block and fill in the CellInfo structure. There -** are two versions of this function. btreeParseCell() takes a -** cell index as the second argument and btreeParseCellPtr() -** takes a pointer to the body of the cell as its second argument. +** The following routines are implementations of the MemPage.xParseCell() +** method. +** +** Parse a cell content block and fill in the CellInfo structure. +** +** btreeParseCellPtr() => table btree leaf nodes +** btreeParseCellNoPayload() => table btree internal nodes +** btreeParseCellPtrIndex() => index btree nodes +** +** There is also a wrapper function btreeParseCell() that works for +** all MemPage types and that references the cell by index rather than +** by pointer. */ +static void btreeParseCellPtrNoPayload( + MemPage *pPage, /* Page containing the cell */ + u8 *pCell, /* Pointer to the cell text. */ + CellInfo *pInfo /* Fill in this structure */ +){ + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + assert( pPage->leaf==0 ); + assert( pPage->noPayload ); + assert( pPage->childPtrSize==4 ); +#ifndef SQLITE_DEBUG + UNUSED_PARAMETER(pPage); +#endif + pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey); + pInfo->nPayload = 0; + pInfo->nLocal = 0; + pInfo->iOverflow = 0; + pInfo->pPayload = 0; + return; +} static void btreeParseCellPtr( MemPage *pPage, /* Page containing the cell */ u8 *pCell, /* Pointer to the cell text. */ @@ -992,26 +1054,93 @@ static void btreeParseCellPtr( ){ u8 *pIter; /* For scanning through pCell */ u32 nPayload; /* Number of bytes of cell payload */ + u64 iKey; /* Extracted Key value */ assert( sqlite3_mutex_held(pPage->pBt->mutex) ); assert( pPage->leaf==0 || pPage->leaf==1 ); - if( pPage->intKeyLeaf ){ - assert( pPage->childPtrSize==0 ); - pIter = pCell + getVarint32(pCell, nPayload); - pIter += getVarint(pIter, (u64*)&pInfo->nKey); - }else if( pPage->noPayload ){ - assert( pPage->childPtrSize==4 ); - pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey); - pInfo->nPayload = 0; - pInfo->nLocal = 0; + assert( pPage->intKeyLeaf || pPage->noPayload ); + assert( pPage->noPayload==0 ); + assert( pPage->intKeyLeaf ); + assert( pPage->childPtrSize==0 ); + pIter = pCell; + + /* The next block of code is equivalent to: + ** + ** pIter += getVarint32(pIter, nPayload); + ** + ** The code is inlined to avoid a function call. + */ + nPayload = *pIter; + if( nPayload>=0x80 ){ + u8 *pEnd = &pIter[8]; + nPayload &= 0x7f; + do{ + nPayload = (nPayload<<7) | (*++pIter & 0x7f); + }while( (*pIter)>=0x80 && pIter<pEnd ); + } + pIter++; + + /* The next block of code is equivalent to: + ** + ** pIter += getVarint(pIter, (u64*)&pInfo->nKey); + ** + ** The code is inlined to avoid a function call. + */ + iKey = *pIter; + if( iKey>=0x80 ){ + u8 *pEnd = &pIter[7]; + iKey &= 0x7f; + while(1){ + iKey = (iKey<<7) | (*++pIter & 0x7f); + if( (*pIter)<0x80 ) break; + if( pIter>=pEnd ){ + iKey = (iKey<<8) | *++pIter; + break; + } + } + } + pIter++; + + pInfo->nKey = *(i64*)&iKey; + pInfo->nPayload = nPayload; + pInfo->pPayload = pIter; + testcase( nPayload==pPage->maxLocal ); + testcase( nPayload==pPage->maxLocal+1 ); + if( nPayload<=pPage->maxLocal ){ + /* This is the (easy) common case where the entire payload fits + ** on the local page. No overflow is required. + */ + pInfo->nSize = nPayload + (u16)(pIter - pCell); + if( pInfo->nSize<4 ) pInfo->nSize = 4; + pInfo->nLocal = (u16)nPayload; pInfo->iOverflow = 0; - pInfo->pPayload = 0; - return; }else{ - pIter = pCell + pPage->childPtrSize; - pIter += getVarint32(pIter, nPayload); - pInfo->nKey = nPayload; + btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo); + } +} +static void btreeParseCellPtrIndex( + MemPage *pPage, /* Page containing the cell */ + u8 *pCell, /* Pointer to the cell text. */ + CellInfo *pInfo /* Fill in this structure */ +){ + u8 *pIter; /* For scanning through pCell */ + u32 nPayload; /* Number of bytes of cell payload */ + + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + assert( pPage->leaf==0 || pPage->leaf==1 ); + assert( pPage->intKeyLeaf==0 ); + assert( pPage->noPayload==0 ); + pIter = pCell + pPage->childPtrSize; + nPayload = *pIter; + if( nPayload>=0x80 ){ + u8 *pEnd = &pIter[8]; + nPayload &= 0x7f; + do{ + nPayload = (nPayload<<7) | (*++pIter & 0x7f); + }while( *(pIter)>=0x80 && pIter<pEnd ); } + pIter++; + pInfo->nKey = nPayload; pInfo->nPayload = nPayload; pInfo->pPayload = pIter; testcase( nPayload==pPage->maxLocal ); @@ -1025,31 +1154,7 @@ static void btreeParseCellPtr( pInfo->nLocal = (u16)nPayload; pInfo->iOverflow = 0; }else{ - /* If the payload will not fit completely on the local page, we have - ** to decide how much to store locally and how much to spill onto - ** overflow pages. The strategy is to minimize the amount of unused - ** space on overflow pages while keeping the amount of local storage - ** in between minLocal and maxLocal. - ** - ** Warning: changing the way overflow payload is distributed in any - ** way will result in an incompatible file format. - */ - int minLocal; /* Minimum amount of payload held locally */ - int maxLocal; /* Maximum amount of payload held locally */ - int surplus; /* Overflow payload available for local storage */ - - minLocal = pPage->minLocal; - maxLocal = pPage->maxLocal; - surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4); - testcase( surplus==maxLocal ); - testcase( surplus==maxLocal+1 ); - if( surplus <= maxLocal ){ - pInfo->nLocal = (u16)surplus; - }else{ - pInfo->nLocal = (u16)minLocal; - } - pInfo->iOverflow = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell); - pInfo->nSize = pInfo->iOverflow + 4; + btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo); } } static void btreeParseCell( @@ -1057,14 +1162,20 @@ static void btreeParseCell( int iCell, /* The cell index. First cell is 0 */ CellInfo *pInfo /* Fill in this structure */ ){ - btreeParseCellPtr(pPage, findCell(pPage, iCell), pInfo); + pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo); } /* +** The following routines are implementations of the MemPage.xCellSize +** method. +** ** Compute the total number of bytes that a Cell needs in the cell ** data area of the btree-page. The return number includes the cell ** data header and the local payload, but not any overflow page or ** the space used by the cell pointer. +** +** cellSizePtrNoPayload() => table internal nodes +** cellSizePtr() => all index nodes & table leaf nodes */ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */ @@ -1077,18 +1188,13 @@ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of ** this function verifies that this invariant is not violated. */ CellInfo debuginfo; - btreeParseCellPtr(pPage, pCell, &debuginfo); + pPage->xParseCell(pPage, pCell, &debuginfo); #endif - if( pPage->noPayload ){ - pEnd = &pIter[9]; - while( (*pIter++)&0x80 && pIter<pEnd ); - assert( pPage->childPtrSize==4 ); - return (u16)(pIter - pCell); - } + assert( pPage->noPayload==0 ); nSize = *pIter; if( nSize>=0x80 ){ - pEnd = &pIter[9]; + pEnd = &pIter[8]; nSize &= 0x7f; do{ nSize = (nSize<<7) | (*++pIter & 0x7f); @@ -1120,12 +1226,34 @@ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ assert( nSize==debuginfo.nSize || CORRUPT_DB ); return (u16)nSize; } +static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){ + u8 *pIter = pCell + 4; /* For looping over bytes of pCell */ + u8 *pEnd; /* End mark for a varint */ + +#ifdef SQLITE_DEBUG + /* The value returned by this function should always be the same as + ** the (CellInfo.nSize) value found by doing a full parse of the + ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of + ** this function verifies that this invariant is not violated. */ + CellInfo debuginfo; + pPage->xParseCell(pPage, pCell, &debuginfo); +#else + UNUSED_PARAMETER(pPage); +#endif + + assert( pPage->childPtrSize==4 ); + pEnd = pIter + 9; + while( (*pIter++)&0x80 && pIter<pEnd ); + assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB ); + return (u16)(pIter - pCell); +} + #ifdef SQLITE_DEBUG /* This variation on cellSizePtr() is used inside of assert() statements ** only. */ static u16 cellSize(MemPage *pPage, int iCell){ - return cellSizePtr(pPage, findCell(pPage, iCell)); + return pPage->xCellSize(pPage, findCell(pPage, iCell)); } #endif @@ -1139,7 +1267,7 @@ static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){ CellInfo info; if( *pRC ) return; assert( pCell!=0 ); - btreeParseCellPtr(pPage, pCell, &info); + pPage->xParseCell(pPage, pCell, &info); if( info.iOverflow ){ Pgno ovfl = get4byte(&pCell[info.iOverflow]); ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC); @@ -1196,26 +1324,18 @@ static int defragmentPage(MemPage *pPage){ pc = get2byte(pAddr); testcase( pc==iCellFirst ); testcase( pc==iCellLast ); -#if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) /* These conditions have already been verified in btreeInitPage() - ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined + ** if PRAGMA cell_size_check=ON. */ if( pc<iCellFirst || pc>iCellLast ){ return SQLITE_CORRUPT_BKPT; } -#endif assert( pc>=iCellFirst && pc<=iCellLast ); - size = cellSizePtr(pPage, &src[pc]); + size = pPage->xCellSize(pPage, &src[pc]); cbrk -= size; -#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) - if( cbrk<iCellFirst ){ - return SQLITE_CORRUPT_BKPT; - } -#else if( cbrk<iCellFirst || pc+size>usableSize ){ return SQLITE_CORRUPT_BKPT; } -#endif assert( cbrk+size<=usableSize && cbrk>=iCellFirst ); testcase( cbrk+size==usableSize ); testcase( pc+size==usableSize ); @@ -1253,18 +1373,20 @@ static int defragmentPage(MemPage *pPage){ ** This function may detect corruption within pPg. If corruption is ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned. ** -** If a slot of at least nByte bytes is found but cannot be used because -** there are already at least 60 fragmented bytes on the page, return NULL. -** In this case, if pbDefrag parameter is not NULL, set *pbDefrag to true. +** Slots on the free list that are between 1 and 3 bytes larger than nByte +** will be ignored if adding the extra space to the fragmentation count +** causes the fragmentation count to exceed 60. */ -static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc, int *pbDefrag){ +static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){ const int hdr = pPg->hdrOffset; u8 * const aData = pPg->aData; - int iAddr; - int pc; + int iAddr = hdr + 1; + int pc = get2byte(&aData[iAddr]); + int x; int usableSize = pPg->pBt->usableSize; - for(iAddr=hdr+1; (pc = get2byte(&aData[iAddr]))>0; iAddr=pc){ + assert( pc>0 ); + do{ int size; /* Size of the free slot */ /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of ** increasing offset. */ @@ -1276,24 +1398,21 @@ static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc, int *pbDefrag){ ** freeblock form a big-endian integer which is the size of the freeblock ** in bytes, including the 4-byte header. */ size = get2byte(&aData[pc+2]); - if( size>=nByte ){ - int x = size - nByte; + if( (x = size - nByte)>=0 ){ testcase( x==4 ); testcase( x==3 ); - if( x<4 ){ + if( pc < pPg->cellOffset+2*pPg->nCell || size+pc > usableSize ){ + *pRc = SQLITE_CORRUPT_BKPT; + return 0; + }else if( x<4 ){ /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total ** number of bytes in fragments may not exceed 60. */ - if( aData[hdr+7]>=60 ){ - if( pbDefrag ) *pbDefrag = 1; - return 0; - } + if( aData[hdr+7]>57 ) return 0; + /* Remove the slot from the free-list. Update the number of ** fragmented bytes within the page. */ memcpy(&aData[iAddr], &aData[pc], 2); aData[hdr+7] += (u8)x; - }else if( size+pc > usableSize ){ - *pRc = SQLITE_CORRUPT_BKPT; - return 0; }else{ /* The slot remains on the free-list. Reduce its size to account ** for the portion used by the new allocation. */ @@ -1301,7 +1420,9 @@ static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc, int *pbDefrag){ } return &aData[pc + x]; } - } + iAddr = pc; + pc = get2byte(&aData[pc]); + }while( pc ); return 0; } @@ -1342,8 +1463,15 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ ** then the cell content offset of an empty page wants to be 65536. ** However, that integer is too large to be stored in a 2-byte unsigned ** integer, so a value of 0 is used in its place. */ - top = get2byteNotZero(&data[hdr+5]); - if( gap>top ) return SQLITE_CORRUPT_BKPT; + top = get2byte(&data[hdr+5]); + assert( top<=(int)pPage->pBt->usableSize ); /* Prevent by getAndInitPage() */ + if( gap>top ){ + if( top==0 && pPage->pBt->usableSize==65536 ){ + top = 65536; + }else{ + return SQLITE_CORRUPT_BKPT; + } + } /* If there is enough space between gap and top for one more cell pointer ** array entry offset, and if the freelist is not empty, then search the @@ -1352,15 +1480,14 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ testcase( gap+2==top ); testcase( gap+1==top ); testcase( gap==top ); - if( gap+2<=top && (data[hdr+1] || data[hdr+2]) ){ - int bDefrag = 0; - u8 *pSpace = pageFindSlot(pPage, nByte, &rc, &bDefrag); - if( rc ) return rc; - if( bDefrag ) goto defragment_page; + if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){ + u8 *pSpace = pageFindSlot(pPage, nByte, &rc); if( pSpace ){ assert( pSpace>=data && (pSpace - data)<65536 ); *pIdx = (int)(pSpace - data); return SQLITE_OK; + }else if( rc ){ + return rc; } } @@ -1369,7 +1496,6 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ */ testcase( gap+2+nByte==top ); if( gap+2+nByte>top ){ - defragment_page: assert( pPage->nCell>0 || CORRUPT_DB ); rc = defragmentPage(pPage); if( rc ) return rc; @@ -1416,7 +1542,7 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ assert( pPage->pBt!=0 ); assert( sqlite3PagerIswriteable(pPage->pDbPage) ); - assert( iStart>=pPage->hdrOffset+6+pPage->childPtrSize ); + assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize ); assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); assert( iSize>=4 ); /* Minimum cell size is 4 */ @@ -1445,7 +1571,7 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ /* At this point: ** iFreeBlk: First freeblock after iStart, or zero if none - ** iPtr: The address of a pointer iFreeBlk + ** iPtr: The address of a pointer to iFreeBlk ** ** Check to see if iFreeBlk should be coalesced onto the end of iStart. */ @@ -1453,6 +1579,7 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ nFrag = iFreeBlk - iEnd; if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT; iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]); + if( iEnd > pPage->pBt->usableSize ) return SQLITE_CORRUPT_BKPT; iSize = iEnd - iStart; iFreeBlk = get2byte(&data[iFreeBlk]); } @@ -1510,6 +1637,7 @@ static int decodeFlags(MemPage *pPage, int flagByte){ pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 ); flagByte &= ~PTF_LEAF; pPage->childPtrSize = 4-4*pPage->leaf; + pPage->xCellSize = cellSizePtr; pBt = pPage->pBt; if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ /* EVIDENCE-OF: R-03640-13415 A value of 5 means the page is an interior @@ -1519,8 +1647,16 @@ static int decodeFlags(MemPage *pPage, int flagByte){ ** table b-tree page. */ assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 ); pPage->intKey = 1; - pPage->intKeyLeaf = pPage->leaf; - pPage->noPayload = !pPage->leaf; + if( pPage->leaf ){ + pPage->intKeyLeaf = 1; + pPage->noPayload = 0; + pPage->xParseCell = btreeParseCellPtr; + }else{ + pPage->intKeyLeaf = 0; + pPage->noPayload = 1; + pPage->xCellSize = cellSizePtrNoPayload; + pPage->xParseCell = btreeParseCellPtrNoPayload; + } pPage->maxLocal = pBt->maxLeaf; pPage->minLocal = pBt->minLeaf; }else if( flagByte==PTF_ZERODATA ){ @@ -1533,6 +1669,7 @@ static int decodeFlags(MemPage *pPage, int flagByte){ pPage->intKey = 0; pPage->intKeyLeaf = 0; pPage->noPayload = 0; + pPage->xParseCell = btreeParseCellPtrIndex; pPage->maxLocal = pBt->maxLocal; pPage->minLocal = pBt->minLocal; }else{ @@ -1556,6 +1693,7 @@ static int decodeFlags(MemPage *pPage, int flagByte){ static int btreeInitPage(MemPage *pPage){ assert( pPage->pBt!=0 ); + assert( pPage->pBt->db!=0 ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); @@ -1587,6 +1725,7 @@ static int btreeInitPage(MemPage *pPage){ pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize; pPage->aDataEnd = &data[usableSize]; pPage->aCellIdx = &data[cellOffset]; + pPage->aDataOfst = &data[pPage->childPtrSize]; /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates ** the start of the cell content area. A zero value for this integer is ** interpreted as 65536. */ @@ -1614,20 +1753,19 @@ static int btreeInitPage(MemPage *pPage){ */ iCellFirst = cellOffset + 2*pPage->nCell; iCellLast = usableSize - 4; -#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) - { + if( pBt->db->flags & SQLITE_CellSizeCk ){ int i; /* Index into the cell pointer array */ int sz; /* Size of a cell */ if( !pPage->leaf ) iCellLast--; for(i=0; i<pPage->nCell; i++){ - pc = get2byte(&data[cellOffset+i*2]); + pc = get2byteAligned(&data[cellOffset+i*2]); testcase( pc==iCellFirst ); testcase( pc==iCellLast ); if( pc<iCellFirst || pc>iCellLast ){ return SQLITE_CORRUPT_BKPT; } - sz = cellSizePtr(pPage, &data[pc]); + sz = pPage->xCellSize(pPage, &data[pc]); testcase( pc+sz==usableSize ); if( pc+sz>usableSize ){ return SQLITE_CORRUPT_BKPT; @@ -1635,7 +1773,6 @@ static int btreeInitPage(MemPage *pPage){ } if( !pPage->leaf ) iCellLast++; } -#endif /* Compute the total free space on the page ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the @@ -1708,6 +1845,7 @@ static void zeroPage(MemPage *pPage, int flags){ pPage->cellOffset = first; pPage->aDataEnd = &data[pBt->usableSize]; pPage->aCellIdx = &data[first]; + pPage->aDataOfst = &data[pPage->childPtrSize]; pPage->nOverflow = 0; assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); pPage->maskPage = (u16)(pBt->pageSize - 1); @@ -1726,16 +1864,16 @@ static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ pPage->pDbPage = pDbPage; pPage->pBt = pBt; pPage->pgno = pgno; - pPage->hdrOffset = pPage->pgno==1 ? 100 : 0; + pPage->hdrOffset = pgno==1 ? 100 : 0; return pPage; } /* ** Get a page from the pager. Initialize the MemPage.pBt and -** MemPage.aData elements if needed. +** MemPage.aData elements if needed. See also: btreeGetUnusedPage(). ** -** If the noContent flag is set, it means that we do not care about -** the content of the page at this time. So do not go to the disk +** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care +** about the content of the page at this time. So do not go to the disk ** to fetch the content. Just fill in the content with zeros for now. ** If in the future we call sqlite3PagerWrite() on this page, that ** means we have started to be concerned about content and the disk @@ -1787,35 +1925,62 @@ u32 sqlite3BtreeLastPage(Btree *p){ } /* -** Get a page from the pager and initialize it. This routine is just a -** convenience wrapper around separate calls to btreeGetPage() and -** btreeInitPage(). +** Get a page from the pager and initialize it. +** +** If pCur!=0 then the page is being fetched as part of a moveToChild() +** call. Do additional sanity checking on the page in this case. +** And if the fetch fails, this routine must decrement pCur->iPage. +** +** The page is fetched as read-write unless pCur is not NULL and is +** a read-only cursor. ** -** If an error occurs, then the value *ppPage is set to is undefined. It +** If an error occurs, then *ppPage is undefined. It ** may remain unchanged, or it may be set to an invalid value. */ static int getAndInitPage( BtShared *pBt, /* The database file */ Pgno pgno, /* Number of the page to get */ MemPage **ppPage, /* Write the page pointer here */ - int bReadonly /* PAGER_GET_READONLY or 0 */ + BtCursor *pCur, /* Cursor to receive the page, or NULL */ + int bReadOnly /* True for a read-only page */ ){ int rc; + DbPage *pDbPage; assert( sqlite3_mutex_held(pBt->mutex) ); - assert( bReadonly==PAGER_GET_READONLY || bReadonly==0 ); + assert( pCur==0 || ppPage==&pCur->apPage[pCur->iPage] ); + assert( pCur==0 || bReadOnly==pCur->curPagerFlags ); + assert( pCur==0 || pCur->iPage>0 ); if( pgno>btreePagecount(pBt) ){ rc = SQLITE_CORRUPT_BKPT; - }else{ - rc = btreeGetPage(pBt, pgno, ppPage, bReadonly); - if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){ - rc = btreeInitPage(*ppPage); - if( rc!=SQLITE_OK ){ - releasePage(*ppPage); - } + goto getAndInitPage_error; + } + rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly); + if( rc ){ + goto getAndInitPage_error; + } + *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt); + if( (*ppPage)->isInit==0 ){ + rc = btreeInitPage(*ppPage); + if( rc!=SQLITE_OK ){ + releasePage(*ppPage); + goto getAndInitPage_error; } } + /* If obtaining a child page for a cursor, we must verify that the page is + ** compatible with the root page. */ + if( pCur + && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) + ){ + rc = SQLITE_CORRUPT_BKPT; + releasePage(*ppPage); + goto getAndInitPage_error; + } + return SQLITE_OK; + +getAndInitPage_error: + if( pCur ) pCur->iPage--; testcase( pgno==0 ); assert( pgno!=0 || rc==SQLITE_CORRUPT ); return rc; @@ -1825,18 +1990,49 @@ static int getAndInitPage( ** Release a MemPage. This should be called once for each prior ** call to btreeGetPage. */ +static void releasePageNotNull(MemPage *pPage){ + assert( pPage->aData ); + assert( pPage->pBt ); + assert( pPage->pDbPage!=0 ); + assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); + assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); + assert( sqlite3_mutex_held(pPage->pBt->mutex) ); + sqlite3PagerUnrefNotNull(pPage->pDbPage); +} static void releasePage(MemPage *pPage){ - if( pPage ){ - assert( pPage->aData ); - assert( pPage->pBt ); - assert( pPage->pDbPage!=0 ); - assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); - assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); - assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - sqlite3PagerUnrefNotNull(pPage->pDbPage); + if( pPage ) releasePageNotNull(pPage); +} + +/* +** Get an unused page. +** +** This works just like btreeGetPage() with the addition: +** +** * If the page is already in use for some other purpose, immediately +** release it and return an SQLITE_CURRUPT error. +** * Make sure the isInit flag is clear +*/ +static int btreeGetUnusedPage( + BtShared *pBt, /* The btree */ + Pgno pgno, /* Number of the page to fetch */ + MemPage **ppPage, /* Return the page in this parameter */ + int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */ +){ + int rc = btreeGetPage(pBt, pgno, ppPage, flags); + if( rc==SQLITE_OK ){ + if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ + releasePage(*ppPage); + *ppPage = 0; + return SQLITE_CORRUPT_BKPT; + } + (*ppPage)->isInit = 0; + }else{ + *ppPage = 0; } + return rc; } + /* ** During a rollback, when the pager reloads information into the cache ** so that the cache is restored to its original state at the start of @@ -2429,7 +2625,7 @@ int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){ if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && ((pageSize-1)&pageSize)==0 ){ assert( (pageSize & 7)==0 ); - assert( !pBt->pPage1 && !pBt->pCursor ); + assert( !pBt->pCursor ); pBt->pageSize = (u32)pageSize; freeTempSpace(pBt); } @@ -2779,7 +2975,7 @@ static void unlockBtreeIfUnused(BtShared *pBt){ assert( pPage1->aData ); assert( sqlite3PagerRefcount(pBt->pPager)==1 ); pBt->pPage1 = 0; - releasePage(pPage1); + releasePageNotNull(pPage1); } } @@ -3084,15 +3280,17 @@ static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ u8 isInitOrig = pPage->isInit; int i; int nCell; + int rc; - btreeInitPage(pPage); + rc = btreeInitPage(pPage); + if( rc ) return rc; nCell = pPage->nCell; for(i=0; i<nCell; i++){ u8 *pCell = findCell(pPage, i); if( eType==PTRMAP_OVERFLOW1 ){ CellInfo info; - btreeParseCellPtr(pPage, pCell, &info); + pPage->xParseCell(pPage, pCell, &info); if( info.iOverflow && pCell+info.iOverflow+3<=pPage->aData+pPage->maskPage && iFrom==get4byte(&pCell[info.iOverflow]) @@ -3391,7 +3589,7 @@ int sqlite3BtreeIncrVacuum(Btree *p){ static int autoVacuumCommit(BtShared *pBt){ int rc = SQLITE_OK; Pager *pPager = pBt->pPager; - VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) ); + VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); ) assert( sqlite3_mutex_held(pBt->mutex) ); invalidateAllOverflowCache(pBt); @@ -3833,6 +4031,7 @@ static int btreeCursor( BtCursor *pCur /* Space for new cursor */ ){ BtShared *pBt = p->pBt; /* Shared b-tree handle */ + BtCursor *pX; /* Looping over other all cursors */ assert( sqlite3BtreeHoldsMutex(p) ); assert( wrFlag==0 || wrFlag==1 ); @@ -3848,10 +4047,8 @@ static int btreeCursor( assert( p->inTrans>TRANS_NONE ); assert( wrFlag==0 || p->inTrans==TRANS_WRITE ); assert( pBt->pPage1 && pBt->pPage1->aData ); + assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 ); - if( NEVER(wrFlag && (pBt->btsFlags & BTS_READ_ONLY)!=0) ){ - return SQLITE_READONLY; - } if( wrFlag ){ allocateTempSpace(pBt); if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM; @@ -3870,10 +4067,16 @@ static int btreeCursor( pCur->pBt = pBt; assert( wrFlag==0 || wrFlag==BTCF_WriteFlag ); pCur->curFlags = wrFlag; - pCur->pNext = pBt->pCursor; - if( pCur->pNext ){ - pCur->pNext->pPrev = pCur; + pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY; + /* If there are two or more cursors on the same btree, then all such + ** cursors *must* have the BTCF_Multiple flag set. */ + for(pX=pBt->pCursor; pX; pX=pX->pNext){ + if( pX->pgnoRoot==(Pgno)iTable ){ + pX->curFlags |= BTCF_Multiple; + pCur->curFlags |= BTCF_Multiple; + } } + pCur->pNext = pBt->pCursor; pBt->pCursor = pCur; pCur->eState = CURSOR_INVALID; return SQLITE_OK; @@ -3886,9 +4089,13 @@ int sqlite3BtreeCursor( BtCursor *pCur /* Write new cursor here */ ){ int rc; - sqlite3BtreeEnter(p); - rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); - sqlite3BtreeLeave(p); + if( iTable<1 ){ + rc = SQLITE_CORRUPT_BKPT; + }else{ + sqlite3BtreeEnter(p); + rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); + sqlite3BtreeLeave(p); + } return rc; } @@ -3927,13 +4134,18 @@ int sqlite3BtreeCloseCursor(BtCursor *pCur){ BtShared *pBt = pCur->pBt; sqlite3BtreeEnter(pBtree); sqlite3BtreeClearCursor(pCur); - if( pCur->pPrev ){ - pCur->pPrev->pNext = pCur->pNext; - }else{ + assert( pBt->pCursor!=0 ); + if( pBt->pCursor==pCur ){ pBt->pCursor = pCur->pNext; - } - if( pCur->pNext ){ - pCur->pNext->pPrev = pCur->pPrev; + }else{ + BtCursor *pPrev = pBt->pCursor; + do{ + if( pPrev->pNext==pCur ){ + pPrev->pNext = pCur->pNext; + break; + } + pPrev = pPrev->pNext; + }while( ALWAYS(pPrev) ); } for(i=0; i<=pCur->iPage; i++){ releasePage(pCur->apPage[i]); @@ -3953,13 +4165,6 @@ int sqlite3BtreeCloseCursor(BtCursor *pCur){ ** ** BtCursor.info is a cache of the information in the current cell. ** Using this cache reduces the number of calls to btreeParseCell(). -** -** 2007-06-25: There is a bug in some versions of MSVC that cause the -** compiler to crash when getCellInfo() is implemented as a macro. -** But there is a measureable speed advantage to using the macro on gcc -** (when less compiler optimizations like -Os or -O0 are used and the -** compiler is not doing aggressive inlining.) So we use a real function -** for MSVC and a macro for everything else. Ticket #2457. */ #ifndef NDEBUG static void assertCellInfo(BtCursor *pCur){ @@ -3972,28 +4177,15 @@ int sqlite3BtreeCloseCursor(BtCursor *pCur){ #else #define assertCellInfo(x) #endif -#ifdef _MSC_VER - /* Use a real function in MSVC to work around bugs in that compiler. */ - static void getCellInfo(BtCursor *pCur){ - if( pCur->info.nSize==0 ){ - int iPage = pCur->iPage; - btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); - pCur->curFlags |= BTCF_ValidNKey; - }else{ - assertCellInfo(pCur); - } - } -#else /* if not _MSC_VER */ - /* Use a macro in all other compilers so that the function is inlined */ -#define getCellInfo(pCur) \ - if( pCur->info.nSize==0 ){ \ - int iPage = pCur->iPage; \ - btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \ - pCur->curFlags |= BTCF_ValidNKey; \ - }else{ \ - assertCellInfo(pCur); \ +static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){ + if( pCur->info.nSize==0 ){ + int iPage = pCur->iPage; + pCur->curFlags |= BTCF_ValidNKey; + btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); + }else{ + assertCellInfo(pCur); } -#endif /* _MSC_VER */ +} #ifndef NDEBUG /* The next routine used only within assert() statements */ /* @@ -4451,13 +4643,18 @@ static const void *fetchPayload( BtCursor *pCur, /* Cursor pointing to entry to read from */ u32 *pAmt /* Write the number of available bytes here */ ){ + u32 amt; assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]); assert( pCur->eState==CURSOR_VALID ); assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); assert( cursorHoldsMutex(pCur) ); assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); assert( pCur->info.nSize>0 ); - *pAmt = pCur->info.nLocal; + assert( pCur->info.pPayload>pCur->apPage[pCur->iPage]->aData || CORRUPT_DB ); + assert( pCur->info.pPayload<pCur->apPage[pCur->iPage]->aDataEnd ||CORRUPT_DB); + amt = (int)(pCur->apPage[pCur->iPage]->aDataEnd - pCur->info.pPayload); + if( pCur->info.nLocal<amt ) amt = pCur->info.nLocal; + *pAmt = amt; return (void*)pCur->info.pPayload; } @@ -4494,9 +4691,6 @@ const void *sqlite3BtreeDataFetch(BtCursor *pCur, u32 *pAmt){ ** vice-versa). */ static int moveToChild(BtCursor *pCur, u32 newPgno){ - int rc; - int i = pCur->iPage; - MemPage *pNewPage; BtShared *pBt = pCur->pBt; assert( cursorHoldsMutex(pCur) ); @@ -4506,19 +4700,12 @@ static int moveToChild(BtCursor *pCur, u32 newPgno){ if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ return SQLITE_CORRUPT_BKPT; } - rc = getAndInitPage(pBt, newPgno, &pNewPage, - (pCur->curFlags & BTCF_WriteFlag)==0 ? PAGER_GET_READONLY : 0); - if( rc ) return rc; - pCur->apPage[i+1] = pNewPage; - pCur->aiIdx[i+1] = 0; - pCur->iPage++; - pCur->info.nSize = 0; pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); - if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){ - return SQLITE_CORRUPT_BKPT; - } - return SQLITE_OK; + pCur->iPage++; + pCur->aiIdx[pCur->iPage] = 0; + return getAndInitPage(pBt, newPgno, &pCur->apPage[pCur->iPage], + pCur, pCur->curPagerFlags); } #if SQLITE_DEBUG @@ -4562,11 +4749,9 @@ static void moveToParent(BtCursor *pCur){ pCur->apPage[pCur->iPage]->pgno ); testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell ); - - releasePage(pCur->apPage[pCur->iPage]); - pCur->iPage--; pCur->info.nSize = 0; pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); + releasePageNotNull(pCur->apPage[pCur->iPage--]); } /* @@ -4607,18 +4792,23 @@ static int moveToRoot(BtCursor *pCur){ } if( pCur->iPage>=0 ){ - while( pCur->iPage ) releasePage(pCur->apPage[pCur->iPage--]); + while( pCur->iPage ){ + assert( pCur->apPage[pCur->iPage]!=0 ); + releasePageNotNull(pCur->apPage[pCur->iPage--]); + } }else if( pCur->pgnoRoot==0 ){ pCur->eState = CURSOR_INVALID; return SQLITE_OK; }else{ + assert( pCur->iPage==(-1) ); rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->apPage[0], - (pCur->curFlags & BTCF_WriteFlag)==0 ? PAGER_GET_READONLY : 0); + 0, pCur->curPagerFlags); if( rc!=SQLITE_OK ){ pCur->eState = CURSOR_INVALID; return rc; } pCur->iPage = 0; + pCur->curIntKey = pCur->apPage[0]->intKey; } pRoot = pCur->apPage[0]; assert( pRoot->pgno==pCur->pgnoRoot ); @@ -4821,7 +5011,7 @@ int sqlite3BtreeMovetoUnpacked( /* If the cursor is already positioned at the point we are trying ** to move to, then just return without doing any work */ if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0 - && pCur->apPage[0]->intKey + && pCur->curIntKey ){ if( pCur->info.nKey==intKey ){ *pRes = 0; @@ -4856,7 +5046,8 @@ int sqlite3BtreeMovetoUnpacked( assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 ); return SQLITE_OK; } - assert( pCur->apPage[0]->intKey || pIdxKey ); + assert( pCur->apPage[0]->intKey==pCur->curIntKey ); + assert( pCur->curIntKey || pIdxKey ); for(;;){ int lwr, upr, idx, c; Pgno chldPg; @@ -4879,7 +5070,7 @@ int sqlite3BtreeMovetoUnpacked( if( xRecordCompare==0 ){ for(;;){ i64 nCellKey; - pCell = findCell(pPage, idx) + pPage->childPtrSize; + pCell = findCellPastPtr(pPage, idx); if( pPage->intKeyLeaf ){ while( 0x80 <= *(pCell++) ){ if( pCell>=pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT; @@ -4911,8 +5102,8 @@ int sqlite3BtreeMovetoUnpacked( } }else{ for(;;){ - int nCell; - pCell = findCell(pPage, idx) + pPage->childPtrSize; + int nCell; /* Size of the pCell cell in bytes */ + pCell = findCellPastPtr(pPage, idx); /* The maximum supported page-size is 65536 bytes. This means that ** the maximum number of record bytes stored on an index B-Tree @@ -4940,12 +5131,25 @@ int sqlite3BtreeMovetoUnpacked( /* The record flows over onto one or more overflow pages. In ** this case the whole cell needs to be parsed, a buffer allocated ** and accessPayload() used to retrieve the record into the - ** buffer before VdbeRecordCompare() can be called. */ + ** buffer before VdbeRecordCompare() can be called. + ** + ** If the record is corrupt, the xRecordCompare routine may read + ** up to two varints past the end of the buffer. An extra 18 + ** bytes of padding is allocated at the end of the buffer in + ** case this happens. */ void *pCellKey; u8 * const pCellBody = pCell - pPage->childPtrSize; - btreeParseCellPtr(pPage, pCellBody, &pCur->info); + pPage->xParseCell(pPage, pCellBody, &pCur->info); nCell = (int)pCur->info.nKey; - pCellKey = sqlite3Malloc( nCell ); + testcase( nCell<0 ); /* True if key size is 2^32 or more */ + testcase( nCell==0 ); /* Invalid key size: 0x80 0x80 0x00 */ + testcase( nCell==1 ); /* Invalid key size: 0x80 0x80 0x01 */ + testcase( nCell==2 ); /* Minimum legal index key size */ + if( nCell<2 ){ + rc = SQLITE_CORRUPT_BKPT; + goto moveto_finish; + } + pCellKey = sqlite3Malloc( nCell+18 ); if( pCellKey==0 ){ rc = SQLITE_NOMEM; goto moveto_finish; @@ -5238,8 +5442,7 @@ int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ ** sqlite3PagerUnref() on the new page when it is done. ** ** SQLITE_OK is returned on success. Any other return value indicates -** an error. *ppPage and *pPgno are undefined in the event of an error. -** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned. +** an error. *ppPage is set to NULL in the event of an error. ** ** If the "nearby" parameter is not 0, then an effort is made to ** locate a page close to the page number "nearby". This can be used in an @@ -5282,6 +5485,7 @@ static int allocateBtreePage( /* There are pages on the freelist. Reuse one of those pages. */ Pgno iTrunk; u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ + u32 nSearch = 0; /* Count of the number of search attempts */ /* If eMode==BTALLOC_EXACT and a query of the pointer-map ** shows that the page 'nearby' is somewhere on the free-list, then @@ -5330,10 +5534,10 @@ static int allocateBtreePage( iTrunk = get4byte(&pPage1->aData[32]); } testcase( iTrunk==mxPage ); - if( iTrunk>mxPage ){ + if( iTrunk>mxPage || nSearch++ > n ){ rc = SQLITE_CORRUPT_BKPT; }else{ - rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); + rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0); } if( rc ){ pTrunk = 0; @@ -5398,7 +5602,7 @@ static int allocateBtreePage( goto end_allocate_page; } testcase( iNewTrunk==mxPage ); - rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0); + rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0); if( rc!=SQLITE_OK ){ goto end_allocate_page; } @@ -5478,11 +5682,12 @@ static int allocateBtreePage( } put4byte(&aData[4], k-1); noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0; - rc = btreeGetPage(pBt, *pPgno, ppPage, noContent); + rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent); if( rc==SQLITE_OK ){ rc = sqlite3PagerWrite((*ppPage)->pDbPage); if( rc!=SQLITE_OK ){ releasePage(*ppPage); + *ppPage = 0; } } searchList = 0; @@ -5526,7 +5731,7 @@ static int allocateBtreePage( MemPage *pPg = 0; TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage)); assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) ); - rc = btreeGetPage(pBt, pBt->nPage, &pPg, bNoContent); + rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent); if( rc==SQLITE_OK ){ rc = sqlite3PagerWrite(pPg->pDbPage); releasePage(pPg); @@ -5540,11 +5745,12 @@ static int allocateBtreePage( *pPgno = pBt->nPage; assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); - rc = btreeGetPage(pBt, *pPgno, ppPage, bNoContent); + rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent); if( rc ) return rc; rc = sqlite3PagerWrite((*ppPage)->pDbPage); if( rc!=SQLITE_OK ){ releasePage(*ppPage); + *ppPage = 0; } TRACE(("ALLOCATE: %d from end of file\n", *pPgno)); } @@ -5554,17 +5760,8 @@ static int allocateBtreePage( end_allocate_page: releasePage(pTrunk); releasePage(pPrevTrunk); - if( rc==SQLITE_OK ){ - if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ - releasePage(*ppPage); - *ppPage = 0; - return SQLITE_CORRUPT_BKPT; - } - (*ppPage)->isInit = 0; - }else{ - *ppPage = 0; - } - assert( rc!=SQLITE_OK || sqlite3PagerIswriteable((*ppPage)->pDbPage) ); + assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 ); + assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 ); return rc; } @@ -5589,9 +5786,10 @@ static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ int nFree; /* Initial number of pages on free-list */ assert( sqlite3_mutex_held(pBt->mutex) ); - assert( iPage>1 ); + assert( CORRUPT_DB || iPage>1 ); assert( !pMemPage || pMemPage->pgno==iPage ); + if( iPage<2 ) return SQLITE_CORRUPT_BKPT; if( pMemPage ){ pPage = pMemPage; sqlite3PagerRef(pPage->pDbPage); @@ -5731,7 +5929,7 @@ static int clearCell( u32 ovflPageSize; assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - btreeParseCellPtr(pPage, pCell, &info); + pPage->xParseCell(pPage, pCell, &info); *pnSize = info.nSize; if( info.iOverflow==0 ){ return SQLITE_OK; /* No overflow pages. Return without doing anything */ @@ -5743,7 +5941,9 @@ static int clearCell( assert( pBt->usableSize > 4 ); ovflPageSize = pBt->usableSize - 4; nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize; - assert( ovflPgno==0 || nOvfl>0 ); + assert( nOvfl>0 || + (CORRUPT_DB && (info.nPayload + ovflPageSize)<ovflPageSize) + ); while( nOvfl-- ){ Pgno iNext = 0; MemPage *pOvfl = 0; @@ -5841,9 +6041,7 @@ static int fillInCell( nSrc = nData; nData = 0; }else{ - if( NEVER(nKey>0x7fffffff || pKey==0) ){ - return SQLITE_CORRUPT_BKPT; - } + assert( nKey<=0x7fffffff && pKey!=0 ); nPayload = (int)nKey; pSrc = pKey; nSrc = (int)nKey; @@ -5883,7 +6081,7 @@ static int fillInCell( #if SQLITE_DEBUG { CellInfo info; - btreeParseCellPtr(pPage, pCell, &info); + pPage->xParseCell(pPage, pCell, &info); assert( nHeader=(int)(info.pPayload - pCell) ); assert( info.nKey==nKey ); assert( *pnSize == info.nSize ); @@ -5998,7 +6196,7 @@ static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ if( *pRC ) return; assert( idx>=0 && idx<pPage->nCell ); - assert( sz==cellSize(pPage, idx) ); + assert( CORRUPT_DB || sz==cellSize(pPage, idx) ); assert( sqlite3PagerIswriteable(pPage->pDbPage) ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); data = pPage->aData; @@ -6053,10 +6251,8 @@ static void insertCell( ){ int idx = 0; /* Where to write new cell content in data[] */ int j; /* Loop counter */ - int end; /* First byte past the last cell pointer in data[] */ - int ins; /* Index in data[] where new cell pointer is inserted */ - int cellOffset; /* Address of first cell pointer in data[] */ u8 *data; /* The content of the whole page */ + u8 *pIns; /* The point in pPage->aCellIdx[] where no cell inserted */ if( *pRC ) return; @@ -6071,7 +6267,7 @@ static void insertCell( ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size ** might be less than 8 (leaf-size + pointer) on the interior node. Hence ** the term after the || in the following assert(). */ - assert( sz==cellSizePtr(pPage, pCell) || (sz==8 && iChild>0) ); + assert( sz==pPage->xCellSize(pPage, pCell) || (sz==8 && iChild>0) ); if( pPage->nOverflow || sz+2>pPage->nFree ){ if( pTemp ){ memcpy(pTemp, pCell, sz); @@ -6084,6 +6280,14 @@ static void insertCell( assert( j<(int)(sizeof(pPage->apOvfl)/sizeof(pPage->apOvfl[0])) ); pPage->apOvfl[j] = pCell; pPage->aiOvfl[j] = (u16)i; + + /* When multiple overflows occur, they are always sequential and in + ** sorted order. This invariants arise because multiple overflows can + ** only occur when inserting divider cells into the parent page during + ** balancing, and the dividers are adjacent and sorted. + */ + assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */ + assert( j==0 || i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */ }else{ int rc = sqlite3PagerWrite(pPage->pDbPage); if( rc!=SQLITE_OK ){ @@ -6092,24 +6296,26 @@ static void insertCell( } assert( sqlite3PagerIswriteable(pPage->pDbPage) ); data = pPage->aData; - cellOffset = pPage->cellOffset; - end = cellOffset + 2*pPage->nCell; - ins = cellOffset + 2*i; + assert( &data[pPage->cellOffset]==pPage->aCellIdx ); rc = allocateSpace(pPage, sz, &idx); if( rc ){ *pRC = rc; return; } - /* The allocateSpace() routine guarantees the following two properties - ** if it returns success */ - assert( idx >= end+2 ); + /* The allocateSpace() routine guarantees the following properties + ** if it returns successfully */ + assert( idx >= 0 ); + assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB ); assert( idx+sz <= (int)pPage->pBt->usableSize ); - pPage->nCell++; pPage->nFree -= (u16)(2 + sz); memcpy(&data[idx], pCell, sz); if( iChild ){ put4byte(&data[idx], iChild); } - memmove(&data[ins+2], &data[ins], end-ins); - put2byte(&data[ins], idx); - put2byte(&data[pPage->hdrOffset+3], pPage->nCell); + pIns = pPage->aCellIdx + i*2; + memmove(pIns+2, pIns, 2*(pPage->nCell - i)); + put2byte(pIns, idx); + pPage->nCell++; + /* increment the cell count */ + if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++; + assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell ); #ifndef SQLITE_OMIT_AUTOVACUUM if( pPage->pBt->autoVacuum ){ /* The cell may contain a pointer to an overflow page. If so, write @@ -6122,6 +6328,52 @@ static void insertCell( } /* +** A CellArray object contains a cache of pointers and sizes for a +** consecutive sequence of cells that might be held multiple pages. +*/ +typedef struct CellArray CellArray; +struct CellArray { + int nCell; /* Number of cells in apCell[] */ + MemPage *pRef; /* Reference page */ + u8 **apCell; /* All cells begin balanced */ + u16 *szCell; /* Local size of all cells in apCell[] */ +}; + +/* +** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been +** computed. +*/ +static void populateCellCache(CellArray *p, int idx, int N){ + assert( idx>=0 && idx+N<=p->nCell ); + while( N>0 ){ + assert( p->apCell[idx]!=0 ); + if( p->szCell[idx]==0 ){ + p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]); + }else{ + assert( CORRUPT_DB || + p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) ); + } + idx++; + N--; + } +} + +/* +** Return the size of the Nth element of the cell array +*/ +static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){ + assert( N>=0 && N<p->nCell ); + assert( p->szCell[N]==0 ); + p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]); + return p->szCell[N]; +} +static u16 cachedCellSize(CellArray *p, int N){ + assert( N>=0 && N<p->nCell ); + if( p->szCell[N] ) return p->szCell[N]; + return computeCellSize(p, N); +} + +/* ** Array apCell[] contains pointers to nCell b-tree page cells. The ** szCell[] array contains the size in bytes of each cell. This function ** replaces the current contents of page pPg with the contents of the cell @@ -6134,7 +6386,7 @@ static void insertCell( ** The MemPage.nFree field is invalidated by this function. It is the ** responsibility of the caller to set it correctly. */ -static void rebuildPage( +static int rebuildPage( MemPage *pPg, /* Edit this page */ int nCell, /* Final number of cells on page */ u8 **apCell, /* Array of cells */ @@ -6159,10 +6411,12 @@ static void rebuildPage( pCell = &pTmp[pCell - aData]; } pData -= szCell[i]; - memcpy(pData, pCell, szCell[i]); put2byte(pCellptr, (pData - aData)); pCellptr += 2; - assert( szCell[i]==cellSizePtr(pPg, pCell) ); + if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT; + memcpy(pData, pCell, szCell[i]); + assert( szCell[i]==pPg->xCellSize(pPg, pCell) || CORRUPT_DB ); + testcase( szCell[i]!=pPg->xCellSize(pPg,pCell) ); } /* The pPg->nFree field is now set incorrectly. The caller will fix it. */ @@ -6173,6 +6427,7 @@ static void rebuildPage( put2byte(&aData[hdr+3], pPg->nCell); put2byte(&aData[hdr+5], pData - aData); aData[hdr+7] = 0x00; + return SQLITE_OK; } /* @@ -6205,25 +6460,25 @@ static int pageInsertArray( u8 *pBegin, /* End of cell-pointer array */ u8 **ppData, /* IN/OUT: Page content -area pointer */ u8 *pCellptr, /* Pointer to cell-pointer area */ + int iFirst, /* Index of first cell to add */ int nCell, /* Number of cells to add to pPg */ - u8 **apCell, /* Array of cells */ - u16 *szCell /* Array of cell sizes */ + CellArray *pCArray /* Array of cells */ ){ int i; u8 *aData = pPg->aData; u8 *pData = *ppData; - const int bFreelist = aData[1] || aData[2]; + int iEnd = iFirst + nCell; assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */ - for(i=0; i<nCell; i++){ - int sz = szCell[i]; - int rc; + for(i=iFirst; i<iEnd; i++){ + int sz, rc; u8 *pSlot; - if( bFreelist==0 || (pSlot = pageFindSlot(pPg, sz, &rc, 0))==0 ){ + sz = cachedCellSize(pCArray, i); + if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){ pData -= sz; if( pData<pBegin ) return 1; pSlot = pData; } - memcpy(pSlot, apCell[i], sz); + memcpy(pSlot, pCArray->apCell[i], sz); put2byte(pCellptr, (pSlot - aData)); pCellptr += 2; } @@ -6242,22 +6497,27 @@ static int pageInsertArray( */ static int pageFreeArray( MemPage *pPg, /* Page to edit */ + int iFirst, /* First cell to delete */ int nCell, /* Cells to delete */ - u8 **apCell, /* Array of cells */ - u16 *szCell /* Array of cell sizes */ + CellArray *pCArray /* Array of cells */ ){ u8 * const aData = pPg->aData; u8 * const pEnd = &aData[pPg->pBt->usableSize]; u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize]; int nRet = 0; int i; + int iEnd = iFirst + nCell; u8 *pFree = 0; int szFree = 0; - for(i=0; i<nCell; i++){ - u8 *pCell = apCell[i]; + for(i=iFirst; i<iEnd; i++){ + u8 *pCell = pCArray->apCell[i]; if( pCell>=pStart && pCell<pEnd ){ - int sz = szCell[i]; + int sz; + /* No need to use cachedCellSize() here. The sizes of all cells that + ** are to be freed have already been computing while deciding which + ** cells need freeing */ + sz = pCArray->szCell[i]; assert( sz>0 ); if( pFree!=(pCell + sz) ){ if( pFree ){ assert( pFree>aData && (pFree - aData)<65536 ); @@ -6292,13 +6552,12 @@ static int pageFreeArray( ** The pPg->nFree field is invalid when this function returns. It is the ** responsibility of the caller to set it correctly. */ -static void editPage( +static int editPage( MemPage *pPg, /* Edit this page */ int iOld, /* Index of first cell currently on page */ int iNew, /* Index of new first cell on page */ int nNew, /* Final number of cells on page */ - u8 **apCell, /* Array of cells */ - u16 *szCell /* Array of cell sizes */ + CellArray *pCArray /* Array of cells and sizes */ ){ u8 * const aData = pPg->aData; const int hdr = pPg->hdrOffset; @@ -6317,16 +6576,12 @@ static void editPage( /* Remove cells from the start and end of the page */ if( iOld<iNew ){ - int nShift = pageFreeArray( - pPg, iNew-iOld, &apCell[iOld], &szCell[iOld] - ); + int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray); memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2); nCell -= nShift; } if( iNewEnd < iOldEnd ){ - nCell -= pageFreeArray( - pPg, iOldEnd-iNewEnd, &apCell[iNewEnd], &szCell[iNewEnd] - ); + nCell -= pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray); } pData = &aData[get2byteNotZero(&aData[hdr+5])]; @@ -6340,7 +6595,7 @@ static void editPage( memmove(&pCellptr[nAdd*2], pCellptr, nCell*2); if( pageInsertArray( pPg, pBegin, &pData, pCellptr, - nAdd, &apCell[iNew], &szCell[iNew] + iNew, nAdd, pCArray ) ) goto editpage_fail; nCell += nAdd; } @@ -6354,7 +6609,7 @@ static void editPage( nCell++; if( pageInsertArray( pPg, pBegin, &pData, pCellptr, - 1, &apCell[iCell + iNew], &szCell[iCell + iNew] + iCell+iNew, 1, pCArray ) ) goto editpage_fail; } } @@ -6363,7 +6618,7 @@ static void editPage( pCellptr = &pPg->aCellIdx[nCell*2]; if( pageInsertArray( pPg, pBegin, &pData, pCellptr, - nNew-nCell, &apCell[iNew+nCell], &szCell[iNew+nCell] + iNew+nCell, nNew-nCell, pCArray ) ) goto editpage_fail; pPg->nCell = nNew; @@ -6374,19 +6629,21 @@ static void editPage( #ifdef SQLITE_DEBUG for(i=0; i<nNew && !CORRUPT_DB; i++){ - u8 *pCell = apCell[i+iNew]; - int iOff = get2byte(&pPg->aCellIdx[i*2]); + u8 *pCell = pCArray->apCell[i+iNew]; + int iOff = get2byteAligned(&pPg->aCellIdx[i*2]); if( pCell>=aData && pCell<&aData[pPg->pBt->usableSize] ){ pCell = &pTmp[pCell - aData]; } - assert( 0==memcmp(pCell, &aData[iOff], szCell[i+iNew]) ); + assert( 0==memcmp(pCell, &aData[iOff], + pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) ); } #endif - return; + return SQLITE_OK; editpage_fail: /* Unable to edit this page. Rebuild it from scratch instead. */ - rebuildPage(pPg, nNew, &apCell[iNew], &szCell[iNew]); + populateCellCache(pCArray, iNew, nNew); + return rebuildPage(pPg, nNew, &pCArray->apCell[iNew], &pCArray->szCell[iNew]); } /* @@ -6452,13 +6709,14 @@ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ u8 *pOut = &pSpace[4]; u8 *pCell = pPage->apOvfl[0]; - u16 szCell = cellSizePtr(pPage, pCell); + u16 szCell = pPage->xCellSize(pPage, pCell); u8 *pStop; assert( sqlite3PagerIswriteable(pNew->pDbPage) ); assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); - rebuildPage(pNew, 1, &pCell, &szCell); + rc = rebuildPage(pNew, 1, &pCell, &szCell); + if( NEVER(rc) ) return rc; pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell; /* If this is an auto-vacuum database, update the pointer map @@ -6531,7 +6789,7 @@ static int ptrmapCheckPages(MemPage **apPage, int nPage){ u8 *z; z = findCell(pPage, j); - btreeParseCellPtr(pPage, z, &info); + pPage->xParseCell(pPage, z, &info); if( info.iOverflow ){ Pgno ovfl = get4byte(&z[info.iOverflow]); ptrmapGet(pBt, ovfl, &e, &n); @@ -6662,7 +6920,6 @@ static int balance_nonroot( int bBulk /* True if this call is part of a bulk load */ ){ BtShared *pBt; /* The whole database */ - int nCell = 0; /* Number of cells in apCell[] */ int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ int nNew = 0; /* Number of pages in apNew[] */ int nOld; /* Number of pages in apOld[] */ @@ -6673,7 +6930,6 @@ static int balance_nonroot( int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ int usableSpace; /* Bytes in pPage beyond the header */ int pageFlags; /* Value of pPage->aData[0] */ - int subtotal; /* Subtotal of bytes in cells on one page */ int iSpace1 = 0; /* First unused byte of aSpace1[] */ int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ int szScratch; /* Size of scratch memory requested */ @@ -6681,19 +6937,20 @@ static int balance_nonroot( MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ u8 *pRight; /* Location in parent of right-sibling pointer */ u8 *apDiv[NB-1]; /* Divider cells in pParent */ - int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ - int cntOld[NB+2]; /* Old index in aCell[] after i-th page */ + int cntNew[NB+2]; /* Index in b.paCell[] of cell after i-th page */ + int cntOld[NB+2]; /* Old index in b.apCell[] */ int szNew[NB+2]; /* Combined size of cells placed on i-th page */ - u8 **apCell = 0; /* All cells begin balanced */ - u16 *szCell; /* Local size of all cells in apCell[] */ u8 *aSpace1; /* Space for copies of dividers cells */ Pgno pgno; /* Temp var to store a page number in */ u8 abDone[NB+2]; /* True after i'th new page is populated */ Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */ Pgno aPgOrder[NB+2]; /* Copy of aPgno[] used for sorting pages */ u16 aPgFlags[NB+2]; /* flags field of new pages before shuffling */ + CellArray b; /* Parsed information on cells being balanced */ memset(abDone, 0, sizeof(abDone)); + b.nCell = 0; + b.apCell = 0; pBt = pParent->pBt; assert( sqlite3_mutex_held(pBt->mutex) ); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); @@ -6735,7 +6992,6 @@ static int balance_nonroot( }else if( iParentIdx==i ){ nxDiv = i-2+bBulk; }else{ - assert( bBulk==0 ); nxDiv = iParentIdx-1; } i = 2-bBulk; @@ -6748,7 +7004,7 @@ static int balance_nonroot( } pgno = get4byte(pRight); while( 1 ){ - rc = getAndInitPage(pBt, pgno, &apOld[i], 0); + rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0); if( rc ){ memset(apOld, 0, (i+1)*sizeof(MemPage*)); goto balance_cleanup; @@ -6759,12 +7015,12 @@ static int balance_nonroot( if( i+nxDiv==pParent->aiOvfl[0] && pParent->nOverflow ){ apDiv[i] = pParent->apOvfl[0]; pgno = get4byte(apDiv[i]); - szNew[i] = cellSizePtr(pParent, apDiv[i]); + szNew[i] = pParent->xCellSize(pParent, apDiv[i]); pParent->nOverflow = 0; }else{ apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow); pgno = get4byte(apDiv[i]); - szNew[i] = cellSizePtr(pParent, apDiv[i]); + szNew[i] = pParent->xCellSize(pParent, apDiv[i]); /* Drop the cell from the parent page. apDiv[i] still points to ** the cell within the parent, even though it has been dropped. @@ -6803,130 +7059,201 @@ static int balance_nonroot( ** Allocate space for memory structures */ szScratch = - nMaxCells*sizeof(u8*) /* apCell */ - + nMaxCells*sizeof(u16) /* szCell */ + nMaxCells*sizeof(u8*) /* b.apCell */ + + nMaxCells*sizeof(u16) /* b.szCell */ + pBt->pageSize; /* aSpace1 */ /* EVIDENCE-OF: R-28375-38319 SQLite will never request a scratch buffer ** that is more than 6 times the database page size. */ assert( szScratch<=6*(int)pBt->pageSize ); - apCell = sqlite3ScratchMalloc( szScratch ); - if( apCell==0 ){ + b.apCell = sqlite3ScratchMalloc( szScratch ); + if( b.apCell==0 ){ rc = SQLITE_NOMEM; goto balance_cleanup; } - szCell = (u16*)&apCell[nMaxCells]; - aSpace1 = (u8*)&szCell[nMaxCells]; + b.szCell = (u16*)&b.apCell[nMaxCells]; + aSpace1 = (u8*)&b.szCell[nMaxCells]; assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); /* ** Load pointers to all cells on sibling pages and the divider cells - ** into the local apCell[] array. Make copies of the divider cells + ** into the local b.apCell[] array. Make copies of the divider cells ** into space obtained from aSpace1[]. The divider cells have already ** been removed from pParent. ** ** If the siblings are on leaf pages, then the child pointers of the ** divider cells are stripped from the cells before they are copied - ** into aSpace1[]. In this way, all cells in apCell[] are without + ** into aSpace1[]. In this way, all cells in b.apCell[] are without ** child pointers. If siblings are not leaves, then all cell in - ** apCell[] include child pointers. Either way, all cells in apCell[] + ** b.apCell[] include child pointers. Either way, all cells in b.apCell[] ** are alike. ** ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. ** leafData: 1 if pPage holds key+data and pParent holds only keys. */ - leafCorrection = apOld[0]->leaf*4; - leafData = apOld[0]->intKeyLeaf; + b.pRef = apOld[0]; + leafCorrection = b.pRef->leaf*4; + leafData = b.pRef->intKeyLeaf; for(i=0; i<nOld; i++){ - int limit; MemPage *pOld = apOld[i]; + int limit = pOld->nCell; + u8 *aData = pOld->aData; + u16 maskPage = pOld->maskPage; + u8 *piCell = aData + pOld->cellOffset; + u8 *piEnd; + + /* Verify that all sibling pages are of the same "type" (table-leaf, + ** table-interior, index-leaf, or index-interior). + */ + if( pOld->aData[0]!=apOld[0]->aData[0] ){ + rc = SQLITE_CORRUPT_BKPT; + goto balance_cleanup; + } - limit = pOld->nCell+pOld->nOverflow; + /* Load b.apCell[] with pointers to all cells in pOld. If pOld + ** constains overflow cells, include them in the b.apCell[] array + ** in the correct spot. + ** + ** Note that when there are multiple overflow cells, it is always the + ** case that they are sequential and adjacent. This invariant arises + ** because multiple overflows can only occurs when inserting divider + ** cells into a parent on a prior balance, and divider cells are always + ** adjacent and are inserted in order. There is an assert() tagged + ** with "NOTE 1" in the overflow cell insertion loop to prove this + ** invariant. + ** + ** This must be done in advance. Once the balance starts, the cell + ** offset section of the btree page will be overwritten and we will no + ** long be able to find the cells if a pointer to each cell is not saved + ** first. + */ + memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*limit); if( pOld->nOverflow>0 ){ + memset(&b.szCell[b.nCell+limit], 0, sizeof(b.szCell[0])*pOld->nOverflow); + limit = pOld->aiOvfl[0]; for(j=0; j<limit; j++){ - assert( nCell<nMaxCells ); - apCell[nCell] = findOverflowCell(pOld, j); - szCell[nCell] = cellSizePtr(pOld, apCell[nCell]); - nCell++; + b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell)); + piCell += 2; + b.nCell++; } - }else{ - u8 *aData = pOld->aData; - u16 maskPage = pOld->maskPage; - u16 cellOffset = pOld->cellOffset; - for(j=0; j<limit; j++){ - assert( nCell<nMaxCells ); - apCell[nCell] = findCellv2(aData, maskPage, cellOffset, j); - szCell[nCell] = cellSizePtr(pOld, apCell[nCell]); - nCell++; + for(k=0; k<pOld->nOverflow; k++){ + assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */ + b.apCell[b.nCell] = pOld->apOvfl[k]; + b.nCell++; } - } - cntOld[i] = nCell; + } + piEnd = aData + pOld->cellOffset + 2*pOld->nCell; + while( piCell<piEnd ){ + assert( b.nCell<nMaxCells ); + b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell)); + piCell += 2; + b.nCell++; + } + + cntOld[i] = b.nCell; if( i<nOld-1 && !leafData){ u16 sz = (u16)szNew[i]; u8 *pTemp; - assert( nCell<nMaxCells ); - szCell[nCell] = sz; + assert( b.nCell<nMaxCells ); + b.szCell[b.nCell] = sz; pTemp = &aSpace1[iSpace1]; iSpace1 += sz; assert( sz<=pBt->maxLocal+23 ); assert( iSpace1 <= (int)pBt->pageSize ); memcpy(pTemp, apDiv[i], sz); - apCell[nCell] = pTemp+leafCorrection; + b.apCell[b.nCell] = pTemp+leafCorrection; assert( leafCorrection==0 || leafCorrection==4 ); - szCell[nCell] = szCell[nCell] - leafCorrection; + b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection; if( !pOld->leaf ){ assert( leafCorrection==0 ); assert( pOld->hdrOffset==0 ); /* The right pointer of the child page pOld becomes the left ** pointer of the divider cell */ - memcpy(apCell[nCell], &pOld->aData[8], 4); + memcpy(b.apCell[b.nCell], &pOld->aData[8], 4); }else{ assert( leafCorrection==4 ); - if( szCell[nCell]<4 ){ + while( b.szCell[b.nCell]<4 ){ /* Do not allow any cells smaller than 4 bytes. If a smaller cell ** does exist, pad it with 0x00 bytes. */ - assert( szCell[nCell]==3 ); - assert( apCell[nCell]==&aSpace1[iSpace1-3] ); + assert( b.szCell[b.nCell]==3 || CORRUPT_DB ); + assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB ); aSpace1[iSpace1++] = 0x00; - szCell[nCell] = 4; + b.szCell[b.nCell]++; } } - nCell++; + b.nCell++; } } /* - ** Figure out the number of pages needed to hold all nCell cells. + ** Figure out the number of pages needed to hold all b.nCell cells. ** Store this number in "k". Also compute szNew[] which is the total ** size of all cells on the i-th page and cntNew[] which is the index - ** in apCell[] of the cell that divides page i from page i+1. - ** cntNew[k] should equal nCell. + ** in b.apCell[] of the cell that divides page i from page i+1. + ** cntNew[k] should equal b.nCell. ** ** Values computed by this block: ** ** k: The total number of sibling pages ** szNew[i]: Spaced used on the i-th sibling page. - ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to + ** cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to ** the right of the i-th sibling page. ** usableSpace: Number of bytes of space available on each sibling. ** */ usableSpace = pBt->usableSize - 12 + leafCorrection; - for(subtotal=k=i=0; i<nCell; i++){ - assert( i<nMaxCells ); - subtotal += szCell[i] + 2; - if( subtotal > usableSpace ){ - szNew[k] = subtotal - szCell[i] - 2; - cntNew[k] = i; - if( leafData ){ i--; } - subtotal = 0; - k++; - if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } - } - } - szNew[k] = subtotal; - cntNew[k] = nCell; - k++; + for(i=0; i<nOld; i++){ + MemPage *p = apOld[i]; + szNew[i] = usableSpace - p->nFree; + if( szNew[i]<0 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } + for(j=0; j<p->nOverflow; j++){ + szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]); + } + cntNew[i] = cntOld[i]; + } + k = nOld; + for(i=0; i<k; i++){ + int sz; + while( szNew[i]>usableSpace ){ + if( i+1>=k ){ + k = i+2; + if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } + szNew[k-1] = 0; + cntNew[k-1] = b.nCell; + } + sz = 2 + cachedCellSize(&b, cntNew[i]-1); + szNew[i] -= sz; + if( !leafData ){ + if( cntNew[i]<b.nCell ){ + sz = 2 + cachedCellSize(&b, cntNew[i]); + }else{ + sz = 0; + } + } + szNew[i+1] += sz; + cntNew[i]--; + } + while( cntNew[i]<b.nCell ){ + sz = 2 + cachedCellSize(&b, cntNew[i]); + if( szNew[i]+sz>usableSpace ) break; + szNew[i] += sz; + cntNew[i]++; + if( !leafData ){ + if( cntNew[i]<b.nCell ){ + sz = 2 + cachedCellSize(&b, cntNew[i]); + }else{ + sz = 0; + } + } + szNew[i+1] -= sz; + } + if( cntNew[i]>=b.nCell ){ + k = i+1; + }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){ + rc = SQLITE_CORRUPT_BKPT; + goto balance_cleanup; + } + } /* ** The packing computed by the previous block is biased toward the siblings @@ -6947,19 +7274,27 @@ static int balance_nonroot( r = cntNew[i-1] - 1; d = r + 1 - leafData; - assert( d<nMaxCells ); - assert( r<nMaxCells ); - while( szRight==0 - || (!bBulk && szRight+szCell[d]+2<=szLeft-(szCell[r]+2)) - ){ - szRight += szCell[d] + 2; - szLeft -= szCell[r] + 2; - cntNew[i-1]--; - r = cntNew[i-1] - 1; - d = r + 1 - leafData; - } + (void)cachedCellSize(&b, d); + do{ + assert( d<nMaxCells ); + assert( r<nMaxCells ); + (void)cachedCellSize(&b, r); + if( szRight!=0 + && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+2)) ){ + break; + } + szRight += b.szCell[d] + 2; + szLeft -= b.szCell[r] + 2; + cntNew[i-1] = r; + r--; + d--; + }while( r>=0 ); szNew[i] = szRight; szNew[i-1] = szLeft; + if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){ + rc = SQLITE_CORRUPT_BKPT; + goto balance_cleanup; + } } /* Sanity check: For a non-corrupt database file one of the follwing @@ -6979,10 +7314,6 @@ static int balance_nonroot( /* ** Allocate k new pages. Reuse old pages where possible. */ - if( apOld[0]->pgno<=1 ){ - rc = SQLITE_CORRUPT_BKPT; - goto balance_cleanup; - } pageFlags = apOld[0]->aData[0]; for(i=0; i<k; i++){ MemPage *pNew; @@ -6999,7 +7330,7 @@ static int balance_nonroot( zeroPage(pNew, pageFlags); apNew[i] = pNew; nNew++; - cntOld[i] = nCell; + cntOld[i] = b.nCell; /* Set the pointer-map entry for the new sibling page. */ if( ISAUTOVACUUM ){ @@ -7104,8 +7435,8 @@ static int balance_nonroot( int iNew = 0; int iOld = 0; - for(i=0; i<nCell; i++){ - u8 *pCell = apCell[i]; + for(i=0; i<b.nCell; i++){ + u8 *pCell = b.apCell[i]; if( i==cntOldNext ){ MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld]; cntOldNext += pOld->nCell + pOld->nOverflow + !leafData; @@ -7130,9 +7461,10 @@ static int balance_nonroot( if( !leafCorrection ){ ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc); } - if( szCell[i]>pNew->minLocal ){ + if( cachedCellSize(&b,i)>pNew->minLocal ){ ptrmapPutOvflPtr(pNew, pCell, &rc); } + if( rc ) goto balance_cleanup; } } } @@ -7146,20 +7478,21 @@ static int balance_nonroot( j = cntNew[i]; assert( j<nMaxCells ); - pCell = apCell[j]; - sz = szCell[j] + leafCorrection; + assert( b.apCell[j]!=0 ); + pCell = b.apCell[j]; + sz = b.szCell[j] + leafCorrection; pTemp = &aOvflSpace[iOvflSpace]; if( !pNew->leaf ){ memcpy(&pNew->aData[8], pCell, 4); }else if( leafData ){ /* If the tree is a leaf-data tree, and the siblings are leaves, - ** then there is no divider cell in apCell[]. Instead, the divider + ** then there is no divider cell in b.apCell[]. Instead, the divider ** cell consists of the integer key for the right-most cell of ** the sibling-page assembled above only. */ CellInfo info; j--; - btreeParseCellPtr(pNew, apCell[j], &info); + pNew->xParseCell(pNew, b.apCell[j], &info); pCell = pTemp; sz = 4 + putVarint(&pCell[4], info.nKey); pTemp = 0; @@ -7176,9 +7509,9 @@ static int balance_nonroot( ** cells are at least 4 bytes. It only happens in b-trees used ** to evaluate "IN (SELECT ...)" and similar clauses. */ - if( szCell[j]==4 ){ + if( b.szCell[j]==4 ){ assert(leafCorrection==4); - sz = cellSizePtr(pParent, pCell); + sz = pParent->xCellSize(pParent, pCell); } } iOvflSpace += sz; @@ -7234,12 +7567,13 @@ static int balance_nonroot( iNew = iOld = 0; nNewCell = cntNew[0]; }else{ - iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : nCell; + iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell; iNew = cntNew[iPg-1] + !leafData; nNewCell = cntNew[iPg] - iNew; } - editPage(apNew[iPg], iOld, iNew, nNewCell, apCell, szCell); + rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b); + if( rc ) goto balance_cleanup; abDone[iPg]++; apNew[iPg]->nFree = usableSpace-szNew[iPg]; assert( apNew[iPg]->nOverflow==0 ); @@ -7290,7 +7624,7 @@ static int balance_nonroot( assert( pParent->isInit ); TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", - nOld, nNew, nCell)); + nOld, nNew, b.nCell)); /* Free any old pages that were not reused as new pages. */ @@ -7313,7 +7647,7 @@ static int balance_nonroot( ** Cleanup before returning. */ balance_cleanup: - sqlite3ScratchFree(apCell); + sqlite3ScratchFree(b.apCell); for(i=0; i<nOld; i++){ releasePage(apOld[i]); } @@ -7588,24 +7922,28 @@ int sqlite3BtreeInsert( ** doing any work. To avoid thwarting these optimizations, it is important ** not to clear the cursor here. */ - rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); - if( rc ) return rc; + if( pCur->curFlags & BTCF_Multiple ){ + rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); + if( rc ) return rc; + } if( pCur->pKeyInfo==0 ){ + assert( pKey==0 ); /* If this is an insert into a table b-tree, invalidate any incrblob ** cursors open on the row being replaced */ invalidateIncrblobCursors(p, nKey, 0); /* If the cursor is currently on the last row and we are appending a - ** new row onto the end, set the "loc" to avoid an unnecessary btreeMoveto() - ** call */ + ** new row onto the end, set the "loc" to avoid an unnecessary + ** btreeMoveto() call */ if( (pCur->curFlags&BTCF_ValidNKey)!=0 && nKey>0 && pCur->info.nKey==nKey-1 ){ - loc = -1; + loc = -1; + }else if( loc==0 ){ + rc = sqlite3BtreeMovetoUnpacked(pCur, 0, nKey, appendBias, &loc); + if( rc ) return rc; } - } - - if( !loc ){ + }else if( loc==0 ){ rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc); if( rc ) return rc; } @@ -7623,7 +7961,7 @@ int sqlite3BtreeInsert( assert( newCell!=0 ); rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew); if( rc ) goto end_insert; - assert( szNew==cellSizePtr(pPage, newCell) ); + assert( szNew==pPage->xCellSize(pPage, newCell) ); assert( szNew <= MX_CELL_SIZE(pBt) ); idx = pCur->aiIdx[pCur->iPage]; if( loc==0 ){ @@ -7707,12 +8045,8 @@ int sqlite3BtreeDelete(BtCursor *pCur){ assert( pCur->curFlags & BTCF_WriteFlag ); assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); assert( !hasReadConflicts(p, pCur->pgnoRoot) ); - - if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell) - || NEVER(pCur->eState!=CURSOR_VALID) - ){ - return SQLITE_ERROR; /* Something has gone awry. */ - } + assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); + assert( pCur->eState==CURSOR_VALID ); iCellDepth = pCur->iPage; iCellIdx = pCur->aiIdx[iCellDepth]; @@ -7737,8 +8071,10 @@ int sqlite3BtreeDelete(BtCursor *pCur){ ** deleted writable. Then free any overflow pages associated with the ** entry and finally remove the cell itself from within the page. */ - rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); - if( rc ) return rc; + if( pCur->curFlags & BTCF_Multiple ){ + rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); + if( rc ) return rc; + } /* If this is a delete operation to remove a row from a table b-tree, ** invalidate any incrblob cursors open on the row being deleted. */ @@ -7764,7 +8100,8 @@ int sqlite3BtreeDelete(BtCursor *pCur){ unsigned char *pTmp; pCell = findCell(pLeaf, pLeaf->nCell-1); - nCell = cellSizePtr(pLeaf, pCell); + if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT; + nCell = pLeaf->xCellSize(pLeaf, pCell); assert( MX_CELL_SIZE(pBt) >= nCell ); pTmp = pBt->pTmpSpace; assert( pTmp!=0 ); @@ -7856,7 +8193,8 @@ static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){ pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ pgnoRoot++; } - assert( pgnoRoot>=3 ); + assert( pgnoRoot>=3 || CORRUPT_DB ); + testcase( pgnoRoot<3 ); /* Allocate a page. The page that currently resides at pgnoRoot will ** be moved to the allocated page (unless the allocated page happens @@ -7985,7 +8323,7 @@ static int clearDatabasePage( if( pgno>btreePagecount(pBt) ){ return SQLITE_CORRUPT_BKPT; } - rc = getAndInitPage(pBt, pgno, &pPage, 0); + rc = getAndInitPage(pBt, pgno, &pPage, 0, 0); if( rc ) return rc; if( pPage->bBusy ){ rc = SQLITE_CORRUPT_BKPT; @@ -8006,7 +8344,8 @@ static int clearDatabasePage( rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange); if( rc ) goto cleardatabasepage_out; }else if( pnChange ){ - assert( pPage->intKey ); + assert( pPage->intKey || CORRUPT_DB ); + testcase( !pPage->intKey ); *pnChange += pPage->nCell; } if( freePageFlag ){ @@ -8524,6 +8863,57 @@ static void checkList( } #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ +/* +** An implementation of a min-heap. +** +** aHeap[0] is the number of elements on the heap. aHeap[1] is the +** root element. The daughter nodes of aHeap[N] are aHeap[N*2] +** and aHeap[N*2+1]. +** +** The heap property is this: Every node is less than or equal to both +** of its daughter nodes. A consequence of the heap property is that the +** root node aHeap[1] is always the minimum value currently in the heap. +** +** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto +** the heap, preserving the heap property. The btreeHeapPull() routine +** removes the root element from the heap (the minimum value in the heap) +** and then moves other nodes around as necessary to preserve the heap +** property. +** +** This heap is used for cell overlap and coverage testing. Each u32 +** entry represents the span of a cell or freeblock on a btree page. +** The upper 16 bits are the index of the first byte of a range and the +** lower 16 bits are the index of the last byte of that range. +*/ +static void btreeHeapInsert(u32 *aHeap, u32 x){ + u32 j, i = ++aHeap[0]; + aHeap[i] = x; + while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){ + x = aHeap[j]; + aHeap[j] = aHeap[i]; + aHeap[i] = x; + i = j; + } +} +static int btreeHeapPull(u32 *aHeap, u32 *pOut){ + u32 j, i, x; + if( (x = aHeap[0])==0 ) return 0; + *pOut = aHeap[1]; + aHeap[1] = aHeap[x]; + aHeap[x] = 0xffffffff; + aHeap[0]--; + i = 1; + while( (j = i*2)<=aHeap[0] ){ + if( aHeap[j]>aHeap[j+1] ) j++; + if( aHeap[i]<aHeap[j] ) break; + x = aHeap[i]; + aHeap[i] = aHeap[j]; + aHeap[j] = x; + i = j; + } + return 1; +} + #ifndef SQLITE_OMIT_INTEGRITY_CHECK /* ** Do various sanity checks on a single page of a tree. Return @@ -8534,34 +8924,42 @@ static void checkList( ** ** 1. Make sure that cells and freeblocks do not overlap ** but combine to completely cover the page. -** NO 2. Make sure cell keys are in order. -** NO 3. Make sure no key is less than or equal to zLowerBound. -** NO 4. Make sure no key is greater than or equal to zUpperBound. -** 5. Check the integrity of overflow pages. -** 6. Recursively call checkTreePage on all children. -** 7. Verify that the depth of all children is the same. -** 8. Make sure this page is at least 33% full or else it is -** the root of the tree. +** 2. Make sure integer cell keys are in order. +** 3. Check the integrity of overflow pages. +** 4. Recursively call checkTreePage on all children. +** 5. Verify that the depth of all children is the same. */ static int checkTreePage( IntegrityCk *pCheck, /* Context for the sanity check */ int iPage, /* Page number of the page to check */ - i64 *pnParentMinKey, - i64 *pnParentMaxKey + i64 *piMinKey, /* Write minimum integer primary key here */ + i64 maxKey /* Error if integer primary key greater than this */ ){ - MemPage *pPage; - int i, rc, depth, d2, pgno, cnt; - int hdr, cellStart; - int nCell; - u8 *data; - BtShared *pBt; - int usableSize; - char *hit = 0; - i64 nMinKey = 0; - i64 nMaxKey = 0; + MemPage *pPage = 0; /* The page being analyzed */ + int i; /* Loop counter */ + int rc; /* Result code from subroutine call */ + int depth = -1, d2; /* Depth of a subtree */ + int pgno; /* Page number */ + int nFrag; /* Number of fragmented bytes on the page */ + int hdr; /* Offset to the page header */ + int cellStart; /* Offset to the start of the cell pointer array */ + int nCell; /* Number of cells */ + int doCoverageCheck = 1; /* True if cell coverage checking should be done */ + int keyCanBeEqual = 1; /* True if IPK can be equal to maxKey + ** False if IPK must be strictly less than maxKey */ + u8 *data; /* Page content */ + u8 *pCell; /* Cell content */ + u8 *pCellIdx; /* Next element of the cell pointer array */ + BtShared *pBt; /* The BtShared object that owns pPage */ + u32 pc; /* Address of a cell */ + u32 usableSize; /* Usable size of the page */ + u32 contentOffset; /* Offset to the start of the cell content area */ + u32 *heap = 0; /* Min-heap used for checking cell coverage */ + u32 x, prev = 0; /* Next and previous entry on the min-heap */ const char *saved_zPfx = pCheck->zPfx; int saved_v1 = pCheck->v1; int saved_v2 = pCheck->v2; + u8 savedIsInit = 0; /* Check that the page exists */ @@ -8574,54 +8972,95 @@ static int checkTreePage( if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ checkAppendMsg(pCheck, "unable to get the page. error code=%d", rc); - depth = -1; goto end_of_check; } /* Clear MemPage.isInit to make sure the corruption detection code in ** btreeInitPage() is executed. */ + savedIsInit = pPage->isInit; pPage->isInit = 0; if( (rc = btreeInitPage(pPage))!=0 ){ assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */ checkAppendMsg(pCheck, "btreeInitPage() returns error code %d", rc); - releasePage(pPage); - depth = -1; goto end_of_check; } + data = pPage->aData; + hdr = pPage->hdrOffset; - /* Check out all the cells. - */ - depth = 0; - for(i=0; i<pPage->nCell && pCheck->mxErr; i++){ - u8 *pCell; - u32 sz; + /* Set up for cell analysis */ + pCheck->zPfx = "On tree page %d cell %d: "; + contentOffset = get2byteNotZero(&data[hdr+5]); + assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */ + + /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the + ** number of cells on the page. */ + nCell = get2byte(&data[hdr+3]); + assert( pPage->nCell==nCell ); + + /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page + ** immediately follows the b-tree page header. */ + cellStart = hdr + 12 - 4*pPage->leaf; + assert( pPage->aCellIdx==&data[cellStart] ); + pCellIdx = &data[cellStart + 2*(nCell-1)]; + + if( !pPage->leaf ){ + /* Analyze the right-child page of internal pages */ + pgno = get4byte(&data[hdr+8]); +#ifndef SQLITE_OMIT_AUTOVACUUM + if( pBt->autoVacuum ){ + pCheck->zPfx = "On page %d at right child: "; + checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); + } +#endif + depth = checkTreePage(pCheck, pgno, &maxKey, maxKey); + keyCanBeEqual = 0; + }else{ + /* For leaf pages, the coverage check will occur in the same loop + ** as the other cell checks, so initialize the heap. */ + heap = pCheck->heap; + heap[0] = 0; + } + + /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte + ** integer offsets to the cell contents. */ + for(i=nCell-1; i>=0 && pCheck->mxErr; i--){ CellInfo info; - /* Check payload overflow pages - */ - pCheck->zPfx = "On tree page %d cell %d: "; - pCheck->v1 = iPage; + /* Check cell size */ pCheck->v2 = i; - pCell = findCell(pPage,i); - btreeParseCellPtr(pPage, pCell, &info); - sz = info.nPayload; - /* For intKey pages, check that the keys are in order. - */ + assert( pCellIdx==&data[cellStart + i*2] ); + pc = get2byteAligned(pCellIdx); + pCellIdx -= 2; + if( pc<contentOffset || pc>usableSize-4 ){ + checkAppendMsg(pCheck, "Offset %d out of range %d..%d", + pc, contentOffset, usableSize-4); + doCoverageCheck = 0; + continue; + } + pCell = &data[pc]; + pPage->xParseCell(pPage, pCell, &info); + if( pc+info.nSize>usableSize ){ + checkAppendMsg(pCheck, "Extends off end of page"); + doCoverageCheck = 0; + continue; + } + + /* Check for integer primary key out of range */ if( pPage->intKey ){ - if( i==0 ){ - nMinKey = nMaxKey = info.nKey; - }else if( info.nKey <= nMaxKey ){ - checkAppendMsg(pCheck, - "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey); + if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){ + checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey); } - nMaxKey = info.nKey; + maxKey = info.nKey; } - if( (sz>info.nLocal) - && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize]) - ){ - int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4); - Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); + + /* Check the content overflow list */ + if( info.nPayload>info.nLocal ){ + int nPage; /* Number of pages on the overflow chain */ + Pgno pgnoOvfl; /* First page of the overflow chain */ + assert( pc + info.iOverflow <= usableSize ); + nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4); + pgnoOvfl = get4byte(&pCell[info.iOverflow]); #ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum ){ checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage); @@ -8630,119 +9069,57 @@ static int checkTreePage( checkList(pCheck, 0, pgnoOvfl, nPage); } - /* Check sanity of left child page. - */ if( !pPage->leaf ){ + /* Check sanity of left child page for internal pages */ pgno = get4byte(pCell); #ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum ){ checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); } #endif - d2 = checkTreePage(pCheck, pgno, &nMinKey, i==0?NULL:&nMaxKey); - if( i>0 && d2!=depth ){ + d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey); + keyCanBeEqual = 0; + if( d2!=depth ){ checkAppendMsg(pCheck, "Child page depth differs"); + depth = d2; } - depth = d2; - } - } - - if( !pPage->leaf ){ - pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); - pCheck->zPfx = "On page %d at right child: "; - pCheck->v1 = iPage; -#ifndef SQLITE_OMIT_AUTOVACUUM - if( pBt->autoVacuum ){ - checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); - } -#endif - checkTreePage(pCheck, pgno, NULL, !pPage->nCell?NULL:&nMaxKey); - } - - /* For intKey leaf pages, check that the min/max keys are in order - ** with any left/parent/right pages. - */ - pCheck->zPfx = "Page %d: "; - pCheck->v1 = iPage; - if( pPage->leaf && pPage->intKey ){ - /* if we are a left child page */ - if( pnParentMinKey ){ - /* if we are the left most child page */ - if( !pnParentMaxKey ){ - if( nMaxKey > *pnParentMinKey ){ - checkAppendMsg(pCheck, - "Rowid %lld out of order (max larger than parent min of %lld)", - nMaxKey, *pnParentMinKey); - } - }else{ - if( nMinKey <= *pnParentMinKey ){ - checkAppendMsg(pCheck, - "Rowid %lld out of order (min less than parent min of %lld)", - nMinKey, *pnParentMinKey); - } - if( nMaxKey > *pnParentMaxKey ){ - checkAppendMsg(pCheck, - "Rowid %lld out of order (max larger than parent max of %lld)", - nMaxKey, *pnParentMaxKey); - } - *pnParentMinKey = nMaxKey; - } - /* else if we're a right child page */ - } else if( pnParentMaxKey ){ - if( nMinKey <= *pnParentMaxKey ){ - checkAppendMsg(pCheck, - "Rowid %lld out of order (min less than parent max of %lld)", - nMinKey, *pnParentMaxKey); - } + }else{ + /* Populate the coverage-checking heap for leaf pages */ + btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1)); } } + *piMinKey = maxKey; /* Check for complete coverage of the page */ - data = pPage->aData; - hdr = pPage->hdrOffset; - hit = sqlite3PageMalloc( pBt->pageSize ); pCheck->zPfx = 0; - if( hit==0 ){ - pCheck->mallocFailed = 1; - }else{ - int contentOffset = get2byteNotZero(&data[hdr+5]); - assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */ - memset(hit+contentOffset, 0, usableSize-contentOffset); - memset(hit, 1, contentOffset); - /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the - ** number of cells on the page. */ - nCell = get2byte(&data[hdr+3]); - /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page - ** immediately follows the b-tree page header. */ - cellStart = hdr + 12 - 4*pPage->leaf; - /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte - ** integer offsets to the cell contents. */ - for(i=0; i<nCell; i++){ - int pc = get2byte(&data[cellStart+i*2]); - u32 size = 65536; - int j; - if( pc<=usableSize-4 ){ - size = cellSizePtr(pPage, &data[pc]); - } - if( (int)(pc+size-1)>=usableSize ){ - pCheck->zPfx = 0; - checkAppendMsg(pCheck, - "Corruption detected in cell %d on page %d",i,iPage); - }else{ - for(j=pc+size-1; j>=pc; j--) hit[j]++; + if( doCoverageCheck && pCheck->mxErr>0 ){ + /* For leaf pages, the min-heap has already been initialized and the + ** cells have already been inserted. But for internal pages, that has + ** not yet been done, so do it now */ + if( !pPage->leaf ){ + heap = pCheck->heap; + heap[0] = 0; + for(i=nCell-1; i>=0; i--){ + u32 size; + pc = get2byteAligned(&data[cellStart+i*2]); + size = pPage->xCellSize(pPage, &data[pc]); + btreeHeapInsert(heap, (pc<<16)|(pc+size-1)); } } - /* EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header + /* Add the freeblocks to the min-heap + ** + ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header ** is the offset of the first freeblock, or zero if there are no - ** freeblocks on the page. */ + ** freeblocks on the page. + */ i = get2byte(&data[hdr+1]); while( i>0 ){ int size, j; - assert( i<=usableSize-4 ); /* Enforced by btreeInitPage() */ + assert( (u32)i<=usableSize-4 ); /* Enforced by btreeInitPage() */ size = get2byte(&data[i+2]); - assert( i+size<=usableSize ); /* Enforced by btreeInitPage() */ - for(j=i+size-1; j>=i; j--) hit[j]++; + assert( (u32)(i+size)<=usableSize ); /* Enforced by btreeInitPage() */ + btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1)); /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a ** big-endian integer which is the offset in the b-tree page of the next ** freeblock in the chain, or zero if the freeblock is the last on the @@ -8751,33 +9128,50 @@ static int checkTreePage( /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of ** increasing offset. */ assert( j==0 || j>i+size ); /* Enforced by btreeInitPage() */ - assert( j<=usableSize-4 ); /* Enforced by btreeInitPage() */ + assert( (u32)j<=usableSize-4 ); /* Enforced by btreeInitPage() */ i = j; } - for(i=cnt=0; i<usableSize; i++){ - if( hit[i]==0 ){ - cnt++; - }else if( hit[i]>1 ){ + /* Analyze the min-heap looking for overlap between cells and/or + ** freeblocks, and counting the number of untracked bytes in nFrag. + ** + ** Each min-heap entry is of the form: (start_address<<16)|end_address. + ** There is an implied first entry the covers the page header, the cell + ** pointer index, and the gap between the cell pointer index and the start + ** of cell content. + ** + ** The loop below pulls entries from the min-heap in order and compares + ** the start_address against the previous end_address. If there is an + ** overlap, that means bytes are used multiple times. If there is a gap, + ** that gap is added to the fragmentation count. + */ + nFrag = 0; + prev = contentOffset - 1; /* Implied first min-heap entry */ + while( btreeHeapPull(heap,&x) ){ + if( (prev&0xffff)>=(x>>16) ){ checkAppendMsg(pCheck, - "Multiple uses for byte %d of page %d", i, iPage); + "Multiple uses for byte %u of page %d", x>>16, iPage); break; + }else{ + nFrag += (x>>16) - (prev&0xffff) - 1; + prev = x; } } + nFrag += usableSize - (prev&0xffff) - 1; /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments ** is stored in the fifth field of the b-tree page header. ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the ** number of fragmented free bytes within the cell content area. */ - if( cnt!=data[hdr+7] ){ + if( heap[0]==0 && nFrag!=data[hdr+7] ){ checkAppendMsg(pCheck, "Fragmentation of %d bytes reported as %d on page %d", - cnt, data[hdr+7], iPage); + nFrag, data[hdr+7], iPage); } } - sqlite3PageFree(hit); - releasePage(pPage); end_of_check: + if( !doCoverageCheck ) pPage->isInit = savedIsInit; + releasePage(pPage); pCheck->zPfx = saved_zPfx; pCheck->v1 = saved_v1; pCheck->v2 = saved_v2; @@ -8807,14 +9201,15 @@ char *sqlite3BtreeIntegrityCheck( int *pnErr /* Write number of errors seen to this variable */ ){ Pgno i; - int nRef; IntegrityCk sCheck; BtShared *pBt = p->pBt; + int savedDbFlags = pBt->db->flags; char zErr[100]; + VVA_ONLY( int nRef ); sqlite3BtreeEnter(p); assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE ); - nRef = sqlite3PagerRefcount(pBt->pPager); + assert( (nRef = sqlite3PagerRefcount(pBt->pPager))>=0 ); sCheck.pBt = pBt; sCheck.pPager = pBt->pPager; sCheck.nPage = btreePagecount(sCheck.pBt); @@ -8824,22 +9219,26 @@ char *sqlite3BtreeIntegrityCheck( sCheck.zPfx = 0; sCheck.v1 = 0; sCheck.v2 = 0; - *pnErr = 0; + sCheck.aPgRef = 0; + sCheck.heap = 0; + sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH); if( sCheck.nPage==0 ){ - sqlite3BtreeLeave(p); - return 0; + goto integrity_ck_cleanup; } sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1); if( !sCheck.aPgRef ){ - *pnErr = 1; - sqlite3BtreeLeave(p); - return 0; + sCheck.mallocFailed = 1; + goto integrity_ck_cleanup; + } + sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize ); + if( sCheck.heap==0 ){ + sCheck.mallocFailed = 1; + goto integrity_ck_cleanup; } + i = PENDING_BYTE_PAGE(pBt); if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i); - sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), SQLITE_MAX_LENGTH); - sCheck.errMsg.useMalloc = 2; /* Check the integrity of the freelist */ @@ -8850,17 +9249,19 @@ char *sqlite3BtreeIntegrityCheck( /* Check all the tables. */ + testcase( pBt->db->flags & SQLITE_CellSizeCk ); + pBt->db->flags &= ~SQLITE_CellSizeCk; for(i=0; (int)i<nRoot && sCheck.mxErr; i++){ + i64 notUsed; if( aRoot[i]==0 ) continue; #ifndef SQLITE_OMIT_AUTOVACUUM if( pBt->autoVacuum && aRoot[i]>1 ){ checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0); } #endif - sCheck.zPfx = "List of tree roots: "; - checkTreePage(&sCheck, aRoot[i], NULL, NULL); - sCheck.zPfx = 0; + checkTreePage(&sCheck, aRoot[i], ¬Used, LARGEST_INT64); } + pBt->db->flags = savedDbFlags; /* Make sure every page in the file is referenced */ @@ -8884,28 +9285,20 @@ char *sqlite3BtreeIntegrityCheck( #endif } - /* Make sure this analysis did not leave any unref() pages. - ** This is an internal consistency check; an integrity check - ** of the integrity check. - */ - if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){ - checkAppendMsg(&sCheck, - "Outstanding page count goes from %d to %d during this analysis", - nRef, sqlite3PagerRefcount(pBt->pPager) - ); - } - /* Clean up and report errors. */ - sqlite3BtreeLeave(p); +integrity_ck_cleanup: + sqlite3PageFree(sCheck.heap); sqlite3_free(sCheck.aPgRef); if( sCheck.mallocFailed ){ sqlite3StrAccumReset(&sCheck.errMsg); - *pnErr = sCheck.nErr+1; - return 0; + sCheck.nErr++; } *pnErr = sCheck.nErr; if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg); + /* Make sure this analysis did not leave any unref() pages. */ + assert( nRef==sqlite3PagerRefcount(pBt->pPager) ); + sqlite3BtreeLeave(p); return sqlite3StrAccumFinish(&sCheck.errMsg); } #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ @@ -9116,6 +9509,7 @@ int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ */ void sqlite3BtreeIncrblobCursor(BtCursor *pCur){ pCur->curFlags |= BTCF_Incrblob; + pCur->pBtree->hasIncrblobCur = 1; } #endif |