monotone

monotone Mtn Source Tree

Root/sqlite/btree.c

1/*
2** 2004 April 6
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** $Id: btree.c,v 1.388 2007/05/24 09:20:16 danielk1977 Exp $
13**
14** This file implements a external (disk-based) database using BTrees.
15** See the header comment on "btreeInt.h" for additional information.
16** Including a description of file format and an overview of operation.
17*/
18#include "btreeInt.h"
19
20/*
21** The header string that appears at the beginning of every
22** SQLite database.
23*/
24static const char zMagicHeader[] = SQLITE_FILE_HEADER;
25
26
27/*
28** Set this global variable to 1 to enable tracing using the TRACE
29** macro.
30*/
31#if SQLITE_TEST
32int sqlite3_btree_trace=0; /* True to enable tracing */
33#endif
34
35/*
36** Forward declaration
37*/
38static int checkReadLocks(Btree*,Pgno,BtCursor*);
39
40
41#ifdef SQLITE_OMIT_SHARED_CACHE
42 /*
43 ** The functions queryTableLock(), lockTable() and unlockAllTables()
44 ** manipulate entries in the BtShared.pLock linked list used to store
45 ** shared-cache table level locks. If the library is compiled with the
46 ** shared-cache feature disabled, then there is only ever one user
47 ** of each BtShared structure and so this locking is not necessary.
48 ** So define the lock related functions as no-ops.
49 */
50 #define queryTableLock(a,b,c) SQLITE_OK
51 #define lockTable(a,b,c) SQLITE_OK
52 #define unlockAllTables(a)
53#else
54
55/*
56** Query to see if btree handle p may obtain a lock of type eLock
57** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
58** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
59** SQLITE_LOCKED if not.
60*/
61static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
62 BtShared *pBt = p->pBt;
63 BtLock *pIter;
64
65 /* This is a no-op if the shared-cache is not enabled */
66 if( 0==sqlite3ThreadDataReadOnly()->useSharedData ){
67 return SQLITE_OK;
68 }
69
70 /* This (along with lockTable()) is where the ReadUncommitted flag is
71 ** dealt with. If the caller is querying for a read-lock and the flag is
72 ** set, it is unconditionally granted - even if there are write-locks
73 ** on the table. If a write-lock is requested, the ReadUncommitted flag
74 ** is not considered.
75 **
76 ** In function lockTable(), if a read-lock is demanded and the
77 ** ReadUncommitted flag is set, no entry is added to the locks list
78 ** (BtShared.pLock).
79 **
80 ** To summarize: If the ReadUncommitted flag is set, then read cursors do
81 ** not create or respect table locks. The locking procedure for a
82 ** write-cursor does not change.
83 */
84 if(
85 !p->pSqlite ||
86 0==(p->pSqlite->flags&SQLITE_ReadUncommitted) ||
87 eLock==WRITE_LOCK ||
88 iTab==MASTER_ROOT
89 ){
90 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
91 if( pIter->pBtree!=p && pIter->iTable==iTab &&
92 (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
93 return SQLITE_LOCKED;
94 }
95 }
96 }
97 return SQLITE_OK;
98}
99
100/*
101** Add a lock on the table with root-page iTable to the shared-btree used
102** by Btree handle p. Parameter eLock must be either READ_LOCK or
103** WRITE_LOCK.
104**
105** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
106** SQLITE_NOMEM may also be returned.
107*/
108static int lockTable(Btree *p, Pgno iTable, u8 eLock){
109 BtShared *pBt = p->pBt;
110 BtLock *pLock = 0;
111 BtLock *pIter;
112
113 /* This is a no-op if the shared-cache is not enabled */
114 if( 0==sqlite3ThreadDataReadOnly()->useSharedData ){
115 return SQLITE_OK;
116 }
117
118 assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
119
120 /* If the read-uncommitted flag is set and a read-lock is requested,
121 ** return early without adding an entry to the BtShared.pLock list. See
122 ** comment in function queryTableLock() for more info on handling
123 ** the ReadUncommitted flag.
124 */
125 if(
126 (p->pSqlite) &&
127 (p->pSqlite->flags&SQLITE_ReadUncommitted) &&
128 (eLock==READ_LOCK) &&
129 iTable!=MASTER_ROOT
130 ){
131 return SQLITE_OK;
132 }
133
134 /* First search the list for an existing lock on this table. */
135 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
136 if( pIter->iTable==iTable && pIter->pBtree==p ){
137 pLock = pIter;
138 break;
139 }
140 }
141
142 /* If the above search did not find a BtLock struct associating Btree p
143 ** with table iTable, allocate one and link it into the list.
144 */
145 if( !pLock ){
146 pLock = (BtLock *)sqliteMalloc(sizeof(BtLock));
147 if( !pLock ){
148 return SQLITE_NOMEM;
149 }
150 pLock->iTable = iTable;
151 pLock->pBtree = p;
152 pLock->pNext = pBt->pLock;
153 pBt->pLock = pLock;
154 }
155
156 /* Set the BtLock.eLock variable to the maximum of the current lock
157 ** and the requested lock. This means if a write-lock was already held
158 ** and a read-lock requested, we don't incorrectly downgrade the lock.
159 */
160 assert( WRITE_LOCK>READ_LOCK );
161 if( eLock>pLock->eLock ){
162 pLock->eLock = eLock;
163 }
164
165 return SQLITE_OK;
166}
167
168/*
169** Release all the table locks (locks obtained via calls to the lockTable()
170** procedure) held by Btree handle p.
171*/
172static void unlockAllTables(Btree *p){
173 BtLock **ppIter = &p->pBt->pLock;
174
175 /* If the shared-cache extension is not enabled, there should be no
176 ** locks in the BtShared.pLock list, making this procedure a no-op. Assert
177 ** that this is the case.
178 */
179 assert( sqlite3ThreadDataReadOnly()->useSharedData || 0==*ppIter );
180
181 while( *ppIter ){
182 BtLock *pLock = *ppIter;
183 if( pLock->pBtree==p ){
184 *ppIter = pLock->pNext;
185 sqliteFree(pLock);
186 }else{
187 ppIter = &pLock->pNext;
188 }
189 }
190}
191#endif /* SQLITE_OMIT_SHARED_CACHE */
192
193static void releasePage(MemPage *pPage); /* Forward reference */
194
195#ifndef SQLITE_OMIT_INCRBLOB
196/*
197** Invalidate the overflow page-list cache for cursor pCur, if any.
198*/
199static void invalidateOverflowCache(BtCursor *pCur){
200 sqliteFree(pCur->aOverflow);
201 pCur->aOverflow = 0;
202}
203
204/*
205** Invalidate the overflow page-list cache for all cursors opened
206** on the shared btree structure pBt.
207*/
208static void invalidateAllOverflowCache(BtShared *pBt){
209 BtCursor *p;
210 for(p=pBt->pCursor; p; p=p->pNext){
211 invalidateOverflowCache(p);
212 }
213}
214#else
215 #define invalidateOverflowCache(x)
216 #define invalidateAllOverflowCache(x)
217#endif
218
219/*
220** Save the current cursor position in the variables BtCursor.nKey
221** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
222*/
223static int saveCursorPosition(BtCursor *pCur){
224 int rc;
225
226 assert( CURSOR_VALID==pCur->eState );
227 assert( 0==pCur->pKey );
228
229 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
230
231 /* If this is an intKey table, then the above call to BtreeKeySize()
232 ** stores the integer key in pCur->nKey. In this case this value is
233 ** all that is required. Otherwise, if pCur is not open on an intKey
234 ** table, then malloc space for and store the pCur->nKey bytes of key
235 ** data.
236 */
237 if( rc==SQLITE_OK && 0==pCur->pPage->intKey){
238 void *pKey = sqliteMalloc(pCur->nKey);
239 if( pKey ){
240 rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
241 if( rc==SQLITE_OK ){
242 pCur->pKey = pKey;
243 }else{
244 sqliteFree(pKey);
245 }
246 }else{
247 rc = SQLITE_NOMEM;
248 }
249 }
250 assert( !pCur->pPage->intKey || !pCur->pKey );
251
252 if( rc==SQLITE_OK ){
253 releasePage(pCur->pPage);
254 pCur->pPage = 0;
255 pCur->eState = CURSOR_REQUIRESEEK;
256 }
257
258 invalidateOverflowCache(pCur);
259 return rc;
260}
261
262/*
263** Save the positions of all cursors except pExcept open on the table
264** with root-page iRoot. Usually, this is called just before cursor
265** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
266*/
267static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
268 BtCursor *p;
269 for(p=pBt->pCursor; p; p=p->pNext){
270 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
271 p->eState==CURSOR_VALID ){
272 int rc = saveCursorPosition(p);
273 if( SQLITE_OK!=rc ){
274 return rc;
275 }
276 }
277 }
278 return SQLITE_OK;
279}
280
281/*
282** Clear the current cursor position.
283*/
284static void clearCursorPosition(BtCursor *pCur){
285 sqliteFree(pCur->pKey);
286 pCur->pKey = 0;
287 pCur->eState = CURSOR_INVALID;
288}
289
290/*
291** Restore the cursor to the position it was in (or as close to as possible)
292** when saveCursorPosition() was called. Note that this call deletes the
293** saved position info stored by saveCursorPosition(), so there can be
294** at most one effective restoreOrClearCursorPosition() call after each
295** saveCursorPosition().
296**
297** If the second argument argument - doSeek - is false, then instead of
298** returning the cursor to it's saved position, any saved position is deleted
299** and the cursor state set to CURSOR_INVALID.
300*/
301int sqlite3BtreeRestoreOrClearCursorPosition(BtCursor *pCur){
302 int rc;
303 assert( pCur->eState==CURSOR_REQUIRESEEK );
304#ifndef SQLITE_OMIT_INCRBLOB
305 if( pCur->isIncrblobHandle ){
306 return SQLITE_ABORT;
307 }
308#endif
309 pCur->eState = CURSOR_INVALID;
310 rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
311 if( rc==SQLITE_OK ){
312 sqliteFree(pCur->pKey);
313 pCur->pKey = 0;
314 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
315 }
316 return rc;
317}
318
319#define restoreOrClearCursorPosition(p) \
320 (p->eState==CURSOR_REQUIRESEEK ? \
321 sqlite3BtreeRestoreOrClearCursorPosition(p) : \
322 SQLITE_OK)
323
324#ifndef SQLITE_OMIT_AUTOVACUUM
325/*
326** Given a page number of a regular database page, return the page
327** number for the pointer-map page that contains the entry for the
328** input page number.
329*/
330static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
331 int nPagesPerMapPage = (pBt->usableSize/5)+1;
332 int iPtrMap = (pgno-2)/nPagesPerMapPage;
333 int ret = (iPtrMap*nPagesPerMapPage) + 2;
334 if( ret==PENDING_BYTE_PAGE(pBt) ){
335 ret++;
336 }
337 return ret;
338}
339
340/*
341** Write an entry into the pointer map.
342**
343** This routine updates the pointer map entry for page number 'key'
344** so that it maps to type 'eType' and parent page number 'pgno'.
345** An error code is returned if something goes wrong, otherwise SQLITE_OK.
346*/
347static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
348 DbPage *pDbPage; /* The pointer map page */
349 u8 *pPtrmap; /* The pointer map data */
350 Pgno iPtrmap; /* The pointer map page number */
351 int offset; /* Offset in pointer map page */
352 int rc;
353
354 /* The master-journal page number must never be used as a pointer map page */
355 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
356
357 assert( pBt->autoVacuum );
358 if( key==0 ){
359 return SQLITE_CORRUPT_BKPT;
360 }
361 iPtrmap = PTRMAP_PAGENO(pBt, key);
362 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
363 if( rc!=SQLITE_OK ){
364 return rc;
365 }
366 offset = PTRMAP_PTROFFSET(pBt, key);
367 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
368
369 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
370 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
371 rc = sqlite3PagerWrite(pDbPage);
372 if( rc==SQLITE_OK ){
373 pPtrmap[offset] = eType;
374 put4byte(&pPtrmap[offset+1], parent);
375 }
376 }
377
378 sqlite3PagerUnref(pDbPage);
379 return rc;
380}
381
382/*
383** Read an entry from the pointer map.
384**
385** This routine retrieves the pointer map entry for page 'key', writing
386** the type and parent page number to *pEType and *pPgno respectively.
387** An error code is returned if something goes wrong, otherwise SQLITE_OK.
388*/
389static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
390 DbPage *pDbPage; /* The pointer map page */
391 int iPtrmap; /* Pointer map page index */
392 u8 *pPtrmap; /* Pointer map page data */
393 int offset; /* Offset of entry in pointer map */
394 int rc;
395
396 iPtrmap = PTRMAP_PAGENO(pBt, key);
397 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
398 if( rc!=0 ){
399 return rc;
400 }
401 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
402
403 offset = PTRMAP_PTROFFSET(pBt, key);
404 assert( pEType!=0 );
405 *pEType = pPtrmap[offset];
406 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
407
408 sqlite3PagerUnref(pDbPage);
409 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
410 return SQLITE_OK;
411}
412
413#endif /* SQLITE_OMIT_AUTOVACUUM */
414
415/*
416** Given a btree page and a cell index (0 means the first cell on
417** the page, 1 means the second cell, and so forth) return a pointer
418** to the cell content.
419**
420** This routine works only for pages that do not contain overflow cells.
421*/
422#define findCell(pPage, iCell) \
423 ((pPage)->aData + get2byte(&(pPage)->aData[(pPage)->cellOffset+2*(iCell)]))
424u8 *sqlite3BtreeFindCell(MemPage *pPage, int iCell){
425 u8 *data = pPage->aData;
426 assert( iCell>=0 );
427 assert( iCell<get2byte(&data[pPage->hdrOffset+3]) );
428 return findCell(pPage, iCell);
429}
430
431/*
432** This a more complex version of sqlite3BtreeFindCell() that works for
433** pages that do contain overflow cells. See insert
434*/
435static u8 *findOverflowCell(MemPage *pPage, int iCell){
436 int i;
437 for(i=pPage->nOverflow-1; i>=0; i--){
438 int k;
439 struct _OvflCell *pOvfl;
440 pOvfl = &pPage->aOvfl[i];
441 k = pOvfl->idx;
442 if( k<=iCell ){
443 if( k==iCell ){
444 return pOvfl->pCell;
445 }
446 iCell--;
447 }
448 }
449 return findCell(pPage, iCell);
450}
451
452/*
453** Parse a cell content block and fill in the CellInfo structure. There
454** are two versions of this function. sqlite3BtreeParseCell() takes a
455** cell index as the second argument and sqlite3BtreeParseCellPtr()
456** takes a pointer to the body of the cell as its second argument.
457**
458** Within this file, the parseCell() macro can be called instead of
459** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
460*/
461void sqlite3BtreeParseCellPtr(
462 MemPage *pPage, /* Page containing the cell */
463 u8 *pCell, /* Pointer to the cell text. */
464 CellInfo *pInfo /* Fill in this structure */
465){
466 int n; /* Number bytes in cell content header */
467 u32 nPayload; /* Number of bytes of cell payload */
468
469 pInfo->pCell = pCell;
470 assert( pPage->leaf==0 || pPage->leaf==1 );
471 n = pPage->childPtrSize;
472 assert( n==4-4*pPage->leaf );
473 if( pPage->hasData ){
474 n += getVarint32(&pCell[n], &nPayload);
475 }else{
476 nPayload = 0;
477 }
478 pInfo->nData = nPayload;
479 if( pPage->intKey ){
480 n += getVarint(&pCell[n], (u64 *)&pInfo->nKey);
481 }else{
482 u32 x;
483 n += getVarint32(&pCell[n], &x);
484 pInfo->nKey = x;
485 nPayload += x;
486 }
487 pInfo->nPayload = nPayload;
488 pInfo->nHeader = n;
489 if( nPayload<=pPage->maxLocal ){
490 /* This is the (easy) common case where the entire payload fits
491 ** on the local page. No overflow is required.
492 */
493 int nSize; /* Total size of cell content in bytes */
494 pInfo->nLocal = nPayload;
495 pInfo->iOverflow = 0;
496 nSize = nPayload + n;
497 if( nSize<4 ){
498 nSize = 4; /* Minimum cell size is 4 */
499 }
500 pInfo->nSize = nSize;
501 }else{
502 /* If the payload will not fit completely on the local page, we have
503 ** to decide how much to store locally and how much to spill onto
504 ** overflow pages. The strategy is to minimize the amount of unused
505 ** space on overflow pages while keeping the amount of local storage
506 ** in between minLocal and maxLocal.
507 **
508 ** Warning: changing the way overflow payload is distributed in any
509 ** way will result in an incompatible file format.
510 */
511 int minLocal; /* Minimum amount of payload held locally */
512 int maxLocal; /* Maximum amount of payload held locally */
513 int surplus; /* Overflow payload available for local storage */
514
515 minLocal = pPage->minLocal;
516 maxLocal = pPage->maxLocal;
517 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
518 if( surplus <= maxLocal ){
519 pInfo->nLocal = surplus;
520 }else{
521 pInfo->nLocal = minLocal;
522 }
523 pInfo->iOverflow = pInfo->nLocal + n;
524 pInfo->nSize = pInfo->iOverflow + 4;
525 }
526}
527#define parseCell(pPage, iCell, pInfo) \
528 sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
529void sqlite3BtreeParseCell(
530 MemPage *pPage, /* Page containing the cell */
531 int iCell, /* The cell index. First cell is 0 */
532 CellInfo *pInfo /* Fill in this structure */
533){
534 parseCell(pPage, iCell, pInfo);
535}
536
537/*
538** Compute the total number of bytes that a Cell needs in the cell
539** data area of the btree-page. The return number includes the cell
540** data header and the local payload, but not any overflow page or
541** the space used by the cell pointer.
542*/
543#ifndef NDEBUG
544static int cellSize(MemPage *pPage, int iCell){
545 CellInfo info;
546 sqlite3BtreeParseCell(pPage, iCell, &info);
547 return info.nSize;
548}
549#endif
550static int cellSizePtr(MemPage *pPage, u8 *pCell){
551 CellInfo info;
552 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
553 return info.nSize;
554}
555
556#ifndef SQLITE_OMIT_AUTOVACUUM
557/*
558** If the cell pCell, part of page pPage contains a pointer
559** to an overflow page, insert an entry into the pointer-map
560** for the overflow page.
561*/
562static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
563 if( pCell ){
564 CellInfo info;
565 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
566 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
567 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
568 Pgno ovfl = get4byte(&pCell[info.iOverflow]);
569 return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
570 }
571 }
572 return SQLITE_OK;
573}
574/*
575** If the cell with index iCell on page pPage contains a pointer
576** to an overflow page, insert an entry into the pointer-map
577** for the overflow page.
578*/
579static int ptrmapPutOvfl(MemPage *pPage, int iCell){
580 u8 *pCell;
581 pCell = findOverflowCell(pPage, iCell);
582 return ptrmapPutOvflPtr(pPage, pCell);
583}
584#endif
585
586
587/*
588** Defragment the page given. All Cells are moved to the
589** end of the page and all free space is collected into one
590** big FreeBlk that occurs in between the header and cell
591** pointer array and the cell content area.
592*/
593static int defragmentPage(MemPage *pPage){
594 int i; /* Loop counter */
595 int pc; /* Address of a i-th cell */
596 int addr; /* Offset of first byte after cell pointer array */
597 int hdr; /* Offset to the page header */
598 int size; /* Size of a cell */
599 int usableSize; /* Number of usable bytes on a page */
600 int cellOffset; /* Offset to the cell pointer array */
601 int brk; /* Offset to the cell content area */
602 int nCell; /* Number of cells on the page */
603 unsigned char *data; /* The page data */
604 unsigned char *temp; /* Temp area for cell content */
605
606 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
607 assert( pPage->pBt!=0 );
608 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
609 assert( pPage->nOverflow==0 );
610 temp = sqliteMalloc( pPage->pBt->pageSize );
611 if( temp==0 ) return SQLITE_NOMEM;
612 data = pPage->aData;
613 hdr = pPage->hdrOffset;
614 cellOffset = pPage->cellOffset;
615 nCell = pPage->nCell;
616 assert( nCell==get2byte(&data[hdr+3]) );
617 usableSize = pPage->pBt->usableSize;
618 brk = get2byte(&data[hdr+5]);
619 memcpy(&temp[brk], &data[brk], usableSize - brk);
620 brk = usableSize;
621 for(i=0; i<nCell; i++){
622 u8 *pAddr; /* The i-th cell pointer */
623 pAddr = &data[cellOffset + i*2];
624 pc = get2byte(pAddr);
625 assert( pc<pPage->pBt->usableSize );
626 size = cellSizePtr(pPage, &temp[pc]);
627 brk -= size;
628 memcpy(&data[brk], &temp[pc], size);
629 put2byte(pAddr, brk);
630 }
631 assert( brk>=cellOffset+2*nCell );
632 put2byte(&data[hdr+5], brk);
633 data[hdr+1] = 0;
634 data[hdr+2] = 0;
635 data[hdr+7] = 0;
636 addr = cellOffset+2*nCell;
637 memset(&data[addr], 0, brk-addr);
638 sqliteFree(temp);
639 return SQLITE_OK;
640}
641
642/*
643** Allocate nByte bytes of space on a page.
644**
645** Return the index into pPage->aData[] of the first byte of
646** the new allocation. Or return 0 if there is not enough free
647** space on the page to satisfy the allocation request.
648**
649** If the page contains nBytes of free space but does not contain
650** nBytes of contiguous free space, then this routine automatically
651** calls defragementPage() to consolidate all free space before
652** allocating the new chunk.
653*/
654static int allocateSpace(MemPage *pPage, int nByte){
655 int addr, pc, hdr;
656 int size;
657 int nFrag;
658 int top;
659 int nCell;
660 int cellOffset;
661 unsigned char *data;
662
663 data = pPage->aData;
664 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
665 assert( pPage->pBt );
666 if( nByte<4 ) nByte = 4;
667 if( pPage->nFree<nByte || pPage->nOverflow>0 ) return 0;
668 pPage->nFree -= nByte;
669 hdr = pPage->hdrOffset;
670
671 nFrag = data[hdr+7];
672 if( nFrag<60 ){
673 /* Search the freelist looking for a slot big enough to satisfy the
674 ** space request. */
675 addr = hdr+1;
676 while( (pc = get2byte(&data[addr]))>0 ){
677 size = get2byte(&data[pc+2]);
678 if( size>=nByte ){
679 if( size<nByte+4 ){
680 memcpy(&data[addr], &data[pc], 2);
681 data[hdr+7] = nFrag + size - nByte;
682 return pc;
683 }else{
684 put2byte(&data[pc+2], size-nByte);
685 return pc + size - nByte;
686 }
687 }
688 addr = pc;
689 }
690 }
691
692 /* Allocate memory from the gap in between the cell pointer array
693 ** and the cell content area.
694 */
695 top = get2byte(&data[hdr+5]);
696 nCell = get2byte(&data[hdr+3]);
697 cellOffset = pPage->cellOffset;
698 if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
699 if( defragmentPage(pPage) ) return 0;
700 top = get2byte(&data[hdr+5]);
701 }
702 top -= nByte;
703 assert( cellOffset + 2*nCell <= top );
704 put2byte(&data[hdr+5], top);
705 return top;
706}
707
708/*
709** Return a section of the pPage->aData to the freelist.
710** The first byte of the new free block is pPage->aDisk[start]
711** and the size of the block is "size" bytes.
712**
713** Most of the effort here is involved in coalesing adjacent
714** free blocks into a single big free block.
715*/
716static void freeSpace(MemPage *pPage, int start, int size){
717 int addr, pbegin, hdr;
718 unsigned char *data = pPage->aData;
719
720 assert( pPage->pBt!=0 );
721 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
722 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
723 assert( (start + size)<=pPage->pBt->usableSize );
724 if( size<4 ) size = 4;
725
726#ifdef SQLITE_SECURE_DELETE
727 /* Overwrite deleted information with zeros when the SECURE_DELETE
728 ** option is enabled at compile-time */
729 memset(&data[start], 0, size);
730#endif
731
732 /* Add the space back into the linked list of freeblocks */
733 hdr = pPage->hdrOffset;
734 addr = hdr + 1;
735 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
736 assert( pbegin<=pPage->pBt->usableSize-4 );
737 assert( pbegin>addr );
738 addr = pbegin;
739 }
740 assert( pbegin<=pPage->pBt->usableSize-4 );
741 assert( pbegin>addr || pbegin==0 );
742 put2byte(&data[addr], start);
743 put2byte(&data[start], pbegin);
744 put2byte(&data[start+2], size);
745 pPage->nFree += size;
746
747 /* Coalesce adjacent free blocks */
748 addr = pPage->hdrOffset + 1;
749 while( (pbegin = get2byte(&data[addr]))>0 ){
750 int pnext, psize;
751 assert( pbegin>addr );
752 assert( pbegin<=pPage->pBt->usableSize-4 );
753 pnext = get2byte(&data[pbegin]);
754 psize = get2byte(&data[pbegin+2]);
755 if( pbegin + psize + 3 >= pnext && pnext>0 ){
756 int frag = pnext - (pbegin+psize);
757 assert( frag<=data[pPage->hdrOffset+7] );
758 data[pPage->hdrOffset+7] -= frag;
759 put2byte(&data[pbegin], get2byte(&data[pnext]));
760 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
761 }else{
762 addr = pbegin;
763 }
764 }
765
766 /* If the cell content area begins with a freeblock, remove it. */
767 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
768 int top;
769 pbegin = get2byte(&data[hdr+1]);
770 memcpy(&data[hdr+1], &data[pbegin], 2);
771 top = get2byte(&data[hdr+5]);
772 put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
773 }
774}
775
776/*
777** Decode the flags byte (the first byte of the header) for a page
778** and initialize fields of the MemPage structure accordingly.
779*/
780static void decodeFlags(MemPage *pPage, int flagByte){
781 BtShared *pBt; /* A copy of pPage->pBt */
782
783 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
784 pPage->intKey = (flagByte & (PTF_INTKEY|PTF_LEAFDATA))!=0;
785 pPage->zeroData = (flagByte & PTF_ZERODATA)!=0;
786 pPage->leaf = (flagByte & PTF_LEAF)!=0;
787 pPage->childPtrSize = 4*(pPage->leaf==0);
788 pBt = pPage->pBt;
789 if( flagByte & PTF_LEAFDATA ){
790 pPage->leafData = 1;
791 pPage->maxLocal = pBt->maxLeaf;
792 pPage->minLocal = pBt->minLeaf;
793 }else{
794 pPage->leafData = 0;
795 pPage->maxLocal = pBt->maxLocal;
796 pPage->minLocal = pBt->minLocal;
797 }
798 pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
799}
800
801/*
802** Initialize the auxiliary information for a disk block.
803**
804** The pParent parameter must be a pointer to the MemPage which
805** is the parent of the page being initialized. The root of a
806** BTree has no parent and so for that page, pParent==NULL.
807**
808** Return SQLITE_OK on success. If we see that the page does
809** not contain a well-formed database page, then return
810** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
811** guarantee that the page is well-formed. It only shows that
812** we failed to detect any corruption.
813*/
814int sqlite3BtreeInitPage(
815 MemPage *pPage, /* The page to be initialized */
816 MemPage *pParent /* The parent. Might be NULL */
817){
818 int pc; /* Address of a freeblock within pPage->aData[] */
819 int hdr; /* Offset to beginning of page header */
820 u8 *data; /* Equal to pPage->aData */
821 BtShared *pBt; /* The main btree structure */
822 int usableSize; /* Amount of usable space on each page */
823 int cellOffset; /* Offset from start of page to first cell pointer */
824 int nFree; /* Number of unused bytes on the page */
825 int top; /* First byte of the cell content area */
826
827 pBt = pPage->pBt;
828 assert( pBt!=0 );
829 assert( pParent==0 || pParent->pBt==pBt );
830 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
831 assert( pPage->aData == &((unsigned char*)pPage)[-pBt->pageSize] );
832 if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){
833 /* The parent page should never change unless the file is corrupt */
834 return SQLITE_CORRUPT_BKPT;
835 }
836 if( pPage->isInit ) return SQLITE_OK;
837 if( pPage->pParent==0 && pParent!=0 ){
838 pPage->pParent = pParent;
839 sqlite3PagerRef(pParent->pDbPage);
840 }
841 hdr = pPage->hdrOffset;
842 data = pPage->aData;
843 decodeFlags(pPage, data[hdr]);
844 pPage->nOverflow = 0;
845 pPage->idxShift = 0;
846 usableSize = pBt->usableSize;
847 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
848 top = get2byte(&data[hdr+5]);
849 pPage->nCell = get2byte(&data[hdr+3]);
850 if( pPage->nCell>MX_CELL(pBt) ){
851 /* To many cells for a single page. The page must be corrupt */
852 return SQLITE_CORRUPT_BKPT;
853 }
854 if( pPage->nCell==0 && pParent!=0 && pParent->pgno!=1 ){
855 /* All pages must have at least one cell, except for root pages */
856 return SQLITE_CORRUPT_BKPT;
857 }
858
859 /* Compute the total free space on the page */
860 pc = get2byte(&data[hdr+1]);
861 nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
862 while( pc>0 ){
863 int next, size;
864 if( pc>usableSize-4 ){
865 /* Free block is off the page */
866 return SQLITE_CORRUPT_BKPT;
867 }
868 next = get2byte(&data[pc]);
869 size = get2byte(&data[pc+2]);
870 if( next>0 && next<=pc+size+3 ){
871 /* Free blocks must be in accending order */
872 return SQLITE_CORRUPT_BKPT;
873 }
874 nFree += size;
875 pc = next;
876 }
877 pPage->nFree = nFree;
878 if( nFree>=usableSize ){
879 /* Free space cannot exceed total page size */
880 return SQLITE_CORRUPT_BKPT;
881 }
882
883 pPage->isInit = 1;
884 return SQLITE_OK;
885}
886
887/*
888** Set up a raw page so that it looks like a database page holding
889** no entries.
890*/
891static void zeroPage(MemPage *pPage, int flags){
892 unsigned char *data = pPage->aData;
893 BtShared *pBt = pPage->pBt;
894 int hdr = pPage->hdrOffset;
895 int first;
896
897 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
898 assert( &data[pBt->pageSize] == (unsigned char*)pPage );
899 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
900 memset(&data[hdr], 0, pBt->usableSize - hdr);
901 data[hdr] = flags;
902 first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
903 memset(&data[hdr+1], 0, 4);
904 data[hdr+7] = 0;
905 put2byte(&data[hdr+5], pBt->usableSize);
906 pPage->nFree = pBt->usableSize - first;
907 decodeFlags(pPage, flags);
908 pPage->hdrOffset = hdr;
909 pPage->cellOffset = first;
910 pPage->nOverflow = 0;
911 pPage->idxShift = 0;
912 pPage->nCell = 0;
913 pPage->isInit = 1;
914}
915
916/*
917** Get a page from the pager. Initialize the MemPage.pBt and
918** MemPage.aData elements if needed.
919**
920** If the noContent flag is set, it means that we do not care about
921** the content of the page at this time. So do not go to the disk
922** to fetch the content. Just fill in the content with zeros for now.
923** If in the future we call sqlite3PagerWrite() on this page, that
924** means we have started to be concerned about content and the disk
925** read should occur at that point.
926*/
927int sqlite3BtreeGetPage(
928 BtShared *pBt, /* The btree */
929 Pgno pgno, /* Number of the page to fetch */
930 MemPage **ppPage, /* Return the page in this parameter */
931 int noContent /* Do not load page content if true */
932){
933 int rc;
934 MemPage *pPage;
935 DbPage *pDbPage;
936
937 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
938 if( rc ) return rc;
939 pPage = (MemPage *)sqlite3PagerGetExtra(pDbPage);
940 pPage->aData = sqlite3PagerGetData(pDbPage);
941 pPage->pDbPage = pDbPage;
942 pPage->pBt = pBt;
943 pPage->pgno = pgno;
944 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
945 *ppPage = pPage;
946 return SQLITE_OK;
947}
948
949/*
950** Get a page from the pager and initialize it. This routine
951** is just a convenience wrapper around separate calls to
952** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
953*/
954static int getAndInitPage(
955 BtShared *pBt, /* The database file */
956 Pgno pgno, /* Number of the page to get */
957 MemPage **ppPage, /* Write the page pointer here */
958 MemPage *pParent /* Parent of the page */
959){
960 int rc;
961 if( pgno==0 ){
962 return SQLITE_CORRUPT_BKPT;
963 }
964 rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
965 if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){
966 rc = sqlite3BtreeInitPage(*ppPage, pParent);
967 }
968 return rc;
969}
970
971/*
972** Release a MemPage. This should be called once for each prior
973** call to sqlite3BtreeGetPage.
974*/
975static void releasePage(MemPage *pPage){
976 if( pPage ){
977 assert( pPage->aData );
978 assert( pPage->pBt );
979 assert( &pPage->aData[pPage->pBt->pageSize]==(unsigned char*)pPage );
980 sqlite3PagerUnref(pPage->pDbPage);
981 }
982}
983
984/*
985** This routine is called when the reference count for a page
986** reaches zero. We need to unref the pParent pointer when that
987** happens.
988*/
989static void pageDestructor(DbPage *pData, int pageSize){
990 MemPage *pPage;
991 assert( (pageSize & 7)==0 );
992 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
993 if( pPage->pParent ){
994 MemPage *pParent = pPage->pParent;
995 pPage->pParent = 0;
996 releasePage(pParent);
997 }
998 pPage->isInit = 0;
999}
1000
1001/*
1002** During a rollback, when the pager reloads information into the cache
1003** so that the cache is restored to its original state at the start of
1004** the transaction, for each page restored this routine is called.
1005**
1006** This routine needs to reset the extra data section at the end of the
1007** page to agree with the restored data.
1008*/
1009static void pageReinit(DbPage *pData, int pageSize){
1010 MemPage *pPage;
1011 assert( (pageSize & 7)==0 );
1012 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1013 if( pPage->isInit ){
1014 pPage->isInit = 0;
1015 sqlite3BtreeInitPage(pPage, pPage->pParent);
1016 }
1017}
1018
1019/*
1020** Open a database file.
1021**
1022** zFilename is the name of the database file. If zFilename is NULL
1023** a new database with a random name is created. This randomly named
1024** database file will be deleted when sqlite3BtreeClose() is called.
1025*/
1026int sqlite3BtreeOpen(
1027 const char *zFilename, /* Name of the file containing the BTree database */
1028 sqlite3 *pSqlite, /* Associated database handle */
1029 Btree **ppBtree, /* Pointer to new Btree object written here */
1030 int flags /* Options */
1031){
1032 BtShared *pBt; /* Shared part of btree structure */
1033 Btree *p; /* Handle to return */
1034 int rc = SQLITE_OK;
1035 int nReserve;
1036 unsigned char zDbHeader[100];
1037#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1038 const ThreadData *pTsdro;
1039#endif
1040
1041 /* Set the variable isMemdb to true for an in-memory database, or
1042 ** false for a file-based database. This symbol is only required if
1043 ** either of the shared-data or autovacuum features are compiled
1044 ** into the library.
1045 */
1046#if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1047 #ifdef SQLITE_OMIT_MEMORYDB
1048 const int isMemdb = 0;
1049 #else
1050 const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
1051 #endif
1052#endif
1053
1054 p = sqliteMalloc(sizeof(Btree));
1055 if( !p ){
1056 return SQLITE_NOMEM;
1057 }
1058 p->inTrans = TRANS_NONE;
1059 p->pSqlite = pSqlite;
1060
1061 /* Try to find an existing Btree structure opened on zFilename. */
1062#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1063 pTsdro = sqlite3ThreadDataReadOnly();
1064 if( pTsdro->useSharedData && zFilename && !isMemdb ){
1065 char *zFullPathname = sqlite3OsFullPathname(zFilename);
1066 if( !zFullPathname ){
1067 sqliteFree(p);
1068 return SQLITE_NOMEM;
1069 }
1070 for(pBt=pTsdro->pBtree; pBt; pBt=pBt->pNext){
1071 assert( pBt->nRef>0 );
1072 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager)) ){
1073 p->pBt = pBt;
1074 *ppBtree = p;
1075 pBt->nRef++;
1076 sqliteFree(zFullPathname);
1077 return SQLITE_OK;
1078 }
1079 }
1080 sqliteFree(zFullPathname);
1081 }
1082#endif
1083
1084 /*
1085 ** The following asserts make sure that structures used by the btree are
1086 ** the right size. This is to guard against size changes that result
1087 ** when compiling on a different architecture.
1088 */
1089 assert( sizeof(i64)==8 || sizeof(i64)==4 );
1090 assert( sizeof(u64)==8 || sizeof(u64)==4 );
1091 assert( sizeof(u32)==4 );
1092 assert( sizeof(u16)==2 );
1093 assert( sizeof(Pgno)==4 );
1094
1095 pBt = sqliteMalloc( sizeof(*pBt) );
1096 if( pBt==0 ){
1097 rc = SQLITE_NOMEM;
1098 goto btree_open_out;
1099 }
1100 rc = sqlite3PagerOpen(&pBt->pPager, zFilename, EXTRA_SIZE, flags);
1101 if( rc==SQLITE_OK ){
1102 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1103 }
1104 if( rc!=SQLITE_OK ){
1105 goto btree_open_out;
1106 }
1107 p->pBt = pBt;
1108
1109 sqlite3PagerSetDestructor(pBt->pPager, pageDestructor);
1110 sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
1111 pBt->pCursor = 0;
1112 pBt->pPage1 = 0;
1113 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1114 pBt->pageSize = get2byte(&zDbHeader[16]);
1115 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1116 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1117 pBt->pageSize = SQLITE_DEFAULT_PAGE_SIZE;
1118 pBt->maxEmbedFrac = 64; /* 25% */
1119 pBt->minEmbedFrac = 32; /* 12.5% */
1120 pBt->minLeafFrac = 32; /* 12.5% */
1121#ifndef SQLITE_OMIT_AUTOVACUUM
1122 /* If the magic name ":memory:" will create an in-memory database, then
1123 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1124 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1125 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1126 ** regular file-name. In this case the auto-vacuum applies as per normal.
1127 */
1128 if( zFilename && !isMemdb ){
1129 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1130 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1131 }
1132#endif
1133 nReserve = 0;
1134 }else{
1135 nReserve = zDbHeader[20];
1136 pBt->maxEmbedFrac = zDbHeader[21];
1137 pBt->minEmbedFrac = zDbHeader[22];
1138 pBt->minLeafFrac = zDbHeader[23];
1139 pBt->pageSizeFixed = 1;
1140#ifndef SQLITE_OMIT_AUTOVACUUM
1141 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1142#endif
1143 }
1144 pBt->usableSize = pBt->pageSize - nReserve;
1145 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
1146 sqlite3PagerSetPagesize(pBt->pPager, pBt->pageSize);
1147
1148#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1149 /* Add the new btree to the linked list starting at ThreadData.pBtree.
1150 ** There is no chance that a malloc() may fail inside of the
1151 ** sqlite3ThreadData() call, as the ThreadData structure must have already
1152 ** been allocated for pTsdro->useSharedData to be non-zero.
1153 */
1154 if( pTsdro->useSharedData && zFilename && !isMemdb ){
1155 pBt->pNext = pTsdro->pBtree;
1156 sqlite3ThreadData()->pBtree = pBt;
1157 }
1158#endif
1159 pBt->nRef = 1;
1160 *ppBtree = p;
1161
1162btree_open_out:
1163 if( rc!=SQLITE_OK ){
1164 if( pBt && pBt->pPager ){
1165 sqlite3PagerClose(pBt->pPager);
1166 }
1167 sqliteFree(pBt);
1168 sqliteFree(p);
1169 *ppBtree = 0;
1170 }
1171 return rc;
1172}
1173
1174/*
1175** Close an open database and invalidate all cursors.
1176*/
1177int sqlite3BtreeClose(Btree *p){
1178 BtShared *pBt = p->pBt;
1179 BtCursor *pCur;
1180
1181#ifndef SQLITE_OMIT_SHARED_CACHE
1182 ThreadData *pTsd;
1183#endif
1184
1185 /* Close all cursors opened via this handle. */
1186 pCur = pBt->pCursor;
1187 while( pCur ){
1188 BtCursor *pTmp = pCur;
1189 pCur = pCur->pNext;
1190 if( pTmp->pBtree==p ){
1191 sqlite3BtreeCloseCursor(pTmp);
1192 }
1193 }
1194
1195 /* Rollback any active transaction and free the handle structure.
1196 ** The call to sqlite3BtreeRollback() drops any table-locks held by
1197 ** this handle.
1198 */
1199 sqlite3BtreeRollback(p);
1200 sqliteFree(p);
1201
1202#ifndef SQLITE_OMIT_SHARED_CACHE
1203 /* If there are still other outstanding references to the shared-btree
1204 ** structure, return now. The remainder of this procedure cleans
1205 ** up the shared-btree.
1206 */
1207 assert( pBt->nRef>0 );
1208 pBt->nRef--;
1209 if( pBt->nRef ){
1210 return SQLITE_OK;
1211 }
1212
1213 /* Remove the shared-btree from the thread wide list. Call
1214 ** ThreadDataReadOnly() and then cast away the const property of the
1215 ** pointer to avoid allocating thread data if it is not really required.
1216 */
1217 pTsd = (ThreadData *)sqlite3ThreadDataReadOnly();
1218 if( pTsd->pBtree==pBt ){
1219 assert( pTsd==sqlite3ThreadData() );
1220 pTsd->pBtree = pBt->pNext;
1221 }else{
1222 BtShared *pPrev;
1223 for(pPrev=pTsd->pBtree; pPrev && pPrev->pNext!=pBt; pPrev=pPrev->pNext){}
1224 if( pPrev ){
1225 assert( pTsd==sqlite3ThreadData() );
1226 pPrev->pNext = pBt->pNext;
1227 }
1228 }
1229#endif
1230
1231 /* Close the pager and free the shared-btree structure */
1232 assert( !pBt->pCursor );
1233 sqlite3PagerClose(pBt->pPager);
1234 if( pBt->xFreeSchema && pBt->pSchema ){
1235 pBt->xFreeSchema(pBt->pSchema);
1236 }
1237 sqliteFree(pBt->pSchema);
1238 sqliteFree(pBt);
1239 return SQLITE_OK;
1240}
1241
1242/*
1243** Change the busy handler callback function.
1244*/
1245int sqlite3BtreeSetBusyHandler(Btree *p, BusyHandler *pHandler){
1246 BtShared *pBt = p->pBt;
1247 pBt->pBusyHandler = pHandler;
1248 sqlite3PagerSetBusyhandler(pBt->pPager, pHandler);
1249 return SQLITE_OK;
1250}
1251
1252/*
1253** Change the limit on the number of pages allowed in the cache.
1254**
1255** The maximum number of cache pages is set to the absolute
1256** value of mxPage. If mxPage is negative, the pager will
1257** operate asynchronously - it will not stop to do fsync()s
1258** to insure data is written to the disk surface before
1259** continuing. Transactions still work if synchronous is off,
1260** and the database cannot be corrupted if this program
1261** crashes. But if the operating system crashes or there is
1262** an abrupt power failure when synchronous is off, the database
1263** could be left in an inconsistent and unrecoverable state.
1264** Synchronous is on by default so database corruption is not
1265** normally a worry.
1266*/
1267int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1268 BtShared *pBt = p->pBt;
1269 sqlite3PagerSetCachesize(pBt->pPager, mxPage);
1270 return SQLITE_OK;
1271}
1272
1273/*
1274** Change the way data is synced to disk in order to increase or decrease
1275** how well the database resists damage due to OS crashes and power
1276** failures. Level 1 is the same as asynchronous (no syncs() occur and
1277** there is a high probability of damage) Level 2 is the default. There
1278** is a very low but non-zero probability of damage. Level 3 reduces the
1279** probability of damage to near zero but with a write performance reduction.
1280*/
1281#ifndef SQLITE_OMIT_PAGER_PRAGMAS
1282int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
1283 BtShared *pBt = p->pBt;
1284 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
1285 return SQLITE_OK;
1286}
1287#endif
1288
1289/*
1290** Return TRUE if the given btree is set to safety level 1. In other
1291** words, return TRUE if no sync() occurs on the disk files.
1292*/
1293int sqlite3BtreeSyncDisabled(Btree *p){
1294 BtShared *pBt = p->pBt;
1295 assert( pBt && pBt->pPager );
1296 return sqlite3PagerNosync(pBt->pPager);
1297}
1298
1299#if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
1300/*
1301** Change the default pages size and the number of reserved bytes per page.
1302**
1303** The page size must be a power of 2 between 512 and 65536. If the page
1304** size supplied does not meet this constraint then the page size is not
1305** changed.
1306**
1307** Page sizes are constrained to be a power of two so that the region
1308** of the database file used for locking (beginning at PENDING_BYTE,
1309** the first byte past the 1GB boundary, 0x40000000) needs to occur
1310** at the beginning of a page.
1311**
1312** If parameter nReserve is less than zero, then the number of reserved
1313** bytes per page is left unchanged.
1314*/
1315int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
1316 BtShared *pBt = p->pBt;
1317 if( pBt->pageSizeFixed ){
1318 return SQLITE_READONLY;
1319 }
1320 if( nReserve<0 ){
1321 nReserve = pBt->pageSize - pBt->usableSize;
1322 }
1323 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1324 ((pageSize-1)&pageSize)==0 ){
1325 assert( (pageSize & 7)==0 );
1326 assert( !pBt->pPage1 && !pBt->pCursor );
1327 pBt->pageSize = sqlite3PagerSetPagesize(pBt->pPager, pageSize);
1328 }
1329 pBt->usableSize = pBt->pageSize - nReserve;
1330 return SQLITE_OK;
1331}
1332
1333/*
1334** Return the currently defined page size
1335*/
1336int sqlite3BtreeGetPageSize(Btree *p){
1337 return p->pBt->pageSize;
1338}
1339int sqlite3BtreeGetReserve(Btree *p){
1340 return p->pBt->pageSize - p->pBt->usableSize;
1341}
1342
1343/*
1344** Set the maximum page count for a database if mxPage is positive.
1345** No changes are made if mxPage is 0 or negative.
1346** Regardless of the value of mxPage, return the maximum page count.
1347*/
1348int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
1349 return sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
1350}
1351#endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
1352
1353/*
1354** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1355** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1356** is disabled. The default value for the auto-vacuum property is
1357** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1358*/
1359int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
1360#ifdef SQLITE_OMIT_AUTOVACUUM
1361 return SQLITE_READONLY;
1362#else
1363 BtShared *pBt = p->pBt;
1364 int av = (autoVacuum?1:0);
1365 int iv = (autoVacuum==BTREE_AUTOVACUUM_INCR?1:0);
1366 if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
1367 return SQLITE_READONLY;
1368 }
1369 pBt->autoVacuum = av;
1370 pBt->incrVacuum = iv;
1371 return SQLITE_OK;
1372#endif
1373}
1374
1375/*
1376** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1377** enabled 1 is returned. Otherwise 0.
1378*/
1379int sqlite3BtreeGetAutoVacuum(Btree *p){
1380#ifdef SQLITE_OMIT_AUTOVACUUM
1381 return BTREE_AUTOVACUUM_NONE;
1382#else
1383 return (
1384 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
1385 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
1386 BTREE_AUTOVACUUM_INCR
1387 );
1388#endif
1389}
1390
1391
1392/*
1393** Get a reference to pPage1 of the database file. This will
1394** also acquire a readlock on that file.
1395**
1396** SQLITE_OK is returned on success. If the file is not a
1397** well-formed database file, then SQLITE_CORRUPT is returned.
1398** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
1399** is returned if we run out of memory.
1400*/
1401static int lockBtree(BtShared *pBt){
1402 int rc, pageSize;
1403 MemPage *pPage1;
1404 if( pBt->pPage1 ) return SQLITE_OK;
1405 rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
1406 if( rc!=SQLITE_OK ) return rc;
1407
1408
1409 /* Do some checking to help insure the file we opened really is
1410 ** a valid database file.
1411 */
1412 rc = SQLITE_NOTADB;
1413 if( sqlite3PagerPagecount(pBt->pPager)>0 ){
1414 u8 *page1 = pPage1->aData;
1415 if( memcmp(page1, zMagicHeader, 16)!=0 ){
1416 goto page1_init_failed;
1417 }
1418 if( page1[18]>1 ){
1419 pBt->readOnly = 1;
1420 }
1421 if( page1[19]>1 ){
1422 goto page1_init_failed;
1423 }
1424 pageSize = get2byte(&page1[16]);
1425 if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ){
1426 goto page1_init_failed;
1427 }
1428 assert( (pageSize & 7)==0 );
1429 pBt->pageSize = pageSize;
1430 pBt->usableSize = pageSize - page1[20];
1431 if( pBt->usableSize<500 ){
1432 goto page1_init_failed;
1433 }
1434 pBt->maxEmbedFrac = page1[21];
1435 pBt->minEmbedFrac = page1[22];
1436 pBt->minLeafFrac = page1[23];
1437#ifndef SQLITE_OMIT_AUTOVACUUM
1438 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
1439#endif
1440 }
1441
1442 /* maxLocal is the maximum amount of payload to store locally for
1443 ** a cell. Make sure it is small enough so that at least minFanout
1444 ** cells can will fit on one page. We assume a 10-byte page header.
1445 ** Besides the payload, the cell must store:
1446 ** 2-byte pointer to the cell
1447 ** 4-byte child pointer
1448 ** 9-byte nKey value
1449 ** 4-byte nData value
1450 ** 4-byte overflow page pointer
1451 ** So a cell consists of a 2-byte poiner, a header which is as much as
1452 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1453 ** page pointer.
1454 */
1455 pBt->maxLocal = (pBt->usableSize-12)*pBt->maxEmbedFrac/255 - 23;
1456 pBt->minLocal = (pBt->usableSize-12)*pBt->minEmbedFrac/255 - 23;
1457 pBt->maxLeaf = pBt->usableSize - 35;
1458 pBt->minLeaf = (pBt->usableSize-12)*pBt->minLeafFrac/255 - 23;
1459 if( pBt->minLocal>pBt->maxLocal || pBt->maxLocal<0 ){
1460 goto page1_init_failed;
1461 }
1462 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
1463 pBt->pPage1 = pPage1;
1464 return SQLITE_OK;
1465
1466page1_init_failed:
1467 releasePage(pPage1);
1468 pBt->pPage1 = 0;
1469 return rc;
1470}
1471
1472/*
1473** This routine works like lockBtree() except that it also invokes the
1474** busy callback if there is lock contention.
1475*/
1476static int lockBtreeWithRetry(Btree *pRef){
1477 int rc = SQLITE_OK;
1478 if( pRef->inTrans==TRANS_NONE ){
1479 u8 inTransaction = pRef->pBt->inTransaction;
1480 btreeIntegrity(pRef);
1481 rc = sqlite3BtreeBeginTrans(pRef, 0);
1482 pRef->pBt->inTransaction = inTransaction;
1483 pRef->inTrans = TRANS_NONE;
1484 if( rc==SQLITE_OK ){
1485 pRef->pBt->nTransaction--;
1486 }
1487 btreeIntegrity(pRef);
1488 }
1489 return rc;
1490}
1491
1492
1493/*
1494** If there are no outstanding cursors and we are not in the middle
1495** of a transaction but there is a read lock on the database, then
1496** this routine unrefs the first page of the database file which
1497** has the effect of releasing the read lock.
1498**
1499** If there are any outstanding cursors, this routine is a no-op.
1500**
1501** If there is a transaction in progress, this routine is a no-op.
1502*/
1503static void unlockBtreeIfUnused(BtShared *pBt){
1504 if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
1505 if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
1506 if( pBt->pPage1->aData==0 ){
1507 MemPage *pPage = pBt->pPage1;
1508 pPage->aData = &((u8*)pPage)[-pBt->pageSize];
1509 pPage->pBt = pBt;
1510 pPage->pgno = 1;
1511 }
1512 releasePage(pBt->pPage1);
1513 }
1514 pBt->pPage1 = 0;
1515 pBt->inStmt = 0;
1516 }
1517}
1518
1519/*
1520** Create a new database by initializing the first page of the
1521** file.
1522*/
1523static int newDatabase(BtShared *pBt){
1524 MemPage *pP1;
1525 unsigned char *data;
1526 int rc;
1527 if( sqlite3PagerPagecount(pBt->pPager)>0 ) return SQLITE_OK;
1528 pP1 = pBt->pPage1;
1529 assert( pP1!=0 );
1530 data = pP1->aData;
1531 rc = sqlite3PagerWrite(pP1->pDbPage);
1532 if( rc ) return rc;
1533 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1534 assert( sizeof(zMagicHeader)==16 );
1535 put2byte(&data[16], pBt->pageSize);
1536 data[18] = 1;
1537 data[19] = 1;
1538 data[20] = pBt->pageSize - pBt->usableSize;
1539 data[21] = pBt->maxEmbedFrac;
1540 data[22] = pBt->minEmbedFrac;
1541 data[23] = pBt->minLeafFrac;
1542 memset(&data[24], 0, 100-24);
1543 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
1544 pBt->pageSizeFixed = 1;
1545#ifndef SQLITE_OMIT_AUTOVACUUM
1546 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
1547 put4byte(&data[36 + 4*4], pBt->autoVacuum);
1548#endif
1549 return SQLITE_OK;
1550}
1551
1552/*
1553** Attempt to start a new transaction. A write-transaction
1554** is started if the second argument is nonzero, otherwise a read-
1555** transaction. If the second argument is 2 or more and exclusive
1556** transaction is started, meaning that no other process is allowed
1557** to access the database. A preexisting transaction may not be
1558** upgraded to exclusive by calling this routine a second time - the
1559** exclusivity flag only works for a new transaction.
1560**
1561** A write-transaction must be started before attempting any
1562** changes to the database. None of the following routines
1563** will work unless a transaction is started first:
1564**
1565** sqlite3BtreeCreateTable()
1566** sqlite3BtreeCreateIndex()
1567** sqlite3BtreeClearTable()
1568** sqlite3BtreeDropTable()
1569** sqlite3BtreeInsert()
1570** sqlite3BtreeDelete()
1571** sqlite3BtreeUpdateMeta()
1572**
1573** If an initial attempt to acquire the lock fails because of lock contention
1574** and the database was previously unlocked, then invoke the busy handler
1575** if there is one. But if there was previously a read-lock, do not
1576** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
1577** returned when there is already a read-lock in order to avoid a deadlock.
1578**
1579** Suppose there are two processes A and B. A has a read lock and B has
1580** a reserved lock. B tries to promote to exclusive but is blocked because
1581** of A's read lock. A tries to promote to reserved but is blocked by B.
1582** One or the other of the two processes must give way or there can be
1583** no progress. By returning SQLITE_BUSY and not invoking the busy callback
1584** when A already has a read lock, we encourage A to give up and let B
1585** proceed.
1586*/
1587int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
1588 BtShared *pBt = p->pBt;
1589 int rc = SQLITE_OK;
1590
1591 btreeIntegrity(p);
1592
1593 /* If the btree is already in a write-transaction, or it
1594 ** is already in a read-transaction and a read-transaction
1595 ** is requested, this is a no-op.
1596 */
1597 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
1598 return SQLITE_OK;
1599 }
1600
1601 /* Write transactions are not possible on a read-only database */
1602 if( pBt->readOnly && wrflag ){
1603 return SQLITE_READONLY;
1604 }
1605
1606 /* If another database handle has already opened a write transaction
1607 ** on this shared-btree structure and a second write transaction is
1608 ** requested, return SQLITE_BUSY.
1609 */
1610 if( pBt->inTransaction==TRANS_WRITE && wrflag ){
1611 return SQLITE_BUSY;
1612 }
1613
1614 do {
1615 if( pBt->pPage1==0 ){
1616 rc = lockBtree(pBt);
1617 }
1618
1619 if( rc==SQLITE_OK && wrflag ){
1620 if( pBt->readOnly ){
1621 rc = SQLITE_READONLY;
1622 }else{
1623 rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
1624 if( rc==SQLITE_OK ){
1625 rc = newDatabase(pBt);
1626 }
1627 }
1628 }
1629
1630 if( rc==SQLITE_OK ){
1631 if( wrflag ) pBt->inStmt = 0;
1632 }else{
1633 unlockBtreeIfUnused(pBt);
1634 }
1635 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
1636 sqlite3InvokeBusyHandler(pBt->pBusyHandler) );
1637
1638 if( rc==SQLITE_OK ){
1639 if( p->inTrans==TRANS_NONE ){
1640 pBt->nTransaction++;
1641 }
1642 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
1643 if( p->inTrans>pBt->inTransaction ){
1644 pBt->inTransaction = p->inTrans;
1645 }
1646 }
1647
1648 btreeIntegrity(p);
1649 return rc;
1650}
1651
1652#ifndef SQLITE_OMIT_AUTOVACUUM
1653
1654/*
1655** Set the pointer-map entries for all children of page pPage. Also, if
1656** pPage contains cells that point to overflow pages, set the pointer
1657** map entries for the overflow pages as well.
1658*/
1659static int setChildPtrmaps(MemPage *pPage){
1660 int i; /* Counter variable */
1661 int nCell; /* Number of cells in page pPage */
1662 int rc; /* Return code */
1663 BtShared *pBt = pPage->pBt;
1664 int isInitOrig = pPage->isInit;
1665 Pgno pgno = pPage->pgno;
1666
1667 rc = sqlite3BtreeInitPage(pPage, pPage->pParent);
1668 if( rc!=SQLITE_OK ){
1669 goto set_child_ptrmaps_out;
1670 }
1671 nCell = pPage->nCell;
1672
1673 for(i=0; i<nCell; i++){
1674 u8 *pCell = findCell(pPage, i);
1675
1676 rc = ptrmapPutOvflPtr(pPage, pCell);
1677 if( rc!=SQLITE_OK ){
1678 goto set_child_ptrmaps_out;
1679 }
1680
1681 if( !pPage->leaf ){
1682 Pgno childPgno = get4byte(pCell);
1683 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1684 if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
1685 }
1686 }
1687
1688 if( !pPage->leaf ){
1689 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1690 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1691 }
1692
1693set_child_ptrmaps_out:
1694 pPage->isInit = isInitOrig;
1695 return rc;
1696}
1697
1698/*
1699** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
1700** page, is a pointer to page iFrom. Modify this pointer so that it points to
1701** iTo. Parameter eType describes the type of pointer to be modified, as
1702** follows:
1703**
1704** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
1705** page of pPage.
1706**
1707** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
1708** page pointed to by one of the cells on pPage.
1709**
1710** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
1711** overflow page in the list.
1712*/
1713static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
1714 if( eType==PTRMAP_OVERFLOW2 ){
1715 /* The pointer is always the first 4 bytes of the page in this case. */
1716 if( get4byte(pPage->aData)!=iFrom ){
1717 return SQLITE_CORRUPT_BKPT;
1718 }
1719 put4byte(pPage->aData, iTo);
1720 }else{
1721 int isInitOrig = pPage->isInit;
1722 int i;
1723 int nCell;
1724
1725 sqlite3BtreeInitPage(pPage, 0);
1726 nCell = pPage->nCell;
1727
1728 for(i=0; i<nCell; i++){
1729 u8 *pCell = findCell(pPage, i);
1730 if( eType==PTRMAP_OVERFLOW1 ){
1731 CellInfo info;
1732 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1733 if( info.iOverflow ){
1734 if( iFrom==get4byte(&pCell[info.iOverflow]) ){
1735 put4byte(&pCell[info.iOverflow], iTo);
1736 break;
1737 }
1738 }
1739 }else{
1740 if( get4byte(pCell)==iFrom ){
1741 put4byte(pCell, iTo);
1742 break;
1743 }
1744 }
1745 }
1746
1747 if( i==nCell ){
1748 if( eType!=PTRMAP_BTREE ||
1749 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
1750 return SQLITE_CORRUPT_BKPT;
1751 }
1752 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
1753 }
1754
1755 pPage->isInit = isInitOrig;
1756 }
1757 return SQLITE_OK;
1758}
1759
1760
1761/*
1762** Move the open database page pDbPage to location iFreePage in the
1763** database. The pDbPage reference remains valid.
1764*/
1765static int relocatePage(
1766 BtShared *pBt, /* Btree */
1767 MemPage *pDbPage, /* Open page to move */
1768 u8 eType, /* Pointer map 'type' entry for pDbPage */
1769 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
1770 Pgno iFreePage /* The location to move pDbPage to */
1771){
1772 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
1773 Pgno iDbPage = pDbPage->pgno;
1774 Pager *pPager = pBt->pPager;
1775 int rc;
1776
1777 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
1778 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
1779
1780 /* Move page iDbPage from it's current location to page number iFreePage */
1781 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
1782 iDbPage, iFreePage, iPtrPage, eType));
1783 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage);
1784 if( rc!=SQLITE_OK ){
1785 return rc;
1786 }
1787 pDbPage->pgno = iFreePage;
1788
1789 /* If pDbPage was a btree-page, then it may have child pages and/or cells
1790 ** that point to overflow pages. The pointer map entries for all these
1791 ** pages need to be changed.
1792 **
1793 ** If pDbPage is an overflow page, then the first 4 bytes may store a
1794 ** pointer to a subsequent overflow page. If this is the case, then
1795 ** the pointer map needs to be updated for the subsequent overflow page.
1796 */
1797 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
1798 rc = setChildPtrmaps(pDbPage);
1799 if( rc!=SQLITE_OK ){
1800 return rc;
1801 }
1802 }else{
1803 Pgno nextOvfl = get4byte(pDbPage->aData);
1804 if( nextOvfl!=0 ){
1805 rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
1806 if( rc!=SQLITE_OK ){
1807 return rc;
1808 }
1809 }
1810 }
1811
1812 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
1813 ** that it points at iFreePage. Also fix the pointer map entry for
1814 ** iPtrPage.
1815 */
1816 if( eType!=PTRMAP_ROOTPAGE ){
1817 rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
1818 if( rc!=SQLITE_OK ){
1819 return rc;
1820 }
1821 rc = sqlite3PagerWrite(pPtrPage->pDbPage);
1822 if( rc!=SQLITE_OK ){
1823 releasePage(pPtrPage);
1824 return rc;
1825 }
1826 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
1827 releasePage(pPtrPage);
1828 if( rc==SQLITE_OK ){
1829 rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
1830 }
1831 }
1832 return rc;
1833}
1834
1835/* Forward declaration required by incrVacuumStep(). */
1836static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
1837
1838/*
1839** Perform a single step of an incremental-vacuum. If successful,
1840** return SQLITE_OK. If there is no work to do (and therefore no
1841** point in calling this function again), return SQLITE_DONE.
1842**
1843** More specificly, this function attempts to re-organize the
1844** database so that the last page of the file currently in use
1845** is no longer in use.
1846**
1847** If the nFin parameter is non-zero, the implementation assumes
1848** that the caller will keep calling incrVacuumStep() until
1849** it returns SQLITE_DONE or an error, and that nFin is the
1850** number of pages the database file will contain after this
1851** process is complete.
1852*/
1853static int incrVacuumStep(BtShared *pBt, Pgno nFin){
1854 Pgno iLastPg; /* Last page in the database */
1855 Pgno nFreeList; /* Number of pages still on the free-list */
1856
1857 iLastPg = pBt->nTrunc;
1858 if( iLastPg==0 ){
1859 iLastPg = sqlite3PagerPagecount(pBt->pPager);
1860 }
1861
1862 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
1863 int rc;
1864 u8 eType;
1865 Pgno iPtrPage;
1866
1867 nFreeList = get4byte(&pBt->pPage1->aData[36]);
1868 if( nFreeList==0 || nFin==iLastPg ){
1869 return SQLITE_DONE;
1870 }
1871
1872 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
1873 if( rc!=SQLITE_OK ){
1874 return rc;
1875 }
1876 if( eType==PTRMAP_ROOTPAGE ){
1877 return SQLITE_CORRUPT_BKPT;
1878 }
1879
1880 if( eType==PTRMAP_FREEPAGE ){
1881 if( nFin==0 ){
1882 /* Remove the page from the files free-list. This is not required
1883 ** if nFin is non-zero. In that case, the free-list will be
1884 ** truncated to zero after this function returns, so it doesn't
1885 ** matter if it still contains some garbage entries.
1886 */
1887 Pgno iFreePg;
1888 MemPage *pFreePg;
1889 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
1890 if( rc!=SQLITE_OK ){
1891 return rc;
1892 }
1893 assert( iFreePg==iLastPg );
1894 releasePage(pFreePg);
1895 }
1896 } else {
1897 Pgno iFreePg; /* Index of free page to move pLastPg to */
1898 MemPage *pLastPg;
1899
1900 rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
1901 if( rc!=SQLITE_OK ){
1902 return rc;
1903 }
1904
1905 /* If nFin is zero, this loop runs exactly once and page pLastPg
1906 ** is swapped with the first free page pulled off the free list.
1907 **
1908 ** On the other hand, if nFin is greater than zero, then keep
1909 ** looping until a free-page located within the first nFin pages
1910 ** of the file is found.
1911 */
1912 do {
1913 MemPage *pFreePg;
1914 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
1915 if( rc!=SQLITE_OK ){
1916 releasePage(pLastPg);
1917 return rc;
1918 }
1919 releasePage(pFreePg);
1920 }while( nFin!=0 && iFreePg>nFin );
1921 assert( iFreePg<iLastPg );
1922
1923 rc = sqlite3PagerWrite(pLastPg->pDbPage);
1924 if( rc!=SQLITE_OK ){
1925 return rc;
1926 }
1927 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg);
1928 releasePage(pLastPg);
1929 if( rc!=SQLITE_OK ){
1930 return rc;
1931 }
1932 }
1933 }
1934
1935 pBt->nTrunc = iLastPg - 1;
1936 while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
1937 pBt->nTrunc--;
1938 }
1939 return SQLITE_OK;
1940}
1941
1942/*
1943** A write-transaction must be opened before calling this function.
1944** It performs a single unit of work towards an incremental vacuum.
1945**
1946** If the incremental vacuum is finished after this function has run,
1947** SQLITE_DONE is returned. If it is not finished, but no error occured,
1948** SQLITE_OK is returned. Otherwise an SQLite error code.
1949*/
1950int sqlite3BtreeIncrVacuum(Btree *p){
1951 BtShared *pBt = p->pBt;
1952 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
1953 if( !pBt->autoVacuum ){
1954 return SQLITE_DONE;
1955 }
1956 invalidateAllOverflowCache(pBt);
1957 return incrVacuumStep(pBt, 0);
1958}
1959
1960/*
1961** This routine is called prior to sqlite3PagerCommit when a transaction
1962** is commited for an auto-vacuum database.
1963**
1964** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
1965** the database file should be truncated to during the commit process.
1966** i.e. the database has been reorganized so that only the first *pnTrunc
1967** pages are in use.
1968*/
1969static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
1970 int rc = SQLITE_OK;
1971 Pager *pPager = pBt->pPager;
1972#ifndef NDEBUG
1973 int nRef = sqlite3PagerRefcount(pPager);
1974#endif
1975
1976 invalidateAllOverflowCache(pBt);
1977 assert(pBt->autoVacuum);
1978 if( !pBt->incrVacuum ){
1979 Pgno nFin = 0;
1980
1981 if( pBt->nTrunc==0 ){
1982 Pgno nFree;
1983 Pgno nPtrmap;
1984 const int pgsz = pBt->pageSize;
1985 Pgno nOrig = sqlite3PagerPagecount(pBt->pPager);
1986
1987 if( PTRMAP_ISPAGE(pBt, nOrig) ){
1988 return SQLITE_CORRUPT_BKPT;
1989 }
1990 if( nOrig==PENDING_BYTE_PAGE(pBt) ){
1991 nOrig--;
1992 }
1993 nFree = get4byte(&pBt->pPage1->aData[36]);
1994 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
1995 nFin = nOrig - nFree - nPtrmap;
1996 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
1997 nFin--;
1998 }
1999 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
2000 nFin--;
2001 }
2002 }
2003
2004 while( rc==SQLITE_OK ){
2005 rc = incrVacuumStep(pBt, nFin);
2006 }
2007 if( rc==SQLITE_DONE ){
2008 assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
2009 rc = SQLITE_OK;
2010 if( pBt->nTrunc ){
2011 sqlite3PagerWrite(pBt->pPage1->pDbPage);
2012 put4byte(&pBt->pPage1->aData[32], 0);
2013 put4byte(&pBt->pPage1->aData[36], 0);
2014 pBt->nTrunc = nFin;
2015 }
2016 }
2017 if( rc!=SQLITE_OK ){
2018 sqlite3PagerRollback(pPager);
2019 }
2020 }
2021
2022 if( rc==SQLITE_OK ){
2023 *pnTrunc = pBt->nTrunc;
2024 pBt->nTrunc = 0;
2025 }
2026 assert( nRef==sqlite3PagerRefcount(pPager) );
2027 return rc;
2028}
2029
2030#endif
2031
2032/*
2033** This routine does the first phase of a two-phase commit. This routine
2034** causes a rollback journal to be created (if it does not already exist)
2035** and populated with enough information so that if a power loss occurs
2036** the database can be restored to its original state by playing back
2037** the journal. Then the contents of the journal are flushed out to
2038** the disk. After the journal is safely on oxide, the changes to the
2039** database are written into the database file and flushed to oxide.
2040** At the end of this call, the rollback journal still exists on the
2041** disk and we are still holding all locks, so the transaction has not
2042** committed. See sqlite3BtreeCommit() for the second phase of the
2043** commit process.
2044**
2045** This call is a no-op if no write-transaction is currently active on pBt.
2046**
2047** Otherwise, sync the database file for the btree pBt. zMaster points to
2048** the name of a master journal file that should be written into the
2049** individual journal file, or is NULL, indicating no master journal file
2050** (single database transaction).
2051**
2052** When this is called, the master journal should already have been
2053** created, populated with this journal pointer and synced to disk.
2054**
2055** Once this is routine has returned, the only thing required to commit
2056** the write-transaction for this database file is to delete the journal.
2057*/
2058int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
2059 int rc = SQLITE_OK;
2060 if( p->inTrans==TRANS_WRITE ){
2061 BtShared *pBt = p->pBt;
2062 Pgno nTrunc = 0;
2063#ifndef SQLITE_OMIT_AUTOVACUUM
2064 if( pBt->autoVacuum ){
2065 rc = autoVacuumCommit(pBt, &nTrunc);
2066 if( rc!=SQLITE_OK ){
2067 return rc;
2068 }
2069 }
2070#endif
2071 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc);
2072 }
2073 return rc;
2074}
2075
2076/*
2077** Commit the transaction currently in progress.
2078**
2079** This routine implements the second phase of a 2-phase commit. The
2080** sqlite3BtreeSync() routine does the first phase and should be invoked
2081** prior to calling this routine. The sqlite3BtreeSync() routine did
2082** all the work of writing information out to disk and flushing the
2083** contents so that they are written onto the disk platter. All this
2084** routine has to do is delete or truncate the rollback journal
2085** (which causes the transaction to commit) and drop locks.
2086**
2087** This will release the write lock on the database file. If there
2088** are no active cursors, it also releases the read lock.
2089*/
2090int sqlite3BtreeCommitPhaseTwo(Btree *p){
2091 BtShared *pBt = p->pBt;
2092
2093 btreeIntegrity(p);
2094
2095 /* If the handle has a write-transaction open, commit the shared-btrees
2096 ** transaction and set the shared state to TRANS_READ.
2097 */
2098 if( p->inTrans==TRANS_WRITE ){
2099 int rc;
2100 assert( pBt->inTransaction==TRANS_WRITE );
2101 assert( pBt->nTransaction>0 );
2102 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
2103 if( rc!=SQLITE_OK ){
2104 return rc;
2105 }
2106 pBt->inTransaction = TRANS_READ;
2107 pBt->inStmt = 0;
2108 }
2109 unlockAllTables(p);
2110
2111 /* If the handle has any kind of transaction open, decrement the transaction
2112 ** count of the shared btree. If the transaction count reaches 0, set
2113 ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
2114 ** will unlock the pager.
2115 */
2116 if( p->inTrans!=TRANS_NONE ){
2117 pBt->nTransaction--;
2118 if( 0==pBt->nTransaction ){
2119 pBt->inTransaction = TRANS_NONE;
2120 }
2121 }
2122
2123 /* Set the handles current transaction state to TRANS_NONE and unlock
2124 ** the pager if this call closed the only read or write transaction.
2125 */
2126 p->inTrans = TRANS_NONE;
2127 unlockBtreeIfUnused(pBt);
2128
2129 btreeIntegrity(p);
2130 return SQLITE_OK;
2131}
2132
2133/*
2134** Do both phases of a commit.
2135*/
2136int sqlite3BtreeCommit(Btree *p){
2137 int rc;
2138 rc = sqlite3BtreeCommitPhaseOne(p, 0);
2139 if( rc==SQLITE_OK ){
2140 rc = sqlite3BtreeCommitPhaseTwo(p);
2141 }
2142 return rc;
2143}
2144
2145#ifndef NDEBUG
2146/*
2147** Return the number of write-cursors open on this handle. This is for use
2148** in assert() expressions, so it is only compiled if NDEBUG is not
2149** defined.
2150*/
2151static int countWriteCursors(BtShared *pBt){
2152 BtCursor *pCur;
2153 int r = 0;
2154 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2155 if( pCur->wrFlag ) r++;
2156 }
2157 return r;
2158}
2159#endif
2160
2161/*
2162** Rollback the transaction in progress. All cursors will be
2163** invalided by this operation. Any attempt to use a cursor
2164** that was open at the beginning of this operation will result
2165** in an error.
2166**
2167** This will release the write lock on the database file. If there
2168** are no active cursors, it also releases the read lock.
2169*/
2170int sqlite3BtreeRollback(Btree *p){
2171 int rc;
2172 BtShared *pBt = p->pBt;
2173 MemPage *pPage1;
2174
2175 rc = saveAllCursors(pBt, 0, 0);
2176#ifndef SQLITE_OMIT_SHARED_CACHE
2177 if( rc!=SQLITE_OK ){
2178 /* This is a horrible situation. An IO or malloc() error occured whilst
2179 ** trying to save cursor positions. If this is an automatic rollback (as
2180 ** the result of a constraint, malloc() failure or IO error) then
2181 ** the cache may be internally inconsistent (not contain valid trees) so
2182 ** we cannot simply return the error to the caller. Instead, abort
2183 ** all queries that may be using any of the cursors that failed to save.
2184 */
2185 while( pBt->pCursor ){
2186 sqlite3 *db = pBt->pCursor->pBtree->pSqlite;
2187 if( db ){
2188 sqlite3AbortOtherActiveVdbes(db, 0);
2189 }
2190 }
2191 }
2192#endif
2193 btreeIntegrity(p);
2194 unlockAllTables(p);
2195
2196 if( p->inTrans==TRANS_WRITE ){
2197 int rc2;
2198
2199#ifndef SQLITE_OMIT_AUTOVACUUM
2200 pBt->nTrunc = 0;
2201#endif
2202
2203 assert( TRANS_WRITE==pBt->inTransaction );
2204 rc2 = sqlite3PagerRollback(pBt->pPager);
2205 if( rc2!=SQLITE_OK ){
2206 rc = rc2;
2207 }
2208
2209 /* The rollback may have destroyed the pPage1->aData value. So
2210 ** call sqlite3BtreeGetPage() on page 1 again to make
2211 ** sure pPage1->aData is set correctly. */
2212 if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
2213 releasePage(pPage1);
2214 }
2215 assert( countWriteCursors(pBt)==0 );
2216 pBt->inTransaction = TRANS_READ;
2217 }
2218
2219 if( p->inTrans!=TRANS_NONE ){
2220 assert( pBt->nTransaction>0 );
2221 pBt->nTransaction--;
2222 if( 0==pBt->nTransaction ){
2223 pBt->inTransaction = TRANS_NONE;
2224 }
2225 }
2226
2227 p->inTrans = TRANS_NONE;
2228 pBt->inStmt = 0;
2229 unlockBtreeIfUnused(pBt);
2230
2231 btreeIntegrity(p);
2232 return rc;
2233}
2234
2235/*
2236** Start a statement subtransaction. The subtransaction can
2237** can be rolled back independently of the main transaction.
2238** You must start a transaction before starting a subtransaction.
2239** The subtransaction is ended automatically if the main transaction
2240** commits or rolls back.
2241**
2242** Only one subtransaction may be active at a time. It is an error to try
2243** to start a new subtransaction if another subtransaction is already active.
2244**
2245** Statement subtransactions are used around individual SQL statements
2246** that are contained within a BEGIN...COMMIT block. If a constraint
2247** error occurs within the statement, the effect of that one statement
2248** can be rolled back without having to rollback the entire transaction.
2249*/
2250int sqlite3BtreeBeginStmt(Btree *p){
2251 int rc;
2252 BtShared *pBt = p->pBt;
2253 if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
2254 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2255 }
2256 assert( pBt->inTransaction==TRANS_WRITE );
2257 rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
2258 pBt->inStmt = 1;
2259 return rc;
2260}
2261
2262
2263/*
2264** Commit the statment subtransaction currently in progress. If no
2265** subtransaction is active, this is a no-op.
2266*/
2267int sqlite3BtreeCommitStmt(Btree *p){
2268 int rc;
2269 BtShared *pBt = p->pBt;
2270 if( pBt->inStmt && !pBt->readOnly ){
2271 rc = sqlite3PagerStmtCommit(pBt->pPager);
2272 }else{
2273 rc = SQLITE_OK;
2274 }
2275 pBt->inStmt = 0;
2276 return rc;
2277}
2278
2279/*
2280** Rollback the active statement subtransaction. If no subtransaction
2281** is active this routine is a no-op.
2282**
2283** All cursors will be invalidated by this operation. Any attempt
2284** to use a cursor that was open at the beginning of this operation
2285** will result in an error.
2286*/
2287int sqlite3BtreeRollbackStmt(Btree *p){
2288 int rc = SQLITE_OK;
2289 BtShared *pBt = p->pBt;
2290 sqlite3MallocDisallow();
2291 if( pBt->inStmt && !pBt->readOnly ){
2292 rc = sqlite3PagerStmtRollback(pBt->pPager);
2293 assert( countWriteCursors(pBt)==0 );
2294 pBt->inStmt = 0;
2295 }
2296 sqlite3MallocAllow();
2297 return rc;
2298}
2299
2300/*
2301** Default key comparison function to be used if no comparison function
2302** is specified on the sqlite3BtreeCursor() call.
2303*/
2304static int dfltCompare(
2305 void *NotUsed, /* User data is not used */
2306 int n1, const void *p1, /* First key to compare */
2307 int n2, const void *p2 /* Second key to compare */
2308){
2309 int c;
2310 c = memcmp(p1, p2, n1<n2 ? n1 : n2);
2311 if( c==0 ){
2312 c = n1 - n2;
2313 }
2314 return c;
2315}
2316
2317/*
2318** Create a new cursor for the BTree whose root is on the page
2319** iTable. The act of acquiring a cursor gets a read lock on
2320** the database file.
2321**
2322** If wrFlag==0, then the cursor can only be used for reading.
2323** If wrFlag==1, then the cursor can be used for reading or for
2324** writing if other conditions for writing are also met. These
2325** are the conditions that must be met in order for writing to
2326** be allowed:
2327**
2328** 1: The cursor must have been opened with wrFlag==1
2329**
2330** 2: Other database connections that share the same pager cache
2331** but which are not in the READ_UNCOMMITTED state may not have
2332** cursors open with wrFlag==0 on the same table. Otherwise
2333** the changes made by this write cursor would be visible to
2334** the read cursors in the other database connection.
2335**
2336** 3: The database must be writable (not on read-only media)
2337**
2338** 4: There must be an active transaction.
2339**
2340** No checking is done to make sure that page iTable really is the
2341** root page of a b-tree. If it is not, then the cursor acquired
2342** will not work correctly.
2343**
2344** The comparison function must be logically the same for every cursor
2345** on a particular table. Changing the comparison function will result
2346** in incorrect operations. If the comparison function is NULL, a
2347** default comparison function is used. The comparison function is
2348** always ignored for INTKEY tables.
2349*/
2350int sqlite3BtreeCursor(
2351 Btree *p, /* The btree */
2352 int iTable, /* Root page of table to open */
2353 int wrFlag, /* 1 to write. 0 read-only */
2354 int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
2355 void *pArg, /* First arg to xCompare() */
2356 BtCursor **ppCur /* Write new cursor here */
2357){
2358 int rc;
2359 BtCursor *pCur;
2360 BtShared *pBt = p->pBt;
2361
2362 *ppCur = 0;
2363 if( wrFlag ){
2364 if( pBt->readOnly ){
2365 return SQLITE_READONLY;
2366 }
2367 if( checkReadLocks(p, iTable, 0) ){
2368 return SQLITE_LOCKED;
2369 }
2370 }
2371
2372 if( pBt->pPage1==0 ){
2373 rc = lockBtreeWithRetry(p);
2374 if( rc!=SQLITE_OK ){
2375 return rc;
2376 }
2377 if( pBt->readOnly && wrFlag ){
2378 return SQLITE_READONLY;
2379 }
2380 }
2381 pCur = sqliteMalloc( sizeof(*pCur) );
2382 if( pCur==0 ){
2383 rc = SQLITE_NOMEM;
2384 goto create_cursor_exception;
2385 }
2386 pCur->pgnoRoot = (Pgno)iTable;
2387 if( iTable==1 && sqlite3PagerPagecount(pBt->pPager)==0 ){
2388 rc = SQLITE_EMPTY;
2389 goto create_cursor_exception;
2390 }
2391 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0);
2392 if( rc!=SQLITE_OK ){
2393 goto create_cursor_exception;
2394 }
2395
2396 /* Now that no other errors can occur, finish filling in the BtCursor
2397 ** variables, link the cursor into the BtShared list and set *ppCur (the
2398 ** output argument to this function).
2399 */
2400 pCur->xCompare = xCmp ? xCmp : dfltCompare;
2401 pCur->pArg = pArg;
2402 pCur->pBtree = p;
2403 pCur->wrFlag = wrFlag;
2404 pCur->pNext = pBt->pCursor;
2405 if( pCur->pNext ){
2406 pCur->pNext->pPrev = pCur;
2407 }
2408 pBt->pCursor = pCur;
2409 pCur->eState = CURSOR_INVALID;
2410 *ppCur = pCur;
2411
2412 return SQLITE_OK;
2413create_cursor_exception:
2414 if( pCur ){
2415 releasePage(pCur->pPage);
2416 sqliteFree(pCur);
2417 }
2418 unlockBtreeIfUnused(pBt);
2419 return rc;
2420}
2421
2422/*
2423** Close a cursor. The read lock on the database file is released
2424** when the last cursor is closed.
2425*/
2426int sqlite3BtreeCloseCursor(BtCursor *pCur){
2427 BtShared *pBt = pCur->pBtree->pBt;
2428 clearCursorPosition(pCur);
2429 if( pCur->pPrev ){
2430 pCur->pPrev->pNext = pCur->pNext;
2431 }else{
2432 pBt->pCursor = pCur->pNext;
2433 }
2434 if( pCur->pNext ){
2435 pCur->pNext->pPrev = pCur->pPrev;
2436 }
2437 releasePage(pCur->pPage);
2438 unlockBtreeIfUnused(pBt);
2439 invalidateOverflowCache(pCur);
2440 sqliteFree(pCur);
2441 return SQLITE_OK;
2442}
2443
2444/*
2445** Make a temporary cursor by filling in the fields of pTempCur.
2446** The temporary cursor is not on the cursor list for the Btree.
2447*/
2448void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
2449 memcpy(pTempCur, pCur, sizeof(*pCur));
2450 pTempCur->pNext = 0;
2451 pTempCur->pPrev = 0;
2452 if( pTempCur->pPage ){
2453 sqlite3PagerRef(pTempCur->pPage->pDbPage);
2454 }
2455}
2456
2457/*
2458** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
2459** function above.
2460*/
2461void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
2462 if( pCur->pPage ){
2463 sqlite3PagerUnref(pCur->pPage->pDbPage);
2464 }
2465}
2466
2467/*
2468** The GET_CELL_INFO() macro. Takes one argument, a pointer to a valid
2469** btree cursor (type BtCursor*). This macro makes sure the BtCursor.info
2470** field of the given cursor is valid. If it is not already valid, call
2471** sqlite3BtreeParseCell() to fill it in.
2472**
2473** BtCursor.info is a cache of the information in the current cell.
2474** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
2475*/
2476#ifndef NDEBUG
2477 static void assertCellInfo(BtCursor *pCur){
2478 CellInfo info;
2479 memset(&info, 0, sizeof(info));
2480 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &info);
2481 assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
2482 }
2483#else
2484 #define assertCellInfo(x)
2485#endif
2486
2487#define GET_CELL_INFO(pCur) \
2488 if( pCur->info.nSize==0 ) \
2489 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info); \
2490 else \
2491 assertCellInfo(pCur);
2492
2493
2494/*
2495** Set *pSize to the size of the buffer needed to hold the value of
2496** the key for the current entry. If the cursor is not pointing
2497** to a valid entry, *pSize is set to 0.
2498**
2499** For a table with the INTKEY flag set, this routine returns the key
2500** itself, not the number of bytes in the key.
2501*/
2502int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
2503 int rc = restoreOrClearCursorPosition(pCur);
2504 if( rc==SQLITE_OK ){
2505 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2506 if( pCur->eState==CURSOR_INVALID ){
2507 *pSize = 0;
2508 }else{
2509 GET_CELL_INFO(pCur);
2510 *pSize = pCur->info.nKey;
2511 }
2512 }
2513 return rc;
2514}
2515
2516/*
2517** Set *pSize to the number of bytes of data in the entry the
2518** cursor currently points to. Always return SQLITE_OK.
2519** Failure is not possible. If the cursor is not currently
2520** pointing to an entry (which can happen, for example, if
2521** the database is empty) then *pSize is set to 0.
2522*/
2523int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
2524 int rc = restoreOrClearCursorPosition(pCur);
2525 if( rc==SQLITE_OK ){
2526 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2527 if( pCur->eState==CURSOR_INVALID ){
2528 /* Not pointing at a valid entry - set *pSize to 0. */
2529 *pSize = 0;
2530 }else{
2531 GET_CELL_INFO(pCur);
2532 *pSize = pCur->info.nData;
2533 }
2534 }
2535 return rc;
2536}
2537
2538/*
2539** Given the page number of an overflow page in the database (parameter
2540** ovfl), this function finds the page number of the next page in the
2541** linked list of overflow pages. If possible, it uses the auto-vacuum
2542** pointer-map data instead of reading the content of page ovfl to do so.
2543**
2544** If an error occurs an SQLite error code is returned. Otherwise:
2545**
2546** Unless pPgnoNext is NULL, the page number of the next overflow
2547** page in the linked list is written to *pPgnoNext. If page ovfl
2548** is the last page in it's linked list, *pPgnoNext is set to zero.
2549**
2550** If ppPage is not NULL, *ppPage is set to the MemPage* handle
2551** for page ovfl. The underlying pager page may have been requested
2552** with the noContent flag set, so the page data accessable via
2553** this handle may not be trusted.
2554*/
2555static int getOverflowPage(
2556 BtShared *pBt,
2557 Pgno ovfl, /* Overflow page */
2558 MemPage **ppPage, /* OUT: MemPage handle */
2559 Pgno *pPgnoNext /* OUT: Next overflow page number */
2560){
2561 Pgno next = 0;
2562 int rc;
2563
2564 /* One of these must not be NULL. Otherwise, why call this function? */
2565 assert(ppPage || pPgnoNext);
2566
2567 /* If pPgnoNext is NULL, then this function is being called to obtain
2568 ** a MemPage* reference only. No page-data is required in this case.
2569 */
2570 if( !pPgnoNext ){
2571 return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
2572 }
2573
2574#ifndef SQLITE_OMIT_AUTOVACUUM
2575 /* Try to find the next page in the overflow list using the
2576 ** autovacuum pointer-map pages. Guess that the next page in
2577 ** the overflow list is page number (ovfl+1). If that guess turns
2578 ** out to be wrong, fall back to loading the data of page
2579 ** number ovfl to determine the next page number.
2580 */
2581 if( pBt->autoVacuum ){
2582 Pgno pgno;
2583 Pgno iGuess = ovfl+1;
2584 u8 eType;
2585
2586 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
2587 iGuess++;
2588 }
2589
2590 if( iGuess<=sqlite3PagerPagecount(pBt->pPager) ){
2591 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
2592 if( rc!=SQLITE_OK ){
2593 return rc;
2594 }
2595 if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
2596 next = iGuess;
2597 }
2598 }
2599 }
2600#endif
2601
2602 if( next==0 || ppPage ){
2603 MemPage *pPage = 0;
2604
2605 rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
2606 assert(rc==SQLITE_OK || pPage==0);
2607 if( next==0 && rc==SQLITE_OK ){
2608 next = get4byte(pPage->aData);
2609 }
2610
2611 if( ppPage ){
2612 *ppPage = pPage;
2613 }else{
2614 releasePage(pPage);
2615 }
2616 }
2617 *pPgnoNext = next;
2618
2619 return rc;
2620}
2621
2622/*
2623** Copy data from a buffer to a page, or from a page to a buffer.
2624**
2625** pPayload is a pointer to data stored on database page pDbPage.
2626** If argument eOp is false, then nByte bytes of data are copied
2627** from pPayload to the buffer pointed at by pBuf. If eOp is true,
2628** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
2629** of data are copied from the buffer pBuf to pPayload.
2630**
2631** SQLITE_OK is returned on success, otherwise an error code.
2632*/
2633static int copyPayload(
2634 void *pPayload, /* Pointer to page data */
2635 void *pBuf, /* Pointer to buffer */
2636 int nByte, /* Number of bytes to copy */
2637 int eOp, /* 0 -> copy from page, 1 -> copy to page */
2638 DbPage *pDbPage /* Page containing pPayload */
2639){
2640 if( eOp ){
2641 /* Copy data from buffer to page (a write operation) */
2642 int rc = sqlite3PagerWrite(pDbPage);
2643 if( rc!=SQLITE_OK ){
2644 return rc;
2645 }
2646 memcpy(pPayload, pBuf, nByte);
2647 }else{
2648 /* Copy data from page to buffer (a read operation) */
2649 memcpy(pBuf, pPayload, nByte);
2650 }
2651 return SQLITE_OK;
2652}
2653
2654/*
2655** This function is used to read or overwrite payload information
2656** for the entry that the pCur cursor is pointing to. If the eOp
2657** parameter is 0, this is a read operation (data copied into
2658** buffer pBuf). If it is non-zero, a write (data copied from
2659** buffer pBuf).
2660**
2661** A total of "amt" bytes are read or written beginning at "offset".
2662** Data is read to or from the buffer pBuf.
2663**
2664** This routine does not make a distinction between key and data.
2665** It just reads or writes bytes from the payload area. Data might
2666** appear on the main page or be scattered out on multiple overflow
2667** pages.
2668**
2669** If the BtCursor.isIncrblobHandle flag is set, and the current
2670** cursor entry uses one or more overflow pages, this function
2671** allocates space for and lazily popluates the overflow page-list
2672** cache array (BtCursor.aOverflow). Subsequent calls use this
2673** cache to make seeking to the supplied offset more efficient.
2674**
2675** Once an overflow page-list cache has been allocated, it may be
2676** invalidated if some other cursor writes to the same table, or if
2677** the cursor is moved to a different row. Additionally, in auto-vacuum
2678** mode, the following events may invalidate an overflow page-list cache.
2679**
2680** * An incremental vacuum,
2681** * A commit in auto_vacuum="full" mode,
2682** * Creating a table (may require moving an overflow page).
2683*/
2684static int accessPayload(
2685 BtCursor *pCur, /* Cursor pointing to entry to read from */
2686 int offset, /* Begin reading this far into payload */
2687 int amt, /* Read this many bytes */
2688 unsigned char *pBuf, /* Write the bytes into this buffer */
2689 int skipKey, /* offset begins at data if this is true */
2690 int eOp /* zero to read. non-zero to write. */
2691){
2692 unsigned char *aPayload;
2693 int rc = SQLITE_OK;
2694 u32 nKey;
2695 int iIdx = 0;
2696 MemPage *pPage = pCur->pPage; /* Btree page of current cursor entry */
2697 BtShared *pBt = pCur->pBtree->pBt; /* Btree this cursor belongs to */
2698
2699 assert( pPage );
2700 assert( pCur->eState==CURSOR_VALID );
2701 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
2702 assert( offset>=0 );
2703
2704 GET_CELL_INFO(pCur);
2705 aPayload = pCur->info.pCell + pCur->info.nHeader;
2706 nKey = (pPage->intKey ? 0 : pCur->info.nKey);
2707
2708 if( skipKey ){
2709 offset += nKey;
2710 }
2711 if( offset+amt > nKey+pCur->info.nData ){
2712 /* Trying to read or write past the end of the data is an error */
2713 return SQLITE_ERROR;
2714 }
2715
2716 /* Check if data must be read/written to/from the btree page itself. */
2717 if( offset<pCur->info.nLocal ){
2718 int a = amt;
2719 if( a+offset>pCur->info.nLocal ){
2720 a = pCur->info.nLocal - offset;
2721 }
2722 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
2723 offset = 0;
2724 pBuf += a;
2725 amt -= a;
2726 }else{
2727 offset -= pCur->info.nLocal;
2728 }
2729
2730 if( rc==SQLITE_OK && amt>0 ){
2731 const int ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
2732 Pgno nextPage;
2733
2734 nextPage = get4byte(&aPayload[pCur->info.nLocal]);
2735
2736#ifndef SQLITE_OMIT_INCRBLOB
2737 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
2738 ** has not been allocated, allocate it now. The array is sized at
2739 ** one entry for each overflow page in the overflow chain. The
2740 ** page number of the first overflow page is stored in aOverflow[0],
2741 ** etc. A value of 0 in the aOverflow[] array means "not yet known"
2742 ** (the cache is lazily populated).
2743 */
2744 if( pCur->isIncrblobHandle && !pCur->aOverflow ){
2745 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
2746 pCur->aOverflow = (Pgno *)sqliteMalloc(sizeof(Pgno)*nOvfl);
2747 if( nOvfl && !pCur->aOverflow ){
2748 rc = SQLITE_NOMEM;
2749 }
2750 }
2751
2752 /* If the overflow page-list cache has been allocated and the
2753 ** entry for the first required overflow page is valid, skip
2754 ** directly to it.
2755 */
2756 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
2757 iIdx = (offset/ovflSize);
2758 nextPage = pCur->aOverflow[iIdx];
2759 offset = (offset%ovflSize);
2760 }
2761#endif
2762
2763 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
2764
2765#ifndef SQLITE_OMIT_INCRBLOB
2766 /* If required, populate the overflow page-list cache. */
2767 if( pCur->aOverflow ){
2768 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
2769 pCur->aOverflow[iIdx] = nextPage;
2770 }
2771#endif
2772
2773 if( offset>=ovflSize ){
2774 /* The only reason to read this page is to obtain the page
2775 ** number for the next page in the overflow chain. The page
2776** data is not required. So first try to lookup the overflow
2777** page-list cache, if any, then fall back to the getOverflowPage()
2778 ** function.
2779 */
2780#ifndef SQLITE_OMIT_INCRBLOB
2781 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
2782 nextPage = pCur->aOverflow[iIdx+1];
2783 } else
2784#endif
2785 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
2786 offset -= ovflSize;
2787 }else{
2788 /* Need to read this page properly. It contains some of the
2789 ** range of data that is being read (eOp==0) or written (eOp!=0).
2790 */
2791 DbPage *pDbPage;
2792 int a = amt;
2793 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
2794 if( rc==SQLITE_OK ){
2795 aPayload = sqlite3PagerGetData(pDbPage);
2796 nextPage = get4byte(aPayload);
2797 if( a + offset > ovflSize ){
2798 a = ovflSize - offset;
2799 }
2800 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
2801 sqlite3PagerUnref(pDbPage);
2802 offset = 0;
2803 amt -= a;
2804 pBuf += a;
2805 }
2806 }
2807 }
2808 }
2809
2810 if( rc==SQLITE_OK && amt>0 ){
2811 return SQLITE_CORRUPT_BKPT;
2812 }
2813 return rc;
2814}
2815
2816/*
2817** Read part of the key associated with cursor pCur. Exactly
2818** "amt" bytes will be transfered into pBuf[]. The transfer
2819** begins at "offset".
2820**
2821** Return SQLITE_OK on success or an error code if anything goes
2822** wrong. An error is returned if "offset+amt" is larger than
2823** the available payload.
2824*/
2825int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
2826 int rc = restoreOrClearCursorPosition(pCur);
2827 if( rc==SQLITE_OK ){
2828 assert( pCur->eState==CURSOR_VALID );
2829 assert( pCur->pPage!=0 );
2830 if( pCur->pPage->intKey ){
2831 return SQLITE_CORRUPT_BKPT;
2832 }
2833 assert( pCur->pPage->intKey==0 );
2834 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
2835 rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
2836 }
2837 return rc;
2838}
2839
2840/*
2841** Read part of the data associated with cursor pCur. Exactly
2842** "amt" bytes will be transfered into pBuf[]. The transfer
2843** begins at "offset".
2844**
2845** Return SQLITE_OK on success or an error code if anything goes
2846** wrong. An error is returned if "offset+amt" is larger than
2847** the available payload.
2848*/
2849int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
2850 int rc = restoreOrClearCursorPosition(pCur);
2851 if( rc==SQLITE_OK ){
2852 assert( pCur->eState==CURSOR_VALID );
2853 assert( pCur->pPage!=0 );
2854 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
2855 rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
2856 }
2857 return rc;
2858}
2859
2860/*
2861** Return a pointer to payload information from the entry that the
2862** pCur cursor is pointing to. The pointer is to the beginning of
2863** the key if skipKey==0 and it points to the beginning of data if
2864** skipKey==1. The number of bytes of available key/data is written
2865** into *pAmt. If *pAmt==0, then the value returned will not be
2866** a valid pointer.
2867**
2868** This routine is an optimization. It is common for the entire key
2869** and data to fit on the local page and for there to be no overflow
2870** pages. When that is so, this routine can be used to access the
2871** key and data without making a copy. If the key and/or data spills
2872** onto overflow pages, then accessPayload() must be used to reassembly
2873** the key/data and copy it into a preallocated buffer.
2874**
2875** The pointer returned by this routine looks directly into the cached
2876** page of the database. The data might change or move the next time
2877** any btree routine is called.
2878*/
2879static const unsigned char *fetchPayload(
2880 BtCursor *pCur, /* Cursor pointing to entry to read from */
2881 int *pAmt, /* Write the number of available bytes here */
2882 int skipKey /* read beginning at data if this is true */
2883){
2884 unsigned char *aPayload;
2885 MemPage *pPage;
2886 u32 nKey;
2887 int nLocal;
2888
2889 assert( pCur!=0 && pCur->pPage!=0 );
2890 assert( pCur->eState==CURSOR_VALID );
2891 pPage = pCur->pPage;
2892 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
2893 GET_CELL_INFO(pCur);
2894 aPayload = pCur->info.pCell;
2895 aPayload += pCur->info.nHeader;
2896 if( pPage->intKey ){
2897 nKey = 0;
2898 }else{
2899 nKey = pCur->info.nKey;
2900 }
2901 if( skipKey ){
2902 aPayload += nKey;
2903 nLocal = pCur->info.nLocal - nKey;
2904 }else{
2905 nLocal = pCur->info.nLocal;
2906 if( nLocal>nKey ){
2907 nLocal = nKey;
2908 }
2909 }
2910 *pAmt = nLocal;
2911 return aPayload;
2912}
2913
2914
2915/*
2916** For the entry that cursor pCur is point to, return as
2917** many bytes of the key or data as are available on the local
2918** b-tree page. Write the number of available bytes into *pAmt.
2919**
2920** The pointer returned is ephemeral. The key/data may move
2921** or be destroyed on the next call to any Btree routine.
2922**
2923** These routines is used to get quick access to key and data
2924** in the common case where no overflow pages are used.
2925*/
2926const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
2927 if( pCur->eState==CURSOR_VALID ){
2928 return (const void*)fetchPayload(pCur, pAmt, 0);
2929 }
2930 return 0;
2931}
2932const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
2933 if( pCur->eState==CURSOR_VALID ){
2934 return (const void*)fetchPayload(pCur, pAmt, 1);
2935 }
2936 return 0;
2937}
2938
2939
2940/*
2941** Move the cursor down to a new child page. The newPgno argument is the
2942** page number of the child page to move to.
2943*/
2944static int moveToChild(BtCursor *pCur, u32 newPgno){
2945 int rc;
2946 MemPage *pNewPage;
2947 MemPage *pOldPage;
2948 BtShared *pBt = pCur->pBtree->pBt;
2949
2950 assert( pCur->eState==CURSOR_VALID );
2951 rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage);
2952 if( rc ) return rc;
2953 pNewPage->idxParent = pCur->idx;
2954 pOldPage = pCur->pPage;
2955 pOldPage->idxShift = 0;
2956 releasePage(pOldPage);
2957 pCur->pPage = pNewPage;
2958 pCur->idx = 0;
2959 pCur->info.nSize = 0;
2960 if( pNewPage->nCell<1 ){
2961 return SQLITE_CORRUPT_BKPT;
2962 }
2963 return SQLITE_OK;
2964}
2965
2966/*
2967** Return true if the page is the virtual root of its table.
2968**
2969** The virtual root page is the root page for most tables. But
2970** for the table rooted on page 1, sometime the real root page
2971** is empty except for the right-pointer. In such cases the
2972** virtual root page is the page that the right-pointer of page
2973** 1 is pointing to.
2974*/
2975int sqlite3BtreeIsRootPage(MemPage *pPage){
2976 MemPage *pParent = pPage->pParent;
2977 if( pParent==0 ) return 1;
2978 if( pParent->pgno>1 ) return 0;
2979 if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1;
2980 return 0;
2981}
2982
2983/*
2984** Move the cursor up to the parent page.
2985**
2986** pCur->idx is set to the cell index that contains the pointer
2987** to the page we are coming from. If we are coming from the
2988** right-most child page then pCur->idx is set to one more than
2989** the largest cell index.
2990*/
2991void sqlite3BtreeMoveToParent(BtCursor *pCur){
2992 MemPage *pParent;
2993 MemPage *pPage;
2994 int idxParent;
2995
2996 assert( pCur->eState==CURSOR_VALID );
2997 pPage = pCur->pPage;
2998 assert( pPage!=0 );
2999 assert( !sqlite3BtreeIsRootPage(pPage) );
3000 pParent = pPage->pParent;
3001 assert( pParent!=0 );
3002 idxParent = pPage->idxParent;
3003 sqlite3PagerRef(pParent->pDbPage);
3004 releasePage(pPage);
3005 pCur->pPage = pParent;
3006 pCur->info.nSize = 0;
3007 assert( pParent->idxShift==0 );
3008 pCur->idx = idxParent;
3009}
3010
3011/*
3012** Move the cursor to the root page
3013*/
3014static int moveToRoot(BtCursor *pCur){
3015 MemPage *pRoot;
3016 int rc = SQLITE_OK;
3017 BtShared *pBt = pCur->pBtree->pBt;
3018
3019 if( pCur->eState==CURSOR_REQUIRESEEK ){
3020 clearCursorPosition(pCur);
3021 }
3022 pRoot = pCur->pPage;
3023 if( pRoot && pRoot->pgno==pCur->pgnoRoot ){
3024 assert( pRoot->isInit );
3025 }else{
3026 if(
3027 SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0))
3028 ){
3029 pCur->eState = CURSOR_INVALID;
3030 return rc;
3031 }
3032 releasePage(pCur->pPage);
3033 pCur->pPage = pRoot;
3034 }
3035 pCur->idx = 0;
3036 pCur->info.nSize = 0;
3037 if( pRoot->nCell==0 && !pRoot->leaf ){
3038 Pgno subpage;
3039 assert( pRoot->pgno==1 );
3040 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
3041 assert( subpage>0 );
3042 pCur->eState = CURSOR_VALID;
3043 rc = moveToChild(pCur, subpage);
3044 }
3045 pCur->eState = ((pCur->pPage->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
3046 return rc;
3047}
3048
3049/*
3050** Move the cursor down to the left-most leaf entry beneath the
3051** entry to which it is currently pointing.
3052**
3053** The left-most leaf is the one with the smallest key - the first
3054** in ascending order.
3055*/
3056static int moveToLeftmost(BtCursor *pCur){
3057 Pgno pgno;
3058 int rc;
3059 MemPage *pPage;
3060
3061 assert( pCur->eState==CURSOR_VALID );
3062 while( !(pPage = pCur->pPage)->leaf ){
3063 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3064 pgno = get4byte(findCell(pPage, pCur->idx));
3065 rc = moveToChild(pCur, pgno);
3066 if( rc ) return rc;
3067 }
3068 return SQLITE_OK;
3069}
3070
3071/*
3072** Move the cursor down to the right-most leaf entry beneath the
3073** page to which it is currently pointing. Notice the difference
3074** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
3075** finds the left-most entry beneath the *entry* whereas moveToRightmost()
3076** finds the right-most entry beneath the *page*.
3077**
3078** The right-most entry is the one with the largest key - the last
3079** key in ascending order.
3080*/
3081static int moveToRightmost(BtCursor *pCur){
3082 Pgno pgno;
3083 int rc;
3084 MemPage *pPage;
3085
3086 assert( pCur->eState==CURSOR_VALID );
3087 while( !(pPage = pCur->pPage)->leaf ){
3088 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3089 pCur->idx = pPage->nCell;
3090 rc = moveToChild(pCur, pgno);
3091 if( rc ) return rc;
3092 }
3093 pCur->idx = pPage->nCell - 1;
3094 pCur->info.nSize = 0;
3095 return SQLITE_OK;
3096}
3097
3098/* Move the cursor to the first entry in the table. Return SQLITE_OK
3099** on success. Set *pRes to 0 if the cursor actually points to something
3100** or set *pRes to 1 if the table is empty.
3101*/
3102int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
3103 int rc;
3104 rc = moveToRoot(pCur);
3105 if( rc ) return rc;
3106 if( pCur->eState==CURSOR_INVALID ){
3107 assert( pCur->pPage->nCell==0 );
3108 *pRes = 1;
3109 return SQLITE_OK;
3110 }
3111 assert( pCur->pPage->nCell>0 );
3112 *pRes = 0;
3113 rc = moveToLeftmost(pCur);
3114 return rc;
3115}
3116
3117/* Move the cursor to the last entry in the table. Return SQLITE_OK
3118** on success. Set *pRes to 0 if the cursor actually points to something
3119** or set *pRes to 1 if the table is empty.
3120*/
3121int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
3122 int rc;
3123 rc = moveToRoot(pCur);
3124 if( rc ) return rc;
3125 if( CURSOR_INVALID==pCur->eState ){
3126 assert( pCur->pPage->nCell==0 );
3127 *pRes = 1;
3128 return SQLITE_OK;
3129 }
3130 assert( pCur->eState==CURSOR_VALID );
3131 *pRes = 0;
3132 rc = moveToRightmost(pCur);
3133 return rc;
3134}
3135
3136/* Move the cursor so that it points to an entry near pKey/nKey.
3137** Return a success code.
3138**
3139** For INTKEY tables, only the nKey parameter is used. pKey is
3140** ignored. For other tables, nKey is the number of bytes of data
3141** in pKey. The comparison function specified when the cursor was
3142** created is used to compare keys.
3143**
3144** If an exact match is not found, then the cursor is always
3145** left pointing at a leaf page which would hold the entry if it
3146** were present. The cursor might point to an entry that comes
3147** before or after the key.
3148**
3149** The result of comparing the key with the entry to which the
3150** cursor is written to *pRes if pRes!=NULL. The meaning of
3151** this value is as follows:
3152**
3153** *pRes<0 The cursor is left pointing at an entry that
3154** is smaller than pKey or if the table is empty
3155** and the cursor is therefore left point to nothing.
3156**
3157** *pRes==0 The cursor is left pointing at an entry that
3158** exactly matches pKey.
3159**
3160** *pRes>0 The cursor is left pointing at an entry that
3161** is larger than pKey.
3162*/
3163int sqlite3BtreeMoveto(
3164 BtCursor *pCur, /* The cursor to be moved */
3165 const void *pKey, /* The key content for indices. Not used by tables */
3166 i64 nKey, /* Size of pKey. Or the key for tables */
3167 int biasRight, /* If true, bias the search to the high end */
3168 int *pRes /* Search result flag */
3169){
3170 int rc;
3171 rc = moveToRoot(pCur);
3172 if( rc ) return rc;
3173 assert( pCur->pPage );
3174 assert( pCur->pPage->isInit );
3175 if( pCur->eState==CURSOR_INVALID ){
3176 *pRes = -1;
3177 assert( pCur->pPage->nCell==0 );
3178 return SQLITE_OK;
3179 }
3180 for(;;){
3181 int lwr, upr;
3182 Pgno chldPg;
3183 MemPage *pPage = pCur->pPage;
3184 int c = -1; /* pRes return if table is empty must be -1 */
3185 lwr = 0;
3186 upr = pPage->nCell-1;
3187 if( !pPage->intKey && pKey==0 ){
3188 return SQLITE_CORRUPT_BKPT;
3189 }
3190 if( biasRight ){
3191 pCur->idx = upr;
3192 }else{
3193 pCur->idx = (upr+lwr)/2;
3194 }
3195 if( lwr<=upr ) for(;;){
3196 void *pCellKey;
3197 i64 nCellKey;
3198 pCur->info.nSize = 0;
3199 if( pPage->intKey ){
3200 u8 *pCell;
3201 pCell = findCell(pPage, pCur->idx) + pPage->childPtrSize;
3202 if( pPage->hasData ){
3203 u32 dummy;
3204 pCell += getVarint32(pCell, &dummy);
3205 }
3206 getVarint(pCell, (u64 *)&nCellKey);
3207 if( nCellKey<nKey ){
3208 c = -1;
3209 }else if( nCellKey>nKey ){
3210 c = +1;
3211 }else{
3212 c = 0;
3213 }
3214 }else{
3215 int available;
3216 pCellKey = (void *)fetchPayload(pCur, &available, 0);
3217 nCellKey = pCur->info.nKey;
3218 if( available>=nCellKey ){
3219 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
3220 }else{
3221 pCellKey = sqliteMallocRaw( nCellKey );
3222 if( pCellKey==0 ) return SQLITE_NOMEM;
3223 rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
3224 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
3225 sqliteFree(pCellKey);
3226 if( rc ) return rc;
3227 }
3228 }
3229 if( c==0 ){
3230 if( pPage->leafData && !pPage->leaf ){
3231 lwr = pCur->idx;
3232 upr = lwr - 1;
3233 break;
3234 }else{
3235 if( pRes ) *pRes = 0;
3236 return SQLITE_OK;
3237 }
3238 }
3239 if( c<0 ){
3240 lwr = pCur->idx+1;
3241 }else{
3242 upr = pCur->idx-1;
3243 }
3244 if( lwr>upr ){
3245 break;
3246 }
3247 pCur->idx = (lwr+upr)/2;
3248 }
3249 assert( lwr==upr+1 );
3250 assert( pPage->isInit );
3251 if( pPage->leaf ){
3252 chldPg = 0;
3253 }else if( lwr>=pPage->nCell ){
3254 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3255 }else{
3256 chldPg = get4byte(findCell(pPage, lwr));
3257 }
3258 if( chldPg==0 ){
3259 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3260 if( pRes ) *pRes = c;
3261 return SQLITE_OK;
3262 }
3263 pCur->idx = lwr;
3264 pCur->info.nSize = 0;
3265 rc = moveToChild(pCur, chldPg);
3266 if( rc ){
3267 return rc;
3268 }
3269 }
3270 /* NOT REACHED */
3271}
3272
3273/*
3274** Return TRUE if the cursor is not pointing at an entry of the table.
3275**
3276** TRUE will be returned after a call to sqlite3BtreeNext() moves
3277** past the last entry in the table or sqlite3BtreePrev() moves past
3278** the first entry. TRUE is also returned if the table is empty.
3279*/
3280int sqlite3BtreeEof(BtCursor *pCur){
3281 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
3282 ** have been deleted? This API will need to change to return an error code
3283 ** as well as the boolean result value.
3284 */
3285 return (CURSOR_VALID!=pCur->eState);
3286}
3287
3288/*
3289** Advance the cursor to the next entry in the database. If
3290** successful then set *pRes=0. If the cursor
3291** was already pointing to the last entry in the database before
3292** this routine was called, then set *pRes=1.
3293*/
3294int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
3295 int rc;
3296 MemPage *pPage;
3297
3298 rc = restoreOrClearCursorPosition(pCur);
3299 if( rc!=SQLITE_OK ){
3300 return rc;
3301 }
3302 assert( pRes!=0 );
3303 pPage = pCur->pPage;
3304 if( CURSOR_INVALID==pCur->eState ){
3305 *pRes = 1;
3306 return SQLITE_OK;
3307 }
3308 if( pCur->skip>0 ){
3309 pCur->skip = 0;
3310 *pRes = 0;
3311 return SQLITE_OK;
3312 }
3313 pCur->skip = 0;
3314
3315 assert( pPage->isInit );
3316 assert( pCur->idx<pPage->nCell );
3317
3318 pCur->idx++;
3319 pCur->info.nSize = 0;
3320 if( pCur->idx>=pPage->nCell ){
3321 if( !pPage->leaf ){
3322 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
3323 if( rc ) return rc;
3324 rc = moveToLeftmost(pCur);
3325 *pRes = 0;
3326 return rc;
3327 }
3328 do{
3329 if( sqlite3BtreeIsRootPage(pPage) ){
3330 *pRes = 1;
3331 pCur->eState = CURSOR_INVALID;
3332 return SQLITE_OK;
3333 }
3334 sqlite3BtreeMoveToParent(pCur);
3335 pPage = pCur->pPage;
3336 }while( pCur->idx>=pPage->nCell );
3337 *pRes = 0;
3338 if( pPage->leafData ){
3339 rc = sqlite3BtreeNext(pCur, pRes);
3340 }else{
3341 rc = SQLITE_OK;
3342 }
3343 return rc;
3344 }
3345 *pRes = 0;
3346 if( pPage->leaf ){
3347 return SQLITE_OK;
3348 }
3349 rc = moveToLeftmost(pCur);
3350 return rc;
3351}
3352
3353/*
3354** Step the cursor to the back to the previous entry in the database. If
3355** successful then set *pRes=0. If the cursor
3356** was already pointing to the first entry in the database before
3357** this routine was called, then set *pRes=1.
3358*/
3359int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
3360 int rc;
3361 Pgno pgno;
3362 MemPage *pPage;
3363
3364 rc = restoreOrClearCursorPosition(pCur);
3365 if( rc!=SQLITE_OK ){
3366 return rc;
3367 }
3368 if( CURSOR_INVALID==pCur->eState ){
3369 *pRes = 1;
3370 return SQLITE_OK;
3371 }
3372 if( pCur->skip<0 ){
3373 pCur->skip = 0;
3374 *pRes = 0;
3375 return SQLITE_OK;
3376 }
3377 pCur->skip = 0;
3378
3379 pPage = pCur->pPage;
3380 assert( pPage->isInit );
3381 assert( pCur->idx>=0 );
3382 if( !pPage->leaf ){
3383 pgno = get4byte( findCell(pPage, pCur->idx) );
3384 rc = moveToChild(pCur, pgno);
3385 if( rc ) return rc;
3386 rc = moveToRightmost(pCur);
3387 }else{
3388 while( pCur->idx==0 ){
3389 if( sqlite3BtreeIsRootPage(pPage) ){
3390 pCur->eState = CURSOR_INVALID;
3391 *pRes = 1;
3392 return SQLITE_OK;
3393 }
3394 sqlite3BtreeMoveToParent(pCur);
3395 pPage = pCur->pPage;
3396 }
3397 pCur->idx--;
3398 pCur->info.nSize = 0;
3399 if( pPage->leafData && !pPage->leaf ){
3400 rc = sqlite3BtreePrevious(pCur, pRes);
3401 }else{
3402 rc = SQLITE_OK;
3403 }
3404 }
3405 *pRes = 0;
3406 return rc;
3407}
3408
3409/*
3410** Allocate a new page from the database file.
3411**
3412** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
3413** has already been called on the new page.) The new page has also
3414** been referenced and the calling routine is responsible for calling
3415** sqlite3PagerUnref() on the new page when it is done.
3416**
3417** SQLITE_OK is returned on success. Any other return value indicates
3418** an error. *ppPage and *pPgno are undefined in the event of an error.
3419** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
3420**
3421** If the "nearby" parameter is not 0, then a (feeble) effort is made to
3422** locate a page close to the page number "nearby". This can be used in an
3423** attempt to keep related pages close to each other in the database file,
3424** which in turn can make database access faster.
3425**
3426** If the "exact" parameter is not 0, and the page-number nearby exists
3427** anywhere on the free-list, then it is guarenteed to be returned. This
3428** is only used by auto-vacuum databases when allocating a new table.
3429*/
3430static int allocateBtreePage(
3431 BtShared *pBt,
3432 MemPage **ppPage,
3433 Pgno *pPgno,
3434 Pgno nearby,
3435 u8 exact
3436){
3437 MemPage *pPage1;
3438 int rc;
3439 int n; /* Number of pages on the freelist */
3440 int k; /* Number of leaves on the trunk of the freelist */
3441 MemPage *pTrunk = 0;
3442 MemPage *pPrevTrunk = 0;
3443
3444 pPage1 = pBt->pPage1;
3445 n = get4byte(&pPage1->aData[36]);
3446 if( n>0 ){
3447 /* There are pages on the freelist. Reuse one of those pages. */
3448 Pgno iTrunk;
3449 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
3450
3451 /* If the 'exact' parameter was true and a query of the pointer-map
3452 ** shows that the page 'nearby' is somewhere on the free-list, then
3453 ** the entire-list will be searched for that page.
3454 */
3455#ifndef SQLITE_OMIT_AUTOVACUUM
3456 if( exact && nearby<=sqlite3PagerPagecount(pBt->pPager) ){
3457 u8 eType;
3458 assert( nearby>0 );
3459 assert( pBt->autoVacuum );
3460 rc = ptrmapGet(pBt, nearby, &eType, 0);
3461 if( rc ) return rc;
3462 if( eType==PTRMAP_FREEPAGE ){
3463 searchList = 1;
3464 }
3465 *pPgno = nearby;
3466 }
3467#endif
3468
3469 /* Decrement the free-list count by 1. Set iTrunk to the index of the
3470 ** first free-list trunk page. iPrevTrunk is initially 1.
3471 */
3472 rc = sqlite3PagerWrite(pPage1->pDbPage);
3473 if( rc ) return rc;
3474 put4byte(&pPage1->aData[36], n-1);
3475
3476 /* The code within this loop is run only once if the 'searchList' variable
3477 ** is not true. Otherwise, it runs once for each trunk-page on the
3478 ** free-list until the page 'nearby' is located.
3479 */
3480 do {
3481 pPrevTrunk = pTrunk;
3482 if( pPrevTrunk ){
3483 iTrunk = get4byte(&pPrevTrunk->aData[0]);
3484 }else{
3485 iTrunk = get4byte(&pPage1->aData[32]);
3486 }
3487 rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
3488 if( rc ){
3489 pTrunk = 0;
3490 goto end_allocate_page;
3491 }
3492
3493 k = get4byte(&pTrunk->aData[4]);
3494 if( k==0 && !searchList ){
3495 /* The trunk has no leaves and the list is not being searched.
3496 ** So extract the trunk page itself and use it as the newly
3497 ** allocated page */
3498 assert( pPrevTrunk==0 );
3499 rc = sqlite3PagerWrite(pTrunk->pDbPage);
3500 if( rc ){
3501 goto end_allocate_page;
3502 }
3503 *pPgno = iTrunk;
3504 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
3505 *ppPage = pTrunk;
3506 pTrunk = 0;
3507 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
3508 }else if( k>pBt->usableSize/4 - 8 ){
3509 /* Value of k is out of range. Database corruption */
3510 rc = SQLITE_CORRUPT_BKPT;
3511 goto end_allocate_page;
3512#ifndef SQLITE_OMIT_AUTOVACUUM
3513 }else if( searchList && nearby==iTrunk ){
3514 /* The list is being searched and this trunk page is the page
3515 ** to allocate, regardless of whether it has leaves.
3516 */
3517 assert( *pPgno==iTrunk );
3518 *ppPage = pTrunk;
3519 searchList = 0;
3520 rc = sqlite3PagerWrite(pTrunk->pDbPage);
3521 if( rc ){
3522 goto end_allocate_page;
3523 }
3524 if( k==0 ){
3525 if( !pPrevTrunk ){
3526 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
3527 }else{
3528 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
3529 }
3530 }else{
3531 /* The trunk page is required by the caller but it contains
3532 ** pointers to free-list leaves. The first leaf becomes a trunk
3533 ** page in this case.
3534 */
3535 MemPage *pNewTrunk;
3536 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
3537 rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
3538 if( rc!=SQLITE_OK ){
3539 goto end_allocate_page;
3540 }
3541 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
3542 if( rc!=SQLITE_OK ){
3543 releasePage(pNewTrunk);
3544 goto end_allocate_page;
3545 }
3546 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
3547 put4byte(&pNewTrunk->aData[4], k-1);
3548 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
3549 releasePage(pNewTrunk);
3550 if( !pPrevTrunk ){
3551 put4byte(&pPage1->aData[32], iNewTrunk);
3552 }else{
3553 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
3554 if( rc ){
3555 goto end_allocate_page;
3556 }
3557 put4byte(&pPrevTrunk->aData[0], iNewTrunk);
3558 }
3559 }
3560 pTrunk = 0;
3561 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
3562#endif
3563 }else{
3564 /* Extract a leaf from the trunk */
3565 int closest;
3566 Pgno iPage;
3567 unsigned char *aData = pTrunk->aData;
3568 rc = sqlite3PagerWrite(pTrunk->pDbPage);
3569 if( rc ){
3570 goto end_allocate_page;
3571 }
3572 if( nearby>0 ){
3573 int i, dist;
3574 closest = 0;
3575 dist = get4byte(&aData[8]) - nearby;
3576 if( dist<0 ) dist = -dist;
3577 for(i=1; i<k; i++){
3578 int d2 = get4byte(&aData[8+i*4]) - nearby;
3579 if( d2<0 ) d2 = -d2;
3580 if( d2<dist ){
3581 closest = i;
3582 dist = d2;
3583 }
3584 }
3585 }else{
3586 closest = 0;
3587 }
3588
3589 iPage = get4byte(&aData[8+closest*4]);
3590 if( !searchList || iPage==nearby ){
3591 *pPgno = iPage;
3592 if( *pPgno>sqlite3PagerPagecount(pBt->pPager) ){
3593 /* Free page off the end of the file */
3594 return SQLITE_CORRUPT_BKPT;
3595 }
3596 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
3597 ": %d more free pages\n",
3598 *pPgno, closest+1, k, pTrunk->pgno, n-1));
3599 if( closest<k-1 ){
3600 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
3601 }
3602 put4byte(&aData[4], k-1);
3603 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
3604 if( rc==SQLITE_OK ){
3605 sqlite3PagerDontRollback((*ppPage)->pDbPage);
3606 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
3607 if( rc!=SQLITE_OK ){
3608 releasePage(*ppPage);
3609 }
3610 }
3611 searchList = 0;
3612 }
3613 }
3614 releasePage(pPrevTrunk);
3615 pPrevTrunk = 0;
3616 }while( searchList );
3617 }else{
3618 /* There are no pages on the freelist, so create a new page at the
3619 ** end of the file */
3620 *pPgno = sqlite3PagerPagecount(pBt->pPager) + 1;
3621
3622#ifndef SQLITE_OMIT_AUTOVACUUM
3623 if( pBt->nTrunc ){
3624 /* An incr-vacuum has already run within this transaction. So the
3625 ** page to allocate is not from the physical end of the file, but
3626 ** at pBt->nTrunc.
3627 */
3628 *pPgno = pBt->nTrunc+1;
3629 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
3630 (*pPgno)++;
3631 }
3632 }
3633 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
3634 /* If *pPgno refers to a pointer-map page, allocate two new pages
3635 ** at the end of the file instead of one. The first allocated page
3636 ** becomes a new pointer-map page, the second is used by the caller.
3637 */
3638 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
3639 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
3640 (*pPgno)++;
3641 }
3642 if( pBt->nTrunc ){
3643 pBt->nTrunc = *pPgno;
3644 }
3645#endif
3646
3647 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
3648 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
3649 if( rc ) return rc;
3650 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
3651 if( rc!=SQLITE_OK ){
3652 releasePage(*ppPage);
3653 }
3654 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
3655 }
3656
3657 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
3658
3659end_allocate_page:
3660 releasePage(pTrunk);
3661 releasePage(pPrevTrunk);
3662 return rc;
3663}
3664
3665/*
3666** Add a page of the database file to the freelist.
3667**
3668** sqlite3PagerUnref() is NOT called for pPage.
3669*/
3670static int freePage(MemPage *pPage){
3671 BtShared *pBt = pPage->pBt;
3672 MemPage *pPage1 = pBt->pPage1;
3673 int rc, n, k;
3674
3675 /* Prepare the page for freeing */
3676 assert( pPage->pgno>1 );
3677 pPage->isInit = 0;
3678 releasePage(pPage->pParent);
3679 pPage->pParent = 0;
3680
3681 /* Increment the free page count on pPage1 */
3682 rc = sqlite3PagerWrite(pPage1->pDbPage);
3683 if( rc ) return rc;
3684 n = get4byte(&pPage1->aData[36]);
3685 put4byte(&pPage1->aData[36], n+1);
3686
3687#ifdef SQLITE_SECURE_DELETE
3688 /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
3689 ** always fully overwrite deleted information with zeros.
3690 */
3691 rc = sqlite3PagerWrite(pPage->pDbPage);
3692 if( rc ) return rc;
3693 memset(pPage->aData, 0, pPage->pBt->pageSize);
3694#endif
3695
3696#ifndef SQLITE_OMIT_AUTOVACUUM
3697 /* If the database supports auto-vacuum, write an entry in the pointer-map
3698 ** to indicate that the page is free.
3699 */
3700 if( pBt->autoVacuum ){
3701 rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
3702 if( rc ) return rc;
3703 }
3704#endif
3705
3706 if( n==0 ){
3707 /* This is the first free page */
3708 rc = sqlite3PagerWrite(pPage->pDbPage);
3709 if( rc ) return rc;
3710 memset(pPage->aData, 0, 8);
3711 put4byte(&pPage1->aData[32], pPage->pgno);
3712 TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
3713 }else{
3714 /* Other free pages already exist. Retrive the first trunk page
3715 ** of the freelist and find out how many leaves it has. */
3716 MemPage *pTrunk;
3717 rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
3718 if( rc ) return rc;
3719 k = get4byte(&pTrunk->aData[4]);
3720 if( k>=pBt->usableSize/4 - 8 ){
3721 /* The trunk is full. Turn the page being freed into a new
3722 ** trunk page with no leaves. */
3723 rc = sqlite3PagerWrite(pPage->pDbPage);
3724 if( rc ) return rc;
3725 put4byte(pPage->aData, pTrunk->pgno);
3726 put4byte(&pPage->aData[4], 0);
3727 put4byte(&pPage1->aData[32], pPage->pgno);
3728 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
3729 pPage->pgno, pTrunk->pgno));
3730 }else{
3731 /* Add the newly freed page as a leaf on the current trunk */
3732 rc = sqlite3PagerWrite(pTrunk->pDbPage);
3733 if( rc==SQLITE_OK ){
3734 put4byte(&pTrunk->aData[4], k+1);
3735 put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
3736#ifndef SQLITE_SECURE_DELETE
3737 sqlite3PagerDontWrite(pPage->pDbPage);
3738#endif
3739 }
3740 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
3741 }
3742 releasePage(pTrunk);
3743 }
3744 return rc;
3745}
3746
3747/*
3748** Free any overflow pages associated with the given Cell.
3749*/
3750static int clearCell(MemPage *pPage, unsigned char *pCell){
3751 BtShared *pBt = pPage->pBt;
3752 CellInfo info;
3753 Pgno ovflPgno;
3754 int rc;
3755 int nOvfl;
3756 int ovflPageSize;
3757
3758 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
3759 if( info.iOverflow==0 ){
3760 return SQLITE_OK; /* No overflow pages. Return without doing anything */
3761 }
3762 ovflPgno = get4byte(&pCell[info.iOverflow]);
3763 ovflPageSize = pBt->usableSize - 4;
3764 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
3765 assert( ovflPgno==0 || nOvfl>0 );
3766 while( nOvfl-- ){
3767 MemPage *pOvfl;
3768 if( ovflPgno==0 || ovflPgno>sqlite3PagerPagecount(pBt->pPager) ){
3769 return SQLITE_CORRUPT_BKPT;
3770 }
3771
3772 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
3773 if( rc ) return rc;
3774 rc = freePage(pOvfl);
3775 sqlite3PagerUnref(pOvfl->pDbPage);
3776 if( rc ) return rc;
3777 }
3778 return SQLITE_OK;
3779}
3780
3781/*
3782** Create the byte sequence used to represent a cell on page pPage
3783** and write that byte sequence into pCell[]. Overflow pages are
3784** allocated and filled in as necessary. The calling procedure
3785** is responsible for making sure sufficient space has been allocated
3786** for pCell[].
3787**
3788** Note that pCell does not necessary need to point to the pPage->aData
3789** area. pCell might point to some temporary storage. The cell will
3790** be constructed in this temporary area then copied into pPage->aData
3791** later.
3792*/
3793static int fillInCell(
3794 MemPage *pPage, /* The page that contains the cell */
3795 unsigned char *pCell, /* Complete text of the cell */
3796 const void *pKey, i64 nKey, /* The key */
3797 const void *pData,int nData, /* The data */
3798 int nZero, /* Extra zero bytes to append to pData */
3799 int *pnSize /* Write cell size here */
3800){
3801 int nPayload;
3802 const u8 *pSrc;
3803 int nSrc, n, rc;
3804 int spaceLeft;
3805 MemPage *pOvfl = 0;
3806 MemPage *pToRelease = 0;
3807 unsigned char *pPrior;
3808 unsigned char *pPayload;
3809 BtShared *pBt = pPage->pBt;
3810 Pgno pgnoOvfl = 0;
3811 int nHeader;
3812 CellInfo info;
3813
3814 /* Fill in the header. */
3815 nHeader = 0;
3816 if( !pPage->leaf ){
3817 nHeader += 4;
3818 }
3819 if( pPage->hasData ){
3820 nHeader += putVarint(&pCell[nHeader], nData+nZero);
3821 }else{
3822 nData = nZero = 0;
3823 }
3824 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
3825 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
3826 assert( info.nHeader==nHeader );
3827 assert( info.nKey==nKey );
3828 assert( info.nData==nData+nZero );
3829
3830 /* Fill in the payload */
3831 nPayload = nData + nZero;
3832 if( pPage->intKey ){
3833 pSrc = pData;
3834 nSrc = nData;
3835 nData = 0;
3836 }else{
3837 nPayload += nKey;
3838 pSrc = pKey;
3839 nSrc = nKey;
3840 }
3841 *pnSize = info.nSize;
3842 spaceLeft = info.nLocal;
3843 pPayload = &pCell[nHeader];
3844 pPrior = &pCell[info.iOverflow];
3845
3846 while( nPayload>0 ){
3847 if( spaceLeft==0 ){
3848 int isExact = 0;
3849#ifndef SQLITE_OMIT_AUTOVACUUM
3850 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
3851 if( pBt->autoVacuum ){
3852 do{
3853 pgnoOvfl++;
3854 } while(
3855 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
3856 );
3857 if( pgnoOvfl>1 ){
3858 /* isExact = 1; */
3859 }
3860 }
3861#endif
3862 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
3863#ifndef SQLITE_OMIT_AUTOVACUUM
3864 /* If the database supports auto-vacuum, and the second or subsequent
3865 ** overflow page is being allocated, add an entry to the pointer-map
3866 ** for that page now.
3867 **
3868 ** If this is the first overflow page, then write a partial entry
3869 ** to the pointer-map. If we write nothing to this pointer-map slot,
3870 ** then the optimistic overflow chain processing in clearCell()
3871 ** may misinterpret the uninitialised values and delete the
3872 ** wrong pages from the database.
3873 */
3874 if( pBt->autoVacuum && rc==SQLITE_OK ){
3875 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
3876 rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
3877 if( rc ){
3878 releasePage(pOvfl);
3879 }
3880 }
3881#endif
3882 if( rc ){
3883 releasePage(pToRelease);
3884 return rc;
3885 }
3886 put4byte(pPrior, pgnoOvfl);
3887 releasePage(pToRelease);
3888 pToRelease = pOvfl;
3889 pPrior = pOvfl->aData;
3890 put4byte(pPrior, 0);
3891 pPayload = &pOvfl->aData[4];
3892 spaceLeft = pBt->usableSize - 4;
3893 }
3894 n = nPayload;
3895 if( n>spaceLeft ) n = spaceLeft;
3896 if( nSrc>0 ){
3897 if( n>nSrc ) n = nSrc;
3898 assert( pSrc );
3899 memcpy(pPayload, pSrc, n);
3900 }else{
3901 memset(pPayload, 0, n);
3902 }
3903 nPayload -= n;
3904 pPayload += n;
3905 pSrc += n;
3906 nSrc -= n;
3907 spaceLeft -= n;
3908 if( nSrc==0 ){
3909 nSrc = nData;
3910 pSrc = pData;
3911 }
3912 }
3913 releasePage(pToRelease);
3914 return SQLITE_OK;
3915}
3916
3917/*
3918** Change the MemPage.pParent pointer on the page whose number is
3919** given in the second argument so that MemPage.pParent holds the
3920** pointer in the third argument.
3921*/
3922static int reparentPage(BtShared *pBt, Pgno pgno, MemPage *pNewParent, int idx){
3923 MemPage *pThis;
3924 DbPage *pDbPage;
3925
3926 assert( pNewParent!=0 );
3927 if( pgno==0 ) return SQLITE_OK;
3928 assert( pBt->pPager!=0 );
3929 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
3930 if( pDbPage ){
3931 pThis = (MemPage *)sqlite3PagerGetExtra(pDbPage);
3932 if( pThis->isInit ){
3933 assert( pThis->aData==(sqlite3PagerGetData(pDbPage)) );
3934 if( pThis->pParent!=pNewParent ){
3935 if( pThis->pParent ) sqlite3PagerUnref(pThis->pParent->pDbPage);
3936 pThis->pParent = pNewParent;
3937 sqlite3PagerRef(pNewParent->pDbPage);
3938 }
3939 pThis->idxParent = idx;
3940 }
3941 sqlite3PagerUnref(pDbPage);
3942 }
3943
3944#ifndef SQLITE_OMIT_AUTOVACUUM
3945 if( pBt->autoVacuum ){
3946 return ptrmapPut(pBt, pgno, PTRMAP_BTREE, pNewParent->pgno);
3947 }
3948#endif
3949 return SQLITE_OK;
3950}
3951
3952
3953
3954/*
3955** Change the pParent pointer of all children of pPage to point back
3956** to pPage.
3957**
3958** In other words, for every child of pPage, invoke reparentPage()
3959** to make sure that each child knows that pPage is its parent.
3960**
3961** This routine gets called after you memcpy() one page into
3962** another.
3963*/
3964static int reparentChildPages(MemPage *pPage){
3965 int i;
3966 BtShared *pBt = pPage->pBt;
3967 int rc = SQLITE_OK;
3968
3969 if( pPage->leaf ) return SQLITE_OK;
3970
3971 for(i=0; i<pPage->nCell; i++){
3972 u8 *pCell = findCell(pPage, i);
3973 if( !pPage->leaf ){
3974 rc = reparentPage(pBt, get4byte(pCell), pPage, i);
3975 if( rc!=SQLITE_OK ) return rc;
3976 }
3977 }
3978 if( !pPage->leaf ){
3979 rc = reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+8]),
3980 pPage, i);
3981 pPage->idxShift = 0;
3982 }
3983 return rc;
3984}
3985
3986/*
3987** Remove the i-th cell from pPage. This routine effects pPage only.
3988** The cell content is not freed or deallocated. It is assumed that
3989** the cell content has been copied someplace else. This routine just
3990** removes the reference to the cell from pPage.
3991**
3992** "sz" must be the number of bytes in the cell.
3993*/
3994static void dropCell(MemPage *pPage, int idx, int sz){
3995 int i; /* Loop counter */
3996 int pc; /* Offset to cell content of cell being deleted */
3997 u8 *data; /* pPage->aData */
3998 u8 *ptr; /* Used to move bytes around within data[] */
3999
4000 assert( idx>=0 && idx<pPage->nCell );
4001 assert( sz==cellSize(pPage, idx) );
4002 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4003 data = pPage->aData;
4004 ptr = &data[pPage->cellOffset + 2*idx];
4005 pc = get2byte(ptr);
4006 assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
4007 freeSpace(pPage, pc, sz);
4008 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
4009 ptr[0] = ptr[2];
4010 ptr[1] = ptr[3];
4011 }
4012 pPage->nCell--;
4013 put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
4014 pPage->nFree += 2;
4015 pPage->idxShift = 1;
4016}
4017
4018/*
4019** Insert a new cell on pPage at cell index "i". pCell points to the
4020** content of the cell.
4021**
4022** If the cell content will fit on the page, then put it there. If it
4023** will not fit, then make a copy of the cell content into pTemp if
4024** pTemp is not null. Regardless of pTemp, allocate a new entry
4025** in pPage->aOvfl[] and make it point to the cell content (either
4026** in pTemp or the original pCell) and also record its index.
4027** Allocating a new entry in pPage->aCell[] implies that
4028** pPage->nOverflow is incremented.
4029**
4030** If nSkip is non-zero, then do not copy the first nSkip bytes of the
4031** cell. The caller will overwrite them after this function returns. If
4032** nSkip is non-zero, then pCell may not point to an invalid memory location
4033** (but pCell+nSkip is always valid).
4034*/
4035static int insertCell(
4036 MemPage *pPage, /* Page into which we are copying */
4037 int i, /* New cell becomes the i-th cell of the page */
4038 u8 *pCell, /* Content of the new cell */
4039 int sz, /* Bytes of content in pCell */
4040 u8 *pTemp, /* Temp storage space for pCell, if needed */
4041 u8 nSkip /* Do not write the first nSkip bytes of the cell */
4042){
4043 int idx; /* Where to write new cell content in data[] */
4044 int j; /* Loop counter */
4045 int top; /* First byte of content for any cell in data[] */
4046 int end; /* First byte past the last cell pointer in data[] */
4047 int ins; /* Index in data[] where new cell pointer is inserted */
4048 int hdr; /* Offset into data[] of the page header */
4049 int cellOffset; /* Address of first cell pointer in data[] */
4050 u8 *data; /* The content of the whole page */
4051 u8 *ptr; /* Used for moving information around in data[] */
4052
4053 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
4054 assert( sz==cellSizePtr(pPage, pCell) );
4055 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4056 if( pPage->nOverflow || sz+2>pPage->nFree ){
4057 if( pTemp ){
4058 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
4059 pCell = pTemp;
4060 }
4061 j = pPage->nOverflow++;
4062 assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
4063 pPage->aOvfl[j].pCell = pCell;
4064 pPage->aOvfl[j].idx = i;
4065 pPage->nFree = 0;
4066 }else{
4067 data = pPage->aData;
4068 hdr = pPage->hdrOffset;
4069 top = get2byte(&data[hdr+5]);
4070 cellOffset = pPage->cellOffset;
4071 end = cellOffset + 2*pPage->nCell + 2;
4072 ins = cellOffset + 2*i;
4073 if( end > top - sz ){
4074 int rc = defragmentPage(pPage);
4075 if( rc!=SQLITE_OK ) return rc;
4076 top = get2byte(&data[hdr+5]);
4077 assert( end + sz <= top );
4078 }
4079 idx = allocateSpace(pPage, sz);
4080 assert( idx>0 );
4081 assert( end <= get2byte(&data[hdr+5]) );
4082 pPage->nCell++;
4083 pPage->nFree -= 2;
4084 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
4085 for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
4086 ptr[0] = ptr[-2];
4087 ptr[1] = ptr[-1];
4088 }
4089 put2byte(&data[ins], idx);
4090 put2byte(&data[hdr+3], pPage->nCell);
4091 pPage->idxShift = 1;
4092#ifndef SQLITE_OMIT_AUTOVACUUM
4093 if( pPage->pBt->autoVacuum ){
4094 /* The cell may contain a pointer to an overflow page. If so, write
4095 ** the entry for the overflow page into the pointer map.
4096 */
4097 CellInfo info;
4098 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4099 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
4100 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
4101 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
4102 int rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
4103 if( rc!=SQLITE_OK ) return rc;
4104 }
4105 }
4106#endif
4107 }
4108
4109 return SQLITE_OK;
4110}
4111
4112/*
4113** Add a list of cells to a page. The page should be initially empty.
4114** The cells are guaranteed to fit on the page.
4115*/
4116static void assemblePage(
4117 MemPage *pPage, /* The page to be assemblied */
4118 int nCell, /* The number of cells to add to this page */
4119 u8 **apCell, /* Pointers to cell bodies */
4120 int *aSize /* Sizes of the cells */
4121){
4122 int i; /* Loop counter */
4123 int totalSize; /* Total size of all cells */
4124 int hdr; /* Index of page header */
4125 int cellptr; /* Address of next cell pointer */
4126 int cellbody; /* Address of next cell body */
4127 u8 *data; /* Data for the page */
4128
4129 assert( pPage->nOverflow==0 );
4130 totalSize = 0;
4131 for(i=0; i<nCell; i++){
4132 totalSize += aSize[i];
4133 }
4134 assert( totalSize+2*nCell<=pPage->nFree );
4135 assert( pPage->nCell==0 );
4136 cellptr = pPage->cellOffset;
4137 data = pPage->aData;
4138 hdr = pPage->hdrOffset;
4139 put2byte(&data[hdr+3], nCell);
4140 if( nCell ){
4141 cellbody = allocateSpace(pPage, totalSize);
4142 assert( cellbody>0 );
4143 assert( pPage->nFree >= 2*nCell );
4144 pPage->nFree -= 2*nCell;
4145 for(i=0; i<nCell; i++){
4146 put2byte(&data[cellptr], cellbody);
4147 memcpy(&data[cellbody], apCell[i], aSize[i]);
4148 cellptr += 2;
4149 cellbody += aSize[i];
4150 }
4151 assert( cellbody==pPage->pBt->usableSize );
4152 }
4153 pPage->nCell = nCell;
4154}
4155
4156/*
4157** The following parameters determine how many adjacent pages get involved
4158** in a balancing operation. NN is the number of neighbors on either side
4159** of the page that participate in the balancing operation. NB is the
4160** total number of pages that participate, including the target page and
4161** NN neighbors on either side.
4162**
4163** The minimum value of NN is 1 (of course). Increasing NN above 1
4164** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
4165** in exchange for a larger degradation in INSERT and UPDATE performance.
4166** The value of NN appears to give the best results overall.
4167*/
4168#define NN 1 /* Number of neighbors on either side of pPage */
4169#define NB (NN*2+1) /* Total pages involved in the balance */
4170
4171/* Forward reference */
4172static int balance(MemPage*, int);
4173
4174#ifndef SQLITE_OMIT_QUICKBALANCE
4175/*
4176** This version of balance() handles the common special case where
4177** a new entry is being inserted on the extreme right-end of the
4178** tree, in other words, when the new entry will become the largest
4179** entry in the tree.
4180**
4181** Instead of trying balance the 3 right-most leaf pages, just add
4182** a new page to the right-hand side and put the one new entry in
4183** that page. This leaves the right side of the tree somewhat
4184** unbalanced. But odds are that we will be inserting new entries
4185** at the end soon afterwards so the nearly empty page will quickly
4186** fill up. On average.
4187**
4188** pPage is the leaf page which is the right-most page in the tree.
4189** pParent is its parent. pPage must have a single overflow entry
4190** which is also the right-most entry on the page.
4191*/
4192static int balance_quick(MemPage *pPage, MemPage *pParent){
4193 int rc;
4194 MemPage *pNew;
4195 Pgno pgnoNew;
4196 u8 *pCell;
4197 int szCell;
4198 CellInfo info;
4199 BtShared *pBt = pPage->pBt;
4200 int parentIdx = pParent->nCell; /* pParent new divider cell index */
4201 int parentSize; /* Size of new divider cell */
4202 u8 parentCell[64]; /* Space for the new divider cell */
4203
4204 /* Allocate a new page. Insert the overflow cell from pPage
4205 ** into it. Then remove the overflow cell from pPage.
4206 */
4207 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
4208 if( rc!=SQLITE_OK ){
4209 return rc;
4210 }
4211 pCell = pPage->aOvfl[0].pCell;
4212 szCell = cellSizePtr(pPage, pCell);
4213 zeroPage(pNew, pPage->aData[0]);
4214 assemblePage(pNew, 1, &pCell, &szCell);
4215 pPage->nOverflow = 0;
4216
4217 /* Set the parent of the newly allocated page to pParent. */
4218 pNew->pParent = pParent;
4219 sqlite3PagerRef(pParent->pDbPage);
4220
4221 /* pPage is currently the right-child of pParent. Change this
4222 ** so that the right-child is the new page allocated above and
4223 ** pPage is the next-to-right child.
4224 */
4225 assert( pPage->nCell>0 );
4226 pCell = findCell(pPage, pPage->nCell-1);
4227 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4228 rc = fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
4229 if( rc!=SQLITE_OK ){
4230 return rc;
4231 }
4232 assert( parentSize<64 );
4233 rc = insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
4234 if( rc!=SQLITE_OK ){
4235 return rc;
4236 }
4237 put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
4238 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
4239
4240#ifndef SQLITE_OMIT_AUTOVACUUM
4241 /* If this is an auto-vacuum database, update the pointer map
4242 ** with entries for the new page, and any pointer from the
4243 ** cell on the page to an overflow page.
4244 */
4245 if( pBt->autoVacuum ){
4246 rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
4247 if( rc==SQLITE_OK ){
4248 rc = ptrmapPutOvfl(pNew, 0);
4249 }
4250 if( rc!=SQLITE_OK ){
4251 releasePage(pNew);
4252 return rc;
4253 }
4254 }
4255#endif
4256
4257 /* Release the reference to the new page and balance the parent page,
4258 ** in case the divider cell inserted caused it to become overfull.
4259 */
4260 releasePage(pNew);
4261 return balance(pParent, 0);
4262}
4263#endif /* SQLITE_OMIT_QUICKBALANCE */
4264
4265/*
4266** This routine redistributes Cells on pPage and up to NN*2 siblings
4267** of pPage so that all pages have about the same amount of free space.
4268** Usually NN siblings on either side of pPage is used in the balancing,
4269** though more siblings might come from one side if pPage is the first
4270** or last child of its parent. If pPage has fewer than 2*NN siblings
4271** (something which can only happen if pPage is the root page or a
4272** child of root) then all available siblings participate in the balancing.
4273**
4274** The number of siblings of pPage might be increased or decreased by one or
4275** two in an effort to keep pages nearly full but not over full. The root page
4276** is special and is allowed to be nearly empty. If pPage is
4277** the root page, then the depth of the tree might be increased
4278** or decreased by one, as necessary, to keep the root page from being
4279** overfull or completely empty.
4280**
4281** Note that when this routine is called, some of the Cells on pPage
4282** might not actually be stored in pPage->aData[]. This can happen
4283** if the page is overfull. Part of the job of this routine is to
4284** make sure all Cells for pPage once again fit in pPage->aData[].
4285**
4286** In the course of balancing the siblings of pPage, the parent of pPage
4287** might become overfull or underfull. If that happens, then this routine
4288** is called recursively on the parent.
4289**
4290** If this routine fails for any reason, it might leave the database
4291** in a corrupted state. So if this routine fails, the database should
4292** be rolled back.
4293*/
4294static int balance_nonroot(MemPage *pPage){
4295 MemPage *pParent; /* The parent of pPage */
4296 BtShared *pBt; /* The whole database */
4297 int nCell = 0; /* Number of cells in apCell[] */
4298 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
4299 int nOld; /* Number of pages in apOld[] */
4300 int nNew; /* Number of pages in apNew[] */
4301 int nDiv; /* Number of cells in apDiv[] */
4302 int i, j, k; /* Loop counters */
4303 int idx; /* Index of pPage in pParent->aCell[] */
4304 int nxDiv; /* Next divider slot in pParent->aCell[] */
4305 int rc; /* The return code */
4306 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
4307 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
4308 int usableSpace; /* Bytes in pPage beyond the header */
4309 int pageFlags; /* Value of pPage->aData[0] */
4310 int subtotal; /* Subtotal of bytes in cells on one page */
4311 int iSpace = 0; /* First unused byte of aSpace[] */
4312 MemPage *apOld[NB]; /* pPage and up to two siblings */
4313 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
4314 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
4315 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
4316 Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */
4317 u8 *apDiv[NB]; /* Divider cells in pParent */
4318 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
4319 int szNew[NB+2]; /* Combined size of cells place on i-th page */
4320 u8 **apCell = 0; /* All cells begin balanced */
4321 int *szCell; /* Local size of all cells in apCell[] */
4322 u8 *aCopy[NB]; /* Space for holding data of apCopy[] */
4323 u8 *aSpace; /* Space to hold copies of dividers cells */
4324#ifndef SQLITE_OMIT_AUTOVACUUM
4325 u8 *aFrom = 0;
4326#endif
4327
4328 /*
4329 ** Find the parent page.
4330 */
4331 assert( pPage->isInit );
4332 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4333 pBt = pPage->pBt;
4334 pParent = pPage->pParent;
4335 assert( pParent );
4336 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
4337 return rc;
4338 }
4339 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
4340
4341#ifndef SQLITE_OMIT_QUICKBALANCE
4342 /*
4343 ** A special case: If a new entry has just been inserted into a
4344 ** table (that is, a btree with integer keys and all data at the leaves)
4345 ** and the new entry is the right-most entry in the tree (it has the
4346 ** largest key) then use the special balance_quick() routine for
4347 ** balancing. balance_quick() is much faster and results in a tighter
4348 ** packing of data in the common case.
4349 */
4350 if( pPage->leaf &&
4351 pPage->intKey &&
4352 pPage->leafData &&
4353 pPage->nOverflow==1 &&
4354 pPage->aOvfl[0].idx==pPage->nCell &&
4355 pPage->pParent->pgno!=1 &&
4356 get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
4357 ){
4358 /*
4359 ** TODO: Check the siblings to the left of pPage. It may be that
4360 ** they are not full and no new page is required.
4361 */
4362 return balance_quick(pPage, pParent);
4363 }
4364#endif
4365
4366 /*
4367 ** Find the cell in the parent page whose left child points back
4368 ** to pPage. The "idx" variable is the index of that cell. If pPage
4369 ** is the rightmost child of pParent then set idx to pParent->nCell
4370 */
4371 if( pParent->idxShift ){
4372 Pgno pgno;
4373 pgno = pPage->pgno;
4374 assert( pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
4375 for(idx=0; idx<pParent->nCell; idx++){
4376 if( get4byte(findCell(pParent, idx))==pgno ){
4377 break;
4378 }
4379 }
4380 assert( idx<pParent->nCell
4381 || get4byte(&pParent->aData[pParent->hdrOffset+8])==pgno );
4382 }else{
4383 idx = pPage->idxParent;
4384 }
4385
4386 /*
4387 ** Initialize variables so that it will be safe to jump
4388 ** directly to balance_cleanup at any moment.
4389 */
4390 nOld = nNew = 0;
4391 sqlite3PagerRef(pParent->pDbPage);
4392
4393 /*
4394 ** Find sibling pages to pPage and the cells in pParent that divide
4395 ** the siblings. An attempt is made to find NN siblings on either
4396 ** side of pPage. More siblings are taken from one side, however, if
4397 ** pPage there are fewer than NN siblings on the other side. If pParent
4398 ** has NB or fewer children then all children of pParent are taken.
4399 */
4400 nxDiv = idx - NN;
4401 if( nxDiv + NB > pParent->nCell ){
4402 nxDiv = pParent->nCell - NB + 1;
4403 }
4404 if( nxDiv<0 ){
4405 nxDiv = 0;
4406 }
4407 nDiv = 0;
4408 for(i=0, k=nxDiv; i<NB; i++, k++){
4409 if( k<pParent->nCell ){
4410 apDiv[i] = findCell(pParent, k);
4411 nDiv++;
4412 assert( !pParent->leaf );
4413 pgnoOld[i] = get4byte(apDiv[i]);
4414 }else if( k==pParent->nCell ){
4415 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
4416 }else{
4417 break;
4418 }
4419 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
4420 if( rc ) goto balance_cleanup;
4421 apOld[i]->idxParent = k;
4422 apCopy[i] = 0;
4423 assert( i==nOld );
4424 nOld++;
4425 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
4426 }
4427
4428 /* Make nMaxCells a multiple of 2 in order to preserve 8-byte
4429 ** alignment */
4430 nMaxCells = (nMaxCells + 1)&~1;
4431
4432 /*
4433 ** Allocate space for memory structures
4434 */
4435 apCell = sqliteMallocRaw(
4436 nMaxCells*sizeof(u8*) /* apCell */
4437 + nMaxCells*sizeof(int) /* szCell */
4438 + ROUND8(sizeof(MemPage))*NB /* aCopy */
4439 + pBt->pageSize*(5+NB) /* aSpace */
4440 + (ISAUTOVACUUM ? nMaxCells : 0) /* aFrom */
4441 );
4442 if( apCell==0 ){
4443 rc = SQLITE_NOMEM;
4444 goto balance_cleanup;
4445 }
4446 szCell = (int*)&apCell[nMaxCells];
4447 aCopy[0] = (u8*)&szCell[nMaxCells];
4448 assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4449 for(i=1; i<NB; i++){
4450 aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
4451 assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4452 }
4453 aSpace = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
4454 assert( ((aSpace - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4455#ifndef SQLITE_OMIT_AUTOVACUUM
4456 if( pBt->autoVacuum ){
4457 aFrom = &aSpace[5*pBt->pageSize];
4458 }
4459#endif
4460
4461 /*
4462 ** Make copies of the content of pPage and its siblings into aOld[].
4463 ** The rest of this function will use data from the copies rather
4464 ** that the original pages since the original pages will be in the
4465 ** process of being overwritten.
4466 */
4467 for(i=0; i<nOld; i++){
4468 MemPage *p = apCopy[i] = (MemPage*)&aCopy[i][pBt->pageSize];
4469 p->aData = &((u8*)p)[-pBt->pageSize];
4470 memcpy(p->aData, apOld[i]->aData, pBt->pageSize + sizeof(MemPage));
4471 /* The memcpy() above changes the value of p->aData so we have to
4472 ** set it again. */
4473 p->aData = &((u8*)p)[-pBt->pageSize];
4474 }
4475
4476 /*
4477 ** Load pointers to all cells on sibling pages and the divider cells
4478 ** into the local apCell[] array. Make copies of the divider cells
4479 ** into space obtained form aSpace[] and remove the the divider Cells
4480 ** from pParent.
4481 **
4482 ** If the siblings are on leaf pages, then the child pointers of the
4483 ** divider cells are stripped from the cells before they are copied
4484 ** into aSpace[]. In this way, all cells in apCell[] are without
4485 ** child pointers. If siblings are not leaves, then all cell in
4486 ** apCell[] include child pointers. Either way, all cells in apCell[]
4487 ** are alike.
4488 **
4489 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
4490 ** leafData: 1 if pPage holds key+data and pParent holds only keys.
4491 */
4492 nCell = 0;
4493 leafCorrection = pPage->leaf*4;
4494 leafData = pPage->leafData && pPage->leaf;
4495 for(i=0; i<nOld; i++){
4496 MemPage *pOld = apCopy[i];
4497 int limit = pOld->nCell+pOld->nOverflow;
4498 for(j=0; j<limit; j++){
4499 assert( nCell<nMaxCells );
4500 apCell[nCell] = findOverflowCell(pOld, j);
4501 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
4502#ifndef SQLITE_OMIT_AUTOVACUUM
4503 if( pBt->autoVacuum ){
4504 int a;
4505 aFrom[nCell] = i;
4506 for(a=0; a<pOld->nOverflow; a++){
4507 if( pOld->aOvfl[a].pCell==apCell[nCell] ){
4508 aFrom[nCell] = 0xFF;
4509 break;
4510 }
4511 }
4512 }
4513#endif
4514 nCell++;
4515 }
4516 if( i<nOld-1 ){
4517 int sz = cellSizePtr(pParent, apDiv[i]);
4518 if( leafData ){
4519 /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
4520 ** are duplicates of keys on the child pages. We need to remove
4521 ** the divider cells from pParent, but the dividers cells are not
4522 ** added to apCell[] because they are duplicates of child cells.
4523 */
4524 dropCell(pParent, nxDiv, sz);
4525 }else{
4526 u8 *pTemp;
4527 assert( nCell<nMaxCells );
4528 szCell[nCell] = sz;
4529 pTemp = &aSpace[iSpace];
4530 iSpace += sz;
4531 assert( iSpace<=pBt->pageSize*5 );
4532 memcpy(pTemp, apDiv[i], sz);
4533 apCell[nCell] = pTemp+leafCorrection;
4534#ifndef SQLITE_OMIT_AUTOVACUUM
4535 if( pBt->autoVacuum ){
4536 aFrom[nCell] = 0xFF;
4537 }
4538#endif
4539 dropCell(pParent, nxDiv, sz);
4540 szCell[nCell] -= leafCorrection;
4541 assert( get4byte(pTemp)==pgnoOld[i] );
4542 if( !pOld->leaf ){
4543 assert( leafCorrection==0 );
4544 /* The right pointer of the child page pOld becomes the left
4545 ** pointer of the divider cell */
4546 memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
4547 }else{
4548 assert( leafCorrection==4 );
4549 if( szCell[nCell]<4 ){
4550 /* Do not allow any cells smaller than 4 bytes. */
4551 szCell[nCell] = 4;
4552 }
4553 }
4554 nCell++;
4555 }
4556 }
4557 }
4558
4559 /*
4560 ** Figure out the number of pages needed to hold all nCell cells.
4561 ** Store this number in "k". Also compute szNew[] which is the total
4562 ** size of all cells on the i-th page and cntNew[] which is the index
4563 ** in apCell[] of the cell that divides page i from page i+1.
4564 ** cntNew[k] should equal nCell.
4565 **
4566 ** Values computed by this block:
4567 **
4568 ** k: The total number of sibling pages
4569 ** szNew[i]: Spaced used on the i-th sibling page.
4570 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
4571 ** the right of the i-th sibling page.
4572 ** usableSpace: Number of bytes of space available on each sibling.
4573 **
4574 */
4575 usableSpace = pBt->usableSize - 12 + leafCorrection;
4576 for(subtotal=k=i=0; i<nCell; i++){
4577 assert( i<nMaxCells );
4578 subtotal += szCell[i] + 2;
4579 if( subtotal > usableSpace ){
4580 szNew[k] = subtotal - szCell[i];
4581 cntNew[k] = i;
4582 if( leafData ){ i--; }
4583 subtotal = 0;
4584 k++;
4585 }
4586 }
4587 szNew[k] = subtotal;
4588 cntNew[k] = nCell;
4589 k++;
4590
4591 /*
4592 ** The packing computed by the previous block is biased toward the siblings
4593 ** on the left side. The left siblings are always nearly full, while the
4594 ** right-most sibling might be nearly empty. This block of code attempts
4595 ** to adjust the packing of siblings to get a better balance.
4596 **
4597 ** This adjustment is more than an optimization. The packing above might
4598 ** be so out of balance as to be illegal. For example, the right-most
4599 ** sibling might be completely empty. This adjustment is not optional.
4600 */
4601 for(i=k-1; i>0; i--){
4602 int szRight = szNew[i]; /* Size of sibling on the right */
4603 int szLeft = szNew[i-1]; /* Size of sibling on the left */
4604 int r; /* Index of right-most cell in left sibling */
4605 int d; /* Index of first cell to the left of right sibling */
4606
4607 r = cntNew[i-1] - 1;
4608 d = r + 1 - leafData;
4609 assert( d<nMaxCells );
4610 assert( r<nMaxCells );
4611 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
4612 szRight += szCell[d] + 2;
4613 szLeft -= szCell[r] + 2;
4614 cntNew[i-1]--;
4615 r = cntNew[i-1] - 1;
4616 d = r + 1 - leafData;
4617 }
4618 szNew[i] = szRight;
4619 szNew[i-1] = szLeft;
4620 }
4621
4622 /* Either we found one or more cells (cntnew[0])>0) or we are the
4623 ** a virtual root page. A virtual root page is when the real root
4624 ** page is page 1 and we are the only child of that page.
4625 */
4626 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
4627
4628 /*
4629 ** Allocate k new pages. Reuse old pages where possible.
4630 */
4631 assert( pPage->pgno>1 );
4632 pageFlags = pPage->aData[0];
4633 for(i=0; i<k; i++){
4634 MemPage *pNew;
4635 if( i<nOld ){
4636 pNew = apNew[i] = apOld[i];
4637 pgnoNew[i] = pgnoOld[i];
4638 apOld[i] = 0;
4639 rc = sqlite3PagerWrite(pNew->pDbPage);
4640 nNew++;
4641 if( rc ) goto balance_cleanup;
4642 }else{
4643 assert( i>0 );
4644 rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
4645 if( rc ) goto balance_cleanup;
4646 apNew[i] = pNew;
4647 nNew++;
4648 }
4649 zeroPage(pNew, pageFlags);
4650 }
4651
4652 /* Free any old pages that were not reused as new pages.
4653 */
4654 while( i<nOld ){
4655 rc = freePage(apOld[i]);
4656 if( rc ) goto balance_cleanup;
4657 releasePage(apOld[i]);
4658 apOld[i] = 0;
4659 i++;
4660 }
4661
4662 /*
4663 ** Put the new pages in accending order. This helps to
4664 ** keep entries in the disk file in order so that a scan
4665 ** of the table is a linear scan through the file. That
4666 ** in turn helps the operating system to deliver pages
4667 ** from the disk more rapidly.
4668 **
4669 ** An O(n^2) insertion sort algorithm is used, but since
4670 ** n is never more than NB (a small constant), that should
4671 ** not be a problem.
4672 **
4673 ** When NB==3, this one optimization makes the database
4674 ** about 25% faster for large insertions and deletions.
4675 */
4676 for(i=0; i<k-1; i++){
4677 int minV = pgnoNew[i];
4678 int minI = i;
4679 for(j=i+1; j<k; j++){
4680 if( pgnoNew[j]<(unsigned)minV ){
4681 minI = j;
4682 minV = pgnoNew[j];
4683 }
4684 }
4685 if( minI>i ){
4686 int t;
4687 MemPage *pT;
4688 t = pgnoNew[i];
4689 pT = apNew[i];
4690 pgnoNew[i] = pgnoNew[minI];
4691 apNew[i] = apNew[minI];
4692 pgnoNew[minI] = t;
4693 apNew[minI] = pT;
4694 }
4695 }
4696 TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
4697 pgnoOld[0],
4698 nOld>=2 ? pgnoOld[1] : 0,
4699 nOld>=3 ? pgnoOld[2] : 0,
4700 pgnoNew[0], szNew[0],
4701 nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
4702 nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
4703 nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
4704 nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
4705
4706 /*
4707 ** Evenly distribute the data in apCell[] across the new pages.
4708 ** Insert divider cells into pParent as necessary.
4709 */
4710 j = 0;
4711 for(i=0; i<nNew; i++){
4712 /* Assemble the new sibling page. */
4713 MemPage *pNew = apNew[i];
4714 assert( j<nMaxCells );
4715 assert( pNew->pgno==pgnoNew[i] );
4716 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
4717 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
4718 assert( pNew->nOverflow==0 );
4719
4720#ifndef SQLITE_OMIT_AUTOVACUUM
4721 /* If this is an auto-vacuum database, update the pointer map entries
4722 ** that point to the siblings that were rearranged. These can be: left
4723 ** children of cells, the right-child of the page, or overflow pages
4724 ** pointed to by cells.
4725 */
4726 if( pBt->autoVacuum ){
4727 for(k=j; k<cntNew[i]; k++){
4728 assert( k<nMaxCells );
4729 if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
4730 rc = ptrmapPutOvfl(pNew, k-j);
4731 if( rc!=SQLITE_OK ){
4732 goto balance_cleanup;
4733 }
4734 }
4735 }
4736 }
4737#endif
4738
4739 j = cntNew[i];
4740
4741 /* If the sibling page assembled above was not the right-most sibling,
4742 ** insert a divider cell into the parent page.
4743 */
4744 if( i<nNew-1 && j<nCell ){
4745 u8 *pCell;
4746 u8 *pTemp;
4747 int sz;
4748
4749 assert( j<nMaxCells );
4750 pCell = apCell[j];
4751 sz = szCell[j] + leafCorrection;
4752 if( !pNew->leaf ){
4753 memcpy(&pNew->aData[8], pCell, 4);
4754 pTemp = 0;
4755 }else if( leafData ){
4756/* If the tree is a leaf-data tree, and the siblings are leaves,
4757 ** then there is no divider cell in apCell[]. Instead, the divider
4758 ** cell consists of the integer key for the right-most cell of
4759 ** the sibling-page assembled above only.
4760 */
4761 CellInfo info;
4762 j--;
4763 sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
4764 pCell = &aSpace[iSpace];
4765 fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
4766 iSpace += sz;
4767 assert( iSpace<=pBt->pageSize*5 );
4768 pTemp = 0;
4769 }else{
4770 pCell -= 4;
4771 pTemp = &aSpace[iSpace];
4772 iSpace += sz;
4773 assert( iSpace<=pBt->pageSize*5 );
4774 /* Obscure case for non-leaf-data trees: If the cell at pCell was
4775 ** previously stored on a leaf node, and it's reported size was 4
4776 ** bytes, then it may actually be smaller than this
4777 ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
4778 ** any cell). But it's important to pass the correct size to
4779 ** insertCell(), so reparse the cell now.
4780 **
4781 ** Note that this can never happen in an SQLite data file, as all
4782 ** cells are at least 4 bytes. It only happens in b-trees used
4783 ** to evaluate "IN (SELECT ...)" and similar clauses.
4784 */
4785 if( szCell[j]==4 ){
4786 assert(leafCorrection==4);
4787 sz = cellSizePtr(pParent, pCell);
4788 }
4789 }
4790 rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
4791 if( rc!=SQLITE_OK ) goto balance_cleanup;
4792 put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
4793#ifndef SQLITE_OMIT_AUTOVACUUM
4794 /* If this is an auto-vacuum database, and not a leaf-data tree,
4795 ** then update the pointer map with an entry for the overflow page
4796 ** that the cell just inserted points to (if any).
4797 */
4798 if( pBt->autoVacuum && !leafData ){
4799 rc = ptrmapPutOvfl(pParent, nxDiv);
4800 if( rc!=SQLITE_OK ){
4801 goto balance_cleanup;
4802 }
4803 }
4804#endif
4805 j++;
4806 nxDiv++;
4807 }
4808 }
4809 assert( j==nCell );
4810 assert( nOld>0 );
4811 assert( nNew>0 );
4812 if( (pageFlags & PTF_LEAF)==0 ){
4813 memcpy(&apNew[nNew-1]->aData[8], &apCopy[nOld-1]->aData[8], 4);
4814 }
4815 if( nxDiv==pParent->nCell+pParent->nOverflow ){
4816 /* Right-most sibling is the right-most child of pParent */
4817 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
4818 }else{
4819 /* Right-most sibling is the left child of the first entry in pParent
4820 ** past the right-most divider entry */
4821 put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
4822 }
4823
4824 /*
4825 ** Reparent children of all cells.
4826 */
4827 for(i=0; i<nNew; i++){
4828 rc = reparentChildPages(apNew[i]);
4829 if( rc!=SQLITE_OK ) goto balance_cleanup;
4830 }
4831 rc = reparentChildPages(pParent);
4832 if( rc!=SQLITE_OK ) goto balance_cleanup;
4833
4834 /*
4835 ** Balance the parent page. Note that the current page (pPage) might
4836 ** have been added to the freelist so it might no longer be initialized.
4837 ** But the parent page will always be initialized.
4838 */
4839 assert( pParent->isInit );
4840 rc = balance(pParent, 0);
4841
4842 /*
4843 ** Cleanup before returning.
4844 */
4845balance_cleanup:
4846 sqliteFree(apCell);
4847 for(i=0; i<nOld; i++){
4848 releasePage(apOld[i]);
4849 }
4850 for(i=0; i<nNew; i++){
4851 releasePage(apNew[i]);
4852 }
4853 releasePage(pParent);
4854 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
4855 pPage->pgno, nOld, nNew, nCell));
4856 return rc;
4857}
4858
4859/*
4860** This routine is called for the root page of a btree when the root
4861** page contains no cells. This is an opportunity to make the tree
4862** shallower by one level.
4863*/
4864static int balance_shallower(MemPage *pPage){
4865 MemPage *pChild; /* The only child page of pPage */
4866 Pgno pgnoChild; /* Page number for pChild */
4867 int rc = SQLITE_OK; /* Return code from subprocedures */
4868 BtShared *pBt; /* The main BTree structure */
4869 int mxCellPerPage; /* Maximum number of cells per page */
4870 u8 **apCell; /* All cells from pages being balanced */
4871 int *szCell; /* Local size of all cells */
4872
4873 assert( pPage->pParent==0 );
4874 assert( pPage->nCell==0 );
4875 pBt = pPage->pBt;
4876 mxCellPerPage = MX_CELL(pBt);
4877 apCell = sqliteMallocRaw( mxCellPerPage*(sizeof(u8*)+sizeof(int)) );
4878 if( apCell==0 ) return SQLITE_NOMEM;
4879 szCell = (int*)&apCell[mxCellPerPage];
4880 if( pPage->leaf ){
4881 /* The table is completely empty */
4882 TRACE(("BALANCE: empty table %d\n", pPage->pgno));
4883 }else{
4884 /* The root page is empty but has one child. Transfer the
4885 ** information from that one child into the root page if it
4886 ** will fit. This reduces the depth of the tree by one.
4887 **
4888 ** If the root page is page 1, it has less space available than
4889 ** its child (due to the 100 byte header that occurs at the beginning
4890 ** of the database fle), so it might not be able to hold all of the
4891 ** information currently contained in the child. If this is the
4892 ** case, then do not do the transfer. Leave page 1 empty except
4893 ** for the right-pointer to the child page. The child page becomes
4894 ** the virtual root of the tree.
4895 */
4896 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4897 assert( pgnoChild>0 );
4898 assert( pgnoChild<=sqlite3PagerPagecount(pPage->pBt->pPager) );
4899 rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
4900 if( rc ) goto end_shallow_balance;
4901 if( pPage->pgno==1 ){
4902 rc = sqlite3BtreeInitPage(pChild, pPage);
4903 if( rc ) goto end_shallow_balance;
4904 assert( pChild->nOverflow==0 );
4905 if( pChild->nFree>=100 ){
4906 /* The child information will fit on the root page, so do the
4907 ** copy */
4908 int i;
4909 zeroPage(pPage, pChild->aData[0]);
4910 for(i=0; i<pChild->nCell; i++){
4911 apCell[i] = findCell(pChild,i);
4912 szCell[i] = cellSizePtr(pChild, apCell[i]);
4913 }
4914 assemblePage(pPage, pChild->nCell, apCell, szCell);
4915 /* Copy the right-pointer of the child to the parent. */
4916 put4byte(&pPage->aData[pPage->hdrOffset+8],
4917 get4byte(&pChild->aData[pChild->hdrOffset+8]));
4918 freePage(pChild);
4919 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
4920 }else{
4921 /* The child has more information that will fit on the root.
4922 ** The tree is already balanced. Do nothing. */
4923 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
4924 }
4925 }else{
4926 memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
4927 pPage->isInit = 0;
4928 pPage->pParent = 0;
4929 rc = sqlite3BtreeInitPage(pPage, 0);
4930 assert( rc==SQLITE_OK );
4931 freePage(pChild);
4932 TRACE(("BALANCE: transfer child %d into root %d\n",
4933 pChild->pgno, pPage->pgno));
4934 }
4935 rc = reparentChildPages(pPage);
4936 assert( pPage->nOverflow==0 );
4937#ifndef SQLITE_OMIT_AUTOVACUUM
4938 if( pBt->autoVacuum ){
4939 int i;
4940 for(i=0; i<pPage->nCell; i++){
4941 rc = ptrmapPutOvfl(pPage, i);
4942 if( rc!=SQLITE_OK ){
4943 goto end_shallow_balance;
4944 }
4945 }
4946 }
4947#endif
4948 if( rc!=SQLITE_OK ) goto end_shallow_balance;
4949 releasePage(pChild);
4950 }
4951end_shallow_balance:
4952 sqliteFree(apCell);
4953 return rc;
4954}
4955
4956
4957/*
4958** The root page is overfull
4959**
4960** When this happens, Create a new child page and copy the
4961** contents of the root into the child. Then make the root
4962** page an empty page with rightChild pointing to the new
4963** child. Finally, call balance_internal() on the new child
4964** to cause it to split.
4965*/
4966static int balance_deeper(MemPage *pPage){
4967 int rc; /* Return value from subprocedures */
4968 MemPage *pChild; /* Pointer to a new child page */
4969 Pgno pgnoChild; /* Page number of the new child page */
4970 BtShared *pBt; /* The BTree */
4971 int usableSize; /* Total usable size of a page */
4972 u8 *data; /* Content of the parent page */
4973 u8 *cdata; /* Content of the child page */
4974 int hdr; /* Offset to page header in parent */
4975 int brk; /* Offset to content of first cell in parent */
4976
4977 assert( pPage->pParent==0 );
4978 assert( pPage->nOverflow>0 );
4979 pBt = pPage->pBt;
4980 rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
4981 if( rc ) return rc;
4982 assert( sqlite3PagerIswriteable(pChild->pDbPage) );
4983 usableSize = pBt->usableSize;
4984 data = pPage->aData;
4985 hdr = pPage->hdrOffset;
4986 brk = get2byte(&data[hdr+5]);
4987 cdata = pChild->aData;
4988 memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
4989 memcpy(&cdata[brk], &data[brk], usableSize-brk);
4990 assert( pChild->isInit==0 );
4991 rc = sqlite3BtreeInitPage(pChild, pPage);
4992 if( rc ) goto balancedeeper_out;
4993 memcpy(pChild->aOvfl, pPage->aOvfl, pPage->nOverflow*sizeof(pPage->aOvfl[0]));
4994 pChild->nOverflow = pPage->nOverflow;
4995 if( pChild->nOverflow ){
4996 pChild->nFree = 0;
4997 }
4998 assert( pChild->nCell==pPage->nCell );
4999 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
5000 put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
5001 TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
5002#ifndef SQLITE_OMIT_AUTOVACUUM
5003 if( pBt->autoVacuum ){
5004 int i;
5005 rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
5006 if( rc ) goto balancedeeper_out;
5007 for(i=0; i<pChild->nCell; i++){
5008 rc = ptrmapPutOvfl(pChild, i);
5009 if( rc!=SQLITE_OK ){
5010 return rc;
5011 }
5012 }
5013 }
5014#endif
5015 rc = balance_nonroot(pChild);
5016
5017balancedeeper_out:
5018 releasePage(pChild);
5019 return rc;
5020}
5021
5022/*
5023** Decide if the page pPage needs to be balanced. If balancing is
5024** required, call the appropriate balancing routine.
5025*/
5026static int balance(MemPage *pPage, int insert){
5027 int rc = SQLITE_OK;
5028 if( pPage->pParent==0 ){
5029 if( pPage->nOverflow>0 ){
5030 rc = balance_deeper(pPage);
5031 }
5032 if( rc==SQLITE_OK && pPage->nCell==0 ){
5033 rc = balance_shallower(pPage);
5034 }
5035 }else{
5036 if( pPage->nOverflow>0 ||
5037 (!insert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
5038 rc = balance_nonroot(pPage);