aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/libraries/sqlite/win32/btree.c
diff options
context:
space:
mode:
authordan miller2007-10-20 05:34:26 +0000
committerdan miller2007-10-20 05:34:26 +0000
commit354ea97baf765759911f0c56d3ed511350ebe348 (patch)
tree1adf96a98045d24b8741ba02bf21d195e70993ca /libraries/sqlite/win32/btree.c
parentsqlite source (unix build) added to libraries (diff)
downloadopensim-SC-354ea97baf765759911f0c56d3ed511350ebe348.zip
opensim-SC-354ea97baf765759911f0c56d3ed511350ebe348.tar.gz
opensim-SC-354ea97baf765759911f0c56d3ed511350ebe348.tar.bz2
opensim-SC-354ea97baf765759911f0c56d3ed511350ebe348.tar.xz
sqlite 3.5.1 windows source
Diffstat (limited to 'libraries/sqlite/win32/btree.c')
-rwxr-xr-xlibraries/sqlite/win32/btree.c6890
1 files changed, 6890 insertions, 0 deletions
diff --git a/libraries/sqlite/win32/btree.c b/libraries/sqlite/win32/btree.c
new file mode 100755
index 0000000..de8821a
--- /dev/null
+++ b/libraries/sqlite/win32/btree.c
@@ -0,0 +1,6890 @@
1/*
2** 2004 April 6
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** $Id: btree.c,v 1.428 2007/10/03 08:46:44 danielk1977 Exp $
13**
14** This file implements a external (disk-based) database using BTrees.
15** See the header comment on "btreeInt.h" for additional information.
16** Including a description of file format and an overview of operation.
17*/
18#include "btreeInt.h"
19
20/*
21** The header string that appears at the beginning of every
22** SQLite database.
23*/
24static const char zMagicHeader[] = SQLITE_FILE_HEADER;
25
26/*
27** Set this global variable to 1 to enable tracing using the TRACE
28** macro.
29*/
30#if SQLITE_TEST
31int sqlite3_btree_trace=0; /* True to enable tracing */
32#endif
33
34
35
36#ifndef SQLITE_OMIT_SHARED_CACHE
37/*
38** A flag to indicate whether or not shared cache is enabled. Also,
39** a list of BtShared objects that are eligible for participation
40** in shared cache. The variables have file scope during normal builds,
41** but the test harness needs to access these variables so we make them
42** global for test builds.
43*/
44#ifdef SQLITE_TEST
45BtShared *sqlite3SharedCacheList = 0;
46int sqlite3SharedCacheEnabled = 0;
47#else
48static BtShared *sqlite3SharedCacheList = 0;
49static int sqlite3SharedCacheEnabled = 0;
50#endif
51#endif /* SQLITE_OMIT_SHARED_CACHE */
52
53#ifndef SQLITE_OMIT_SHARED_CACHE
54/*
55** Enable or disable the shared pager and schema features.
56**
57** This routine has no effect on existing database connections.
58** The shared cache setting effects only future calls to
59** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
60*/
61int sqlite3_enable_shared_cache(int enable){
62 sqlite3SharedCacheEnabled = enable;
63 return SQLITE_OK;
64}
65#endif
66
67
68/*
69** Forward declaration
70*/
71static int checkReadLocks(Btree*,Pgno,BtCursor*);
72
73
74#ifdef SQLITE_OMIT_SHARED_CACHE
75 /*
76 ** The functions queryTableLock(), lockTable() and unlockAllTables()
77 ** manipulate entries in the BtShared.pLock linked list used to store
78 ** shared-cache table level locks. If the library is compiled with the
79 ** shared-cache feature disabled, then there is only ever one user
80 ** of each BtShared structure and so this locking is not necessary.
81 ** So define the lock related functions as no-ops.
82 */
83 #define queryTableLock(a,b,c) SQLITE_OK
84 #define lockTable(a,b,c) SQLITE_OK
85 #define unlockAllTables(a)
86#endif
87
88#ifndef SQLITE_OMIT_SHARED_CACHE
89/*
90** Query to see if btree handle p may obtain a lock of type eLock
91** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
92** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
93** SQLITE_LOCKED if not.
94*/
95static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
96 BtShared *pBt = p->pBt;
97 BtLock *pIter;
98
99 assert( sqlite3BtreeHoldsMutex(p) );
100
101 /* This is a no-op if the shared-cache is not enabled */
102 if( !p->sharable ){
103 return SQLITE_OK;
104 }
105
106 /* This (along with lockTable()) is where the ReadUncommitted flag is
107 ** dealt with. If the caller is querying for a read-lock and the flag is
108 ** set, it is unconditionally granted - even if there are write-locks
109 ** on the table. If a write-lock is requested, the ReadUncommitted flag
110 ** is not considered.
111 **
112 ** In function lockTable(), if a read-lock is demanded and the
113 ** ReadUncommitted flag is set, no entry is added to the locks list
114 ** (BtShared.pLock).
115 **
116 ** To summarize: If the ReadUncommitted flag is set, then read cursors do
117 ** not create or respect table locks. The locking procedure for a
118 ** write-cursor does not change.
119 */
120 if(
121 !p->pSqlite ||
122 0==(p->pSqlite->flags&SQLITE_ReadUncommitted) ||
123 eLock==WRITE_LOCK ||
124 iTab==MASTER_ROOT
125 ){
126 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
127 if( pIter->pBtree!=p && pIter->iTable==iTab &&
128 (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
129 return SQLITE_LOCKED;
130 }
131 }
132 }
133 return SQLITE_OK;
134}
135#endif /* !SQLITE_OMIT_SHARED_CACHE */
136
137#ifndef SQLITE_OMIT_SHARED_CACHE
138/*
139** Add a lock on the table with root-page iTable to the shared-btree used
140** by Btree handle p. Parameter eLock must be either READ_LOCK or
141** WRITE_LOCK.
142**
143** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
144** SQLITE_NOMEM may also be returned.
145*/
146static int lockTable(Btree *p, Pgno iTable, u8 eLock){
147 BtShared *pBt = p->pBt;
148 BtLock *pLock = 0;
149 BtLock *pIter;
150
151 assert( sqlite3BtreeHoldsMutex(p) );
152
153 /* This is a no-op if the shared-cache is not enabled */
154 if( !p->sharable ){
155 return SQLITE_OK;
156 }
157
158 assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
159
160 /* If the read-uncommitted flag is set and a read-lock is requested,
161 ** return early without adding an entry to the BtShared.pLock list. See
162 ** comment in function queryTableLock() for more info on handling
163 ** the ReadUncommitted flag.
164 */
165 if(
166 (p->pSqlite) &&
167 (p->pSqlite->flags&SQLITE_ReadUncommitted) &&
168 (eLock==READ_LOCK) &&
169 iTable!=MASTER_ROOT
170 ){
171 return SQLITE_OK;
172 }
173
174 /* First search the list for an existing lock on this table. */
175 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
176 if( pIter->iTable==iTable && pIter->pBtree==p ){
177 pLock = pIter;
178 break;
179 }
180 }
181
182 /* If the above search did not find a BtLock struct associating Btree p
183 ** with table iTable, allocate one and link it into the list.
184 */
185 if( !pLock ){
186 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
187 if( !pLock ){
188 return SQLITE_NOMEM;
189 }
190 pLock->iTable = iTable;
191 pLock->pBtree = p;
192 pLock->pNext = pBt->pLock;
193 pBt->pLock = pLock;
194 }
195
196 /* Set the BtLock.eLock variable to the maximum of the current lock
197 ** and the requested lock. This means if a write-lock was already held
198 ** and a read-lock requested, we don't incorrectly downgrade the lock.
199 */
200 assert( WRITE_LOCK>READ_LOCK );
201 if( eLock>pLock->eLock ){
202 pLock->eLock = eLock;
203 }
204
205 return SQLITE_OK;
206}
207#endif /* !SQLITE_OMIT_SHARED_CACHE */
208
209#ifndef SQLITE_OMIT_SHARED_CACHE
210/*
211** Release all the table locks (locks obtained via calls to the lockTable()
212** procedure) held by Btree handle p.
213*/
214static void unlockAllTables(Btree *p){
215 BtLock **ppIter = &p->pBt->pLock;
216
217 assert( sqlite3BtreeHoldsMutex(p) );
218 assert( p->sharable || 0==*ppIter );
219
220 while( *ppIter ){
221 BtLock *pLock = *ppIter;
222 if( pLock->pBtree==p ){
223 *ppIter = pLock->pNext;
224 sqlite3_free(pLock);
225 }else{
226 ppIter = &pLock->pNext;
227 }
228 }
229}
230#endif /* SQLITE_OMIT_SHARED_CACHE */
231
232static void releasePage(MemPage *pPage); /* Forward reference */
233
234/*
235** Verify that the cursor holds a mutex on the BtShared
236*/
237#ifndef NDEBUG
238static int cursorHoldsMutex(BtCursor *p){
239 return sqlite3_mutex_held(p->pBt->mutex);
240}
241#endif
242
243
244#ifndef SQLITE_OMIT_INCRBLOB
245/*
246** Invalidate the overflow page-list cache for cursor pCur, if any.
247*/
248static void invalidateOverflowCache(BtCursor *pCur){
249 assert( cursorHoldsMutex(pCur) );
250 sqlite3_free(pCur->aOverflow);
251 pCur->aOverflow = 0;
252}
253
254/*
255** Invalidate the overflow page-list cache for all cursors opened
256** on the shared btree structure pBt.
257*/
258static void invalidateAllOverflowCache(BtShared *pBt){
259 BtCursor *p;
260 assert( sqlite3_mutex_held(pBt->mutex) );
261 for(p=pBt->pCursor; p; p=p->pNext){
262 invalidateOverflowCache(p);
263 }
264}
265#else
266 #define invalidateOverflowCache(x)
267 #define invalidateAllOverflowCache(x)
268#endif
269
270/*
271** Save the current cursor position in the variables BtCursor.nKey
272** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
273*/
274static int saveCursorPosition(BtCursor *pCur){
275 int rc;
276
277 assert( CURSOR_VALID==pCur->eState );
278 assert( 0==pCur->pKey );
279 assert( cursorHoldsMutex(pCur) );
280
281 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
282
283 /* If this is an intKey table, then the above call to BtreeKeySize()
284 ** stores the integer key in pCur->nKey. In this case this value is
285 ** all that is required. Otherwise, if pCur is not open on an intKey
286 ** table, then malloc space for and store the pCur->nKey bytes of key
287 ** data.
288 */
289 if( rc==SQLITE_OK && 0==pCur->pPage->intKey){
290 void *pKey = sqlite3_malloc(pCur->nKey);
291 if( pKey ){
292 rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
293 if( rc==SQLITE_OK ){
294 pCur->pKey = pKey;
295 }else{
296 sqlite3_free(pKey);
297 }
298 }else{
299 rc = SQLITE_NOMEM;
300 }
301 }
302 assert( !pCur->pPage->intKey || !pCur->pKey );
303
304 if( rc==SQLITE_OK ){
305 releasePage(pCur->pPage);
306 pCur->pPage = 0;
307 pCur->eState = CURSOR_REQUIRESEEK;
308 }
309
310 invalidateOverflowCache(pCur);
311 return rc;
312}
313
314/*
315** Save the positions of all cursors except pExcept open on the table
316** with root-page iRoot. Usually, this is called just before cursor
317** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
318*/
319static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
320 BtCursor *p;
321 assert( sqlite3_mutex_held(pBt->mutex) );
322 assert( pExcept==0 || pExcept->pBt==pBt );
323 for(p=pBt->pCursor; p; p=p->pNext){
324 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
325 p->eState==CURSOR_VALID ){
326 int rc = saveCursorPosition(p);
327 if( SQLITE_OK!=rc ){
328 return rc;
329 }
330 }
331 }
332 return SQLITE_OK;
333}
334
335/*
336** Clear the current cursor position.
337*/
338static void clearCursorPosition(BtCursor *pCur){
339 assert( cursorHoldsMutex(pCur) );
340 sqlite3_free(pCur->pKey);
341 pCur->pKey = 0;
342 pCur->eState = CURSOR_INVALID;
343}
344
345/*
346** Restore the cursor to the position it was in (or as close to as possible)
347** when saveCursorPosition() was called. Note that this call deletes the
348** saved position info stored by saveCursorPosition(), so there can be
349** at most one effective restoreOrClearCursorPosition() call after each
350** saveCursorPosition().
351**
352** If the second argument argument - doSeek - is false, then instead of
353** returning the cursor to it's saved position, any saved position is deleted
354** and the cursor state set to CURSOR_INVALID.
355*/
356int sqlite3BtreeRestoreOrClearCursorPosition(BtCursor *pCur){
357 int rc;
358 assert( cursorHoldsMutex(pCur) );
359 assert( pCur->eState>=CURSOR_REQUIRESEEK );
360 if( pCur->eState==CURSOR_FAULT ){
361 return pCur->skip;
362 }
363#ifndef SQLITE_OMIT_INCRBLOB
364 if( pCur->isIncrblobHandle ){
365 return SQLITE_ABORT;
366 }
367#endif
368 pCur->eState = CURSOR_INVALID;
369 rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
370 if( rc==SQLITE_OK ){
371 sqlite3_free(pCur->pKey);
372 pCur->pKey = 0;
373 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
374 }
375 return rc;
376}
377
378#define restoreOrClearCursorPosition(p) \
379 (p->eState>=CURSOR_REQUIRESEEK ? \
380 sqlite3BtreeRestoreOrClearCursorPosition(p) : \
381 SQLITE_OK)
382
383#ifndef SQLITE_OMIT_AUTOVACUUM
384/*
385** Given a page number of a regular database page, return the page
386** number for the pointer-map page that contains the entry for the
387** input page number.
388*/
389static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
390 int nPagesPerMapPage, iPtrMap, ret;
391 assert( sqlite3_mutex_held(pBt->mutex) );
392 nPagesPerMapPage = (pBt->usableSize/5)+1;
393 iPtrMap = (pgno-2)/nPagesPerMapPage;
394 ret = (iPtrMap*nPagesPerMapPage) + 2;
395 if( ret==PENDING_BYTE_PAGE(pBt) ){
396 ret++;
397 }
398 return ret;
399}
400
401/*
402** Write an entry into the pointer map.
403**
404** This routine updates the pointer map entry for page number 'key'
405** so that it maps to type 'eType' and parent page number 'pgno'.
406** An error code is returned if something goes wrong, otherwise SQLITE_OK.
407*/
408static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
409 DbPage *pDbPage; /* The pointer map page */
410 u8 *pPtrmap; /* The pointer map data */
411 Pgno iPtrmap; /* The pointer map page number */
412 int offset; /* Offset in pointer map page */
413 int rc;
414
415 assert( sqlite3_mutex_held(pBt->mutex) );
416 /* The master-journal page number must never be used as a pointer map page */
417 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
418
419 assert( pBt->autoVacuum );
420 if( key==0 ){
421 return SQLITE_CORRUPT_BKPT;
422 }
423 iPtrmap = PTRMAP_PAGENO(pBt, key);
424 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
425 if( rc!=SQLITE_OK ){
426 return rc;
427 }
428 offset = PTRMAP_PTROFFSET(pBt, key);
429 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
430
431 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
432 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
433 rc = sqlite3PagerWrite(pDbPage);
434 if( rc==SQLITE_OK ){
435 pPtrmap[offset] = eType;
436 put4byte(&pPtrmap[offset+1], parent);
437 }
438 }
439
440 sqlite3PagerUnref(pDbPage);
441 return rc;
442}
443
444/*
445** Read an entry from the pointer map.
446**
447** This routine retrieves the pointer map entry for page 'key', writing
448** the type and parent page number to *pEType and *pPgno respectively.
449** An error code is returned if something goes wrong, otherwise SQLITE_OK.
450*/
451static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
452 DbPage *pDbPage; /* The pointer map page */
453 int iPtrmap; /* Pointer map page index */
454 u8 *pPtrmap; /* Pointer map page data */
455 int offset; /* Offset of entry in pointer map */
456 int rc;
457
458 assert( sqlite3_mutex_held(pBt->mutex) );
459
460 iPtrmap = PTRMAP_PAGENO(pBt, key);
461 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
462 if( rc!=0 ){
463 return rc;
464 }
465 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
466
467 offset = PTRMAP_PTROFFSET(pBt, key);
468 assert( pEType!=0 );
469 *pEType = pPtrmap[offset];
470 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
471
472 sqlite3PagerUnref(pDbPage);
473 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
474 return SQLITE_OK;
475}
476
477#endif /* SQLITE_OMIT_AUTOVACUUM */
478
479/*
480** Given a btree page and a cell index (0 means the first cell on
481** the page, 1 means the second cell, and so forth) return a pointer
482** to the cell content.
483**
484** This routine works only for pages that do not contain overflow cells.
485*/
486#define findCell(pPage, iCell) \
487 ((pPage)->aData + get2byte(&(pPage)->aData[(pPage)->cellOffset+2*(iCell)]))
488#ifdef SQLITE_TEST
489u8 *sqlite3BtreeFindCell(MemPage *pPage, int iCell){
490 assert( iCell>=0 );
491 assert( iCell<get2byte(&pPage->aData[pPage->hdrOffset+3]) );
492 return findCell(pPage, iCell);
493}
494#endif
495
496/*
497** This a more complex version of sqlite3BtreeFindCell() that works for
498** pages that do contain overflow cells. See insert
499*/
500static u8 *findOverflowCell(MemPage *pPage, int iCell){
501 int i;
502 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
503 for(i=pPage->nOverflow-1; i>=0; i--){
504 int k;
505 struct _OvflCell *pOvfl;
506 pOvfl = &pPage->aOvfl[i];
507 k = pOvfl->idx;
508 if( k<=iCell ){
509 if( k==iCell ){
510 return pOvfl->pCell;
511 }
512 iCell--;
513 }
514 }
515 return findCell(pPage, iCell);
516}
517
518/*
519** Parse a cell content block and fill in the CellInfo structure. There
520** are two versions of this function. sqlite3BtreeParseCell() takes a
521** cell index as the second argument and sqlite3BtreeParseCellPtr()
522** takes a pointer to the body of the cell as its second argument.
523**
524** Within this file, the parseCell() macro can be called instead of
525** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
526*/
527void sqlite3BtreeParseCellPtr(
528 MemPage *pPage, /* Page containing the cell */
529 u8 *pCell, /* Pointer to the cell text. */
530 CellInfo *pInfo /* Fill in this structure */
531){
532 int n; /* Number bytes in cell content header */
533 u32 nPayload; /* Number of bytes of cell payload */
534
535 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
536
537 pInfo->pCell = pCell;
538 assert( pPage->leaf==0 || pPage->leaf==1 );
539 n = pPage->childPtrSize;
540 assert( n==4-4*pPage->leaf );
541 if( pPage->hasData ){
542 n += getVarint32(&pCell[n], &nPayload);
543 }else{
544 nPayload = 0;
545 }
546 pInfo->nData = nPayload;
547 if( pPage->intKey ){
548 n += getVarint(&pCell[n], (u64 *)&pInfo->nKey);
549 }else{
550 u32 x;
551 n += getVarint32(&pCell[n], &x);
552 pInfo->nKey = x;
553 nPayload += x;
554 }
555 pInfo->nPayload = nPayload;
556 pInfo->nHeader = n;
557 if( nPayload<=pPage->maxLocal ){
558 /* This is the (easy) common case where the entire payload fits
559 ** on the local page. No overflow is required.
560 */
561 int nSize; /* Total size of cell content in bytes */
562 pInfo->nLocal = nPayload;
563 pInfo->iOverflow = 0;
564 nSize = nPayload + n;
565 if( nSize<4 ){
566 nSize = 4; /* Minimum cell size is 4 */
567 }
568 pInfo->nSize = nSize;
569 }else{
570 /* If the payload will not fit completely on the local page, we have
571 ** to decide how much to store locally and how much to spill onto
572 ** overflow pages. The strategy is to minimize the amount of unused
573 ** space on overflow pages while keeping the amount of local storage
574 ** in between minLocal and maxLocal.
575 **
576 ** Warning: changing the way overflow payload is distributed in any
577 ** way will result in an incompatible file format.
578 */
579 int minLocal; /* Minimum amount of payload held locally */
580 int maxLocal; /* Maximum amount of payload held locally */
581 int surplus; /* Overflow payload available for local storage */
582
583 minLocal = pPage->minLocal;
584 maxLocal = pPage->maxLocal;
585 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
586 if( surplus <= maxLocal ){
587 pInfo->nLocal = surplus;
588 }else{
589 pInfo->nLocal = minLocal;
590 }
591 pInfo->iOverflow = pInfo->nLocal + n;
592 pInfo->nSize = pInfo->iOverflow + 4;
593 }
594}
595#define parseCell(pPage, iCell, pInfo) \
596 sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
597void sqlite3BtreeParseCell(
598 MemPage *pPage, /* Page containing the cell */
599 int iCell, /* The cell index. First cell is 0 */
600 CellInfo *pInfo /* Fill in this structure */
601){
602 parseCell(pPage, iCell, pInfo);
603}
604
605/*
606** Compute the total number of bytes that a Cell needs in the cell
607** data area of the btree-page. The return number includes the cell
608** data header and the local payload, but not any overflow page or
609** the space used by the cell pointer.
610*/
611#ifndef NDEBUG
612static int cellSize(MemPage *pPage, int iCell){
613 CellInfo info;
614 sqlite3BtreeParseCell(pPage, iCell, &info);
615 return info.nSize;
616}
617#endif
618static int cellSizePtr(MemPage *pPage, u8 *pCell){
619 CellInfo info;
620 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
621 return info.nSize;
622}
623
624#ifndef SQLITE_OMIT_AUTOVACUUM
625/*
626** If the cell pCell, part of page pPage contains a pointer
627** to an overflow page, insert an entry into the pointer-map
628** for the overflow page.
629*/
630static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
631 if( pCell ){
632 CellInfo info;
633 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
634 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
635 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
636 Pgno ovfl = get4byte(&pCell[info.iOverflow]);
637 return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
638 }
639 }
640 return SQLITE_OK;
641}
642/*
643** If the cell with index iCell on page pPage contains a pointer
644** to an overflow page, insert an entry into the pointer-map
645** for the overflow page.
646*/
647static int ptrmapPutOvfl(MemPage *pPage, int iCell){
648 u8 *pCell;
649 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
650 pCell = findOverflowCell(pPage, iCell);
651 return ptrmapPutOvflPtr(pPage, pCell);
652}
653#endif
654
655
656/*
657** Defragment the page given. All Cells are moved to the
658** end of the page and all free space is collected into one
659** big FreeBlk that occurs in between the header and cell
660** pointer array and the cell content area.
661*/
662static int defragmentPage(MemPage *pPage){
663 int i; /* Loop counter */
664 int pc; /* Address of a i-th cell */
665 int addr; /* Offset of first byte after cell pointer array */
666 int hdr; /* Offset to the page header */
667 int size; /* Size of a cell */
668 int usableSize; /* Number of usable bytes on a page */
669 int cellOffset; /* Offset to the cell pointer array */
670 int brk; /* Offset to the cell content area */
671 int nCell; /* Number of cells on the page */
672 unsigned char *data; /* The page data */
673 unsigned char *temp; /* Temp area for cell content */
674
675 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
676 assert( pPage->pBt!=0 );
677 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
678 assert( pPage->nOverflow==0 );
679 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
680 temp = sqlite3_malloc( pPage->pBt->pageSize );
681 if( temp==0 ) return SQLITE_NOMEM;
682 data = pPage->aData;
683 hdr = pPage->hdrOffset;
684 cellOffset = pPage->cellOffset;
685 nCell = pPage->nCell;
686 assert( nCell==get2byte(&data[hdr+3]) );
687 usableSize = pPage->pBt->usableSize;
688 brk = get2byte(&data[hdr+5]);
689 memcpy(&temp[brk], &data[brk], usableSize - brk);
690 brk = usableSize;
691 for(i=0; i<nCell; i++){
692 u8 *pAddr; /* The i-th cell pointer */
693 pAddr = &data[cellOffset + i*2];
694 pc = get2byte(pAddr);
695 assert( pc<pPage->pBt->usableSize );
696 size = cellSizePtr(pPage, &temp[pc]);
697 brk -= size;
698 memcpy(&data[brk], &temp[pc], size);
699 put2byte(pAddr, brk);
700 }
701 assert( brk>=cellOffset+2*nCell );
702 put2byte(&data[hdr+5], brk);
703 data[hdr+1] = 0;
704 data[hdr+2] = 0;
705 data[hdr+7] = 0;
706 addr = cellOffset+2*nCell;
707 memset(&data[addr], 0, brk-addr);
708 sqlite3_free(temp);
709 return SQLITE_OK;
710}
711
712/*
713** Allocate nByte bytes of space on a page.
714**
715** Return the index into pPage->aData[] of the first byte of
716** the new allocation. Or return 0 if there is not enough free
717** space on the page to satisfy the allocation request.
718**
719** If the page contains nBytes of free space but does not contain
720** nBytes of contiguous free space, then this routine automatically
721** calls defragementPage() to consolidate all free space before
722** allocating the new chunk.
723*/
724static int allocateSpace(MemPage *pPage, int nByte){
725 int addr, pc, hdr;
726 int size;
727 int nFrag;
728 int top;
729 int nCell;
730 int cellOffset;
731 unsigned char *data;
732
733 data = pPage->aData;
734 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
735 assert( pPage->pBt );
736 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
737 if( nByte<4 ) nByte = 4;
738 if( pPage->nFree<nByte || pPage->nOverflow>0 ) return 0;
739 pPage->nFree -= nByte;
740 hdr = pPage->hdrOffset;
741
742 nFrag = data[hdr+7];
743 if( nFrag<60 ){
744 /* Search the freelist looking for a slot big enough to satisfy the
745 ** space request. */
746 addr = hdr+1;
747 while( (pc = get2byte(&data[addr]))>0 ){
748 size = get2byte(&data[pc+2]);
749 if( size>=nByte ){
750 if( size<nByte+4 ){
751 memcpy(&data[addr], &data[pc], 2);
752 data[hdr+7] = nFrag + size - nByte;
753 return pc;
754 }else{
755 put2byte(&data[pc+2], size-nByte);
756 return pc + size - nByte;
757 }
758 }
759 addr = pc;
760 }
761 }
762
763 /* Allocate memory from the gap in between the cell pointer array
764 ** and the cell content area.
765 */
766 top = get2byte(&data[hdr+5]);
767 nCell = get2byte(&data[hdr+3]);
768 cellOffset = pPage->cellOffset;
769 if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
770 if( defragmentPage(pPage) ) return 0;
771 top = get2byte(&data[hdr+5]);
772 }
773 top -= nByte;
774 assert( cellOffset + 2*nCell <= top );
775 put2byte(&data[hdr+5], top);
776 return top;
777}
778
779/*
780** Return a section of the pPage->aData to the freelist.
781** The first byte of the new free block is pPage->aDisk[start]
782** and the size of the block is "size" bytes.
783**
784** Most of the effort here is involved in coalesing adjacent
785** free blocks into a single big free block.
786*/
787static void freeSpace(MemPage *pPage, int start, int size){
788 int addr, pbegin, hdr;
789 unsigned char *data = pPage->aData;
790
791 assert( pPage->pBt!=0 );
792 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
793 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
794 assert( (start + size)<=pPage->pBt->usableSize );
795 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
796 if( size<4 ) size = 4;
797
798#ifdef SQLITE_SECURE_DELETE
799 /* Overwrite deleted information with zeros when the SECURE_DELETE
800 ** option is enabled at compile-time */
801 memset(&data[start], 0, size);
802#endif
803
804 /* Add the space back into the linked list of freeblocks */
805 hdr = pPage->hdrOffset;
806 addr = hdr + 1;
807 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
808 assert( pbegin<=pPage->pBt->usableSize-4 );
809 assert( pbegin>addr );
810 addr = pbegin;
811 }
812 assert( pbegin<=pPage->pBt->usableSize-4 );
813 assert( pbegin>addr || pbegin==0 );
814 put2byte(&data[addr], start);
815 put2byte(&data[start], pbegin);
816 put2byte(&data[start+2], size);
817 pPage->nFree += size;
818
819 /* Coalesce adjacent free blocks */
820 addr = pPage->hdrOffset + 1;
821 while( (pbegin = get2byte(&data[addr]))>0 ){
822 int pnext, psize;
823 assert( pbegin>addr );
824 assert( pbegin<=pPage->pBt->usableSize-4 );
825 pnext = get2byte(&data[pbegin]);
826 psize = get2byte(&data[pbegin+2]);
827 if( pbegin + psize + 3 >= pnext && pnext>0 ){
828 int frag = pnext - (pbegin+psize);
829 assert( frag<=data[pPage->hdrOffset+7] );
830 data[pPage->hdrOffset+7] -= frag;
831 put2byte(&data[pbegin], get2byte(&data[pnext]));
832 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
833 }else{
834 addr = pbegin;
835 }
836 }
837
838 /* If the cell content area begins with a freeblock, remove it. */
839 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
840 int top;
841 pbegin = get2byte(&data[hdr+1]);
842 memcpy(&data[hdr+1], &data[pbegin], 2);
843 top = get2byte(&data[hdr+5]);
844 put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
845 }
846}
847
848/*
849** Decode the flags byte (the first byte of the header) for a page
850** and initialize fields of the MemPage structure accordingly.
851*/
852static void decodeFlags(MemPage *pPage, int flagByte){
853 BtShared *pBt; /* A copy of pPage->pBt */
854
855 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
856 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
857 pPage->intKey = (flagByte & (PTF_INTKEY|PTF_LEAFDATA))!=0;
858 pPage->zeroData = (flagByte & PTF_ZERODATA)!=0;
859 pPage->leaf = (flagByte & PTF_LEAF)!=0;
860 pPage->childPtrSize = 4*(pPage->leaf==0);
861 pBt = pPage->pBt;
862 if( flagByte & PTF_LEAFDATA ){
863 pPage->leafData = 1;
864 pPage->maxLocal = pBt->maxLeaf;
865 pPage->minLocal = pBt->minLeaf;
866 }else{
867 pPage->leafData = 0;
868 pPage->maxLocal = pBt->maxLocal;
869 pPage->minLocal = pBt->minLocal;
870 }
871 pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
872}
873
874/*
875** Initialize the auxiliary information for a disk block.
876**
877** The pParent parameter must be a pointer to the MemPage which
878** is the parent of the page being initialized. The root of a
879** BTree has no parent and so for that page, pParent==NULL.
880**
881** Return SQLITE_OK on success. If we see that the page does
882** not contain a well-formed database page, then return
883** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
884** guarantee that the page is well-formed. It only shows that
885** we failed to detect any corruption.
886*/
887int sqlite3BtreeInitPage(
888 MemPage *pPage, /* The page to be initialized */
889 MemPage *pParent /* The parent. Might be NULL */
890){
891 int pc; /* Address of a freeblock within pPage->aData[] */
892 int hdr; /* Offset to beginning of page header */
893 u8 *data; /* Equal to pPage->aData */
894 BtShared *pBt; /* The main btree structure */
895 int usableSize; /* Amount of usable space on each page */
896 int cellOffset; /* Offset from start of page to first cell pointer */
897 int nFree; /* Number of unused bytes on the page */
898 int top; /* First byte of the cell content area */
899
900 pBt = pPage->pBt;
901 assert( pBt!=0 );
902 assert( pParent==0 || pParent->pBt==pBt );
903 assert( sqlite3_mutex_held(pBt->mutex) );
904 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
905 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
906 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
907 if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){
908 /* The parent page should never change unless the file is corrupt */
909 return SQLITE_CORRUPT_BKPT;
910 }
911 if( pPage->isInit ) return SQLITE_OK;
912 if( pPage->pParent==0 && pParent!=0 ){
913 pPage->pParent = pParent;
914 sqlite3PagerRef(pParent->pDbPage);
915 }
916 hdr = pPage->hdrOffset;
917 data = pPage->aData;
918 decodeFlags(pPage, data[hdr]);
919 pPage->nOverflow = 0;
920 pPage->idxShift = 0;
921 usableSize = pBt->usableSize;
922 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
923 top = get2byte(&data[hdr+5]);
924 pPage->nCell = get2byte(&data[hdr+3]);
925 if( pPage->nCell>MX_CELL(pBt) ){
926 /* To many cells for a single page. The page must be corrupt */
927 return SQLITE_CORRUPT_BKPT;
928 }
929 if( pPage->nCell==0 && pParent!=0 && pParent->pgno!=1 ){
930 /* All pages must have at least one cell, except for root pages */
931 return SQLITE_CORRUPT_BKPT;
932 }
933
934 /* Compute the total free space on the page */
935 pc = get2byte(&data[hdr+1]);
936 nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
937 while( pc>0 ){
938 int next, size;
939 if( pc>usableSize-4 ){
940 /* Free block is off the page */
941 return SQLITE_CORRUPT_BKPT;
942 }
943 next = get2byte(&data[pc]);
944 size = get2byte(&data[pc+2]);
945 if( next>0 && next<=pc+size+3 ){
946 /* Free blocks must be in accending order */
947 return SQLITE_CORRUPT_BKPT;
948 }
949 nFree += size;
950 pc = next;
951 }
952 pPage->nFree = nFree;
953 if( nFree>=usableSize ){
954 /* Free space cannot exceed total page size */
955 return SQLITE_CORRUPT_BKPT;
956 }
957
958 pPage->isInit = 1;
959 return SQLITE_OK;
960}
961
962/*
963** Set up a raw page so that it looks like a database page holding
964** no entries.
965*/
966static void zeroPage(MemPage *pPage, int flags){
967 unsigned char *data = pPage->aData;
968 BtShared *pBt = pPage->pBt;
969 int hdr = pPage->hdrOffset;
970 int first;
971
972 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
973 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
974 assert( sqlite3PagerGetData(pPage->pDbPage) == data );
975 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
976 assert( sqlite3_mutex_held(pBt->mutex) );
977 memset(&data[hdr], 0, pBt->usableSize - hdr);
978 data[hdr] = flags;
979 first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
980 memset(&data[hdr+1], 0, 4);
981 data[hdr+7] = 0;
982 put2byte(&data[hdr+5], pBt->usableSize);
983 pPage->nFree = pBt->usableSize - first;
984 decodeFlags(pPage, flags);
985 pPage->hdrOffset = hdr;
986 pPage->cellOffset = first;
987 pPage->nOverflow = 0;
988 pPage->idxShift = 0;
989 pPage->nCell = 0;
990 pPage->isInit = 1;
991}
992
993/*
994** Get a page from the pager. Initialize the MemPage.pBt and
995** MemPage.aData elements if needed.
996**
997** If the noContent flag is set, it means that we do not care about
998** the content of the page at this time. So do not go to the disk
999** to fetch the content. Just fill in the content with zeros for now.
1000** If in the future we call sqlite3PagerWrite() on this page, that
1001** means we have started to be concerned about content and the disk
1002** read should occur at that point.
1003*/
1004int sqlite3BtreeGetPage(
1005 BtShared *pBt, /* The btree */
1006 Pgno pgno, /* Number of the page to fetch */
1007 MemPage **ppPage, /* Return the page in this parameter */
1008 int noContent /* Do not load page content if true */
1009){
1010 int rc;
1011 MemPage *pPage;
1012 DbPage *pDbPage;
1013
1014 assert( sqlite3_mutex_held(pBt->mutex) );
1015 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1016 if( rc ) return rc;
1017 pPage = (MemPage *)sqlite3PagerGetExtra(pDbPage);
1018 pPage->aData = sqlite3PagerGetData(pDbPage);
1019 pPage->pDbPage = pDbPage;
1020 pPage->pBt = pBt;
1021 pPage->pgno = pgno;
1022 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1023 *ppPage = pPage;
1024 return SQLITE_OK;
1025}
1026
1027/*
1028** Get a page from the pager and initialize it. This routine
1029** is just a convenience wrapper around separate calls to
1030** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
1031*/
1032static int getAndInitPage(
1033 BtShared *pBt, /* The database file */
1034 Pgno pgno, /* Number of the page to get */
1035 MemPage **ppPage, /* Write the page pointer here */
1036 MemPage *pParent /* Parent of the page */
1037){
1038 int rc;
1039 assert( sqlite3_mutex_held(pBt->mutex) );
1040 if( pgno==0 ){
1041 return SQLITE_CORRUPT_BKPT;
1042 }
1043 rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
1044 if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){
1045 rc = sqlite3BtreeInitPage(*ppPage, pParent);
1046 }
1047 return rc;
1048}
1049
1050/*
1051** Release a MemPage. This should be called once for each prior
1052** call to sqlite3BtreeGetPage.
1053*/
1054static void releasePage(MemPage *pPage){
1055 if( pPage ){
1056 assert( pPage->aData );
1057 assert( pPage->pBt );
1058 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1059 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1060 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1061 sqlite3PagerUnref(pPage->pDbPage);
1062 }
1063}
1064
1065/*
1066** This routine is called when the reference count for a page
1067** reaches zero. We need to unref the pParent pointer when that
1068** happens.
1069*/
1070static void pageDestructor(DbPage *pData, int pageSize){
1071 MemPage *pPage;
1072 assert( (pageSize & 7)==0 );
1073 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1074 assert( pPage->isInit==0 || sqlite3_mutex_held(pPage->pBt->mutex) );
1075 if( pPage->pParent ){
1076 MemPage *pParent = pPage->pParent;
1077 assert( pParent->pBt==pPage->pBt );
1078 pPage->pParent = 0;
1079 releasePage(pParent);
1080 }
1081 pPage->isInit = 0;
1082}
1083
1084/*
1085** During a rollback, when the pager reloads information into the cache
1086** so that the cache is restored to its original state at the start of
1087** the transaction, for each page restored this routine is called.
1088**
1089** This routine needs to reset the extra data section at the end of the
1090** page to agree with the restored data.
1091*/
1092static void pageReinit(DbPage *pData, int pageSize){
1093 MemPage *pPage;
1094 assert( (pageSize & 7)==0 );
1095 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1096 if( pPage->isInit ){
1097 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1098 pPage->isInit = 0;
1099 sqlite3BtreeInitPage(pPage, pPage->pParent);
1100 }
1101}
1102
1103/*
1104** Open a database file.
1105**
1106** zFilename is the name of the database file. If zFilename is NULL
1107** a new database with a random name is created. This randomly named
1108** database file will be deleted when sqlite3BtreeClose() is called.
1109** If zFilename is ":memory:" then an in-memory database is created
1110** that is automatically destroyed when it is closed.
1111*/
1112int sqlite3BtreeOpen(
1113 const char *zFilename, /* Name of the file containing the BTree database */
1114 sqlite3 *pSqlite, /* Associated database handle */
1115 Btree **ppBtree, /* Pointer to new Btree object written here */
1116 int flags, /* Options */
1117 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
1118){
1119 sqlite3_vfs *pVfs; /* The VFS to use for this btree */
1120 BtShared *pBt = 0; /* Shared part of btree structure */
1121 Btree *p; /* Handle to return */
1122 int rc = SQLITE_OK;
1123 int nReserve;
1124 unsigned char zDbHeader[100];
1125
1126 /* Set the variable isMemdb to true for an in-memory database, or
1127 ** false for a file-based database. This symbol is only required if
1128 ** either of the shared-data or autovacuum features are compiled
1129 ** into the library.
1130 */
1131#if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1132 #ifdef SQLITE_OMIT_MEMORYDB
1133 const int isMemdb = 0;
1134 #else
1135 const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
1136 #endif
1137#endif
1138
1139 assert( pSqlite!=0 );
1140 assert( sqlite3_mutex_held(pSqlite->mutex) );
1141
1142 pVfs = pSqlite->pVfs;
1143 p = sqlite3MallocZero(sizeof(Btree));
1144 if( !p ){
1145 return SQLITE_NOMEM;
1146 }
1147 p->inTrans = TRANS_NONE;
1148 p->pSqlite = pSqlite;
1149
1150#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1151 /*
1152 ** If this Btree is a candidate for shared cache, try to find an
1153 ** existing BtShared object that we can share with
1154 */
1155 if( (flags & BTREE_PRIVATE)==0
1156 && isMemdb==0
1157 && (pSqlite->flags & SQLITE_Vtab)==0
1158 && zFilename && zFilename[0]
1159 ){
1160 if( sqlite3SharedCacheEnabled ){
1161 int nFullPathname = pVfs->mxPathname+1;
1162 char *zFullPathname = (char *)sqlite3_malloc(nFullPathname);
1163 sqlite3_mutex *mutexShared;
1164 p->sharable = 1;
1165 if( pSqlite ){
1166 pSqlite->flags |= SQLITE_SharedCache;
1167 }
1168 if( !zFullPathname ){
1169 sqlite3_free(p);
1170 return SQLITE_NOMEM;
1171 }
1172 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
1173 mutexShared = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER);
1174 sqlite3_mutex_enter(mutexShared);
1175 for(pBt=sqlite3SharedCacheList; pBt; pBt=pBt->pNext){
1176 assert( pBt->nRef>0 );
1177 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1178 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1179 p->pBt = pBt;
1180 pBt->nRef++;
1181 break;
1182 }
1183 }
1184 sqlite3_mutex_leave(mutexShared);
1185 sqlite3_free(zFullPathname);
1186 }
1187#ifdef SQLITE_DEBUG
1188 else{
1189 /* In debug mode, we mark all persistent databases as sharable
1190 ** even when they are not. This exercises the locking code and
1191 ** gives more opportunity for asserts(sqlite3_mutex_held())
1192 ** statements to find locking problems.
1193 */
1194 p->sharable = 1;
1195 }
1196#endif
1197 }
1198#endif
1199 if( pBt==0 ){
1200 /*
1201 ** The following asserts make sure that structures used by the btree are
1202 ** the right size. This is to guard against size changes that result
1203 ** when compiling on a different architecture.
1204 */
1205 assert( sizeof(i64)==8 || sizeof(i64)==4 );
1206 assert( sizeof(u64)==8 || sizeof(u64)==4 );
1207 assert( sizeof(u32)==4 );
1208 assert( sizeof(u16)==2 );
1209 assert( sizeof(Pgno)==4 );
1210
1211 pBt = sqlite3MallocZero( sizeof(*pBt) );
1212 if( pBt==0 ){
1213 rc = SQLITE_NOMEM;
1214 goto btree_open_out;
1215 }
1216 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1217 EXTRA_SIZE, flags, vfsFlags);
1218 if( rc==SQLITE_OK ){
1219 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1220 }
1221 if( rc!=SQLITE_OK ){
1222 goto btree_open_out;
1223 }
1224 p->pBt = pBt;
1225
1226 sqlite3PagerSetDestructor(pBt->pPager, pageDestructor);
1227 sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
1228 pBt->pCursor = 0;
1229 pBt->pPage1 = 0;
1230 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1231 pBt->pageSize = get2byte(&zDbHeader[16]);
1232 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1233 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1234 pBt->pageSize = 0;
1235 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1236 pBt->maxEmbedFrac = 64; /* 25% */
1237 pBt->minEmbedFrac = 32; /* 12.5% */
1238 pBt->minLeafFrac = 32; /* 12.5% */
1239#ifndef SQLITE_OMIT_AUTOVACUUM
1240 /* If the magic name ":memory:" will create an in-memory database, then
1241 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1242 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1243 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1244 ** regular file-name. In this case the auto-vacuum applies as per normal.
1245 */
1246 if( zFilename && !isMemdb ){
1247 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1248 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1249 }
1250#endif
1251 nReserve = 0;
1252 }else{
1253 nReserve = zDbHeader[20];
1254 pBt->maxEmbedFrac = zDbHeader[21];
1255 pBt->minEmbedFrac = zDbHeader[22];
1256 pBt->minLeafFrac = zDbHeader[23];
1257 pBt->pageSizeFixed = 1;
1258#ifndef SQLITE_OMIT_AUTOVACUUM
1259 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1260 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1261#endif
1262 }
1263 pBt->usableSize = pBt->pageSize - nReserve;
1264 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
1265 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1266
1267#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1268 /* Add the new BtShared object to the linked list sharable BtShareds.
1269 */
1270 if( p->sharable ){
1271 sqlite3_mutex *mutexShared;
1272 pBt->nRef = 1;
1273 mutexShared = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER);
1274 if( SQLITE_THREADSAFE ){
1275 pBt->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_FAST);
1276 if( pBt->mutex==0 ){
1277 rc = SQLITE_NOMEM;
1278 pSqlite->mallocFailed = 0;
1279 goto btree_open_out;
1280 }
1281 }
1282 sqlite3_mutex_enter(mutexShared);
1283 pBt->pNext = sqlite3SharedCacheList;
1284 sqlite3SharedCacheList = pBt;
1285 sqlite3_mutex_leave(mutexShared);
1286 }
1287#endif
1288 }
1289
1290#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1291 /* If the new Btree uses a sharable pBtShared, then link the new
1292 ** Btree into the list of all sharable Btrees for the same connection.
1293 ** The list is kept in ascending order by pBt address.
1294 */
1295 if( p->sharable ){
1296 int i;
1297 Btree *pSib;
1298 for(i=0; i<pSqlite->nDb; i++){
1299 if( (pSib = pSqlite->aDb[i].pBt)!=0 && pSib->sharable ){
1300 while( pSib->pPrev ){ pSib = pSib->pPrev; }
1301 if( p->pBt<pSib->pBt ){
1302 p->pNext = pSib;
1303 p->pPrev = 0;
1304 pSib->pPrev = p;
1305 }else{
1306 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1307 pSib = pSib->pNext;
1308 }
1309 p->pNext = pSib->pNext;
1310 p->pPrev = pSib;
1311 if( p->pNext ){
1312 p->pNext->pPrev = p;
1313 }
1314 pSib->pNext = p;
1315 }
1316 break;
1317 }
1318 }
1319 }
1320#endif
1321 *ppBtree = p;
1322
1323btree_open_out:
1324 if( rc!=SQLITE_OK ){
1325 if( pBt && pBt->pPager ){
1326 sqlite3PagerClose(pBt->pPager);
1327 }
1328 sqlite3_free(pBt);
1329 sqlite3_free(p);
1330 *ppBtree = 0;
1331 }
1332 return rc;
1333}
1334
1335/*
1336** Decrement the BtShared.nRef counter. When it reaches zero,
1337** remove the BtShared structure from the sharing list. Return
1338** true if the BtShared.nRef counter reaches zero and return
1339** false if it is still positive.
1340*/
1341static int removeFromSharingList(BtShared *pBt){
1342#ifndef SQLITE_OMIT_SHARED_CACHE
1343 sqlite3_mutex *pMaster;
1344 BtShared *pList;
1345 int removed = 0;
1346
1347 assert( sqlite3_mutex_notheld(pBt->mutex) );
1348 pMaster = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MASTER);
1349 sqlite3_mutex_enter(pMaster);
1350 pBt->nRef--;
1351 if( pBt->nRef<=0 ){
1352 if( sqlite3SharedCacheList==pBt ){
1353 sqlite3SharedCacheList = pBt->pNext;
1354 }else{
1355 pList = sqlite3SharedCacheList;
1356 while( pList && pList->pNext!=pBt ){
1357 pList=pList->pNext;
1358 }
1359 if( pList ){
1360 pList->pNext = pBt->pNext;
1361 }
1362 }
1363 if( SQLITE_THREADSAFE ){
1364 sqlite3_mutex_free(pBt->mutex);
1365 }
1366 removed = 1;
1367 }
1368 sqlite3_mutex_leave(pMaster);
1369 return removed;
1370#else
1371 return 1;
1372#endif
1373}
1374
1375/*
1376** Close an open database and invalidate all cursors.
1377*/
1378int sqlite3BtreeClose(Btree *p){
1379 BtShared *pBt = p->pBt;
1380 BtCursor *pCur;
1381
1382 /* Close all cursors opened via this handle. */
1383 assert( sqlite3_mutex_held(p->pSqlite->mutex) );
1384 sqlite3BtreeEnter(p);
1385 pCur = pBt->pCursor;
1386 while( pCur ){
1387 BtCursor *pTmp = pCur;
1388 pCur = pCur->pNext;
1389 if( pTmp->pBtree==p ){
1390 sqlite3BtreeCloseCursor(pTmp);
1391 }
1392 }
1393
1394 /* Rollback any active transaction and free the handle structure.
1395 ** The call to sqlite3BtreeRollback() drops any table-locks held by
1396 ** this handle.
1397 */
1398 sqlite3BtreeRollback(p);
1399 sqlite3BtreeLeave(p);
1400
1401 /* If there are still other outstanding references to the shared-btree
1402 ** structure, return now. The remainder of this procedure cleans
1403 ** up the shared-btree.
1404 */
1405 assert( p->wantToLock==0 && p->locked==0 );
1406 if( !p->sharable || removeFromSharingList(pBt) ){
1407 /* The pBt is no longer on the sharing list, so we can access
1408 ** it without having to hold the mutex.
1409 **
1410 ** Clean out and delete the BtShared object.
1411 */
1412 assert( !pBt->pCursor );
1413 sqlite3PagerClose(pBt->pPager);
1414 if( pBt->xFreeSchema && pBt->pSchema ){
1415 pBt->xFreeSchema(pBt->pSchema);
1416 }
1417 sqlite3_free(pBt->pSchema);
1418 sqlite3_free(pBt);
1419 }
1420
1421#ifndef SQLITE_OMIT_SHARED_CACHE
1422 assert( p->wantToLock==0 );
1423 assert( p->locked==0 );
1424 if( p->pPrev ) p->pPrev->pNext = p->pNext;
1425 if( p->pNext ) p->pNext->pPrev = p->pPrev;
1426#endif
1427
1428 sqlite3_free(p);
1429 return SQLITE_OK;
1430}
1431
1432/*
1433** Change the busy handler callback function.
1434*/
1435int sqlite3BtreeSetBusyHandler(Btree *p, BusyHandler *pHandler){
1436 BtShared *pBt = p->pBt;
1437 assert( sqlite3_mutex_held(p->pSqlite->mutex) );
1438 sqlite3BtreeEnter(p);
1439 pBt->pBusyHandler = pHandler;
1440 sqlite3PagerSetBusyhandler(pBt->pPager, pHandler);
1441 sqlite3BtreeLeave(p);
1442 return SQLITE_OK;
1443}
1444
1445/*
1446** Change the limit on the number of pages allowed in the cache.
1447**
1448** The maximum number of cache pages is set to the absolute
1449** value of mxPage. If mxPage is negative, the pager will
1450** operate asynchronously - it will not stop to do fsync()s
1451** to insure data is written to the disk surface before
1452** continuing. Transactions still work if synchronous is off,
1453** and the database cannot be corrupted if this program
1454** crashes. But if the operating system crashes or there is
1455** an abrupt power failure when synchronous is off, the database
1456** could be left in an inconsistent and unrecoverable state.
1457** Synchronous is on by default so database corruption is not
1458** normally a worry.
1459*/
1460int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1461 BtShared *pBt = p->pBt;
1462 assert( sqlite3_mutex_held(p->pSqlite->mutex) );
1463 sqlite3BtreeEnter(p);
1464 sqlite3PagerSetCachesize(pBt->pPager, mxPage);
1465 sqlite3BtreeLeave(p);
1466 return SQLITE_OK;
1467}
1468
1469/*
1470** Change the way data is synced to disk in order to increase or decrease
1471** how well the database resists damage due to OS crashes and power
1472** failures. Level 1 is the same as asynchronous (no syncs() occur and
1473** there is a high probability of damage) Level 2 is the default. There
1474** is a very low but non-zero probability of damage. Level 3 reduces the
1475** probability of damage to near zero but with a write performance reduction.
1476*/
1477#ifndef SQLITE_OMIT_PAGER_PRAGMAS
1478int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
1479 BtShared *pBt = p->pBt;
1480 assert( sqlite3_mutex_held(p->pSqlite->mutex) );
1481 sqlite3BtreeEnter(p);
1482 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
1483 sqlite3BtreeLeave(p);
1484 return SQLITE_OK;
1485}
1486#endif
1487
1488/*
1489** Return TRUE if the given btree is set to safety level 1. In other
1490** words, return TRUE if no sync() occurs on the disk files.
1491*/
1492int sqlite3BtreeSyncDisabled(Btree *p){
1493 BtShared *pBt = p->pBt;
1494 int rc;
1495 assert( sqlite3_mutex_held(p->pSqlite->mutex) );
1496 sqlite3BtreeEnter(p);
1497 assert( pBt && pBt->pPager );
1498 rc = sqlite3PagerNosync(pBt->pPager);
1499 sqlite3BtreeLeave(p);
1500 return rc;
1501}
1502
1503#if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
1504/*
1505** Change the default pages size and the number of reserved bytes per page.
1506**
1507** The page size must be a power of 2 between 512 and 65536. If the page
1508** size supplied does not meet this constraint then the page size is not
1509** changed.
1510**
1511** Page sizes are constrained to be a power of two so that the region
1512** of the database file used for locking (beginning at PENDING_BYTE,
1513** the first byte past the 1GB boundary, 0x40000000) needs to occur
1514** at the beginning of a page.
1515**
1516** If parameter nReserve is less than zero, then the number of reserved
1517** bytes per page is left unchanged.
1518*/
1519int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
1520 int rc = SQLITE_OK;
1521 BtShared *pBt = p->pBt;
1522 sqlite3BtreeEnter(p);
1523 if( pBt->pageSizeFixed ){
1524 sqlite3BtreeLeave(p);
1525 return SQLITE_READONLY;
1526 }
1527 if( nReserve<0 ){
1528 nReserve = pBt->pageSize - pBt->usableSize;
1529 }
1530 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1531 ((pageSize-1)&pageSize)==0 ){
1532 assert( (pageSize & 7)==0 );
1533 assert( !pBt->pPage1 && !pBt->pCursor );
1534 pBt->pageSize = pageSize;
1535 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1536 }
1537 pBt->usableSize = pBt->pageSize - nReserve;
1538 sqlite3BtreeLeave(p);
1539 return rc;
1540}
1541
1542/*
1543** Return the currently defined page size
1544*/
1545int sqlite3BtreeGetPageSize(Btree *p){
1546 return p->pBt->pageSize;
1547}
1548int sqlite3BtreeGetReserve(Btree *p){
1549 int n;
1550 sqlite3BtreeEnter(p);
1551 n = p->pBt->pageSize - p->pBt->usableSize;
1552 sqlite3BtreeLeave(p);
1553 return n;
1554}
1555
1556/*
1557** Set the maximum page count for a database if mxPage is positive.
1558** No changes are made if mxPage is 0 or negative.
1559** Regardless of the value of mxPage, return the maximum page count.
1560*/
1561int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
1562 int n;
1563 sqlite3BtreeEnter(p);
1564 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
1565 sqlite3BtreeLeave(p);
1566 return n;
1567}
1568#endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
1569
1570/*
1571** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1572** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1573** is disabled. The default value for the auto-vacuum property is
1574** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1575*/
1576int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
1577#ifdef SQLITE_OMIT_AUTOVACUUM
1578 return SQLITE_READONLY;
1579#else
1580 BtShared *pBt = p->pBt;
1581 int rc = SQLITE_OK;
1582 int av = (autoVacuum?1:0);
1583
1584 sqlite3BtreeEnter(p);
1585 if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
1586 rc = SQLITE_READONLY;
1587 }else{
1588 pBt->autoVacuum = av;
1589 }
1590 sqlite3BtreeLeave(p);
1591 return rc;
1592#endif
1593}
1594
1595/*
1596** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1597** enabled 1 is returned. Otherwise 0.
1598*/
1599int sqlite3BtreeGetAutoVacuum(Btree *p){
1600#ifdef SQLITE_OMIT_AUTOVACUUM
1601 return BTREE_AUTOVACUUM_NONE;
1602#else
1603 int rc;
1604 sqlite3BtreeEnter(p);
1605 rc = (
1606 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
1607 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
1608 BTREE_AUTOVACUUM_INCR
1609 );
1610 sqlite3BtreeLeave(p);
1611 return rc;
1612#endif
1613}
1614
1615
1616/*
1617** Get a reference to pPage1 of the database file. This will
1618** also acquire a readlock on that file.
1619**
1620** SQLITE_OK is returned on success. If the file is not a
1621** well-formed database file, then SQLITE_CORRUPT is returned.
1622** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
1623** is returned if we run out of memory.
1624*/
1625static int lockBtree(BtShared *pBt){
1626 int rc, pageSize;
1627 MemPage *pPage1;
1628
1629 assert( sqlite3_mutex_held(pBt->mutex) );
1630 if( pBt->pPage1 ) return SQLITE_OK;
1631 rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
1632 if( rc!=SQLITE_OK ) return rc;
1633
1634
1635 /* Do some checking to help insure the file we opened really is
1636 ** a valid database file.
1637 */
1638 rc = SQLITE_NOTADB;
1639 if( sqlite3PagerPagecount(pBt->pPager)>0 ){
1640 u8 *page1 = pPage1->aData;
1641 if( memcmp(page1, zMagicHeader, 16)!=0 ){
1642 goto page1_init_failed;
1643 }
1644 if( page1[18]>1 ){
1645 pBt->readOnly = 1;
1646 }
1647 if( page1[19]>1 ){
1648 goto page1_init_failed;
1649 }
1650 pageSize = get2byte(&page1[16]);
1651 if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
1652 (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
1653 ){
1654 goto page1_init_failed;
1655 }
1656 assert( (pageSize & 7)==0 );
1657 pBt->pageSize = pageSize;
1658 pBt->usableSize = pageSize - page1[20];
1659 if( pBt->usableSize<500 ){
1660 goto page1_init_failed;
1661 }
1662 pBt->maxEmbedFrac = page1[21];
1663 pBt->minEmbedFrac = page1[22];
1664 pBt->minLeafFrac = page1[23];
1665#ifndef SQLITE_OMIT_AUTOVACUUM
1666 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
1667 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
1668#endif
1669 }
1670
1671 /* maxLocal is the maximum amount of payload to store locally for
1672 ** a cell. Make sure it is small enough so that at least minFanout
1673 ** cells can will fit on one page. We assume a 10-byte page header.
1674 ** Besides the payload, the cell must store:
1675 ** 2-byte pointer to the cell
1676 ** 4-byte child pointer
1677 ** 9-byte nKey value
1678 ** 4-byte nData value
1679 ** 4-byte overflow page pointer
1680 ** So a cell consists of a 2-byte poiner, a header which is as much as
1681 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1682 ** page pointer.
1683 */
1684 pBt->maxLocal = (pBt->usableSize-12)*pBt->maxEmbedFrac/255 - 23;
1685 pBt->minLocal = (pBt->usableSize-12)*pBt->minEmbedFrac/255 - 23;
1686 pBt->maxLeaf = pBt->usableSize - 35;
1687 pBt->minLeaf = (pBt->usableSize-12)*pBt->minLeafFrac/255 - 23;
1688 if( pBt->minLocal>pBt->maxLocal || pBt->maxLocal<0 ){
1689 goto page1_init_failed;
1690 }
1691 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
1692 pBt->pPage1 = pPage1;
1693 return SQLITE_OK;
1694
1695page1_init_failed:
1696 releasePage(pPage1);
1697 pBt->pPage1 = 0;
1698 return rc;
1699}
1700
1701/*
1702** This routine works like lockBtree() except that it also invokes the
1703** busy callback if there is lock contention.
1704*/
1705static int lockBtreeWithRetry(Btree *pRef){
1706 int rc = SQLITE_OK;
1707
1708 assert( sqlite3BtreeHoldsMutex(pRef) );
1709 if( pRef->inTrans==TRANS_NONE ){
1710 u8 inTransaction = pRef->pBt->inTransaction;
1711 btreeIntegrity(pRef);
1712 rc = sqlite3BtreeBeginTrans(pRef, 0);
1713 pRef->pBt->inTransaction = inTransaction;
1714 pRef->inTrans = TRANS_NONE;
1715 if( rc==SQLITE_OK ){
1716 pRef->pBt->nTransaction--;
1717 }
1718 btreeIntegrity(pRef);
1719 }
1720 return rc;
1721}
1722
1723
1724/*
1725** If there are no outstanding cursors and we are not in the middle
1726** of a transaction but there is a read lock on the database, then
1727** this routine unrefs the first page of the database file which
1728** has the effect of releasing the read lock.
1729**
1730** If there are any outstanding cursors, this routine is a no-op.
1731**
1732** If there is a transaction in progress, this routine is a no-op.
1733*/
1734static void unlockBtreeIfUnused(BtShared *pBt){
1735 assert( sqlite3_mutex_held(pBt->mutex) );
1736 if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
1737 if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
1738 if( pBt->pPage1->aData==0 ){
1739 MemPage *pPage = pBt->pPage1;
1740 pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
1741 pPage->pBt = pBt;
1742 pPage->pgno = 1;
1743 }
1744 releasePage(pBt->pPage1);
1745 }
1746 pBt->pPage1 = 0;
1747 pBt->inStmt = 0;
1748 }
1749}
1750
1751/*
1752** Create a new database by initializing the first page of the
1753** file.
1754*/
1755static int newDatabase(BtShared *pBt){
1756 MemPage *pP1;
1757 unsigned char *data;
1758 int rc;
1759
1760 assert( sqlite3_mutex_held(pBt->mutex) );
1761 if( sqlite3PagerPagecount(pBt->pPager)>0 ) return SQLITE_OK;
1762 pP1 = pBt->pPage1;
1763 assert( pP1!=0 );
1764 data = pP1->aData;
1765 rc = sqlite3PagerWrite(pP1->pDbPage);
1766 if( rc ) return rc;
1767 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1768 assert( sizeof(zMagicHeader)==16 );
1769 put2byte(&data[16], pBt->pageSize);
1770 data[18] = 1;
1771 data[19] = 1;
1772 data[20] = pBt->pageSize - pBt->usableSize;
1773 data[21] = pBt->maxEmbedFrac;
1774 data[22] = pBt->minEmbedFrac;
1775 data[23] = pBt->minLeafFrac;
1776 memset(&data[24], 0, 100-24);
1777 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
1778 pBt->pageSizeFixed = 1;
1779#ifndef SQLITE_OMIT_AUTOVACUUM
1780 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
1781 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
1782 put4byte(&data[36 + 4*4], pBt->autoVacuum);
1783 put4byte(&data[36 + 7*4], pBt->incrVacuum);
1784#endif
1785 return SQLITE_OK;
1786}
1787
1788/*
1789** Attempt to start a new transaction. A write-transaction
1790** is started if the second argument is nonzero, otherwise a read-
1791** transaction. If the second argument is 2 or more and exclusive
1792** transaction is started, meaning that no other process is allowed
1793** to access the database. A preexisting transaction may not be
1794** upgraded to exclusive by calling this routine a second time - the
1795** exclusivity flag only works for a new transaction.
1796**
1797** A write-transaction must be started before attempting any
1798** changes to the database. None of the following routines
1799** will work unless a transaction is started first:
1800**
1801** sqlite3BtreeCreateTable()
1802** sqlite3BtreeCreateIndex()
1803** sqlite3BtreeClearTable()
1804** sqlite3BtreeDropTable()
1805** sqlite3BtreeInsert()
1806** sqlite3BtreeDelete()
1807** sqlite3BtreeUpdateMeta()
1808**
1809** If an initial attempt to acquire the lock fails because of lock contention
1810** and the database was previously unlocked, then invoke the busy handler
1811** if there is one. But if there was previously a read-lock, do not
1812** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
1813** returned when there is already a read-lock in order to avoid a deadlock.
1814**
1815** Suppose there are two processes A and B. A has a read lock and B has
1816** a reserved lock. B tries to promote to exclusive but is blocked because
1817** of A's read lock. A tries to promote to reserved but is blocked by B.
1818** One or the other of the two processes must give way or there can be
1819** no progress. By returning SQLITE_BUSY and not invoking the busy callback
1820** when A already has a read lock, we encourage A to give up and let B
1821** proceed.
1822*/
1823int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
1824 BtShared *pBt = p->pBt;
1825 int rc = SQLITE_OK;
1826
1827 sqlite3BtreeEnter(p);
1828 btreeIntegrity(p);
1829
1830 /* If the btree is already in a write-transaction, or it
1831 ** is already in a read-transaction and a read-transaction
1832 ** is requested, this is a no-op.
1833 */
1834 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
1835 goto trans_begun;
1836 }
1837
1838 /* Write transactions are not possible on a read-only database */
1839 if( pBt->readOnly && wrflag ){
1840 rc = SQLITE_READONLY;
1841 goto trans_begun;
1842 }
1843
1844 /* If another database handle has already opened a write transaction
1845 ** on this shared-btree structure and a second write transaction is
1846 ** requested, return SQLITE_BUSY.
1847 */
1848 if( pBt->inTransaction==TRANS_WRITE && wrflag ){
1849 rc = SQLITE_BUSY;
1850 goto trans_begun;
1851 }
1852
1853 do {
1854 if( pBt->pPage1==0 ){
1855 rc = lockBtree(pBt);
1856 }
1857
1858 if( rc==SQLITE_OK && wrflag ){
1859 if( pBt->readOnly ){
1860 rc = SQLITE_READONLY;
1861 }else{
1862 rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
1863 if( rc==SQLITE_OK ){
1864 rc = newDatabase(pBt);
1865 }
1866 }
1867 }
1868
1869 if( rc==SQLITE_OK ){
1870 if( wrflag ) pBt->inStmt = 0;
1871 }else{
1872 unlockBtreeIfUnused(pBt);
1873 }
1874 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
1875 sqlite3InvokeBusyHandler(pBt->pBusyHandler) );
1876
1877 if( rc==SQLITE_OK ){
1878 if( p->inTrans==TRANS_NONE ){
1879 pBt->nTransaction++;
1880 }
1881 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
1882 if( p->inTrans>pBt->inTransaction ){
1883 pBt->inTransaction = p->inTrans;
1884 }
1885 }
1886
1887
1888trans_begun:
1889 btreeIntegrity(p);
1890 sqlite3BtreeLeave(p);
1891 return rc;
1892}
1893
1894#ifndef SQLITE_OMIT_AUTOVACUUM
1895
1896/*
1897** Set the pointer-map entries for all children of page pPage. Also, if
1898** pPage contains cells that point to overflow pages, set the pointer
1899** map entries for the overflow pages as well.
1900*/
1901static int setChildPtrmaps(MemPage *pPage){
1902 int i; /* Counter variable */
1903 int nCell; /* Number of cells in page pPage */
1904 int rc; /* Return code */
1905 BtShared *pBt = pPage->pBt;
1906 int isInitOrig = pPage->isInit;
1907 Pgno pgno = pPage->pgno;
1908
1909 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1910 rc = sqlite3BtreeInitPage(pPage, pPage->pParent);
1911 if( rc!=SQLITE_OK ){
1912 goto set_child_ptrmaps_out;
1913 }
1914 nCell = pPage->nCell;
1915
1916 for(i=0; i<nCell; i++){
1917 u8 *pCell = findCell(pPage, i);
1918
1919 rc = ptrmapPutOvflPtr(pPage, pCell);
1920 if( rc!=SQLITE_OK ){
1921 goto set_child_ptrmaps_out;
1922 }
1923
1924 if( !pPage->leaf ){
1925 Pgno childPgno = get4byte(pCell);
1926 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1927 if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
1928 }
1929 }
1930
1931 if( !pPage->leaf ){
1932 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1933 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1934 }
1935
1936set_child_ptrmaps_out:
1937 pPage->isInit = isInitOrig;
1938 return rc;
1939}
1940
1941/*
1942** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
1943** page, is a pointer to page iFrom. Modify this pointer so that it points to
1944** iTo. Parameter eType describes the type of pointer to be modified, as
1945** follows:
1946**
1947** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
1948** page of pPage.
1949**
1950** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
1951** page pointed to by one of the cells on pPage.
1952**
1953** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
1954** overflow page in the list.
1955*/
1956static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
1957 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1958 if( eType==PTRMAP_OVERFLOW2 ){
1959 /* The pointer is always the first 4 bytes of the page in this case. */
1960 if( get4byte(pPage->aData)!=iFrom ){
1961 return SQLITE_CORRUPT_BKPT;
1962 }
1963 put4byte(pPage->aData, iTo);
1964 }else{
1965 int isInitOrig = pPage->isInit;
1966 int i;
1967 int nCell;
1968
1969 sqlite3BtreeInitPage(pPage, 0);
1970 nCell = pPage->nCell;
1971
1972 for(i=0; i<nCell; i++){
1973 u8 *pCell = findCell(pPage, i);
1974 if( eType==PTRMAP_OVERFLOW1 ){
1975 CellInfo info;
1976 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1977 if( info.iOverflow ){
1978 if( iFrom==get4byte(&pCell[info.iOverflow]) ){
1979 put4byte(&pCell[info.iOverflow], iTo);
1980 break;
1981 }
1982 }
1983 }else{
1984 if( get4byte(pCell)==iFrom ){
1985 put4byte(pCell, iTo);
1986 break;
1987 }
1988 }
1989 }
1990
1991 if( i==nCell ){
1992 if( eType!=PTRMAP_BTREE ||
1993 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
1994 return SQLITE_CORRUPT_BKPT;
1995 }
1996 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
1997 }
1998
1999 pPage->isInit = isInitOrig;
2000 }
2001 return SQLITE_OK;
2002}
2003
2004
2005/*
2006** Move the open database page pDbPage to location iFreePage in the
2007** database. The pDbPage reference remains valid.
2008*/
2009static int relocatePage(
2010 BtShared *pBt, /* Btree */
2011 MemPage *pDbPage, /* Open page to move */
2012 u8 eType, /* Pointer map 'type' entry for pDbPage */
2013 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
2014 Pgno iFreePage /* The location to move pDbPage to */
2015){
2016 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
2017 Pgno iDbPage = pDbPage->pgno;
2018 Pager *pPager = pBt->pPager;
2019 int rc;
2020
2021 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2022 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2023 assert( sqlite3_mutex_held(pBt->mutex) );
2024 assert( pDbPage->pBt==pBt );
2025
2026 /* Move page iDbPage from it's current location to page number iFreePage */
2027 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2028 iDbPage, iFreePage, iPtrPage, eType));
2029 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage);
2030 if( rc!=SQLITE_OK ){
2031 return rc;
2032 }
2033 pDbPage->pgno = iFreePage;
2034
2035 /* If pDbPage was a btree-page, then it may have child pages and/or cells
2036 ** that point to overflow pages. The pointer map entries for all these
2037 ** pages need to be changed.
2038 **
2039 ** If pDbPage is an overflow page, then the first 4 bytes may store a
2040 ** pointer to a subsequent overflow page. If this is the case, then
2041 ** the pointer map needs to be updated for the subsequent overflow page.
2042 */
2043 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2044 rc = setChildPtrmaps(pDbPage);
2045 if( rc!=SQLITE_OK ){
2046 return rc;
2047 }
2048 }else{
2049 Pgno nextOvfl = get4byte(pDbPage->aData);
2050 if( nextOvfl!=0 ){
2051 rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
2052 if( rc!=SQLITE_OK ){
2053 return rc;
2054 }
2055 }
2056 }
2057
2058 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2059 ** that it points at iFreePage. Also fix the pointer map entry for
2060 ** iPtrPage.
2061 */
2062 if( eType!=PTRMAP_ROOTPAGE ){
2063 rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
2064 if( rc!=SQLITE_OK ){
2065 return rc;
2066 }
2067 rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2068 if( rc!=SQLITE_OK ){
2069 releasePage(pPtrPage);
2070 return rc;
2071 }
2072 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
2073 releasePage(pPtrPage);
2074 if( rc==SQLITE_OK ){
2075 rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
2076 }
2077 }
2078 return rc;
2079}
2080
2081/* Forward declaration required by incrVacuumStep(). */
2082static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
2083
2084/*
2085** Perform a single step of an incremental-vacuum. If successful,
2086** return SQLITE_OK. If there is no work to do (and therefore no
2087** point in calling this function again), return SQLITE_DONE.
2088**
2089** More specificly, this function attempts to re-organize the
2090** database so that the last page of the file currently in use
2091** is no longer in use.
2092**
2093** If the nFin parameter is non-zero, the implementation assumes
2094** that the caller will keep calling incrVacuumStep() until
2095** it returns SQLITE_DONE or an error, and that nFin is the
2096** number of pages the database file will contain after this
2097** process is complete.
2098*/
2099static int incrVacuumStep(BtShared *pBt, Pgno nFin){
2100 Pgno iLastPg; /* Last page in the database */
2101 Pgno nFreeList; /* Number of pages still on the free-list */
2102
2103 assert( sqlite3_mutex_held(pBt->mutex) );
2104 iLastPg = pBt->nTrunc;
2105 if( iLastPg==0 ){
2106 iLastPg = sqlite3PagerPagecount(pBt->pPager);
2107 }
2108
2109 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2110 int rc;
2111 u8 eType;
2112 Pgno iPtrPage;
2113
2114 nFreeList = get4byte(&pBt->pPage1->aData[36]);
2115 if( nFreeList==0 || nFin==iLastPg ){
2116 return SQLITE_DONE;
2117 }
2118
2119 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2120 if( rc!=SQLITE_OK ){
2121 return rc;
2122 }
2123 if( eType==PTRMAP_ROOTPAGE ){
2124 return SQLITE_CORRUPT_BKPT;
2125 }
2126
2127 if( eType==PTRMAP_FREEPAGE ){
2128 if( nFin==0 ){
2129 /* Remove the page from the files free-list. This is not required
2130 ** if nFin is non-zero. In that case, the free-list will be
2131 ** truncated to zero after this function returns, so it doesn't
2132 ** matter if it still contains some garbage entries.
2133 */
2134 Pgno iFreePg;
2135 MemPage *pFreePg;
2136 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2137 if( rc!=SQLITE_OK ){
2138 return rc;
2139 }
2140 assert( iFreePg==iLastPg );
2141 releasePage(pFreePg);
2142 }
2143 } else {
2144 Pgno iFreePg; /* Index of free page to move pLastPg to */
2145 MemPage *pLastPg;
2146
2147 rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
2148 if( rc!=SQLITE_OK ){
2149 return rc;
2150 }
2151
2152 /* If nFin is zero, this loop runs exactly once and page pLastPg
2153 ** is swapped with the first free page pulled off the free list.
2154 **
2155 ** On the other hand, if nFin is greater than zero, then keep
2156 ** looping until a free-page located within the first nFin pages
2157 ** of the file is found.
2158 */
2159 do {
2160 MemPage *pFreePg;
2161 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2162 if( rc!=SQLITE_OK ){
2163 releasePage(pLastPg);
2164 return rc;
2165 }
2166 releasePage(pFreePg);
2167 }while( nFin!=0 && iFreePg>nFin );
2168 assert( iFreePg<iLastPg );
2169
2170 rc = sqlite3PagerWrite(pLastPg->pDbPage);
2171 if( rc!=SQLITE_OK ){
2172 return rc;
2173 }
2174 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg);
2175 releasePage(pLastPg);
2176 if( rc!=SQLITE_OK ){
2177 return rc;
2178 }
2179 }
2180 }
2181
2182 pBt->nTrunc = iLastPg - 1;
2183 while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
2184 pBt->nTrunc--;
2185 }
2186 return SQLITE_OK;
2187}
2188
2189/*
2190** A write-transaction must be opened before calling this function.
2191** It performs a single unit of work towards an incremental vacuum.
2192**
2193** If the incremental vacuum is finished after this function has run,
2194** SQLITE_DONE is returned. If it is not finished, but no error occured,
2195** SQLITE_OK is returned. Otherwise an SQLite error code.
2196*/
2197int sqlite3BtreeIncrVacuum(Btree *p){
2198 int rc;
2199 BtShared *pBt = p->pBt;
2200
2201 sqlite3BtreeEnter(p);
2202 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2203 if( !pBt->autoVacuum ){
2204 rc = SQLITE_DONE;
2205 }else{
2206 invalidateAllOverflowCache(pBt);
2207 rc = incrVacuumStep(pBt, 0);
2208 }
2209 sqlite3BtreeLeave(p);
2210 return rc;
2211}
2212
2213/*
2214** This routine is called prior to sqlite3PagerCommit when a transaction
2215** is commited for an auto-vacuum database.
2216**
2217** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
2218** the database file should be truncated to during the commit process.
2219** i.e. the database has been reorganized so that only the first *pnTrunc
2220** pages are in use.
2221*/
2222static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
2223 int rc = SQLITE_OK;
2224 Pager *pPager = pBt->pPager;
2225#ifndef NDEBUG
2226 int nRef = sqlite3PagerRefcount(pPager);
2227#endif
2228
2229 assert( sqlite3_mutex_held(pBt->mutex) );
2230 invalidateAllOverflowCache(pBt);
2231 assert(pBt->autoVacuum);
2232 if( !pBt->incrVacuum ){
2233 Pgno nFin = 0;
2234
2235 if( pBt->nTrunc==0 ){
2236 Pgno nFree;
2237 Pgno nPtrmap;
2238 const int pgsz = pBt->pageSize;
2239 Pgno nOrig = sqlite3PagerPagecount(pBt->pPager);
2240
2241 if( PTRMAP_ISPAGE(pBt, nOrig) ){
2242 return SQLITE_CORRUPT_BKPT;
2243 }
2244 if( nOrig==PENDING_BYTE_PAGE(pBt) ){
2245 nOrig--;
2246 }
2247 nFree = get4byte(&pBt->pPage1->aData[36]);
2248 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
2249 nFin = nOrig - nFree - nPtrmap;
2250 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
2251 nFin--;
2252 }
2253 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
2254 nFin--;
2255 }
2256 }
2257
2258 while( rc==SQLITE_OK ){
2259 rc = incrVacuumStep(pBt, nFin);
2260 }
2261 if( rc==SQLITE_DONE ){
2262 assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
2263 rc = SQLITE_OK;
2264 if( pBt->nTrunc ){
2265 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
2266 put4byte(&pBt->pPage1->aData[32], 0);
2267 put4byte(&pBt->pPage1->aData[36], 0);
2268 pBt->nTrunc = nFin;
2269 }
2270 }
2271 if( rc!=SQLITE_OK ){
2272 sqlite3PagerRollback(pPager);
2273 }
2274 }
2275
2276 if( rc==SQLITE_OK ){
2277 *pnTrunc = pBt->nTrunc;
2278 pBt->nTrunc = 0;
2279 }
2280 assert( nRef==sqlite3PagerRefcount(pPager) );
2281 return rc;
2282}
2283
2284#endif
2285
2286/*
2287** This routine does the first phase of a two-phase commit. This routine
2288** causes a rollback journal to be created (if it does not already exist)
2289** and populated with enough information so that if a power loss occurs
2290** the database can be restored to its original state by playing back
2291** the journal. Then the contents of the journal are flushed out to
2292** the disk. After the journal is safely on oxide, the changes to the
2293** database are written into the database file and flushed to oxide.
2294** At the end of this call, the rollback journal still exists on the
2295** disk and we are still holding all locks, so the transaction has not
2296** committed. See sqlite3BtreeCommit() for the second phase of the
2297** commit process.
2298**
2299** This call is a no-op if no write-transaction is currently active on pBt.
2300**
2301** Otherwise, sync the database file for the btree pBt. zMaster points to
2302** the name of a master journal file that should be written into the
2303** individual journal file, or is NULL, indicating no master journal file
2304** (single database transaction).
2305**
2306** When this is called, the master journal should already have been
2307** created, populated with this journal pointer and synced to disk.
2308**
2309** Once this is routine has returned, the only thing required to commit
2310** the write-transaction for this database file is to delete the journal.
2311*/
2312int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
2313 int rc = SQLITE_OK;
2314 if( p->inTrans==TRANS_WRITE ){
2315 BtShared *pBt = p->pBt;
2316 Pgno nTrunc = 0;
2317 sqlite3BtreeEnter(p);
2318#ifndef SQLITE_OMIT_AUTOVACUUM
2319 if( pBt->autoVacuum ){
2320 rc = autoVacuumCommit(pBt, &nTrunc);
2321 if( rc!=SQLITE_OK ){
2322 sqlite3BtreeLeave(p);
2323 return rc;
2324 }
2325 }
2326#endif
2327 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc);
2328 sqlite3BtreeLeave(p);
2329 }
2330 return rc;
2331}
2332
2333/*
2334** Commit the transaction currently in progress.
2335**
2336** This routine implements the second phase of a 2-phase commit. The
2337** sqlite3BtreeSync() routine does the first phase and should be invoked
2338** prior to calling this routine. The sqlite3BtreeSync() routine did
2339** all the work of writing information out to disk and flushing the
2340** contents so that they are written onto the disk platter. All this
2341** routine has to do is delete or truncate the rollback journal
2342** (which causes the transaction to commit) and drop locks.
2343**
2344** This will release the write lock on the database file. If there
2345** are no active cursors, it also releases the read lock.
2346*/
2347int sqlite3BtreeCommitPhaseTwo(Btree *p){
2348 BtShared *pBt = p->pBt;
2349
2350 sqlite3BtreeEnter(p);
2351 btreeIntegrity(p);
2352
2353 /* If the handle has a write-transaction open, commit the shared-btrees
2354 ** transaction and set the shared state to TRANS_READ.
2355 */
2356 if( p->inTrans==TRANS_WRITE ){
2357 int rc;
2358 assert( pBt->inTransaction==TRANS_WRITE );
2359 assert( pBt->nTransaction>0 );
2360 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
2361 if( rc!=SQLITE_OK ){
2362 sqlite3BtreeLeave(p);
2363 return rc;
2364 }
2365 pBt->inTransaction = TRANS_READ;
2366 pBt->inStmt = 0;
2367 }
2368 unlockAllTables(p);
2369
2370 /* If the handle has any kind of transaction open, decrement the transaction
2371 ** count of the shared btree. If the transaction count reaches 0, set
2372 ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
2373 ** will unlock the pager.
2374 */
2375 if( p->inTrans!=TRANS_NONE ){
2376 pBt->nTransaction--;
2377 if( 0==pBt->nTransaction ){
2378 pBt->inTransaction = TRANS_NONE;
2379 }
2380 }
2381
2382 /* Set the handles current transaction state to TRANS_NONE and unlock
2383 ** the pager if this call closed the only read or write transaction.
2384 */
2385 p->inTrans = TRANS_NONE;
2386 unlockBtreeIfUnused(pBt);
2387
2388 btreeIntegrity(p);
2389 sqlite3BtreeLeave(p);
2390 return SQLITE_OK;
2391}
2392
2393/*
2394** Do both phases of a commit.
2395*/
2396int sqlite3BtreeCommit(Btree *p){
2397 int rc;
2398 sqlite3BtreeEnter(p);
2399 rc = sqlite3BtreeCommitPhaseOne(p, 0);
2400 if( rc==SQLITE_OK ){
2401 rc = sqlite3BtreeCommitPhaseTwo(p);
2402 }
2403 sqlite3BtreeLeave(p);
2404 return rc;
2405}
2406
2407#ifndef NDEBUG
2408/*
2409** Return the number of write-cursors open on this handle. This is for use
2410** in assert() expressions, so it is only compiled if NDEBUG is not
2411** defined.
2412**
2413** For the purposes of this routine, a write-cursor is any cursor that
2414** is capable of writing to the databse. That means the cursor was
2415** originally opened for writing and the cursor has not be disabled
2416** by having its state changed to CURSOR_FAULT.
2417*/
2418static int countWriteCursors(BtShared *pBt){
2419 BtCursor *pCur;
2420 int r = 0;
2421 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2422 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
2423 }
2424 return r;
2425}
2426#endif
2427
2428/*
2429** This routine sets the state to CURSOR_FAULT and the error
2430** code to errCode for every cursor on BtShared that pBtree
2431** references.
2432**
2433** Every cursor is tripped, including cursors that belong
2434** to other database connections that happen to be sharing
2435** the cache with pBtree.
2436**
2437** This routine gets called when a rollback occurs.
2438** All cursors using the same cache must be tripped
2439** to prevent them from trying to use the btree after
2440** the rollback. The rollback may have deleted tables
2441** or moved root pages, so it is not sufficient to
2442** save the state of the cursor. The cursor must be
2443** invalidated.
2444*/
2445void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
2446 BtCursor *p;
2447 sqlite3BtreeEnter(pBtree);
2448 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
2449 clearCursorPosition(p);
2450 p->eState = CURSOR_FAULT;
2451 p->skip = errCode;
2452 }
2453 sqlite3BtreeLeave(pBtree);
2454}
2455
2456/*
2457** Rollback the transaction in progress. All cursors will be
2458** invalided by this operation. Any attempt to use a cursor
2459** that was open at the beginning of this operation will result
2460** in an error.
2461**
2462** This will release the write lock on the database file. If there
2463** are no active cursors, it also releases the read lock.
2464*/
2465int sqlite3BtreeRollback(Btree *p){
2466 int rc;
2467 BtShared *pBt = p->pBt;
2468 MemPage *pPage1;
2469
2470 sqlite3BtreeEnter(p);
2471 rc = saveAllCursors(pBt, 0, 0);
2472#ifndef SQLITE_OMIT_SHARED_CACHE
2473 if( rc!=SQLITE_OK ){
2474 /* This is a horrible situation. An IO or malloc() error occured whilst
2475 ** trying to save cursor positions. If this is an automatic rollback (as
2476 ** the result of a constraint, malloc() failure or IO error) then
2477 ** the cache may be internally inconsistent (not contain valid trees) so
2478 ** we cannot simply return the error to the caller. Instead, abort
2479 ** all queries that may be using any of the cursors that failed to save.
2480 */
2481 sqlite3BtreeTripAllCursors(p, rc);
2482 }
2483#endif
2484 btreeIntegrity(p);
2485 unlockAllTables(p);
2486
2487 if( p->inTrans==TRANS_WRITE ){
2488 int rc2;
2489
2490#ifndef SQLITE_OMIT_AUTOVACUUM
2491 pBt->nTrunc = 0;
2492#endif
2493
2494 assert( TRANS_WRITE==pBt->inTransaction );
2495 rc2 = sqlite3PagerRollback(pBt->pPager);
2496 if( rc2!=SQLITE_OK ){
2497 rc = rc2;
2498 }
2499
2500 /* The rollback may have destroyed the pPage1->aData value. So
2501 ** call sqlite3BtreeGetPage() on page 1 again to make
2502 ** sure pPage1->aData is set correctly. */
2503 if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
2504 releasePage(pPage1);
2505 }
2506 assert( countWriteCursors(pBt)==0 );
2507 pBt->inTransaction = TRANS_READ;
2508 }
2509
2510 if( p->inTrans!=TRANS_NONE ){
2511 assert( pBt->nTransaction>0 );
2512 pBt->nTransaction--;
2513 if( 0==pBt->nTransaction ){
2514 pBt->inTransaction = TRANS_NONE;
2515 }
2516 }
2517
2518 p->inTrans = TRANS_NONE;
2519 pBt->inStmt = 0;
2520 unlockBtreeIfUnused(pBt);
2521
2522 btreeIntegrity(p);
2523 sqlite3BtreeLeave(p);
2524 return rc;
2525}
2526
2527/*
2528** Start a statement subtransaction. The subtransaction can
2529** can be rolled back independently of the main transaction.
2530** You must start a transaction before starting a subtransaction.
2531** The subtransaction is ended automatically if the main transaction
2532** commits or rolls back.
2533**
2534** Only one subtransaction may be active at a time. It is an error to try
2535** to start a new subtransaction if another subtransaction is already active.
2536**
2537** Statement subtransactions are used around individual SQL statements
2538** that are contained within a BEGIN...COMMIT block. If a constraint
2539** error occurs within the statement, the effect of that one statement
2540** can be rolled back without having to rollback the entire transaction.
2541*/
2542int sqlite3BtreeBeginStmt(Btree *p){
2543 int rc;
2544 BtShared *pBt = p->pBt;
2545 sqlite3BtreeEnter(p);
2546 if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
2547 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2548 }else{
2549 assert( pBt->inTransaction==TRANS_WRITE );
2550 rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
2551 pBt->inStmt = 1;
2552 }
2553 sqlite3BtreeLeave(p);
2554 return rc;
2555}
2556
2557
2558/*
2559** Commit the statment subtransaction currently in progress. If no
2560** subtransaction is active, this is a no-op.
2561*/
2562int sqlite3BtreeCommitStmt(Btree *p){
2563 int rc;
2564 BtShared *pBt = p->pBt;
2565 sqlite3BtreeEnter(p);
2566 if( pBt->inStmt && !pBt->readOnly ){
2567 rc = sqlite3PagerStmtCommit(pBt->pPager);
2568 }else{
2569 rc = SQLITE_OK;
2570 }
2571 pBt->inStmt = 0;
2572 sqlite3BtreeLeave(p);
2573 return rc;
2574}
2575
2576/*
2577** Rollback the active statement subtransaction. If no subtransaction
2578** is active this routine is a no-op.
2579**
2580** All cursors will be invalidated by this operation. Any attempt
2581** to use a cursor that was open at the beginning of this operation
2582** will result in an error.
2583*/
2584int sqlite3BtreeRollbackStmt(Btree *p){
2585 int rc = SQLITE_OK;
2586 BtShared *pBt = p->pBt;
2587 sqlite3BtreeEnter(p);
2588 if( pBt->inStmt && !pBt->readOnly ){
2589 rc = sqlite3PagerStmtRollback(pBt->pPager);
2590 assert( countWriteCursors(pBt)==0 );
2591 pBt->inStmt = 0;
2592 }
2593 sqlite3BtreeLeave(p);
2594 return rc;
2595}
2596
2597/*
2598** Default key comparison function to be used if no comparison function
2599** is specified on the sqlite3BtreeCursor() call.
2600*/
2601static int dfltCompare(
2602 void *NotUsed, /* User data is not used */
2603 int n1, const void *p1, /* First key to compare */
2604 int n2, const void *p2 /* Second key to compare */
2605){
2606 int c;
2607 c = memcmp(p1, p2, n1<n2 ? n1 : n2);
2608 if( c==0 ){
2609 c = n1 - n2;
2610 }
2611 return c;
2612}
2613
2614/*
2615** Create a new cursor for the BTree whose root is on the page
2616** iTable. The act of acquiring a cursor gets a read lock on
2617** the database file.
2618**
2619** If wrFlag==0, then the cursor can only be used for reading.
2620** If wrFlag==1, then the cursor can be used for reading or for
2621** writing if other conditions for writing are also met. These
2622** are the conditions that must be met in order for writing to
2623** be allowed:
2624**
2625** 1: The cursor must have been opened with wrFlag==1
2626**
2627** 2: Other database connections that share the same pager cache
2628** but which are not in the READ_UNCOMMITTED state may not have
2629** cursors open with wrFlag==0 on the same table. Otherwise
2630** the changes made by this write cursor would be visible to
2631** the read cursors in the other database connection.
2632**
2633** 3: The database must be writable (not on read-only media)
2634**
2635** 4: There must be an active transaction.
2636**
2637** No checking is done to make sure that page iTable really is the
2638** root page of a b-tree. If it is not, then the cursor acquired
2639** will not work correctly.
2640**
2641** The comparison function must be logically the same for every cursor
2642** on a particular table. Changing the comparison function will result
2643** in incorrect operations. If the comparison function is NULL, a
2644** default comparison function is used. The comparison function is
2645** always ignored for INTKEY tables.
2646*/
2647static int btreeCursor(
2648 Btree *p, /* The btree */
2649 int iTable, /* Root page of table to open */
2650 int wrFlag, /* 1 to write. 0 read-only */
2651 int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
2652 void *pArg, /* First arg to xCompare() */
2653 BtCursor **ppCur /* Write new cursor here */
2654){
2655 int rc;
2656 BtCursor *pCur;
2657 BtShared *pBt = p->pBt;
2658
2659 assert( sqlite3BtreeHoldsMutex(p) );
2660 *ppCur = 0;
2661 if( wrFlag ){
2662 if( pBt->readOnly ){
2663 return SQLITE_READONLY;
2664 }
2665 if( checkReadLocks(p, iTable, 0) ){
2666 return SQLITE_LOCKED;
2667 }
2668 }
2669
2670 if( pBt->pPage1==0 ){
2671 rc = lockBtreeWithRetry(p);
2672 if( rc!=SQLITE_OK ){
2673 return rc;
2674 }
2675 if( pBt->readOnly && wrFlag ){
2676 return SQLITE_READONLY;
2677 }
2678 }
2679 pCur = sqlite3MallocZero( sizeof(*pCur) );
2680 if( pCur==0 ){
2681 rc = SQLITE_NOMEM;
2682 goto create_cursor_exception;
2683 }
2684 pCur->pgnoRoot = (Pgno)iTable;
2685 if( iTable==1 && sqlite3PagerPagecount(pBt->pPager)==0 ){
2686 rc = SQLITE_EMPTY;
2687 goto create_cursor_exception;
2688 }
2689 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0);
2690 if( rc!=SQLITE_OK ){
2691 goto create_cursor_exception;
2692 }
2693
2694 /* Now that no other errors can occur, finish filling in the BtCursor
2695 ** variables, link the cursor into the BtShared list and set *ppCur (the
2696 ** output argument to this function).
2697 */
2698 pCur->xCompare = xCmp ? xCmp : dfltCompare;
2699 pCur->pArg = pArg;
2700 pCur->pBtree = p;
2701 pCur->pBt = pBt;
2702 pCur->wrFlag = wrFlag;
2703 pCur->pNext = pBt->pCursor;
2704 if( pCur->pNext ){
2705 pCur->pNext->pPrev = pCur;
2706 }
2707 pBt->pCursor = pCur;
2708 pCur->eState = CURSOR_INVALID;
2709 *ppCur = pCur;
2710
2711 return SQLITE_OK;
2712
2713create_cursor_exception:
2714 if( pCur ){
2715 releasePage(pCur->pPage);
2716 sqlite3_free(pCur);
2717 }
2718 unlockBtreeIfUnused(pBt);
2719 return rc;
2720}
2721int sqlite3BtreeCursor(
2722 Btree *p, /* The btree */
2723 int iTable, /* Root page of table to open */
2724 int wrFlag, /* 1 to write. 0 read-only */
2725 int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
2726 void *pArg, /* First arg to xCompare() */
2727 BtCursor **ppCur /* Write new cursor here */
2728){
2729 int rc;
2730 sqlite3BtreeEnter(p);
2731 rc = btreeCursor(p, iTable, wrFlag, xCmp, pArg, ppCur);
2732 sqlite3BtreeLeave(p);
2733 return rc;
2734}
2735
2736
2737/*
2738** Close a cursor. The read lock on the database file is released
2739** when the last cursor is closed.
2740*/
2741int sqlite3BtreeCloseCursor(BtCursor *pCur){
2742 BtShared *pBt = pCur->pBt;
2743 Btree *pBtree = pCur->pBtree;
2744
2745 sqlite3BtreeEnter(pBtree);
2746 clearCursorPosition(pCur);
2747 if( pCur->pPrev ){
2748 pCur->pPrev->pNext = pCur->pNext;
2749 }else{
2750 pBt->pCursor = pCur->pNext;
2751 }
2752 if( pCur->pNext ){
2753 pCur->pNext->pPrev = pCur->pPrev;
2754 }
2755 releasePage(pCur->pPage);
2756 unlockBtreeIfUnused(pBt);
2757 invalidateOverflowCache(pCur);
2758 sqlite3_free(pCur);
2759 sqlite3BtreeLeave(pBtree);
2760 return SQLITE_OK;
2761}
2762
2763/*
2764** Make a temporary cursor by filling in the fields of pTempCur.
2765** The temporary cursor is not on the cursor list for the Btree.
2766*/
2767void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
2768 assert( cursorHoldsMutex(pCur) );
2769 memcpy(pTempCur, pCur, sizeof(*pCur));
2770 pTempCur->pNext = 0;
2771 pTempCur->pPrev = 0;
2772 if( pTempCur->pPage ){
2773 sqlite3PagerRef(pTempCur->pPage->pDbPage);
2774 }
2775}
2776
2777/*
2778** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
2779** function above.
2780*/
2781void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
2782 assert( cursorHoldsMutex(pCur) );
2783 if( pCur->pPage ){
2784 sqlite3PagerUnref(pCur->pPage->pDbPage);
2785 }
2786}
2787
2788/*
2789** Make sure the BtCursor* given in the argument has a valid
2790** BtCursor.info structure. If it is not already valid, call
2791** sqlite3BtreeParseCell() to fill it in.
2792**
2793** BtCursor.info is a cache of the information in the current cell.
2794** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
2795**
2796** 2007-06-25: There is a bug in some versions of MSVC that cause the
2797** compiler to crash when getCellInfo() is implemented as a macro.
2798** But there is a measureable speed advantage to using the macro on gcc
2799** (when less compiler optimizations like -Os or -O0 are used and the
2800** compiler is not doing agressive inlining.) So we use a real function
2801** for MSVC and a macro for everything else. Ticket #2457.
2802*/
2803#ifndef NDEBUG
2804 static void assertCellInfo(BtCursor *pCur){
2805 CellInfo info;
2806 memset(&info, 0, sizeof(info));
2807 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &info);
2808 assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
2809 }
2810#else
2811 #define assertCellInfo(x)
2812#endif
2813#ifdef _MSC_VER
2814 /* Use a real function in MSVC to work around bugs in that compiler. */
2815 static void getCellInfo(BtCursor *pCur){
2816 if( pCur->info.nSize==0 ){
2817 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info);
2818 }else{
2819 assertCellInfo(pCur);
2820 }
2821 }
2822#else /* if not _MSC_VER */
2823 /* Use a macro in all other compilers so that the function is inlined */
2824#define getCellInfo(pCur) \
2825 if( pCur->info.nSize==0 ){ \
2826 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info); \
2827 }else{ \
2828 assertCellInfo(pCur); \
2829 }
2830#endif /* _MSC_VER */
2831
2832/*
2833** Set *pSize to the size of the buffer needed to hold the value of
2834** the key for the current entry. If the cursor is not pointing
2835** to a valid entry, *pSize is set to 0.
2836**
2837** For a table with the INTKEY flag set, this routine returns the key
2838** itself, not the number of bytes in the key.
2839*/
2840int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
2841 int rc;
2842
2843 assert( cursorHoldsMutex(pCur) );
2844 rc = restoreOrClearCursorPosition(pCur);
2845 if( rc==SQLITE_OK ){
2846 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2847 if( pCur->eState==CURSOR_INVALID ){
2848 *pSize = 0;
2849 }else{
2850 getCellInfo(pCur);
2851 *pSize = pCur->info.nKey;
2852 }
2853 }
2854 return rc;
2855}
2856
2857/*
2858** Set *pSize to the number of bytes of data in the entry the
2859** cursor currently points to. Always return SQLITE_OK.
2860** Failure is not possible. If the cursor is not currently
2861** pointing to an entry (which can happen, for example, if
2862** the database is empty) then *pSize is set to 0.
2863*/
2864int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
2865 int rc;
2866
2867 assert( cursorHoldsMutex(pCur) );
2868 rc = restoreOrClearCursorPosition(pCur);
2869 if( rc==SQLITE_OK ){
2870 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2871 if( pCur->eState==CURSOR_INVALID ){
2872 /* Not pointing at a valid entry - set *pSize to 0. */
2873 *pSize = 0;
2874 }else{
2875 getCellInfo(pCur);
2876 *pSize = pCur->info.nData;
2877 }
2878 }
2879 return rc;
2880}
2881
2882/*
2883** Given the page number of an overflow page in the database (parameter
2884** ovfl), this function finds the page number of the next page in the
2885** linked list of overflow pages. If possible, it uses the auto-vacuum
2886** pointer-map data instead of reading the content of page ovfl to do so.
2887**
2888** If an error occurs an SQLite error code is returned. Otherwise:
2889**
2890** Unless pPgnoNext is NULL, the page number of the next overflow
2891** page in the linked list is written to *pPgnoNext. If page ovfl
2892** is the last page in it's linked list, *pPgnoNext is set to zero.
2893**
2894** If ppPage is not NULL, *ppPage is set to the MemPage* handle
2895** for page ovfl. The underlying pager page may have been requested
2896** with the noContent flag set, so the page data accessable via
2897** this handle may not be trusted.
2898*/
2899static int getOverflowPage(
2900 BtShared *pBt,
2901 Pgno ovfl, /* Overflow page */
2902 MemPage **ppPage, /* OUT: MemPage handle */
2903 Pgno *pPgnoNext /* OUT: Next overflow page number */
2904){
2905 Pgno next = 0;
2906 int rc;
2907
2908 assert( sqlite3_mutex_held(pBt->mutex) );
2909 /* One of these must not be NULL. Otherwise, why call this function? */
2910 assert(ppPage || pPgnoNext);
2911
2912 /* If pPgnoNext is NULL, then this function is being called to obtain
2913 ** a MemPage* reference only. No page-data is required in this case.
2914 */
2915 if( !pPgnoNext ){
2916 return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
2917 }
2918
2919#ifndef SQLITE_OMIT_AUTOVACUUM
2920 /* Try to find the next page in the overflow list using the
2921 ** autovacuum pointer-map pages. Guess that the next page in
2922 ** the overflow list is page number (ovfl+1). If that guess turns
2923 ** out to be wrong, fall back to loading the data of page
2924 ** number ovfl to determine the next page number.
2925 */
2926 if( pBt->autoVacuum ){
2927 Pgno pgno;
2928 Pgno iGuess = ovfl+1;
2929 u8 eType;
2930
2931 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
2932 iGuess++;
2933 }
2934
2935 if( iGuess<=sqlite3PagerPagecount(pBt->pPager) ){
2936 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
2937 if( rc!=SQLITE_OK ){
2938 return rc;
2939 }
2940 if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
2941 next = iGuess;
2942 }
2943 }
2944 }
2945#endif
2946
2947 if( next==0 || ppPage ){
2948 MemPage *pPage = 0;
2949
2950 rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
2951 assert(rc==SQLITE_OK || pPage==0);
2952 if( next==0 && rc==SQLITE_OK ){
2953 next = get4byte(pPage->aData);
2954 }
2955
2956 if( ppPage ){
2957 *ppPage = pPage;
2958 }else{
2959 releasePage(pPage);
2960 }
2961 }
2962 *pPgnoNext = next;
2963
2964 return rc;
2965}
2966
2967/*
2968** Copy data from a buffer to a page, or from a page to a buffer.
2969**
2970** pPayload is a pointer to data stored on database page pDbPage.
2971** If argument eOp is false, then nByte bytes of data are copied
2972** from pPayload to the buffer pointed at by pBuf. If eOp is true,
2973** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
2974** of data are copied from the buffer pBuf to pPayload.
2975**
2976** SQLITE_OK is returned on success, otherwise an error code.
2977*/
2978static int copyPayload(
2979 void *pPayload, /* Pointer to page data */
2980 void *pBuf, /* Pointer to buffer */
2981 int nByte, /* Number of bytes to copy */
2982 int eOp, /* 0 -> copy from page, 1 -> copy to page */
2983 DbPage *pDbPage /* Page containing pPayload */
2984){
2985 if( eOp ){
2986 /* Copy data from buffer to page (a write operation) */
2987 int rc = sqlite3PagerWrite(pDbPage);
2988 if( rc!=SQLITE_OK ){
2989 return rc;
2990 }
2991 memcpy(pPayload, pBuf, nByte);
2992 }else{
2993 /* Copy data from page to buffer (a read operation) */
2994 memcpy(pBuf, pPayload, nByte);
2995 }
2996 return SQLITE_OK;
2997}
2998
2999/*
3000** This function is used to read or overwrite payload information
3001** for the entry that the pCur cursor is pointing to. If the eOp
3002** parameter is 0, this is a read operation (data copied into
3003** buffer pBuf). If it is non-zero, a write (data copied from
3004** buffer pBuf).
3005**
3006** A total of "amt" bytes are read or written beginning at "offset".
3007** Data is read to or from the buffer pBuf.
3008**
3009** This routine does not make a distinction between key and data.
3010** It just reads or writes bytes from the payload area. Data might
3011** appear on the main page or be scattered out on multiple overflow
3012** pages.
3013**
3014** If the BtCursor.isIncrblobHandle flag is set, and the current
3015** cursor entry uses one or more overflow pages, this function
3016** allocates space for and lazily popluates the overflow page-list
3017** cache array (BtCursor.aOverflow). Subsequent calls use this
3018** cache to make seeking to the supplied offset more efficient.
3019**
3020** Once an overflow page-list cache has been allocated, it may be
3021** invalidated if some other cursor writes to the same table, or if
3022** the cursor is moved to a different row. Additionally, in auto-vacuum
3023** mode, the following events may invalidate an overflow page-list cache.
3024**
3025** * An incremental vacuum,
3026** * A commit in auto_vacuum="full" mode,
3027** * Creating a table (may require moving an overflow page).
3028*/
3029static int accessPayload(
3030 BtCursor *pCur, /* Cursor pointing to entry to read from */
3031 int offset, /* Begin reading this far into payload */
3032 int amt, /* Read this many bytes */
3033 unsigned char *pBuf, /* Write the bytes into this buffer */
3034 int skipKey, /* offset begins at data if this is true */
3035 int eOp /* zero to read. non-zero to write. */
3036){
3037 unsigned char *aPayload;
3038 int rc = SQLITE_OK;
3039 u32 nKey;
3040 int iIdx = 0;
3041 MemPage *pPage = pCur->pPage; /* Btree page of current cursor entry */
3042 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */
3043
3044 assert( pPage );
3045 assert( pCur->eState==CURSOR_VALID );
3046 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3047 assert( offset>=0 );
3048 assert( cursorHoldsMutex(pCur) );
3049
3050 getCellInfo(pCur);
3051 aPayload = pCur->info.pCell + pCur->info.nHeader;
3052 nKey = (pPage->intKey ? 0 : pCur->info.nKey);
3053
3054 if( skipKey ){
3055 offset += nKey;
3056 }
3057 if( offset+amt > nKey+pCur->info.nData ){
3058 /* Trying to read or write past the end of the data is an error */
3059 return SQLITE_ERROR;
3060 }
3061
3062 /* Check if data must be read/written to/from the btree page itself. */
3063 if( offset<pCur->info.nLocal ){
3064 int a = amt;
3065 if( a+offset>pCur->info.nLocal ){
3066 a = pCur->info.nLocal - offset;
3067 }
3068 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
3069 offset = 0;
3070 pBuf += a;
3071 amt -= a;
3072 }else{
3073 offset -= pCur->info.nLocal;
3074 }
3075
3076 if( rc==SQLITE_OK && amt>0 ){
3077 const int ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
3078 Pgno nextPage;
3079
3080 nextPage = get4byte(&aPayload[pCur->info.nLocal]);
3081
3082#ifndef SQLITE_OMIT_INCRBLOB
3083 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
3084 ** has not been allocated, allocate it now. The array is sized at
3085 ** one entry for each overflow page in the overflow chain. The
3086 ** page number of the first overflow page is stored in aOverflow[0],
3087 ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3088 ** (the cache is lazily populated).
3089 */
3090 if( pCur->isIncrblobHandle && !pCur->aOverflow ){
3091 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
3092 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
3093 if( nOvfl && !pCur->aOverflow ){
3094 rc = SQLITE_NOMEM;
3095 }
3096 }
3097
3098 /* If the overflow page-list cache has been allocated and the
3099 ** entry for the first required overflow page is valid, skip
3100 ** directly to it.
3101 */
3102 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3103 iIdx = (offset/ovflSize);
3104 nextPage = pCur->aOverflow[iIdx];
3105 offset = (offset%ovflSize);
3106 }
3107#endif
3108
3109 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3110
3111#ifndef SQLITE_OMIT_INCRBLOB
3112 /* If required, populate the overflow page-list cache. */
3113 if( pCur->aOverflow ){
3114 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3115 pCur->aOverflow[iIdx] = nextPage;
3116 }
3117#endif
3118
3119 if( offset>=ovflSize ){
3120 /* The only reason to read this page is to obtain the page
3121 ** number for the next page in the overflow chain. The page
3122 ** data is not required. So first try to lookup the overflow
3123 ** page-list cache, if any, then fall back to the getOverflowPage()
3124 ** function.
3125 */
3126#ifndef SQLITE_OMIT_INCRBLOB
3127 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3128 nextPage = pCur->aOverflow[iIdx+1];
3129 } else
3130#endif
3131 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
3132 offset -= ovflSize;
3133 }else{
3134 /* Need to read this page properly. It contains some of the
3135 ** range of data that is being read (eOp==0) or written (eOp!=0).
3136 */
3137 DbPage *pDbPage;
3138 int a = amt;
3139 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
3140 if( rc==SQLITE_OK ){
3141 aPayload = sqlite3PagerGetData(pDbPage);
3142 nextPage = get4byte(aPayload);
3143 if( a + offset > ovflSize ){
3144 a = ovflSize - offset;
3145 }
3146 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3147 sqlite3PagerUnref(pDbPage);
3148 offset = 0;
3149 amt -= a;
3150 pBuf += a;
3151 }
3152 }
3153 }
3154 }
3155
3156 if( rc==SQLITE_OK && amt>0 ){
3157 return SQLITE_CORRUPT_BKPT;
3158 }
3159 return rc;
3160}
3161
3162/*
3163** Read part of the key associated with cursor pCur. Exactly
3164** "amt" bytes will be transfered into pBuf[]. The transfer
3165** begins at "offset".
3166**
3167** Return SQLITE_OK on success or an error code if anything goes
3168** wrong. An error is returned if "offset+amt" is larger than
3169** the available payload.
3170*/
3171int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3172 int rc;
3173
3174 assert( cursorHoldsMutex(pCur) );
3175 rc = restoreOrClearCursorPosition(pCur);
3176 if( rc==SQLITE_OK ){
3177 assert( pCur->eState==CURSOR_VALID );
3178 assert( pCur->pPage!=0 );
3179 if( pCur->pPage->intKey ){
3180 return SQLITE_CORRUPT_BKPT;
3181 }
3182 assert( pCur->pPage->intKey==0 );
3183 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3184 rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
3185 }
3186 return rc;
3187}
3188
3189/*
3190** Read part of the data associated with cursor pCur. Exactly
3191** "amt" bytes will be transfered into pBuf[]. The transfer
3192** begins at "offset".
3193**
3194** Return SQLITE_OK on success or an error code if anything goes
3195** wrong. An error is returned if "offset+amt" is larger than
3196** the available payload.
3197*/
3198int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3199 int rc;
3200
3201 assert( cursorHoldsMutex(pCur) );
3202 rc = restoreOrClearCursorPosition(pCur);
3203 if( rc==SQLITE_OK ){
3204 assert( pCur->eState==CURSOR_VALID );
3205 assert( pCur->pPage!=0 );
3206 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3207 rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
3208 }
3209 return rc;
3210}
3211
3212/*
3213** Return a pointer to payload information from the entry that the
3214** pCur cursor is pointing to. The pointer is to the beginning of
3215** the key if skipKey==0 and it points to the beginning of data if
3216** skipKey==1. The number of bytes of available key/data is written
3217** into *pAmt. If *pAmt==0, then the value returned will not be
3218** a valid pointer.
3219**
3220** This routine is an optimization. It is common for the entire key
3221** and data to fit on the local page and for there to be no overflow
3222** pages. When that is so, this routine can be used to access the
3223** key and data without making a copy. If the key and/or data spills
3224** onto overflow pages, then accessPayload() must be used to reassembly
3225** the key/data and copy it into a preallocated buffer.
3226**
3227** The pointer returned by this routine looks directly into the cached
3228** page of the database. The data might change or move the next time
3229** any btree routine is called.
3230*/
3231static const unsigned char *fetchPayload(
3232 BtCursor *pCur, /* Cursor pointing to entry to read from */
3233 int *pAmt, /* Write the number of available bytes here */
3234 int skipKey /* read beginning at data if this is true */
3235){
3236 unsigned char *aPayload;
3237 MemPage *pPage;
3238 u32 nKey;
3239 int nLocal;
3240
3241 assert( pCur!=0 && pCur->pPage!=0 );
3242 assert( pCur->eState==CURSOR_VALID );
3243 assert( cursorHoldsMutex(pCur) );
3244 pPage = pCur->pPage;
3245 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3246 getCellInfo(pCur);
3247 aPayload = pCur->info.pCell;
3248 aPayload += pCur->info.nHeader;
3249 if( pPage->intKey ){
3250 nKey = 0;
3251 }else{
3252 nKey = pCur->info.nKey;
3253 }
3254 if( skipKey ){
3255 aPayload += nKey;
3256 nLocal = pCur->info.nLocal - nKey;
3257 }else{
3258 nLocal = pCur->info.nLocal;
3259 if( nLocal>nKey ){
3260 nLocal = nKey;
3261 }
3262 }
3263 *pAmt = nLocal;
3264 return aPayload;
3265}
3266
3267
3268/*
3269** For the entry that cursor pCur is point to, return as
3270** many bytes of the key or data as are available on the local
3271** b-tree page. Write the number of available bytes into *pAmt.
3272**
3273** The pointer returned is ephemeral. The key/data may move
3274** or be destroyed on the next call to any Btree routine,
3275** including calls from other threads against the same cache.
3276** Hence, a mutex on the BtShared should be held prior to calling
3277** this routine.
3278**
3279** These routines is used to get quick access to key and data
3280** in the common case where no overflow pages are used.
3281*/
3282const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
3283 assert( cursorHoldsMutex(pCur) );
3284 if( pCur->eState==CURSOR_VALID ){
3285 return (const void*)fetchPayload(pCur, pAmt, 0);
3286 }
3287 return 0;
3288}
3289const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
3290 assert( cursorHoldsMutex(pCur) );
3291 if( pCur->eState==CURSOR_VALID ){
3292 return (const void*)fetchPayload(pCur, pAmt, 1);
3293 }
3294 return 0;
3295}
3296
3297
3298/*
3299** Move the cursor down to a new child page. The newPgno argument is the
3300** page number of the child page to move to.
3301*/
3302static int moveToChild(BtCursor *pCur, u32 newPgno){
3303 int rc;
3304 MemPage *pNewPage;
3305 MemPage *pOldPage;
3306 BtShared *pBt = pCur->pBt;
3307
3308 assert( cursorHoldsMutex(pCur) );
3309 assert( pCur->eState==CURSOR_VALID );
3310 rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage);
3311 if( rc ) return rc;
3312 pNewPage->idxParent = pCur->idx;
3313 pOldPage = pCur->pPage;
3314 pOldPage->idxShift = 0;
3315 releasePage(pOldPage);
3316 pCur->pPage = pNewPage;
3317 pCur->idx = 0;
3318 pCur->info.nSize = 0;
3319 if( pNewPage->nCell<1 ){
3320 return SQLITE_CORRUPT_BKPT;
3321 }
3322 return SQLITE_OK;
3323}
3324
3325/*
3326** Return true if the page is the virtual root of its table.
3327**
3328** The virtual root page is the root page for most tables. But
3329** for the table rooted on page 1, sometime the real root page
3330** is empty except for the right-pointer. In such cases the
3331** virtual root page is the page that the right-pointer of page
3332** 1 is pointing to.
3333*/
3334int sqlite3BtreeIsRootPage(MemPage *pPage){
3335 MemPage *pParent;
3336
3337 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3338 pParent = pPage->pParent;
3339 if( pParent==0 ) return 1;
3340 if( pParent->pgno>1 ) return 0;
3341 if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1;
3342 return 0;
3343}
3344
3345/*
3346** Move the cursor up to the parent page.
3347**
3348** pCur->idx is set to the cell index that contains the pointer
3349** to the page we are coming from. If we are coming from the
3350** right-most child page then pCur->idx is set to one more than
3351** the largest cell index.
3352*/
3353void sqlite3BtreeMoveToParent(BtCursor *pCur){
3354 MemPage *pParent;
3355 MemPage *pPage;
3356 int idxParent;
3357
3358 assert( cursorHoldsMutex(pCur) );
3359 assert( pCur->eState==CURSOR_VALID );
3360 pPage = pCur->pPage;
3361 assert( pPage!=0 );
3362 assert( !sqlite3BtreeIsRootPage(pPage) );
3363 pParent = pPage->pParent;
3364 assert( pParent!=0 );
3365 idxParent = pPage->idxParent;
3366 sqlite3PagerRef(pParent->pDbPage);
3367 releasePage(pPage);
3368 pCur->pPage = pParent;
3369 pCur->info.nSize = 0;
3370 assert( pParent->idxShift==0 );
3371 pCur->idx = idxParent;
3372}
3373
3374/*
3375** Move the cursor to the root page
3376*/
3377static int moveToRoot(BtCursor *pCur){
3378 MemPage *pRoot;
3379 int rc = SQLITE_OK;
3380 Btree *p = pCur->pBtree;
3381 BtShared *pBt = p->pBt;
3382
3383 assert( cursorHoldsMutex(pCur) );
3384 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
3385 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );
3386 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );
3387 if( pCur->eState>=CURSOR_REQUIRESEEK ){
3388 if( pCur->eState==CURSOR_FAULT ){
3389 return pCur->skip;
3390 }
3391 clearCursorPosition(pCur);
3392 }
3393 pRoot = pCur->pPage;
3394 if( pRoot && pRoot->pgno==pCur->pgnoRoot ){
3395 assert( pRoot->isInit );
3396 }else{
3397 if(
3398 SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0))
3399 ){
3400 pCur->eState = CURSOR_INVALID;
3401 return rc;
3402 }
3403 releasePage(pCur->pPage);
3404 pCur->pPage = pRoot;
3405 }
3406 pCur->idx = 0;
3407 pCur->info.nSize = 0;
3408 if( pRoot->nCell==0 && !pRoot->leaf ){
3409 Pgno subpage;
3410 assert( pRoot->pgno==1 );
3411 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
3412 assert( subpage>0 );
3413 pCur->eState = CURSOR_VALID;
3414 rc = moveToChild(pCur, subpage);
3415 }
3416 pCur->eState = ((pCur->pPage->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
3417 return rc;
3418}
3419
3420/*
3421** Move the cursor down to the left-most leaf entry beneath the
3422** entry to which it is currently pointing.
3423**
3424** The left-most leaf is the one with the smallest key - the first
3425** in ascending order.
3426*/
3427static int moveToLeftmost(BtCursor *pCur){
3428 Pgno pgno;
3429 int rc = SQLITE_OK;
3430 MemPage *pPage;
3431
3432 assert( cursorHoldsMutex(pCur) );
3433 assert( pCur->eState==CURSOR_VALID );
3434 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
3435 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3436 pgno = get4byte(findCell(pPage, pCur->idx));
3437 rc = moveToChild(pCur, pgno);
3438 }
3439 return rc;
3440}
3441
3442/*
3443** Move the cursor down to the right-most leaf entry beneath the
3444** page to which it is currently pointing. Notice the difference
3445** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
3446** finds the left-most entry beneath the *entry* whereas moveToRightmost()
3447** finds the right-most entry beneath the *page*.
3448**
3449** The right-most entry is the one with the largest key - the last
3450** key in ascending order.
3451*/
3452static int moveToRightmost(BtCursor *pCur){
3453 Pgno pgno;
3454 int rc = SQLITE_OK;
3455 MemPage *pPage;
3456
3457 assert( cursorHoldsMutex(pCur) );
3458 assert( pCur->eState==CURSOR_VALID );
3459 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
3460 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3461 pCur->idx = pPage->nCell;
3462 rc = moveToChild(pCur, pgno);
3463 }
3464 if( rc==SQLITE_OK ){
3465 pCur->idx = pPage->nCell - 1;
3466 pCur->info.nSize = 0;
3467 }
3468 return SQLITE_OK;
3469}
3470
3471/* Move the cursor to the first entry in the table. Return SQLITE_OK
3472** on success. Set *pRes to 0 if the cursor actually points to something
3473** or set *pRes to 1 if the table is empty.
3474*/
3475int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
3476 int rc;
3477
3478 assert( cursorHoldsMutex(pCur) );
3479 assert( sqlite3_mutex_held(pCur->pBtree->pSqlite->mutex) );
3480 rc = moveToRoot(pCur);
3481 if( rc==SQLITE_OK ){
3482 if( pCur->eState==CURSOR_INVALID ){
3483 assert( pCur->pPage->nCell==0 );
3484 *pRes = 1;
3485 rc = SQLITE_OK;
3486 }else{
3487 assert( pCur->pPage->nCell>0 );
3488 *pRes = 0;
3489 rc = moveToLeftmost(pCur);
3490 }
3491 }
3492 return rc;
3493}
3494
3495/* Move the cursor to the last entry in the table. Return SQLITE_OK
3496** on success. Set *pRes to 0 if the cursor actually points to something
3497** or set *pRes to 1 if the table is empty.
3498*/
3499int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
3500 int rc;
3501
3502 assert( cursorHoldsMutex(pCur) );
3503 assert( sqlite3_mutex_held(pCur->pBtree->pSqlite->mutex) );
3504 rc = moveToRoot(pCur);
3505 if( rc==SQLITE_OK ){
3506 if( CURSOR_INVALID==pCur->eState ){
3507 assert( pCur->pPage->nCell==0 );
3508 *pRes = 1;
3509 }else{
3510 assert( pCur->eState==CURSOR_VALID );
3511 *pRes = 0;
3512 rc = moveToRightmost(pCur);
3513 }
3514 }
3515 return rc;
3516}
3517
3518/* Move the cursor so that it points to an entry near pKey/nKey.
3519** Return a success code.
3520**
3521** For INTKEY tables, only the nKey parameter is used. pKey is
3522** ignored. For other tables, nKey is the number of bytes of data
3523** in pKey. The comparison function specified when the cursor was
3524** created is used to compare keys.
3525**
3526** If an exact match is not found, then the cursor is always
3527** left pointing at a leaf page which would hold the entry if it
3528** were present. The cursor might point to an entry that comes
3529** before or after the key.
3530**
3531** The result of comparing the key with the entry to which the
3532** cursor is written to *pRes if pRes!=NULL. The meaning of
3533** this value is as follows:
3534**
3535** *pRes<0 The cursor is left pointing at an entry that
3536** is smaller than pKey or if the table is empty
3537** and the cursor is therefore left point to nothing.
3538**
3539** *pRes==0 The cursor is left pointing at an entry that
3540** exactly matches pKey.
3541**
3542** *pRes>0 The cursor is left pointing at an entry that
3543** is larger than pKey.
3544**
3545*/
3546int sqlite3BtreeMoveto(
3547 BtCursor *pCur, /* The cursor to be moved */
3548 const void *pKey, /* The key content for indices. Not used by tables */
3549 i64 nKey, /* Size of pKey. Or the key for tables */
3550 int biasRight, /* If true, bias the search to the high end */
3551 int *pRes /* Search result flag */
3552){
3553 int rc;
3554
3555 assert( cursorHoldsMutex(pCur) );
3556 assert( sqlite3_mutex_held(pCur->pBtree->pSqlite->mutex) );
3557 rc = moveToRoot(pCur);
3558 if( rc ){
3559 return rc;
3560 }
3561 assert( pCur->pPage );
3562 assert( pCur->pPage->isInit );
3563 if( pCur->eState==CURSOR_INVALID ){
3564 *pRes = -1;
3565 assert( pCur->pPage->nCell==0 );
3566 return SQLITE_OK;
3567 }
3568 for(;;){
3569 int lwr, upr;
3570 Pgno chldPg;
3571 MemPage *pPage = pCur->pPage;
3572 int c = -1; /* pRes return if table is empty must be -1 */
3573 lwr = 0;
3574 upr = pPage->nCell-1;
3575 if( !pPage->intKey && pKey==0 ){
3576 return SQLITE_CORRUPT_BKPT;
3577 }
3578 if( biasRight ){
3579 pCur->idx = upr;
3580 }else{
3581 pCur->idx = (upr+lwr)/2;
3582 }
3583 if( lwr<=upr ) for(;;){
3584 void *pCellKey;
3585 i64 nCellKey;
3586 pCur->info.nSize = 0;
3587 if( pPage->intKey ){
3588 u8 *pCell;
3589 pCell = findCell(pPage, pCur->idx) + pPage->childPtrSize;
3590 if( pPage->hasData ){
3591 u32 dummy;
3592 pCell += getVarint32(pCell, &dummy);
3593 }
3594 getVarint(pCell, (u64 *)&nCellKey);
3595 if( nCellKey<nKey ){
3596 c = -1;
3597 }else if( nCellKey>nKey ){
3598 c = +1;
3599 }else{
3600 c = 0;
3601 }
3602 }else{
3603 int available;
3604 pCellKey = (void *)fetchPayload(pCur, &available, 0);
3605 nCellKey = pCur->info.nKey;
3606 if( available>=nCellKey ){
3607 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
3608 }else{
3609 pCellKey = sqlite3_malloc( nCellKey );
3610 if( pCellKey==0 ) return SQLITE_NOMEM;
3611 rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
3612 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
3613 sqlite3_free(pCellKey);
3614 if( rc ){
3615 return rc;
3616 }
3617 }
3618 }
3619 if( c==0 ){
3620 if( pPage->leafData && !pPage->leaf ){
3621 lwr = pCur->idx;
3622 upr = lwr - 1;
3623 break;
3624 }else{
3625 if( pRes ) *pRes = 0;
3626 return SQLITE_OK;
3627 }
3628 }
3629 if( c<0 ){
3630 lwr = pCur->idx+1;
3631 }else{
3632 upr = pCur->idx-1;
3633 }
3634 if( lwr>upr ){
3635 break;
3636 }
3637 pCur->idx = (lwr+upr)/2;
3638 }
3639 assert( lwr==upr+1 );
3640 assert( pPage->isInit );
3641 if( pPage->leaf ){
3642 chldPg = 0;
3643 }else if( lwr>=pPage->nCell ){
3644 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3645 }else{
3646 chldPg = get4byte(findCell(pPage, lwr));
3647 }
3648 if( chldPg==0 ){
3649 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3650 if( pRes ) *pRes = c;
3651 return SQLITE_OK;
3652 }
3653 pCur->idx = lwr;
3654 pCur->info.nSize = 0;
3655 rc = moveToChild(pCur, chldPg);
3656 if( rc ){
3657 return rc;
3658 }
3659 }
3660 /* NOT REACHED */
3661}
3662
3663
3664/*
3665** Return TRUE if the cursor is not pointing at an entry of the table.
3666**
3667** TRUE will be returned after a call to sqlite3BtreeNext() moves
3668** past the last entry in the table or sqlite3BtreePrev() moves past
3669** the first entry. TRUE is also returned if the table is empty.
3670*/
3671int sqlite3BtreeEof(BtCursor *pCur){
3672 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
3673 ** have been deleted? This API will need to change to return an error code
3674 ** as well as the boolean result value.
3675 */
3676 return (CURSOR_VALID!=pCur->eState);
3677}
3678
3679/*
3680** Return the database connection handle for a cursor.
3681*/
3682sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
3683 assert( sqlite3_mutex_held(pCur->pBtree->pSqlite->mutex) );
3684 return pCur->pBtree->pSqlite;
3685}
3686
3687/*
3688** Advance the cursor to the next entry in the database. If
3689** successful then set *pRes=0. If the cursor
3690** was already pointing to the last entry in the database before
3691** this routine was called, then set *pRes=1.
3692*/
3693static int btreeNext(BtCursor *pCur, int *pRes){
3694 int rc;
3695 MemPage *pPage;
3696
3697 assert( cursorHoldsMutex(pCur) );
3698 rc = restoreOrClearCursorPosition(pCur);
3699 if( rc!=SQLITE_OK ){
3700 return rc;
3701 }
3702 assert( pRes!=0 );
3703 pPage = pCur->pPage;
3704 if( CURSOR_INVALID==pCur->eState ){
3705 *pRes = 1;
3706 return SQLITE_OK;
3707 }
3708 if( pCur->skip>0 ){
3709 pCur->skip = 0;
3710 *pRes = 0;
3711 return SQLITE_OK;
3712 }
3713 pCur->skip = 0;
3714
3715 assert( pPage->isInit );
3716 assert( pCur->idx<pPage->nCell );
3717
3718 pCur->idx++;
3719 pCur->info.nSize = 0;
3720 if( pCur->idx>=pPage->nCell ){
3721 if( !pPage->leaf ){
3722 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
3723 if( rc ) return rc;
3724 rc = moveToLeftmost(pCur);
3725 *pRes = 0;
3726 return rc;
3727 }
3728 do{
3729 if( sqlite3BtreeIsRootPage(pPage) ){
3730 *pRes = 1;
3731 pCur->eState = CURSOR_INVALID;
3732 return SQLITE_OK;
3733 }
3734 sqlite3BtreeMoveToParent(pCur);
3735 pPage = pCur->pPage;
3736 }while( pCur->idx>=pPage->nCell );
3737 *pRes = 0;
3738 if( pPage->leafData ){
3739 rc = sqlite3BtreeNext(pCur, pRes);
3740 }else{
3741 rc = SQLITE_OK;
3742 }
3743 return rc;
3744 }
3745 *pRes = 0;
3746 if( pPage->leaf ){
3747 return SQLITE_OK;
3748 }
3749 rc = moveToLeftmost(pCur);
3750 return rc;
3751}
3752int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
3753 int rc;
3754 assert( cursorHoldsMutex(pCur) );
3755 rc = btreeNext(pCur, pRes);
3756 return rc;
3757}
3758
3759
3760/*
3761** Step the cursor to the back to the previous entry in the database. If
3762** successful then set *pRes=0. If the cursor
3763** was already pointing to the first entry in the database before
3764** this routine was called, then set *pRes=1.
3765*/
3766static int btreePrevious(BtCursor *pCur, int *pRes){
3767 int rc;
3768 Pgno pgno;
3769 MemPage *pPage;
3770
3771 assert( cursorHoldsMutex(pCur) );
3772 rc = restoreOrClearCursorPosition(pCur);
3773 if( rc!=SQLITE_OK ){
3774 return rc;
3775 }
3776 if( CURSOR_INVALID==pCur->eState ){
3777 *pRes = 1;
3778 return SQLITE_OK;
3779 }
3780 if( pCur->skip<0 ){
3781 pCur->skip = 0;
3782 *pRes = 0;
3783 return SQLITE_OK;
3784 }
3785 pCur->skip = 0;
3786
3787 pPage = pCur->pPage;
3788 assert( pPage->isInit );
3789 assert( pCur->idx>=0 );
3790 if( !pPage->leaf ){
3791 pgno = get4byte( findCell(pPage, pCur->idx) );
3792 rc = moveToChild(pCur, pgno);
3793 if( rc ){
3794 return rc;
3795 }
3796 rc = moveToRightmost(pCur);
3797 }else{
3798 while( pCur->idx==0 ){
3799 if( sqlite3BtreeIsRootPage(pPage) ){
3800 pCur->eState = CURSOR_INVALID;
3801 *pRes = 1;
3802 return SQLITE_OK;
3803 }
3804 sqlite3BtreeMoveToParent(pCur);
3805 pPage = pCur->pPage;
3806 }
3807 pCur->idx--;
3808 pCur->info.nSize = 0;
3809 if( pPage->leafData && !pPage->leaf ){
3810 rc = sqlite3BtreePrevious(pCur, pRes);
3811 }else{
3812 rc = SQLITE_OK;
3813 }
3814 }
3815 *pRes = 0;
3816 return rc;
3817}
3818int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
3819 int rc;
3820 assert( cursorHoldsMutex(pCur) );
3821 rc = btreePrevious(pCur, pRes);
3822 return rc;
3823}
3824
3825/*
3826** Allocate a new page from the database file.
3827**
3828** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
3829** has already been called on the new page.) The new page has also
3830** been referenced and the calling routine is responsible for calling
3831** sqlite3PagerUnref() on the new page when it is done.
3832**
3833** SQLITE_OK is returned on success. Any other return value indicates
3834** an error. *ppPage and *pPgno are undefined in the event of an error.
3835** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
3836**
3837** If the "nearby" parameter is not 0, then a (feeble) effort is made to
3838** locate a page close to the page number "nearby". This can be used in an
3839** attempt to keep related pages close to each other in the database file,
3840** which in turn can make database access faster.
3841**
3842** If the "exact" parameter is not 0, and the page-number nearby exists
3843** anywhere on the free-list, then it is guarenteed to be returned. This
3844** is only used by auto-vacuum databases when allocating a new table.
3845*/
3846static int allocateBtreePage(
3847 BtShared *pBt,
3848 MemPage **ppPage,
3849 Pgno *pPgno,
3850 Pgno nearby,
3851 u8 exact
3852){
3853 MemPage *pPage1;
3854 int rc;
3855 int n; /* Number of pages on the freelist */
3856 int k; /* Number of leaves on the trunk of the freelist */
3857 MemPage *pTrunk = 0;
3858 MemPage *pPrevTrunk = 0;
3859
3860 assert( sqlite3_mutex_held(pBt->mutex) );
3861 pPage1 = pBt->pPage1;
3862 n = get4byte(&pPage1->aData[36]);
3863 if( n>0 ){
3864 /* There are pages on the freelist. Reuse one of those pages. */
3865 Pgno iTrunk;
3866 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
3867
3868 /* If the 'exact' parameter was true and a query of the pointer-map
3869 ** shows that the page 'nearby' is somewhere on the free-list, then
3870 ** the entire-list will be searched for that page.
3871 */
3872#ifndef SQLITE_OMIT_AUTOVACUUM
3873 if( exact && nearby<=sqlite3PagerPagecount(pBt->pPager) ){
3874 u8 eType;
3875 assert( nearby>0 );
3876 assert( pBt->autoVacuum );
3877 rc = ptrmapGet(pBt, nearby, &eType, 0);
3878 if( rc ) return rc;
3879 if( eType==PTRMAP_FREEPAGE ){
3880 searchList = 1;
3881 }
3882 *pPgno = nearby;
3883 }
3884#endif
3885
3886 /* Decrement the free-list count by 1. Set iTrunk to the index of the
3887 ** first free-list trunk page. iPrevTrunk is initially 1.
3888 */
3889 rc = sqlite3PagerWrite(pPage1->pDbPage);
3890 if( rc ) return rc;
3891 put4byte(&pPage1->aData[36], n-1);
3892
3893 /* The code within this loop is run only once if the 'searchList' variable
3894 ** is not true. Otherwise, it runs once for each trunk-page on the
3895 ** free-list until the page 'nearby' is located.
3896 */
3897 do {
3898 pPrevTrunk = pTrunk;
3899 if( pPrevTrunk ){
3900 iTrunk = get4byte(&pPrevTrunk->aData[0]);
3901 }else{
3902 iTrunk = get4byte(&pPage1->aData[32]);
3903 }
3904 rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
3905 if( rc ){
3906 pTrunk = 0;
3907 goto end_allocate_page;
3908 }
3909
3910 k = get4byte(&pTrunk->aData[4]);
3911 if( k==0 && !searchList ){
3912 /* The trunk has no leaves and the list is not being searched.
3913 ** So extract the trunk page itself and use it as the newly
3914 ** allocated page */
3915 assert( pPrevTrunk==0 );
3916 rc = sqlite3PagerWrite(pTrunk->pDbPage);
3917 if( rc ){
3918 goto end_allocate_page;
3919 }
3920 *pPgno = iTrunk;
3921 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
3922 *ppPage = pTrunk;
3923 pTrunk = 0;
3924 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
3925 }else if( k>pBt->usableSize/4 - 8 ){
3926 /* Value of k is out of range. Database corruption */
3927 rc = SQLITE_CORRUPT_BKPT;
3928 goto end_allocate_page;
3929#ifndef SQLITE_OMIT_AUTOVACUUM
3930 }else if( searchList && nearby==iTrunk ){
3931 /* The list is being searched and this trunk page is the page
3932 ** to allocate, regardless of whether it has leaves.
3933 */
3934 assert( *pPgno==iTrunk );
3935 *ppPage = pTrunk;
3936 searchList = 0;
3937 rc = sqlite3PagerWrite(pTrunk->pDbPage);
3938 if( rc ){
3939 goto end_allocate_page;
3940 }
3941 if( k==0 ){
3942 if( !pPrevTrunk ){
3943 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
3944 }else{
3945 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
3946 }
3947 }else{
3948 /* The trunk page is required by the caller but it contains
3949 ** pointers to free-list leaves. The first leaf becomes a trunk
3950 ** page in this case.
3951 */
3952 MemPage *pNewTrunk;
3953 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
3954 rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
3955 if( rc!=SQLITE_OK ){
3956 goto end_allocate_page;
3957 }
3958 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
3959 if( rc!=SQLITE_OK ){
3960 releasePage(pNewTrunk);
3961 goto end_allocate_page;
3962 }
3963 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
3964 put4byte(&pNewTrunk->aData[4], k-1);
3965 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
3966 releasePage(pNewTrunk);
3967 if( !pPrevTrunk ){
3968 put4byte(&pPage1->aData[32], iNewTrunk);
3969 }else{
3970 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
3971 if( rc ){
3972 goto end_allocate_page;
3973 }
3974 put4byte(&pPrevTrunk->aData[0], iNewTrunk);
3975 }
3976 }
3977 pTrunk = 0;
3978 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
3979#endif
3980 }else{
3981 /* Extract a leaf from the trunk */
3982 int closest;
3983 Pgno iPage;
3984 unsigned char *aData = pTrunk->aData;
3985 rc = sqlite3PagerWrite(pTrunk->pDbPage);
3986 if( rc ){
3987 goto end_allocate_page;
3988 }
3989 if( nearby>0 ){
3990 int i, dist;
3991 closest = 0;
3992 dist = get4byte(&aData[8]) - nearby;
3993 if( dist<0 ) dist = -dist;
3994 for(i=1; i<k; i++){
3995 int d2 = get4byte(&aData[8+i*4]) - nearby;
3996 if( d2<0 ) d2 = -d2;
3997 if( d2<dist ){
3998 closest = i;
3999 dist = d2;
4000 }
4001 }
4002 }else{
4003 closest = 0;
4004 }
4005
4006 iPage = get4byte(&aData[8+closest*4]);
4007 if( !searchList || iPage==nearby ){
4008 *pPgno = iPage;
4009 if( *pPgno>sqlite3PagerPagecount(pBt->pPager) ){
4010 /* Free page off the end of the file */
4011 return SQLITE_CORRUPT_BKPT;
4012 }
4013 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4014 ": %d more free pages\n",
4015 *pPgno, closest+1, k, pTrunk->pgno, n-1));
4016 if( closest<k-1 ){
4017 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4018 }
4019 put4byte(&aData[4], k-1);
4020 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
4021 if( rc==SQLITE_OK ){
4022 sqlite3PagerDontRollback((*ppPage)->pDbPage);
4023 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4024 if( rc!=SQLITE_OK ){
4025 releasePage(*ppPage);
4026 }
4027 }
4028 searchList = 0;
4029 }
4030 }
4031 releasePage(pPrevTrunk);
4032 pPrevTrunk = 0;
4033 }while( searchList );
4034 }else{
4035 /* There are no pages on the freelist, so create a new page at the
4036 ** end of the file */
4037 *pPgno = sqlite3PagerPagecount(pBt->pPager) + 1;
4038
4039#ifndef SQLITE_OMIT_AUTOVACUUM
4040 if( pBt->nTrunc ){
4041 /* An incr-vacuum has already run within this transaction. So the
4042 ** page to allocate is not from the physical end of the file, but
4043 ** at pBt->nTrunc.
4044 */
4045 *pPgno = pBt->nTrunc+1;
4046 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
4047 (*pPgno)++;
4048 }
4049 }
4050 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
4051 /* If *pPgno refers to a pointer-map page, allocate two new pages
4052 ** at the end of the file instead of one. The first allocated page
4053 ** becomes a new pointer-map page, the second is used by the caller.
4054 */
4055 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
4056 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4057 (*pPgno)++;
4058 }
4059 if( pBt->nTrunc ){
4060 pBt->nTrunc = *pPgno;
4061 }
4062#endif
4063
4064 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4065 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
4066 if( rc ) return rc;
4067 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4068 if( rc!=SQLITE_OK ){
4069 releasePage(*ppPage);
4070 }
4071 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
4072 }
4073
4074 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4075
4076end_allocate_page:
4077 releasePage(pTrunk);
4078 releasePage(pPrevTrunk);
4079 return rc;
4080}
4081
4082/*
4083** Add a page of the database file to the freelist.
4084**
4085** sqlite3PagerUnref() is NOT called for pPage.
4086*/
4087static int freePage(MemPage *pPage){
4088 BtShared *pBt = pPage->pBt;
4089 MemPage *pPage1 = pBt->pPage1;
4090 int rc, n, k;
4091
4092 /* Prepare the page for freeing */
4093 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4094 assert( pPage->pgno>1 );
4095 pPage->isInit = 0;
4096 releasePage(pPage->pParent);
4097 pPage->pParent = 0;
4098
4099 /* Increment the free page count on pPage1 */
4100 rc = sqlite3PagerWrite(pPage1->pDbPage);
4101 if( rc ) return rc;
4102 n = get4byte(&pPage1->aData[36]);
4103 put4byte(&pPage1->aData[36], n+1);
4104
4105#ifdef SQLITE_SECURE_DELETE
4106 /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
4107 ** always fully overwrite deleted information with zeros.
4108 */
4109 rc = sqlite3PagerWrite(pPage->pDbPage);
4110 if( rc ) return rc;
4111 memset(pPage->aData, 0, pPage->pBt->pageSize);
4112#endif
4113
4114#ifndef SQLITE_OMIT_AUTOVACUUM
4115 /* If the database supports auto-vacuum, write an entry in the pointer-map
4116 ** to indicate that the page is free.
4117 */
4118 if( pBt->autoVacuum ){
4119 rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
4120 if( rc ) return rc;
4121 }
4122#endif
4123
4124 if( n==0 ){
4125 /* This is the first free page */
4126 rc = sqlite3PagerWrite(pPage->pDbPage);
4127 if( rc ) return rc;
4128 memset(pPage->aData, 0, 8);
4129 put4byte(&pPage1->aData[32], pPage->pgno);
4130 TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
4131 }else{
4132 /* Other free pages already exist. Retrive the first trunk page
4133 ** of the freelist and find out how many leaves it has. */
4134 MemPage *pTrunk;
4135 rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
4136 if( rc ) return rc;
4137 k = get4byte(&pTrunk->aData[4]);
4138 if( k>=pBt->usableSize/4 - 8 ){
4139 /* The trunk is full. Turn the page being freed into a new
4140 ** trunk page with no leaves. */
4141 rc = sqlite3PagerWrite(pPage->pDbPage);
4142 if( rc==SQLITE_OK ){
4143 put4byte(pPage->aData, pTrunk->pgno);
4144 put4byte(&pPage->aData[4], 0);
4145 put4byte(&pPage1->aData[32], pPage->pgno);
4146 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
4147 pPage->pgno, pTrunk->pgno));
4148 }
4149 }else if( k<0 ){
4150 rc = SQLITE_CORRUPT;
4151 }else{
4152 /* Add the newly freed page as a leaf on the current trunk */
4153 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4154 if( rc==SQLITE_OK ){
4155 put4byte(&pTrunk->aData[4], k+1);
4156 put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
4157#ifndef SQLITE_SECURE_DELETE
4158 sqlite3PagerDontWrite(pPage->pDbPage);
4159#endif
4160 }
4161 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
4162 }
4163 releasePage(pTrunk);
4164 }
4165 return rc;
4166}
4167
4168/*
4169** Free any overflow pages associated with the given Cell.
4170*/
4171static int clearCell(MemPage *pPage, unsigned char *pCell){
4172 BtShared *pBt = pPage->pBt;
4173 CellInfo info;
4174 Pgno ovflPgno;
4175 int rc;
4176 int nOvfl;
4177 int ovflPageSize;
4178
4179 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4180 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4181 if( info.iOverflow==0 ){
4182 return SQLITE_OK; /* No overflow pages. Return without doing anything */
4183 }
4184 ovflPgno = get4byte(&pCell[info.iOverflow]);
4185 ovflPageSize = pBt->usableSize - 4;
4186 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
4187 assert( ovflPgno==0 || nOvfl>0 );
4188 while( nOvfl-- ){
4189 MemPage *pOvfl;
4190 if( ovflPgno==0 || ovflPgno>sqlite3PagerPagecount(pBt->pPager) ){
4191 return SQLITE_CORRUPT_BKPT;
4192 }
4193
4194 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
4195 if( rc ) return rc;
4196 rc = freePage(pOvfl);
4197 sqlite3PagerUnref(pOvfl->pDbPage);
4198 if( rc ) return rc;
4199 }
4200 return SQLITE_OK;
4201}
4202
4203/*
4204** Create the byte sequence used to represent a cell on page pPage
4205** and write that byte sequence into pCell[]. Overflow pages are
4206** allocated and filled in as necessary. The calling procedure
4207** is responsible for making sure sufficient space has been allocated
4208** for pCell[].
4209**
4210** Note that pCell does not necessary need to point to the pPage->aData
4211** area. pCell might point to some temporary storage. The cell will
4212** be constructed in this temporary area then copied into pPage->aData
4213** later.
4214*/
4215static int fillInCell(
4216 MemPage *pPage, /* The page that contains the cell */
4217 unsigned char *pCell, /* Complete text of the cell */
4218 const void *pKey, i64 nKey, /* The key */
4219 const void *pData,int nData, /* The data */
4220 int nZero, /* Extra zero bytes to append to pData */
4221 int *pnSize /* Write cell size here */
4222){
4223 int nPayload;
4224 const u8 *pSrc;
4225 int nSrc, n, rc;
4226 int spaceLeft;
4227 MemPage *pOvfl = 0;
4228 MemPage *pToRelease = 0;
4229 unsigned char *pPrior;
4230 unsigned char *pPayload;
4231 BtShared *pBt = pPage->pBt;
4232 Pgno pgnoOvfl = 0;
4233 int nHeader;
4234 CellInfo info;
4235
4236 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4237
4238 /* Fill in the header. */
4239 nHeader = 0;
4240 if( !pPage->leaf ){
4241 nHeader += 4;
4242 }
4243 if( pPage->hasData ){
4244 nHeader += putVarint(&pCell[nHeader], nData+nZero);
4245 }else{
4246 nData = nZero = 0;
4247 }
4248 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
4249 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4250 assert( info.nHeader==nHeader );
4251 assert( info.nKey==nKey );
4252 assert( info.nData==nData+nZero );
4253
4254 /* Fill in the payload */
4255 nPayload = nData + nZero;
4256 if( pPage->intKey ){
4257 pSrc = pData;
4258 nSrc = nData;
4259 nData = 0;
4260 }else{
4261 nPayload += nKey;
4262 pSrc = pKey;
4263 nSrc = nKey;
4264 }
4265 *pnSize = info.nSize;
4266 spaceLeft = info.nLocal;
4267 pPayload = &pCell[nHeader];
4268 pPrior = &pCell[info.iOverflow];
4269
4270 while( nPayload>0 ){
4271 if( spaceLeft==0 ){
4272 int isExact = 0;
4273#ifndef SQLITE_OMIT_AUTOVACUUM
4274 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
4275 if( pBt->autoVacuum ){
4276 do{
4277 pgnoOvfl++;
4278 } while(
4279 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
4280 );
4281 if( pgnoOvfl>1 ){
4282 /* isExact = 1; */
4283 }
4284 }
4285#endif
4286 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
4287#ifndef SQLITE_OMIT_AUTOVACUUM
4288 /* If the database supports auto-vacuum, and the second or subsequent
4289 ** overflow page is being allocated, add an entry to the pointer-map
4290 ** for that page now.
4291 **
4292 ** If this is the first overflow page, then write a partial entry
4293 ** to the pointer-map. If we write nothing to this pointer-map slot,
4294 ** then the optimistic overflow chain processing in clearCell()
4295 ** may misinterpret the uninitialised values and delete the
4296 ** wrong pages from the database.
4297 */
4298 if( pBt->autoVacuum && rc==SQLITE_OK ){
4299 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
4300 rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
4301 if( rc ){
4302 releasePage(pOvfl);
4303 }
4304 }
4305#endif
4306 if( rc ){
4307 releasePage(pToRelease);
4308 return rc;
4309 }
4310 put4byte(pPrior, pgnoOvfl);
4311 releasePage(pToRelease);
4312 pToRelease = pOvfl;
4313 pPrior = pOvfl->aData;
4314 put4byte(pPrior, 0);
4315 pPayload = &pOvfl->aData[4];
4316 spaceLeft = pBt->usableSize - 4;
4317 }
4318 n = nPayload;
4319 if( n>spaceLeft ) n = spaceLeft;
4320 if( nSrc>0 ){
4321 if( n>nSrc ) n = nSrc;
4322 assert( pSrc );
4323 memcpy(pPayload, pSrc, n);
4324 }else{
4325 memset(pPayload, 0, n);
4326 }
4327 nPayload -= n;
4328 pPayload += n;
4329 pSrc += n;
4330 nSrc -= n;
4331 spaceLeft -= n;
4332 if( nSrc==0 ){
4333 nSrc = nData;
4334 pSrc = pData;
4335 }
4336 }
4337 releasePage(pToRelease);
4338 return SQLITE_OK;
4339}
4340
4341/*
4342** Change the MemPage.pParent pointer on the page whose number is
4343** given in the second argument so that MemPage.pParent holds the
4344** pointer in the third argument.
4345*/
4346static int reparentPage(BtShared *pBt, Pgno pgno, MemPage *pNewParent, int idx){
4347 MemPage *pThis;
4348 DbPage *pDbPage;
4349
4350 assert( sqlite3_mutex_held(pBt->mutex) );
4351 assert( pNewParent!=0 );
4352 if( pgno==0 ) return SQLITE_OK;
4353 assert( pBt->pPager!=0 );
4354 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
4355 if( pDbPage ){
4356 pThis = (MemPage *)sqlite3PagerGetExtra(pDbPage);
4357 if( pThis->isInit ){
4358 assert( pThis->aData==sqlite3PagerGetData(pDbPage) );
4359 if( pThis->pParent!=pNewParent ){
4360 if( pThis->pParent ) sqlite3PagerUnref(pThis->pParent->pDbPage);
4361 pThis->pParent = pNewParent;
4362 sqlite3PagerRef(pNewParent->pDbPage);
4363 }
4364 pThis->idxParent = idx;
4365 }
4366 sqlite3PagerUnref(pDbPage);
4367 }
4368
4369#ifndef SQLITE_OMIT_AUTOVACUUM
4370 if( pBt->autoVacuum ){
4371 return ptrmapPut(pBt, pgno, PTRMAP_BTREE, pNewParent->pgno);
4372 }
4373#endif
4374 return SQLITE_OK;
4375}
4376
4377
4378
4379/*
4380** Change the pParent pointer of all children of pPage to point back
4381** to pPage.
4382**
4383** In other words, for every child of pPage, invoke reparentPage()
4384** to make sure that each child knows that pPage is its parent.
4385**
4386** This routine gets called after you memcpy() one page into
4387** another.
4388*/
4389static int reparentChildPages(MemPage *pPage){
4390 int i;
4391 BtShared *pBt = pPage->pBt;
4392 int rc = SQLITE_OK;
4393
4394 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4395 if( pPage->leaf ) return SQLITE_OK;
4396
4397 for(i=0; i<pPage->nCell; i++){
4398 u8 *pCell = findCell(pPage, i);
4399 if( !pPage->leaf ){
4400 rc = reparentPage(pBt, get4byte(pCell), pPage, i);
4401 if( rc!=SQLITE_OK ) return rc;
4402 }
4403 }
4404 if( !pPage->leaf ){
4405 rc = reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+8]),
4406 pPage, i);
4407 pPage->idxShift = 0;
4408 }
4409 return rc;
4410}
4411
4412/*
4413** Remove the i-th cell from pPage. This routine effects pPage only.
4414** The cell content is not freed or deallocated. It is assumed that
4415** the cell content has been copied someplace else. This routine just
4416** removes the reference to the cell from pPage.
4417**
4418** "sz" must be the number of bytes in the cell.
4419*/
4420static void dropCell(MemPage *pPage, int idx, int sz){
4421 int i; /* Loop counter */
4422 int pc; /* Offset to cell content of cell being deleted */
4423 u8 *data; /* pPage->aData */
4424 u8 *ptr; /* Used to move bytes around within data[] */
4425
4426 assert( idx>=0 && idx<pPage->nCell );
4427 assert( sz==cellSize(pPage, idx) );
4428 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4429 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4430 data = pPage->aData;
4431 ptr = &data[pPage->cellOffset + 2*idx];
4432 pc = get2byte(ptr);
4433 assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
4434 freeSpace(pPage, pc, sz);
4435 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
4436 ptr[0] = ptr[2];
4437 ptr[1] = ptr[3];
4438 }
4439 pPage->nCell--;
4440 put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
4441 pPage->nFree += 2;
4442 pPage->idxShift = 1;
4443}
4444
4445/*
4446** Insert a new cell on pPage at cell index "i". pCell points to the
4447** content of the cell.
4448**
4449** If the cell content will fit on the page, then put it there. If it
4450** will not fit, then make a copy of the cell content into pTemp if
4451** pTemp is not null. Regardless of pTemp, allocate a new entry
4452** in pPage->aOvfl[] and make it point to the cell content (either
4453** in pTemp or the original pCell) and also record its index.
4454** Allocating a new entry in pPage->aCell[] implies that
4455** pPage->nOverflow is incremented.
4456**
4457** If nSkip is non-zero, then do not copy the first nSkip bytes of the
4458** cell. The caller will overwrite them after this function returns. If
4459** nSkip is non-zero, then pCell may not point to an invalid memory location
4460** (but pCell+nSkip is always valid).
4461*/
4462static int insertCell(
4463 MemPage *pPage, /* Page into which we are copying */
4464 int i, /* New cell becomes the i-th cell of the page */
4465 u8 *pCell, /* Content of the new cell */
4466 int sz, /* Bytes of content in pCell */
4467 u8 *pTemp, /* Temp storage space for pCell, if needed */
4468 u8 nSkip /* Do not write the first nSkip bytes of the cell */
4469){
4470 int idx; /* Where to write new cell content in data[] */
4471 int j; /* Loop counter */
4472 int top; /* First byte of content for any cell in data[] */
4473 int end; /* First byte past the last cell pointer in data[] */
4474 int ins; /* Index in data[] where new cell pointer is inserted */
4475 int hdr; /* Offset into data[] of the page header */
4476 int cellOffset; /* Address of first cell pointer in data[] */
4477 u8 *data; /* The content of the whole page */
4478 u8 *ptr; /* Used for moving information around in data[] */
4479
4480 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
4481 assert( sz==cellSizePtr(pPage, pCell) );
4482 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4483 if( pPage->nOverflow || sz+2>pPage->nFree ){
4484 if( pTemp ){
4485 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
4486 pCell = pTemp;
4487 }
4488 j = pPage->nOverflow++;
4489 assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
4490 pPage->aOvfl[j].pCell = pCell;
4491 pPage->aOvfl[j].idx = i;
4492 pPage->nFree = 0;
4493 }else{
4494 int rc = sqlite3PagerWrite(pPage->pDbPage);
4495 if( rc!=SQLITE_OK ){
4496 return rc;
4497 }
4498 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4499 data = pPage->aData;
4500 hdr = pPage->hdrOffset;
4501 top = get2byte(&data[hdr+5]);
4502 cellOffset = pPage->cellOffset;
4503 end = cellOffset + 2*pPage->nCell + 2;
4504 ins = cellOffset + 2*i;
4505 if( end > top - sz ){
4506 rc = defragmentPage(pPage);
4507 if( rc!=SQLITE_OK ) return rc;
4508 top = get2byte(&data[hdr+5]);
4509 assert( end + sz <= top );
4510 }
4511 idx = allocateSpace(pPage, sz);
4512 assert( idx>0 );
4513 assert( end <= get2byte(&data[hdr+5]) );
4514 pPage->nCell++;
4515 pPage->nFree -= 2;
4516 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
4517 for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
4518 ptr[0] = ptr[-2];
4519 ptr[1] = ptr[-1];
4520 }
4521 put2byte(&data[ins], idx);
4522 put2byte(&data[hdr+3], pPage->nCell);
4523 pPage->idxShift = 1;
4524#ifndef SQLITE_OMIT_AUTOVACUUM
4525 if( pPage->pBt->autoVacuum ){
4526 /* The cell may contain a pointer to an overflow page. If so, write
4527 ** the entry for the overflow page into the pointer map.
4528 */
4529 CellInfo info;
4530 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4531 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
4532 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
4533 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
4534 rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
4535 if( rc!=SQLITE_OK ) return rc;
4536 }
4537 }
4538#endif
4539 }
4540
4541 return SQLITE_OK;
4542}
4543
4544/*
4545** Add a list of cells to a page. The page should be initially empty.
4546** The cells are guaranteed to fit on the page.
4547*/
4548static void assemblePage(
4549 MemPage *pPage, /* The page to be assemblied */
4550 int nCell, /* The number of cells to add to this page */
4551 u8 **apCell, /* Pointers to cell bodies */
4552 int *aSize /* Sizes of the cells */
4553){
4554 int i; /* Loop counter */
4555 int totalSize; /* Total size of all cells */
4556 int hdr; /* Index of page header */
4557 int cellptr; /* Address of next cell pointer */
4558 int cellbody; /* Address of next cell body */
4559 u8 *data; /* Data for the page */
4560
4561 assert( pPage->nOverflow==0 );
4562 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4563 totalSize = 0;
4564 for(i=0; i<nCell; i++){
4565 totalSize += aSize[i];
4566 }
4567 assert( totalSize+2*nCell<=pPage->nFree );
4568 assert( pPage->nCell==0 );
4569 cellptr = pPage->cellOffset;
4570 data = pPage->aData;
4571 hdr = pPage->hdrOffset;
4572 put2byte(&data[hdr+3], nCell);
4573 if( nCell ){
4574 cellbody = allocateSpace(pPage, totalSize);
4575 assert( cellbody>0 );
4576 assert( pPage->nFree >= 2*nCell );
4577 pPage->nFree -= 2*nCell;
4578 for(i=0; i<nCell; i++){
4579 put2byte(&data[cellptr], cellbody);
4580 memcpy(&data[cellbody], apCell[i], aSize[i]);
4581 cellptr += 2;
4582 cellbody += aSize[i];
4583 }
4584 assert( cellbody==pPage->pBt->usableSize );
4585 }
4586 pPage->nCell = nCell;
4587}
4588
4589/*
4590** The following parameters determine how many adjacent pages get involved
4591** in a balancing operation. NN is the number of neighbors on either side
4592** of the page that participate in the balancing operation. NB is the
4593** total number of pages that participate, including the target page and
4594** NN neighbors on either side.
4595**
4596** The minimum value of NN is 1 (of course). Increasing NN above 1
4597** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
4598** in exchange for a larger degradation in INSERT and UPDATE performance.
4599** The value of NN appears to give the best results overall.
4600*/
4601#define NN 1 /* Number of neighbors on either side of pPage */
4602#define NB (NN*2+1) /* Total pages involved in the balance */
4603
4604/* Forward reference */
4605static int balance(MemPage*, int);
4606
4607#ifndef SQLITE_OMIT_QUICKBALANCE
4608/*
4609** This version of balance() handles the common special case where
4610** a new entry is being inserted on the extreme right-end of the
4611** tree, in other words, when the new entry will become the largest
4612** entry in the tree.
4613**
4614** Instead of trying balance the 3 right-most leaf pages, just add
4615** a new page to the right-hand side and put the one new entry in
4616** that page. This leaves the right side of the tree somewhat
4617** unbalanced. But odds are that we will be inserting new entries
4618** at the end soon afterwards so the nearly empty page will quickly
4619** fill up. On average.
4620**
4621** pPage is the leaf page which is the right-most page in the tree.
4622** pParent is its parent. pPage must have a single overflow entry
4623** which is also the right-most entry on the page.
4624*/
4625static int balance_quick(MemPage *pPage, MemPage *pParent){
4626 int rc;
4627 MemPage *pNew;
4628 Pgno pgnoNew;
4629 u8 *pCell;
4630 int szCell;
4631 CellInfo info;
4632 BtShared *pBt = pPage->pBt;
4633 int parentIdx = pParent->nCell; /* pParent new divider cell index */
4634 int parentSize; /* Size of new divider cell */
4635 u8 parentCell[64]; /* Space for the new divider cell */
4636
4637 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4638
4639 /* Allocate a new page. Insert the overflow cell from pPage
4640 ** into it. Then remove the overflow cell from pPage.
4641 */
4642 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
4643 if( rc!=SQLITE_OK ){
4644 return rc;
4645 }
4646 pCell = pPage->aOvfl[0].pCell;
4647 szCell = cellSizePtr(pPage, pCell);
4648 zeroPage(pNew, pPage->aData[0]);
4649 assemblePage(pNew, 1, &pCell, &szCell);
4650 pPage->nOverflow = 0;
4651
4652 /* Set the parent of the newly allocated page to pParent. */
4653 pNew->pParent = pParent;
4654 sqlite3PagerRef(pParent->pDbPage);
4655
4656 /* pPage is currently the right-child of pParent. Change this
4657 ** so that the right-child is the new page allocated above and
4658 ** pPage is the next-to-right child.
4659 */
4660 assert( pPage->nCell>0 );
4661 pCell = findCell(pPage, pPage->nCell-1);
4662 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4663 rc = fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
4664 if( rc!=SQLITE_OK ){
4665 return rc;
4666 }
4667 assert( parentSize<64 );
4668 rc = insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
4669 if( rc!=SQLITE_OK ){
4670 return rc;
4671 }
4672 put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
4673 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
4674
4675#ifndef SQLITE_OMIT_AUTOVACUUM
4676 /* If this is an auto-vacuum database, update the pointer map
4677 ** with entries for the new page, and any pointer from the
4678 ** cell on the page to an overflow page.
4679 */
4680 if( pBt->autoVacuum ){
4681 rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
4682 if( rc==SQLITE_OK ){
4683 rc = ptrmapPutOvfl(pNew, 0);
4684 }
4685 if( rc!=SQLITE_OK ){
4686 releasePage(pNew);
4687 return rc;
4688 }
4689 }
4690#endif
4691
4692 /* Release the reference to the new page and balance the parent page,
4693 ** in case the divider cell inserted caused it to become overfull.
4694 */
4695 releasePage(pNew);
4696 return balance(pParent, 0);
4697}
4698#endif /* SQLITE_OMIT_QUICKBALANCE */
4699
4700/*
4701** This routine redistributes Cells on pPage and up to NN*2 siblings
4702** of pPage so that all pages have about the same amount of free space.
4703** Usually NN siblings on either side of pPage is used in the balancing,
4704** though more siblings might come from one side if pPage is the first
4705** or last child of its parent. If pPage has fewer than 2*NN siblings
4706** (something which can only happen if pPage is the root page or a
4707** child of root) then all available siblings participate in the balancing.
4708**
4709** The number of siblings of pPage might be increased or decreased by one or
4710** two in an effort to keep pages nearly full but not over full. The root page
4711** is special and is allowed to be nearly empty. If pPage is
4712** the root page, then the depth of the tree might be increased
4713** or decreased by one, as necessary, to keep the root page from being
4714** overfull or completely empty.
4715**
4716** Note that when this routine is called, some of the Cells on pPage
4717** might not actually be stored in pPage->aData[]. This can happen
4718** if the page is overfull. Part of the job of this routine is to
4719** make sure all Cells for pPage once again fit in pPage->aData[].
4720**
4721** In the course of balancing the siblings of pPage, the parent of pPage
4722** might become overfull or underfull. If that happens, then this routine
4723** is called recursively on the parent.
4724**
4725** If this routine fails for any reason, it might leave the database
4726** in a corrupted state. So if this routine fails, the database should
4727** be rolled back.
4728*/
4729static int balance_nonroot(MemPage *pPage){
4730 MemPage *pParent; /* The parent of pPage */
4731 BtShared *pBt; /* The whole database */
4732 int nCell = 0; /* Number of cells in apCell[] */
4733 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
4734 int nOld; /* Number of pages in apOld[] */
4735 int nNew; /* Number of pages in apNew[] */
4736 int nDiv; /* Number of cells in apDiv[] */
4737 int i, j, k; /* Loop counters */
4738 int idx; /* Index of pPage in pParent->aCell[] */
4739 int nxDiv; /* Next divider slot in pParent->aCell[] */
4740 int rc; /* The return code */
4741 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
4742 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
4743 int usableSpace; /* Bytes in pPage beyond the header */
4744 int pageFlags; /* Value of pPage->aData[0] */
4745 int subtotal; /* Subtotal of bytes in cells on one page */
4746 int iSpace = 0; /* First unused byte of aSpace[] */
4747 MemPage *apOld[NB]; /* pPage and up to two siblings */
4748 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
4749 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
4750 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
4751 Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */
4752 u8 *apDiv[NB]; /* Divider cells in pParent */
4753 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
4754 int szNew[NB+2]; /* Combined size of cells place on i-th page */
4755 u8 **apCell = 0; /* All cells begin balanced */
4756 int *szCell; /* Local size of all cells in apCell[] */
4757 u8 *aCopy[NB]; /* Space for holding data of apCopy[] */
4758 u8 *aSpace; /* Space to hold copies of dividers cells */
4759#ifndef SQLITE_OMIT_AUTOVACUUM
4760 u8 *aFrom = 0;
4761#endif
4762
4763 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4764
4765 /*
4766 ** Find the parent page.
4767 */
4768 assert( pPage->isInit );
4769 assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
4770 pBt = pPage->pBt;
4771 pParent = pPage->pParent;
4772 assert( pParent );
4773 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
4774 return rc;
4775 }
4776 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
4777
4778#ifndef SQLITE_OMIT_QUICKBALANCE
4779 /*
4780 ** A special case: If a new entry has just been inserted into a
4781 ** table (that is, a btree with integer keys and all data at the leaves)
4782 ** and the new entry is the right-most entry in the tree (it has the
4783 ** largest key) then use the special balance_quick() routine for
4784 ** balancing. balance_quick() is much faster and results in a tighter
4785 ** packing of data in the common case.
4786 */
4787 if( pPage->leaf &&
4788 pPage->intKey &&
4789 pPage->leafData &&
4790 pPage->nOverflow==1 &&
4791 pPage->aOvfl[0].idx==pPage->nCell &&
4792 pPage->pParent->pgno!=1 &&
4793 get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
4794 ){
4795 /*
4796 ** TODO: Check the siblings to the left of pPage. It may be that
4797 ** they are not full and no new page is required.
4798 */
4799 return balance_quick(pPage, pParent);
4800 }
4801#endif
4802
4803 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
4804 return rc;
4805 }
4806
4807 /*
4808 ** Find the cell in the parent page whose left child points back
4809 ** to pPage. The "idx" variable is the index of that cell. If pPage
4810 ** is the rightmost child of pParent then set idx to pParent->nCell
4811 */
4812 if( pParent->idxShift ){
4813 Pgno pgno;
4814 pgno = pPage->pgno;
4815 assert( pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
4816 for(idx=0; idx<pParent->nCell; idx++){
4817 if( get4byte(findCell(pParent, idx))==pgno ){
4818 break;
4819 }
4820 }
4821 assert( idx<pParent->nCell
4822 || get4byte(&pParent->aData[pParent->hdrOffset+8])==pgno );
4823 }else{
4824 idx = pPage->idxParent;
4825 }
4826
4827 /*
4828 ** Initialize variables so that it will be safe to jump
4829 ** directly to balance_cleanup at any moment.
4830 */
4831 nOld = nNew = 0;
4832 sqlite3PagerRef(pParent->pDbPage);
4833
4834 /*
4835 ** Find sibling pages to pPage and the cells in pParent that divide
4836 ** the siblings. An attempt is made to find NN siblings on either
4837 ** side of pPage. More siblings are taken from one side, however, if
4838 ** pPage there are fewer than NN siblings on the other side. If pParent
4839 ** has NB or fewer children then all children of pParent are taken.
4840 */
4841 nxDiv = idx - NN;
4842 if( nxDiv + NB > pParent->nCell ){
4843 nxDiv = pParent->nCell - NB + 1;
4844 }
4845 if( nxDiv<0 ){
4846 nxDiv = 0;
4847 }
4848 nDiv = 0;
4849 for(i=0, k=nxDiv; i<NB; i++, k++){
4850 if( k<pParent->nCell ){
4851 apDiv[i] = findCell(pParent, k);
4852 nDiv++;
4853 assert( !pParent->leaf );
4854 pgnoOld[i] = get4byte(apDiv[i]);
4855 }else if( k==pParent->nCell ){
4856 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
4857 }else{
4858 break;
4859 }
4860 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
4861 if( rc ) goto balance_cleanup;
4862 apOld[i]->idxParent = k;
4863 apCopy[i] = 0;
4864 assert( i==nOld );
4865 nOld++;
4866 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
4867 }
4868
4869 /* Make nMaxCells a multiple of 2 in order to preserve 8-byte
4870 ** alignment */
4871 nMaxCells = (nMaxCells + 1)&~1;
4872
4873 /*
4874 ** Allocate space for memory structures
4875 */
4876 apCell = sqlite3_malloc(
4877 nMaxCells*sizeof(u8*) /* apCell */
4878 + nMaxCells*sizeof(int) /* szCell */
4879 + ROUND8(sizeof(MemPage))*NB /* aCopy */
4880 + pBt->pageSize*(5+NB) /* aSpace */
4881 + (ISAUTOVACUUM ? nMaxCells : 0) /* aFrom */
4882 );
4883 if( apCell==0 ){
4884 rc = SQLITE_NOMEM;
4885 goto balance_cleanup;
4886 }
4887 szCell = (int*)&apCell[nMaxCells];
4888 aCopy[0] = (u8*)&szCell[nMaxCells];
4889 assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4890 for(i=1; i<NB; i++){
4891 aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
4892 assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4893 }
4894 aSpace = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
4895 assert( ((aSpace - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4896#ifndef SQLITE_OMIT_AUTOVACUUM
4897 if( pBt->autoVacuum ){
4898 aFrom = &aSpace[5*pBt->pageSize];
4899 }
4900#endif
4901
4902 /*
4903 ** Make copies of the content of pPage and its siblings into aOld[].
4904 ** The rest of this function will use data from the copies rather
4905 ** that the original pages since the original pages will be in the
4906 ** process of being overwritten.
4907 */
4908 for(i=0; i<nOld; i++){
4909 MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
4910 memcpy(p, apOld[i], sizeof(MemPage));
4911 p->aData = (void*)&p[1];
4912 memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
4913 }
4914
4915 /*
4916 ** Load pointers to all cells on sibling pages and the divider cells
4917 ** into the local apCell[] array. Make copies of the divider cells
4918 ** into space obtained form aSpace[] and remove the the divider Cells
4919 ** from pParent.
4920 **
4921 ** If the siblings are on leaf pages, then the child pointers of the
4922 ** divider cells are stripped from the cells before they are copied
4923 ** into aSpace[]. In this way, all cells in apCell[] are without
4924 ** child pointers. If siblings are not leaves, then all cell in
4925 ** apCell[] include child pointers. Either way, all cells in apCell[]
4926 ** are alike.
4927 **
4928 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
4929 ** leafData: 1 if pPage holds key+data and pParent holds only keys.
4930 */
4931 nCell = 0;
4932 leafCorrection = pPage->leaf*4;
4933 leafData = pPage->leafData && pPage->leaf;
4934 for(i=0; i<nOld; i++){
4935 MemPage *pOld = apCopy[i];
4936 int limit = pOld->nCell+pOld->nOverflow;
4937 for(j=0; j<limit; j++){
4938 assert( nCell<nMaxCells );
4939 apCell[nCell] = findOverflowCell(pOld, j);
4940 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
4941#ifndef SQLITE_OMIT_AUTOVACUUM
4942 if( pBt->autoVacuum ){
4943 int a;
4944 aFrom[nCell] = i;
4945 for(a=0; a<pOld->nOverflow; a++){
4946 if( pOld->aOvfl[a].pCell==apCell[nCell] ){
4947 aFrom[nCell] = 0xFF;
4948 break;
4949 }
4950 }
4951 }
4952#endif
4953 nCell++;
4954 }
4955 if( i<nOld-1 ){
4956 int sz = cellSizePtr(pParent, apDiv[i]);
4957 if( leafData ){
4958 /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
4959 ** are duplicates of keys on the child pages. We need to remove
4960 ** the divider cells from pParent, but the dividers cells are not
4961 ** added to apCell[] because they are duplicates of child cells.
4962 */
4963 dropCell(pParent, nxDiv, sz);
4964 }else{
4965 u8 *pTemp;
4966 assert( nCell<nMaxCells );
4967 szCell[nCell] = sz;
4968 pTemp = &aSpace[iSpace];
4969 iSpace += sz;
4970 assert( iSpace<=pBt->pageSize*5 );
4971 memcpy(pTemp, apDiv[i], sz);
4972 apCell[nCell] = pTemp+leafCorrection;
4973#ifndef SQLITE_OMIT_AUTOVACUUM
4974 if( pBt->autoVacuum ){
4975 aFrom[nCell] = 0xFF;
4976 }
4977#endif
4978 dropCell(pParent, nxDiv, sz);
4979 szCell[nCell] -= leafCorrection;
4980 assert( get4byte(pTemp)==pgnoOld[i] );
4981 if( !pOld->leaf ){
4982 assert( leafCorrection==0 );
4983 /* The right pointer of the child page pOld becomes the left
4984 ** pointer of the divider cell */
4985 memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
4986 }else{
4987 assert( leafCorrection==4 );
4988 if( szCell[nCell]<4 ){
4989 /* Do not allow any cells smaller than 4 bytes. */
4990 szCell[nCell] = 4;
4991 }
4992 }
4993 nCell++;
4994 }
4995 }
4996 }
4997
4998 /*
4999 ** Figure out the number of pages needed to hold all nCell cells.
5000 ** Store this number in "k". Also compute szNew[] which is the total
5001 ** size of all cells on the i-th page and cntNew[] which is the index
5002 ** in apCell[] of the cell that divides page i from page i+1.
5003 ** cntNew[k] should equal nCell.
5004 **
5005 ** Values computed by this block:
5006 **
5007 ** k: The total number of sibling pages
5008 ** szNew[i]: Spaced used on the i-th sibling page.
5009 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
5010 ** the right of the i-th sibling page.
5011 ** usableSpace: Number of bytes of space available on each sibling.
5012 **
5013 */
5014 usableSpace = pBt->usableSize - 12 + leafCorrection;
5015 for(subtotal=k=i=0; i<nCell; i++){
5016 assert( i<nMaxCells );
5017 subtotal += szCell[i] + 2;
5018 if( subtotal > usableSpace ){
5019 szNew[k] = subtotal - szCell[i];
5020 cntNew[k] = i;
5021 if( leafData ){ i--; }
5022 subtotal = 0;
5023 k++;
5024 }
5025 }
5026 szNew[k] = subtotal;
5027 cntNew[k] = nCell;
5028 k++;
5029
5030 /*
5031 ** The packing computed by the previous block is biased toward the siblings
5032 ** on the left side. The left siblings are always nearly full, while the
5033 ** right-most sibling might be nearly empty. This block of code attempts
5034 ** to adjust the packing of siblings to get a better balance.
5035 **
5036 ** This adjustment is more than an optimization. The packing above might
5037 ** be so out of balance as to be illegal. For example, the right-most
5038 ** sibling might be completely empty. This adjustment is not optional.
5039 */
5040 for(i=k-1; i>0; i--){
5041 int szRight = szNew[i]; /* Size of sibling on the right */
5042 int szLeft = szNew[i-1]; /* Size of sibling on the left */
5043 int r; /* Index of right-most cell in left sibling */
5044 int d; /* Index of first cell to the left of right sibling */
5045
5046 r = cntNew[i-1] - 1;
5047 d = r + 1 - leafData;
5048 assert( d<nMaxCells );
5049 assert( r<nMaxCells );
5050 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
5051 szRight += szCell[d] + 2;
5052 szLeft -= szCell[r] + 2;
5053 cntNew[i-1]--;
5054 r = cntNew[i-1] - 1;
5055 d = r + 1 - leafData;
5056 }
5057 szNew[i] = szRight;
5058 szNew[i-1] = szLeft;
5059 }
5060
5061 /* Either we found one or more cells (cntnew[0])>0) or we are the
5062 ** a virtual root page. A virtual root page is when the real root
5063 ** page is page 1 and we are the only child of that page.
5064 */
5065 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
5066
5067 /*
5068 ** Allocate k new pages. Reuse old pages where possible.
5069 */
5070 assert( pPage->pgno>1 );
5071 pageFlags = pPage->aData[0];
5072 for(i=0; i<k; i++){
5073 MemPage *pNew;
5074 if( i<nOld ){
5075 pNew = apNew[i] = apOld[i];
5076 pgnoNew[i] = pgnoOld[i];
5077 apOld[i] = 0;
5078 rc = sqlite3PagerWrite(pNew->pDbPage);
5079 nNew++;
5080 if( rc ) goto balance_cleanup;
5081 }else{
5082 assert( i>0 );
5083 rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
5084 if( rc ) goto balance_cleanup;
5085 apNew[i] = pNew;
5086 nNew++;
5087 }
5088 zeroPage(pNew, pageFlags);
5089 }
5090
5091 /* Free any old pages that were not reused as new pages.
5092 */
5093 while( i<nOld ){
5094 rc = freePage(apOld[i]);
5095 if( rc ) goto balance_cleanup;
5096 releasePage(apOld[i]);
5097 apOld[i] = 0;
5098 i++;
5099 }
5100
5101 /*
5102 ** Put the new pages in accending order. This helps to
5103 ** keep entries in the disk file in order so that a scan
5104 ** of the table is a linear scan through the file. That
5105 ** in turn helps the operating system to deliver pages
5106 ** from the disk more rapidly.
5107 **
5108 ** An O(n^2) insertion sort algorithm is used, but since
5109 ** n is never more than NB (a small constant), that should
5110 ** not be a problem.
5111 **
5112 ** When NB==3, this one optimization makes the database
5113 ** about 25% faster for large insertions and deletions.
5114 */
5115 for(i=0; i<k-1; i++){
5116 int minV = pgnoNew[i];
5117 int minI = i;
5118 for(j=i+1; j<k; j++){
5119 if( pgnoNew[j]<(unsigned)minV ){
5120 minI = j;
5121 minV = pgnoNew[j];
5122 }
5123 }
5124 if( minI>i ){
5125 int t;
5126 MemPage *pT;
5127 t = pgnoNew[i];
5128 pT = apNew[i];
5129 pgnoNew[i] = pgnoNew[minI];
5130 apNew[i] = apNew[minI];
5131 pgnoNew[minI] = t;
5132 apNew[minI] = pT;
5133 }
5134 }
5135 TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
5136 pgnoOld[0],
5137 nOld>=2 ? pgnoOld[1] : 0,
5138 nOld>=3 ? pgnoOld[2] : 0,
5139 pgnoNew[0], szNew[0],
5140 nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
5141 nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
5142 nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
5143 nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
5144
5145 /*
5146 ** Evenly distribute the data in apCell[] across the new pages.
5147 ** Insert divider cells into pParent as necessary.
5148 */
5149 j = 0;
5150 for(i=0; i<nNew; i++){
5151 /* Assemble the new sibling page. */
5152 MemPage *pNew = apNew[i];
5153 assert( j<nMaxCells );
5154 assert( pNew->pgno==pgnoNew[i] );
5155 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
5156 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
5157 assert( pNew->nOverflow==0 );
5158
5159#ifndef SQLITE_OMIT_AUTOVACUUM
5160 /* If this is an auto-vacuum database, update the pointer map entries
5161 ** that point to the siblings that were rearranged. These can be: left
5162 ** children of cells, the right-child of the page, or overflow pages
5163 ** pointed to by cells.
5164 */
5165 if( pBt->autoVacuum ){
5166 for(k=j; k<cntNew[i]; k++){
5167 assert( k<nMaxCells );
5168 if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
5169 rc = ptrmapPutOvfl(pNew, k-j);
5170 if( rc!=SQLITE_OK ){
5171 goto balance_cleanup;
5172 }
5173 }
5174 }
5175 }
5176#endif
5177
5178 j = cntNew[i];
5179
5180 /* If the sibling page assembled above was not the right-most sibling,
5181 ** insert a divider cell into the parent page.
5182 */
5183 if( i<nNew-1 && j<nCell ){
5184 u8 *pCell;
5185 u8 *pTemp;
5186 int sz;
5187
5188 assert( j<nMaxCells );
5189 pCell = apCell[j];
5190 sz = szCell[j] + leafCorrection;
5191 if( !pNew->leaf ){
5192 memcpy(&pNew->aData[8], pCell, 4);
5193 pTemp = 0;
5194 }else if( leafData ){
5195 /* If the tree is a leaf-data tree, and the siblings are leaves,
5196 ** then there is no divider cell in apCell[]. Instead, the divider
5197 ** cell consists of the integer key for the right-most cell of
5198 ** the sibling-page assembled above only.
5199 */
5200 CellInfo info;
5201 j--;
5202 sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
5203 pCell = &aSpace[iSpace];
5204 fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
5205 iSpace += sz;
5206 assert( iSpace<=pBt->pageSize*5 );
5207 pTemp = 0;
5208 }else{
5209 pCell -= 4;
5210 pTemp = &aSpace[iSpace];
5211 iSpace += sz;
5212 assert( iSpace<=pBt->pageSize*5 );
5213 /* Obscure case for non-leaf-data trees: If the cell at pCell was
5214 ** previously stored on a leaf node, and it's reported size was 4
5215 ** bytes, then it may actually be smaller than this
5216 ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
5217 ** any cell). But it's important to pass the correct size to
5218 ** insertCell(), so reparse the cell now.
5219 **
5220 ** Note that this can never happen in an SQLite data file, as all
5221 ** cells are at least 4 bytes. It only happens in b-trees used
5222 ** to evaluate "IN (SELECT ...)" and similar clauses.
5223 */
5224 if( szCell[j]==4 ){
5225 assert(leafCorrection==4);
5226 sz = cellSizePtr(pParent, pCell);
5227 }
5228 }
5229 rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
5230 if( rc!=SQLITE_OK ) goto balance_cleanup;
5231 put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
5232#ifndef SQLITE_OMIT_AUTOVACUUM
5233 /* If this is an auto-vacuum database, and not a leaf-data tree,
5234 ** then update the pointer map with an entry for the overflow page
5235 ** that the cell just inserted points to (if any).
5236 */
5237 if( pBt->autoVacuum && !leafData ){
5238 rc = ptrmapPutOvfl(pParent, nxDiv);
5239 if( rc!=SQLITE_OK ){
5240 goto balance_cleanup;
5241 }
5242 }
5243#endif
5244 j++;
5245 nxDiv++;
5246 }
5247 }
5248 assert( j==nCell );
5249 assert( nOld>0 );
5250 assert( nNew>0 );
5251 if( (pageFlags & PTF_LEAF)==0 ){
5252 memcpy(&apNew[nNew-1]->aData[8], &apCopy[nOld-1]->aData[8], 4);
5253 }
5254 if( nxDiv==pParent->nCell+pParent->nOverflow ){
5255 /* Right-most sibling is the right-most child of pParent */
5256 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
5257 }else{
5258 /* Right-most sibling is the left child of the first entry in pParent
5259 ** past the right-most divider entry */
5260 put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
5261 }
5262
5263 /*
5264 ** Reparent children of all cells.
5265 */
5266 for(i=0; i<nNew; i++){
5267 rc = reparentChildPages(apNew[i]);
5268 if( rc!=SQLITE_OK ) goto balance_cleanup;
5269 }
5270 rc = reparentChildPages(pParent);
5271 if( rc!=SQLITE_OK ) goto balance_cleanup;
5272
5273 /*
5274 ** Balance the parent page. Note that the current page (pPage) might
5275 ** have been added to the freelist so it might no longer be initialized.
5276 ** But the parent page will always be initialized.
5277 */
5278 assert( pParent->isInit );
5279 rc = balance(pParent, 0);
5280
5281 /*
5282 ** Cleanup before returning.
5283 */
5284balance_cleanup:
5285 sqlite3_free(apCell);
5286 for(i=0; i<nOld; i++){
5287 releasePage(apOld[i]);
5288 }
5289 for(i=0; i<nNew; i++){
5290 releasePage(apNew[i]);
5291 }
5292 releasePage(pParent);
5293 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
5294 pPage->pgno, nOld, nNew, nCell));
5295 return rc;
5296}
5297
5298/*
5299** This routine is called for the root page of a btree when the root
5300** page contains no cells. This is an opportunity to make the tree
5301** shallower by one level.
5302*/
5303static int balance_shallower(MemPage *pPage){
5304 MemPage *pChild; /* The only child page of pPage */
5305 Pgno pgnoChild; /* Page number for pChild */
5306 int rc = SQLITE_OK; /* Return code from subprocedures */
5307 BtShared *pBt; /* The main BTree structure */
5308 int mxCellPerPage; /* Maximum number of cells per page */
5309 u8 **apCell; /* All cells from pages being balanced */
5310 int *szCell; /* Local size of all cells */
5311
5312 assert( pPage->pParent==0 );
5313 assert( pPage->nCell==0 );
5314 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5315 pBt = pPage->pBt;
5316 mxCellPerPage = MX_CELL(pBt);
5317 apCell = sqlite3_malloc( mxCellPerPage*(sizeof(u8*)+sizeof(int)) );
5318 if( apCell==0 ) return SQLITE_NOMEM;
5319 szCell = (int*)&apCell[mxCellPerPage];
5320 if( pPage->leaf ){
5321 /* The table is completely empty */
5322 TRACE(("BALANCE: empty table %d\n", pPage->pgno));
5323 }else{
5324 /* The root page is empty but has one child. Transfer the
5325 ** information from that one child into the root page if it
5326 ** will fit. This reduces the depth of the tree by one.
5327 **
5328 ** If the root page is page 1, it has less space available than
5329 ** its child (due to the 100 byte header that occurs at the beginning
5330 ** of the database fle), so it might not be able to hold all of the
5331 ** information currently contained in the child. If this is the
5332 ** case, then do not do the transfer. Leave page 1 empty except
5333 ** for the right-pointer to the child page. The child page becomes
5334 ** the virtual root of the tree.
5335 */
5336 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5337 assert( pgnoChild>0 );
5338 assert( pgnoChild<=sqlite3PagerPagecount(pPage->pBt->pPager) );
5339 rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
5340 if( rc ) goto end_shallow_balance;
5341 if( pPage->pgno==1 ){
5342 rc = sqlite3BtreeInitPage(pChild, pPage);
5343 if( rc ) goto end_shallow_balance;
5344 assert( pChild->nOverflow==0 );
5345 if( pChild->nFree>=100 ){
5346 /* The child information will fit on the root page, so do the
5347 ** copy */
5348 int i;
5349 zeroPage(pPage, pChild->aData[0]);
5350 for(i=0; i<pChild->nCell; i++){
5351 apCell[i] = findCell(pChild,i);
5352 szCell[i] = cellSizePtr(pChild, apCell[i]);
5353 }
5354 assemblePage(pPage, pChild->nCell, apCell, szCell);
5355 /* Copy the right-pointer of the child to the parent. */
5356 put4byte(&pPage->aData[pPage->hdrOffset+8],
5357 get4byte(&pChild->aData[pChild->hdrOffset+8]));
5358 freePage(pChild);
5359 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
5360 }else{
5361 /* The child has more information that will fit on the root.
5362 ** The tree is already balanced. Do nothing. */
5363 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
5364 }
5365 }else{
5366 memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
5367 pPage->isInit = 0;
5368 pPage->pParent = 0;
5369 rc = sqlite3BtreeInitPage(pPage, 0);
5370 assert( rc==SQLITE_OK );
5371 freePage(pChild);
5372 TRACE(("BALANCE: transfer child %d into root %d\n",
5373 pChild->pgno, pPage->pgno));
5374 }
5375 rc = reparentChildPages(pPage);
5376 assert( pPage->nOverflow==0 );
5377#ifndef SQLITE_OMIT_AUTOVACUUM
5378 if( pBt->autoVacuum ){
5379 int i;
5380 for(i=0; i<pPage->nCell; i++){
5381 rc = ptrmapPutOvfl(pPage, i);
5382 if( rc!=SQLITE_OK ){
5383 goto end_shallow_balance;
5384 }
5385 }
5386 }
5387#endif
5388 releasePage(pChild);
5389 }
5390end_shallow_balance:
5391 sqlite3_free(apCell);
5392 return rc;
5393}
5394
5395
5396/*
5397** The root page is overfull
5398**
5399** When this happens, Create a new child page and copy the
5400** contents of the root into the child. Then make the root
5401** page an empty page with rightChild pointing to the new
5402** child. Finally, call balance_internal() on the new child
5403** to cause it to split.
5404*/
5405static int balance_deeper(MemPage *pPage){
5406 int rc; /* Return value from subprocedures */
5407 MemPage *pChild; /* Pointer to a new child page */
5408 Pgno pgnoChild; /* Page number of the new child page */
5409 BtShared *pBt; /* The BTree */
5410 int usableSize; /* Total usable size of a page */
5411 u8 *data; /* Content of the parent page */
5412 u8 *cdata; /* Content of the child page */
5413 int hdr; /* Offset to page header in parent */
5414 int brk; /* Offset to content of first cell in parent */
5415
5416 assert( pPage->pParent==0 );
5417 assert( pPage->nOverflow>0 );
5418 pBt = pPage->pBt;
5419 assert( sqlite3_mutex_held(pBt->mutex) );
5420 rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
5421 if( rc ) return rc;
5422 assert( sqlite3PagerIswriteable(pChild->pDbPage) );
5423 usableSize = pBt->usableSize;
5424 data = pPage->aData;
5425 hdr = pPage->hdrOffset;
5426 brk = get2byte(&data[hdr+5]);
5427 cdata = pChild->aData;
5428 memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
5429 memcpy(&cdata[brk], &data[brk], usableSize-brk);
5430 assert( pChild->isInit==0 );
5431 rc = sqlite3BtreeInitPage(pChild, pPage);
5432 if( rc ) goto balancedeeper_out;
5433 memcpy(pChild->aOvfl, pPage->aOvfl, pPage->nOverflow*sizeof(pPage->aOvfl[0]));
5434 pChild->nOverflow = pPage->nOverflow;
5435 if( pChild->nOverflow ){
5436 pChild->nFree = 0;
5437 }
5438 assert( pChild->nCell==pPage->nCell );
5439 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
5440 put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
5441 TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
5442#ifndef SQLITE_OMIT_AUTOVACUUM
5443 if( pBt->autoVacuum ){
5444 int i;
5445 rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
5446 if( rc ) goto balancedeeper_out;
5447 for(i=0; i<pChild->nCell; i++){
5448 rc = ptrmapPutOvfl(pChild, i);
5449 if( rc!=SQLITE_OK ){
5450 return rc;
5451 }
5452 }
5453 }
5454#endif
5455 rc = balance_nonroot(pChild);
5456
5457balancedeeper_out:
5458 releasePage(pChild);
5459 return rc;
5460}
5461
5462/*
5463** Decide if the page pPage needs to be balanced. If balancing is
5464** required, call the appropriate balancing routine.
5465*/
5466static int balance(MemPage *pPage, int insert){
5467 int rc = SQLITE_OK;
5468 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5469 if( pPage->pParent==0 ){
5470 rc = sqlite3PagerWrite(pPage->pDbPage);
5471 if( rc==SQLITE_OK && pPage->nOverflow>0 ){
5472 rc = balance_deeper(pPage);
5473 }
5474 if( rc==SQLITE_OK && pPage->nCell==0 ){
5475 rc = balance_shallower(pPage);
5476 }
5477 }else{
5478 if( pPage->nOverflow>0 ||
5479 (!insert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
5480 rc = balance_nonroot(pPage);
5481 }
5482 }
5483 return rc;
5484}
5485
5486/*
5487** This routine checks all cursors that point to table pgnoRoot.
5488** If any of those cursors were opened with wrFlag==0 in a different
5489** database connection (a database connection that shares the pager
5490** cache with the current connection) and that other connection
5491** is not in the ReadUncommmitted state, then this routine returns
5492** SQLITE_LOCKED.
5493**
5494** In addition to checking for read-locks (where a read-lock
5495** means a cursor opened with wrFlag==0) this routine also moves
5496** all write cursors so that they are pointing to the
5497** first Cell on the root page. This is necessary because an insert
5498** or delete might change the number of cells on a page or delete
5499** a page entirely and we do not want to leave any cursors
5500** pointing to non-existant pages or cells.
5501*/
5502static int checkReadLocks(Btree *pBtree, Pgno pgnoRoot, BtCursor *pExclude){
5503 BtCursor *p;
5504 BtShared *pBt = pBtree->pBt;
5505 sqlite3 *db = pBtree->pSqlite;
5506 assert( sqlite3BtreeHoldsMutex(pBtree) );
5507 for(p=pBt->pCursor; p; p=p->pNext){
5508 if( p==pExclude ) continue;
5509 if( p->eState!=CURSOR_VALID ) continue;
5510 if( p->pgnoRoot!=pgnoRoot ) continue;
5511 if( p->wrFlag==0 ){
5512 sqlite3 *dbOther = p->pBtree->pSqlite;
5513 if( dbOther==0 ||
5514 (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
5515 return SQLITE_LOCKED;
5516 }
5517 }else if( p->pPage->pgno!=p->pgnoRoot ){
5518 moveToRoot(p);
5519 }
5520 }
5521 return SQLITE_OK;
5522}
5523
5524/*
5525** Insert a new record into the BTree. The key is given by (pKey,nKey)
5526** and the data is given by (pData,nData). The cursor is used only to
5527** define what table the record should be inserted into. The cursor
5528** is left pointing at a random location.
5529**
5530** For an INTKEY table, only the nKey value of the key is used. pKey is
5531** ignored. For a ZERODATA table, the pData and nData are both ignored.
5532*/
5533int sqlite3BtreeInsert(
5534 BtCursor *pCur, /* Insert data into the table of this cursor */
5535 const void *pKey, i64 nKey, /* The key of the new record */
5536 const void *pData, int nData, /* The data of the new record */
5537 int nZero, /* Number of extra 0 bytes to append to data */
5538 int appendBias /* True if this is likely an append */
5539){
5540 int rc;
5541 int loc;
5542 int szNew;
5543 MemPage *pPage;
5544 Btree *p = pCur->pBtree;
5545 BtShared *pBt = p->pBt;
5546 unsigned char *oldCell;
5547 unsigned char *newCell = 0;
5548
5549 assert( cursorHoldsMutex(pCur) );
5550 if( pBt->inTransaction!=TRANS_WRITE ){
5551 /* Must start a transaction before doing an insert */
5552 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5553 return rc;
5554 }
5555 assert( !pBt->readOnly );
5556 if( !pCur->wrFlag ){
5557 return SQLITE_PERM; /* Cursor not open for writing */
5558 }
5559 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur) ){
5560 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5561 }
5562 if( pCur->eState==CURSOR_FAULT ){
5563 return pCur->skip;
5564 }
5565
5566 /* Save the positions of any other cursors open on this table */
5567 clearCursorPosition(pCur);
5568 if(
5569 SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
5570 SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
5571 ){
5572 return rc;
5573 }
5574
5575 pPage = pCur->pPage;
5576 assert( pPage->intKey || nKey>=0 );
5577 assert( pPage->leaf || !pPage->leafData );
5578 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
5579 pCur->pgnoRoot, nKey, nData, pPage->pgno,
5580 loc==0 ? "overwrite" : "new entry"));
5581 assert( pPage->isInit );
5582 newCell = sqlite3_malloc( MX_CELL_SIZE(pBt) );
5583 if( newCell==0 ) return SQLITE_NOMEM;
5584 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
5585 if( rc ) goto end_insert;
5586 assert( szNew==cellSizePtr(pPage, newCell) );
5587 assert( szNew<=MX_CELL_SIZE(pBt) );
5588 if( loc==0 && CURSOR_VALID==pCur->eState ){
5589 int szOld;
5590 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
5591 rc = sqlite3PagerWrite(pPage->pDbPage);
5592 if( rc ){
5593 goto end_insert;
5594 }
5595 oldCell = findCell(pPage, pCur->idx);
5596 if( !pPage->leaf ){
5597 memcpy(newCell, oldCell, 4);
5598 }
5599 szOld = cellSizePtr(pPage, oldCell);
5600 rc = clearCell(pPage, oldCell);
5601 if( rc ) goto end_insert;
5602 dropCell(pPage, pCur->idx, szOld);
5603 }else if( loc<0 && pPage->nCell>0 ){
5604 assert( pPage->leaf );
5605 pCur->idx++;
5606 pCur->info.nSize = 0;
5607 }else{
5608 assert( pPage->leaf );
5609 }
5610 rc = insertCell(pPage, pCur->idx, newCell, szNew, 0, 0);
5611 if( rc!=SQLITE_OK ) goto end_insert;
5612 rc = balance(pPage, 1);
5613 /* sqlite3BtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
5614 /* fflush(stdout); */
5615 if( rc==SQLITE_OK ){
5616 moveToRoot(pCur);
5617 }
5618end_insert:
5619 sqlite3_free(newCell);
5620 return rc;
5621}
5622
5623/*
5624** Delete the entry that the cursor is pointing to. The cursor
5625** is left pointing at a random location.
5626*/
5627int sqlite3BtreeDelete(BtCursor *pCur){
5628 MemPage *pPage = pCur->pPage;
5629 unsigned char *pCell;
5630 int rc;
5631 Pgno pgnoChild = 0;
5632 Btree *p = pCur->pBtree;
5633 BtShared *pBt = p->pBt;
5634
5635 assert( cursorHoldsMutex(pCur) );
5636 assert( pPage->isInit );
5637 if( pBt->inTransaction!=TRANS_WRITE ){
5638 /* Must start a transaction before doing a delete */
5639 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5640 return rc;
5641 }
5642 assert( !pBt->readOnly );
5643 if( pCur->eState==CURSOR_FAULT ){
5644 return pCur->skip;
5645 }
5646 if( pCur->idx >= pPage->nCell ){
5647 return SQLITE_ERROR; /* The cursor is not pointing to anything */
5648 }
5649 if( !pCur->wrFlag ){
5650 return SQLITE_PERM; /* Did not open this cursor for writing */
5651 }
5652 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur) ){
5653 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5654 }
5655
5656 /* Restore the current cursor position (a no-op if the cursor is not in
5657 ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors
5658 ** open on the same table. Then call sqlite3PagerWrite() on the page
5659 ** that the entry will be deleted from.
5660 */
5661 if(
5662 (rc = restoreOrClearCursorPosition(pCur))!=0 ||
5663 (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
5664 (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
5665 ){
5666 return rc;
5667 }
5668
5669 /* Locate the cell within it's page and leave pCell pointing to the
5670 ** data. The clearCell() call frees any overflow pages associated with the
5671 ** cell. The cell itself is still intact.
5672 */
5673 pCell = findCell(pPage, pCur->idx);
5674 if( !pPage->leaf ){
5675 pgnoChild = get4byte(pCell);
5676 }
5677 rc = clearCell(pPage, pCell);
5678 if( rc ){
5679 return rc;
5680 }
5681
5682 if( !pPage->leaf ){
5683 /*
5684 ** The entry we are about to delete is not a leaf so if we do not
5685 ** do something we will leave a hole on an internal page.
5686 ** We have to fill the hole by moving in a cell from a leaf. The
5687 ** next Cell after the one to be deleted is guaranteed to exist and
5688 ** to be a leaf so we can use it.
5689 */
5690 BtCursor leafCur;
5691 unsigned char *pNext;
5692 int szNext; /* The compiler warning is wrong: szNext is always
5693 ** initialized before use. Adding an extra initialization
5694 ** to silence the compiler slows down the code. */
5695 int notUsed;
5696 unsigned char *tempCell = 0;
5697 assert( !pPage->leafData );
5698 sqlite3BtreeGetTempCursor(pCur, &leafCur);
5699 rc = sqlite3BtreeNext(&leafCur, &notUsed);
5700 if( rc==SQLITE_OK ){
5701 rc = sqlite3PagerWrite(leafCur.pPage->pDbPage);
5702 }
5703 if( rc==SQLITE_OK ){
5704 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
5705 pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno));
5706 dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
5707 pNext = findCell(leafCur.pPage, leafCur.idx);
5708 szNext = cellSizePtr(leafCur.pPage, pNext);
5709 assert( MX_CELL_SIZE(pBt)>=szNext+4 );
5710 tempCell = sqlite3_malloc( MX_CELL_SIZE(pBt) );
5711 if( tempCell==0 ){
5712 rc = SQLITE_NOMEM;
5713 }
5714 }
5715 if( rc==SQLITE_OK ){
5716 rc = insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell, 0);
5717 }
5718 if( rc==SQLITE_OK ){
5719 put4byte(findOverflowCell(pPage, pCur->idx), pgnoChild);
5720 rc = balance(pPage, 0);
5721 }
5722 if( rc==SQLITE_OK ){
5723 dropCell(leafCur.pPage, leafCur.idx, szNext);
5724 rc = balance(leafCur.pPage, 0);
5725 }
5726 sqlite3_free(tempCell);
5727 sqlite3BtreeReleaseTempCursor(&leafCur);
5728 }else{
5729 TRACE(("DELETE: table=%d delete from leaf %d\n",
5730 pCur->pgnoRoot, pPage->pgno));
5731 dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
5732 rc = balance(pPage, 0);
5733 }
5734 if( rc==SQLITE_OK ){
5735 moveToRoot(pCur);
5736 }
5737 return rc;
5738}
5739
5740/*
5741** Create a new BTree table. Write into *piTable the page
5742** number for the root page of the new table.
5743**
5744** The type of type is determined by the flags parameter. Only the
5745** following values of flags are currently in use. Other values for
5746** flags might not work:
5747**
5748** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
5749** BTREE_ZERODATA Used for SQL indices
5750*/
5751static int btreeCreateTable(Btree *p, int *piTable, int flags){
5752 BtShared *pBt = p->pBt;
5753 MemPage *pRoot;
5754 Pgno pgnoRoot;
5755 int rc;
5756
5757 assert( sqlite3BtreeHoldsMutex(p) );
5758 if( pBt->inTransaction!=TRANS_WRITE ){
5759 /* Must start a transaction first */
5760 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5761 return rc;
5762 }
5763 assert( !pBt->readOnly );
5764
5765#ifdef SQLITE_OMIT_AUTOVACUUM
5766 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
5767 if( rc ){
5768 return rc;
5769 }
5770#else
5771 if( pBt->autoVacuum ){
5772 Pgno pgnoMove; /* Move a page here to make room for the root-page */
5773 MemPage *pPageMove; /* The page to move to. */
5774
5775 /* Creating a new table may probably require moving an existing database
5776 ** to make room for the new tables root page. In case this page turns
5777 ** out to be an overflow page, delete all overflow page-map caches
5778 ** held by open cursors.
5779 */
5780 invalidateAllOverflowCache(pBt);
5781
5782 /* Read the value of meta[3] from the database to determine where the
5783 ** root page of the new table should go. meta[3] is the largest root-page
5784 ** created so far, so the new root-page is (meta[3]+1).
5785 */
5786 rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
5787 if( rc!=SQLITE_OK ){
5788 return rc;
5789 }
5790 pgnoRoot++;
5791
5792 /* The new root-page may not be allocated on a pointer-map page, or the
5793 ** PENDING_BYTE page.
5794 */
5795 if( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
5796 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
5797 pgnoRoot++;
5798 }
5799 assert( pgnoRoot>=3 );
5800
5801 /* Allocate a page. The page that currently resides at pgnoRoot will
5802 ** be moved to the allocated page (unless the allocated page happens
5803 ** to reside at pgnoRoot).
5804 */
5805 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
5806 if( rc!=SQLITE_OK ){
5807 return rc;
5808 }
5809
5810 if( pgnoMove!=pgnoRoot ){
5811 /* pgnoRoot is the page that will be used for the root-page of
5812 ** the new table (assuming an error did not occur). But we were
5813 ** allocated pgnoMove. If required (i.e. if it was not allocated
5814 ** by extending the file), the current page at position pgnoMove
5815 ** is already journaled.
5816 */
5817 u8 eType;
5818 Pgno iPtrPage;
5819
5820 releasePage(pPageMove);
5821
5822 /* Move the page currently at pgnoRoot to pgnoMove. */
5823 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
5824 if( rc!=SQLITE_OK ){
5825 return rc;
5826 }
5827 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
5828 if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
5829 releasePage(pRoot);
5830 return rc;
5831 }
5832 assert( eType!=PTRMAP_ROOTPAGE );
5833 assert( eType!=PTRMAP_FREEPAGE );
5834 rc = sqlite3PagerWrite(pRoot->pDbPage);
5835 if( rc!=SQLITE_OK ){
5836 releasePage(pRoot);
5837 return rc;
5838 }
5839 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove);
5840 releasePage(pRoot);
5841
5842 /* Obtain the page at pgnoRoot */
5843 if( rc!=SQLITE_OK ){
5844 return rc;
5845 }
5846 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
5847 if( rc!=SQLITE_OK ){
5848 return rc;
5849 }
5850 rc = sqlite3PagerWrite(pRoot->pDbPage);
5851 if( rc!=SQLITE_OK ){
5852 releasePage(pRoot);
5853 return rc;
5854 }
5855 }else{
5856 pRoot = pPageMove;
5857 }
5858
5859 /* Update the pointer-map and meta-data with the new root-page number. */
5860 rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
5861 if( rc ){
5862 releasePage(pRoot);
5863 return rc;
5864 }
5865 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
5866 if( rc ){
5867 releasePage(pRoot);
5868 return rc;
5869 }
5870
5871 }else{
5872 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
5873 if( rc ) return rc;
5874 }
5875#endif
5876 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
5877 zeroPage(pRoot, flags | PTF_LEAF);
5878 sqlite3PagerUnref(pRoot->pDbPage);
5879 *piTable = (int)pgnoRoot;
5880 return SQLITE_OK;
5881}
5882int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
5883 int rc;
5884 sqlite3BtreeEnter(p);
5885 rc = btreeCreateTable(p, piTable, flags);
5886 sqlite3BtreeLeave(p);
5887 return rc;
5888}
5889
5890/*
5891** Erase the given database page and all its children. Return
5892** the page to the freelist.
5893*/
5894static int clearDatabasePage(
5895 BtShared *pBt, /* The BTree that contains the table */
5896 Pgno pgno, /* Page number to clear */
5897 MemPage *pParent, /* Parent page. NULL for the root */
5898 int freePageFlag /* Deallocate page if true */
5899){
5900 MemPage *pPage = 0;
5901 int rc;
5902 unsigned char *pCell;
5903 int i;
5904
5905 assert( sqlite3_mutex_held(pBt->mutex) );
5906 if( pgno>sqlite3PagerPagecount(pBt->pPager) ){
5907 return SQLITE_CORRUPT_BKPT;
5908 }
5909
5910 rc = getAndInitPage(pBt, pgno, &pPage, pParent);
5911 if( rc ) goto cleardatabasepage_out;
5912 for(i=0; i<pPage->nCell; i++){
5913 pCell = findCell(pPage, i);
5914 if( !pPage->leaf ){
5915 rc = clearDatabasePage(pBt, get4byte(pCell), pPage->pParent, 1);
5916 if( rc ) goto cleardatabasepage_out;
5917 }
5918 rc = clearCell(pPage, pCell);
5919 if( rc ) goto cleardatabasepage_out;
5920 }
5921 if( !pPage->leaf ){
5922 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage->pParent, 1);
5923 if( rc ) goto cleardatabasepage_out;
5924 }
5925 if( freePageFlag ){
5926 rc = freePage(pPage);
5927 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
5928 zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
5929 }
5930
5931cleardatabasepage_out:
5932 releasePage(pPage);
5933 return rc;
5934}
5935
5936/*
5937** Delete all information from a single table in the database. iTable is
5938** the page number of the root of the table. After this routine returns,
5939** the root page is empty, but still exists.
5940**
5941** This routine will fail with SQLITE_LOCKED if there are any open
5942** read cursors on the table. Open write cursors are moved to the
5943** root of the table.
5944*/
5945int sqlite3BtreeClearTable(Btree *p, int iTable){
5946 int rc;
5947 BtShared *pBt = p->pBt;
5948 sqlite3BtreeEnter(p);
5949 if( p->inTrans!=TRANS_WRITE ){
5950 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5951 }else if( (rc = checkReadLocks(p, iTable, 0))!=SQLITE_OK ){
5952 /* nothing to do */
5953 }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
5954 /* nothing to do */
5955 }else{
5956 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
5957 }
5958 sqlite3BtreeLeave(p);
5959 return rc;
5960}
5961
5962/*
5963** Erase all information in a table and add the root of the table to
5964** the freelist. Except, the root of the principle table (the one on
5965** page 1) is never added to the freelist.
5966**
5967** This routine will fail with SQLITE_LOCKED if there are any open
5968** cursors on the table.
5969**
5970** If AUTOVACUUM is enabled and the page at iTable is not the last
5971** root page in the database file, then the last root page
5972** in the database file is moved into the slot formerly occupied by
5973** iTable and that last slot formerly occupied by the last root page
5974** is added to the freelist instead of iTable. In this say, all
5975** root pages are kept at the beginning of the database file, which
5976** is necessary for AUTOVACUUM to work right. *piMoved is set to the
5977** page number that used to be the last root page in the file before
5978** the move. If no page gets moved, *piMoved is set to 0.
5979** The last root page is recorded in meta[3] and the value of
5980** meta[3] is updated by this procedure.
5981*/
5982static int btreeDropTable(Btree *p, int iTable, int *piMoved){
5983 int rc;
5984 MemPage *pPage = 0;
5985 BtShared *pBt = p->pBt;
5986
5987 assert( sqlite3BtreeHoldsMutex(p) );
5988 if( p->inTrans!=TRANS_WRITE ){
5989 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5990 }
5991
5992 /* It is illegal to drop a table if any cursors are open on the
5993 ** database. This is because in auto-vacuum mode the backend may
5994 ** need to move another root-page to fill a gap left by the deleted
5995 ** root page. If an open cursor was using this page a problem would
5996 ** occur.
5997 */
5998 if( pBt->pCursor ){
5999 return SQLITE_LOCKED;
6000 }
6001
6002 rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
6003 if( rc ) return rc;
6004 rc = sqlite3BtreeClearTable(p, iTable);
6005 if( rc ){
6006 releasePage(pPage);
6007 return rc;
6008 }
6009
6010 *piMoved = 0;
6011
6012 if( iTable>1 ){
6013#ifdef SQLITE_OMIT_AUTOVACUUM
6014 rc = freePage(pPage);
6015 releasePage(pPage);
6016#else
6017 if( pBt->autoVacuum ){
6018 Pgno maxRootPgno;
6019 rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
6020 if( rc!=SQLITE_OK ){
6021 releasePage(pPage);
6022 return rc;
6023 }
6024
6025 if( iTable==maxRootPgno ){
6026 /* If the table being dropped is the table with the largest root-page
6027 ** number in the database, put the root page on the free list.
6028 */
6029 rc = freePage(pPage);
6030 releasePage(pPage);
6031 if( rc!=SQLITE_OK ){
6032 return rc;
6033 }
6034 }else{
6035 /* The table being dropped does not have the largest root-page
6036 ** number in the database. So move the page that does into the
6037 ** gap left by the deleted root-page.
6038 */
6039 MemPage *pMove;
6040 releasePage(pPage);
6041 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
6042 if( rc!=SQLITE_OK ){
6043 return rc;
6044 }
6045 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable);
6046 releasePage(pMove);
6047 if( rc!=SQLITE_OK ){
6048 return rc;
6049 }
6050 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
6051 if( rc!=SQLITE_OK ){
6052 return rc;
6053 }
6054 rc = freePage(pMove);
6055 releasePage(pMove);
6056 if( rc!=SQLITE_OK ){
6057 return rc;
6058 }
6059 *piMoved = maxRootPgno;
6060 }
6061
6062 /* Set the new 'max-root-page' value in the database header. This
6063 ** is the old value less one, less one more if that happens to
6064 ** be a root-page number, less one again if that is the
6065 ** PENDING_BYTE_PAGE.
6066 */
6067 maxRootPgno--;
6068 if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
6069 maxRootPgno--;
6070 }
6071 if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
6072 maxRootPgno--;
6073 }
6074 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
6075
6076 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
6077 }else{
6078 rc = freePage(pPage);
6079 releasePage(pPage);
6080 }
6081#endif
6082 }else{
6083 /* If sqlite3BtreeDropTable was called on page 1. */
6084 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
6085 releasePage(pPage);
6086 }
6087 return rc;
6088}
6089int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
6090 int rc;
6091 sqlite3BtreeEnter(p);
6092 rc = btreeDropTable(p, iTable, piMoved);
6093 sqlite3BtreeLeave(p);
6094 return rc;
6095}
6096
6097
6098/*
6099** Read the meta-information out of a database file. Meta[0]
6100** is the number of free pages currently in the database. Meta[1]
6101** through meta[15] are available for use by higher layers. Meta[0]
6102** is read-only, the others are read/write.
6103**
6104** The schema layer numbers meta values differently. At the schema
6105** layer (and the SetCookie and ReadCookie opcodes) the number of
6106** free pages is not visible. So Cookie[0] is the same as Meta[1].
6107*/
6108int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
6109 DbPage *pDbPage;
6110 int rc;
6111 unsigned char *pP1;
6112 BtShared *pBt = p->pBt;
6113
6114 sqlite3BtreeEnter(p);
6115
6116 /* Reading a meta-data value requires a read-lock on page 1 (and hence
6117 ** the sqlite_master table. We grab this lock regardless of whether or
6118 ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
6119 ** 1 is treated as a special case by queryTableLock() and lockTable()).
6120 */
6121 rc = queryTableLock(p, 1, READ_LOCK);
6122 if( rc!=SQLITE_OK ){
6123 sqlite3BtreeLeave(p);
6124 return rc;
6125 }
6126
6127 assert( idx>=0 && idx<=15 );
6128 rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
6129 if( rc ){
6130 sqlite3BtreeLeave(p);
6131 return rc;
6132 }
6133 pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
6134 *pMeta = get4byte(&pP1[36 + idx*4]);
6135 sqlite3PagerUnref(pDbPage);
6136
6137 /* If autovacuumed is disabled in this build but we are trying to
6138 ** access an autovacuumed database, then make the database readonly.
6139 */
6140#ifdef SQLITE_OMIT_AUTOVACUUM
6141 if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
6142#endif
6143
6144 /* Grab the read-lock on page 1. */
6145 rc = lockTable(p, 1, READ_LOCK);
6146 sqlite3BtreeLeave(p);
6147 return rc;
6148}
6149
6150/*
6151** Write meta-information back into the database. Meta[0] is
6152** read-only and may not be written.
6153*/
6154int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
6155 BtShared *pBt = p->pBt;
6156 unsigned char *pP1;
6157 int rc;
6158 assert( idx>=1 && idx<=15 );
6159 sqlite3BtreeEnter(p);
6160 if( p->inTrans!=TRANS_WRITE ){
6161 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6162 }else{
6163 assert( pBt->pPage1!=0 );
6164 pP1 = pBt->pPage1->aData;
6165 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6166 if( rc==SQLITE_OK ){
6167 put4byte(&pP1[36 + idx*4], iMeta);
6168#ifndef SQLITE_OMIT_AUTOVACUUM
6169 if( idx==7 ){
6170 assert( pBt->autoVacuum || iMeta==0 );
6171 assert( iMeta==0 || iMeta==1 );
6172 pBt->incrVacuum = iMeta;
6173 }
6174#endif
6175 }
6176 }
6177 sqlite3BtreeLeave(p);
6178 return rc;
6179}
6180
6181/*
6182** Return the flag byte at the beginning of the page that the cursor
6183** is currently pointing to.
6184*/
6185int sqlite3BtreeFlags(BtCursor *pCur){
6186 /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call
6187 ** restoreOrClearCursorPosition() here.
6188 */
6189 MemPage *pPage = pCur->pPage;
6190 assert( cursorHoldsMutex(pCur) );
6191 assert( pPage->pBt==pCur->pBt );
6192 return pPage ? pPage->aData[pPage->hdrOffset] : 0;
6193}
6194
6195
6196/*
6197** Return the pager associated with a BTree. This routine is used for
6198** testing and debugging only.
6199*/
6200Pager *sqlite3BtreePager(Btree *p){
6201 return p->pBt->pPager;
6202}
6203
6204#ifndef SQLITE_OMIT_INTEGRITY_CHECK
6205/*
6206** Append a message to the error message string.
6207*/
6208static void checkAppendMsg(
6209 IntegrityCk *pCheck,
6210 char *zMsg1,
6211 const char *zFormat,
6212 ...
6213){
6214 va_list ap;
6215 char *zMsg2;
6216 if( !pCheck->mxErr ) return;
6217 pCheck->mxErr--;
6218 pCheck->nErr++;
6219 va_start(ap, zFormat);
6220 zMsg2 = sqlite3VMPrintf(0, zFormat, ap);
6221 va_end(ap);
6222 if( zMsg1==0 ) zMsg1 = "";
6223 if( pCheck->zErrMsg ){
6224 char *zOld = pCheck->zErrMsg;
6225 pCheck->zErrMsg = 0;
6226 sqlite3SetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
6227 sqlite3_free(zOld);
6228 }else{
6229 sqlite3SetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
6230 }
6231 sqlite3_free(zMsg2);
6232}
6233#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6234
6235#ifndef SQLITE_OMIT_INTEGRITY_CHECK
6236/*
6237** Add 1 to the reference count for page iPage. If this is the second
6238** reference to the page, add an error message to pCheck->zErrMsg.
6239** Return 1 if there are 2 ore more references to the page and 0 if
6240** if this is the first reference to the page.
6241**
6242** Also check that the page number is in bounds.
6243*/
6244static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
6245 if( iPage==0 ) return 1;
6246 if( iPage>pCheck->nPage || iPage<0 ){
6247 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
6248 return 1;
6249 }
6250 if( pCheck->anRef[iPage]==1 ){
6251 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
6252 return 1;
6253 }
6254 return (pCheck->anRef[iPage]++)>1;
6255}
6256
6257#ifndef SQLITE_OMIT_AUTOVACUUM
6258/*
6259** Check that the entry in the pointer-map for page iChild maps to
6260** page iParent, pointer type ptrType. If not, append an error message
6261** to pCheck.
6262*/
6263static void checkPtrmap(
6264 IntegrityCk *pCheck, /* Integrity check context */
6265 Pgno iChild, /* Child page number */
6266 u8 eType, /* Expected pointer map type */
6267 Pgno iParent, /* Expected pointer map parent page number */
6268 char *zContext /* Context description (used for error msg) */
6269){
6270 int rc;
6271 u8 ePtrmapType;
6272 Pgno iPtrmapParent;
6273
6274 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
6275 if( rc!=SQLITE_OK ){
6276 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
6277 return;
6278 }
6279
6280 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
6281 checkAppendMsg(pCheck, zContext,
6282 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
6283 iChild, eType, iParent, ePtrmapType, iPtrmapParent);
6284 }
6285}
6286#endif
6287
6288/*
6289** Check the integrity of the freelist or of an overflow page list.
6290** Verify that the number of pages on the list is N.
6291*/
6292static void checkList(
6293 IntegrityCk *pCheck, /* Integrity checking context */
6294 int isFreeList, /* True for a freelist. False for overflow page list */
6295 int iPage, /* Page number for first page in the list */
6296 int N, /* Expected number of pages in the list */
6297 char *zContext /* Context for error messages */
6298){
6299 int i;
6300 int expected = N;
6301 int iFirst = iPage;
6302 while( N-- > 0 && pCheck->mxErr ){
6303 DbPage *pOvflPage;
6304 unsigned char *pOvflData;
6305 if( iPage<1 ){
6306 checkAppendMsg(pCheck, zContext,
6307 "%d of %d pages missing from overflow list starting at %d",
6308 N+1, expected, iFirst);
6309 break;
6310 }
6311 if( checkRef(pCheck, iPage, zContext) ) break;
6312 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
6313 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
6314 break;
6315 }
6316 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
6317 if( isFreeList ){
6318 int n = get4byte(&pOvflData[4]);
6319#ifndef SQLITE_OMIT_AUTOVACUUM
6320 if( pCheck->pBt->autoVacuum ){
6321 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
6322 }
6323#endif
6324 if( n>pCheck->pBt->usableSize/4-8 ){
6325 checkAppendMsg(pCheck, zContext,
6326 "freelist leaf count too big on page %d", iPage);
6327 N--;
6328 }else{
6329 for(i=0; i<n; i++){
6330 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
6331#ifndef SQLITE_OMIT_AUTOVACUUM
6332 if( pCheck->pBt->autoVacuum ){
6333 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
6334 }
6335#endif
6336 checkRef(pCheck, iFreePage, zContext);
6337 }
6338 N -= n;
6339 }
6340 }
6341#ifndef SQLITE_OMIT_AUTOVACUUM
6342 else{
6343 /* If this database supports auto-vacuum and iPage is not the last
6344 ** page in this overflow list, check that the pointer-map entry for
6345 ** the following page matches iPage.
6346 */
6347 if( pCheck->pBt->autoVacuum && N>0 ){
6348 i = get4byte(pOvflData);
6349 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
6350 }
6351 }
6352#endif
6353 iPage = get4byte(pOvflData);
6354 sqlite3PagerUnref(pOvflPage);
6355 }
6356}
6357#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6358
6359#ifndef SQLITE_OMIT_INTEGRITY_CHECK
6360/*
6361** Do various sanity checks on a single page of a tree. Return
6362** the tree depth. Root pages return 0. Parents of root pages
6363** return 1, and so forth.
6364**
6365** These checks are done:
6366**
6367** 1. Make sure that cells and freeblocks do not overlap
6368** but combine to completely cover the page.
6369** NO 2. Make sure cell keys are in order.
6370** NO 3. Make sure no key is less than or equal to zLowerBound.
6371** NO 4. Make sure no key is greater than or equal to zUpperBound.
6372** 5. Check the integrity of overflow pages.
6373** 6. Recursively call checkTreePage on all children.
6374** 7. Verify that the depth of all children is the same.
6375** 8. Make sure this page is at least 33% full or else it is
6376** the root of the tree.
6377*/
6378static int checkTreePage(
6379 IntegrityCk *pCheck, /* Context for the sanity check */
6380 int iPage, /* Page number of the page to check */
6381 MemPage *pParent, /* Parent page */
6382 char *zParentContext /* Parent context */
6383){
6384 MemPage *pPage;
6385 int i, rc, depth, d2, pgno, cnt;
6386 int hdr, cellStart;
6387 int nCell;
6388 u8 *data;
6389 BtShared *pBt;
6390 int usableSize;
6391 char zContext[100];
6392 char *hit;
6393
6394 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
6395
6396 /* Check that the page exists
6397 */
6398 pBt = pCheck->pBt;
6399 usableSize = pBt->usableSize;
6400 if( iPage==0 ) return 0;
6401 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
6402 if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
6403 checkAppendMsg(pCheck, zContext,
6404 "unable to get the page. error code=%d", rc);
6405 return 0;
6406 }
6407 if( (rc = sqlite3BtreeInitPage(pPage, pParent))!=0 ){
6408 checkAppendMsg(pCheck, zContext,
6409 "sqlite3BtreeInitPage() returns error code %d", rc);
6410 releasePage(pPage);
6411 return 0;
6412 }
6413
6414 /* Check out all the cells.
6415 */
6416 depth = 0;
6417 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
6418 u8 *pCell;
6419 int sz;
6420 CellInfo info;
6421
6422 /* Check payload overflow pages
6423 */
6424 sqlite3_snprintf(sizeof(zContext), zContext,
6425 "On tree page %d cell %d: ", iPage, i);
6426 pCell = findCell(pPage,i);
6427 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
6428 sz = info.nData;
6429 if( !pPage->intKey ) sz += info.nKey;
6430 assert( sz==info.nPayload );
6431 if( sz>info.nLocal ){
6432 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
6433 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
6434#ifndef SQLITE_OMIT_AUTOVACUUM
6435 if( pBt->autoVacuum ){
6436 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
6437 }
6438#endif
6439 checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
6440 }
6441
6442 /* Check sanity of left child page.
6443 */
6444 if( !pPage->leaf ){
6445 pgno = get4byte(pCell);
6446#ifndef SQLITE_OMIT_AUTOVACUUM
6447 if( pBt->autoVacuum ){
6448 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
6449 }
6450#endif
6451 d2 = checkTreePage(pCheck,pgno,pPage,zContext);
6452 if( i>0 && d2!=depth ){
6453 checkAppendMsg(pCheck, zContext, "Child page depth differs");
6454 }
6455 depth = d2;
6456 }
6457 }
6458 if( !pPage->leaf ){
6459 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
6460 sqlite3_snprintf(sizeof(zContext), zContext,
6461 "On page %d at right child: ", iPage);
6462#ifndef SQLITE_OMIT_AUTOVACUUM
6463 if( pBt->autoVacuum ){
6464 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
6465 }
6466#endif
6467 checkTreePage(pCheck, pgno, pPage, zContext);
6468 }
6469
6470 /* Check for complete coverage of the page
6471 */
6472 data = pPage->aData;
6473 hdr = pPage->hdrOffset;
6474 hit = sqlite3MallocZero( usableSize );
6475 if( hit ){
6476 memset(hit, 1, get2byte(&data[hdr+5]));
6477 nCell = get2byte(&data[hdr+3]);
6478 cellStart = hdr + 12 - 4*pPage->leaf;
6479 for(i=0; i<nCell; i++){
6480 int pc = get2byte(&data[cellStart+i*2]);
6481 int size = cellSizePtr(pPage, &data[pc]);
6482 int j;
6483 if( (pc+size-1)>=usableSize || pc<0 ){
6484 checkAppendMsg(pCheck, 0,
6485 "Corruption detected in cell %d on page %d",i,iPage,0);
6486 }else{
6487 for(j=pc+size-1; j>=pc; j--) hit[j]++;
6488 }
6489 }
6490 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
6491 cnt++){
6492 int size = get2byte(&data[i+2]);
6493 int j;
6494 if( (i+size-1)>=usableSize || i<0 ){
6495 checkAppendMsg(pCheck, 0,
6496 "Corruption detected in cell %d on page %d",i,iPage,0);
6497 }else{
6498 for(j=i+size-1; j>=i; j--) hit[j]++;
6499 }
6500 i = get2byte(&data[i]);
6501 }
6502 for(i=cnt=0; i<usableSize; i++){
6503 if( hit[i]==0 ){
6504 cnt++;
6505 }else if( hit[i]>1 ){
6506 checkAppendMsg(pCheck, 0,
6507 "Multiple uses for byte %d of page %d", i, iPage);
6508 break;
6509 }
6510 }
6511 if( cnt!=data[hdr+7] ){
6512 checkAppendMsg(pCheck, 0,
6513 "Fragmented space is %d byte reported as %d on page %d",
6514 cnt, data[hdr+7], iPage);
6515 }
6516 }
6517 sqlite3_free(hit);
6518
6519 releasePage(pPage);
6520 return depth+1;
6521}
6522#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6523
6524#ifndef SQLITE_OMIT_INTEGRITY_CHECK
6525/*
6526** This routine does a complete check of the given BTree file. aRoot[] is
6527** an array of pages numbers were each page number is the root page of
6528** a table. nRoot is the number of entries in aRoot.
6529**
6530** If everything checks out, this routine returns NULL. If something is
6531** amiss, an error message is written into memory obtained from malloc()
6532** and a pointer to that error message is returned. The calling function
6533** is responsible for freeing the error message when it is done.
6534*/
6535char *sqlite3BtreeIntegrityCheck(
6536 Btree *p, /* The btree to be checked */
6537 int *aRoot, /* An array of root pages numbers for individual trees */
6538 int nRoot, /* Number of entries in aRoot[] */
6539 int mxErr, /* Stop reporting errors after this many */
6540 int *pnErr /* Write number of errors seen to this variable */
6541){
6542 int i;
6543 int nRef;
6544 IntegrityCk sCheck;
6545 BtShared *pBt = p->pBt;
6546
6547 sqlite3BtreeEnter(p);
6548 nRef = sqlite3PagerRefcount(pBt->pPager);
6549 if( lockBtreeWithRetry(p)!=SQLITE_OK ){
6550 sqlite3BtreeLeave(p);
6551 return sqlite3StrDup("Unable to acquire a read lock on the database");
6552 }
6553 sCheck.pBt = pBt;
6554 sCheck.pPager = pBt->pPager;
6555 sCheck.nPage = sqlite3PagerPagecount(sCheck.pPager);
6556 sCheck.mxErr = mxErr;
6557 sCheck.nErr = 0;
6558 *pnErr = 0;
6559#ifndef SQLITE_OMIT_AUTOVACUUM
6560 if( pBt->nTrunc!=0 ){
6561 sCheck.nPage = pBt->nTrunc;
6562 }
6563#endif
6564 if( sCheck.nPage==0 ){
6565 unlockBtreeIfUnused(pBt);
6566 sqlite3BtreeLeave(p);
6567 return 0;
6568 }
6569 sCheck.anRef = sqlite3_malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
6570 if( !sCheck.anRef ){
6571 unlockBtreeIfUnused(pBt);
6572 *pnErr = 1;
6573 sqlite3BtreeLeave(p);
6574 return sqlite3MPrintf(p->pSqlite, "Unable to malloc %d bytes",
6575 (sCheck.nPage+1)*sizeof(sCheck.anRef[0]));
6576 }
6577 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
6578 i = PENDING_BYTE_PAGE(pBt);
6579 if( i<=sCheck.nPage ){
6580 sCheck.anRef[i] = 1;
6581 }
6582 sCheck.zErrMsg = 0;
6583
6584 /* Check the integrity of the freelist
6585 */
6586 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
6587 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
6588
6589 /* Check all the tables.
6590 */
6591 for(i=0; i<nRoot && sCheck.mxErr; i++){
6592 if( aRoot[i]==0 ) continue;
6593#ifndef SQLITE_OMIT_AUTOVACUUM
6594 if( pBt->autoVacuum && aRoot[i]>1 ){
6595 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
6596 }
6597#endif
6598 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ");
6599 }
6600
6601 /* Make sure every page in the file is referenced
6602 */
6603 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
6604#ifdef SQLITE_OMIT_AUTOVACUUM
6605 if( sCheck.anRef[i]==0 ){
6606 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6607 }
6608#else
6609 /* If the database supports auto-vacuum, make sure no tables contain
6610 ** references to pointer-map pages.
6611 */
6612 if( sCheck.anRef[i]==0 &&
6613 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
6614 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6615 }
6616 if( sCheck.anRef[i]!=0 &&
6617 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
6618 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
6619 }
6620#endif
6621 }
6622
6623 /* Make sure this analysis did not leave any unref() pages
6624 */
6625 unlockBtreeIfUnused(pBt);
6626 if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
6627 checkAppendMsg(&sCheck, 0,
6628 "Outstanding page count goes from %d to %d during this analysis",
6629 nRef, sqlite3PagerRefcount(pBt->pPager)
6630 );
6631 }
6632
6633 /* Clean up and report errors.
6634 */
6635 sqlite3BtreeLeave(p);
6636 sqlite3_free(sCheck.anRef);
6637 *pnErr = sCheck.nErr;
6638 return sCheck.zErrMsg;
6639}
6640#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6641
6642/*
6643** Return the full pathname of the underlying database file.
6644**
6645** The pager filename is invariant as long as the pager is
6646** open so it is safe to access without the BtShared mutex.
6647*/
6648const char *sqlite3BtreeGetFilename(Btree *p){
6649 assert( p->pBt->pPager!=0 );
6650 return sqlite3PagerFilename(p->pBt->pPager);
6651}
6652
6653/*
6654** Return the pathname of the directory that contains the database file.
6655**
6656** The pager directory name is invariant as long as the pager is
6657** open so it is safe to access without the BtShared mutex.
6658*/
6659const char *sqlite3BtreeGetDirname(Btree *p){
6660 assert( p->pBt->pPager!=0 );
6661 return sqlite3PagerDirname(p->pBt->pPager);
6662}
6663
6664/*
6665** Return the pathname of the journal file for this database. The return
6666** value of this routine is the same regardless of whether the journal file
6667** has been created or not.
6668**
6669** The pager journal filename is invariant as long as the pager is
6670** open so it is safe to access without the BtShared mutex.
6671*/
6672const char *sqlite3BtreeGetJournalname(Btree *p){
6673 assert( p->pBt->pPager!=0 );
6674 return sqlite3PagerJournalname(p->pBt->pPager);
6675}
6676
6677#ifndef SQLITE_OMIT_VACUUM
6678/*
6679** Copy the complete content of pBtFrom into pBtTo. A transaction
6680** must be active for both files.
6681**
6682** The size of file pBtFrom may be reduced by this operation.
6683** If anything goes wrong, the transaction on pBtFrom is rolled back.
6684*/
6685static int btreeCopyFile(Btree *pTo, Btree *pFrom){
6686 int rc = SQLITE_OK;
6687 Pgno i, nPage, nToPage, iSkip;
6688
6689 BtShared *pBtTo = pTo->pBt;
6690 BtShared *pBtFrom = pFrom->pBt;
6691
6692 if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
6693 return SQLITE_ERROR;
6694 }
6695 if( pBtTo->pCursor ) return SQLITE_BUSY;
6696 nToPage = sqlite3PagerPagecount(pBtTo->pPager);
6697 nPage = sqlite3PagerPagecount(pBtFrom->pPager);
6698 iSkip = PENDING_BYTE_PAGE(pBtTo);
6699 for(i=1; rc==SQLITE_OK && i<=nPage; i++){
6700 DbPage *pDbPage;
6701 if( i==iSkip ) continue;
6702 rc = sqlite3PagerGet(pBtFrom->pPager, i, &pDbPage);
6703 if( rc ) break;
6704 rc = sqlite3PagerOverwrite(pBtTo->pPager, i, sqlite3PagerGetData(pDbPage));
6705 sqlite3PagerUnref(pDbPage);
6706 }
6707
6708 /* If the file is shrinking, journal the pages that are being truncated
6709 ** so that they can be rolled back if the commit fails.
6710 */
6711 for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
6712 DbPage *pDbPage;
6713 if( i==iSkip ) continue;
6714 rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
6715 if( rc ) break;
6716 rc = sqlite3PagerWrite(pDbPage);
6717 sqlite3PagerDontWrite(pDbPage);
6718 /* Yeah. It seems wierd to call DontWrite() right after Write(). But
6719 ** that is because the names of those procedures do not exactly
6720 ** represent what they do. Write() really means "put this page in the
6721 ** rollback journal and mark it as dirty so that it will be written
6722 ** to the database file later." DontWrite() undoes the second part of
6723 ** that and prevents the page from being written to the database. The
6724 ** page is still on the rollback journal, though. And that is the whole
6725 ** point of this loop: to put pages on the rollback journal. */
6726 sqlite3PagerUnref(pDbPage);
6727 }
6728 if( !rc && nPage<nToPage ){
6729 rc = sqlite3PagerTruncate(pBtTo->pPager, nPage);
6730 }
6731
6732 if( rc ){
6733 sqlite3BtreeRollback(pTo);
6734 }
6735 return rc;
6736}
6737int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
6738 int rc;
6739 sqlite3BtreeEnter(pTo);
6740 sqlite3BtreeEnter(pFrom);
6741 rc = btreeCopyFile(pTo, pFrom);
6742 sqlite3BtreeLeave(pFrom);
6743 sqlite3BtreeLeave(pTo);
6744 return rc;
6745}
6746
6747#endif /* SQLITE_OMIT_VACUUM */
6748
6749/*
6750** Return non-zero if a transaction is active.
6751*/
6752int sqlite3BtreeIsInTrans(Btree *p){
6753 assert( p==0 || sqlite3_mutex_held(p->pSqlite->mutex) );
6754 return (p && (p->inTrans==TRANS_WRITE));
6755}
6756
6757/*
6758** Return non-zero if a statement transaction is active.
6759*/
6760int sqlite3BtreeIsInStmt(Btree *p){
6761 assert( sqlite3BtreeHoldsMutex(p) );
6762 return (p->pBt && p->pBt->inStmt);
6763}
6764
6765/*
6766** Return non-zero if a read (or write) transaction is active.
6767*/
6768int sqlite3BtreeIsInReadTrans(Btree *p){
6769 assert( sqlite3_mutex_held(p->pSqlite->mutex) );
6770 return (p && (p->inTrans!=TRANS_NONE));
6771}
6772
6773/*
6774** This function returns a pointer to a blob of memory associated with
6775** a single shared-btree. The memory is used by client code for it's own
6776** purposes (for example, to store a high-level schema associated with
6777** the shared-btree). The btree layer manages reference counting issues.
6778**
6779** The first time this is called on a shared-btree, nBytes bytes of memory
6780** are allocated, zeroed, and returned to the caller. For each subsequent
6781** call the nBytes parameter is ignored and a pointer to the same blob
6782** of memory returned.
6783**
6784** Just before the shared-btree is closed, the function passed as the
6785** xFree argument when the memory allocation was made is invoked on the
6786** blob of allocated memory. This function should not call sqlite3_free()
6787** on the memory, the btree layer does that.
6788*/
6789void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
6790 BtShared *pBt = p->pBt;
6791 sqlite3BtreeEnter(p);
6792 if( !pBt->pSchema ){
6793 pBt->pSchema = sqlite3MallocZero(nBytes);
6794 pBt->xFreeSchema = xFree;
6795 }
6796 sqlite3BtreeLeave(p);
6797 return pBt->pSchema;
6798}
6799
6800/*
6801** Return true if another user of the same shared btree as the argument
6802** handle holds an exclusive lock on the sqlite_master table.
6803*/
6804int sqlite3BtreeSchemaLocked(Btree *p){
6805 int rc;
6806 assert( sqlite3_mutex_held(p->pSqlite->mutex) );
6807 sqlite3BtreeEnter(p);
6808 rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK);
6809 sqlite3BtreeLeave(p);
6810 return rc;
6811}
6812
6813
6814#ifndef SQLITE_OMIT_SHARED_CACHE
6815/*
6816** Obtain a lock on the table whose root page is iTab. The
6817** lock is a write lock if isWritelock is true or a read lock
6818** if it is false.
6819*/
6820int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
6821 int rc = SQLITE_OK;
6822 u8 lockType = (isWriteLock?WRITE_LOCK:READ_LOCK);
6823 sqlite3BtreeEnter(p);
6824 rc = queryTableLock(p, iTab, lockType);
6825 if( rc==SQLITE_OK ){
6826 rc = lockTable(p, iTab, lockType);
6827 }
6828 sqlite3BtreeLeave(p);
6829 return rc;
6830}
6831#endif
6832
6833#ifndef SQLITE_OMIT_INCRBLOB
6834/*
6835** Argument pCsr must be a cursor opened for writing on an
6836** INTKEY table currently pointing at a valid table entry.
6837** This function modifies the data stored as part of that entry.
6838** Only the data content may only be modified, it is not possible
6839** to change the length of the data stored.
6840*/
6841int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
6842 assert( cursorHoldsMutex(pCsr) );
6843 assert( sqlite3_mutex_held(pCsr->pBtree->pSqlite->mutex) );
6844 assert(pCsr->isIncrblobHandle);
6845 if( pCsr->eState>=CURSOR_REQUIRESEEK ){
6846 if( pCsr->eState==CURSOR_FAULT ){
6847 return pCsr->skip;
6848 }else{
6849 return SQLITE_ABORT;
6850 }
6851 }
6852
6853 /* Check some preconditions:
6854 ** (a) the cursor is open for writing,
6855 ** (b) there is no read-lock on the table being modified and
6856 ** (c) the cursor points at a valid row of an intKey table.
6857 */
6858 if( !pCsr->wrFlag ){
6859 return SQLITE_READONLY;
6860 }
6861 assert( !pCsr->pBt->readOnly
6862 && pCsr->pBt->inTransaction==TRANS_WRITE );
6863 if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr) ){
6864 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
6865 }
6866 if( pCsr->eState==CURSOR_INVALID || !pCsr->pPage->intKey ){
6867 return SQLITE_ERROR;
6868 }
6869
6870 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
6871}
6872
6873/*
6874** Set a flag on this cursor to cache the locations of pages from the
6875** overflow list for the current row. This is used by cursors opened
6876** for incremental blob IO only.
6877**
6878** This function sets a flag only. The actual page location cache
6879** (stored in BtCursor.aOverflow[]) is allocated and used by function
6880** accessPayload() (the worker function for sqlite3BtreeData() and
6881** sqlite3BtreePutData()).
6882*/
6883void sqlite3BtreeCacheOverflow(BtCursor *pCur){
6884 assert( cursorHoldsMutex(pCur) );
6885 assert( sqlite3_mutex_held(pCur->pBtree->pSqlite->mutex) );
6886 assert(!pCur->isIncrblobHandle);
6887 assert(!pCur->aOverflow);
6888 pCur->isIncrblobHandle = 1;
6889}
6890#endif