/*
** A read or write transaction may or may not be active on database handle
** db. If a transaction is active, commit it. If there is a
** write-transaction spanning more than one database file, this routine
** takes care of the master journal trickery.
**
** Return value:
**
**   SQLITE_OK          on success (the value of rc at the end of the
**                      chosen commit path),
**   SQLITE_CONSTRAINT  if the commit hook returns non-zero,
**   SQLITE_NOMEM       if the master journal name cannot be allocated,
**   otherwise the error code of the first sync/open/write call that
**   failed.
*/
static int vdbeCommit(sqlite3 *db){
  int i;
  int nTrans = 0;  /* Number of databases with an active write-transaction */
  int rc = SQLITE_OK;
  int needXcommit = 0;

  /* Count the transactions. Index 1 is the TEMP database, which is never
  ** counted towards the multi-file case. */
  for(i=0; i<db->nDb; i++){
    Btree *pBt = db->aDb[i].pBt;
    if( pBt && sqlite3BtreeIsInTrans(pBt) ){
      needXcommit = 1;
      if( i!=1 ) nTrans++;
    }
  }

  /* If there are any write-transactions at all, invoke the commit hook */
  if( needXcommit && db->xCommitCallback ){
    int rcHook;    /* Return value of the user supplied commit hook */
    sqlite3SafetyOff(db);
    rcHook = db->xCommitCallback(db->pCommitArg);
    sqlite3SafetyOn(db);
    if( rcHook ){
      return SQLITE_CONSTRAINT;
    }
  }

  /* The simple case - no more than one database file (not counting the
  ** TEMP database) has a transaction active.   There is no need for the
  ** master-journal.
  **
  ** If the return value of sqlite3BtreeGetFilename() is a zero length
  ** string, it means the main database is :memory:.  In that case we do
  ** not support atomic multi-file commits, so use the simple case then
  ** too.
  */
  if( 0==strlen(sqlite3BtreeGetFilename(db->aDb[0].pBt)) || nTrans<=1 ){
    for(i=0; rc==SQLITE_OK && i<db->nDb; i++){
      Btree *pBt = db->aDb[i].pBt;
      if( pBt ){
        rc = sqlite3BtreeSync(pBt, 0);
      }
    }

    /* Do the commit only if all databases successfully synced */
    if( rc==SQLITE_OK ){
      for(i=0; i<db->nDb; i++){
        Btree *pBt = db->aDb[i].pBt;
        if( pBt ){
          sqlite3BtreeCommit(pBt);
        }
      }
    }
  }

  /* The complex case - There is a multi-file write-transaction active.
  ** This requires a master journal file to ensure the transaction is
  ** committed atomically.
  */
  else{
    char *zMaster = 0;   /* File-name for the master journal */
    char const *zMainFile = sqlite3BtreeGetFilename(db->aDb[0].pBt);
    OsFile master;

    /* Select a master journal file name */
    do {
      u32 random;
      sqliteFree(zMaster);
      sqlite3Randomness(sizeof(random), &random);
      zMaster = sqlite3MPrintf("%s-mj%08X", zMainFile, random&0x7fffffff);
      if( !zMaster ){
        return SQLITE_NOMEM;
      }
    }while( sqlite3OsFileExists(zMaster) );

    /* Open the master journal. */
    memset(&master, 0, sizeof(master));
    rc = sqlite3OsOpenExclusive(zMaster, &master, 0);
    if( rc!=SQLITE_OK ){
      sqliteFree(zMaster);
      return rc;
    }

    /* Write the name of each database file in the transaction into the new
    ** master journal file. If an error occurs at this point close
    ** and delete the master journal file. All the individual journal files
    ** still have 'null' as the master journal pointer, so they will roll
    ** back independently if a failure occurs.
    */
    for(i=0; i<db->nDb; i++){
      Btree *pBt = db->aDb[i].pBt;
      if( i==1 ) continue;   /* Ignore the TEMP database */
      if( pBt && sqlite3BtreeIsInTrans(pBt) ){
        char const *zFile = sqlite3BtreeGetJournalname(pBt);
        if( zFile[0]==0 ) continue;  /* Ignore :memory: databases */
        rc = sqlite3OsWrite(&master, zFile, strlen(zFile)+1);
        if( rc!=SQLITE_OK ){
          sqlite3OsClose(&master);
          sqlite3OsDelete(zMaster);
          sqliteFree(zMaster);
          return rc;
        }
      }
    }

    /* Sync the master journal file. Before doing this, open the directory
    ** the master journal file is stored in so that it gets synced too.
    **
    ** No individual journal refers to the master journal yet (that only
    ** happens in the sqlite3BtreeSync() loop below), so if either step
    ** fails the master journal is deleted rather than left behind as an
    ** orphan.
    */
    zMainFile = sqlite3BtreeGetDirname(db->aDb[0].pBt);
    rc = sqlite3OsOpenDirectory(zMainFile, &master);
    if( rc==SQLITE_OK ){
      rc = sqlite3OsSync(&master);
    }
    if( rc!=SQLITE_OK ){
      sqlite3OsClose(&master);
      sqlite3OsDelete(zMaster);
      sqliteFree(zMaster);
      return rc;
    }

    /* Sync all the db files involved in the transaction. The same call
    ** sets the master journal pointer in each individual journal. If
    ** an error occurs here, do not delete the master journal file.
    **
    ** If the error occurs during the first call to sqlite3BtreeSync(),
    ** then there is a chance that the master journal file will be
    ** orphaned. But we cannot delete it, in case the master journal
    ** file name was written into the journal file before the failure
    ** occurred.
    */
    for(i=0; i<db->nDb; i++){
      Btree *pBt = db->aDb[i].pBt;
      if( pBt && sqlite3BtreeIsInTrans(pBt) ){
        rc = sqlite3BtreeSync(pBt, zMaster);
        if( rc!=SQLITE_OK ){
          sqlite3OsClose(&master);
          sqliteFree(zMaster);
          return rc;
        }
      }
    }
    sqlite3OsClose(&master);

    /* Delete the master journal file. This commits the transaction. After
    ** doing this the directory is synced again before any individual
    ** transaction files are deleted.
    */
    rc = sqlite3OsDelete(zMaster);
    assert( rc==SQLITE_OK );
    sqliteFree(zMaster);
    zMaster = 0;
    rc = sqlite3OsSyncDirectory(zMainFile);
    if( rc!=SQLITE_OK ){
      /* This is not good. The master journal file has been deleted, but
      ** the directory sync failed. There is no completely safe course of
      ** action from here. The individual journals contain the name of the
      ** master journal file, but there is no way of knowing if that
      ** master journal exists now or if it will exist after the operating
      ** system crash that may follow the fsync() failure.
      **
      ** zMaster has already been freed and zeroed above, so there is
      ** nothing left to release here.
      */
      assert(0);
      return rc;
    }

    /* All files and directories have already been synced, so the following
    ** calls to sqlite3BtreeCommit() are only closing files and deleting
    ** journals. If something goes wrong while this is happening we don't
    ** really care. The integrity of the transaction is already guaranteed,
    ** but some stray 'cold' journals may be lying around. Returning an
    ** error code won't help matters.
    */
    for(i=0; i<db->nDb; i++){
      Btree *pBt = db->aDb[i].pBt;
      if( pBt ){
        sqlite3BtreeCommit(pBt);
      }
    }
  }

  return rc;
}
/* ** This procedure runs in a separate thread, reading messages off of the ** write queue and processing them one by one. ** ** If async.writerHaltNow is true, then this procedure exits ** after processing a single message. ** ** If async.writerHaltWhenIdle is true, then this procedure exits when ** the write queue is empty. ** ** If both of the above variables are false, this procedure runs ** indefinately, waiting for operations to be added to the write queue ** and processing them in the order in which they arrive. ** ** An artifical delay of async.ioDelay milliseconds is inserted before ** each write operation in order to simulate the effect of a slow disk. ** ** Only one instance of this procedure may be running at a time. */ static void *asyncWriterThread(void *NotUsed){ AsyncWrite *p = 0; int rc = SQLITE_OK; int holdingMutex = 0; if( pthread_mutex_trylock(&async.writerMutex) ){ return 0; } while( async.writerHaltNow==0 ){ OsFile *pBase = 0; if( !holdingMutex ){ pthread_mutex_lock(&async.queueMutex); } while( (p = async.pQueueFirst)==0 ){ pthread_cond_broadcast(&async.emptySignal); if( async.writerHaltWhenIdle ){ pthread_mutex_unlock(&async.queueMutex); break; }else{ ASYNC_TRACE(("IDLE\n")); pthread_cond_wait(&async.queueSignal, &async.queueMutex); ASYNC_TRACE(("WAKEUP\n")); } } if( p==0 ) break; holdingMutex = 1; /* Right now this thread is holding the mutex on the write-op queue. ** Variable 'p' points to the first entry in the write-op queue. In ** the general case, we hold on to the mutex for the entire body of ** the loop. ** ** However in the cases enumerated below, we relinquish the mutex, ** perform the IO, and then re-request the mutex before removing 'p' from ** the head of the write-op queue. The idea is to increase concurrency with ** sqlite threads. ** ** * An ASYNC_CLOSE operation. ** * An ASYNC_OPENEXCLUSIVE operation. 
For this one, we relinquish ** the mutex, call the underlying xOpenExclusive() function, then ** re-aquire the mutex before seting the AsyncFile.pBaseRead ** variable. ** * ASYNC_SYNC and ASYNC_WRITE operations, if ** SQLITE_ASYNC_TWO_FILEHANDLES was set at compile time and two ** file-handles are open for the particular file being "synced". */ if( async.ioError!=SQLITE_OK && p->op!=ASYNC_CLOSE ){ p->op = ASYNC_NOOP; } if( p->pFile ){ pBase = p->pFile->pBaseWrite; if( p->op==ASYNC_CLOSE || p->op==ASYNC_OPENEXCLUSIVE || (pBase && (p->op==ASYNC_SYNC || p->op==ASYNC_WRITE) ) ){ pthread_mutex_unlock(&async.queueMutex); holdingMutex = 0; } if( !pBase ){ pBase = p->pFile->pBaseRead; } } switch( p->op ){ case ASYNC_NOOP: break; case ASYNC_WRITE: assert( pBase ); ASYNC_TRACE(("WRITE %s %d bytes at %d\n", p->pFile->zName, p->nByte, p->iOffset)); rc = sqlite3OsSeek(pBase, p->iOffset); if( rc==SQLITE_OK ){ rc = sqlite3OsWrite(pBase, (const void *)(p->zBuf), p->nByte); } break; case ASYNC_SYNC: assert( pBase ); ASYNC_TRACE(("SYNC %s\n", p->pFile->zName)); rc = sqlite3OsSync(pBase, p->nByte); break; case ASYNC_TRUNCATE: assert( pBase ); ASYNC_TRACE(("TRUNCATE %s to %d bytes\n", p->pFile->zName, p->iOffset)); rc = sqlite3OsTruncate(pBase, p->iOffset); break; case ASYNC_CLOSE: ASYNC_TRACE(("CLOSE %s\n", p->pFile->zName)); sqlite3OsClose(&p->pFile->pBaseWrite); sqlite3OsClose(&p->pFile->pBaseRead); sqlite3OsFree(p->pFile); break; case ASYNC_OPENDIRECTORY: assert( pBase ); ASYNC_TRACE(("OPENDIR %s\n", p->zBuf)); sqlite3OsOpenDirectory(pBase, p->zBuf); break; case ASYNC_SETFULLSYNC: assert( pBase ); ASYNC_TRACE(("SETFULLSYNC %s %d\n", p->pFile->zName, p->nByte)); sqlite3OsSetFullSync(pBase, p->nByte); break; case ASYNC_DELETE: ASYNC_TRACE(("DELETE %s\n", p->zBuf)); rc = xOrigDelete(p->zBuf); break; case ASYNC_SYNCDIRECTORY: ASYNC_TRACE(("SYNCDIR %s\n", p->zBuf)); rc = xOrigSyncDirectory(p->zBuf); break; case ASYNC_OPENEXCLUSIVE: { AsyncFile *pFile = p->pFile; int delFlag = 
((p->iOffset)?1:0); OsFile *pBase = 0; ASYNC_TRACE(("OPEN %s delFlag=%d\n", p->zBuf, delFlag)); assert(pFile->pBaseRead==0 && pFile->pBaseWrite==0); rc = xOrigOpenExclusive(p->zBuf, &pBase, delFlag); assert( holdingMutex==0 ); pthread_mutex_lock(&async.queueMutex); holdingMutex = 1; if( rc==SQLITE_OK ){ pFile->pBaseRead = pBase; } break; } default: assert(!"Illegal value for AsyncWrite.op"); } /* If we didn't hang on to the mutex during the IO op, obtain it now ** so that the AsyncWrite structure can be safely removed from the ** global write-op queue. */ if( !holdingMutex ){ pthread_mutex_lock(&async.queueMutex); holdingMutex = 1; } /* ASYNC_TRACE(("UNLINK %p\n", p)); */ if( p==async.pQueueLast ){ async.pQueueLast = 0; } async.pQueueFirst = p->pNext; sqlite3OsFree(p); assert( holdingMutex ); /* An IO error has occured. We cannot report the error back to the ** connection that requested the I/O since the error happened ** asynchronously. The connection has already moved on. There ** really is nobody to report the error to. ** ** The file for which the error occured may have been a database or ** journal file. Regardless, none of the currently queued operations ** associated with the same database should now be performed. Nor should ** any subsequently requested IO on either a database or journal file ** handle for the same database be accepted until the main database ** file handle has been closed and reopened. ** ** Furthermore, no further IO should be queued or performed on any file ** handle associated with a database that may have been part of a ** multi-file transaction that included the database associated with ** the IO error (i.e. a database ATTACHed to the same handle at some ** point in time). */ if( rc!=SQLITE_OK ){ async.ioError = rc; } /* Drop the queue mutex before continuing to the next write operation ** in order to give other threads a chance to work with the write queue. 
*/ if( !async.pQueueFirst || !async.ioError ){ sqlite3ApiExit(0, 0); pthread_mutex_unlock(&async.queueMutex); holdingMutex = 0; if( async.ioDelay>0 ){ sqlite3OsSleep(async.ioDelay); }else{ sched_yield(); } } } pthread_mutex_unlock(&async.writerMutex); return 0; }