/* {{{ rberkeley_db_compact */
/*
 * R wrapper around Berkeley DB's DB->compact().
 *
 * _dbp    : external pointer to an open DB handle (tagged RBerkeley_DB)
 * _txnid  : external pointer to a DB_TXN, or R NULL for no transaction
 * _start  : RAW vector start key, or R NULL to compact from the first page
 * _stop   : RAW vector stop key, or R NULL to compact through the last page
 * _c_data : currently unused (DB_COMPACT statistics not exposed); kept for
 *           interface compatibility
 * _flags  : integer scalar passed through as the compact flags
 *
 * Returns the integer return code of DB->compact().
 */
SEXP rberkeley_db_compact (SEXP _dbp, SEXP _txnid, SEXP _start,
                           SEXP _stop, SEXP _c_data, SEXP _flags)
{
  DB *dbp;
  DB_TXN *txnid;
  DBT start, stop, end;
  DBT *startp = NULL, *stopp = NULL;   /* NULL => whole-database bounds */
  u_int32_t flags;
  int ret;

  /* BUG FIX: the original tested isNull(_txnid) and fetched the external
   * pointer address when the argument WAS R NULL (and left txnid NULL when
   * a real transaction was supplied). The test must be negated. */
  if(!isNull(_txnid)) {
    txnid = R_ExternalPtrAddr(_txnid);
  } else {
    txnid = NULL;
  }
  /* BUG FIX: only pass &start / &stop to DB->compact when the caller
   * supplied keys; previously the uninitialized stack DBTs were passed
   * even when _start/_stop were R NULL. NULL means "from the beginning" /
   * "through the end" per the DB->compact API. */
  if(!isNull(_start)) {
    memset(&start, 0, sizeof(DBT));
    start.data = (unsigned char *)RAW(_start);
    start.size = length(_start);
    startp = &start;
  }
  if(!isNull(_stop)) {
    memset(&stop, 0, sizeof(DBT));
    stop.data = (unsigned char *)RAW(_stop);
    stop.size = length(_stop);
    stopp = &stop;
  }
  flags = (u_int32_t)INTEGER(_flags)[0];

  /* BUG FIX: 'end' is the end_key out-parameter of DB->compact; it must be
   * zeroed before use (the memset had been commented out, so compact read
   * uninitialized DBT flags). */
  memset(&end, 0, sizeof(end));

  dbp = R_ExternalPtrAddr(_dbp);
  if(R_ExternalPtrTag(_dbp) != RBerkeley_DB || dbp == NULL)
    error("invalid 'db' handle");

  ret = dbp->compact(dbp, txnid, startp, stopp, NULL, flags, &end);
  return ScalarInteger(ret);
}
MojErr MojDbBerkeleyEngine::compact() { const char * DatabaseRoot = "/var/db"; // FIXME: Should not be hard-coded, but so is the disk space monitor! struct statvfs statAtBeginning, statAfterCompact, statAtEnd; MojLogTrace(MojDbBerkeleyEngine::s_log); struct timeval totalStartTime = {0,0}, totalStopTime = {0,0}; gettimeofday(&totalStartTime, NULL); memset(&statAtBeginning, '\0', sizeof(statAtBeginning)); ::statvfs(DatabaseRoot, &statAtBeginning); const int blockSize = (int)statAtBeginning.f_bsize; // checkpoint before compact MojErr err = m_env->checkpoint(0); MojErrCheck(err); memset(&statAfterCompact, '\0', sizeof(statAfterCompact)); ::statvfs(DatabaseRoot, &statAfterCompact); int pre_compact_reclaimed_blocks = (int)(statAfterCompact.f_bfree - statAtBeginning.f_bfree); MojLogDebug(s_log, _T("Starting compact: Checkpoint freed %d bytes. Volume %s has %lu bytes free out of %lu bytes (%.1f full)\n"), pre_compact_reclaimed_blocks * blockSize, DatabaseRoot, statAfterCompact.f_bfree * blockSize, statAfterCompact.f_blocks * blockSize, (float)(statAfterCompact.f_blocks - statAfterCompact.f_bfree) * 100.0 / (float)statAfterCompact.f_blocks); // Retrieve setting for record count used to break up compact operations const int stepSize = m_env->compactStepSize(); memset(&statAtBeginning, '\0', sizeof(statAtBeginning)); ::statvfs(DatabaseRoot, &statAtBeginning); int total_pages_examined = 0, total_pages_freed = 0, total_pages_truncated = 0; int max_pages_examined = 0, max_pages_freed = 0, max_pages_truncated = 0; int total_log_generation_blocks = 0, total_reclaimed_blocks = 0; int max_log_generation_blocks = 0, max_reclaimed_blocks = 0; int total_compact_time = 0, total_step_time = 0; int max_compact_time = 0, max_step_time = 0; int total_key_total = 0, total_value_total = 0; int max_key_total = 0, max_value_total = 0; MojThreadGuard guard(m_dbMutex); // call compact on each database for (DatabaseVec::ConstIterator i = m_dbs.begin(); i != m_dbs.end(); ++i) { DB* db = 
(*i)->impl(); DB_COMPACT c_data; MojZero(&c_data, sizeof(c_data)); DBC * dbc = NULL; int dbErr; DBT key1, key2; DBT value; memset(&key1, '\0', sizeof(key1)); memset(&key2, '\0', sizeof(key2)); memset(&value, '\0', sizeof(value)); key1.flags = DB_DBT_REALLOC; key2.flags = DB_DBT_REALLOC; value.flags = DB_DBT_REALLOC; int key1_count = 0, key2_count = 0; dbErr = 0; // Continue compacting the database by chunks until we run into an error. If a stepSize // isn't configured, don't chunk it at all. while ((stepSize >= 1) && (dbErr == 0)) { // Construct key to step forward by a set number of records, to select the compact window. // We close the cursor after we've found the next key, so it won't keep a lock open that // could disrupt the compaction. Without locking, we might miss an insertion or deletion // happening between compactions, but that int key_total = 0, value_total = 0; // Tracked only for debugging purposes. dbErr = db->cursor(db, NULL, &dbc, 0); if (dbErr == 0) { if (key1.data == NULL) { // Move the cursor to the beginning of the database dbErr = dbc->get(dbc, &key1, &value, DB_FIRST); key_total += key1.size; value_total += value.size; // discard key1, we don't want the key for the beginning if (key1.data) free(key1.data); key1.data = NULL; key1.size = 0; } else { // move the cursor to the location of the prior key. // If that exact key is missing, this should choose the // next one. dbErr = dbc->get(dbc, &key1, &value, DB_SET_RANGE); } int elapsedStepTimeMS = 0; if (dbErr == DB_NOTFOUND) { // If we didn't find a first key, the DB is presumably empty, // and we shouldn't search for the end key. dbErr = 0; if (key1.data) free(key1.data); key1.data = NULL; key1.size = 0; if (key2.data) free(key2.data); key2.data = NULL; key2.size = 0; } else if (dbErr == 0) { int count; // Move the cursor forward by the chosen stepSize. // May exit early with error DB_NOTFOUND, indicating end of database. 
struct timeval startTime = {0,0}, stopTime = {0,0}; gettimeofday(&startTime, NULL); for (count = 0; (dbErr == 0) && (count < stepSize); count++) { dbErr = dbc->get(dbc, &key2, &value, DB_NEXT); key_total += key2.size; value_total += value.size; } key2_count = key1_count + count; if (dbErr == DB_NOTFOUND) { dbErr = 0; if (key2.data) free(key2.data); key2.data = NULL; key2.size = 0; } gettimeofday(&stopTime, NULL); elapsedStepTimeMS = (int)(stopTime.tv_sec - startTime.tv_sec) * 1000 + (int)(stopTime.tv_usec - startTime.tv_usec) / 1000; } dbc->close(dbc); if (dbErr != 0) break; // Compact from key1 to key2. (The documentation says it starts at 'the // smallest key greater than or equal to the specified key', and ends at // 'the page with the smallest key greater than the specified key'. I don't // know exactly what that means regarding inclusivity, so this procedure may // not be fully compacting the pages which contain the keys.) MojLogDebug(s_log, _T("Compacting %s (partial from ~record %d to %d). Stepped over %d/%d bytes of keys/values in %dms.\n"), (*i)->m_name.data(), key1_count, key2_count, key_total, value_total, elapsedStepTimeMS); struct statvfs statBeforeCompact, statAfterCompact, statAfterCheckpoint; memset(&statBeforeCompact, '\0', sizeof(statBeforeCompact)); ::statvfs(DatabaseRoot, &statBeforeCompact); struct timeval startTime = {0,0}, stopTime = {0,0}; gettimeofday(&startTime, NULL); MojZero(&c_data, sizeof(c_data)); dbErr = db->compact(db, NULL, key1.data ? &key1 : NULL, key2.data ? 
&key2 : NULL, &c_data, DB_FREE_SPACE, NULL); gettimeofday(&stopTime, NULL); int elapsedCompactTimeMS = (int)(stopTime.tv_sec - startTime.tv_sec) * 1000 + (int)(stopTime.tv_usec - startTime.tv_usec) / 1000; MojLogDebug(s_log, _T("Compact stats of %s (partial from ~record %d to %d): time %dms, compact_deadlock=%d, compact_pages_examine=%d, compact_pages_free=%d, compact_levels=%d, compact_pages_truncated=%d\n"), (*i)->m_name.data(), key1_count, key2_count, elapsedCompactTimeMS, c_data.compact_deadlock, c_data.compact_pages_examine, c_data.compact_pages_free, c_data.compact_levels, c_data.compact_pages_truncated); total_compact_time += elapsedCompactTimeMS; if (elapsedCompactTimeMS > max_compact_time) max_compact_time = elapsedCompactTimeMS; total_step_time += elapsedStepTimeMS; if (elapsedStepTimeMS > max_step_time) max_step_time = elapsedStepTimeMS; total_key_total += key_total; if (key_total > max_key_total) max_key_total = key_total; total_value_total += value_total; if (value_total > max_value_total) max_value_total = value_total; total_pages_examined += c_data.compact_pages_examine; if ((int)c_data.compact_pages_examine > max_pages_examined) max_pages_examined = c_data.compact_pages_examine; total_pages_freed += c_data.compact_pages_free; if ((int)c_data.compact_pages_free > max_pages_freed) max_pages_freed = c_data.compact_pages_free; total_pages_truncated += c_data.compact_pages_truncated; if ((int)c_data.compact_pages_truncated > max_pages_truncated) max_pages_truncated = c_data.compact_pages_truncated; memset(&statAfterCompact, '\0', sizeof(statAfterCompact)); ::statvfs(DatabaseRoot, &statAfterCompact); int log_generation_blocks = (int)(statBeforeCompact.f_bfree - statAfterCompact.f_bfree); total_log_generation_blocks += log_generation_blocks; if (log_generation_blocks > max_log_generation_blocks) max_log_generation_blocks = log_generation_blocks; err = m_env->checkpoint(0); MojErrCheck(err); memset(&statAfterCompact, '\0', sizeof(statAfterCheckpoint)); 
::statvfs(DatabaseRoot, &statAfterCheckpoint); int reclaimed_blocks = (int)(statAfterCheckpoint.f_bfree - statBeforeCompact.f_bfree); total_reclaimed_blocks += reclaimed_blocks; if (reclaimed_blocks > max_reclaimed_blocks) max_reclaimed_blocks = reclaimed_blocks; MojLogDebug(s_log, _T("Compact of %s (partial from ~record %d to %d) generated %d bytes of log data, ultimately reclaiming %d bytes after checkpoint.\n"), (*i)->m_name.data(), key1_count, key2_count, log_generation_blocks * blockSize, reclaimed_blocks * blockSize); // copy key2 over key1 if (key1.data) free(key1.data); key1.data = key2.data; key1.size = key2.size; key2.data = NULL; key2.size = 0; key1_count = key2_count; // if key2 was empty, then we are done. if (key1.data == NULL) break; } } if (key1.data) free(key1.data); if (key2.data) free(key2.data); if (value.data) free(value.data); // If no step size was configured, fall back and do a complete compact. Do the same // if there was an error performing the chunked compaction. The complete compact risks // running out of disk space, but that's preferable to not compacting at all, which will // also likely eventually lead to running out of space. if (dbErr == DB_LOCK_DEADLOCK) { // But for deadlock, we should just give up, as this might // happen in normal use. 
MojBdbErrCheck(dbErr, _T("cursor and compact deadlocked")); } if ((stepSize <= 1) || (dbErr != 0)) { MojLogDebug(s_log, "Compacting %s\n", (*i)->m_name.data()); struct statvfs statBeforeCompact, statAfterCompact, statAfterCheckpoint; memset(&statBeforeCompact, '\0', sizeof(statBeforeCompact)); ::statvfs(DatabaseRoot, &statBeforeCompact); struct timeval startTime = {0,0}, stopTime = {0,0}; gettimeofday(&startTime, NULL); MojZero(&c_data, sizeof(c_data)); dbErr = db->compact(db, NULL, NULL, NULL, &c_data, DB_FREE_SPACE, NULL); gettimeofday(&stopTime, NULL); int elapsedCompactTimeMS = (int)(stopTime.tv_sec - startTime.tv_sec) * 1000 + (int)(stopTime.tv_usec - startTime.tv_usec) / 1000; total_compact_time += elapsedCompactTimeMS; if (elapsedCompactTimeMS > max_compact_time) max_compact_time = elapsedCompactTimeMS; MojLogDebug(s_log, "Compact stats of %s: time %dms, compact_deadlock=%d, compact_pages_examine=%d, compact_pages_free=%d, compact_levels=%d, compact_pages_truncated=%d\n", (*i)->m_name.data(), elapsedCompactTimeMS, c_data.compact_deadlock, c_data.compact_pages_examine, c_data.compact_pages_free, c_data.compact_levels, c_data.compact_pages_truncated); total_pages_examined += c_data.compact_pages_examine; if ((int)c_data.compact_pages_examine > max_pages_examined) max_pages_examined = c_data.compact_pages_examine; total_pages_freed += c_data.compact_pages_free; if ((int)c_data.compact_pages_free > max_pages_freed) max_pages_freed = c_data.compact_pages_free; total_pages_truncated += c_data.compact_pages_truncated; if ((int)c_data.compact_pages_truncated > max_pages_truncated) max_pages_truncated = c_data.compact_pages_truncated; memset(&statAfterCompact, '\0', sizeof(statAfterCompact)); ::statvfs(DatabaseRoot, &statAfterCompact); int log_generation_blocks = (int)(statBeforeCompact.f_bfree - statAfterCompact.f_bfree); total_log_generation_blocks += log_generation_blocks; if (log_generation_blocks > max_log_generation_blocks) max_log_generation_blocks = 
log_generation_blocks; err = m_env->checkpoint(0); MojErrCheck(err); memset(&statAfterCompact, '\0', sizeof(statAfterCheckpoint)); ::statvfs(DatabaseRoot, &statAfterCheckpoint); int reclaimed_blocks = (int)(statAfterCheckpoint.f_bfree - statBeforeCompact.f_bfree); total_reclaimed_blocks += reclaimed_blocks; if (reclaimed_blocks > max_reclaimed_blocks) max_reclaimed_blocks = reclaimed_blocks; MojLogDebug(s_log, "Compact of %s generated %d bytes of log data, ultimately reclaiming %d bytes after checkpoint.\n", (*i)->m_name.data(), log_generation_blocks * blockSize, reclaimed_blocks * blockSize); } MojBdbErrCheck(dbErr, _T("db->compact")); } guard.unlock(); gettimeofday(&totalStopTime, NULL); int elapsedTotalMS = (int)(totalStopTime.tv_sec - totalStartTime.tv_sec) * 1000 + (int)(totalStopTime.tv_usec - totalStartTime.tv_usec) / 1000; memset(&statAtEnd, '\0', sizeof(statAtEnd)); ::statvfs(DatabaseRoot, &statAtEnd); int compact_freed_blocks = (int)(statAtEnd.f_bfree - statAtBeginning.f_bfree); MojLogDebug(s_log, _T("During compact: %d db pages examined (max burst %d), %d db pages freed (max burst %d), " "%d db pages truncated (max burst %d), " "%d log bytes created by compacts (max burst %d), " "%d bytes reclaimed by checkpoints (max burst %d), " "%d bytes of keys stepped over (max burst %d), " "%d bytes of values stepped over (max burst %d), " "%dms spent in stepping (max burst %dms), " "%dms spent in compact (max burst %dms)\n"), total_pages_examined, max_pages_examined, total_pages_freed, max_pages_freed, total_pages_truncated, max_pages_truncated, total_log_generation_blocks * blockSize, max_log_generation_blocks * blockSize, total_reclaimed_blocks * blockSize, max_reclaimed_blocks * blockSize, total_key_total, max_key_total, total_value_total, max_value_total, total_step_time, max_step_time, total_compact_time, max_step_time ); MojLogDebug(s_log, _T("Compact complete: took %dms, freed %d bytes (including pre-checkpoint of %d bytes). 
Volume %s has %lu bytes free out of %lu bytes (%.1f full)\n"), elapsedTotalMS, compact_freed_blocks * blockSize, pre_compact_reclaimed_blocks * blockSize, DatabaseRoot, statAfterCompact.f_bfree * blockSize, statAfterCompact.f_blocks * blockSize, (float)(statAfterCompact.f_blocks - statAfterCompact.f_bfree) * 100.0 / (float)statAfterCompact.f_blocks); return MojErrNone; }
/*
** A write transaction must be opened before calling this function.
** It performs a single unit of work towards an incremental vacuum.
** Specifically, in the Berkeley DB storage manager, it attempts to compact
** one table.
**
** If the incremental vacuum is finished after this function has run,
** SQLITE_DONE is returned. If it is not finished, but no error occurred,
** SQLITE_OK is returned. Otherwise an SQLite error code.
**
** The caller can get and accumulate the number of truncated pages with the
** in/out parameter truncatedPages. btreeIncrVacuum skips the vacuum of a
** table when enough pages have already been truncated, as an optimization.
*/
int btreeIncrVacuum(Btree *p, u_int32_t *truncatedPages)
{
	BtShared *pBt;
	CACHED_DB *cached_db;
	DB *dbp;
	DBT key, data;
	char *fileName, *tableName, tableNameBuf[DBNAME_SIZE];
	void *app;
	int iTable, rc, ret, t_ret;
	u_int32_t was_create;
	DB_COMPACT compact_data;
	DBT *pStart, end;	/* start/end of db_compact() */
	struct VacuumInfo *pInfo;
	int vacuumMode;

	assert(p->pBt->dbStorage == DB_STORE_NAMED);

	if (!p->connected && (rc = btreeOpenEnvironment(p, 1)) != SQLITE_OK)
		return rc;

	pBt = p->pBt;
	rc = SQLITE_OK;
	cached_db = NULL;
	dbp = NULL;
	memset(&end, 0, sizeof(end));

#ifndef BDBSQL_OMIT_LEAKCHECK
	/* Let BDB use the user-specified malloc function (btreeMalloc) */
	end.flags |= DB_DBT_MALLOC;
#endif

	/*
	 * Turn off DB_CREATE: we don't want to create any tables that don't
	 * already exist.
	 */
	was_create = (pBt->db_oflags & DB_CREATE);
	pBt->db_oflags &= ~DB_CREATE;

	memset(&key, 0, sizeof(key));
	key.data = tableNameBuf;
	/*
	 * BUG FIX: reserve one byte for the NUL terminator appended below.
	 * With ulen == sizeof(tableNameBuf) a maximum-length name would make
	 * tableNameBuf[key.size] write one byte past the buffer; now such a
	 * name yields DB_BUFFER_SMALL from the cursor get instead.
	 */
	key.ulen = sizeof(tableNameBuf) - 1;
	key.flags = DB_DBT_USERMEM;
	memset(&data, 0, sizeof(data));
	/* Partial get with zero length: we only need the key (table name). */
	data.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;

	UPDATE_DURING_BACKUP(p);

	/* Lazily open a cursor over the tables DB; it persists across calls so
	 * each call advances to the next table. */
	if (p->compact_cursor == NULL) {
		if ((ret = pTablesDb->cursor(pTablesDb, pReadTxn,
		    &p->compact_cursor, 0)) != 0)
			goto err;
	}
	if ((ret = p->compact_cursor->get(p->compact_cursor,
	    &key, &data, DB_NEXT)) == DB_NOTFOUND) {
		/* All tables processed: the incremental vacuum is complete. */
		(void)p->compact_cursor->close(p->compact_cursor);
		p->compact_cursor = NULL;
		pBt->db_oflags |= was_create;
		return SQLITE_DONE;
	} else if (ret != 0)
		goto err;

	tableNameBuf[key.size] = '\0';
	if (strncmp(tableNameBuf, "table", 5) != 0) {
		iTable = 0;
#ifdef BDBSQL_FILE_PER_TABLE
		/* Cannot compact the metadata file */
		goto err;
#endif
		/* Open a DB handle on that table. */
		if ((ret = db_create(&dbp, pDbEnv, 0)) != 0)
			goto err;
		if (pBt->encrypted &&
		    (ret = dbp->set_flags(dbp, DB_ENCRYPT)) != 0)
			goto err;

		tableName = tableNameBuf;
		FIX_TABLENAME(pBt, fileName, tableName);

		/*
		 * We know we're not creating this table, open it using the
		 * family transaction because that keeps the dbreg records out
		 * of the vacuum transaction, reducing pressure on the log
		 * region (since we copy the filename of every open DB handle
		 * into the log region).
		 */
		if ((ret = dbp->open(dbp, pFamilyTxn, fileName, tableName,
		    DB_BTREE, GET_AUTO_COMMIT(pBt, pFamilyTxn), 0)) != 0)
			goto err;
	} else {
		if ((ret = btreeTableNameToId(tableNameBuf,
		    key.size, &iTable)) != 0)
			goto err;

		/* Try to retrieve the matching handle from the cache. */
		rc = btreeFindOrCreateDataTable(p, &iTable, &cached_db, 0);
		if (rc != SQLITE_OK)
			goto err;
		assert(cached_db != NULL && cached_db->dbp != NULL);

		dbp = cached_db->dbp;
		if ((iTable & 1) == 0) {
			/*
			 * Attach the DB handle to a SQLite index, required for
			 * the key comparator to work correctly. If we can't
			 * find an Index struct, just skip this database. It
			 * may not be open yet (c.f. whereA-1.7).
			 */
#ifdef BDBSQL_SINGLE_THREAD
			rc = btreeGetKeyInfo(p, iTable,
			    (KeyInfo **)&(dbp->app_private));
#else
			rc = btreeGetKeyInfo(p, iTable,
			    &((TableInfo *)dbp->app_private)->pKeyInfo);
#endif
			if (rc != SQLITE_OK)
				goto err;
		}
	}

	/*
	 * In following db_compact, we use the family transaction because
	 * DB->compact will then auto-commit, and it has built-in smarts
	 * about retrying on deadlock.
	 */
	/* Setup compact_data as configured */
	memset(&compact_data, 0, sizeof(compact_data));
	compact_data.compact_fillpercent = p->fillPercent;

	vacuumMode = sqlite3BtreeGetAutoVacuum(p);
	if (vacuumMode == BTREE_AUTOVACUUM_NONE) {
		ret = dbp->compact(dbp, pFamilyTxn,
		    NULL, NULL, &compact_data, DB_FREE_SPACE, NULL);
	/* Skip current table if we have truncated enough pages */
	} else if (truncatedPages == NULL ||
	    *truncatedPages < p->vacuumPages) {
		/* Find DBT for db_compact start */
		for (pInfo = p->vacuumInfo, pStart = NULL;
		    pInfo != NULL; pInfo = pInfo->next) {
			if (pInfo->iTable == iTable)
				break;
		}

		/* Create new VacuumInfo for current iTable as needed */
		if (pInfo == NULL) {
			/* Create info for current iTable */
			if ((pInfo = (struct VacuumInfo *)sqlite3_malloc(
			    sizeof(struct VacuumInfo))) == NULL) {
				rc = SQLITE_NOMEM;
				goto err;
			}
			memset(pInfo, 0, sizeof(struct VacuumInfo));
			pInfo->iTable = iTable;
			pInfo->next = p->vacuumInfo;
			p->vacuumInfo = pInfo;
		}
		pStart = &(pInfo->start);

		/* Do page compact for IncrVacuum */
		if (vacuumMode == BTREE_AUTOVACUUM_INCR) {
			/* Do compact with given arguments */
			compact_data.compact_pages = p->vacuumPages;
			if ((ret = dbp->compact(dbp, pFamilyTxn,
			    (pStart->data == NULL) ? NULL : pStart,
			    NULL, &compact_data, 0, &end)) != 0)
				goto err;

			/* Save current vacuum position */
			if (pStart->data != NULL)
				sqlite3_free(pStart->data);
			memcpy(pStart, &end, sizeof(DBT));
			memset(&end, 0, sizeof(end));

			/* Rewind to start if we reach the end of subdb */
			if (compact_data.compact_pages_free < p->vacuumPages ||
			    p->vacuumPages == 0) {
				if (pStart->data != NULL)
					sqlite3_free(pStart->data);
				memset(pStart, 0, sizeof(DBT));
			}
		}

		/* Because of the one-pass nature of the compaction algorithm,
		 * any unemptied page near the end of the file inhibits
		 * returning pages to the file system.
		 * A repeated call to the DB->compact() method with a low
		 * compact_fillpercent may be used to return pages in this case.
		 */
		memset(&compact_data, 0, sizeof(compact_data));
		compact_data.compact_fillpercent = 1;
		if ((ret = dbp->compact(dbp, pFamilyTxn, NULL, NULL,
		    &compact_data, DB_FREE_SPACE, NULL)) != 0)
			goto err;

		/*
		 * BUG FIX: the original also required *truncatedPages > 0,
		 * so a counter the caller initialized to zero could never
		 * start accumulating and the "skip when enough pages
		 * truncated" optimization above was dead code. Accumulate
		 * whenever the caller supplied a counter.
		 */
		if (truncatedPages != NULL)
			*truncatedPages += compact_data.compact_pages_truncated;
	}

err:	/* Free cursor and DBT if run into error */
	if (ret != 0) {
		if (p->compact_cursor != NULL) {
			(void)p->compact_cursor->close(p->compact_cursor);
			p->compact_cursor = NULL;
		}
		if (end.data != NULL)
			sqlite3_free(end.data);
		btreeFreeVacuumInfo(p);
	}

	if (cached_db != NULL) {
#ifdef BDBSQL_SINGLE_THREAD
		if ((app = dbp->app_private) != NULL)
			sqlite3DbFree(p->db, app);
#else
		if (dbp->app_private != NULL &&
		    (app = ((TableInfo *)dbp->app_private)->pKeyInfo) != NULL) {
			sqlite3DbFree(p->db, app);
			((TableInfo *)dbp->app_private)->pKeyInfo = NULL;
		}
#endif
	} else if (dbp != NULL) {
		/* We opened this handle ourselves: close it (keeping its
		 * app_private to free afterwards). */
		app = dbp->app_private;
		if ((t_ret = dbp->close(dbp, DB_NOSYNC)) != 0 && ret == 0)
			ret = t_ret;
		if (app != NULL)
			sqlite3DbFree(p->db, app);
	}

	pBt->db_oflags |= was_create;

	return MAP_ERR(rc, ret, p);
}