bool CopyOnWriteContext::notifyTupleDelete(TableTuple &tuple) {
    assert(m_iterator != NULL);

    if (tuple.isDirty() || m_finishedTableScan) {
        return true;
    }

    /**
     * Find out which block the address is contained in. Lower bound returns the first
     * entry in the index >= the address. Unless the address happens to be equal, the
     * block we are looking for is probably the previous entry. Then check if the
     * address fits in the previous entry. If it doesn't, then the block is something new.
     */
    TBPtr block = PersistentTable::findBlock(tuple.address(), m_blocks, getTable().getTableAllocationSize());
    if (block.get() == NULL) {
        // Tuple is not in the snapshot region; don't care about this tuple.
        return true;
    }

    /**
     * Now check where this is relative to the COWIterator.
     */
    CopyOnWriteIterator *iter = reinterpret_cast<CopyOnWriteIterator*>(m_iterator.get());
    return !iter->needToDirtyTuple(block->address(), tuple.address());
}
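// The comment above describes a lower-bound lookup over a block index keyed by block
// base address. Below is a minimal, self-contained sketch of that lookup, using
// std::map as a stand-in for the real block map; findBlockSketch and its parameters
// are illustrative assumptions, not PersistentTable::findBlock's actual internals.
#include <cstddef>
#include <map>

template <typename BlockPtr>
BlockPtr findBlockSketch(char *address, std::map<char*, BlockPtr> &blocks, size_t blockSize) {
    typename std::map<char*, BlockPtr>::iterator i = blocks.lower_bound(address);
    // Exact hit: the address is the base address of a block.
    if (i != blocks.end() && i->first == address) {
        return i->second;
    }
    // Otherwise the candidate is the previous entry, if there is one.
    if (i == blocks.begin()) {
        return BlockPtr(); // the address precedes every known block
    }
    --i;
    // The address belongs to the previous block only if it falls within its extent.
    if (address < i->first + blockSize) {
        return i->second;
    }
    return BlockPtr(); // the address sits in a gap between blocks
}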
Table* TableFactory::getPersistentTable(
            voltdb::CatalogId databaseId,
            const std::string &name,
            TupleSchema* schema,
            const std::vector<std::string> &columnNames,
            char *signature,
            bool tableIsMaterialized,
            int partitionColumn,
            bool exportEnabled,
            bool exportOnly,
            int tableAllocationTargetSize,
            int tupleLimit,
            int32_t compactionThreshold,
            bool drEnabled)
{
    Table *table = NULL;
    StreamedTable *streamedTable = NULL;
    PersistentTable *persistentTable = NULL;

    if (exportOnly) {
        table = streamedTable = new StreamedTable(partitionColumn);
    }
    else {
        table = persistentTable = new PersistentTable(partitionColumn,
                                                      signature,
                                                      tableIsMaterialized,
                                                      tableAllocationTargetSize,
                                                      tupleLimit,
                                                      drEnabled);
    }

    initCommon(databaseId,
               table,
               name,
               schema,
               columnNames,
               true,  // table will take ownership of TupleSchema object
               compactionThreshold);

    TableStats *stats;
    if (exportOnly) {
        stats = streamedTable->getTableStats();
    }
    else {
        stats = persistentTable->getTableStats();

        // Allocate and assign the tuple storage block to the persistent table ahead of
        // time instead of doing so at time of first tuple insertion. The intent of
        // block allocation ahead of time is to avoid allocation cost at time of tuple
        // insertion.
        TBPtr block = persistentTable->allocateNextBlock();
        assert(block->hasFreeTuples());
        persistentTable->m_blocksWithSpace.insert(block);
    }

    // initialize stats for the table
    configureStats(name, stats);

    return table;
}
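// A minimal, self-contained sketch (hypothetical types, not VoltDB's API) of the
// "preallocate the first block at construction" idea used in the factory above:
// paying the block allocation cost up front means the first insert is as cheap as
// every later one.
#include <cstddef>
#include <vector>

class PreallocatingStore {
public:
    explicit PreallocatingStore(size_t blockBytes) : m_blockBytes(blockBytes) {
        // Eager allocation, mirroring allocateNextBlock() in the factory above.
        m_blocks.push_back(new char[m_blockBytes]);
    }
    ~PreallocatingStore() {
        for (size_t i = 0; i < m_blocks.size(); ++i) {
            delete[] m_blocks[i];
        }
    }
private:
    size_t m_blockBytes;
    std::vector<char*> m_blocks;
};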
void CopyOnWriteContext::markTupleDirty(TableTuple tuple, bool newTuple) {
    assert(m_iterator != NULL);
    if (newTuple) {
        m_inserts++;
    }
    else {
        m_updates++;
    }

    /**
     * If this is an update or a delete of a tuple that is already dirty, then no
     * further action is required.
     */
    if (!newTuple && tuple.isDirty()) {
        return;
    }

    /**
     * If the table has been scanned already, there is no need to continue marking
     * tuples dirty. If the tuple is dirty, then it has already been backed up.
     */
    if (m_finishedTableScan) {
        tuple.setDirtyFalse();
        return;
    }

    /**
     * Find out which block the address is contained in.
     */
    TBPtr block = PersistentTable::findBlock(tuple.address(), m_blocks, getTable().getTableAllocationSize());
    if (block.get() == NULL) {
        // Tuple is not in the snapshot region; don't care about this tuple,
        // so there is no need to dirty it.
        tuple.setDirtyFalse();
        return;
    }

    /**
     * Now check where this is relative to the COWIterator.
     */
    CopyOnWriteIterator *iter = reinterpret_cast<CopyOnWriteIterator*>(m_iterator.get());
    if (iter->needToDirtyTuple(block->address(), tuple.address())) {
        tuple.setDirtyTrue();
        /**
         * Don't back up a newly introduced tuple, just mark it as dirty.
         */
        if (!newTuple) {
            m_backedUpTuples->insertTupleNonVirtualWithDeepCopy(tuple, &m_pool);
        }
    }
    else {
        tuple.setDirtyFalse();
        return;
    }
}
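// A condensed, self-contained restatement of the decision ladder in markTupleDirty,
// with booleans standing in for the real lookups (findBlock, needToDirtyTuple). The
// enum and function names are hypothetical, for illustration only.
enum DirtyAction { DA_NONE, DA_CLEAR, DA_MARK, DA_MARK_AND_BACKUP };

DirtyAction decideDirtyAction(bool newTuple, bool alreadyDirty, bool scanFinished,
                              bool inSnapshotRegion, bool aheadOfScanCursor) {
    if (!newTuple && alreadyDirty) return DA_NONE;   // update/delete of a dirty tuple: already backed up
    if (scanFinished)       return DA_CLEAR;         // the snapshot scan no longer cares
    if (!inSnapshotRegion)  return DA_CLEAR;         // outside the snapshotted blocks
    if (!aheadOfScanCursor) return DA_CLEAR;         // the scan has already passed it
    return newTuple ? DA_MARK : DA_MARK_AND_BACKUP;  // preserve the pre-image for updates/deletes
}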
Table* TableFactory::getPersistentTable(
            voltdb::CatalogId databaseId,
            const std::string &name,
            TupleSchema* schema,
            const std::vector<std::string> &columnNames,
            char *signature,
            bool tableIsMaterialized,
            int partitionColumn,
            bool exportEnabled,
            bool exportOnly,
            int tableAllocationTargetSize,
            int tupleLimit,
            int32_t compactionThreshold,
            bool drEnabled)
{
    Table *table = NULL;

    if (exportOnly) {
        table = new StreamedTable(exportEnabled);
    }
    else {
        table = new PersistentTable(partitionColumn,
                                    signature,
                                    tableIsMaterialized,
                                    tableAllocationTargetSize,
                                    tupleLimit,
                                    drEnabled);
    }

    initCommon(databaseId,
               table,
               name,
               schema,
               columnNames,
               true,  // table will take ownership of TupleSchema object
               compactionThreshold);

    // initialize stats for the table
    configureStats(databaseId, name, table);

    if (!exportOnly) {
        // Allocate a tuple storage block for the persistent table ahead of time
        // instead of waiting until the first tuple insertion. The intent of
        // allocating the block ahead of time is to improve performance of the
        // first tuple insertion.
        PersistentTable *persistentTable = static_cast<PersistentTable*>(table);
        TBPtr block = persistentTable->allocateNextBlock();
        assert(block->hasFreeTuples());
        persistentTable->m_blocksWithSpace.insert(block);
    }
    return table;
}
void CopyOnWriteContext::notifyBlockWasCompactedAway(TBPtr block) {
    assert(!m_finishedTableScan);
    m_blocksCompacted++;
    CopyOnWriteIterator *iter = static_cast<CopyOnWriteIterator*>(m_iterator.get());
    if (iter->m_blockIterator != m_blocks.end()) {
        TBPtr nextBlock = iter->m_blockIterator.data();
        // The next block is the one that was compacted away.
        // Need to move the iterator forward to skip it.
        if (nextBlock == block) {
            iter->m_blockIterator++;
            // There is another block after the one that was compacted away.
            if (iter->m_blockIterator != m_blocks.end()) {
                TBPtr newNextBlock = iter->m_blockIterator.data();
                m_blocks.erase(block->address());
                iter->m_blockIterator = m_blocks.find(newNextBlock->address());
                iter->m_end = m_blocks.end();
                assert(iter->m_blockIterator != m_blocks.end());
            } else {
                // No block after the one compacted away;
                // set everything to the end.
                m_blocks.erase(block->address());
                iter->m_blockIterator = m_blocks.end();
                iter->m_end = m_blocks.end();
            }
        } else {
            // Some random block was compacted away.
            // Remove it and regenerate the iterator.
            m_blocks.erase(block->address());
            iter->m_blockIterator = m_blocks.find(nextBlock->address());
            iter->m_end = m_blocks.end();
            assert(iter->m_blockIterator != m_blocks.end());
        }
    }
}
/**
 * Block compaction hook.
 */
void ElasticScanner::notifyBlockWasCompactedAway(TBPtr block) {
    if (!m_scanComplete && m_blockIterator != m_blockEnd) {
        TBPtr nextBlock = m_blockIterator.data();
        if (nextBlock == block) {
            // The next block was compacted away.
            m_blockIterator++;
            if (m_blockIterator != m_blockEnd) {
                // There is a block to skip to.
                TBPtr newNextBlock = m_blockIterator.data();
                m_blockMap.erase(block->address());
                m_blockIterator = m_blockMap.find(newNextBlock->address());
                m_blockEnd = m_blockMap.end();
                assert(m_blockIterator != m_blockMap.end());
            }
            else {
                // There isn't a block to skip to, so we're done.
                m_blockMap.erase(block->address());
                m_blockIterator = m_blockMap.end();
                m_blockEnd = m_blockMap.end();
            }
        }
        else {
            // Some random block was compacted away.
            // Remove it and regenerate the iterator.
            m_blockMap.erase(block->address());
            m_blockIterator = m_blockMap.find(nextBlock->address());
            m_blockEnd = m_blockMap.end();
            assert(m_blockIterator != m_blockMap.end());
        }
    }
}
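// Both compaction hooks above share one pattern: never trust a block-map iterator
// across an erase; advance past the erased block if needed, then re-find the resume
// position by key. A self-contained sketch of that pattern follows, with std::map
// standing in for the custom block map (whose erase may invalidate iterators).
#include <cassert>
#include <map>

void eraseAndReposition(std::map<const char*, int> &blocks,
                        std::map<const char*, int>::iterator &cursor,
                        const char *compactedKey) {
    // If the cursor sits on the block being removed, step past it first.
    if (cursor != blocks.end() && cursor->first == compactedKey) {
        ++cursor;
    }
    // Remember where to resume by key, since the erase may invalidate iterators.
    const char *resumeKey = (cursor != blocks.end()) ? cursor->first : NULL;
    blocks.erase(compactedKey);
    // Regenerate the cursor from the key rather than reusing the old iterator.
    cursor = (resumeKey != NULL) ? blocks.find(resumeKey) : blocks.end();
    assert(resumeKey == NULL || cursor != blocks.end());
}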
/**
 * When a tuple is "dirty" it is still active, but it will never be a "found" tuple
 * since it is skipped. The tuple may be dirty because it was deleted (this is why it
 * is always skipped). In that case the CopyOnWriteContext calls this to ensure that
 * the iteration finds the correct number of tuples in the used portion of the table
 * blocks and doesn't overrun into the uninitialized block memory because it skipped
 * a dirty tuple and didn't end up with the right found-tuple count upon reaching the end.
 */
bool CopyOnWriteIterator::needToDirtyTuple(char *tupleAddress) {
    if (m_tableEmpty) {
        // The snapshot was activated when the table was empty.
        // The tuple is not in the snapshot region; don't care about this tuple.
        assert(m_currentBlock == NULL);
        return false;
    }

    /**
     * Find out which block the address is contained in. Lower bound returns the first
     * entry in the index >= the address. Unless the address happens to be equal, the
     * block we are looking for is probably the previous entry. Then check if the
     * address fits in the previous entry. If it doesn't, then the block is something new.
     */
    TBPtr block = PersistentTable::findBlock(tupleAddress, m_blocks, m_table->getTableAllocationSize());
    if (block.get() == NULL) {
        // Tuple is not in the snapshot region; don't care about this tuple.
        return false;
    }

    assert(m_currentBlock != NULL);

    /**
     * Now check where this is relative to the COWIterator.
     */
    const char *blockAddress = block->address();
    if (blockAddress > m_currentBlock->address()) {
        return true;
    }

    assert(blockAddress == m_currentBlock->address());
    return tupleAddress >= m_location;
}
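// The position test at the heart of needToDirtyTuple reduces to: a tuple needs the
// dirty treatment only if the scan has not consumed it yet, i.e. it lives in a block
// after the current one, or at/after the cursor within the current block. A
// self-contained sketch with raw addresses standing in for the iterator state:
#include <cassert>

bool scanHasNotReached(const char *tupleBlock, const char *tupleAddress,
                       const char *currentBlock, const char *cursor) {
    if (tupleBlock > currentBlock) {
        return true;                    // a block the iterator has not entered yet
    }
    assert(tupleBlock == currentBlock); // earlier blocks were fully scanned already
    return tupleAddress >= cursor;      // not yet passed within the current block
}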
// ------------------------------------------------------------------
// OPERATIONS
// ------------------------------------------------------------------
void PersistentTable::nextFreeTuple(TableTuple *tuple) {
    // First check whether we have any blocks with space in our list.
    // In the memcheck build, the heap is used instead of a free list to help Valgrind.
    if (!m_blocksWithSpace.empty()) {
        VOLT_TRACE("GRABBED FREE TUPLE!\n");
        stx::btree_set<TBPtr >::iterator begin = m_blocksWithSpace.begin();
        TBPtr block = (*begin);
        std::pair<char*, int> retval = block->nextFreeTuple();

        /**
         * Check to see if the block needs to move to a new bucket.
         */
        if (retval.second != -1) {
            // Check if the block is currently pending snapshot.
            if (m_blocksNotPendingSnapshot.find(block) != m_blocksNotPendingSnapshot.end()) {
                block->swapToBucket(m_blocksNotPendingSnapshotLoad[retval.second]);
            // Check if the block goes into the pending-snapshot set of buckets.
            } else if (m_blocksPendingSnapshot.find(block) != m_blocksPendingSnapshot.end()) {
                block->swapToBucket(m_blocksPendingSnapshotLoad[retval.second]);
            } else {
                // In this case the block is actively being snapshotted and isn't
                // eligible for merge operations at all. Do nothing; once the block
                // is finished by the iterator, the iterator will return it.
            }
        }

        tuple->move(retval.first);
        if (!block->hasFreeTuples()) {
            m_blocksWithSpace.erase(block);
        }
        assert (m_columnCount == tuple->sizeInValues());
        return;
    }

    // If there are no free tuples, we need to grab another chunk of memory.
    // Allocate a new set of tuples.
    TBPtr block = allocateNextBlock();

    // get a free tuple
    assert (m_columnCount == tuple->sizeInValues());
    std::pair<char*, int> retval = block->nextFreeTuple();

    /**
     * Check to see if the block needs to move to a new bucket.
     */
    if (retval.second != -1) {
        // Check if the block goes into the pending-snapshot set of buckets.
        if (m_blocksPendingSnapshot.find(block) != m_blocksPendingSnapshot.end()) {
            //std::cout << "Swapping block to nonsnapshot bucket " << static_cast<void*>(block.get()) << " to bucket " << retval.second << std::endl;
            block->swapToBucket(m_blocksPendingSnapshotLoad[retval.second]);
        // Now check if it goes in with the others.
        } else if (m_blocksNotPendingSnapshot.find(block) != m_blocksNotPendingSnapshot.end()) {
            //std::cout << "Swapping block to snapshot bucket " << static_cast<void*>(block.get()) << " to bucket " << retval.second << std::endl;
            block->swapToBucket(m_blocksNotPendingSnapshotLoad[retval.second]);
        } else {
            // In this case the block is actively being snapshotted and isn't
            // eligible for merge operations at all. Do nothing; once the block
            // is finished by the iterator, the iterator will return it.
        }
    }

    tuple->move(retval.first);
    //cout << "table::nextFreeTuple(" << reinterpret_cast<const void *>(this) << ") m_usedTuples == " << m_usedTuples << endl;
    if (block->hasFreeTuples()) {
        m_blocksWithSpace.insert(block);
    }
}
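// A compact, self-contained model (hypothetical types, not VoltDB's) of the free-slot
// flow above: take a slot from the first block that has space, allocate a fresh block
// when none do, and drop a block from the "has space" set once it fills up. The
// bucket maintenance for compaction is omitted for brevity.
#include <cassert>
#include <set>

struct Block {
    int used;
    int capacity;
    explicit Block(int cap) : used(0), capacity(cap) {}
    bool hasFreeTuples() const { return used < capacity; }
    int nextFreeTuple() { assert(hasFreeTuples()); return used++; } // returns a slot index
};

int nextFreeSlot(std::set<Block*> &blocksWithSpace, Block* (*allocateNextBlock)()) {
    if (blocksWithSpace.empty()) {
        // No free slots anywhere: grab another chunk of memory.
        blocksWithSpace.insert(allocateNextBlock());
    }
    Block *block = *blocksWithSpace.begin();
    int slot = block->nextFreeTuple();
    if (!block->hasFreeTuples()) {
        blocksWithSpace.erase(block); // full blocks leave the candidate set
    }
    return slot;
}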
std::pair<int, int> TupleBlock::merge(Table *table, TBPtr source) {
    assert(source != this);
    /*
    std::cout << "Attempting to merge " << static_cast<void*>(this) << "(" << m_activeTuples <<
        ") with " << static_cast<void*>(source.get()) << "(" << source->m_activeTuples << ")";
    std::cout << " source last compaction offset is " << source->lastCompactionOffset() <<
        " and active tuple count is " << source->m_activeTuples << std::endl;
    */
    uint32_t m_nextTupleInSourceOffset = source->lastCompactionOffset();
    int sourceTuplesPendingDeleteOnUndoRelease = 0;
    while (hasFreeTuples() && !source->isEmpty()) {
        TableTuple sourceTuple(table->schema());
        TableTuple destinationTuple(table->schema());

        bool foundSourceTuple = false;
        // Iterate further into the block looking for active tuples.
        // Stop when running into the unused-tuple boundary.
        while (m_nextTupleInSourceOffset < source->unusedTupleBoundry()) {
            sourceTuple.move(&source->address()[m_tupleLength * m_nextTupleInSourceOffset]);
            m_nextTupleInSourceOffset++;
            if (sourceTuple.isActive()) {
                foundSourceTuple = true;
                break;
            }
        }

        if (!foundSourceTuple) {
            // The block isn't empty, but there are no more active tuples.
            // Some of the tuples that make it register as not empty must have been
            // pending delete, and those aren't mergeable.
            assert(sourceTuplesPendingDeleteOnUndoRelease);
            break;
        }

        // Can't move a tuple with a pending undo action; it would invalidate the pointer.
        // Keep a count so that the block can be notified of the number of tuples pending
        // delete on undo release when calculating the correct bucket index. If all the
        // active tuples are pending delete on undo release, the block is effectively
        // empty and shouldn't be considered for merge ops. It will be completely
        // discarded once the undo log releases the block.
        if (sourceTuple.isPendingDeleteOnUndoRelease()) {
            sourceTuplesPendingDeleteOnUndoRelease++;
            continue;
        }

        destinationTuple.move(nextFreeTuple().first);
        table->swapTuples(sourceTuple, destinationTuple);
        source->freeTuple(sourceTuple.address());
    }
    source->lastCompactionOffset(m_nextTupleInSourceOffset);

    int newBucketIndex = calculateBucketIndex();
    if (newBucketIndex != m_bucketIndex) {
        m_bucketIndex = newBucketIndex;
        //std::cout << "Merged " << static_cast<void*>(this) << "(" << m_activeTuples << ") with " << static_cast<void*>(source.get()) << "(" << source->m_activeTuples << ")";
        //std::cout << " found " << sourceTuplesPendingDeleteOnUndoRelease << " tuples pending delete on undo release " << std::endl;
        return std::pair<int, int>(newBucketIndex, source->calculateBucketIndex(sourceTuplesPendingDeleteOnUndoRelease));
    } else {
        //std::cout << "Merged " << static_cast<void*>(this) << "(" << m_activeTuples << ") with " << static_cast<void*>(source.get()) << "(" << source->m_activeTuples << ")";
        //std::cout << " found " << sourceTuplesPendingDeleteOnUndoRelease << " tuples pending delete on undo release " << std::endl;
        return std::pair<int, int>(-1, source->calculateBucketIndex(sourceTuplesPendingDeleteOnUndoRelease));
    }
}
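// A simplified, self-contained model of the merge loop above: move active entries from
// a source block into the destination's free slots, skipping inactive slots and entries
// that cannot move yet ("pinned" stands in for pending-delete-on-undo-release). The
// Slot type and function are illustrative assumptions, not TupleBlock's API.
#include <cstddef>
#include <vector>

struct Slot {
    bool active;
    bool pinned; // cannot be moved yet; moving would invalidate an outstanding pointer
    int value;
};

// Returns how many pinned active entries were seen in the source (they stay behind).
int mergeBlocks(std::vector<Slot> &dest, std::vector<Slot> &source) {
    int pinnedSeen = 0;
    size_t cursor = 0; // like lastCompactionOffset(): resume where the last merge stopped
    for (size_t d = 0; d < dest.size(); d++) {
        if (dest[d].active) {
            continue; // need a free destination slot
        }
        // Scan forward for the next movable active entry in the source.
        bool moved = false;
        while (cursor < source.size()) {
            Slot &s = source[cursor++];
            if (!s.active) {
                continue;
            }
            if (s.pinned) {
                pinnedSeen++; // count it, but leave it in place
                continue;
            }
            dest[d] = s;      // move the entry into the free destination slot
            s.active = false; // and free it in the source
            moved = true;
            break;
        }
        if (!moved) {
            break; // the source has no more movable entries
        }
    }
    return pinnedSeen;
}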