/**
 * Stream another batch of tuples and report the resulting stream positions.
 *
 * Delegates the actual streaming to serializeMore(), then appends the
 * current position of every output stream, in stream order, to retPositions.
 * Entries already present in retPositions are left untouched.
 *
 * @param outputStreams  Output streams to serialize into.
 * @param retPositions   Receives one appended position per output stream.
 * @return Whatever serializeMore() returned for this batch.
 */
int64_t CopyOnWriteContext::handleStreamMore(TupleOutputStreamProcessor &outputStreams,
                                             std::vector<int> &retPositions) {
    const int64_t tuplesLeft = serializeMore(outputStreams);

    // Positions are copied out unconditionally once serializeMore() returns.
    // Can this copy be avoided?
    const size_t streamCount = outputStreams.size();
    for (size_t idx = 0; idx != streamCount; ++idx) {
        retPositions.push_back(static_cast<int>(outputStreams.at(idx).position()));
    }

    return tuplesLeft;
}
/**
 * Mandatory TableStreamContext override.
 *
 * Produces the next recovery message into the single supplied output stream
 * and appends that stream's position to retPositions.
 *
 * @param outputStreams  Must contain exactly one stream; anything else is
 *                       reported through throwFatalException.
 * @param retPositions   Receives one appended position per output stream.
 * @return 1 if nextMessage() reported that more remains (the exact number of
 *         remaining tuples is not known here), 0 otherwise.
 */
int64_t RecoveryContext::handleStreamMore(TupleOutputStreamProcessor &outputStreams,
                                          std::vector<int> &retPositions) {
    if (outputStreams.size() != 1) {
        // The cast makes the argument type agree with the %ld conversion;
        // size() is not a (signed) long.
        throwFatalException("RecoveryContext::handleStreamMore: Expect 1 output stream "
                            "for recovery, received %ld",
                            static_cast<long>(outputStreams.size()));
    }

    /*
     * Table ids don't change during recovery because
     * catalog changes are not allowed.
     */
    const bool hasMore = nextMessage(&outputStreams[0]);

    // Non-zero if some tuples remain, we're just not sure how many.
    const int64_t remaining = (hasMore ? 1 : 0);

    for (size_t i = 0; i < outputStreams.size(); i++) {
        retPositions.push_back(static_cast<int>(outputStreams.at(i).position()));
    }

    return remaining;
}
/* * Serialize to multiple output streams. * Return remaining tuple count, 0 if done, or -1 on error. */ int64_t CopyOnWriteContext::serializeMore(TupleOutputStreamProcessor &outputStreams) { // Don't expect to be re-called after streaming all the tuples. if (m_tuplesRemaining == 0) { throwFatalException("serializeMore() was called again after streaming completed.") } // Need to initialize the output stream list. if (outputStreams.empty()) { throwFatalException("serializeMore() expects at least one output stream."); } outputStreams.open(m_table, m_maxTupleLength, m_partitionId, m_predicates); //=== Tuple processing loop TableTuple tuple(m_table.schema()); // Set to true to break out of the loop after the tuples dry up // or the byte count threshold is hit. bool yield = false; while (!yield) { // Next tuple? bool hasMore = m_iterator->next(tuple); if (hasMore) { // -1 is used as a sentinel value to disable counting for tests. if (m_tuplesRemaining > 0) { m_tuplesRemaining--; } /* * Write the tuple to all the output streams. * Done if any of the buffers filled up. * The returned copy count helps decide when to delete if m_doDelete is true. */ int32_t numCopiesMade = 0; yield = outputStreams.writeRow(m_serializer, tuple, numCopiesMade); /* * May want to delete tuple if processing the actual table. */ if (!m_finishedTableScan) { /* * If this is the table scan, check to see if the tuple is pending * delete and return the tuple if it iscop */ if (tuple.isPendingDelete()) { assert(!tuple.isPendingDeleteOnUndoRelease()); CopyOnWriteIterator *iter = static_cast<CopyOnWriteIterator*>(m_iterator.get()); //Save the extra lookup if possible m_table.deleteTupleStorage(tuple, iter->m_currentBlock); } /* * Delete a moved tuple? * This is used for Elastic rebalancing, which is wrapped in a transaction. * The delete for undo is generic enough to support this operation. 
*/ else if (m_doDelete && numCopiesMade > 0) { m_table.deleteTupleForUndo(tuple.address(), true); } } } else if (!m_finishedTableScan) { /* * After scanning the persistent table switch to scanning the temp * table with the tuples that were backed up. */ m_finishedTableScan = true; m_iterator.reset(m_backedUpTuples.get()->makeIterator()); } else { /* * No more tuples in the temp table and had previously finished the * persistent table. */ if (m_tuplesRemaining > 0) { #ifdef DEBUG throwFatalException("serializeMore(): tuple count > 0 after streaming:\n" "Table name: %s\n" "Table type: %s\n" "Original tuple count: %jd\n" "Active tuple count: %jd\n" "Remaining tuple count: %jd\n" "Compacted block count: %jd\n" "Dirty insert count: %jd\n" "Dirty update count: %jd\n" "Partition column: %d\n", m_table.name().c_str(), m_table.tableType().c_str(), (intmax_t)m_totalTuples, (intmax_t)m_table.activeTupleCount(), (intmax_t)m_tuplesRemaining, (intmax_t)m_blocksCompacted, (intmax_t)m_inserts, (intmax_t)m_updates, m_table.partitionColumn()); #else char message[1024 * 16]; snprintf(message, 1024 * 16, "serializeMore(): tuple count > 0 after streaming:\n" "Table name: %s\n" "Table type: %s\n" "Original tuple count: %jd\n" "Active tuple count: %jd\n" "Remaining tuple count: %jd\n" "Compacted block count: %jd\n" "Dirty insert count: %jd\n" "Dirty update count: %jd\n" "Partition column: %d\n", m_table.name().c_str(), m_table.tableType().c_str(), (intmax_t)m_totalTuples, (intmax_t)m_table.activeTupleCount(), (intmax_t)m_tuplesRemaining, (intmax_t)m_blocksCompacted, (intmax_t)m_inserts, (intmax_t)m_updates, m_table.partitionColumn()); LogManager::getThreadLogger(LOGGERID_HOST)->log(LOGLEVEL_ERROR, message); #endif } // -1 is used for tests when we don't bother counting. Need to force it to 0 here. if (m_tuplesRemaining < 0) { m_tuplesRemaining = 0; } } // All tuples serialized, bail if (m_tuplesRemaining == 0) { /* * CAUTION: m_iterator->next() is NOT side-effect free!!! 
It also * returns the block back to the table if the call causes it to go * over the boundary of used tuples. In case it actually returned * the very last tuple in the table last time it's called, the block * is still hanging around. So we need to call it again to return * the block here. */ if (hasMore) { hasMore = m_iterator->next(tuple); if (hasMore) { assert(false); } } yield = true; } } // end tuple processing while loop // Need to close the output streams and insert row counts. outputStreams.close(); m_serializationBatches++; // Handle the sentinel value of -1 which is passed in from tests that don't // care about the active tuple count. Return max int as if there are always // tuples remaining (until the counter is forced to zero when done). if (m_tuplesRemaining < 0) { return std::numeric_limits<int32_t>::max(); } // Done when the table scan is finished and iteration is complete. return m_tuplesRemaining; }
/**
 * Exercise the multi-COW.
 *
 * Streams the table through one output stream per partition predicate
 * (with one partition deliberately given a predicate for partition -1),
 * mutating the table between batches, and checks that each stream received
 * exactly the tuples that were in its partition when streaming was
 * activated. The last repetition asks for the delete flag instead of
 * mutating, and then checks that only the skipped tuples are left.
 */
TEST_F(CopyOnWriteTest, MultiStreamTest) {

    // Constants
    const int32_t npartitions = 7;
    const int tupleCount = TUPLE_COUNT;

    DefaultTupleSerializer serializer;

    initTable(true);
    addRandomUniqueTuples(m_table, tupleCount);

    MultiStreamTestTool tool(*m_table, npartitions);

    for (size_t iteration = 0; iteration < NUM_REPETITIONS; iteration++) {

        // The last repetition does the delete after streaming.
        bool doDelete = (iteration == NUM_REPETITIONS - 1);

        tool.iterate();

        int totalInserted = 0;              // Total tuple counter.
        // Stream buffers, one per partition. Owned by vectors so that the
        // array storage is released correctly at the end of each iteration.
        std::vector<std::vector<char> > buffers(npartitions, std::vector<char>(BUFFER_SIZE));
        std::vector<std::string> strings(npartitions);  // Range strings.
        TupleSet expected[npartitions];     // Expected tuple values by partition.
        TupleSet actual[npartitions];       // Actual tuple values by partition.
        int totalSkipped = 0;

        // Prepare streams by generating ranges and range strings based on
        // the desired number of partitions/predicates.
        // Since integer hashes use a simple modulus we just need to provide
        // the partition number for the range.
        // Skip one partition to make it interesting.
        int32_t skippedPartition = npartitions / 2;
        for (int32_t i = 0; i < npartitions; i++) {
            if (i != skippedPartition) {
                strings[i] = tool.generatePredicateString(i);
            }
            else {
                strings[i] = tool.generatePredicateString(-1);
            }
        }

        // Serialized activation input: delete flag, predicate count, predicates.
        char buffer[1024 * 256];
        ReferenceSerializeOutput output(buffer, 1024 * 256);
        output.writeByte((int8_t)(doDelete ? 1 : 0));
        output.writeInt(npartitions);
        for (std::vector<std::string>::iterator i = strings.begin(); i != strings.end(); i++) {
            output.writeTextString(*i);
        }

        tool.context("precalculate");

        // Map original tuples to expected partitions.
        voltdb::TableIterator& iterator = m_table->iterator();
        int partCol = m_table->partitionColumn();
        TableTuple tuple(m_table->schema());
        while (iterator.next(tuple)) {
            int64_t value = *reinterpret_cast<int64_t*>(tuple.address() + 1);
            int32_t ipart = (int32_t)(ValuePeeker::peekAsRawInt64(tuple.getNValue(partCol)) % npartitions);
            if (ipart != skippedPartition) {
                bool inserted = expected[ipart].insert(value).second;
                if (!inserted) {
                    int32_t primaryKey = ValuePeeker::peekAsInteger(tuple.getNValue(0));
                    tool.error("Duplicate primary key %d iteration=%lu", primaryKey, iteration);
                }
                ASSERT_TRUE(inserted);
            }
            else {
                totalSkipped++;
            }
        }

        tool.context("activate");

        ReferenceSerializeInput input(buffer, output.position());
        bool alreadyActivated = m_table->activateStream(serializer, TABLE_STREAM_SNAPSHOT, 0, m_tableId, input);
        if (alreadyActivated) {
            tool.error("COW was previously activated");
        }
        ASSERT_FALSE(alreadyActivated);

        int64_t remaining = tupleCount;
        while (remaining > 0) {

            // Prepare output streams and their buffers.
            TupleOutputStreamProcessor outputStreams;
            for (int32_t i = 0; i < npartitions; i++) {
                outputStreams.add((void*)&buffers[i][0], BUFFER_SIZE);
            }

            std::vector<int> retPositions;
            remaining = m_table->streamMore(outputStreams, retPositions);
            if (remaining >= 0) {
                ASSERT_EQ(outputStreams.size(), retPositions.size());
            }

            // Per-predicate iterators.
            TupleOutputStreamProcessor::iterator outputStream = outputStreams.begin();

            // Record the final result of streaming to each partition/predicate.
            for (size_t ipart = 0; ipart < npartitions; ipart++) {

                tool.context("serialize: partition=%lu remaining=%lld",
                             ipart, static_cast<long long>(remaining));

                const int serialized = static_cast<int>(outputStream->position());
                if (serialized > 0) {
                    // Skip partition id, row count and first tuple length.
                    int ibuf = sizeof(int32_t) * 3;
                    // Signed bound: avoids unsigned wrap-around if fewer than
                    // sizeof(int32_t) bytes were serialized.
                    const int tupleDataEnd = serialized - static_cast<int>(sizeof(int32_t));
                    while (ibuf < tupleDataEnd) {
                        int32_t values[2];
                        values[0] = ntohl(*reinterpret_cast<const int32_t*>(&buffers[ipart][0]+ibuf));
                        values[1] = ntohl(*reinterpret_cast<const int32_t*>(&buffers[ipart][0]+ibuf+4));
                        int64_t value = *reinterpret_cast<int64_t*>(values);
                        const bool inserted = actual[ipart].insert(value).second;
                        if (!inserted) {
                            tool.valueError(values, "Buffer duplicate: ipart=%lu totalInserted=%d ibuf=%d",
                                            ipart, totalInserted, ibuf);
                        }
                        ASSERT_TRUE(inserted);

                        totalInserted++;

                        // Account for tuple data and second tuple length.
                        ibuf += static_cast<int>(m_tupleWidth + sizeof(int32_t));
                    }
                }

                // Mozy along to the next predicate/partition.
                // Do a silly cross-check that the iterator doesn't end prematurely.
                ++outputStream;
                ASSERT_TRUE(ipart == npartitions - 1 || outputStream != outputStreams.end());
            }

            // Mutate the table.
            if (!doDelete) {
                for (size_t imutation = 0; imutation < NUM_MUTATIONS; imutation++) {
                    doRandomTableMutation(m_table);
                }
            }
        }

        // Summarize partitions with incorrect tuple counts.
        for (size_t ipart = 0; ipart < npartitions; ipart++) {
            tool.context("check size: partition=%lu", ipart);
            if (expected[ipart].size() != actual[ipart].size()) {
                tool.error("Size mismatch: expected=%lu actual=%lu",
                           expected[ipart].size(), actual[ipart].size());
            }
        }

        // Summarize partitions where expected and actual aren't equal.
        for (size_t ipart = 0; ipart < npartitions; ipart++) {
            tool.context("check equality: partition=%lu", ipart);
            if (expected[ipart] != actual[ipart]) {
                tool.error("Not equal");
            }
        }

        // Look for tuples that are missing from partitions.
        for (size_t ipart = 0; ipart < npartitions; ipart++) {
            tool.context("missing: partition=%lu", ipart);
            tool.diff(expected[ipart], actual[ipart]);
        }

        // Look for extra tuples that don't belong in partitions.
        for (size_t ipart = 0; ipart < npartitions; ipart++) {
            tool.context("extra: partition=%lu", ipart);
            tool.diff(actual[ipart], expected[ipart]);
        }

        // Check tuple diff for each predicate/partition.
        for (size_t ipart = 0; ipart < npartitions; ipart++) {
            tool.context("check equality: partition=%lu", ipart);
            ASSERT_EQ(expected[ipart].size(), actual[ipart].size());
            ASSERT_TRUE(expected[ipart] == actual[ipart]);
        }

        // Check for dirty tuples.
        tool.context("check dirty");
        int numTuples = 0;
        // NOTE(review): `iterator` is a reference, so this assigns into the
        // referenced iterator object rather than rebinding it; presumably
        // m_table->iterator() restarts the scan - confirm.
        iterator = m_table->iterator();
        while (iterator.next(tuple)) {
            if (tuple.isDirty()) {
                tool.error("Found tuple %d is active and dirty at end of COW",
                           ValuePeeker::peekAsInteger(tuple.getNValue(0)));
            }
            numTuples++;
            ASSERT_FALSE(tuple.isDirty());
        }

        // If deleting check the tuples remaining in the table.
        if (doDelete) {
            ASSERT_EQ(numTuples, totalSkipped);
        }
        else {
            ASSERT_EQ(numTuples, tupleCount + (m_tuplesInserted - m_tuplesDeleted));
        }
        ASSERT_EQ(tool.nerrors, 0);
    }
}