/// NOTE: This ID is used to create part names which are then persisted in ZK and as directory names on the file system. /// So if you want to change this method, be sure to guarantee compatibility with existing table data. String MergeTreePartition::getID(const Block & partition_key_sample) const { if (value.size() != partition_key_sample.columns()) throw Exception("Invalid partition key size: " + toString(value.size()), ErrorCodes::LOGICAL_ERROR); if (value.empty()) return "all"; /// It is tempting to use an empty string here. But that would break directory structure in ZK. /// In case all partition fields are represented by integral types, try to produce a human-readable ID. /// Otherwise use a hex-encoded hash. bool are_all_integral = true; for (const Field & field : value) { if (field.getType() != Field::Types::UInt64 && field.getType() != Field::Types::Int64) { are_all_integral = false; break; } } String result; if (are_all_integral) { FieldVisitorToString to_string_visitor; for (size_t i = 0; i < value.size(); ++i) { if (i > 0) result += '-'; if (typeid_cast<const DataTypeDate *>(partition_key_sample.getByPosition(i).type.get())) result += toString(DateLUT::instance().toNumYYYYMMDD(DayNum(value[i].safeGet<UInt64>()))); else result += applyVisitor(to_string_visitor, value[i]); /// It is tempting to output DateTime as YYYYMMDDhhmmss, but that would make partition ID /// timezone-dependent. } return result; } SipHash hash; FieldVisitorHash hashing_visitor(hash); for (const Field & field : value) applyVisitor(hashing_visitor, field); char hash_data[16]; hash.get128(hash_data); result.resize(32); for (size_t i = 0; i < 16; ++i) writeHexByteLowercase(hash_data[i], &result[2 * i]); return result; }
/// Feeds this node's ID, then the raw bytes of its child count, then (recursively) every child subtree
/// into `hash_state`.
void IAST::getTreeHashImpl(SipHash & hash_state) const
{
    const auto node_id = getID();
    hash_state.update(node_id.data(), node_id.size());

    /// The child count is hashed as raw bytes of a size_t.
    size_t children_count = children.size();
    hash_state.update(reinterpret_cast<const char *>(&children_count), sizeof(children_count));

    for (const auto & subtree : children)
        subtree->getTreeHashImpl(hash_state);
}
/// Computes the hashed key for `key`: a SipHash over the raw bytes of the value returned by
/// ClickHouseRevision::get() followed by the bytes of `key`. The two halves of the 128-bit result
/// are stored in `first` and `second` of the returned HashedKey.
static Compiler::HashedKey getHash(const std::string & key)
{
    const auto revision = ClickHouseRevision::get();

    SipHash hasher;
    hasher.update(reinterpret_cast<const char *>(&revision), sizeof(revision));
    hasher.update(key.data(), key.size());

    Compiler::HashedKey hashed_key;
    hasher.get128(hashed_key.first, hashed_key.second);
    return hashed_key;
}
/// Updates `hash_state` with this node (via updateTreeHashImpl), then with the number of children,
/// then recursively with every child subtree in order.
void IAST::updateTreeHash(SipHash & hash_state) const
{
    updateTreeHashImpl(hash_state);
    hash_state.update(children.size());

    for (auto it = children.begin(); it != children.end(); ++it)
        (*it)->updateTreeHash(hash_state);
}
/// Updates `hash` with the n-th array: first its size, then every nested element in order.
void ColumnArray::updateHashWithValue(size_t n, SipHash & hash) const
{
    const size_t array_size = sizeAt(n);
    hash.update(array_size);

    /// Elements of the n-th array occupy positions [first, last) of the nested data column.
    const size_t first = offsetAt(n);
    const size_t last = first + array_size;
    for (size_t pos = first; pos < last; ++pos)
        getData().updateHashWithValue(pos, hash);
}
/// Updates `hash_state` with the bytes of this node's ID only; children are not visited here.
void IAST::updateTreeHashImpl(SipHash & hash_state) const
{
    const auto node_id = getID();
    hash_state.update(node_id.data(), node_id.size());
}
/// Updates `hash` with the `n` bytes of `chars` that start at offset `n * index`.
void ColumnFixedString::updateHashWithValue(size_t index, SipHash & hash) const
{
    const size_t byte_offset = n * index;
    const auto * row_bytes = reinterpret_cast<const char *>(&chars[byte_offset]);
    hash.update(row_bytes, n);
}
/// Updates `hash` with the n-th element of the column.
void ColumnVector<T>::updateHashWithValue(size_t n, SipHash & hash) const
{
    const auto & element = data[n];
    hash.update(element);
}
void ReplicatedMergeTreeBlockOutputStream::write(const Block & block) { last_block_is_duplicate = false; /// TODO Is it possible to not lock the table structure here? storage.data.delayInsertOrThrowIfNeeded(&storage.partial_shutdown_event); auto zookeeper = storage.getZooKeeper(); assertSessionIsNotExpired(zookeeper); /** If write is with quorum, then we check that the required number of replicas is now live, * and also that for all previous parts for which quorum is required, this quorum is reached. * And also check that during the insertion, the replica was not reinitialized or disabled (by the value of `is_active` node). * TODO Too complex logic, you can do better. */ if (quorum) checkQuorumPrecondition(zookeeper); auto part_blocks = storage.writer.splitBlockIntoParts(block); for (auto & current_block : part_blocks) { Stopwatch watch; /// Write part to the filesystem under temporary name. Calculate a checksum. MergeTreeData::MutableDataPartPtr part = storage.writer.writeTempPart(current_block); String block_id; if (deduplicate) { SipHash hash; part->checksums.computeTotalChecksumDataOnly(hash); union { char bytes[16]; UInt64 words[2]; } hash_value; hash.get128(hash_value.bytes); /// We add the hash from the data and partition identifier to deduplication ID. /// That is, do not insert the same data to the same partition twice. block_id = part->info.partition_id + "_" + toString(hash_value.words[0]) + "_" + toString(hash_value.words[1]); LOG_DEBUG(log, "Wrote block with ID '" << block_id << "', " << block.rows() << " rows"); } else { LOG_DEBUG(log, "Wrote block with " << block.rows() << " rows"); } try { commitPart(zookeeper, part, block_id); /// Set a special error code if the block is duplicate int error = (deduplicate && last_block_is_duplicate) ? ErrorCodes::INSERT_WAS_DEDUPLICATED : 0; PartLog::addNewPart(storage.context, part, watch.elapsed(), ExecutionStatus(error)); } catch (...) 
{ PartLog::addNewPart(storage.context, part, watch.elapsed(), ExecutionStatus::fromCurrentException(__PRETTY_FUNCTION__)); throw; } } }
int main(int argc, char ** argv) { using Strings = std::vector<std::string>; using Hashes = std::vector<char>; Strings strings; size_t rows = 0; size_t bytes = 0; { Stopwatch watch; DB::ReadBufferFromFileDescriptor in(STDIN_FILENO); while (!in.eof()) { strings.push_back(std::string()); DB::readEscapedString(strings.back(), in); DB::assertChar('\n', in); bytes += strings.back().size() + 1; } watch.stop(); rows = strings.size(); std::cerr << std::fixed << std::setprecision(2) << "Read " << rows << " rows, " << bytes / 1000000.0 << " MB" << ", elapsed: " << watch.elapsedSeconds() << " (" << rows / watch.elapsedSeconds() << " rows/sec., " << bytes / 1000000.0 / watch.elapsedSeconds() << " MB/sec.)" << std::endl; } Hashes hashes(16 * rows); { Stopwatch watch; for (size_t i = 0; i < rows; ++i) { *reinterpret_cast<UInt64*>(&hashes[i * 16]) = CityHash64(strings[i].data(), strings[i].size()); } watch.stop(); UInt64 check = CityHash64(&hashes[0], hashes.size()); std::cerr << std::fixed << std::setprecision(2) << "CityHash64 (check = " << check << ")" << ", elapsed: " << watch.elapsedSeconds() << " (" << rows / watch.elapsedSeconds() << " rows/sec., " << bytes / 1000000.0 / watch.elapsedSeconds() << " MB/sec.)" << std::endl; } /* { Stopwatch watch; std::vector<char> seed(16); for (size_t i = 0; i < rows; ++i) { sipHash( reinterpret_cast<unsigned char *>(&hashes[i * 16]), reinterpret_cast<const unsigned char *>(strings[i].data()), strings[i].size(), reinterpret_cast<const unsigned char *>(&seed[0])); } watch.stop(); UInt64 check = CityHash64(&hashes[0], hashes.size()); std::cerr << std::fixed << std::setprecision(2) << "SipHash (check = " << check << ")" << ", elapsed: " << watch.elapsedSeconds() << " (" << rows / watch.elapsedSeconds() << " rows/sec., " << bytes / 1000000.0 / watch.elapsedSeconds() << " MB/sec.)" << std::endl; }*/ { Stopwatch watch; for (size_t i = 0; i < rows; ++i) { SipHash hash; hash.update(strings[i].data(), strings[i].size()); hash.get128(&hashes[i * 
16]); } watch.stop(); UInt64 check = CityHash64(&hashes[0], hashes.size()); std::cerr << std::fixed << std::setprecision(2) << "SipHash, stream (check = " << check << ")" << ", elapsed: " << watch.elapsedSeconds() << " (" << rows / watch.elapsedSeconds() << " rows/sec., " << bytes / 1000000.0 / watch.elapsedSeconds() << " MB/sec.)" << std::endl; } { Stopwatch watch; for (size_t i = 0; i < rows; ++i) { MD5_CTX state; MD5_Init(&state); MD5_Update(&state, reinterpret_cast<const unsigned char *>(strings[i].data()), strings[i].size()); MD5_Final(reinterpret_cast<unsigned char *>(&hashes[i * 16]), &state); } watch.stop(); UInt64 check = CityHash64(&hashes[0], hashes.size()); std::cerr << std::fixed << std::setprecision(2) << "MD5 (check = " << check << ")" << ", elapsed: " << watch.elapsedSeconds() << " (" << rows / watch.elapsedSeconds() << " rows/sec., " << bytes / 1000000.0 / watch.elapsedSeconds() << " MB/sec.)" << std::endl; } return 0; }
/** Writes `block` to the replicated table.
  *
  * Steps, in order:
  *  1. If `quorum` is set, check the preconditions for a quorum insert (enough live replicas,
  *     no unfinished previous quorum insert, this replica is active).
  *  2. Split the block into parts; for every part allocate a block number in ZooKeeper,
  *     write the part under a temporary name and compute the hash of its data.
  *  3. In a single ZooKeeper multi-op register the block ID, the part and the replication log entry,
  *     release the block number lock and (for quorum inserts) create the quorum status node.
  *  4. If `quorum` is set, wait until the quorum status node disappears or refers to another part.
  *
  * Throws UNKNOWN_STATUS_OF_INSERT when it cannot be determined whether the data was inserted
  * (the client is expected to retry).
  */
void ReplicatedMergeTreeBlockOutputStream::write(const Block & block)
{
    /// TODO Can I not lock the table structure here?
    storage.data.delayInsertIfNeeded(&storage.restarting_thread->getWakeupEvent());

    auto zookeeper = storage.getZooKeeper();
    assertSessionIsNotExpired(zookeeper);

    /** If write is with quorum, then we check that the required number of replicas is now live,
      *  and also that for all previous parts for which quorum is required, this quorum is reached.
      * And also check that during the insertion, the replica was not reinitialized or disabled (by the value of `is_active` node).
      * TODO Too complex logic, you can do better.
      */
    String quorum_status_path = storage.zookeeper_path + "/quorum/status";
    String is_active_node_value;
    int is_active_node_version = -1;
    int host_node_version = -1;
    if (quorum)
    {
        /// The three reads are issued asynchronously up front; their results are collected below with `get()`.
        zkutil::ZooKeeper::TryGetFuture quorum_status_future = zookeeper->asyncTryGet(quorum_status_path);
        zkutil::ZooKeeper::TryGetFuture is_active_future = zookeeper->asyncTryGet(storage.replica_path + "/is_active");
        zkutil::ZooKeeper::TryGetFuture host_future = zookeeper->asyncTryGet(storage.replica_path + "/host");

        /// List of live replicas. All of them register an ephemeral node for leader_election.
        zkutil::Stat leader_election_stat;
        zookeeper->get(storage.zookeeper_path + "/leader_election", &leader_election_stat);

        if (leader_election_stat.numChildren < static_cast<int32_t>(quorum))
            throw Exception("Number of alive replicas (" + toString(leader_election_stat.numChildren) + ") is less than requested quorum (" + toString(quorum) + ").", ErrorCodes::TOO_LESS_LIVE_REPLICAS);

        /** Is there a quorum for the last part for which a quorum is needed?
          * Writes of all the parts with quorum enabled are linearly ordered.
          * This means that at any time there can be only one part
          *  for which the quorum is required but not yet reached.
          * Information about this part is located in the `/quorum/status` node.
          * If the quorum is reached, then the node is deleted.
          */
        auto quorum_status = quorum_status_future.get();
        if (quorum_status.exists)
            throw Exception("Quorum for previous write has not been satisfied yet. Status: " + quorum_status.value, ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);

        /// Both checks are implicitly made also later (otherwise there would be a race condition).
        auto is_active = is_active_future.get();
        auto host = host_future.get();

        if (!is_active.exists || !host.exists)
            throw Exception("Replica is not active right now", ErrorCodes::READONLY);

        /// Remembered to detect, at commit time and after the quorum wait, that the replica was restarted meanwhile.
        is_active_node_value = is_active.value;
        is_active_node_version = is_active.stat.version;
        host_node_version = host.stat.version;
    }

    auto part_blocks = storage.writer.splitBlockIntoParts(block);

    for (auto & current_block : part_blocks)
    {
        assertSessionIsNotExpired(zookeeper);

        /// With an explicit `insert_id`, the block ID is "<insert_id>__<running block index>"; otherwise it is derived from the data hash below.
        ++block_index;
        String block_id = insert_id.empty() ? "" : insert_id + "__" + toString(block_index);
        /// YYYYMMDD / 100 gives YYYYMM.
        String month_name = toString(DateLUT::instance().toNumYYYYMMDD(DayNum_t(current_block.min_date)) / 100);

        AbandonableLockInZooKeeper block_number_lock = storage.allocateBlockNumber(month_name);    /// 2 RTT

        Int64 part_number = block_number_lock.getNumber();

        MergeTreeData::MutableDataPartPtr part = storage.writer.writeTempPart(current_block, part_number);
        String part_name = ActiveDataPartSet::getPartName(part->left_date, part->right_date, part->left, part->right, part->level);

        /// Hash from the data.
        SipHash hash;
        part->checksums.summaryDataChecksum(hash);
        union
        {
            char bytes[16];
            UInt64 words[2];
        } hash_value;
        hash.get128(hash_value.bytes);

        /// The raw 16 hash bytes; stored in ZooKeeper under the block's `checksum` node.
        String checksum(hash_value.bytes, 16);

        /// If no ID is specified in query, we take the hash from the data as ID. That is, do not insert the same data twice.
        /// NOTE: If you do not need this deduplication, you can leave `block_id` empty instead.
        ///  Setting or syntax in the query (for example, `ID = null`) could be done for this.
        if (block_id.empty())
        {
            block_id = toString(hash_value.words[0]) + "_" + toString(hash_value.words[1]);

            /// Sanity check only: the string assigned above always contains at least the "_" separator.
            if (block_id.empty())
                throw Exception("Logical error: block_id is empty.", ErrorCodes::LOGICAL_ERROR);
        }

        LOG_DEBUG(log, "Wrote block " << part_number << " with ID " << block_id << ", " << current_block.block.rows() << " rows");

        StorageReplicatedMergeTree::LogEntry log_entry;
        log_entry.type = StorageReplicatedMergeTree::LogEntry::GET_PART;
        log_entry.create_time = time(0);
        log_entry.source_replica = storage.replica_name;
        log_entry.new_part_name = part_name;
        log_entry.quorum = quorum;
        log_entry.block_id = block_id;

        /// Simultaneously add information about the part to all the necessary places in ZooKeeper and remove block_number_lock.

        /// Information about the block.
        zkutil::Ops ops;
        auto acl = zookeeper->getDefaultACL();

        ops.emplace_back(
            std::make_unique<zkutil::Op::Create>(
                storage.zookeeper_path + "/blocks/" + block_id,
                "",
                acl,
                zkutil::CreateMode::Persistent));
        ops.emplace_back(
            std::make_unique<zkutil::Op::Create>(
                storage.zookeeper_path + "/blocks/" + block_id + "/checksum",
                checksum,
                acl,
                zkutil::CreateMode::Persistent));
        ops.emplace_back(
            std::make_unique<zkutil::Op::Create>(
                storage.zookeeper_path + "/blocks/" + block_id + "/number",
                toString(part_number),
                acl,
                zkutil::CreateMode::Persistent));

        /// Information about the part, in the replica data.
        storage.addNewPartToZooKeeper(part, ops, part_name);

        /// Replication log.
        ops.emplace_back(std::make_unique<zkutil::Op::Create>(
            storage.zookeeper_path + "/log/log-",
            log_entry.toString(),
            acl,
            zkutil::CreateMode::PersistentSequential));

        /// Deletes the information that the block number is used for writing.
        block_number_lock.getUnlockOps(ops);

        /** If you need a quorum - create a node in which the quorum is monitored.
          * (If such a node already exists, then someone has managed to make another quorum record at the same time, but for it the quorum has not yet been reached.
          *  You can not do the next quorum record at this time.)
          */
        if (quorum)
        {
            ReplicatedMergeTreeQuorumEntry quorum_entry;
            quorum_entry.part_name = part_name;
            quorum_entry.required_number_of_replicas = quorum;
            quorum_entry.replicas.insert(storage.replica_name);

            /** At this point, this node will contain information that the current replica received the part.
              * When other replicas receive this part (in the usual way, processing the replication log),
              *  they will add themselves to the contents of this node.
              * When it contains information about `quorum` number of replicas, this node is deleted,
              *  which indicates that the quorum has been reached.
              */

            ops.emplace_back(
                std::make_unique<zkutil::Op::Create>(
                    quorum_status_path,
                    quorum_entry.toString(),
                    acl,
                    zkutil::CreateMode::Persistent));

            /// Make sure that during the insertion time, the replica was not reinitialized or disabled (when the server is finished).
            ops.emplace_back(
                std::make_unique<zkutil::Op::Check>(
                    storage.replica_path + "/is_active",
                    is_active_node_version));

            /// Unfortunately, just checking the above is not enough, because `is_active` node can be deleted and reappear with the same version.
            /// But then the `host` value will change. We will check this.
            /// It's great that these two nodes change in the same transaction (see MergeTreeRestartingThread).
            ops.emplace_back(
                std::make_unique<zkutil::Op::Check>(
                    storage.replica_path + "/host",
                    host_node_version));
        }

        MergeTreeData::Transaction transaction; /// If you can not add the part to ZK, we'll remove it again from the working set.
        storage.data.renameTempPartAndAdd(part, nullptr, &transaction);

        try
        {
            auto code = zookeeper->tryMulti(ops);
            if (code == ZOK)
            {
                transaction.commit();
                storage.merge_selecting_event.set();
            }
            else if (code == ZNODEEXISTS)
            {
                /// If the block with such ID already exists in the table, roll back its insertion.
                String expected_checksum;
                if (!block_id.empty() && zookeeper->tryGet(
                    storage.zookeeper_path + "/blocks/" + block_id + "/checksum", expected_checksum))
                {
                    LOG_INFO(log, "Block with ID " << block_id << " already exists; ignoring it (removing part " << part->name << ")");

                    /// If the data is different from the ones that were inserted earlier with the same ID, throw an exception.
                    if (expected_checksum != checksum)
                    {
                        if (!insert_id.empty())
                            throw Exception("Attempt to insert block with same ID but different checksum", ErrorCodes::CHECKSUM_DOESNT_MATCH);
                        else
                            throw Exception("Logical error: got ZNODEEXISTS while inserting data, block ID is derived from checksum but checksum doesn't match", ErrorCodes::LOGICAL_ERROR);
                    }

                    /// NOTE(review): after this rollback control still reaches the quorum wait below when `quorum` is set - confirm this is intended.
                    transaction.rollback();
                }
                else if (zookeeper->exists(quorum_status_path))
                {
                    transaction.rollback();

                    throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);
                }
                else
                {
                    /// if the node with the quorum existed, but was quickly removed.

                    throw Exception("Unexpected ZNODEEXISTS while adding block " + toString(part_number) + " with ID " + block_id + ": "
                        + zkutil::ZooKeeper::error2string(code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
                }
            }
            else
            {
                throw Exception("Unexpected error while adding block " + toString(part_number) + " with ID " + block_id + ": "
                    + zkutil::ZooKeeper::error2string(code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
            }
        }
        catch (const zkutil::KeeperException & e)
        {
            /** If the connection is lost, and we do not know if the changes were applied, we can not delete the local part:
              *  if the changes were applied, the inserted block appeared in `/blocks/`, and it can not be inserted again.
              */
            if (e.code == ZOPERATIONTIMEOUT ||
                e.code == ZCONNECTIONLOSS)
            {
                /// Keep the part locally and schedule a check for it.
                transaction.commit();
                storage.enqueuePartForCheck(part->name, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER);

                /// We do not know whether or not data has been inserted.
                throw Exception("Unknown status, client must retry. Reason: " + e.displayText(), ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
            }

            throw;
        }

        if (quorum)
        {
            /// We are waiting for the quorum to be reached.
            LOG_TRACE(log, "Waiting for quorum");

            try
            {
                while (true)
                {
                    zkutil::EventPtr event = std::make_shared<Poco::Event>();

                    std::string value;
                    /// `get` instead of `exists` so that `watch` does not leak if the node is no longer there.
                    if (!zookeeper->tryGet(quorum_status_path, value, nullptr, event))
                        break;

                    ReplicatedMergeTreeQuorumEntry quorum_entry(value);

                    /// If the node has time to disappear, and then appear again for the next insert.
                    if (quorum_entry.part_name != part_name)
                        break;

                    if (!event->tryWait(quorum_timeout_ms))
                        throw Exception("Timeout while waiting for quorum");
                }

                /// And what if it is possible that the current replica at this time has ceased to be active and the quorum is marked as failed and deleted?
                String value;
                if (!zookeeper->tryGet(storage.replica_path + "/is_active", value, nullptr)
                    || value != is_active_node_value)
                    throw Exception("Replica become inactive while waiting for quorum");
            }
            catch (...)
            {
                /// We do not know whether or not data has been inserted
                /// - whether other replicas have time to download the part and mark the quorum as done.
                /// Any failure here (including the timeout above) is therefore reported as an unknown insert status.
                throw Exception("Unknown status, client must retry. Reason: " + getCurrentExceptionMessage(false),
                    ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
            }

            LOG_TRACE(log, "Quorum satisfied");
        }
    }
}