/// NOTE: This ID is used to create part names which are then persisted in ZK and as directory names on the file system.
/// So if you want to change this method, be sure to guarantee compatibility with existing table data.
String MergeTreePartition::getID(const Block & partition_key_sample) const
{
    if (value.size() != partition_key_sample.columns())
        throw Exception("Invalid partition key size: " + toString(value.size()), ErrorCodes::LOGICAL_ERROR);

    if (value.empty())
        return "all"; /// It is tempting to use an empty string here. But that would break directory structure in ZK.

    /// In case all partition fields are represented by integral types, try to produce a human-readable ID.
    /// Otherwise use a hex-encoded hash.
    bool are_all_integral = true;
    for (const Field & field : value)
    {
        if (field.getType() != Field::Types::UInt64 && field.getType() != Field::Types::Int64)
        {
            are_all_integral = false;
            break;
        }
    }

    String result;

    if (are_all_integral)
    {
        FieldVisitorToString to_string_visitor;
        for (size_t i = 0; i < value.size(); ++i)
        {
            if (i > 0)
                result += '-';

            if (typeid_cast<const DataTypeDate *>(partition_key_sample.getByPosition(i).type.get()))
                result += toString(DateLUT::instance().toNumYYYYMMDD(DayNum(value[i].safeGet<UInt64>())));
            else
                result += applyVisitor(to_string_visitor, value[i]);

            /// It is tempting to output DateTime as YYYYMMDDhhmmss, but that would make partition ID
            /// timezone-dependent.
        }

        return result;
    }

    SipHash hash;
    FieldVisitorHash hashing_visitor(hash);
    for (const Field & field : value)
        applyVisitor(hashing_visitor, field);

    char hash_data[16];
    hash.get128(hash_data);
    result.resize(32);
    for (size_t i = 0; i < 16; ++i)
        writeHexByteLowercase(hash_data[i], &result[2 * i]);

    return result;
}
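When not every partition field is integral, getID() falls back to a hex-encoded SipHash-128 of the field values. A minimal sketch of that encoding step on an arbitrary byte buffer (not part of the original source; it only reuses the SipHash, String and writeHexByteLowercase helpers already visible above):

/// Sketch only: hash a buffer with SipHash and render the 128-bit result as 32 lowercase hex characters,
/// mirroring the fallback branch of getID() above.
String hexEncodedSipHash128(const char * data, size_t size)
{
    SipHash hash;
    hash.update(data, size);

    char hash_data[16];
    hash.get128(hash_data);

    String result;
    result.resize(32);
    for (size_t i = 0; i < 16; ++i)
        writeHexByteLowercase(hash_data[i], &result[2 * i]);
    return result;
}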
Example #2
void IAST::getTreeHashImpl(SipHash & hash_state) const
{
    auto id = getID();
    hash_state.update(id.data(), id.size());

    size_t num_children = children.size();
    hash_state.update(reinterpret_cast<const char *>(&num_children), sizeof(num_children));

    for (const auto & child : children)
        child->getTreeHashImpl(hash_state);
}
Example #3
static Compiler::HashedKey getHash(const std::string & key)
{
    SipHash hash;

    auto revision = ClickHouseRevision::get();
    hash.update(reinterpret_cast<const char *>(&revision), sizeof(revision));
    hash.update(key.data(), key.size());

    Compiler::HashedKey res;
    hash.get128(res.first, res.second);
    return res;
}
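Since the server revision is mixed into the hash, the same textual key yields a different HashedKey after an upgrade, so stale compiled-code cache entries are never matched again. A small usage sketch (an assumption, not from the original source; it presumes Compiler::HashedKey is an ordered pair of UInt64, which the two-argument get128() call above suggests):

#include <map>
#include <string>

/// Hypothetical cache keyed by the 128-bit hash; the cache and function names are illustrative only.
static std::map<Compiler::HashedKey, std::string> compiled_code_cache;

static void rememberCompiledLibrary(const std::string & key, const std::string & library_path)
{
    compiled_code_cache[getHash(key)] = library_path;
}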
Example #4
void IAST::updateTreeHash(SipHash & hash_state) const
{
    updateTreeHashImpl(hash_state);
    hash_state.update(children.size());
    for (const auto & child : children)
        child->updateTreeHash(hash_state);
}
Example #5
void ColumnArray::updateHashWithValue(size_t n, SipHash & hash) const
{
    size_t array_size = sizeAt(n);
    size_t offset = offsetAt(n);

    hash.update(array_size);
    for (size_t i = 0; i < array_size; ++i)
        getData().updateHashWithValue(offset + i, hash);
}
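Hashing the array size before its elements keeps row boundaries visible to the hash: two rows [1] and [2] must not collide with a single row [1, 2], even though both feed the same element values. A minimal sketch of that distinction (not from the original source; it uses only the scalar SipHash::update overload seen in these examples):

/// Sketch only: identical element values, different row boundaries -> the hash states diverge.
void rowBoundariesMatter()
{
    SipHash two_short_rows;   /// rows [1] and [2]
    SipHash one_long_row;     /// row  [1, 2]

    UInt64 v1 = 1;
    UInt64 v2 = 2;

    size_t size_one = 1;
    two_short_rows.update(size_one);
    two_short_rows.update(v1);
    two_short_rows.update(size_one);
    two_short_rows.update(v2);

    size_t size_two = 2;
    one_long_row.update(size_two);
    one_long_row.update(v1);
    one_long_row.update(v2);

    /// The two states now differ only because the sizes were mixed in before the elements.
}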
Example #6
void IAST::updateTreeHashImpl(SipHash & hash_state) const
{
    auto id = getID();
    hash_state.update(id.data(), id.size());
}
Example #7
void ColumnFixedString::updateHashWithValue(size_t index, SipHash & hash) const
{
    hash.update(reinterpret_cast<const char *>(&chars[n * index]), n);
}
Example #8
template <typename T>
void ColumnVector<T>::updateHashWithValue(size_t n, SipHash & hash) const
{
    hash.update(data[n]);
}
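Examples #2 through #8 all follow one pattern: feed raw buffers and scalar values into a single SipHash state with update(), then finalize with get128(), either into a 16-byte buffer or as two UInt64 words. A condensed sketch combining those calls (not from the original source; the function name is made up):

#include <string>
#include <utility>

/// Sketch only: one hash state, mixed buffer and scalar updates, 128-bit result returned as two words.
std::pair<UInt64, UInt64> hashIdAndValue(const std::string & id, UInt64 value)
{
    SipHash hash;
    hash.update(id.data(), id.size());          /// raw buffer, as in Examples #2 and #6
    hash.update(value);                         /// scalar overload, as in Examples #4, #5 and #8

    std::pair<UInt64, UInt64> result;
    hash.get128(result.first, result.second);   /// two-word overload, as in Example #3
    return result;
}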
void ReplicatedMergeTreeBlockOutputStream::write(const Block & block)
{
    last_block_is_duplicate = false;

    /// TODO Is it possible to not lock the table structure here?
    storage.data.delayInsertOrThrowIfNeeded(&storage.partial_shutdown_event);

    auto zookeeper = storage.getZooKeeper();
    assertSessionIsNotExpired(zookeeper);

    /** If the write uses a quorum, check that the required number of replicas is currently live,
      *  and that the quorum has been reached for all previous parts that required one.
      * Also check that the replica was not reinitialized or disabled during the insertion (judging by the value of the `is_active` node).
      * TODO: The logic is too complex; it could be simplified.
      */
    if (quorum)
        checkQuorumPrecondition(zookeeper);

    auto part_blocks = storage.writer.splitBlockIntoParts(block);

    for (auto & current_block : part_blocks)
    {
        Stopwatch watch;

        /// Write the part to the filesystem under a temporary name. Calculate a checksum.

        MergeTreeData::MutableDataPartPtr part = storage.writer.writeTempPart(current_block);

        String block_id;

        if (deduplicate)
        {
            SipHash hash;
            part->checksums.computeTotalChecksumDataOnly(hash);
            union
            {
                char bytes[16];
                UInt64 words[2];
            } hash_value;
            hash.get128(hash_value.bytes);

            /// We include the hash of the data and the partition identifier in the deduplication ID,
            /// so that the same data is not inserted into the same partition twice.
            block_id = part->info.partition_id + "_" + toString(hash_value.words[0]) + "_" + toString(hash_value.words[1]);

            LOG_DEBUG(log, "Wrote block with ID '" << block_id << "', " << block.rows() << " rows");
        }
        else
        {
            LOG_DEBUG(log, "Wrote block with " << block.rows() << " rows");
        }

        try
        {
            commitPart(zookeeper, part, block_id);

            /// Set a special error code if the block is a duplicate
            int error = (deduplicate && last_block_is_duplicate) ? ErrorCodes::INSERT_WAS_DEDUPLICATED : 0;
            PartLog::addNewPart(storage.context, part, watch.elapsed(), ExecutionStatus(error));
        }
        catch (...)
        {
            PartLog::addNewPart(storage.context, part, watch.elapsed(), ExecutionStatus::fromCurrentException(__PRETTY_FUNCTION__));
            throw;
        }
    }
}
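The deduplication ID built above is simply the partition ID followed by the two 64-bit words of the SipHash-128 checksum of the part's data. A minimal sketch of that assembly (not from the original source; it mirrors the union trick used in the code above):

/// Sketch only: combine a partition ID with a finalized 128-bit data checksum into a block ID.
String makeDeduplicationBlockId(const String & partition_id, SipHash & hash)
{
    union
    {
        char bytes[16];
        UInt64 words[2];
    } hash_value;
    hash.get128(hash_value.bytes);

    return partition_id + "_" + toString(hash_value.words[0]) + "_" + toString(hash_value.words[1]);
}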
Example #10
int main(int argc, char ** argv)
{
	using Strings = std::vector<std::string>;
	using Hashes = std::vector<char>;
	Strings strings;
	size_t rows = 0;
	size_t bytes = 0;

	{
		Stopwatch watch;

		DB::ReadBufferFromFileDescriptor in(STDIN_FILENO);

		while (!in.eof())
		{
			strings.push_back(std::string());
			DB::readEscapedString(strings.back(), in);
			DB::assertChar('\n', in);
			bytes += strings.back().size() + 1;
		}

		watch.stop();
		rows = strings.size();
		std::cerr << std::fixed << std::setprecision(2)
			<< "Read " << rows << " rows, " << bytes / 1000000.0 << " MB"
			<< ", elapsed: " << watch.elapsedSeconds()
			<< " (" << rows / watch.elapsedSeconds() << " rows/sec., " << bytes / 1000000.0 / watch.elapsedSeconds() << " MB/sec.)"
			<< std::endl;
	}

	Hashes hashes(16 * rows);

	{
		Stopwatch watch;

		for (size_t i = 0; i < rows; ++i)
		{
			*reinterpret_cast<UInt64*>(&hashes[i * 16]) = CityHash64(strings[i].data(), strings[i].size());
		}

		watch.stop();

		UInt64 check = CityHash64(&hashes[0], hashes.size());

		std::cerr << std::fixed << std::setprecision(2)
			<< "CityHash64 (check = " << check << ")"
			<< ", elapsed: " << watch.elapsedSeconds()
			<< " (" << rows / watch.elapsedSeconds() << " rows/sec., " << bytes / 1000000.0 / watch.elapsedSeconds() << " MB/sec.)"
			<< std::endl;
	}
	
/*	{
		Stopwatch watch;

		std::vector<char> seed(16);

		for (size_t i = 0; i < rows; ++i)
		{
			sipHash(
				reinterpret_cast<unsigned char *>(&hashes[i * 16]),
				reinterpret_cast<const unsigned char *>(strings[i].data()),
				strings[i].size(),
				reinterpret_cast<const unsigned char *>(&seed[0]));
		}

		watch.stop();

		UInt64 check = CityHash64(&hashes[0], hashes.size());

		std::cerr << std::fixed << std::setprecision(2)
			<< "SipHash (check = " << check << ")"
			<< ", elapsed: " << watch.elapsedSeconds()
			<< " (" << rows / watch.elapsedSeconds() << " rows/sec., " << bytes / 1000000.0 / watch.elapsedSeconds() << " MB/sec.)"
			<< std::endl;
	}*/

	{
		Stopwatch watch;

		for (size_t i = 0; i < rows; ++i)
		{
			SipHash hash;
			hash.update(strings[i].data(), strings[i].size());
			hash.get128(&hashes[i * 16]);
		}

		watch.stop();

		UInt64 check = CityHash64(&hashes[0], hashes.size());

		std::cerr << std::fixed << std::setprecision(2)
			<< "SipHash, stream (check = " << check << ")"
			<< ", elapsed: " << watch.elapsedSeconds()
			<< " (" << rows / watch.elapsedSeconds() << " rows/sec., " << bytes / 1000000.0 / watch.elapsedSeconds() << " MB/sec.)"
			<< std::endl;
	}

	{
		Stopwatch watch;

		for (size_t i = 0; i < rows; ++i)
		{
			MD5_CTX state;
			MD5_Init(&state);
			MD5_Update(&state, reinterpret_cast<const unsigned char *>(strings[i].data()), strings[i].size());
			MD5_Final(reinterpret_cast<unsigned char *>(&hashes[i * 16]), &state);
		}

		watch.stop();

		UInt64 check = CityHash64(&hashes[0], hashes.size());

		std::cerr << std::fixed << std::setprecision(2)
			<< "MD5 (check = " << check << ")"
			<< ", elapsed: " << watch.elapsedSeconds()
			<< " (" << rows / watch.elapsedSeconds() << " rows/sec., " << bytes / 1000000.0 / watch.elapsedSeconds() << " MB/sec.)"
			<< std::endl;
	}

	return 0;
}
void ReplicatedMergeTreeBlockOutputStream::write(const Block & block)
{
    /// TODO: Is it possible to avoid locking the table structure here?
    storage.data.delayInsertIfNeeded(&storage.restarting_thread->getWakeupEvent());

    auto zookeeper = storage.getZooKeeper();

    assertSessionIsNotExpired(zookeeper);

    /** If the write uses a quorum, check that the required number of replicas is currently live,
      *  and that the quorum has been reached for all previous parts that required one.
      * Also check that the replica was not reinitialized or disabled during the insertion (judging by the value of the `is_active` node).
      * TODO: The logic is too complex; it could be simplified.
      */
    String quorum_status_path = storage.zookeeper_path + "/quorum/status";
    String is_active_node_value;
    int is_active_node_version = -1;
    int host_node_version = -1;
    if (quorum)
    {
        zkutil::ZooKeeper::TryGetFuture quorum_status_future = zookeeper->asyncTryGet(quorum_status_path);
        zkutil::ZooKeeper::TryGetFuture is_active_future = zookeeper->asyncTryGet(storage.replica_path + "/is_active");
        zkutil::ZooKeeper::TryGetFuture host_future = zookeeper->asyncTryGet(storage.replica_path + "/host");

        /// List of live replicas. All of them register an ephemeral node for leader_election.

        zkutil::Stat leader_election_stat;
        zookeeper->get(storage.zookeeper_path + "/leader_election", &leader_election_stat);

        if (leader_election_stat.numChildren < static_cast<int32_t>(quorum))
            throw Exception("Number of alive replicas ("
                + toString(leader_election_stat.numChildren) + ") is less than requested quorum (" + toString(quorum) + ").",
                ErrorCodes::TOO_LESS_LIVE_REPLICAS);

        /** Is there a quorum for the last part that required one?
            * Writes of all parts with quorum enabled are linearly ordered.
            * This means that at any moment there can be only one part
            *  that still needs a quorum but has not yet reached it.
            * Information about that part is located in the `/quorum/status` node.
            * Once the quorum is reached, the node is deleted.
            */

        auto quorum_status = quorum_status_future.get();
        if (quorum_status.exists)
            throw Exception("Quorum for previous write has not been satisfied yet. Status: " + quorum_status.value, ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);

        /// Both checks are also made implicitly later (otherwise there would be a race condition).

        auto is_active = is_active_future.get();
        auto host = host_future.get();

        if (!is_active.exists || !host.exists)
            throw Exception("Replica is not active right now", ErrorCodes::READONLY);

        is_active_node_value = is_active.value;
        is_active_node_version = is_active.stat.version;
        host_node_version = host.stat.version;
    }

    auto part_blocks = storage.writer.splitBlockIntoParts(block);

    for (auto & current_block : part_blocks)
    {
        assertSessionIsNotExpired(zookeeper);

        ++block_index;
        String block_id = insert_id.empty() ? "" : insert_id + "__" + toString(block_index);
        String month_name = toString(DateLUT::instance().toNumYYYYMMDD(DayNum_t(current_block.min_date)) / 100);

        AbandonableLockInZooKeeper block_number_lock = storage.allocateBlockNumber(month_name);    /// 2 RTT

        Int64 part_number = block_number_lock.getNumber();

        MergeTreeData::MutableDataPartPtr part = storage.writer.writeTempPart(current_block, part_number);
        String part_name = ActiveDataPartSet::getPartName(part->left_date, part->right_date, part->left, part->right, part->level);

        /// Hash of the data.
        SipHash hash;
        part->checksums.summaryDataChecksum(hash);
        union
        {
            char bytes[16];
            UInt64 words[2];
        } hash_value;
        hash.get128(hash_value.bytes);

        String checksum(hash_value.bytes, 16);

        /// If no ID is specified in the query, we take the hash of the data as the ID, so the same data is not inserted twice.
        /// NOTE: If this deduplication is not needed, `block_id` could be left empty instead.
        ///       A setting or query syntax (for example, `ID = null`) could be added for this.
        if (block_id.empty())
        {
            block_id = toString(hash_value.words[0]) + "_" + toString(hash_value.words[1]);

            if (block_id.empty())
                throw Exception("Logical error: block_id is empty.", ErrorCodes::LOGICAL_ERROR);
        }

        LOG_DEBUG(log, "Wrote block " << part_number << " with ID " << block_id << ", " << current_block.block.rows() << " rows");

        StorageReplicatedMergeTree::LogEntry log_entry;
        log_entry.type = StorageReplicatedMergeTree::LogEntry::GET_PART;
        log_entry.create_time = time(0);
        log_entry.source_replica = storage.replica_name;
        log_entry.new_part_name = part_name;
        log_entry.quorum = quorum;
        log_entry.block_id = block_id;

        /// Simultaneously add information about the part to all the necessary places in ZooKeeper and remove block_number_lock.

        /// Information about the block.
        zkutil::Ops ops;
        auto acl = zookeeper->getDefaultACL();

        ops.emplace_back(
            std::make_unique<zkutil::Op::Create>(
                storage.zookeeper_path + "/blocks/" + block_id,
                "",
                acl,
                zkutil::CreateMode::Persistent));
        ops.emplace_back(
            std::make_unique<zkutil::Op::Create>(
                storage.zookeeper_path + "/blocks/" + block_id + "/checksum",
                checksum,
                acl,
                zkutil::CreateMode::Persistent));
        ops.emplace_back(
            std::make_unique<zkutil::Op::Create>(
                storage.zookeeper_path + "/blocks/" + block_id + "/number",
                toString(part_number),
                acl,
                zkutil::CreateMode::Persistent));

        /// Information about the part, in the replica data.
        storage.addNewPartToZooKeeper(part, ops, part_name);

        /// Replication log.
        ops.emplace_back(std::make_unique<zkutil::Op::Create>(
            storage.zookeeper_path + "/log/log-",
            log_entry.toString(),
            acl,
            zkutil::CreateMode::PersistentSequential));

        /// Deletes the information that the block number is used for writing.
        block_number_lock.getUnlockOps(ops);

        /** If a quorum is needed, create a node in which the quorum is tracked.
            * (If such a node already exists, someone has managed to start another quorum write at the same time,
            *  and that write has not yet reached its quorum; a new quorum write cannot start until it does.)
            */
        if (quorum)
        {
            ReplicatedMergeTreeQuorumEntry quorum_entry;
            quorum_entry.part_name = part_name;
            quorum_entry.required_number_of_replicas = quorum;
            quorum_entry.replicas.insert(storage.replica_name);

            /** At this point, the node records that the current replica has received the part.
                * When other replicas receive this part (in the usual way, by processing the replication log),
                *  they will add themselves to the contents of this node.
                * Once it lists `quorum` replicas, the node is deleted, which indicates that the quorum has been reached.
                */

            ops.emplace_back(
                std::make_unique<zkutil::Op::Create>(
                    quorum_status_path,
                    quorum_entry.toString(),
                    acl,
                    zkutil::CreateMode::Persistent));

            /// Make sure that during the insertion, the replica was not reinitialized or disabled (e.g. when the server is shutting down).
            ops.emplace_back(
                std::make_unique<zkutil::Op::Check>(
                    storage.replica_path + "/is_active",
                    is_active_node_version));

            /// Unfortunately, just checking the above is not enough, because the `is_active` node can be deleted and reappear with the same version.
            /// But then the `host` value will change, so we check that as well.
            /// Conveniently, these two nodes change in the same transaction (see MergeTreeRestartingThread).
            ops.emplace_back(
                std::make_unique<zkutil::Op::Check>(
                    storage.replica_path + "/host",
                    host_node_version));
        }

        MergeTreeData::Transaction transaction; /// If the part cannot be added to ZK, we will remove it from the working set again.
        storage.data.renameTempPartAndAdd(part, nullptr, &transaction);

        try
        {
            auto code = zookeeper->tryMulti(ops);
            if (code == ZOK)
            {
                transaction.commit();
                storage.merge_selecting_event.set();
            }
            else if (code == ZNODEEXISTS)
            {
                /// If a block with the same ID already exists in the table, roll back its insertion.
                String expected_checksum;
                if (!block_id.empty() && zookeeper->tryGet(
                    storage.zookeeper_path + "/blocks/" + block_id + "/checksum", expected_checksum))
                {
                    LOG_INFO(log, "Block with ID " << block_id << " already exists; ignoring it (removing part " << part->name << ")");

                    /// If the data differs from what was inserted earlier with the same ID, throw an exception.
                    if (expected_checksum != checksum)
                    {
                        if (!insert_id.empty())
                            throw Exception("Attempt to insert block with same ID but different checksum", ErrorCodes::CHECKSUM_DOESNT_MATCH);
                        else
                            throw Exception("Logical error: got ZNODEEXISTS while inserting data, block ID is derived from checksum but checksum doesn't match", ErrorCodes::LOGICAL_ERROR);
                    }

                    transaction.rollback();
                }
                else if (zookeeper->exists(quorum_status_path))
                {
                    transaction.rollback();

                    throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);
                }
                else
                {
                    /// E.g. if the node with the quorum existed but was quickly removed.

                    throw Exception("Unexpected ZNODEEXISTS while adding block " + toString(part_number) + " with ID " + block_id + ": "
                        + zkutil::ZooKeeper::error2string(code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
                }
            }
            else
            {
                throw Exception("Unexpected error while adding block " + toString(part_number) + " with ID " + block_id + ": "
                    + zkutil::ZooKeeper::error2string(code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
            }
        }
        catch (const zkutil::KeeperException & e)
        {
            /** If the connection is lost and we do not know whether the changes were applied, we must not delete the local part:
                *  if the changes were applied, the inserted block has appeared in `/blocks/` and cannot be inserted again.
                */
            if (e.code == ZOPERATIONTIMEOUT ||
                e.code == ZCONNECTIONLOSS)
            {
                transaction.commit();
                storage.enqueuePartForCheck(part->name, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER);

                /// We do not know whether or not data has been inserted.
                throw Exception("Unknown status, client must retry. Reason: " + e.displayText(), ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
            }

            throw;
        }

        if (quorum)
        {
            /// We are waiting for the quorum to be reached.
            LOG_TRACE(log, "Waiting for quorum");

            try
            {
                while (true)
                {
                    zkutil::EventPtr event = std::make_shared<Poco::Event>();

                    std::string value;
                    /// `get` instead of `exists` so that `watch` does not leak if the node is no longer there.
                    if (!zookeeper->tryGet(quorum_status_path, value, nullptr, event))
                        break;

                    ReplicatedMergeTreeQuorumEntry quorum_entry(value);

                    /// The node may have had time to disappear and then reappear for the next insert.
                    if (quorum_entry.part_name != part_name)
                        break;

                    if (!event->tryWait(quorum_timeout_ms))
                        throw Exception("Timeout while waiting for quorum");
                }

                /// What if the current replica has meanwhile ceased to be active, and the quorum has been marked as failed and deleted?
                String value;
                if (!zookeeper->tryGet(storage.replica_path + "/is_active", value, nullptr)
                    || value != is_active_node_value)
                    throw Exception("Replica become inactive while waiting for quorum");
            }
            catch (...)
            {
                /// We do not know whether the data has been inserted:
                /// other replicas may or may not have had time to download the part and mark the quorum as satisfied.
                throw Exception("Unknown status, client must retry. Reason: " + getCurrentExceptionMessage(false),
                    ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
            }

            LOG_TRACE(log, "Quorum satisfied");
        }
    }
}