std::ostream& operator<<(std::ostream& os, const TransferStats& stats) { folly::RWSpinLock::ReadHolder lock(stats.mutex_.get()); double headerOverhead = 100; double failureOverhead = 100; if (stats.effectiveDataBytes_ > 0) { headerOverhead = 100.0 * stats.headerBytes_ / stats.effectiveDataBytes_; failureOverhead = 100.0 * (stats.dataBytes_ - stats.effectiveDataBytes_) / stats.effectiveDataBytes_; } if (stats.errCode_ == OK && stats.remoteErrCode_ == OK) { os << "Transfer status = OK."; } else { os << "Transfer status (local) = " << errorCodeToStr(stats.errCode_) << ", (remote) = " << errorCodeToStr(stats.remoteErrCode_) << "."; } if (stats.numFiles_ > 0) { os << " Number of files transferred = " << stats.numFiles_ << "."; } else { os << " Number of blocks transferred = " << stats.numBlocks_ << "."; } os << " Data Mbytes = " << stats.effectiveDataBytes_ / kMbToB << ". Header kBytes = " << stats.headerBytes_ / 1024. << " (" << headerOverhead << "% overhead)" << ". Total bytes = " << (stats.dataBytes_ + stats.headerBytes_) << ". Wasted bytes due to failure = " << (stats.dataBytes_ - stats.effectiveDataBytes_) << " (" << failureOverhead << "% overhead)."; return os; }
TransferReport::TransferReport( std::vector<TransferStats>& transferredSourceStats, std::vector<TransferStats>& failedSourceStats, std::vector<TransferStats>& threadStats, std::vector<std::string>& failedDirectories, double totalTime, int64_t totalFileSize, int64_t numDiscoveredFiles) : transferredSourceStats_(std::move(transferredSourceStats)), failedSourceStats_(std::move(failedSourceStats)), threadStats_(std::move(threadStats)), failedDirectories_(std::move(failedDirectories)), totalTime_(totalTime), totalFileSize_(totalFileSize) { for (const auto& stats : threadStats_) { summary_ += stats; } ErrorCode summaryErrorCode = summary_.getErrorCode(); bool atLeastOneOk = false; for (auto& stats : threadStats_) { if (stats.getErrorCode() == OK) { atLeastOneOk = true; break; } } LOG(INFO) << "Error code summary " << errorCodeToStr(summaryErrorCode); // none of the files or directories failed bool possiblyOk = true; if (!failedDirectories_.empty()) { possiblyOk = false; summaryErrorCode = getMoreInterestingError(summaryErrorCode, BYTE_SOURCE_READ_ERROR); } for (const auto& sourceStat : failedSourceStats_) { possiblyOk = false; summaryErrorCode = getMoreInterestingError(summaryErrorCode, sourceStat.getErrorCode()); } if (possiblyOk && atLeastOneOk) { if (summaryErrorCode != OK) { LOG(WARNING) << "WDT successfully recovered from error " << errorCodeToStr(summaryErrorCode); } summaryErrorCode = OK; } setErrorCode(summaryErrorCode); if (summary_.getEffectiveDataBytes() != totalFileSize_) { // sender did not send all the bytes LOG(INFO) << "Could not send all the bytes " << totalFileSize_ << " " << summary_.getEffectiveDataBytes(); WDT_CHECK(summaryErrorCode != OK) << "BUG: All threads OK yet sized based error detected"; } std::set<std::string> failedFilesSet; for (auto& stats : failedSourceStats_) { failedFilesSet.insert(stats.getId()); } int64_t numTransferredFiles = numDiscoveredFiles - failedFilesSet.size(); summary_.setNumFiles(numTransferredFiles); }
ErrorCode WdtSocket::closeConnectionInternal(bool doTagIOs) { VLOG(1) << "Closing socket " << port_ << " " << fd_; if (fd_ < 0) { return OK; } ErrorCode errorCode = getNonRetryableErrCode(); if (!writesFinalized_) { errorCode = getMoreInterestingError(errorCode, finalizeWrites(doTagIOs)); } if (!readsFinalized_) { errorCode = getMoreInterestingError(errorCode, finalizeReads(doTagIOs)); } if (::close(fd_) != 0) { PLOG(ERROR) << "Failed to close socket " << fd_ << " " << port_; errorCode = getMoreInterestingError(ERROR, errorCode); } // This looks like a reset() make it explicit (and check it's complete) fd_ = -1; readErrorCode_ = OK; writeErrorCode_ = OK; encryptionSettingsRead_ = false; encryptionSettingsWritten_ = false; writesFinalized_ = false; readsFinalized_ = false; VLOG(1) << "Error code from close " << errorCodeToStr(errorCode); return errorCode; }
TransferReport::TransferReport(TransferStats&& globalStats) : summary_(std::move(globalStats)) { const int64_t numBlocksSend = summary_.getNumBlocksSend(); const int64_t numBlocksReceived = summary_.getNumBlocks(); const int64_t numBytesSend = summary_.getTotalSenderBytes(); const int64_t numBytesReceived = summary_.getEffectiveDataBytes(); ErrorCode summaryErrorCode = summary_.getLocalErrorCode(); VLOG(1) << "Summary Error Code " << summaryErrorCode; // TODO this is messy and error (ah!) prone if (numBlocksSend == -1 || numBlocksSend != numBlocksReceived) { LOG(ERROR) << "Did not receive all the blocks sent by the sender " << numBlocksSend << " " << numBlocksReceived; summaryErrorCode = getMoreInterestingError(summaryErrorCode, ERROR); } else if (numBytesSend != -1 && numBytesSend != numBytesReceived) { // did not receive all the bytes LOG(ERROR) << "Number of bytes sent and received do not match " << numBytesSend << " " << numBytesReceived; summaryErrorCode = getMoreInterestingError(summaryErrorCode, ERROR); } else { // We got all the bytes... but if we have an encryption error we should // make it stick (unlike the connection error on a single port...) if (summaryErrorCode != OK && summaryErrorCode != ENCRYPTION_ERROR) { LOG(WARNING) << "All bytes received, turning " << errorCodeToStr(summaryErrorCode) << " into local OK"; summaryErrorCode = OK; } } // only the local error code is set here. Any remote error means transfer // failure. Since, getErrorCode checks both local and remote codes, it will // return the correct one summary_.setLocalErrorCode(summaryErrorCode); }
ErrorCode SenderThread::readNextReceiverCmd() { int numUnackedBytes = socket_->getUnackedBytes(); int timeToClearSendBuffer = 0; Clock::time_point startTime = Clock::now(); while (true) { int numRead = socket_->read(buf_, 1); if (numRead == 1) { return OK; } if (getThreadAbortCode() != OK) { return ABORT; } if (numRead == 0) { WTPLOG(ERROR) << "Got unexpected EOF, reconnecting"; return SOCKET_READ_ERROR; } WDT_CHECK_LT(numRead, 0); ErrorCode errCode = socket_->getReadErrCode(); WTLOG(ERROR) << "Failed to read receiver cmd " << numRead << " " << errorCodeToStr(errCode); if (errCode != WDT_TIMEOUT) { // not timed out return SOCKET_READ_ERROR; } int curUnackedBytes = socket_->getUnackedBytes(); if (numUnackedBytes < 0 || curUnackedBytes < 0) { WTLOG(ERROR) << "Failed to read number of unacked bytes, reconnecting"; return SOCKET_READ_ERROR; } WDT_CHECK_GE(numUnackedBytes, curUnackedBytes); if (curUnackedBytes == 0) { timeToClearSendBuffer = durationMillis(Clock::now() - startTime); break; } if (curUnackedBytes == numUnackedBytes) { WTLOG(ERROR) << "Number of unacked bytes did not change, reconnecting " << curUnackedBytes; return SOCKET_READ_ERROR; } WTLOG(INFO) << "Read receiver command failed, but number of unacked " "bytes decreased, retrying socket read " << numUnackedBytes << " " << curUnackedBytes; numUnackedBytes = curUnackedBytes; } // we are assuming that sender and receiver tcp buffer sizes are same. So, we // expect another timeToClearSendBuffer milliseconds for receiver to clear its // buffer int readTimeout = timeToClearSendBuffer + options_.drain_extra_ms; WTLOG(INFO) << "Send buffer cleared in " << timeToClearSendBuffer << "ms, waiting for " << readTimeout << "ms for receiver buffer to clear"; // readWithTimeout internally checks for abort periodically int numRead = socket_->readWithTimeout(buf_, 1, readTimeout); if (numRead != 1) { WTLOG(ERROR) << "Failed to read receiver cmd " << numRead; return SOCKET_READ_ERROR; } return OK; }
ErrorCode Wdt::wdtSend(const std::string &wdtNamespace, const WdtTransferRequest &req, std::shared_ptr<IAbortChecker> abortChecker, bool terminateExistingOne) { if (!settingsApplied_) { applySettings(); } if (req.errorCode != OK) { LOG(ERROR) << "Transfer request error " << errorCodeToStr(req.errorCode); return req.errorCode; } // Protocol issues will/should be flagged as error when we call createSender // try to create sender SenderPtr sender; auto wdtController = WdtResourceController::get(); // TODO should be using recoverid const std::string secondKey = req.hostName; ErrorCode errCode = wdtController->createSender(wdtNamespace, secondKey, req, sender); if (errCode == ALREADY_EXISTS && terminateExistingOne) { LOG(WARNING) << "Found pre-existing sender for " << wdtNamespace << " " << secondKey << " aborting it and making a new one"; sender->abort(ABORTED_BY_APPLICATION); // This may log an error too wdtController->releaseSender(wdtNamespace, secondKey); // Try#2 errCode = wdtController->createSender(wdtNamespace, secondKey, req, sender); } if (errCode != OK) { LOG(ERROR) << "Failed to create sender " << errorCodeToStr(errCode) << " " << wdtNamespace << " " << secondKey; return errCode; } wdtSetAbortSocketCreatorAndReporter(wdtNamespace, sender.get(), req, abortChecker); auto transferReport = sender->transfer(); ErrorCode ret = transferReport->getSummary().getErrorCode(); wdtController->releaseSender(wdtNamespace, secondKey); LOG(INFO) << "wdtSend for " << wdtNamespace << " " << secondKey << " " << " ended with " << errorCodeToStr(ret); return ret; }
string WdtTransferRequest::generateUrl(bool genFull) const { if (errorCode == ERROR || errorCode == URI_PARSE_ERROR) { LOG(ERROR) << "Transfer request has errors present "; return errorCodeToStr(errorCode); } WdtUri wdtUri; wdtUri.setHostName(hostName); wdtUri.setQueryParam(TRANSFER_ID_PARAM, transferId); wdtUri.setQueryParam(PROTOCOL_VERSION_PARAM, folly::to<string>(protocolVersion)); wdtUri.setQueryParam(PORTS_PARAM, getSerializedPortsList()); if (genFull) { wdtUri.setQueryParam(DIRECTORY_PARAM, directory); } return wdtUri.generateUrl(); }
void SenderThread::start() { Clock::time_point startTime = Clock::now(); if (buf_ == nullptr) { WTLOG(ERROR) << "Unable to allocate buffer"; threadStats_.setLocalErrorCode(MEMORY_ALLOCATION_ERROR); return; } setFooterType(); controller_->executeAtStart([&]() { wdtParent_->startNewTransfer(); }); SenderState state = CONNECT; while (state != END) { ErrorCode abortCode = getThreadAbortCode(); if (abortCode != OK) { WTLOG(ERROR) << "Transfer aborted " << errorCodeToStr(abortCode); threadStats_.setLocalErrorCode(ABORT); if (abortCode == VERSION_MISMATCH) { state = PROCESS_VERSION_MISMATCH; } else { break; } } state = (this->*stateMap_[state])(); } EncryptionType encryptionType = (socket_ ? socket_->getEncryptionType() : ENC_NONE); threadStats_.setEncryptionType(encryptionType); double totalTime = durationSeconds(Clock::now() - startTime); WTLOG(INFO) << "Port " << port_ << " done. " << threadStats_ << " Total throughput = " << threadStats_.getEffectiveTotalBytes() / totalTime / kMbToB << " Mbytes/sec"; ThreadTransferHistory &transferHistory = getTransferHistory(); transferHistory.markNotInUse(); controller_->deRegisterThread(threadIndex_); controller_->executeAtEnd([&]() { wdtParent_->endCurTransfer(); }); // Important to delete the socket before the thread dies for sub class // of clientsocket which have thread local data socket_ = nullptr; return; }
SenderState SenderThread::connect() { WTVLOG(1) << "entered CONNECT state"; if (socket_) { ErrorCode socketErrCode = socket_->getNonRetryableErrCode(); if (socketErrCode != OK) { WTLOG(ERROR) << "Socket has non-retryable error " << errorCodeToStr(socketErrCode); threadStats_.setLocalErrorCode(socketErrCode); return END; } socket_->closeNoCheck(); } if (numReconnectWithoutProgress_ >= options_.max_transfer_retries) { WTLOG(ERROR) << "Sender thread reconnected " << numReconnectWithoutProgress_ << " times without making any progress, giving up. port: " << socket_->getPort(); threadStats_.setLocalErrorCode(NO_PROGRESS); return END; } ErrorCode code; // TODO cleanup more but for now avoid having 2 socket object live per port socket_ = nullptr; socket_ = connectToReceiver(port_, threadCtx_->getAbortChecker(), code); if (code == ABORT) { threadStats_.setLocalErrorCode(ABORT); if (getThreadAbortCode() == VERSION_MISMATCH) { return PROCESS_VERSION_MISMATCH; } return END; } if (code != OK) { threadStats_.setLocalErrorCode(code); return END; } auto nextState = SEND_SETTINGS; if (threadStats_.getLocalErrorCode() != OK) { nextState = READ_LOCAL_CHECKPOINT; } // resetting the status of thread reset(); return nextState; }
SenderState SenderThread::processDoneCmd() { WTVLOG(1) << "entered PROCESS_DONE_CMD state"; // DONE cmd implies that all the blocks sent till now is acked ThreadTransferHistory &transferHistory = getTransferHistory(); transferHistory.markAllAcknowledged(); // send ack for DONE buf_[0] = Protocol::DONE_CMD; socket_->write(buf_, 1); socket_->shutdownWrites(); ErrorCode retCode = socket_->expectEndOfStream(); if (retCode != OK) { WTLOG(WARNING) << "Logical EOF not found when expected " << errorCodeToStr(retCode); threadStats_.setLocalErrorCode(retCode); return CONNECT; } WTVLOG(1) << "done with transfer, port " << port_; return END; }
SenderState SenderThread::processAbortCmd() { WTLOG(INFO) << "entered PROCESS_ABORT_CMD state "; ThreadTransferHistory &transferHistory = getTransferHistory(); threadStats_.setLocalErrorCode(ABORT); int toRead = Protocol::kAbortLength; auto numRead = socket_->read(buf_, toRead); if (numRead != toRead) { // can not read checkpoint, but still must exit because of ABORT WTLOG(ERROR) << "Error while trying to read ABORT cmd " << numRead << " " << toRead; return END; } int64_t offset = 0; int32_t negotiatedProtocol; ErrorCode remoteError; int64_t checkpoint; Protocol::decodeAbort(buf_, offset, bufSize_, negotiatedProtocol, remoteError, checkpoint); threadStats_.setRemoteErrorCode(remoteError); std::string failedFileName = transferHistory.getSourceId(checkpoint); WTLOG(WARNING) << "Received abort on " << " remote protocol version " << negotiatedProtocol << " remote error code " << errorCodeToStr(remoteError) << " file " << failedFileName << " checkpoint " << checkpoint; wdtParent_->abort(remoteError); if (remoteError == VERSION_MISMATCH) { if (Protocol::negotiateProtocol( negotiatedProtocol, threadProtocolVersion_) == negotiatedProtocol) { // sender can support this negotiated version negotiatedProtocol_ = negotiatedProtocol; return PROCESS_VERSION_MISMATCH; } else { WTLOG(ERROR) << "Sender can not support receiver version " << negotiatedProtocol; threadStats_.setRemoteErrorCode(VERSION_INCOMPATIBLE); } } return END; }
ErrorCode TransferLogManager::parseVerifyAndFix( const std::string &recoveryId, int64_t config, bool parseOnly, std::vector<FileChunksInfo> &parsedInfo) { if (fd_ < 0) { return INVALID_LOG; } LogParser parser(options_, encoderDecoder_, rootDir_, recoveryId, config, parseOnly); resumptionStatus_ = parser.parseLog(fd_, senderIp_, parsedInfo); if (resumptionStatus_ == INVALID_LOG) { // leave the log, but close it. Keeping the invalid log ensures that the // directory remains invalid. Closing the log means that nothing else can // be written to it again closeLog(); } else if (resumptionStatus_ == INCONSISTENT_DIRECTORY) { if (!parseOnly) { // if we are only parsing, we should not modify anything invalidateDirectory(); } } LOG(INFO) << "Transfer log parsing finished " << errorCodeToStr(resumptionStatus_); return resumptionStatus_; }
const WdtTransferRequest &Receiver::init() { if (validateTransferRequest() != OK) { WLOG(ERROR) << "Couldn't validate the transfer request " << transferRequest_.getLogSafeString(); return transferRequest_; } checkAndUpdateBufferSize(); backlog_ = options_.backlog; if (getTransferId().empty()) { setTransferId(WdtBase::generateTransferId()); } setProtocolVersion(transferRequest_.protocolVersion); setDir(transferRequest_.directory); auto numThreads = transferRequest_.ports.size(); // This creates the destination directory (which is needed for transferLogMgr) fileCreator_.reset(new FileCreator(destDir_, numThreads, transferLogManager_, options_.skip_writes)); // Make sure we can get the lock on the transfer log manager early // so if we can't we don't generate a valid but useless url and end up // starting a sender doomed to fail if (options_.enable_download_resumption) { WDT_CHECK(!options_.skip_writes) << "Can not skip transfers with download resumption turned on"; if (options_.resume_using_dir_tree) { WDT_CHECK(!options_.shouldPreallocateFiles()) << "Can not resume using directory tree if preallocation is enabled"; } ErrorCode errCode = transferLogManager_.openLog(); if (errCode != OK) { WLOG(ERROR) << "Failed to open transfer log " << errorCodeToStr(errCode); transferRequest_.errorCode = errCode; return transferRequest_; } ErrorCode code = transferLogManager_.parseAndMatch( recoveryId_, getTransferConfig(), fileChunksInfo_); if (code == OK && options_.resume_using_dir_tree) { WDT_CHECK(fileChunksInfo_.empty()); traverseDestinationDir(fileChunksInfo_); } } EncryptionType encryptionType = parseEncryptionType(options_.encryption_type); // is encryption enabled? bool encrypt = (encryptionType != ENC_NONE && protocolVersion_ >= Protocol::ENCRYPTION_V1_VERSION); if (encrypt) { WLOG(INFO) << encryptionTypeToStr(encryptionType) << " encryption is enabled for this transfer "; if (!transferRequest_.encryptionData.isSet()) { WLOG(INFO) << "Receiver generating encryption key for type " << encryptionTypeToStr(encryptionType); transferRequest_.encryptionData = EncryptionParams::generateEncryptionParams(encryptionType); } if (!transferRequest_.encryptionData.isSet()) { WLOG(ERROR) << "Unable to generate encryption key for type " << encryptionTypeToStr(encryptionType); transferRequest_.errorCode = ENCRYPTION_ERROR; return transferRequest_; } } else { if (encryptionType != ENC_NONE) { WLOG(WARNING) << "Encryption is enabled, but protocol version is " << protocolVersion_ << ", minimum version required for encryption is " << Protocol::ENCRYPTION_V1_VERSION; } transferRequest_.encryptionData.erase(); } threadsController_ = new ThreadsController(numThreads); threadsController_->setNumFunnels(ReceiverThread::NUM_FUNNELS); threadsController_->setNumBarriers(ReceiverThread::NUM_BARRIERS); threadsController_->setNumConditions(ReceiverThread::NUM_CONDITIONS); // TODO: take transferRequest directly ! receiverThreads_ = threadsController_->makeThreads<Receiver, ReceiverThread>( this, transferRequest_.ports.size(), transferRequest_.ports); size_t numSuccessfulInitThreads = 0; for (auto &receiverThread : receiverThreads_) { ErrorCode code = receiverThread->init(); if (code == OK) { ++numSuccessfulInitThreads; } } WLOG(INFO) << "Registered " << numSuccessfulInitThreads << " successful sockets"; ErrorCode code = OK; const size_t targetSize = transferRequest_.ports.size(); // TODO: replace with getNumPorts/thread if (numSuccessfulInitThreads != targetSize) { code = FEWER_PORTS; if (numSuccessfulInitThreads == 0) { code = ERROR; } } transferRequest_.protocolVersion = protocolVersion_; transferRequest_.ports.clear(); for (const auto &receiverThread : receiverThreads_) { transferRequest_.ports.push_back(receiverThread->getPort()); } if (transferRequest_.hostName.empty()) { char hostName[1024]; int ret = gethostname(hostName, sizeof(hostName)); if (ret == 0) { transferRequest_.hostName.assign(hostName); } else { PLOG(ERROR) << "Couldn't find the host name"; code = ERROR; } } transferRequest_.directory = getDir(); transferRequest_.errorCode = code; return transferRequest_; }