void ClusterWriter::write(OperationContext* opCtx, const BatchedCommandRequest& request, BatchWriteExecStats* stats, BatchedCommandResponse* response) { const NamespaceString& nss = request.getNS(); LastError::Disabled disableLastError(&LastError::get(opCtx->getClient())); // Config writes and shard writes are done differently if (nss.db() == NamespaceString::kConfigDb || nss.db() == NamespaceString::kAdminDb) { Grid::get(opCtx)->catalogClient()->writeConfigServerDirect(opCtx, request, response); } else { TargeterStats targeterStats; { ChunkManagerTargeter targeter(request.getTargetingNS(), &targeterStats); Status targetInitStatus = targeter.init(opCtx); if (!targetInitStatus.isOK()) { toBatchError({targetInitStatus.code(), str::stream() << "unable to target" << (request.isInsertIndexRequest() ? " index" : "") << " write op for collection " << request.getTargetingNS().ns() << causedBy(targetInitStatus)}, response); return; } BatchWriteExec::executeBatch(opCtx, targeter, request, response, stats); } splitIfNeeded(opCtx, request.getNS(), targeterStats); } }
void Strategy::writeOp(OperationContext* txn, int op, Request& request) { // make sure we have a last error dassert(&LastError::get(cc())); OwnedPointerVector<BatchedCommandRequest> commandRequestsOwned; vector<BatchedCommandRequest*>& commandRequests = commandRequestsOwned.mutableVector(); msgToBatchRequests(request.m(), &commandRequests); for (vector<BatchedCommandRequest*>::iterator it = commandRequests.begin(); it != commandRequests.end(); ++it) { // Multiple commands registered to last error as multiple requests if (it != commandRequests.begin()) LastError::get(cc()).startRequest(); BatchedCommandRequest* commandRequest = *it; // Adjust namespaces for command NamespaceString fullNS(commandRequest->getNS()); string cmdNS = fullNS.getCommandNS(); // We only pass in collection name to command commandRequest->setNS(fullNS); BSONObjBuilder builder; BSONObj requestBSON = commandRequest->toBSON(); { // Disable the last error object for the duration of the write cmd LastError::Disabled disableLastError(&LastError::get(cc())); Command::runAgainstRegistered(txn, cmdNS.c_str(), requestBSON, builder, 0); } BatchedCommandResponse commandResponse; bool parsed = commandResponse.parseBSON(builder.done(), NULL); (void)parsed; // for compile dassert(parsed && commandResponse.isValid(NULL)); // Populate the lastError object based on the write response LastError::get(cc()).reset(); bool hadError = batchErrorToLastError(*commandRequest, commandResponse, &LastError::get(cc())); // Check if this is an ordered batch and we had an error which should stop processing if (commandRequest->getOrdered() && hadError) break; } }
/**
 * Records 'dataWritten' bytes against 'chunk' and, if the chunk has grown past its
 * split threshold, attempts to auto-split it. After a successful split it may also
 * ask the balancer to move the suggested "top" chunk off this shard (top-chunk
 * optimization for monotonically increasing/decreasing shard keys).
 *
 * All exceptions thrown during the auto-split are caught here so the client write
 * that triggered the accounting is never disturbed; stale-sharding errors
 * additionally invalidate the cached routing table entry.
 */
void updateChunkWriteStatsAndSplitIfNeeded(OperationContext* opCtx, ChunkManager* manager, Chunk* chunk, long dataWritten) {
    // Disable lastError tracking so that any errors, which occur during auto-split do not get
    // bubbled up on the client connection doing a write
    LastError::Disabled disableLastError(&LastError::get(opCtx->getClient()));

    const auto balancerConfig = Grid::get(opCtx)->getBalancerConfiguration();

    // Chunks touching MinKey/MaxKey get special handling below: an extreme-key split
    // point and no reset of the written-bytes counter.
    const bool minIsInf =
        (0 == manager->getShardKeyPattern().getKeyPattern().globalMin().woCompare(chunk->getMin()));
    const bool maxIsInf =
        (0 == manager->getShardKeyPattern().getKeyPattern().globalMax().woCompare(chunk->getMax()));

    const uint64_t chunkBytesWritten = chunk->addBytesWritten(dataWritten);
    const uint64_t desiredChunkSize =
        calculateDesiredChunkSize(balancerConfig->getMaxChunkSizeBytes(), manager->numChunks());

    if (!chunk->shouldSplit(desiredChunkSize, minIsInf, maxIsInf)) {
        return;
    }

    const NamespaceString nss(manager->getns());

    // Concurrent auto-splits are throttled with a ticket pool; skip rather than queue.
    if (!manager->_autoSplitThrottle._splitTickets.tryAcquire()) {
        LOG(1) << "won't auto split because not enough tickets: " << nss;
        return;
    }

    // Releases the ticket when this function exits, on any path.
    TicketHolderReleaser releaser(&(manager->_autoSplitThrottle._splitTickets));

    const ChunkRange chunkRange(chunk->getMin(), chunk->getMax());

    try {
        // Ensure we have the most up-to-date balancer configuration
        uassertStatusOK(balancerConfig->refreshAndCheck(opCtx));

        if (!balancerConfig->getShouldAutoSplit()) {
            return;
        }

        LOG(1) << "about to initiate autosplit: " << redact(chunk->toString())
               << " dataWritten: " << chunkBytesWritten
               << " desiredChunkSize: " << desiredChunkSize;

        // Pick the chunk size to split against; normally the desired size, but
        // inflated when the estimated number of split points is excessive.
        const uint64_t chunkSizeToUse = [&]() {
            const uint64_t estNumSplitPoints = chunkBytesWritten / desiredChunkSize * 2;
            if (estNumSplitPoints >= kTooManySplitPoints) {
                // The current desired chunk size will split the chunk into lots of small chunk and
                // at the worst case this can result into thousands of chunks. So check and see if a
                // bigger value can be used.
                return std::min(chunkBytesWritten, balancerConfig->getMaxChunkSizeBytes());
            } else {
                return desiredChunkSize;
            }
        }();

        auto splitPoints =
            uassertStatusOK(shardutil::selectChunkSplitPoints(opCtx,
                                                              chunk->getShardId(),
                                                              nss,
                                                              manager->getShardKeyPattern(),
                                                              chunkRange,
                                                              chunkSizeToUse,
                                                              boost::none));

        if (splitPoints.size() <= 1) {
            // No split points means there isn't enough data to split on; 1 split point means we
            // have
            // between half the chunk size to full chunk size so there is no need to split yet
            chunk->clearBytesWritten();
            return;
        }

        if (minIsInf || maxIsInf) {
            // We don't want to reset _dataWritten since we want to check the other side right away
        } else {
            // We're splitting, so should wait a bit
            chunk->clearBytesWritten();
        }

        // We assume that if the chunk being split is the first (or last) one on the collection,
        // this chunk is likely to see more insertions. Instead of splitting mid-chunk, we use the
        // very first (or last) key as a split point.
        //
        // This heuristic is skipped for "special" shard key patterns that are not likely to produce
        // monotonically increasing or decreasing values (e.g. hashed shard keys).
        if (KeyPattern::isOrderedKeyPattern(manager->getShardKeyPattern().toBSON())) {
            if (minIsInf) {
                BSONObj key = findExtremeKeyForShard(
                    opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), true);
                if (!key.isEmpty()) {
                    splitPoints.front() = key.getOwned();
                }
            } else if (maxIsInf) {
                BSONObj key = findExtremeKeyForShard(
                    opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), false);
                if (!key.isEmpty()) {
                    splitPoints.back() = key.getOwned();
                }
            }
        }

        const auto suggestedMigrateChunk =
            uassertStatusOK(shardutil::splitChunkAtMultiplePoints(opCtx,
                                                                  chunk->getShardId(),
                                                                  nss,
                                                                  manager->getShardKeyPattern(),
                                                                  manager->getVersion(),
                                                                  chunkRange,
                                                                  splitPoints));

        // Balance the resulting chunks if the option is enabled and if the shard suggested a chunk
        // to balance
        const bool shouldBalance = [&]() {
            if (!balancerConfig->shouldBalanceForAutoSplit())
                return false;
            auto collStatus =
                Grid::get(opCtx)->catalogClient()->getCollection(opCtx, manager->getns());
            if (!collStatus.isOK()) {
                log() << "Auto-split for " << nss << " failed to load collection metadata"
                      << causedBy(redact(collStatus.getStatus()));
                return false;
            }
            return collStatus.getValue().value.getAllowBalance();
        }();

        // NOTE(review): the ternary below appends the "(migrate suggested" suffix when
        // suggestedMigrateChunk is *empty*, which looks inverted relative to its meaning.
        // This only affects the log text — confirm intent before changing.
        log() << "autosplitted " << nss << " chunk: " << redact(chunk->toString()) << " into "
              << (splitPoints.size() + 1) << " parts (desiredChunkSize " << desiredChunkSize << ")"
              << (suggestedMigrateChunk ? "" : (std::string) " (migrate suggested" +
                      (shouldBalance ? ")" : ", but no migrations allowed)"));

        // Reload the chunk manager after the split
        auto routingInfo = uassertStatusOK(
            Grid::get(opCtx)->catalogCache()->getShardedCollectionRoutingInfoWithRefresh(opCtx,
                                                                                         nss));

        if (!shouldBalance || !suggestedMigrateChunk) {
            return;
        }

        // Top chunk optimization - try to move the top chunk out of this shard to prevent the hot
        // spot from staying on a single shard. This is based on the assumption that succeeding
        // inserts will fall on the top chunk.

        // We need to use the latest chunk manager (after the split) in order to have the most
        // up-to-date view of the chunk we are about to move
        auto suggestedChunk = routingInfo.cm()->findIntersectingChunkWithSimpleCollation(
            suggestedMigrateChunk->getMin());

        ChunkType chunkToMove;
        chunkToMove.setNS(nss.ns());
        chunkToMove.setShard(suggestedChunk->getShardId());
        chunkToMove.setMin(suggestedChunk->getMin());
        chunkToMove.setMax(suggestedChunk->getMax());
        chunkToMove.setVersion(suggestedChunk->getLastmod());

        uassertStatusOK(configsvr_client::rebalanceChunk(opCtx, chunkToMove));

        // Ensure the collection gets reloaded because of the move
        Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss);
    } catch (const DBException& ex) {
        chunk->clearBytesWritten();

        // Only stale-sharding errors force a routing-table refresh; all other
        // exceptions are deliberately swallowed so the client write is unaffected.
        if (ErrorCodes::isStaleShardingError(ErrorCodes::Error(ex.getCode()))) {
            log() << "Unable to auto-split chunk " << redact(chunkRange.toString()) << causedBy(ex)
                  << ", going to invalidate routing table entry for " << nss;
            Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss);
        }
    }
}
bool ClusterWriteCmd::run(OperationContext* txn, const string& dbName, BSONObj& cmdObj, int options, string& errMsg, BSONObjBuilder& result, bool ) { BatchedCommandRequest request( _writeType ); BatchedCommandResponse response; ClusterWriter writer( true /* autosplit */, 0 /* timeout */ ); // NOTE: Sometimes this command is invoked with LE disabled for legacy writes LastError* cmdLastError = lastError.get( false ); { // Disable the last error object for the duration of the write LastError::Disabled disableLastError( cmdLastError ); // TODO: if we do namespace parsing, push this to the type if ( !request.parseBSON( cmdObj, &errMsg ) || !request.isValid( &errMsg ) ) { // Batch parse failure response.setOk( false ); response.setErrCode( ErrorCodes::FailedToParse ); response.setErrMessage( errMsg ); } else { // Fixup the namespace to be a full ns internally NamespaceString nss( dbName, request.getNS() ); request.setNS( nss.ns() ); writer.write( request, &response ); } dassert( response.isValid( NULL ) ); } if ( cmdLastError ) { // Populate the lastError object based on the write response cmdLastError->reset(); batchErrorToLastError( request, response, cmdLastError ); } size_t numAttempts; if ( !response.getOk() ) { numAttempts = 0; } else if ( request.getOrdered() && response.isErrDetailsSet() ) { numAttempts = response.getErrDetailsAt(0)->getIndex() + 1; // Add one failed attempt } else { numAttempts = request.sizeWriteOps(); } // TODO: increase opcounters by more than one if ( _writeType == BatchedCommandRequest::BatchType_Insert ) { for( size_t i = 0; i < numAttempts; ++i ) { globalOpCounters.gotInsert(); } } else if ( _writeType == BatchedCommandRequest::BatchType_Update ) { for( size_t i = 0; i < numAttempts; ++i ) { globalOpCounters.gotUpdate(); } } else if ( _writeType == BatchedCommandRequest::BatchType_Delete ) { for( size_t i = 0; i < numAttempts; ++i ) { globalOpCounters.gotDelete(); } } // Save the last opTimes written on each shard for this client, to 
allow GLE to work if ( ClientInfo::exists() && writer.getStats().hasShardStats() ) { ClientInfo* clientInfo = ClientInfo::get( NULL ); clientInfo->addHostOpTimes( writer.getStats().getShardStats().getWriteOpTimes() ); } // TODO // There's a pending issue about how to report response here. If we use // the command infra-structure, we should reuse the 'errmsg' field. But // we have already filed that message inside the BatchCommandResponse. // return response.getOk(); result.appendElements( response.toBSON() ); return true; }