/** * The core config write functionality. * * Config writes run in two passes - the first is a quick check to ensure the config servers * are all reachable, the second runs the actual write. * * TODO: Upgrade and move this logic to the config servers, a state machine implementation * is probably the next step. */ void ConfigCoordinator::executeBatch( const BatchedCommandRequest& clientRequest, BatchedCommandResponse* clientResponse, bool fsyncCheck ) { NamespaceString nss( clientRequest.getNS() ); dassert( nss.db() == "config" || nss.db() == "admin" ); dassert( clientRequest.sizeWriteOps() == 1u ); if ( fsyncCheck ) { // // Sanity check that all configs are still reachable using fsync, preserving legacy // behavior // OwnedPointerVector<ConfigFsyncResponse> fsyncResponsesOwned; vector<ConfigFsyncResponse*>& fsyncResponses = fsyncResponsesOwned.mutableVector(); // // Send side // for ( vector<ConnectionString>::iterator it = _configHosts.begin(); it != _configHosts.end(); ++it ) { ConnectionString& configHost = *it; FsyncRequest fsyncRequest; _dispatcher->addCommand( configHost, "admin", fsyncRequest ); } _dispatcher->sendAll(); // // Recv side // bool fsyncError = false; while ( _dispatcher->numPending() > 0 ) { fsyncResponses.push_back( new ConfigFsyncResponse() ); ConfigFsyncResponse& fsyncResponse = *fsyncResponses.back(); Status dispatchStatus = _dispatcher->recvAny( &fsyncResponse.configHost, &fsyncResponse.response ); // We've got to recv everything, no matter what if ( !dispatchStatus.isOK() ) { fsyncError = true; buildFsyncErrorFrom( dispatchStatus, &fsyncResponse.response ); } else if ( !fsyncResponse.response.getOk() ) { fsyncError = true; } } if ( fsyncError ) { combineFsyncErrors( fsyncResponses, clientResponse ); return; } else { fsyncResponsesOwned.clear(); } } // // Do the actual writes // BatchedCommandRequest configRequest( clientRequest.getBatchType() ); clientRequest.cloneTo( &configRequest ); configRequest.setNS( nss.coll() ); 
OwnedPointerVector<ConfigResponse> responsesOwned; vector<ConfigResponse*>& responses = responsesOwned.mutableVector(); // // Send the actual config writes // // Get as many batches as we can at once for ( vector<ConnectionString>::iterator it = _configHosts.begin(); it != _configHosts.end(); ++it ) { ConnectionString& configHost = *it; _dispatcher->addCommand( configHost, nss.db(), configRequest ); } // Send them all out _dispatcher->sendAll(); // // Recv side // while ( _dispatcher->numPending() > 0 ) { // Get the response responses.push_back( new ConfigResponse() ); ConfigResponse& configResponse = *responses.back(); Status dispatchStatus = _dispatcher->recvAny( &configResponse.configHost, &configResponse.response ); if ( !dispatchStatus.isOK() ) { buildErrorFrom( dispatchStatus, &configResponse.response ); } } combineResponses( responses, clientResponse ); }
void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request, BatchedCommandResponse* response ) { // Validate namespace const NamespaceString nss = NamespaceString( request.getNS() ); if ( !nss.isValid() ) { toBatchError( Status( ErrorCodes::InvalidNamespace, nss.ns() + " is not a valid namespace" ), response ); return; } // Make sure we can write to the namespace Status allowedStatus = userAllowedWriteNS( nss ); if ( !allowedStatus.isOK() ) { toBatchError( allowedStatus, response ); return; } // Validate insert index requests // TODO: Push insert index requests through createIndex once all upgrade paths support it string errMsg; if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) { toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response ); return; } // Validate write concern // TODO: Lift write concern parsing out of this entirely WriteConcernOptions writeConcern; BSONObj wcDoc; if ( request.isWriteConcernSet() ) { wcDoc = request.getWriteConcern(); } Status wcStatus = Status::OK(); if ( wcDoc.isEmpty() ) { // The default write concern if empty is w : 1 // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1 wcStatus = writeConcern.parse( _defaultWriteConcern.isEmpty() ? 
WriteConcernOptions::Acknowledged : _defaultWriteConcern ); if ( writeConcern.wNumNodes == 0 && writeConcern.wMode.empty() ) { writeConcern.wNumNodes = 1; } } else { wcStatus = writeConcern.parse( wcDoc ); } if ( wcStatus.isOK() ) { wcStatus = validateWriteConcern( writeConcern ); } if ( !wcStatus.isOK() ) { toBatchError( wcStatus, response ); return; } if ( request.sizeWriteOps() == 0u ) { toBatchError( Status( ErrorCodes::InvalidLength, "no write ops were included in the batch" ), response ); return; } // Validate batch size if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) { toBatchError( Status( ErrorCodes::InvalidLength, stream() << "exceeded maximum write batch size of " << BatchedCommandRequest::kMaxWriteBatchSize ), response ); return; } // // End validation // bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0 && writeConcern.syncMode == WriteConcernOptions::NONE; Timer commandTimer; OwnedPointerVector<WriteErrorDetail> writeErrorsOwned; vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector(); OwnedPointerVector<BatchedUpsertDetail> upsertedOwned; vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector(); // // Apply each batch item, possibly bulking some items together in the write lock. // Stops on error if batch is ordered. // bulkExecute( request, &upserted, &writeErrors ); // // Try to enforce the write concern if everything succeeded (unordered or ordered) // OR if something succeeded and we're unordered. 
// auto_ptr<WCErrorDetail> wcError; bool needToEnforceWC = writeErrors.empty() || ( !request.getOrdered() && writeErrors.size() < request.sizeWriteOps() ); if ( needToEnforceWC ) { _client->curop()->setMessage( "waiting for write concern" ); WriteConcernResult res; Status status = waitForWriteConcern( _txn, writeConcern, _client->getLastOp(), &res ); if ( !status.isOK() ) { wcError.reset( toWriteConcernError( status, res ) ); } } // // Refresh metadata if needed // bool staleBatch = !writeErrors.empty() && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion; if ( staleBatch ) { const BatchedRequestMetadata* requestMetadata = request.getMetadata(); dassert( requestMetadata ); // Make sure our shard name is set or is the same as what was set previously if ( shardingState.setShardName( requestMetadata->getShardName() ) ) { // // First, we refresh metadata if we need to based on the requested version. // ChunkVersion latestShardVersion; shardingState.refreshMetadataIfNeeded( request.getTargetingNS(), requestMetadata->getShardVersion(), &latestShardVersion ); // Report if we're still changing our metadata // TODO: Better reporting per-collection if ( shardingState.inCriticalMigrateSection() ) { noteInCriticalSection( writeErrors.back() ); } if ( queueForMigrationCommit ) { // // Queue up for migration to end - this allows us to be sure that clients will // not repeatedly try to refresh metadata that is not yet written to the config // server. Not necessary for correctness. // Exposed as optional parameter to allow testing of queuing behavior with // different network timings. // const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion(); // // Only wait if we're an older version (in the current collection epoch) and // we're not write compatible, implying that the current migration is affecting // writes. 
// if ( requestShardVersion.isOlderThan( latestShardVersion ) && !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) { while ( shardingState.inCriticalMigrateSection() ) { log() << "write request to old shard version " << requestMetadata->getShardVersion().toString() << " waiting for migration commit" << endl; shardingState.waitTillNotInCriticalSection( 10 /* secs */); } } } } else { // If our shard name is stale, our version must have been stale as well dassert( writeErrors.size() == request.sizeWriteOps() ); } } // // Construct response // response->setOk( true ); if ( !silentWC ) { if ( upserted.size() ) { response->setUpsertDetails( upserted ); } if ( writeErrors.size() ) { response->setErrDetails( writeErrors ); } if ( wcError.get() ) { response->setWriteConcernError( wcError.release() ); } const repl::ReplicationCoordinator::Mode replMode = repl::getGlobalReplicationCoordinator()->getReplicationMode(); if (replMode != repl::ReplicationCoordinator::modeNone) { response->setLastOp( _client->getLastOp() ); if (replMode == repl::ReplicationCoordinator::modeReplSet) { response->setElectionId(repl::theReplSet->getElectionId()); } } // Set the stats for the response response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched + _stats->numDeleted ); if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update ) response->setNModified( _stats->numModified ); } dassert( response->isValid( NULL ) ); }
/**
 * Drops 'pLuaMain' from m_LuaMainStack when that Lua VM is destroyed.
 *
 * The debug assertion fires if the VM is still on the stack at this point; the removal is
 * performed regardless.
 */
void CScriptDebugging::OnLuaMainDestroy ( CLuaMain* pLuaMain )
{
    // Debug-only check that the VM is no longer on the stack
    dassert ( !ListContains ( m_LuaMainStack, pLuaMain ) );

    // Remove it anyway so the stack never keeps a pointer to a destroyed VM
    ListRemove ( m_LuaMainStack, pLuaMain );
}
/**
 * Returns the cached set of indexed paths (_indexedPaths).
 *
 * The keys must already have been computed (enforced with invariant). In debug builds the
 * caller is also checked to hold at least a MODE_IS lock on the collection.
 */
const UpdateIndexData& CollectionInfoCache::getIndexKeys(OperationContext* opCtx) const {
    // Any lock on the collection is enough; MODE_IS is how "some lock" is expressed for now.
    dassert(opCtx->lockState()->isCollectionLockedForMode(_collection->ns().ns(), MODE_IS));

    invariant(_keysComputed);

    return _indexedPaths;
}
/**
 * Completion handler for the "get file size" RPC of a remote copy.
 *
 * On failure (either the RPC error 'err' or the error carried in 'resp') the user's nfs task
 * is enqueued with that error and the user_request passed through 'context' is deleted.
 *
 * On success, one file_context is created per returned file and the file is split into
 * copy requests of at most _opts.nfs_copy_block_bytes bytes each. The requests are queued
 * on _copy_requests and continue_copy(0) is called.
 *
 * @param err      result of the RPC itself
 * @param resp     response carrying the file names, their sizes and an error code
 * @param context  the user_request* that issued the RPC
 *
 * NOTE(review): resp.file_list is indexed with the indices of resp.size_list, so both lists
 * are assumed to have the same length - confirm against the server side.
 */
void nfs_client_impl::end_get_file_size(
    ::dsn::error_code err,
    const ::dsn::service::get_file_size_response& resp,
    void* context)
{
    user_request* ureq = (user_request*)context;

    // A successful RPC can still carry an application-level failure in the response;
    // fold it into 'err' so both cases share a single failure path.
    if (err == ::dsn::ERR_OK)
    {
        err.set(resp.error);
    }

    if (err != ::dsn::ERR_OK)
    {
        derror("remote copy request failed");
        ureq->nfs_task->enqueue(err, 0, ureq->nfs_task->node());
        delete ureq;
        return;
    }

    for (size_t i = 0; i < resp.size_list.size(); i++) // file list
    {
        uint64_t size = resp.size_list[i];

        file_context *filec = new file_context(ureq, resp.file_list[i], resp.size_list[i]);
        ureq->file_context_map.insert(std::pair<std::string, file_context*>(
            ureq->file_size_req.dst_dir + resp.file_list[i], filec));

        //dinfo("this file size is %d, name is %s", size, resp.file_list[i].c_str());

        // new all the copy requests

        uint64_t req_offset = 0;
        uint32_t req_size;
        if (size > _opts.nfs_copy_block_bytes)
            req_size = _opts.nfs_copy_block_bytes;
        else
            req_size = static_cast<uint32_t>(size);

        int idx = 0;
        for (;;) // send one file with multi-round rpc
        {
            auto req = boost::intrusive_ptr<copy_request_ex>(new copy_request_ex(filec, idx++));
            filec->copy_requests.push_back(req);

            // Fill in the request completely BEFORE publishing it on the shared queue, so
            // that a consumer holding _copy_requests_lock never sees a half-built request.
            req->copy_req.source = ureq->file_size_req.source;
            req->copy_req.file_name = resp.file_list[i];
            req->copy_req.offset = req_offset;
            req->copy_req.size = req_size;
            req->copy_req.dst_dir = ureq->file_size_req.dst_dir;
            req->copy_req.source_dir = ureq->file_size_req.source_dir;
            req->copy_req.overwrite = ureq->file_size_req.overwrite;
            req->copy_req.is_last = (size <= req_size);

            {
                zauto_lock l(_copy_requests_lock);
                _copy_requests.push(req);
            }

            req_offset += req_size;
            size -= req_size;

            // 'size' is unsigned and req_size never exceeds it, so it lands exactly on 0
            // once the whole file (including an empty file) has been covered.
            if (size == 0)
            {
                dassert(size == 0, "last request must read exactly the remaing size of the file");
                break;
            }

            if (size > _opts.nfs_copy_block_bytes)
                req_size = _opts.nfs_copy_block_bytes;
            else
                req_size = static_cast<uint32_t>(size);
        }
    }

    continue_copy(0);
}
/**
 * Resolves the host that writes for 'shardName' should be sent to.
 *
 *  - "config" and "admin" resolve to the config server's model server.
 *  - A shard whose connection string is of type MASTER resolves to that connection string.
 *  - A shard whose connection string is of type SET resolves to the current master reported
 *    by the replica set monitor.
 *
 * @param shardName  shard to resolve
 * @param shardHost  out: the chosen host, written only on success
 * @return OK on success; ShardNotFound when the shard cache has no such shard,
 *         ReplicaSetNotFound when no monitor exists for the set, HostNotFound when asking
 *         the monitor for the master throws a DBException.
 */
Status DBClientShardResolver::chooseWriteHost( const string& shardName,
                                               ConnectionString* shardHost ) const {

    // Receives parse errors from every ConnectionString::parse() call below
    string parseErrMsg;

    // Special-case for config and admin
    const bool targetsConfigServer = ( shardName == "config" || shardName == "admin" );
    if ( targetsConfigServer ) {
        *shardHost = ConnectionString::parse( configServer.modelServer(), parseErrMsg );
        dassert( parseErrMsg == "" );
        return Status::OK();
    }

    //
    // Look the shard up in the shard cache (no reload is triggered)
    //

    Shard shard = Shard::findIfExists( shardName );
    if ( shard.getName() == "" ) {
        return Status( ErrorCodes::ShardNotFound,
                       string("unknown shard name ") + shardName );
    }

    ConnectionString rawShardHost =
        ConnectionString::parse( shard.getConnString(), parseErrMsg );
    dassert( parseErrMsg == "" );
    dassert( rawShardHost.type() == ConnectionString::SET
             || rawShardHost.type() == ConnectionString::MASTER );

    // A single-master shard is targeted directly
    if ( rawShardHost.type() == ConnectionString::MASTER ) {
        *shardHost = rawShardHost;
        return Status::OK();
    }

    //
    // Replica set: find the node currently acting as master
    //

    // Does not reload the monitor if it doesn't currently exist
    ReplicaSetMonitorPtr replMonitor =
        ReplicaSetMonitor::get( rawShardHost.getSetName(), false );
    if ( !replMonitor ) {
        return Status( ErrorCodes::ReplicaSetNotFound,
                       string("unknown replica set ") + rawShardHost.getSetName() );
    }

    try {
        // This can throw when we don't find a master!
        HostAndPort primary = replMonitor->getMaster();
        *shardHost = ConnectionString::parse( primary.toString( true ), parseErrMsg );
        dassert( parseErrMsg == "" );
        return Status::OK();
    }
    catch ( const DBException& ) {
        return Status( ErrorCodes::HostNotFound,
                       string("could not contact primary for replica set ")
                       + replMonitor->getName() );
    }

    // Unreachable
    dassert( false );
    return Status( ErrorCodes::UnknownError, "" );
}
/**
 * Builds an owned key by copying the bytes of 'rhs' into this object's own buffer 'b' and
 * pointing _keyData at that copy.
 */
KeyV1Owned::KeyV1Owned(const KeyV1& rhs) {
    // Copy the source key's bytes into our buffer
    b.appendBuf(rhs.data(), rhs.dataSize());

    // From here on the key data lives in our own buffer
    _keyData = reinterpret_cast<const unsigned char*>(b.buf());

    // check datasize method is correct
    dassert(b.len() == dataSize());

    // The first byte must not have any of the cNOTUSED bits set
    dassert((*_keyData & cNOTUSED) == 0);
}
/**
 * Registers 'handle' with the looper's kqueue so that 'cb' is notified for 'events'.
 *
 * @param handle  file descriptor wrapped in a dsn_handle_t; the only negative value
 *                accepted is IO_LOOPER_USER_NOTIFICATION_FD
 * @param cb      callback stored as the kevent user data; must not be null and its address
 *                must have the lowest bit clear (that bit is used as a tag)
 * @param events  a filter present in _filters; EVFILT_READ_WRITE is expanded to both
 *                EVFILT_READ and EVFILT_WRITE
 * @param ctx     optional ref counter; when given, (cb, ctx) is recorded in _io_sessions
 *                and the callback pointer is tagged with bit 0
 * @return ERR_OK on success, ERR_INVALID_PARAMETERS for a bad argument,
 *         ERR_BIND_IOCP_FAILED when kevent() registration fails (earlier registrations
 *         made by this call are rolled back).
 */
error_code io_looper::bind_io_handle(
    dsn_handle_t handle,
    io_loop_callback* cb,
    unsigned int events,
    ref_counter* ctx
    )
{
    int fd;
    short filters[2];
    int nr_filters;
    struct kevent e;

    if (cb == nullptr)
    {
        derror("cb == nullptr");
        return ERR_INVALID_PARAMETERS;
    }

    fd = (int)(intptr_t)(handle);
    if (fd < 0)
    {
        if (fd != IO_LOOPER_USER_NOTIFICATION_FD)
        {
            derror("The fd %d is less than 0.", fd);
            return ERR_INVALID_PARAMETERS;
        }
    }

    if (_filters.find((short)events) == _filters.end())
    {
        derror("The filter %u is unsupported.", events);
        return ERR_INVALID_PARAMETERS;
    }

    // Every real descriptor is switched to non-blocking mode. Descriptor 0 is a valid
    // descriptor too, so the test must be ">= 0"; only the negative user-notification
    // pseudo fd is skipped.
    if (fd >= 0)
    {
        int flags = fcntl(fd, F_GETFL, 0);
        dassert (flags != -1, "fcntl failed, err = %s, fd = %d", strerror(errno), fd);

        if (!(flags & O_NONBLOCK))
        {
            flags |= O_NONBLOCK;
            flags = fcntl(fd, F_SETFL, flags);
            dassert(flags != -1, "fcntl failed, err = %s, fd = %d", strerror(errno), fd);
        }
    }

    // The callback address doubles as the kevent user data; bit 0 is used as a
    // "has ref_counter" tag, so it must be free.
    uintptr_t cb0 = (uintptr_t)cb;
    dassert((cb0 & 0x1) == 0, "the least one bit must be zero for the callback address");

    if (ctx)
    {
        cb0 |= 0x1; // has ref_counter

        utils::auto_lock<utils::ex_lock_nr_spin> l(_io_sessions_lock);
        auto pr = _io_sessions.insert(io_sessions::value_type(cb, ctx));
        dassert(pr.second, "the callback must not be registered before");
    }

    if ((short)events == EVFILT_READ_WRITE)
    {
        filters[0] = EVFILT_READ;
        filters[1] = EVFILT_WRITE;
        nr_filters = 2;
    }
    else
    {
        filters[0] = (short)events;
        nr_filters = 1;
    }

    for (int i = 0; i < nr_filters; i++)
    {
        EV_SET(&e, fd, filters[i], (EV_ADD | EV_ENABLE | EV_CLEAR), 0, 0, (void*)cb0);

        if (kevent(_io_queue, &e, 1, nullptr, 0, nullptr) == -1)
        {
            derror("bind io handler to kqueue failed, err = %s, fd = %d", strerror(errno), fd);

            // Roll back: forget the session recorded above ...
            if (ctx)
            {
                utils::auto_lock<utils::ex_lock_nr_spin> l(_io_sessions_lock);
                auto r = _io_sessions.erase(cb);
                dassert(r > 0, "the callback must be present");
            }

            // ... and unregister the filters that were already added by this call.
            for (int j = 0; j < i; j++)
            {
                EV_SET(&e, fd, filters[j], EV_DELETE, 0, 0, nullptr);
                if (kevent(_io_queue, &e, 1, nullptr, 0, nullptr) == -1)
                {
                    derror("Unregister kqueue failed, filter = %d, err = %s, fd = %d", filters[j], strerror(errno), fd);
                }
            }

            return ERR_BIND_IOCP_FAILED;
        }
    }

    return ERR_OK;
}
bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result) { BSONElement first = cmdObj.firstElement(); uassert(28528, str::stream() << "Argument to listIndexes must be of type String, not " << typeName(first.type()), first.type() == String); StringData collectionName = first.valueStringData(); uassert(28529, str::stream() << "Argument to listIndexes must be a collection name, " << "not the empty string", !collectionName.empty()); const NamespaceString ns(dbname, collectionName); const long long defaultBatchSize = std::numeric_limits<long long>::max(); long long batchSize; Status parseCursorStatus = parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize); if (!parseCursorStatus.isOK()) { return appendCommandStatus(result, parseCursorStatus); } AutoGetCollectionForRead autoColl(txn, ns); if (!autoColl.getDb()) { return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, "no database")); } const Collection* collection = autoColl.getCollection(); if (!collection) { return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, "no collection")); } const CollectionCatalogEntry* cce = collection->getCatalogEntry(); invariant(cce); vector<string> indexNames; MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { indexNames.clear(); cce->getAllIndexes(txn, &indexNames); } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns()); auto ws = make_unique<WorkingSet>(); auto root = make_unique<QueuedDataStage>(txn, ws.get()); for (size_t i = 0; i < indexNames.size(); i++) { BSONObj indexSpec; MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { indexSpec = cce->getIndexSpec(txn, indexNames[i]); } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns()); WorkingSetID id = ws->allocate(); WorkingSetMember* member = ws->get(id); member->keyData.clear(); member->loc = RecordId(); member->obj = Snapshotted<BSONObj>(SnapshotId(), indexSpec.getOwned()); member->transitionToOwnedObj(); root->pushBack(id); } 
std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name << "." << ns.coll(); dassert(NamespaceString(cursorNamespace).isValid()); dassert(NamespaceString(cursorNamespace).isListIndexesCursorNS()); dassert(ns == NamespaceString(cursorNamespace).getTargetNSForListIndexes()); auto statusWithPlanExecutor = PlanExecutor::make( txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL); if (!statusWithPlanExecutor.isOK()) { return appendCommandStatus(result, statusWithPlanExecutor.getStatus()); } unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue()); BSONArrayBuilder firstBatch; const int byteLimit = FindCommon::kMaxBytesToReturnToClientAtOnce; for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit; objCount++) { BSONObj next; PlanExecutor::ExecState state = exec->getNext(&next, NULL); if (state == PlanExecutor::IS_EOF) { break; } invariant(state == PlanExecutor::ADVANCED); firstBatch.append(next); } CursorId cursorId = 0LL; if (!exec->isEOF()) { exec->saveState(); exec->detachFromOperationContext(); ClientCursor* cursor = new ClientCursor(CursorManager::getGlobalCursorManager(), exec.release(), cursorNamespace, txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot()); cursorId = cursor->cursorid(); } appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result); return true; }
/**
 * Loads the chunks of collection 'ns' from the config server into 'metadata'.
 *
 * When 'oldMetadata' is supplied and its collection version has an epoch compatible with
 * the one already in 'metadata', its version and chunk map seed the load so only a diff
 * has to be applied; otherwise a full reload is performed.
 *
 * @return OK when at least one chunk diff was applied;
 *         NamespaceNotFound when no chunks were found on a full reload (treated as a drop);
 *         RemoteChangeDetected when no chunks were found on a diff reload, or when invalid
 *         chunks were found;
 *         HostUnreachable when the chunk cursor could not be opened or the query threw.
 *         On every non-OK result reached after the query succeeded, the collection version
 *         and chunk map in 'metadata' are reset.
 */
Status MetadataLoader::initChunks( const string& ns,
                                   const string& shard,
                                   const CollectionMetadata* oldMetadata,
                                   CollectionMetadata* metadata ) const {

    map<string, ChunkVersion> versionMap;

    // Preserve the epoch
    versionMap[shard] = metadata->_shardVersion;
    OID epoch = metadata->getCollVersion().epoch();
    bool fullReload = true;

    // Decide whether the old metadata can be used as the base for a diff.
    if ( oldMetadata ) {

        if ( !oldMetadata->getCollVersion().hasCompatibleEpoch( epoch ) ) {
            warning() << "reloading collection metadata for " << ns << " with new epoch "
                      << epoch.toString() << ", the current epoch is "
                      << oldMetadata->getCollVersion().epoch().toString() << endl;
        }
        else {
            // If our epochs are compatible, it's useful to use the old metadata for diffs
            fullReload = false;
            dassert( oldMetadata->isValid() );

            versionMap[shard] = oldMetadata->_shardVersion;
            metadata->_collVersion = oldMetadata->_collVersion;

            // TODO: This could be made more efficient if copying not required, but
            // not as frequently reloaded as in mongos.
            metadata->_chunksMap = oldMetadata->_chunksMap;

            LOG( 2 ) << "loading new chunks for collection " << ns
                     << " using old metadata w/ version " << oldMetadata->getShardVersion()
                     << " and " << metadata->_chunksMap.size() << " chunks" << endl;
        }
    }

    // Exposes the new metadata's range map and version to the "differ," who
    // would ultimately be responsible of filling them up.
    SCMConfigDiffTracker differ( shard );
    differ.attach( ns, metadata->_chunksMap, metadata->_collVersion, versionMap );

    try {

        ScopedDbConnection conn( _configLoc.toString(), 30 );

        auto_ptr<DBClientCursor> cursor = conn->query( ChunkType::ConfigNS,
                                                       differ.configDiffQuery() );

        if ( !cursor.get() ) {

            // Make our metadata invalid
            metadata->_collVersion = ChunkVersion( 0, 0, OID() );
            metadata->_chunksMap.clear();
            conn.done();

            return Status( ErrorCodes::HostUnreachable,
                           "problem opening chunk metadata cursor" );
        }

        //
        // The diff tracker should always find at least one chunk (the highest chunk we saw
        // last time).  If not, something has changed on the config server (potentially between
        // when we read the collection data and when we read the chunks data).
        //

        int diffsApplied = differ.calculateConfigDiff( *cursor );

        if ( diffsApplied > 0 ) {

            // Chunks found, return ok
            LOG(2) << "loaded " << diffsApplied << " chunks into new metadata for " << ns
                   << " with version " << metadata->_collVersion << endl;

            metadata->_shardVersion = versionMap[shard];
            metadata->fillRanges();
            conn.done();

            dassert( metadata->isValid() );
            return Status::OK();
        }

        //
        // Failure: either nothing was found (diffsApplied == 0) or the chunks were invalid
        // (diffsApplied < 0). The message is built before the version is reset below.
        //

        const bool noChunksFound = ( diffsApplied == 0 );

        string errMsg;
        if ( noChunksFound ) {
            // No chunks found, the collection is dropping or we're confused
            // If this is a full reload, assume it is a drop for backwards compatibility
            // TODO: drop the config.collections entry *before* the chunks and eliminate this
            // ambiguity
            errMsg = str::stream() << "no chunks found when reloading " << ns
                                   << ", previous version was "
                                   << metadata->_collVersion.toString()
                                   << ( fullReload ? ", this is a drop" : "" );
        }
        else {
            // Invalid chunks found, our epoch may have changed because we dropped/recreated
            // the collection.
            errMsg = str::stream() << "invalid chunks found when reloading " << ns
                                   << ", previous version was "
                                   << metadata->_collVersion.toString()
                                   << ", this should be rare";
        }

        warning() << errMsg << endl;

        metadata->_collVersion = ChunkVersion( 0, 0, OID() );
        metadata->_chunksMap.clear();
        conn.done();

        if ( noChunksFound && fullReload ) {
            return Status( ErrorCodes::NamespaceNotFound, errMsg );
        }
        return Status( ErrorCodes::RemoteChangeDetected, errMsg );
    }
    catch ( const DBException& e ) {
        string errMsg = str::stream() << "problem querying chunks metadata" << causedBy( e );

        // We deliberately do not return connPtr to the pool, since it was involved
        // with the error here.

        return Status( ErrorCodes::HostUnreachable, errMsg );
    }
}
/**
 * Reads the config-server entry for collection 'ns' and initializes the key pattern and
 * versions of 'metadata' from it.
 *
 * @return OK for a sharded collection (non-empty key pattern; versions set to 0|0 with the
 *         collection's epoch) or for an unsharded collection whose primary is 'shard'
 *         (empty key pattern; versions set to 1|0 with the collection's epoch);
 *         HostUnreachable when the query throws;
 *         NamespaceNotFound when the entry is missing or marked dropped;
 *         FailedToParse when the entry cannot be parsed or is invalid;
 *         RemoteChangeDetected when the collection has no shard key and its primary is not
 *         'shard'.
 */
Status MetadataLoader::initCollection( const string& ns,
                                       const string& shard,
                                       CollectionMetadata* metadata ) const {

    //
    // Fetch the collection entry from the config server.
    //

    BSONObj collDoc;
    try {
        ScopedDbConnection conn( _configLoc.toString(), 30 );
        collDoc = conn->findOne( CollectionType::ConfigNS, QUERY(CollectionType::ns()<<ns));
        conn.done();
    }
    catch ( const DBException& e ) {
        string queryErrMsg = str::stream() << "could not query collection metadata"
                                           << causedBy( e );

        // We deliberately do not return conn to the pool, since it was involved
        // with the error here.

        return Status( ErrorCodes::HostUnreachable, queryErrMsg );
    }

    string errMsg;

    // No entry at all
    if ( collDoc.isEmpty() ) {
        errMsg = str::stream() << "could not load metadata, collection " << ns
                               << " not found";
        warning() << errMsg << endl;

        return Status( ErrorCodes::NamespaceNotFound, errMsg );
    }

    // Entry present but unusable
    CollectionType collInfo;
    if ( !collInfo.parseBSON( collDoc, &errMsg ) || !collInfo.isValid( &errMsg ) ) {
        errMsg = str::stream() << "could not parse metadata for collection " << ns
                               << causedBy( errMsg );
        warning() << errMsg << endl;

        return Status( ErrorCodes::FailedToParse, errMsg );
    }

    // Entry marks the collection as dropped
    if ( collInfo.isDroppedSet() && collInfo.getDropped() ) {
        errMsg = str::stream() << "could not load metadata, collection " << ns
                               << " was dropped";
        warning() << errMsg << endl;

        return Status( ErrorCodes::NamespaceNotFound, errMsg );
    }

    // Sharded collection, need to load chunks
    if ( collInfo.isKeyPatternSet() && !collInfo.getKeyPattern().isEmpty() ) {
        metadata->_keyPattern = collInfo.getKeyPattern();
        metadata->_shardVersion = ChunkVersion( 0, 0, collInfo.getEpoch() );
        metadata->_collVersion = ChunkVersion( 0, 0, collInfo.getEpoch() );

        return Status::OK();
    }

    // A collection with a non-default primary that is this shard
    if ( collInfo.isPrimarySet() && collInfo.getPrimary() == shard ) {
        // Empty primary field not allowed if set
        dassert( collInfo.getPrimary() != "" );

        metadata->_keyPattern = BSONObj();
        metadata->_shardVersion = ChunkVersion( 1, 0, collInfo.getEpoch() );
        metadata->_collVersion = metadata->_shardVersion;

        return Status::OK();
    }

    // A collection with a primary that doesn't match this shard or is empty, the primary
    // may have changed before we loaded.
    errMsg = str::stream() << "collection " << ns << " does not have a shard key "
                           << "and primary "
                           << ( collInfo.isPrimarySet() ? collInfo.getPrimary() : "" )
                           << " does not match this shard " << shard;
    warning() << errMsg << endl;

    metadata->_collVersion = ChunkVersion( 0, 0, OID() );

    return Status( ErrorCodes::RemoteChangeDetected, errMsg );
}
/**
 * Populates this (still empty) object from a "readConcern" BSON element.
 *
 * An EOO element leaves the object untouched and returns OK. Otherwise the element must be
 * an object whose fields are limited to afterOpTime, afterClusterTime, atClusterTime and
 * level; each recognized field is parsed and the combination is then validated.
 *
 * @return OK on success; FailedToParse when the element is not an object or the level is
 *         not a known string; InvalidOptions for an unrecognized field or an illegal
 *         combination of fields; otherwise the status returned by the field extractor
 *         that failed.
 */
Status ReadConcernArgs::initialize(const BSONElement& readConcernElem) {
    invariant(isEmpty());  // only legal to call on uninitialized object.

    if (readConcernElem.eoo()) {
        return Status::OK();
    }

    dassert(readConcernElem.fieldNameStringData() == kReadConcernFieldName);

    if (readConcernElem.type() != Object) {
        return Status(ErrorCodes::FailedToParse,
                      str::stream() << kReadConcernFieldName << " field should be an object");
    }

    BSONObj readConcernObj = readConcernElem.Obj();

    //
    // Parse each field of the readConcern document
    //
    for (auto&& field : readConcernObj) {
        auto fieldName = field.fieldNameStringData();

        if (fieldName == kAfterOpTimeFieldName) {
            OpTime parsedOpTime;
            // TODO pass field in rather than scanning again.
            auto status =
                bsonExtractOpTimeField(readConcernObj, kAfterOpTimeFieldName, &parsedOpTime);
            if (!status.isOK()) {
                return status;
            }
            _opTime = parsedOpTime;
            continue;
        }

        if (fieldName == kAfterClusterTimeFieldName) {
            Timestamp parsedTime;
            auto status = bsonExtractTimestampField(
                readConcernObj, kAfterClusterTimeFieldName, &parsedTime);
            if (!status.isOK()) {
                return status;
            }
            _afterClusterTime = LogicalTime(parsedTime);
            continue;
        }

        if (fieldName == kAtClusterTimeFieldName) {
            Timestamp parsedTime;
            auto status =
                bsonExtractTimestampField(readConcernObj, kAtClusterTimeFieldName, &parsedTime);
            if (!status.isOK()) {
                return status;
            }
            _atClusterTime = LogicalTime(parsedTime);
            continue;
        }

        if (fieldName == kLevelFieldName) {
            std::string levelString;
            // TODO pass field in rather than scanning again.
            auto status = bsonExtractStringField(readConcernObj, kLevelFieldName, &levelString);
            if (!status.isOK()) {
                return status;
            }

            if (levelString == kLocalReadConcernStr) {
                _level = ReadConcernLevel::kLocalReadConcern;
            } else if (levelString == kMajorityReadConcernStr) {
                _level = ReadConcernLevel::kMajorityReadConcern;
            } else if (levelString == kLinearizableReadConcernStr) {
                _level = ReadConcernLevel::kLinearizableReadConcern;
            } else if (levelString == kAvailableReadConcernStr) {
                _level = ReadConcernLevel::kAvailableReadConcern;
            } else if (levelString == kSnapshotReadConcernStr) {
                _level = ReadConcernLevel::kSnapshotReadConcern;
            } else {
                return Status(ErrorCodes::FailedToParse,
                              str::stream() << kReadConcernFieldName << '.' << kLevelFieldName
                                            << " must be either 'local', 'majority', "
                                               "'linearizable', 'available', or 'snapshot'");
            }
            continue;
        }

        // Anything else is rejected
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << "Unrecognized option in " << kReadConcernFieldName
                                    << ": "
                                    << fieldName);
    }

    //
    // Validate the combination of options
    //

    if (_afterClusterTime && _opTime) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << "Can not specify both " << kAfterClusterTimeFieldName
                                    << " and "
                                    << kAfterOpTimeFieldName);
    }

    if (_afterClusterTime && _atClusterTime) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << "Can not specify both " << kAfterClusterTimeFieldName
                                    << " and "
                                    << kAtClusterTimeFieldName);
    }

    // Note: 'available' should not be used with after cluster time, as cluster time can wait for
    // replication whereas the premise of 'available' is to avoid waiting. 'linearizable' should not
    // be used with after cluster time, since linearizable reads are inherently causally consistent.
    if (_afterClusterTime && getLevel() != ReadConcernLevel::kMajorityReadConcern &&
        getLevel() != ReadConcernLevel::kLocalReadConcern &&
        getLevel() != ReadConcernLevel::kSnapshotReadConcern) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << kAfterClusterTimeFieldName << " field can be set only if "
                                    << kLevelFieldName
                                    << " is equal to "
                                    << kMajorityReadConcernStr
                                    << ", "
                                    << kLocalReadConcernStr
                                    << ", or "
                                    << kSnapshotReadConcernStr);
    }

    if (_opTime && getLevel() == ReadConcernLevel::kSnapshotReadConcern) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << kAfterOpTimeFieldName << " field cannot be set if "
                                    << kLevelFieldName
                                    << " is equal to "
                                    << kSnapshotReadConcernStr);
    }

    if (_atClusterTime && getLevel() != ReadConcernLevel::kSnapshotReadConcern) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << kAtClusterTimeFieldName << " field can be set only if "
                                    << kLevelFieldName
                                    << " is equal to "
                                    << kSnapshotReadConcernStr);
    }

    // Neither cluster time may be the uninitialized (null) timestamp
    if (_afterClusterTime && _afterClusterTime == LogicalTime::kUninitialized) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << kAfterClusterTimeFieldName
                                    << " cannot be a null timestamp");
    }

    if (_atClusterTime && _atClusterTime == LogicalTime::kUninitialized) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << kAtClusterTimeFieldName << " cannot be a null timestamp");
    }

    return Status::OK();
}
/**
 * Sets the upsert flag of the request.
 *
 * Debug builds assert that the request has an update (_update).
 */
void FindAndModifyRequest::setUpsert(bool upsert) {
    // Debug-only precondition on _update
    dassert(_update);

    _isUpsert = upsert;
}
/**
 * Sets the "return new" flag of the request.
 *
 * Debug builds assert that the request has an update (_update).
 */
void FindAndModifyRequest::setShouldReturnNew(bool shouldReturnNew) {
    // Debug-only precondition on _update
    dassert(_update);

    _shouldReturnNew = shouldReturnNew;
}
// Installs the replica set configuration 'c' as the active configuration.
//
// Two paths exist:
//   * "additive" reconfig: only new members were added (and possibly tags changed); existing
//     Member objects are kept (or rebuilt if their config changed) and new ones are appended.
//   * full (re)load: all members are orphaned and rebuilt from the new config.
//
// @param txn    operation context, forwarded to loadConfig() and forgetPrimary().
// @param c      the configuration to install; it is copied into a newly allocated ReplSetConfig.
// @param reconf true if this is a reconfiguration and not an initial load of the configuration.
// @return true if ok; throws if config really bad; false if config doesn't include self
bool ReplSetImpl::initFromConfig(OperationContext* txn, ReplSetConfig& c, bool reconf) {
    // NOTE: haveNewConfig() writes the new config to disk before we get here.  So
    //       we cannot error out at this point, except fatally.  Check errors earlier.
    lock lk(this);

    // Only overwrite the stored defaults when either the old or the new defaults are non-empty.
    if (!getLastErrorDefault.isEmpty() || !c.getLastErrorDefaults.isEmpty()) {
        getLastErrorDefault = c.getLastErrorDefaults;
    }

    // Members present in 'c' that have no counterpart (by id) in the current set.
    // NOTE(review): these pointers refer into 'c.members', not into the copy stored in _cfg.
    list<ReplSetConfig::MemberCfg*> newOnes;
    // additive short-cuts the new config setup. If we are just adding a
    // node/nodes and nothing else is changing, this is additive. If it's
    // not a reconfig, we're not adding anything
    bool additive = reconf;
    // Set when an existing member's config differs in any way (including tags) from the new one.
    bool updateConfigs = false;
    {
        unsigned nfound = 0;  // number of new-config members that already exist in the set
        int me = 0;           // number of new-config members that resolve to this node
        for (vector<ReplSetConfig::MemberCfg>::iterator i = c.members.begin();
             i != c.members.end();
             i++) {

            ReplSetConfig::MemberCfg& m = *i;
            if (isSelf(m.h)) {
                me++;
            }

            if (reconf) {
                const Member *old = findById(m._id);
                if (old) {
                    nfound++;
                    verify((int) old->id() == m._id);
                    // A change other than tags on an existing member disqualifies the shortcut.
                    if (!old->config().isSameIgnoringTags(m)) {
                        additive = false;
                    }
                    if (!updateConfigs && old->config() != m) {
                        updateConfigs = true;
                    }
                }
                else {
                    newOnes.push_back(&m);
                }
            }
        }
        if (me == 0) { // we're not in the config -- we must have been removed
            if (state().shunned()) {
                // already took note of our ejection from the set
                // so just sit tight and poll again
                return false;
            }

            _members.orphanAll();

            // kill off rsHealthPoll threads (because they Know Too Much about our past)
            endOldHealthTasks();

            // close sockets to force clients to re-evaluate this member
            MessagingPort::closeAllSockets(0);

            // take note of our ejection
            changeState(MemberState::RS_SHUNNED);

            // go into holding pattern
            log() << "replSet info self not present in the repl set configuration:" << rsLog;
            log() << c.toString() << rsLog;

            loadConfig(txn);  // redo config from scratch
            return false;
        }
        uassert(13302, "replSet error self appears twice in the repl set configuration", me<=1);

        // if we found different members than the original config, reload everything
        // (i.e. some current member is missing from the new config)
        if (reconf && config().members.size() != nfound)
            additive = false;
    }

    // If we are changing chaining rules, we don't want this to be an additive reconfig so that
    // the primary can step down and the sync targets change.
    // TODO: This can be removed once SERVER-5208 is fixed.
    if (reconf && config().chainingAllowed() != c.chainingAllowed()) {
        additive = false;
    }

    // From here on the new configuration is the active one.
    _cfg = new ReplSetConfig(c);

    // config() is same thing but const, so we use that when we can for clarity below
    dassert(&config() == _cfg);
    verify(config().ok());
    verify(_name.empty() || _name == config()._id);
    _name = config()._id;
    verify(!_name.empty());

    // this is a shortcut for simple changes
    if (additive) {
        log() << "replSet info : additive change to configuration" << rsLog;
        if (updateConfigs) {
            // we have new configs for existing members, so we need to repopulate _members
            // with the most recent configs
            _members.orphanAll();

            // for logging
            // NOTE(review): 'members' is built below but never written to the log in this branch.
            string members = "";

            // not setting _self to 0 as other threads use _self w/o locking
            int me = 0;
            for(vector<ReplSetConfig::MemberCfg>::const_iterator i = config().members.begin();
                i != config().members.end(); i++) {
                const ReplSetConfig::MemberCfg& m = *i;
                Member *mi;
                members += (members == "" ? "" : ", ") + m.h.toString();
                if (isSelf(m.h)) {
                    verify(me++ == 0);
                    mi = new Member(m.h, m._id, &m, true);
                    setSelfTo(mi);
                }
                else {
                    mi = new Member(m.h, m._id, &m, false);
                    _members.push(mi);
                }
            }
            // trigger a handshake to update the syncSource of our writeconcern information
            syncSourceFeedback.forwardSlaveHandshake();
        }

        // add any new members
        for (list<ReplSetConfig::MemberCfg*>::const_iterator i = newOnes.begin();
                i != newOnes.end();
                i++) {
            ReplSetConfig::MemberCfg *m = *i;
            Member *mi = new Member(m->h, m->_id, m, false);

            // we will indicate that new members are up() initially so that we don't relinquish
            // our primary state because we can't (transiently) see a majority. they should be
            // up as we check that new members are up before getting here on reconfig anyway.
            mi->get_hbinfo().health = 0.1;

            _members.push(mi);
            startHealthTaskFor(mi);
        }

        // if we aren't creating new members, we may have to update the
        // groups for the current ones
        _cfg->updateMembers(_members);

        return true;
    }

    // Full (non-additive) path.
    // start with no members.  if this is a reconfig, drop the old ones.
    _members.orphanAll();

    endOldHealthTasks();

    // Remember who the primary was (by id) so it can be re-bound to the rebuilt Member below.
    int oldPrimaryId = -1;
    {
        const Member *p = box.getPrimary();
        if (p)
            oldPrimaryId = p->id();
    }
    forgetPrimary(txn);

    // not setting _self to 0 as other threads use _self w/o locking
    int me = 0;

    // For logging
    string members = "";

    for (vector<ReplSetConfig::MemberCfg>::const_iterator i = config().members.begin();
         i != config().members.end();
         i++) {
        const ReplSetConfig::MemberCfg& m = *i;
        Member *mi;
        members += (members == "" ? "" : ", ") + m.h.toString();
        if (isSelf(m.h)) {
            verify(me++ == 0);
            mi = new Member(m.h, m._id, &m, true);
            if (!reconf) {
                log() << "replSet I am " << m.h.toString() << rsLog;
            }
            // Note: the self Member is installed via setSelfTo() and is not pushed onto _members.
            setSelfTo(mi);

            if ((int)mi->id() == oldPrimaryId)
                box.setSelfPrimary(mi);
        }
        else {
            mi = new Member(m.h, m._id, &m, false);
            _members.push(mi);
            if ((int)mi->id() == oldPrimaryId)
                box.setOtherPrimary(mi);
        }
    }

    if (me == 0){
        log() << "replSet warning did not detect own host in full reconfig, members "
              << members << " config: " << c << rsLog;
    }
    else {
        // Do this after we've found ourselves, since _self needs
        // to be set before we can start the heartbeat tasks
        for (Member *mb = _members.head(); mb; mb=mb->next()) {
            startHealthTaskFor(mb);
        }
    }
    return true;
}
/**
 * Starts the deployment service.
 *
 * Steps, in order:
 *   1. read the deployment directory from the "deploy.service" config section
 *      (defaulting to "<app data dir>/services");
 *   2. create one cluster scheduler per key listed under "deploy.service.clusters"
 *      and register it in _clusters by name;
 *   3. register the five CLI commands (deploy, undeploy, service_list, service_info,
 *      cluster_list), each forwarding to the matching on_*_cli member.
 *
 * @return ERR_OK on success;
 *         ERR_CLUSTER_ALREADY_EXIST if two config entries share a cluster name;
 *         ERR_OBJECT_NOT_FOUND if a cluster's factory type cannot be created.
 */
error_code deploy_svc_service_impl::start()
{
    const std::string default_dir =
        utils::filesystem::path_combine(dsn_get_current_app_data_dir(), "services");

    _service_dir = dsn_config_get_value_string(
        "deploy.service",
        "deploy_dir",
        default_dir.c_str(),
        "where to put temporal deployment resources"
        );

    // load clusters
    const char* cluster_sections[100];
    int capacity = 100;
    const int count = dsn_config_get_all_keys("deploy.service.clusters", cluster_sections, &capacity);
    dassert(count <= 100, "too many clusters");

    for (int idx = 0; idx < count; ++idx)
    {
        const char* section = cluster_sections[idx];

        std::string cluster_name = dsn_config_get_value_string(
            section,
            "name",
            "",
            "cluster name"
            );

        // each cluster name may be registered only once
        if (get_cluster(cluster_name) != nullptr)
        {
            derror("cluster %s already defined", cluster_name.c_str());
            return ERR_CLUSTER_ALREADY_EXIST;
        }

        std::string factory_name = dsn_config_get_value_string(
            section,
            "factory",
            "",
            "factory name to create the target cluster scheduler"
            );

        auto sched = ::dsn::utils::factory_store<cluster_scheduler>::create(
            factory_name.c_str(),
            PROVIDER_TYPE_MAIN
            );

        if (sched == nullptr)
        {
            derror("cluster type %s is not defined", factory_name.c_str());
            return ERR_OBJECT_NOT_FOUND;
        }

        std::shared_ptr<cluster_ex> entry(new cluster_ex);
        entry->scheduler.reset(sched);
        entry->cluster.name = cluster_name;
        entry->cluster.type = sched->type();

        _clusters[cluster_name] = entry;
    }

    // CLI registration: every handler receives 'this' as its opaque context and
    // forwards the call to the corresponding member function.
    void* const self = static_cast<void*>(this);

    _cli_deploy = dsn_cli_app_register(
        "deploy",
        "deploy deploy_request(in json format)",
        "deploy an app via our deployment service",
        self,
        [](void *context, int argc, const char **argv, dsn_cli_reply *reply)
        {
            static_cast<deploy_svc_service_impl*>(context)->on_deploy_cli(context, argc, argv, reply);
        },
        __svc_cli_freeer__
        );

    _cli_undeploy = dsn_cli_app_register(
        "undeploy",
        "undeploy service_name(in json format)",
        "undeploy an app via our deployment service",
        self,
        [](void *context, int argc, const char **argv, dsn_cli_reply *reply)
        {
            static_cast<deploy_svc_service_impl*>(context)->on_undeploy_cli(context, argc, argv, reply);
        },
        __svc_cli_freeer__
        );

    _cli_get_service_list = dsn_cli_app_register(
        "service_list",
        "service_list package_id(in json format)",
        "get service list of a package via our deployment service",
        self,
        [](void *context, int argc, const char **argv, dsn_cli_reply *reply)
        {
            static_cast<deploy_svc_service_impl*>(context)->on_get_service_list_cli(context, argc, argv, reply);
        },
        __svc_cli_freeer__
        );

    _cli_get_service_info = dsn_cli_app_register(
        "service_info",
        "service_info service_name(in json format)",
        "get service info of a service via our deployment service",
        self,
        [](void *context, int argc, const char **argv, dsn_cli_reply *reply)
        {
            static_cast<deploy_svc_service_impl*>(context)->on_get_service_info_cli(context, argc, argv, reply);
        },
        __svc_cli_freeer__
        );

    _cli_get_cluster_list = dsn_cli_app_register(
        "cluster_list",
        "cluster_list format(in json format)",
        "get cluster list with a specific format via our deployment service",
        self,
        [](void *context, int argc, const char **argv, dsn_cli_reply *reply)
        {
            static_cast<deploy_svc_service_impl*>(context)->on_get_cluster_list_cli(context, argc, argv, reply);
        },
        __svc_cli_freeer__
        );

    return ERR_OK;
}
void scheduler::schedule() { _is_scheduling = true; check(); // check before schedule while (true) { // run ready workers whenever possible std::vector<int> ready_workers; for (auto& s : _threads) { if ((s->in_continuation && s->is_continuation_ready) || (!s->in_continuation && s->worker->queue()->approx_count() > 0) ) { ready_workers.push_back(s->index); } } if (ready_workers.size() > 0) { int i = dsn_random32(0, (uint32_t)ready_workers.size() - 1); _threads[ready_workers[i]]->runnable.release(); _is_scheduling = false; return; } // otherwise, run the timed tasks uint64_t ts = 0; auto events = _wheel.pop_next_events(ts); if (events) { { utils::auto_lock< ::dsn::utils::ex_lock> l(_lock); _time_ns = ts; } // randomize the events, and see std::random_shuffle(events->begin(), events->end(), [](int n) { return dsn_random32(0, n - 1); }); for (auto e : *events) { if (e.app_task != nullptr) { task* t = e.app_task; { node_scoper ns(t->node()); t->enqueue(); } t->release_ref(); // added by previous t->enqueue from app } else { dassert(e.system_task != nullptr, "app and system tasks cannot be both empty"); e.system_task(); } } delete events; continue; } // wait a moment std::this_thread::sleep_for(std::chrono::milliseconds(100)); } _is_scheduling = false; }
/**
 * Submits one POSIX AIO read or write described by 'aio_tsk'.
 *
 * @param aio_tsk task whose aio context carries the file handle, buffer, size, offset and type.
 * @param async   when true the call returns right after submission and completion is reported
 *                through the aio_completed notification; when false the call blocks on a
 *                notify_event until the operation has completed.
 * @param pbytes  optional out-parameter (may be nullptr). In synchronous mode it receives the
 *                byte count recorded on the aio context once the operation has completed.
 *
 * @return ERR_IO_PENDING for a successfully submitted asynchronous request;
 *         the error recorded on the aio context for a completed synchronous request;
 *         ERR_FILE_OPERATION_FAILED if submission fails (for asynchronous requests
 *         complete_io() is also invoked with that error).
 */
error_code native_posix_aio_provider::aio_internal(aio_task* aio_tsk, bool async, __out_param uint32_t* pbytes /*= nullptr*/)
{
    auto aio = (posix_disk_aio_context *)aio_tsk->aio();

    // Start from "failed" so an unknown aio type can never be mistaken for a
    // successful submission.
    int r = -1;

    aio->this_ = this;
    aio->cb.aio_fildes = static_cast<int>((ssize_t)aio->file);
    aio->cb.aio_buf = aio->buffer;
    aio->cb.aio_nbytes = aio->buffer_size;
    aio->cb.aio_offset = aio->file_offset;

    // set up callback
    aio->cb.aio_sigevent.sigev_notify = SIGEV_THREAD;
    aio->cb.aio_sigevent.sigev_notify_function = aio_completed;
    aio->cb.aio_sigevent.sigev_notify_attributes = nullptr;
    aio->cb.aio_sigevent.sigev_value.sival_ptr = aio;

    if (!async)
    {
        // synchronous callers wait on this event below
        aio->evt = new utils::notify_event();
        aio->err = ERR_OK;
        aio->bytes = 0;
    }

    switch (aio->type)
    {
    case AIO_Read:
        r = aio_read(&aio->cb);
        break;
    case AIO_Write:
        r = aio_write(&aio->cb);
        break;
    default:
        dassert(false, "unknown aio type %d", static_cast<int>(aio->type));
        // If the assertion does not abort, report the request as invalid
        // through the normal failure path below.
        errno = EINVAL;
        break;
    }

    if (r != 0)
    {
        // capture errno before any other call has a chance to overwrite it
        const int err = errno;
        derror("file op failed, err = %d (%s). On FreeBSD, you may need to load"
               " aio kernel module by running 'sudo kldload aio'.", err, strerror(err));

        if (async)
        {
            complete_io(aio_tsk, ERR_FILE_OPERATION_FAILED, 0);
        }
        else
        {
            delete aio->evt;
            aio->evt = nullptr;
        }
        return ERR_FILE_OPERATION_FAILED;
    }

    if (async)
    {
        return ERR_IO_PENDING;
    }

    // synchronous path: block until the event is signalled
    aio->evt->wait();
    delete aio->evt;
    aio->evt = nullptr;

    // 'pbytes' defaults to nullptr, so only write through it when supplied
    if (pbytes != nullptr)
    {
        *pbytes = aio->bytes;
    }
    return aio->err;
}
/**
 * Validates a $rename against the document rooted at 'root' and records what was found.
 *
 * @param root         root element of the document being updated.
 * @param matchedField must be empty; rename does not support the positional ($) operator.
 * @param execInfo     out: 'noOp' is set to true when the full source path does not exist;
 *                     otherwise fieldRef[0]/fieldRef[1] are set to the from/to field refs and
 *                     'noOp' is set to false.
 *
 * @return Status::OK() when the rename can proceed or is a no-op;
 *         the findLongestPrefix() status when the source lookup fails with PathNotViable or
 *         the destination lookup fails with anything other than NonExistentPath;
 *         BadValue when an array appears in the ancestry of the source or destination.
 */
Status ModifierRename::prepare(mutablebson::Element root,
                               const StringData& matchedField,
                               ExecInfo* execInfo) {
    // Rename doesn't work with positional fields ($)
    dassert(matchedField.empty());

    _preparedState.reset(new PreparedState(root));

    // Locate the "from" field in 'root'. The rename only applies when the whole path exists.
    size_t fromIdxFound;
    Status status = pathsupport::findLongestPrefix(_fromFieldRef,
                                                   root,
                                                   &fromIdxFound,
                                                   &_preparedState->fromElemFound);

    // findLongestPrefix() can succeed after matching only a leading part of the path (the
    // destination handling below relies on that), so require that the last path component
    // was reached. 'fromIdxFound' is only read when the lookup succeeded.
    const bool sourceExists = status.isOK() &&
                              _preparedState->fromElemFound.ok() &&
                              (fromIdxFound == (_fromFieldRef.numParts() - 1));

    // If we can't find the full element in the from field then we can't do anything.
    if (!sourceExists) {
        execInfo->noOp = true;
        _preparedState->fromElemFound = root.getDocument().end();

        // TODO: remove this special case from existing behavior
        if (status.code() == ErrorCodes::PathNotViable) {
            return status;
        }

        return Status::OK();
    }

    // Ensure no array in ancestry if what we found is not at the root
    mutablebson::Element curr = _preparedState->fromElemFound.parent();
    while (curr.ok() && (curr != curr.getDocument().root())) {
        if (curr.getType() == Array)
            return Status(ErrorCodes::BadValue,
                          str::stream() << "The source field cannot be an array element, '"
                          << _fromFieldRef.dottedField() << "' in doc with "
                          << findElementNamed(root.leftChild(), "_id").toString()
                          << " has an array field called '" << curr.getFieldName() << "'");
        curr = curr.parent();
    }

    // "To" side validation below

    status = pathsupport::findLongestPrefix(_toFieldRef,
                                            root,
                                            &_preparedState->toIdxFound,
                                            &_preparedState->toElemFound);

    // FindLongestPrefix may return not viable or any other error and then we cannot proceed.
    // NonExistentPath is not an error condition as we will create the "to" path as needed.
    if (!status.isOK() && status.code() != ErrorCodes::NonExistentPath) {
        return status;
    }

    const bool destExists = _preparedState->toElemFound.ok() &&
                            (_preparedState->toIdxFound == (_toFieldRef.numParts()-1));

    // Ensure no array in ancestry of "to" Element
    // Set to either parent, or node depending on if the full path element was found
    curr = (destExists ? _preparedState->toElemFound.parent() : _preparedState->toElemFound);
    if (curr != curr.getDocument().root()) {
        while (curr.ok()) {
            if (curr.getType() == Array)
                return Status(ErrorCodes::BadValue,
                              str::stream()
                              << "The destination field cannot be an array element, '"
                              // report the destination path; this check is about the "to" side
                              << _toFieldRef.dottedField() << "' in doc with "
                              << findElementNamed(root.leftChild(), "_id").toString()
                              << " has an array field called '" << curr.getFieldName() << "'");
            curr = curr.parent();
        }
    }

    // We register interest in the field name. The driver needs this info to sort out if
    // there is any conflict among mods.
    execInfo->fieldRef[0] = &_fromFieldRef;
    execInfo->fieldRef[1] = &_toFieldRef;

    execInfo->noOp = false;

    return Status::OK();
}
/**
 * Reports the '_hasMinimumLoggedSeverity' flag stored for 'component'.
 *
 * Debug builds assert that 'component' is within [0, LogComponent::kNumLogComponents).
 */
bool LogComponentSettings::hasMinimumLogSeverity(LogComponent component) const {
    dassert(int(component) >= 0 && int(component) < LogComponent::kNumLogComponents);

    const bool isSet = _hasMinimumLoggedSeverity[component];
    return isSet;
}
// fromBSON to Key format
//
// Builds the compact key representation of 'obj' into the builder 'b', one element at a time.
// Every element starts with a type byte; cHASMORE is OR'd into that byte when another element
// follows. Whenever an element cannot be encoded by one of the cases below, the constructor
// hands the whole object to traditional(obj) and returns immediately.
KeyV1Owned::KeyV1Owned(const BSONObj& obj) {
    BSONObj::iterator i(obj);
    unsigned char bits = 0;
    while (1) {
        BSONElement e = i.next();
        // after next(), more() tells us whether 'e' is the last element
        if (i.more())
            bits |= cHASMORE;
        switch (e.type()) {
            // value-less types: the type byte is the whole encoding
            case MinKey:
                b.appendUChar(cminkey | bits);
                break;
            case jstNULL:
                b.appendUChar(cnull | bits);
                break;
            case MaxKey:
                b.appendUChar(cmaxkey | bits);
                break;
            case Bool:
                // the boolean value is folded into the type byte itself
                b.appendUChar((e.boolean() ? ctrue : cfalse) | bits);
                break;
            case jstOID:
                b.appendUChar(coid | bits);
                b.appendBuf(e.__oid().view().view(), OID::kOIDSize);
                break;
            case BinData: {
                int t = e.binDataType();
                // 0-7 and 0x80 to 0x87 are supported by Key
                if ((t & 0x78) == 0 && t != ByteArrayDeprecated) {
                    int len;
                    const char* d = e.binData(len);
                    if (len <= BinDataLenMax) {
                        // a negative code means this length has no compact encoding
                        int code = BinDataLengthToCode[len];
                        if (code >= 0) {
                            // map subtypes 0x80-0x87 onto 0x08-0x0f so that the subtype and
                            // the length code fit together in one byte
                            if (t >= 128)
                                t = (t - 128) | 0x08;
                            dassert((code & t) == 0);
                            b.appendUChar(cbindata | bits);
                            b.appendUChar(code | t);
                            b.appendBuf(d, len);
                            // leaves the switch, not just the nested ifs
                            break;
                        }
                    }
                }
                // unsupported subtype or length
                traditional(obj);
                return;
            }
            case Date:
                b.appendUChar(cdate | bits);
                b.appendStruct(e.date());
                break;
            case String: {
                b.appendUChar(cstring | bits);
                // note we do not store the terminating null, to save space.
                unsigned x = (unsigned)e.valuestrsize() - 1;
                // the length is stored in a single byte
                // NOTE(review): the type byte was already appended above when this fallback
                // fires; presumably traditional() does not depend on the contents of 'b' - confirm.
                if (x > 255) {
                    traditional(obj);
                    return;
                }
                b.appendUChar(x);
                b.appendBuf(e.valuestr(), x);
                break;
            }
            case NumberInt:
                // ints are stored as doubles
                b.appendUChar(cint | bits);
                b.appendNum((double)e._numberInt());
                break;
            case NumberLong: {
                long long n = e._numberLong();
                // m == 2^53
                long long m = 2LL << 52;
                DEV {
                    // sanity check: -(2^53 - 1) survives a round trip through double
                    long long d = m - 1;
                    verify(((long long)((double)-d)) == -d);
                }
                if (n >= m || n <= -m) {
                    // can't represent exactly as a double
                    traditional(obj);
                    return;
                }
                b.appendUChar(clong | bits);
                b.appendNum((double)n);
                break;
            }
            case NumberDouble: {
                double d = e._numberDouble();
                // NaN is not given a compact encoding
                if (isNaN(d)) {
                    traditional(obj);
                    return;
                }
                b.appendUChar(cdouble | bits);
                b.appendNum(d);
                break;
            }
            default:
                // if other types involved, store as traditional BSON
                traditional(obj);
                return;
        }
        if (!i.more())
            break;
        // reset the flag bits for the next element
        bits = 0;
    }
    // the key data points directly at the builder's buffer
    _keyData = (const unsigned char*)b.buf();
    dassert(b.len() == dataSize());  // check datasize method is correct
    dassert((*_keyData & cNOTUSED) == 0);
}
/**
 * Returns the value stored in '_minimumLoggedSeverity' for 'component', converted to a
 * LogSeverity via LogSeverity::cast().
 *
 * Debug builds assert that 'component' is within [0, LogComponent::kNumLogComponents).
 */
LogSeverity LogComponentSettings::getMinimumLogSeverity(LogComponent component) const {
    dassert(int(component) >= 0 && int(component) < LogComponent::kNumLogComponents);

    const int storedLevel = _minimumLoggedSeverity[component];
    return LogSeverity::cast(storedLevel);
}
/**
 * Adds every id in '_shardIds' to the caller-supplied set.
 *
 * Existing contents of '*all' are kept; the ids are inserted on top of them.
 *
 * @param all output set; must not be null (asserted in debug builds).
 */
void ChunkManager::getAllShardIds(set<ShardId>* all) const {
    dassert(all);

    set<ShardId>& out = *all;
    out.insert(_shardIds.begin(), _shardIds.end());
}
void BatchWriteExec::executeBatch( const BatchedCommandRequest& clientRequest, BatchedCommandResponse* clientResponse ) { BatchWriteOp batchOp; batchOp.initClientRequest( &clientRequest ); // Current batch status bool refreshedTargeter = false; int rounds = 0; int numCompletedOps = 0; int numRoundsWithoutProgress = 0; while ( !batchOp.isFinished() ) { // // Get child batches to send using the targeter // // Targeting errors can be caused by remote metadata changing (the collection could have // been dropped and recreated, for example with a new shard key). If a remote metadata // change occurs *before* a client sends us a batch, we need to make sure that we don't // error out just because we're staler than the client - otherwise mongos will be have // unpredictable behavior. // // (If a metadata change happens *during* or *after* a client sends us a batch, however, // we make no guarantees about delivery.) // // For this reason, we don't record targeting errors until we've refreshed our targeting // metadata at least once *after* receiving the client batch - at that point, we know: // // 1) our new metadata is the same as the metadata when the client sent a batch, and so // targeting errors are real. // OR // 2) our new metadata is a newer version than when the client sent a batch, and so // the metadata must have changed after the client batch was sent. We don't need to // deliver in this case, since for all the client knows we may have gotten the batch // exactly when the metadata changed. // OwnedPointerVector<TargetedWriteBatch> childBatchesOwned; vector<TargetedWriteBatch*>& childBatches = childBatchesOwned.mutableVector(); // If we've already had a targeting error, we've refreshed the metadata once and can // record target errors definitively. 
bool recordTargetErrors = refreshedTargeter; Status targetStatus = batchOp.targetBatch( *_targeter, recordTargetErrors, &childBatches ); if ( !targetStatus.isOK() ) { // Don't do anything until a targeter refresh _targeter->noteCouldNotTarget(); refreshedTargeter = true; ++_stats->numTargetErrors; dassert( childBatches.size() == 0u ); } // // Send all child batches // size_t numSent = 0; size_t numToSend = childBatches.size(); bool remoteMetadataChanging = false; while ( numSent != numToSend ) { // Collect batches out on the network, mapped by endpoint HostBatchMap pendingBatches; // // Send side // // Get as many batches as we can at once for ( vector<TargetedWriteBatch*>::iterator it = childBatches.begin(); it != childBatches.end(); ++it ) { // // Collect the info needed to dispatch our targeted batch // TargetedWriteBatch* nextBatch = *it; // If the batch is NULL, we sent it previously, so skip if ( nextBatch == NULL ) continue; // Figure out what host we need to dispatch our targeted batch ConnectionString shardHost; Status resolveStatus = _resolver->chooseWriteHost( nextBatch->getEndpoint() .shardName, &shardHost ); if ( !resolveStatus.isOK() ) { ++_stats->numResolveErrors; // Record a resolve failure // TODO: It may be necessary to refresh the cache if stale, or maybe just // cancel and retarget the batch WriteErrorDetail error; buildErrorFrom( resolveStatus, &error ); batchOp.noteBatchError( *nextBatch, error ); // We're done with this batch *it = NULL; --numToSend; continue; } // If we already have a batch for this host, wait until the next time HostBatchMap::iterator pendingIt = pendingBatches.find( shardHost ); if ( pendingIt != pendingBatches.end() ) continue; // // We now have all the info needed to dispatch the batch // BatchedCommandRequest request( clientRequest.getBatchType() ); batchOp.buildBatchRequest( *nextBatch, &request ); // Internally we use full namespaces for request/response, but we send the // command to a database with the collection 
name in the request. NamespaceString nss( request.getNS() ); request.setNS( nss.coll() ); _dispatcher->addCommand( shardHost, nss.db(), request ); // Indicate we're done by setting the batch to NULL // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast // endpoints for the same host, so this should be pretty efficient without // moving stuff around. *it = NULL; // Recv-side is responsible for cleaning up the nextBatch when used pendingBatches.insert( make_pair( shardHost, nextBatch ) ); } // Send them all out _dispatcher->sendAll(); numSent += pendingBatches.size(); // // Recv side // while ( _dispatcher->numPending() > 0 ) { // Get the response ConnectionString shardHost; BatchedCommandResponse response; Status dispatchStatus = _dispatcher->recvAny( &shardHost, &response ); // Get the TargetedWriteBatch to find where to put the response dassert( pendingBatches.find( shardHost ) != pendingBatches.end() ); TargetedWriteBatch* batch = pendingBatches.find( shardHost )->second; if ( dispatchStatus.isOK() ) { TrackedErrors trackedErrors; trackedErrors.startTracking( ErrorCodes::StaleShardVersion ); // Dispatch was ok, note response batchOp.noteBatchResponse( *batch, response, &trackedErrors ); // Note if anything was stale const vector<ShardError*>& staleErrors = trackedErrors.getErrors( ErrorCodes::StaleShardVersion ); if ( staleErrors.size() > 0 ) { noteStaleResponses( staleErrors, _targeter ); ++_stats->numStaleBatches; } // Remember if the shard is actively changing metadata right now if ( isShardMetadataChanging( staleErrors ) ) { remoteMetadataChanging = true; } // Remember that we successfully wrote to this shard // NOTE: This will record lastOps for shards where we actually didn't update // or delete any documents, which preserves old behavior but is conservative _stats->noteWriteAt( shardHost, response.isLastOpSet() ? response.getLastOp() : OpTime(), response.isElectionIdSet() ? 
response.getElectionId() : OID()); } else { // Error occurred dispatching, note it stringstream msg; msg << "write results unavailable from " << shardHost.toString() << causedBy( dispatchStatus.toString() ); WriteErrorDetail error; buildErrorFrom( Status( ErrorCodes::RemoteResultsUnavailable, msg.str() ), &error ); batchOp.noteBatchError( *batch, error ); } } } ++rounds; ++_stats->numRounds; // If we're done, get out if ( batchOp.isFinished() ) break; // MORE WORK TO DO // // Refresh the targeter if we need to (no-op if nothing stale) // bool targeterChanged = false; Status refreshStatus = _targeter->refreshIfNeeded( &targeterChanged ); if ( !refreshStatus.isOK() ) { // It's okay if we can't refresh, we'll just record errors for the ops if // needed. warning() << "could not refresh targeter" << causedBy( refreshStatus.reason() ) << endl; } // // Ensure progress is being made toward completing the batch op // int currCompletedOps = batchOp.numWriteOpsIn( WriteOpState_Completed ); if ( currCompletedOps == numCompletedOps && !targeterChanged && !remoteMetadataChanging ) { ++numRoundsWithoutProgress; } else { numRoundsWithoutProgress = 0; } numCompletedOps = currCompletedOps; if ( numRoundsWithoutProgress > kMaxRoundsWithoutProgress ) { stringstream msg; msg << "no progress was made executing batch write op in " << clientRequest.getNS() << " after " << kMaxRoundsWithoutProgress << " rounds (" << numCompletedOps << " ops completed in " << rounds << " rounds total)"; WriteErrorDetail error; buildErrorFrom( Status( ErrorCodes::NoProgressMade, msg.str() ), &error ); batchOp.abortBatch( error ); break; } } batchOp.buildClientResponse( clientResponse ); }
void Socket::handleSendError(int ret, const char* context) { #if defined(_WIN32) const int mongo_errno = WSAGetLastError(); if ( mongo_errno == WSAETIMEDOUT && _timeout != 0 ) { #else const int mongo_errno = errno; if ( ( mongo_errno == EAGAIN || mongo_errno == EWOULDBLOCK ) && _timeout != 0 ) { #endif LOG(_logLevel) << "Socket " << context << " send() timed out " << remoteString() << endl; throw SocketException(SocketException::SEND_TIMEOUT , remoteString()); } else { LOG(_logLevel) << "Socket " << context << " send() " << errnoWithDescription(mongo_errno) << ' ' << remoteString() << endl; throw SocketException(SocketException::SEND_ERROR , remoteString()); } } void Socket::handleRecvError(int ret, int len) { if (ret == 0) { LOG(3) << "Socket recv() conn closed? " << remoteString() << endl; throw SocketException(SocketException::CLOSED , remoteString()); } // ret < 0 #if defined(_WIN32) int e = WSAGetLastError(); #else int e = errno; # if defined(EINTR) if (e == EINTR) { LOG(_logLevel) << "EINTR returned from recv(), retrying"; return; } # endif #endif #if defined(_WIN32) // Windows if ((e == EAGAIN || e == WSAETIMEDOUT) && _timeout > 0) { #else if (e == EAGAIN && _timeout > 0) { #endif // this is a timeout LOG(_logLevel) << "Socket recv() timeout " << remoteString() <<endl; throw SocketException(SocketException::RECV_TIMEOUT, remoteString()); } LOG(_logLevel) << "Socket recv() " << errnoWithDescription(e) << " " << remoteString() <<endl; throw SocketException(SocketException::RECV_ERROR , remoteString()); } void Socket::setTimeout( double secs ) { setSockTimeouts( _fd, secs ); } // TODO: allow modification? // // <positive value> : secs to wait between stillConnected checks // 0 : always check // -1 : never check const int Socket::errorPollIntervalSecs( 5 ); // Patch to allow better tolerance of flaky network connections that get broken // while we aren't looking. // TODO: Remove when better async changes come. 
//
// isStillConnected() polls the socket at max every Socket::errorPollIntervalSecs to determine
// if any disconnection-type events have happened on the socket.
//
// Returns true when the connection is assumed to still be usable (including every case where
// no check could be or needed to be performed), false when the fail point is active, the
// descriptor is unset, or poll reported any event on the socket.
bool Socket::isStillConnected() {
    // Test hook: force the "disconnected" answer.
    if (MONGO_FAIL_POINT(notStillConnected)) {
        return false;
    }

    if (_fd == -1) {
        // According to the man page, poll will respond with POLLNVAL for invalid or
        // unopened descriptors, but it doesn't seem to be properly implemented in
        // some platforms - it can return 0 events and 0 for revent. Hence this workaround.
        return false;
    }

    // A negative interval disables the check entirely.
    if ( errorPollIntervalSecs < 0 ) return true;
    if ( ! isPollSupported() ) return true; // nothing we can do

    time_t now = time( 0 );
    time_t idleTimeSecs = now - _lastValidityCheckAtSecs;

    // Only check once every errorPollIntervalSecs (currently 5) secs
    if ( idleTimeSecs < errorPollIntervalSecs ) return true;
    // Reset our timer, we're checking the connection
    _lastValidityCheckAtSecs = now;

    // It's been long enough, poll to see if our socket is still connected

    pollfd pollInfo;
    pollInfo.fd = _fd;
    // We only care about reading the EOF message on clean close (and errors)
    pollInfo.events = POLLIN;

    // Poll( info[], size, timeout ) - timeout == 0 => nonblocking
    int nEvents = socketPoll( &pollInfo, 1, 0 );

    LOG( 2 ) << "polling for status of connection to " << remoteString()
             << ", " << ( nEvents == 0 ? "no events" :
                          nEvents == -1 ? "error detected" : "event detected" ) << endl;

    if ( nEvents == 0 ) {
        // No events incoming, return still connected AFAWK
        return true;
    }
    else if ( nEvents < 0 ) {
        // Poll itself failed, this is weird, warn and log errno
        warning() << "Socket poll() failed during connectivity check"
                  << " (idle " << idleTimeSecs << " secs,"
                  << " remote host " << remoteString() << ")"
                  << causedBy(errnoWithDescription()) << endl;

        // Return true since it's not clear that we're disconnected.
        return true;
    }

    // Exactly one descriptor was polled, so a positive result must be 1.
    dassert( nEvents == 1 );
    dassert( pollInfo.revents > 0 );

    // Return false at this point, some event happened on the socket, but log what the
    // actual event was.

    if ( pollInfo.revents & POLLIN ) {

        // There shouldn't really be any data to recv here, so make sure this
        // is a clean hangup.

        const int testBufLength = 1024;
        char testBuf[testBufLength];

        // NOTE(review): whether this read consumes the pending bytes depends on portRecvFlags,
        // which is defined elsewhere - confirm.
        int recvd = ::recv( _fd, testBuf, testBufLength, portRecvFlags );

        if ( recvd < 0 ) {
            // An error occurred during recv, warn and log errno
            warning() << "Socket recv() failed during connectivity check"
                      << " (idle " << idleTimeSecs << " secs,"
                      << " remote host " << remoteString() << ")"
                      << causedBy(errnoWithDescription()) << endl;
        }
        else if ( recvd > 0 ) {
            // We got nonzero data from this socket, very weird?
            // Log and warn at runtime, log and abort at devtime
            // TODO: Dump the data to the log somehow?
            error() << "Socket found pending " << recvd
                    << " bytes of data during connectivity check"
                    << " (idle " << idleTimeSecs << " secs,"
                    << " remote host " << remoteString() << ")" << endl;
            dassert( false );
        }
        else {
            // recvd == 0, socket closed remotely, just return false
            LOG( 0 ) << "Socket closed remotely, no longer connected"
                     << " (idle " << idleTimeSecs << " secs,"
                     << " remote host " << remoteString() << ")" << endl;
        }
    }
    else if ( pollInfo.revents & POLLHUP ) {
        // A hangup has occurred on this socket
        LOG( 0 ) << "Socket hangup detected, no longer connected" << " (idle "
                 << idleTimeSecs << " secs," << " remote host " << remoteString() << ")"
                 << endl;
    }
    else if ( pollInfo.revents & POLLERR ) {
        // An error has occurred on this socket
        LOG( 0 ) << "Socket error detected, no longer connected" << " (idle "
                 << idleTimeSecs << " secs," << " remote host " << remoteString() << ")"
                 << endl;
    }
    else if ( pollInfo.revents & POLLNVAL ) {
        // Socket descriptor itself is weird
        // Log and warn at runtime, log and abort at devtime
        error() << "Socket descriptor detected as invalid"
                << " (idle " << idleTimeSecs << " secs,"
                << " remote host " << remoteString() << ")" << endl;
        dassert( false );
    }
    else {
        // Don't know what poll is saying here
        // Log and warn at runtime, log and abort at devtime
        error() << "Socket had unknown event (" << static_cast<int>(pollInfo.revents) << ")"
                << " (idle " << idleTimeSecs << " secs,"
                << " remote host " << remoteString() << ")" << endl;
        dassert( false );
    }

    return false;
}

#if defined(_WIN32)
// Windows only: initializes Winsock (requesting version 2.2) during process initialization.
// The process aborts if WSAStartup fails.
MONGO_INITIALIZER(SockWSAStartup)(InitializerContext * context) {
    WSADATA d;
    if ( WSAStartup(MAKEWORD(2,2), &d) != 0 ) {
        log() << "ERROR: wsastartup failed " << errnoWithDescription() << endl;
        abort();
    }

    return Status::OK();
}
/**
 * Builds a range from the two supplied bounds, taking ownership of both by move.
 *
 * Debug builds assert that the stored minimum compares strictly less than the stored maximum
 * under SimpleBSONObjComparator.
 */
ChunkRange::ChunkRange(BSONObj minKey, BSONObj maxKey)
    : _minKey(std::move(minKey)),
      _maxKey(std::move(maxKey)) {

    // Compare the members: the parameters have been moved from at this point.
    dassert(SimpleBSONObjComparator::kInstance.evaluate(_minKey < _maxKey));
}
/**
 * Populates this set with every pointer contained in 'fields'.
 *
 * The set is expected to be empty when this is called (checked with dassert).
 * Only the pointers are stored; the FieldRef objects themselves are not copied.
 */
void FieldRefSet::fillFrom(const std::vector<FieldRef*>& fields) {
    dassert(_fieldSet.empty());

    for (std::vector<FieldRef*>::const_iterator field = fields.begin();
         field != fields.end();
         ++field) {
        _fieldSet.insert(*field);
    }
}
/**
 * Executes the update described by 'request' against the request's namespace in 'db'.
 *
 * The function runs in two phases:
 *
 *  1. Update phase: a Runner is obtained for the request's query and every document it
 *     returns is handed to 'driver', which applies the modifications either in place
 *     (as "damages") or by producing a whole new document. Unless the request is a
 *     multi-update, the loop stops after the first document.
 *
 *  2. Upsert phase: reached only when no document matched and the request is an upsert.
 *     A base document is built (from the canonical query, or from the query's _id
 *     element), the modifications are applied to it in INSERT_CONTEXT, and the result is
 *     inserted, creating the collection first if it does not exist.
 *
 * @param txn      operation context passed to the collection write calls and to logOp; its
 *                 recovery unit is given a chance to commit after each updated document of
 *                 a multi-update.
 * @param db       database used to look up (and, for upserts, create) the collection.
 * @param request  the update request (namespace, query, update spec, multi/upsert flags...).
 * @param opDebug  receives 'updateobj' and the nscanned / nscannedObjects / nModified /
 *                 nMatched counters, plus the fastmod / fastmodinsert / upsert flags.
 * @param driver   update driver that holds the working document and applies the mods.
 * @param cq       optional canonical query. It is wrapped in an auto_ptr on entry and, when
 *                 non-NULL, released into getRunner(). When NULL, getRunner() is called with
 *                 the raw query and '&cq' (presumably an out-parameter through which
 *                 getRunner may hand back a query it built -- confirm against getRunner).
 *
 * @return an UpdateResult describing whether existing documents were updated, whether the
 *         update was a $mod or a replacement, how many documents were modified/matched,
 *         and, for an upsert, the object that was inserted.
 *
 * Failures are reported through uassert / uassertStatusOK / uasserted (codes 17243,
 * ErrorCodes::NotMaster, 16837, 17419, 17278, 16836, 17420, plus any non-OK Status from
 * the validation helpers) and through fassert (17354, 17352) on the upsert path.
 */
UpdateResult update(
        OperationContext* txn,
        Database* db,
        const UpdateRequest& request,
        OpDebug* opDebug,
        UpdateDriver* driver,
        CanonicalQuery* cq) {

    LOG(3) << "processing update : " << request;

    // Holds 'cq' until it is released into getRunner() below (holds NULL if none was given).
    std::auto_ptr<CanonicalQuery> cqHolder(cq);
    const NamespaceString& nsString = request.getNamespaceString();
    UpdateLifecycle* lifecycle = request.getLifecycle();

    // The result is tested for NULL before the upsert insert at the bottom of this function.
    Collection* collection = db->getCollection(nsString.ns());

    validateUpdate(nsString.ns().c_str(), request.getUpdates(), request.getQuery());

    // TODO: This seems a bit circuitous.
    opDebug->updateobj = request.getUpdates();

    if (lifecycle) {
        lifecycle->setCollection(collection);
        driver->refreshIndexKeys(lifecycle->getIndexKeys());
    }

    // Use the caller's canonical query if there is one; otherwise let getRunner work from
    // the raw query object.
    Runner* rawRunner;
    Status status = cq ?
        getRunner(collection, cqHolder.release(), &rawRunner) :
        getRunner(collection, nsString.ns(), request.getQuery(), &rawRunner, &cq);
    uassert(17243,
            "could not get runner " + request.getQuery().toString() + "; " + causedBy(status),
            status.isOK());

    // Create the runner and setup all deps.
    auto_ptr<Runner> runner(rawRunner);

    // Register Runner with ClientCursor
    const ScopedRunnerRegistration safety(runner.get());

    //
    // We'll start assuming we have one or more documents for this update. (Otherwise,
    // we'll fall-back to insert case (if upsert is true).)
    //

    // We are an update until we fall into the insert case below.
    driver->setContext(ModifierInterface::ExecInfo::UPDATE_CONTEXT);

    int numMatched = 0;

    // If the update was in-place, we may see it again.  This only matters if we're doing
    // a multi-update; if we're not doing a multi-update we stop after one update and we
    // won't see any more docs.
    //
    // For example: If we're scanning an index {x:1} and performing {$inc:{x:5}}, we'll keep
    // moving the document forward and it will continue to reappear in our index scan.
    // Unless the index is multikey, the underlying query machinery won't de-dup.
    //
    // If the update wasn't in-place we may see it again.  Our query may return the new
    // document and we wouldn't want to update that.
    //
    // So, no matter what, we keep track of where the doc wound up.
    typedef unordered_set<DiskLoc, DiskLoc::Hasher> DiskLocSet;
    const scoped_ptr<DiskLocSet> updatedLocs(request.isMulti() ? new DiskLocSet : NULL);

    // Reset these counters on each call. We might re-enter this function to retry this
    // update if we throw a page fault exception below, and we rely on these counters
    // reflecting only the actions taken locally. In particular, we must have the no-op
    // counter reset so that we can meaningfully compare it with numMatched above.
    opDebug->nscanned = 0;
    opDebug->nscannedObjects = 0;
    opDebug->nModified = 0;

    // Get the cached document from the update driver.
    mutablebson::Document& doc = driver->getDocument();
    mutablebson::DamageVector damages;

    // Used during iteration of docs
    BSONObj oldObj;

    // Last state reported by the runner; the first document is fetched inside the loop.
    Runner::RunnerState state = Runner::RUNNER_ADVANCED;

    // Refuse to proceed if this update must be logged but isMasterNs() says no for this ns.
    uassert(ErrorCodes::NotMaster,
            mongoutils::str::stream() << "Not primary while updating " << nsString.ns(),
            !request.shouldCallLogOp() || isMasterNs(nsString.ns().c_str()));

    while (true) {
        // Get next doc, and location
        DiskLoc loc;
        state = runner->getNext(&oldObj, &loc);

        if (state != Runner::RUNNER_ADVANCED) {
            if (state == Runner::RUNNER_EOF) {
                // We have reached the logical end of the loop, so do yielding recovery
                break;
            }
            else {
                // Any other state is reported to the caller as an InternalError.
                uassertStatusOK(Status(ErrorCodes::InternalError,
                                       str::stream() << " Update query failed -- "
                                                     << Runner::statestr(state)));
            }
        }

        // We fill this with the new locs of moved doc so we don't double-update.
        if (updatedLocs && updatedLocs->count(loc) > 0) {
            continue;
        }

        // We count how many documents we scanned even though we may skip those that are
        // deemed duplicated. The final 'numMatched' and 'nscanned' numbers may differ for
        // that reason.
        // TODO: Do we want to pull this out of the underlying query plan?
        opDebug->nscanned++;

        // Found a matching document
        opDebug->nscannedObjects++;
        numMatched++;

        // Ask the driver to apply the mods. It may be that the driver can apply those "in
        // place", that is, some values of the old document just get adjusted without any
        // change to the binary layout on the bson layer. It may be that a whole new
        // document is needed to accommodate the new bson layout of the resulting document.
        doc.reset(oldObj, mutablebson::Document::kInPlaceEnabled);
        BSONObj logObj;

        FieldRefSet updatedFields;

        // NOTE: this loop-local 'status' shadows the function-level 'status' declared above.
        Status status = Status::OK();
        if (!driver->needMatchDetails()) {
            // If we don't need match details, avoid doing the rematch
            status = driver->update(StringData(), &doc, &logObj, &updatedFields);
        }
        else {
            // If there was a matched field, obtain it.
            MatchDetails matchDetails;
            matchDetails.requestElemMatchKey();

            dassert(cq);
            verify(cq->root()->matchesBSON(oldObj, &matchDetails));

            string matchedField;
            if (matchDetails.hasElemMatchKey())
                matchedField = matchDetails.elemMatchKey();

            // TODO: Right now, each mod checks in 'prepare' that if it needs positional
            // data, that a non-empty StringData() was provided. In principle, we could do
            // that check here in an else clause to the above conditional and remove the
            // checks from the mods.

            status = driver->update(matchedField, &doc, &logObj, &updatedFields);
        }

        if (!status.isOK()) {
            uasserted(16837, status.reason());
        }

        // Ensure _id exists and is first
        uassertStatusOK(ensureIdAndFirst(doc));

        // If the driver applied the mods in place, we can ask the mutable for what
        // changed. We call those changes "damages". :) We use the damages to inform the
        // journal what was changed, and then apply them to the original document
        // ourselves. If, however, the driver applied the mods out of place, we ask it to
        // generate a new, modified document for us. In that case, the file manager will
        // take care of the journaling details for us.
        //
        // This code flow is admittedly odd. But, right now, journaling is baked in the file
        // manager. And if we aren't using the file manager, we have to do journaling
        // ourselves.
        bool docWasModified = false;
        BSONObj newObj;
        const char* source = NULL;
        bool inPlace = doc.getInPlaceUpdates(&damages, &source);

        // If something changed in the document, verify that no immutable fields were changed
        // and data is valid for storage.
        if ((!inPlace || !damages.empty()) ) {
            // Updates coming from replication or migration skip this validation.
            if (!(request.isFromReplication() || request.isFromMigration())) {
                const std::vector<FieldRef*>* immutableFields = NULL;
                if (lifecycle)
                    immutableFields = lifecycle->getImmutableFields();

                uassertStatusOK(validate(oldObj,
                                         updatedFields,
                                         doc,
                                         immutableFields,
                                         driver->modOptions()) );
            }
        }

        // Save state before making changes
        runner->saveState();

        if (inPlace && !driver->modsAffectIndices()) {
            // If a set of modifiers were all no-ops, we are still 'in place', but there is
            // no work to do, in which case we want to consider the object unchanged.
            if (!damages.empty() ) {
                collection->updateDocumentWithDamages( txn, loc, source, damages );
                docWasModified = true;
                opDebug->fastmod = true;
            }

            // newObj is only used below to build the oplog entry query.
            newObj = oldObj;
        }
        else {
            // The updates were not in place. Apply them through the file manager.

            newObj = doc.getObject();
            uassert(17419,
                    str::stream() << "Resulting document after update is larger than "
                                  << BSONObjMaxUserSize,
                    newObj.objsize() <= BSONObjMaxUserSize);
            StatusWith<DiskLoc> res = collection->updateDocument(txn,
                                                                 loc,
                                                                 newObj,
                                                                 true,
                                                                 opDebug);
            uassertStatusOK(res.getStatus());
            DiskLoc newLoc = res.getValue();
            docWasModified = true;

            // If the document moved, we might see it again in a collection scan (maybe it's
            // a document after our current document).
            //
            // If the document is indexed and the mod changes an indexed value, we might see it
            // again.  For an example, see the comment above near declaration of updatedLocs.
            if (updatedLocs && (newLoc != loc || driver->modsAffectIndices())) {
                updatedLocs->insert(newLoc);
            }
        }

        // Restore state after modification
        uassert(17278,
                "Update could not restore runner state after updating a document.",
                runner->restoreState());

        // Call logOp if requested.
        if (request.shouldCallLogOp() && !logObj.isEmpty()) {
            BSONObj idQuery = driver->makeOplogEntryQuery(newObj, request.isMulti());
            logOp(txn, "u", nsString.ns().c_str(), logObj , &idQuery,
                  NULL, request.isFromMigration());
        }

        // Only record doc modifications if they wrote (exclude no-ops)
        if (docWasModified)
            opDebug->nModified++;

        // A non-multi update stops after the first matched document.
        if (!request.isMulti()) {
            break;
        }

        // Opportunity for journaling to write during the update.
        txn->recoveryUnit()->commitIfNeeded();
    }

    // Done unless nothing matched AND this is an upsert.
    // TODO: Can this be simplified?
    if ((numMatched > 0) || (numMatched == 0 && !request.isUpsert()) ) {
        opDebug->nMatched = numMatched;
        return UpdateResult(numMatched > 0 /* updated existing object(s) */,
                            !driver->isDocReplacement() /* $mod or obj replacement */,
                            opDebug->nModified /* number of modified docs, no no-ops */,
                            numMatched /* # of docs matched/updated, even no-ops */,
                            BSONObj());
    }

    //
    // We haven't found any existing document so an insert is done
    // (upsert is true).
    //
    opDebug->upsert = true;

    // Since this is an insert (no docs found and upsert:true), we will be logging it
    // as an insert in the oplog. We don't need the driver's help to build the
    // oplog record, then. We also set the context of the update driver to the INSERT_CONTEXT.
    // Some mods may only work in that context (e.g. $setOnInsert).
    driver->setLogOp(false);
    driver->setContext(ModifierInterface::ExecInfo::INSERT_CONTEXT);

    // Reset the document we will be writing to
    doc.reset();

    // This remains the empty object in the case of an object replacement, but in the case
    // of an upsert where we are creating a base object from the query and applying mods,
    // we capture the query as the original so that we can detect immutable field mutations.
    BSONObj original = BSONObj();

    // populateDocumentWithQueryFields fills 'doc' with fields from the query, which
    // forms the base of the update for the inserted doc (because upsert was true)
    if (cq) {
        uassertStatusOK(driver->populateDocumentWithQueryFields(cq, doc));
        // Validate the base doc, as taken from the query -- no fields means validate all.
        FieldRefSet noFields;
        uassertStatusOK(validate(BSONObj(), noFields, doc, NULL, driver->modOptions()));
        if (!driver->isDocReplacement()) {
            opDebug->fastmodinsert = true;
            // We need all the fields from the query to compare against for validation below.
            original = doc.getObject();
        }
        else {
            original = request.getQuery();
        }
    }
    else {
        // Without a canonical query the request must be a simple _id query; its _id
        // element seeds the new document.
        fassert(17354, CanonicalQuery::isSimpleIdQuery(request.getQuery()));
        BSONElement idElt = request.getQuery()["_id"];
        original = idElt.wrap();
        fassert(17352, doc.root().appendElement(idElt));
    }

    // Apply the update modifications and then log the update as an insert manually.
    FieldRefSet updatedFields;
    status = driver->update(StringData(), &doc, NULL, &updatedFields);
    if (!status.isOK()) {
        uasserted(16836, status.reason());
    }

    // Ensure _id exists and is first
    uassertStatusOK(ensureIdAndFirst(doc));

    // Validate that the object replacement or modifiers resulted in a document
    // that contains all the immutable keys and can be stored.
    if (!(request.isFromReplication() || request.isFromMigration())){
        const std::vector<FieldRef*>* immutableFields = NULL;
        if (lifecycle)
            immutableFields = lifecycle->getImmutableFields();

        // This will only validate the modified fields if not a replacement.
        uassertStatusOK(validate(original,
                                 updatedFields,
                                 doc,
                                 immutableFields,
                                 driver->modOptions()) );
    }

    // Only create the collection if the doc will be inserted.
    if (!collection) {
        collection = db->getCollection(request.getNamespaceString().ns());
        if (!collection) {
            collection = db->createCollection(txn, request.getNamespaceString().ns());
        }
    }

    // Insert the doc
    BSONObj newObj = doc.getObject();
    uassert(17420,
            str::stream() << "Document to upsert is larger than " << BSONObjMaxUserSize,
            newObj.objsize() <= BSONObjMaxUserSize);

    StatusWith<DiskLoc> newLoc = collection->insertDocument(txn,
                                                            newObj,
                                                            !request.isGod() /*enforceQuota*/);
    uassertStatusOK(newLoc.getStatus());
    if (request.shouldCallLogOp()) {
        logOp(txn, "i", nsString.ns().c_str(), newObj,
               NULL, NULL, request.isFromMigration());
    }

    opDebug->nMatched = 1;
    return UpdateResult(false /* updated a non existing document */,
                        !driver->isDocReplacement() /* $mod or obj replacement? */,
                        1 /* docs written*/,
                        1 /* count of updated documents */,
                        newObj /* object that was upserted */ );
}
bool WriteCmd::run(const string& dbName, BSONObj& cmdObj, int options, string& errMsg, BSONObjBuilder& result, bool fromRepl) { // Can't be run on secondaries (logTheOp() == false, slaveOk() == false). dassert( !fromRepl ); BatchedCommandRequest request( _writeType ); BatchedCommandResponse response; if ( !request.parseBSON( cmdObj, &errMsg ) || !request.isValid( &errMsg ) ) { // Batch parse failure response.setOk( false ); response.setN( 0 ); response.setErrCode( ErrorCodes::FailedToParse ); response.setErrMessage( errMsg ); dassert( response.isValid( &errMsg ) ); result.appendElements( response.toBSON() ); // TODO // There's a pending issue about how to report response here. If we use // the command infra-structure, we should reuse the 'errmsg' field. But // we have already filed that message inside the BatchCommandResponse. // return response.getOk(); return true; } // Note that this is a runCommmand, and therefore, the database and the collection name // are in different parts of the grammar for the command. But it's more convenient to // work with a NamespaceString. We built it here and replace it in the parsed command. // Internally, everything work with the namespace string as opposed to just the // collection name. 
NamespaceString nss(dbName, request.getNS()); request.setNS(nss.ns()); Status status = userAllowedWriteNS( nss ); if ( !status.isOK() ) return appendCommandStatus( result, status ); if ( cc().curop() ) cc().curop()->setNS( nss.ns() ); if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert ) { // check all docs BatchedInsertRequest* insertRequest = request.getInsertRequest(); vector<BSONObj>& docsToInsert = insertRequest->getDocuments(); for ( size_t i = 0; i < docsToInsert.size(); i++ ) { StatusWith<BSONObj> fixed = fixDocumentForInsert( docsToInsert[i] ); if ( !fixed.isOK() ) { // we don't return early since each doc can be handled independantly continue; } if ( fixed.getValue().isEmpty() ) { continue; } docsToInsert[i] = fixed.getValue(); } } BSONObj defaultWriteConcern; // This is really bad - it's only safe because we leak the defaults by overriding them with // new defaults and because we never reset to an empty default. // TODO: fix this for sane behavior where we query repl set object if ( getLastErrorDefault ) defaultWriteConcern = *getLastErrorDefault; if ( defaultWriteConcern.isEmpty() ) { BSONObjBuilder b; b.append( "w", 1 ); defaultWriteConcern = b.obj(); } WriteBatchExecutor writeBatchExecutor(defaultWriteConcern, &cc(), &globalOpCounters, lastError.get()); writeBatchExecutor.executeBatch( request, &response ); result.appendElements( response.toBSON() ); // TODO // There's a pending issue about how to report response here. If we use // the command infra-structure, we should reuse the 'errmsg' field. But // we have already filed that message inside the BatchCommandResponse. // return response.getOk(); return true; }
/** we will build an output buffer ourself and then use O_DIRECT we could be in read lock for this caller handles locking */ static void PREPLOGBUFFER() { assert( cmdLine.dur ); AlignedBuilder& bb = commitJob._ab; bb.reset(); unsigned lenOfs; // JSectHeader { bb.appendStr("\nHH\n", false); lenOfs = bb.skip(4); } // ops other than basic writes { for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) { (*i)->serialize(bb); } } // write intents { scoped_lock lk(privateViews._mutex()); string lastFilePath; for( vector<WriteIntent>::iterator i = commitJob.writes().begin(); i != commitJob.writes().end(); i++ ) { size_t ofs; MongoMMF *mmf = privateViews._find(i->p, ofs); if( mmf == 0 ) { string s = str::stream() << "view pointer cannot be resolved " << (size_t) i->p; journalingFailure(s.c_str()); // asserts return; } if( !mmf->willNeedRemap() ) { mmf->willNeedRemap() = true; // usually it will already be dirty so don't bother writing then } //size_t ofs = ((char *)i->p) - ((char*)mmf->getView().p); i->w_ptr = ((char*)mmf->view_write()) + ofs; if( mmf->filePath() != lastFilePath ) { lastFilePath = mmf->filePath(); JDbContext c; bb.appendStruct(c); bb.appendStr(lastFilePath); } JEntry e; e.len = i->len; assert( ofs <= 0x80000000 ); e.ofs = (unsigned) ofs; e.fileNo = mmf->fileSuffixNo(); bb.appendStruct(e); bb.appendBuf(i->p, i->len); } } { JSectFooter f(bb.buf(), bb.len()); bb.appendStruct(f); } { assert( 0xffffe000 == (~(Alignment-1)) ); unsigned L = (bb.len() + Alignment-1) & (~(Alignment-1)); // fill to alignment dassert( L >= (unsigned) bb.len() ); *((unsigned*)bb.atOfs(lenOfs)) = L; unsigned padding = L - bb.len(); bb.skip(padding); dassert( bb.len() % Alignment == 0 ); } return; }