Example #1
ClientCursor::ClientCursor(ClientCursorParams params,
                           CursorManager* cursorManager,
                           CursorId cursorId,
                           OperationContext* operationUsingCursor,
                           Date_t now)
    : _cursorid(cursorId),
      _nss(std::move(params.nss)),
      _authenticatedUsers(std::move(params.authenticatedUsers)),
      _lsid(operationUsingCursor->getLogicalSessionId()),
      _txnNumber(operationUsingCursor->getTxnNumber()),
      _readConcernLevel(params.readConcernLevel),
      _cursorManager(cursorManager),
      _originatingCommand(params.originatingCommandObj),
      _queryOptions(params.queryOptions),
      _exec(std::move(params.exec)),
      _operationUsingCursor(operationUsingCursor),
      _lastUseDate(now) {
    invariant(_cursorManager);
    invariant(_exec);
    invariant(_operationUsingCursor);

    cursorStatsOpen.increment();

    if (isNoTimeout()) {
        // Cursors normally time out after an inactivity period to prevent excess memory use.
        // Setting the no-timeout option prevents this cursor from being timed out.
        cursorStatsOpenNoTimeout.increment();
    }
}
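
This constructor pairs an increment of cursorStatsOpen with the decrement in the matching destructor (Example #4 below), so the counter behaves as a gauge of currently open cursors. A minimal sketch of that pairing, using std::atomic as a stand-in for mongo::Counter64 (names here are illustrative, not MongoDB's):

#include <atomic>

class Counter64 {  // stand-in exposing the increment/decrement/get calls used above
public:
    void increment(long long n = 1) { _value.fetch_add(n); }
    void decrement(long long n = 1) { _value.fetch_sub(n); }
    long long get() const { return _value.load(); }
private:
    std::atomic<long long> _value{0};
};

static Counter64 statsOpen;  // plays the role of cursorStatsOpen

struct TrackedCursor {
    TrackedCursor()  { statsOpen.increment(); }   // mirrors the constructor above
    ~TrackedCursor() { statsOpen.decrement(); }   // mirrors the destructor in Example #4
};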
Example #2
void CursorCache::appendInfo(BSONObjBuilder& result) const {
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    result.append("sharded", static_cast<int>(cursorStatsMultiTarget.get()));
    result.appendNumber("shardedEver", _shardedTotal);
    result.append("refs", static_cast<int>(cursorStatsSingleTarget.get()));
    result.append("totalOpen", static_cast<int>(cursorStatsTotalOpen.get()));
}
Example #3
//------------------[ copy constructor ]---------------------------------
Counter64::Counter64( const Counter64 &ctr64 )
  : SnmpSyntax (ctr64)
{
  smival.syntax = sNMP_SYNTAX_CNTR64;
  smival.value.hNumber.hipart = ctr64.high();
  smival.value.hNumber.lopart = ctr64.low();
}
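
The SNMP++ copy constructor above stores a 64-bit counter as 32-bit high and low halves (hipart/lopart). A small self-contained sketch of that split and its reassembly:

#include <cassert>
#include <cstdint>

int main() {
    const std::uint64_t value = 0x0123456789ABCDEFULL;
    const std::uint32_t hipart = static_cast<std::uint32_t>(value >> 32);
    const std::uint32_t lopart = static_cast<std::uint32_t>(value & 0xFFFFFFFFu);
    // Reassembling the halves recovers the original 64-bit value.
    assert(((static_cast<std::uint64_t>(hipart) << 32) | lopart) == value);
    return 0;
}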
Example #4
ClientCursor::~ClientCursor() {
    // Cursors must be unpinned and deregistered from their cursor manager before being deleted.
    invariant(!_operationUsingCursor);
    invariant(_disposed);

    cursorStatsOpen.decrement();
    if (isNoTimeout()) {
        cursorStatsOpenNoTimeout.decrement();
    }
}
Example #5
void CursorCache::removeRef(long long id) {
    verify(id);
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    _refs.erase(id);
    _refsNS.erase(id);
    cursorStatsSingleTarget.decrement();
}
Example #6
void ClientCursorPin::deleteUnderlying() {
    invariant(_cursor);
    invariant(_cursor->_operationUsingCursor);
    // Note the following subtleties of this method's implementation:
    // - We must unpin the cursor before destruction, since it is an error to delete a pinned
    //   cursor.
    // - In addition, we must deregister the cursor before unpinning, since it is an
    //   error to unpin a registered cursor without holding the cursor manager lock (note that
    //   we can't simply unpin with the cursor manager lock here, since we need to guarantee
    //   exclusive ownership of the cursor when we are deleting it).

    // Note it's not safe to dereference _cursor->_cursorManager unless we know we haven't been
    // killed. If we're not locked we assume we haven't been killed because we're working with the
    // global cursor manager which never kills cursors.
    dassert(_opCtx->lockState()->isCollectionLockedForMode(_cursor->_nss.ns(), MODE_IS) ||
            _cursor->_cursorManager->isGlobalManager());

    if (!_cursor->getExecutor()->isMarkedAsKilled()) {
        _cursor->_cursorManager->deregisterCursor(_cursor);
    }

    // Make sure the cursor is disposed and unpinned before being destroyed.
    _cursor->dispose(_opCtx);
    _cursor->_operationUsingCursor = nullptr;
    delete _cursor;

    cursorStatsOpenPinned.decrement();
    _cursor = nullptr;
}
Example #7
Status waitForWriteConcern(OperationContext* txn,
                           const OpTime& replOpTime,
                           const WriteConcernOptions& writeConcern,
                           WriteConcernResult* result) {
    // We assume all options have been validated earlier; if not, it's a programming error.
    dassert(validateWriteConcern(writeConcern).isOK());

    // Next handle blocking on disk

    Timer syncTimer;

    switch (writeConcern.syncMode) {
        case WriteConcernOptions::NONE:
            break;
        case WriteConcernOptions::FSYNC: {
            StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
            if (!storageEngine->isDurable()) {
                result->fsyncFiles = storageEngine->flushAllFiles(true);
            } else {
                // We only need to commit the journal if we're durable
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
        }
        case WriteConcernOptions::JOURNAL:
            txn->recoveryUnit()->waitUntilDurable();
            break;
    }

    result->syncMillis = syncTimer.millis();

    // Now wait for replication

    if (replOpTime.isNull()) {
        // no write happened for this client yet
        return Status::OK();
    }

    // needed to avoid incrementing gleWtimeStats SERVER-9005
    if (writeConcern.wNumNodes <= 1 && writeConcern.wMode.empty()) {
        // no desired replication check
        return Status::OK();
    }

    // Now we wait for replication
    // Note that replica set stepdowns and gle mode changes are thrown as errors
    repl::ReplicationCoordinator::StatusAndDuration replStatus =
        repl::getGlobalReplicationCoordinator()->awaitReplication(txn, replOpTime, writeConcern);
    if (replStatus.status == ErrorCodes::WriteConcernFailed) {
        gleWtimeouts.increment();
        result->err = "timeout";
        result->wTimedOut = true;
    }
    // Add stats
    result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo(replOpTime);
    gleWtimeStats.recordMillis(durationCount<Milliseconds>(replStatus.duration));
    result->wTime = durationCount<Milliseconds>(replStatus.duration);

    return replStatus.status;
}
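
waitForWriteConcern first blocks on disk according to syncMode, then waits for replication. A sketch of just the disk phase, with stand-in types in place of the real storage-engine interfaces:

enum class SyncMode { NONE, FSYNC, JOURNAL };

struct StorageEngine {                 // stand-in, not the real interface
    bool durable = true;
    int flushAllFiles(bool /*sync*/) { return 0; }  // returns number of files flushed
    void waitUntilDurable() {}
};

void blockOnDisk(SyncMode mode, StorageEngine& engine, int* fsyncFiles) {
    switch (mode) {
        case SyncMode::NONE:
            break;
        case SyncMode::FSYNC:
            if (!engine.durable)
                *fsyncFiles = engine.flushAllFiles(true);  // no journal: flush data files
            else
                engine.waitUntilDurable();  // committing the journal is sufficient
            break;
        case SyncMode::JOURNAL:
            engine.waitUntilDurable();
            break;
    }
}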
Example #8
void CursorCache::removeRef(long long id) {
    verify(id);
    scoped_lock lk(_mutex);
    _refs.erase(id);
    _refsNS.erase(id);
    cursorStatsSingleTarget.decrement();
}
Example #9
void CursorCache::storeRef(const std::string& server, long long id, const std::string& ns) {
    LOG(_myLogLevel) << "CursorCache::storeRef server: " << server << " id: " << id << endl;
    verify(id);
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    _refs[id] = server;
    _refsNS[id] = ns;
    cursorStatsSingleTarget.increment();
}
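
Examples #5, #8, and #9 all show the same pattern: two maps guarded by one mutex, with a gauge-style counter kept in step with map membership. A minimal standalone sketch of that pattern (names are illustrative, not MongoDB's):

#include <map>
#include <mutex>
#include <string>

class RefCache {
public:
    void storeRef(const std::string& server, long long id, const std::string& ns) {
        std::lock_guard<std::mutex> lk(_mutex);
        _refs[id] = server;
        _refsNS[id] = ns;
        ++_singleTarget;
    }
    void removeRef(long long id) {
        std::lock_guard<std::mutex> lk(_mutex);
        _refs.erase(id);
        _refsNS.erase(id);
        --_singleTarget;
    }
private:
    mutable std::mutex _mutex;
    std::map<long long, std::string> _refs;    // cursor id -> server
    std::map<long long, std::string> _refsNS;  // cursor id -> namespace
    long long _singleTarget = 0;               // plays the role of cursorStatsSingleTarget
};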
Example #10
OplogReader::OplogReader() {
    _tailingQueryOptions = QueryOption_SlaveOk;
    _tailingQueryOptions |= QueryOption_CursorTailable | QueryOption_OplogReplay;

    /* TODO: maybe we shouldn't use slaveOk here? */
    _tailingQueryOptions |= QueryOption_AwaitData;

    readersCreatedStats.increment();
}
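
The constructor above builds the tailing-query options by OR-ing bit flags together. A sketch of that flag composition; the numeric values here are illustrative placeholders, not the actual wire-protocol constants:

enum QueryOptions : int {        // illustrative values only
    QueryOption_CursorTailable = 1 << 1,
    QueryOption_SlaveOk        = 1 << 2,
    QueryOption_OplogReplay    = 1 << 3,
    QueryOption_AwaitData      = 1 << 5,
};

int main() {
    int tailingQueryOptions = QueryOption_SlaveOk;
    tailingQueryOptions |= QueryOption_CursorTailable | QueryOption_OplogReplay;
    tailingQueryOptions |= QueryOption_AwaitData;
    // Each flag can be tested independently after composition.
    return (tailingQueryOptions & QueryOption_AwaitData) ? 0 : 1;
}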
Example #11
Status Collection::recordStoreGoingToMove(OperationContext* txn,
                                          const RecordId& oldLocation,
                                          const char* oldBuffer,
                                          size_t oldSize) {
    moveCounter.increment();
    _cursorManager.invalidateDocument(txn, oldLocation, INVALIDATION_DELETION);
    _indexCatalog.unindexRecord(txn, BSONObj(oldBuffer), oldLocation, true);
    return Status::OK();
}
Example #12
ClientCursorPin::ClientCursorPin(OperationContext* opCtx, ClientCursor* cursor)
    : _opCtx(opCtx), _cursor(cursor) {
    invariant(_cursor);
    invariant(_cursor->_operationUsingCursor);
    invariant(_cursor->_cursorManager);
    invariant(!_cursor->_disposed);

    // We keep track of the number of cursors currently pinned. The cursor can become unpinned
    // either by being released back to the cursor manager or by being deleted. A cursor may be
    // transferred to another pin object via move construction or move assignment, but in this case
    // it is still considered pinned.
    cursorStatsOpenPinned.increment();
}
Example #13
    void OpDebug::recordStats() {
        if ( nreturned > 0 )
            returnedCounter.increment( nreturned );
        if ( ninserted > 0 )
            insertedCounter.increment( ninserted );
        if ( nMatched > 0 )
            updatedCounter.increment( nMatched );
        if ( ndeleted > 0 )
            deletedCounter.increment( ndeleted );
        if ( nscanned > 0 )
            scannedCounter.increment( nscanned );
        if ( nscannedObjects > 0 )
            scannedObjectCounter.increment( nscannedObjects );

        if ( idhack )
            idhackCounter.increment();
        if ( scanAndOrder )
            scanAndOrderCounter.increment();
        if ( fastmod )
            fastmodCounter.increment();
        if ( writeConflicts )
            writeConflictsCounter.increment( writeConflicts );
    }
Example #14
File: curop.cpp Project: Axv2/mongo
    void OpDebug::recordStats() {
        if ( nreturned > 0 )
            returnedCounter.increment( nreturned );
        if ( ninserted > 0 )
            insertedCounter.increment( ninserted );
        if ( nupdated > 0 )
            updatedCounter.increment( nupdated );
        if ( ndeleted > 0 )
            deletedCounter.increment( ndeleted );
        if ( nscanned > 0 )
            scannedCounter.increment( nscanned );

        if ( idhack )
            idhackCounter.increment();
        if ( scanAndOrder )
            scanAndOrderCounter.increment();
        if ( fastmod )
            fastmodCounter.increment();
    }
Example #15
ShardedClientCursor::ShardedClientCursor(QueryMessage& q, ParallelSortClusteredCursor* cursor) {
    verify(cursor);
    _cursor = cursor;

    _skip = q.ntoskip;
    _ntoreturn = q.ntoreturn;

    _totalSent = 0;
    _done = false;

    _id = 0;

    if (q.queryOptions & QueryOption_NoCursorTimeout) {
        _lastAccessMillis = 0;
    } else {
        _lastAccessMillis = Listener::getElapsedTimeMillis();
    }

    cursorStatsMultiTarget.increment();
}
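
Note the sentinel in the constructor above: when QueryOption_NoCursorTimeout is set, _lastAccessMillis stays 0, which later exempts the cursor from idle timeout. A sketch of how such a sentinel would be consumed (a hypothetical helper, not from the source):

#include <cstdint>

bool shouldTimeout(std::int64_t lastAccessMillis,
                   std::int64_t nowMillis,
                   std::int64_t idleLimitMillis) {
    if (lastAccessMillis == 0)
        return false;  // 0 is the "never time out" sentinel set at construction
    return nowMillis - lastAccessMillis > idleLimitMillis;
}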
Example #16
        virtual void run() {
            Client::initThread( name().c_str() );

            while ( ! inShutdown() ) {
                sleepsecs( 60 );
                
                LOG(3) << "TTLMonitor thread awake" << endl;
                
                if ( lockedForWriting() ) {
                    // note: this is not perfect, as the server can go into fsync+lock
                    // between this check and actually doing the delete later
                    LOG(3) << " locked for writing" << endl;
                    continue;
                }

                // if part of replSet but not in a readable state (e.g. during initial sync), skip.
                if ( theReplSet && !theReplSet->state().readable() )
                    continue;

                set<string> dbs;
                {
                    Lock::DBRead lk( "local" );
                    dbHolder().getAllShortNames( dbs );
                }
                
                ttlPasses.increment();

                for ( set<string>::const_iterator i=dbs.begin(); i!=dbs.end(); ++i ) {
                    string db = *i;
                    try {
                        doTTLForDB( db );
                    }
                    catch ( DBException& e ) {
                        error() << "error processing ttl for db: " << db << " " << e << endl;
                    }
                }

            }
        }
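
The TTL monitor above follows a common background-thread shape: sleep, check for shutdown, skip a pass when preconditions fail, otherwise do one unit of work. A stripped-down sketch of that loop using standard threading primitives:

#include <atomic>
#include <chrono>
#include <thread>

std::atomic<bool> shuttingDown{false};

void monitorLoop() {
    while (!shuttingDown.load()) {
        std::this_thread::sleep_for(std::chrono::seconds(60));
        // ... if a precondition fails (e.g. locked for writing), 'continue' ...
        // ... otherwise increment the pass counter and process each database ...
    }
}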
Example #17
    /* apply the log op that is in param o
       @return bool success (true) or failure (false)
    */
    bool SyncTail::syncApply(
                        OperationContext* txn, const BSONObj &op, bool convertUpdateToUpsert) {
        const char *ns = op.getStringField("ns");
        verify(ns);

        if ( (*ns == '\0') || (*ns == '.') ) {
            // The namespace is empty or malformed. Such entries are often
            // no-ops, but we can't be 100% sure, so log any that aren't
            // explicitly marked as no-ops.
            if( *op.getStringField("op") != 'n' ) {
                error() << "replSet skipping bad op in oplog: " << op.toString() << rsLog;
            }
            return true;
        }

        bool isCommand(op["op"].valuestrsafe()[0] == 'c');

        boost::scoped_ptr<Lock::ScopedLock> lk;

        if(isCommand) {
            // A command may need a global write lock, so we conservatively go
            // ahead and grab one here. Suboptimal. :-(
            lk.reset(new Lock::GlobalWrite(txn->lockState()));
        } else {
            // DB level lock for this operation
            lk.reset(new Lock::DBWrite(txn->lockState(), ns)); 
        }

        Client::Context ctx(ns);
        ctx.getClient()->curop()->reset();
        // For non-initial-sync, we convert updates to upserts
        // to suppress errors when replaying oplog entries.
        bool ok = !applyOperation_inlock(txn, ctx.db(), op, true, convertUpdateToUpsert);
        opsAppliedStats.increment();
        txn->recoveryUnit()->commitIfNeeded();

        return ok;
    }
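
syncApply chooses its lock scope at runtime by resetting a smart pointer to one of two RAII lock types. A sketch of that idiom with std::unique_ptr and stand-in lock classes (not MongoDB's lock manager):

#include <memory>

struct ScopedLock {                       // stand-in for Lock::ScopedLock
    virtual ~ScopedLock() = default;
};
struct GlobalWriteLock : ScopedLock {};   // would acquire the global write lock
struct DBWriteLock : ScopedLock {         // would acquire a database-level lock
    explicit DBWriteLock(const char* /*ns*/) {}
};

void applyWithLock(bool isCommand, const char* ns) {
    std::unique_ptr<ScopedLock> lk;
    if (isCommand) {
        // Commands may need a global write lock, so take one conservatively.
        lk.reset(new GlobalWriteLock());
    } else {
        lk.reset(new DBWriteLock(ns));
    }
    // ... apply the operation while the chosen lock is held ...
}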
Example #18
void ClientCursorPin::release() {
    if (!_cursor)
        return;

    // Note it's not safe to dereference _cursor->_cursorManager unless we know we haven't been
    // killed. If we're not locked we assume we haven't been killed because we're working with the
    // global cursor manager which never kills cursors.
    dassert(_opCtx->lockState()->isCollectionLockedForMode(_cursor->_nss.ns(), MODE_IS) ||
            _cursor->_cursorManager->isGlobalManager());

    invariant(_cursor->_operationUsingCursor);

    if (_cursor->getExecutor()->isMarkedAsKilled()) {
        // The ClientCursor was killed while we had it.  Therefore, it is our responsibility to
        // call dispose() and delete it.
        deleteUnderlying();
    } else {
        // Unpin the cursor under the collection cursor manager lock.
        _cursor->_cursorManager->unpin(_opCtx, _cursor);
        cursorStatsOpenPinned.decrement();
    }

    _cursor = nullptr;
}
Example #19
namespace mongo {

using std::endl;
using std::vector;

static Counter64 freelistAllocs;
static Counter64 freelistBucketExhausted;
static Counter64 freelistIterations;

// TODO figure out what to do about these.
static ServerStatusMetricField<Counter64> dFreelist1("storage.freelist.search.requests",
                                                     &freelistAllocs);

static ServerStatusMetricField<Counter64> dFreelist2("storage.freelist.search.bucketExhausted",
                                                     &freelistBucketExhausted);

static ServerStatusMetricField<Counter64> dFreelist3("storage.freelist.search.scanned",
                                                     &freelistIterations);

SimpleRecordStoreV1::SimpleRecordStoreV1(OperationContext* txn,
                                         StringData ns,
                                         RecordStoreV1MetaData* details,
                                         ExtentManager* em,
                                         bool isSystemIndexes)
    : RecordStoreV1Base(ns, details, em, isSystemIndexes) {
    invariant(!details->isCapped());
    _normalCollection = NamespaceString::normal(ns);
}

SimpleRecordStoreV1::~SimpleRecordStoreV1() {}

DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents(OperationContext* txn, int lenToAllocRaw) {
    // Slowly drain the deletedListLegacyGrabBag by popping one record off and putting it in the
    // correct deleted list each time we try to allocate a new record. This ensures we won't
    // orphan any data when upgrading from old versions, without needing a long upgrade phase.
    // This is done before we try to allocate the new record so we can take advantage of the new
    // space immediately.
    {
        const DiskLoc head = _details->deletedListLegacyGrabBag();
        if (!head.isNull()) {
            _details->setDeletedListLegacyGrabBag(txn, drec(head)->nextDeleted());
            addDeletedRec(txn, head);
        }
    }

    // align size up to a multiple of 4
    const int lenToAlloc = (lenToAllocRaw + (4 - 1)) & ~(4 - 1);

    freelistAllocs.increment();
    DiskLoc loc;
    DeletedRecord* dr = NULL;
    {
        int myBucket;
        for (myBucket = bucket(lenToAlloc); myBucket < Buckets; myBucket++) {
            // Only look at the first entry in each bucket. This works because we are either
            // quantizing or allocating fixed-size blocks.
            const DiskLoc head = _details->deletedListEntry(myBucket);
            if (head.isNull())
                continue;
            DeletedRecord* const candidate = drec(head);
            if (candidate->lengthWithHeaders() >= lenToAlloc) {
                loc = head;
                dr = candidate;
                break;
            }
        }

        if (!dr)
            return DiskLoc();  // no space

        // Unlink ourself from the deleted list
        _details->setDeletedListEntry(txn, myBucket, dr->nextDeleted());
        *txn->recoveryUnit()->writing(&dr->nextDeleted()) = DiskLoc().setInvalid();  // defensive
    }

    invariant(dr->extentOfs() < loc.getOfs());

    // Split the deleted record if it has at least as much left over space as our smallest
    // allocation size. Otherwise, just take the whole DeletedRecord.
    const int remainingLength = dr->lengthWithHeaders() - lenToAlloc;
    if (remainingLength >= bucketSizes[0]) {
        txn->recoveryUnit()->writingInt(dr->lengthWithHeaders()) = lenToAlloc;
        const DiskLoc newDelLoc = DiskLoc(loc.a(), loc.getOfs() + lenToAlloc);
        DeletedRecord* newDel = txn->recoveryUnit()->writing(drec(newDelLoc));
        newDel->extentOfs() = dr->extentOfs();
        newDel->lengthWithHeaders() = remainingLength;
        newDel->nextDeleted().Null();

        addDeletedRec(txn, newDelLoc);
    }

    return loc;
}

StatusWith<DiskLoc> SimpleRecordStoreV1::allocRecord(OperationContext* txn,
                                                     int lengthWithHeaders,
                                                     bool enforceQuota) {
    if (lengthWithHeaders > MaxAllowedAllocation) {
        return StatusWith<DiskLoc>(
            ErrorCodes::InvalidLength,
            str::stream() << "Attempting to allocate a record larger than maximum size: "
                          << lengthWithHeaders
                          << " > 16.5MB");
    }

    DiskLoc loc = _allocFromExistingExtents(txn, lengthWithHeaders);
    if (!loc.isNull())
        return StatusWith<DiskLoc>(loc);

    LOG(1) << "allocating new extent";

    increaseStorageSize(
        txn,
        _extentManager->followupSize(lengthWithHeaders, _details->lastExtentSize(txn)),
        enforceQuota);

    loc = _allocFromExistingExtents(txn, lengthWithHeaders);
    if (!loc.isNull()) {
        // Succeeded on the first attempt after adding the new extent.
        return StatusWith<DiskLoc>(loc);
    }

    log() << "warning: alloc() failed after allocating new extent. "
          << "lengthWithHeaders: " << lengthWithHeaders
          << " last extent size:" << _details->lastExtentSize(txn) << "; trying again";

    for (int z = 0; z < 10 && lengthWithHeaders > _details->lastExtentSize(txn); z++) {
        log() << "try #" << z << endl;

        increaseStorageSize(
            txn,
            _extentManager->followupSize(lengthWithHeaders, _details->lastExtentSize(txn)),
            enforceQuota);

        loc = _allocFromExistingExtents(txn, lengthWithHeaders);
        if (!loc.isNull())
            return StatusWith<DiskLoc>(loc);
    }

    return StatusWith<DiskLoc>(ErrorCodes::InternalError, "cannot allocate space");
}

Status SimpleRecordStoreV1::truncate(OperationContext* txn) {
    const DiskLoc firstExtLoc = _details->firstExtent(txn);
    if (firstExtLoc.isNull() || !firstExtLoc.isValid()) {
        // Already empty
        return Status::OK();
    }

    // Free all extents except the first.
    Extent* firstExt = _extentManager->getExtent(firstExtLoc);
    if (!firstExt->xnext.isNull()) {
        const DiskLoc extNextLoc = firstExt->xnext;
        const DiskLoc oldLastExtLoc = _details->lastExtent(txn);
        Extent* const nextExt = _extentManager->getExtent(extNextLoc);

        // Unlink the other extents.
        *txn->recoveryUnit()->writing(&nextExt->xprev) = DiskLoc();
        *txn->recoveryUnit()->writing(&firstExt->xnext) = DiskLoc();
        _details->setLastExtent(txn, firstExtLoc);
        _details->setLastExtentSize(txn, firstExt->length);

        _extentManager->freeExtents(txn, extNextLoc, oldLastExtLoc);
    }

    // Make the first (now only) extent a single large deleted record.
    *txn->recoveryUnit()->writing(&firstExt->firstRecord) = DiskLoc();
    *txn->recoveryUnit()->writing(&firstExt->lastRecord) = DiskLoc();
    _details->orphanDeletedList(txn);
    addDeletedRec(txn, _findFirstSpot(txn, firstExtLoc, firstExt));

    // Make stats reflect that there are now no documents in this record store.
    _details->setStats(txn, 0, 0);

    return Status::OK();
}

void SimpleRecordStoreV1::addDeletedRec(OperationContext* txn, const DiskLoc& dloc) {
    DeletedRecord* d = drec(dloc);

    int b = bucket(d->lengthWithHeaders());
    *txn->recoveryUnit()->writing(&d->nextDeleted()) = _details->deletedListEntry(b);
    _details->setDeletedListEntry(txn, b, dloc);
}

std::unique_ptr<SeekableRecordCursor> SimpleRecordStoreV1::getCursor(OperationContext* txn,
                                                                     bool forward) const {
    return stdx::make_unique<SimpleRecordStoreV1Iterator>(txn, this, forward);
}

vector<std::unique_ptr<RecordCursor>> SimpleRecordStoreV1::getManyCursors(
    OperationContext* txn) const {
    vector<std::unique_ptr<RecordCursor>> cursors;
    const Extent* ext;
    for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) {
        ext = _getExtent(txn, extLoc);
        if (ext->firstRecord.isNull())
            continue;
        cursors.push_back(
            stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>(txn, ext->firstRecord, this));
    }

    return cursors;
}

class CompactDocWriter final : public DocWriter {
public:
    /**
     * param allocationSize - allocation size WITH header
     */
    CompactDocWriter(const MmapV1RecordHeader* rec, unsigned dataSize, size_t allocationSize)
        : _rec(rec), _dataSize(dataSize), _allocationSize(allocationSize) {}

    virtual ~CompactDocWriter() {}

    virtual void writeDocument(char* buf) const {
        memcpy(buf, _rec->data(), _dataSize);
    }

    virtual size_t documentSize() const {
        return _allocationSize - MmapV1RecordHeader::HeaderSize;
    }

    virtual bool addPadding() const {
        return false;
    }

private:
    const MmapV1RecordHeader* _rec;
    size_t _dataSize;
    size_t _allocationSize;
};

void SimpleRecordStoreV1::_compactExtent(OperationContext* txn,
                                         const DiskLoc extentLoc,
                                         int extentNumber,
                                         RecordStoreCompactAdaptor* adaptor,
                                         const CompactOptions* compactOptions,
                                         CompactStats* stats) {
    log() << "compact begin extent #" << extentNumber << " for namespace " << _ns << " "
          << extentLoc;

    unsigned oldObjSize = 0;  // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;

    Extent* const sourceExtent = _extentManager->getExtent(extentLoc);
    sourceExtent->assertOk();
    fassert(17437, sourceExtent->validates(extentLoc));

    {
        // The next/prev MmapV1RecordHeader pointers within the Extent might not be in order so we
        // first page in the whole Extent sequentially.
        // TODO benchmark on slow storage to verify this is measurably faster.
        log() << "compact paging in len=" << sourceExtent->length / 1000000.0 << "MB" << endl;
        Timer t;
        size_t length = sourceExtent->length;

        touch_pages(reinterpret_cast<const char*>(sourceExtent), length);
        int ms = t.millis();
        if (ms > 1000)
            log() << "compact end paging in " << ms << "ms "
                  << sourceExtent->length / 1000000.0 / t.seconds() << "MB/sec" << endl;
    }

    {
        // Move each MmapV1RecordHeader out of this extent and insert it in to the "new" extents.
        log() << "compact copying records" << endl;
        long long totalNetSize = 0;
        long long nrecords = 0;
        DiskLoc nextSourceLoc = sourceExtent->firstRecord;
        while (!nextSourceLoc.isNull()) {
            txn->checkForInterrupt();

            WriteUnitOfWork wunit(txn);
            MmapV1RecordHeader* recOld = recordFor(nextSourceLoc);
            RecordData oldData = recOld->toRecordData();
            nextSourceLoc = getNextRecordInExtent(txn, nextSourceLoc);

            if (compactOptions->validateDocuments && !adaptor->isDataValid(oldData)) {
                // object is corrupt!
                log() << "compact removing corrupt document!";
                stats->corruptDocuments++;
            } else {
                // How much data is in the record. Excludes padding and MmapV1RecordHeader headers.
                const unsigned rawDataSize = adaptor->dataSize(oldData);

                nrecords++;
                oldObjSize += rawDataSize;
                oldObjSizeWithPadding += recOld->netLength();

                // Allocation sizes include the headers and possibly some padding.
                const unsigned minAllocationSize = rawDataSize + MmapV1RecordHeader::HeaderSize;
                unsigned allocationSize = minAllocationSize;
                switch (compactOptions->paddingMode) {
                    case CompactOptions::NONE:  // default padding
                        if (shouldPadInserts()) {
                            allocationSize = quantizeAllocationSpace(minAllocationSize);
                        }
                        break;

                    case CompactOptions::PRESERVE:  // keep original padding
                        allocationSize = recOld->lengthWithHeaders();
                        break;

                    case CompactOptions::MANUAL:  // user specified how much padding to use
                        allocationSize = compactOptions->computeRecordSize(minAllocationSize);
                        if (allocationSize < minAllocationSize ||
                            allocationSize > BSONObjMaxUserSize / 2) {
                            allocationSize = minAllocationSize;
                        }
                        break;
                }
                invariant(allocationSize >= minAllocationSize);

                // Copy the data to a new record. Because we orphaned the record freelist at the
                // start of the compact, this insert will allocate a record in a new extent.
                // See the comment in compact() for more details.
                CompactDocWriter writer(recOld, rawDataSize, allocationSize);
                StatusWith<RecordId> status = insertRecordWithDocWriter(txn, &writer);
                uassertStatusOK(status.getStatus());
                const MmapV1RecordHeader* newRec =
                    recordFor(DiskLoc::fromRecordId(status.getValue()));
                invariant(unsigned(newRec->netLength()) >= rawDataSize);
                totalNetSize += newRec->netLength();

                // Tells the caller that the record has been moved, so it can do things such as
                // add it to indexes.
                adaptor->inserted(newRec->toRecordData(), status.getValue());
            }

            // Remove the old record from the linked list of records within the sourceExtent.
            // The old record is not added to the freelist as we will be freeing the whole
            // extent at the end.
            *txn->recoveryUnit()->writing(&sourceExtent->firstRecord) = nextSourceLoc;
            if (nextSourceLoc.isNull()) {
                // Just moved the last record out of the extent. Mark extent as empty.
                *txn->recoveryUnit()->writing(&sourceExtent->lastRecord) = DiskLoc();
            } else {
                MmapV1RecordHeader* newFirstRecord = recordFor(nextSourceLoc);
                txn->recoveryUnit()->writingInt(newFirstRecord->prevOfs()) = DiskLoc::NullOfs;
            }

            // Adjust the stats to reflect the removal of the old record. The insert above
            // handled adjusting the stats for the new record.
            _details->incrementStats(txn, -(recOld->netLength()), -1);

            wunit.commit();
        }

        // The extent must now be empty.
        invariant(sourceExtent->firstRecord.isNull());
        invariant(sourceExtent->lastRecord.isNull());

        // We are still the first extent, but we must not be the only extent.
        invariant(_details->firstExtent(txn) == extentLoc);
        invariant(_details->lastExtent(txn) != extentLoc);

        // Remove the newly emptied sourceExtent from the extent linked list and return it to
        // the extent manager.
        WriteUnitOfWork wunit(txn);
        const DiskLoc newFirst = sourceExtent->xnext;
        _details->setFirstExtent(txn, newFirst);
        *txn->recoveryUnit()->writing(&_extentManager->getExtent(newFirst)->xprev) = DiskLoc();
        _extentManager->freeExtent(txn, extentLoc);
        wunit.commit();

        {
            const double oldPadding = oldObjSize ? double(oldObjSizeWithPadding) / oldObjSize
                                                 : 1.0;  // defining 0/0 as 1 for this.

            log() << "compact finished extent #" << extentNumber << " containing " << nrecords
                  << " documents (" << totalNetSize / (1024 * 1024.0) << "MB)"
                  << " oldPadding: " << oldPadding;
        }
    }
}

Status SimpleRecordStoreV1::compact(OperationContext* txn,
                                    RecordStoreCompactAdaptor* adaptor,
                                    const CompactOptions* options,
                                    CompactStats* stats) {
    std::vector<DiskLoc> extents;
    for (DiskLoc extLocation = _details->firstExtent(txn); !extLocation.isNull();
         extLocation = _extentManager->getExtent(extLocation)->xnext) {
        extents.push_back(extLocation);
    }
    log() << "compact " << extents.size() << " extents";

    {
        WriteUnitOfWork wunit(txn);
        // Orphaning the deleted lists ensures that all inserts go to new extents rather than
        // the ones that existed before starting the compact. If we abort the operation before
        // completion, any free space in the old extents will be leaked and never reused unless
        // the collection is compacted again or dropped. This is considered an acceptable
        // failure mode as no data will be lost.
        log() << "compact orphan deleted lists" << endl;
        _details->orphanDeletedList(txn);

        // Start over from scratch with our extent sizing and growth
        _details->setLastExtentSize(txn, 0);

        // create a new extent so new records go there
        increaseStorageSize(txn, _details->lastExtentSize(txn), true);
        wunit.commit();
    }

    stdx::unique_lock<Client> lk(*txn->getClient());
    ProgressMeterHolder pm(
        *txn->setMessage_inlock("compact extent", "Extent Compacting Progress", extents.size()));
    lk.unlock();

    // Go through all old extents and move each record to a new set of extents.
    int extentNumber = 0;
    for (std::vector<DiskLoc>::iterator it = extents.begin(); it != extents.end(); it++) {
        txn->checkForInterrupt();
        invariant(_details->firstExtent(txn) == *it);
        // empties and removes the first extent
        _compactExtent(txn, *it, extentNumber++, adaptor, options, stats);
        invariant(_details->firstExtent(txn) != *it);
        pm.hit();
    }

    invariant(_extentManager->getExtent(_details->firstExtent(txn))->xprev.isNull());
    invariant(_extentManager->getExtent(_details->lastExtent(txn))->xnext.isNull());

    // indexes will do their own progress meter
    pm.finished();

    return Status::OK();
}
}
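
_allocFromExistingExtents rounds the requested length up to a multiple of 4 with (len + 3) & ~3. The same bit trick works for any power-of-two alignment, as this small sketch shows:

#include <cassert>

int alignUp(int n, int alignment) {  // alignment must be a power of two
    return (n + (alignment - 1)) & ~(alignment - 1);
}

int main() {
    assert(alignUp(13, 4) == 16);
    assert(alignUp(16, 4) == 16);
    assert(alignUp(1, 8) == 8);
    return 0;
}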
Example #20
namespace repl {

const BSONObj reverseNaturalObj = BSON("$natural" << -1);

// Number of readers created; this happens when the sync source changes,
// on a reconfig or network error, or when the cursor dies.
static Counter64 readersCreatedStats;
static ServerStatusMetricField<Counter64> displayReadersCreated("repl.network.readersCreated",
                                                                &readersCreatedStats);


bool replAuthenticate(DBClientBase* conn) {
    if (!getGlobalAuthorizationManager()->isAuthEnabled())
        return true;

    if (!isInternalAuthSet())
        return false;
    return conn->authenticateInternalUser();
}

const Seconds OplogReader::kSocketTimeout(30);

OplogReader::OplogReader() {
    _tailingQueryOptions = QueryOption_SlaveOk;
    _tailingQueryOptions |= QueryOption_CursorTailable | QueryOption_OplogReplay;

    /* TODO: maybe we shouldn't use slaveOk here? */
    _tailingQueryOptions |= QueryOption_AwaitData;

    readersCreatedStats.increment();
}

bool OplogReader::connect(const HostAndPort& host) {
    if (conn() == NULL || _host != host) {
        resetConnection();
        _conn = shared_ptr<DBClientConnection>(
            new DBClientConnection(false, durationCount<Seconds>(kSocketTimeout)));
        string errmsg;
        if (!_conn->connect(host, errmsg) ||
            (getGlobalAuthorizationManager()->isAuthEnabled() && !replAuthenticate(_conn.get()))) {
            resetConnection();
            error() << errmsg << endl;
            return false;
        }
        _conn->port().tag |= executor::NetworkInterface::kMessagingPortKeepOpen;
        _host = host;
    }
    return true;
}

void OplogReader::tailCheck() {
    if (cursor.get() && cursor->isDead()) {
        log() << "old cursor isDead, will initiate a new one" << std::endl;
        resetCursor();
    }
}

void OplogReader::query(
    const char* ns, Query query, int nToReturn, int nToSkip, const BSONObj* fields) {
    cursor.reset(
        _conn->query(ns, query, nToReturn, nToSkip, fields, QueryOption_SlaveOk).release());
}

void OplogReader::tailingQuery(const char* ns, const BSONObj& query) {
    verify(!haveCursor());
    LOG(2) << ns << ".find(" << query.toString() << ')' << endl;
    cursor.reset(_conn->query(ns, query, 0, 0, nullptr, _tailingQueryOptions).release());
}

void OplogReader::tailingQueryGTE(const char* ns, Timestamp optime) {
    BSONObjBuilder gte;
    gte.append("$gte", optime);
    BSONObjBuilder query;
    query.append("ts", gte.done());
    tailingQuery(ns, query.done());
}

HostAndPort OplogReader::getHost() const {
    return _host;
}

void OplogReader::connectToSyncSource(OperationContext* txn,
                                      const OpTime& lastOpTimeFetched,
                                      ReplicationCoordinator* replCoord) {
    const Timestamp sentinelTimestamp(duration_cast<Seconds>(Milliseconds(curTimeMillis64())), 0);
    const OpTime sentinel(sentinelTimestamp, std::numeric_limits<long long>::max());
    OpTime oldestOpTimeSeen = sentinel;

    invariant(conn() == NULL);

    while (true) {
        HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched.getTimestamp());

        if (candidate.empty()) {
            if (oldestOpTimeSeen == sentinel) {
                // If, in this invocation of connectToSyncSource(), we did not successfully
                // connect to any node ahead of us,
                // we apparently have no sync sources to connect to.
                // This situation is common; e.g. if there are no writes to the primary at
                // the moment.
                return;
            }

            // Connected to at least one member, but in all cases we were too stale to use them
            // as a sync source.
            error() << "too stale to catch up";
            log() << "our last optime : " << lastOpTimeFetched;
            log() << "oldest available is " << oldestOpTimeSeen;
            log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
            setMinValid(txn, oldestOpTimeSeen);
            bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING);
            if (!worked) {
                warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                          << ". Current state: " << replCoord->getMemberState();
            }
            return;
        }

        if (!connect(candidate)) {
            LOG(2) << "can't connect to " << candidate.toString() << " to read operations";
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Seconds(10));
            continue;
        }
        // Read the first (oldest) op and confirm that it's not newer than our last
        // fetched op. Otherwise, we have fallen off the back of that source's oplog.
        BSONObj remoteOldestOp(findOne(rsOplogName.c_str(), Query()));
        OpTime remoteOldOpTime = fassertStatusOK(28776, OpTime::parseFromBSON(remoteOldestOp));

        // remoteOldOpTime may come from a very old config, so we cannot compare their terms.
        if (!lastOpTimeFetched.isNull() &&
            lastOpTimeFetched.getTimestamp() < remoteOldOpTime.getTimestamp()) {
            // We're too stale to use this sync source.
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Minutes(1));
            if (oldestOpTimeSeen.getTimestamp() > remoteOldOpTime.getTimestamp()) {
                warning() << "we are too stale to use " << candidate.toString()
                          << " as a sync source";
                oldestOpTimeSeen = remoteOldOpTime;
            }
            continue;
        }

        // Got a valid sync source.
        return;
    }  // while (true)
}

}  // namespace repl
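
connectToSyncSource rejects a candidate when our last fetched optime is older than the oldest op remaining in the candidate's oplog. The core comparison, reduced to plain integer timestamps for illustration:

#include <cstdint>

// Returns true when we've fallen off the back of the candidate's oplog.
// A lastFetchedTs of 0 stands in for a null optime (nothing fetched yet).
bool tooStaleFor(std::int64_t lastFetchedTs, std::int64_t remoteOldestTs) {
    return lastFetchedTs != 0 && lastFetchedTs < remoteOldestTs;
}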
Example #21
File: curop.cpp Project: Axv2/mongo
namespace mongo {

    // todo : move more here

    CurOp::CurOp( Client * client , CurOp * wrapped ) :
        _client(client),
        _wrapped(wrapped)
    {
        if ( _wrapped )
            _client->_curOp = this;
        _start = 0;
        _active = false;
        _reset();
        _op = 0;
        _opNum = _nextOpNum++;
        // These addresses should never be written to again.  The zeroes are
        // placed here as a precaution because currentOp may be accessed
        // without the db mutex.
        memset(_ns, 0, sizeof(_ns));
    }

    void CurOp::_reset() {
        _suppressFromCurop = false;
        _command = false;
        _dbprofile = 0;
        _end = 0;
        _maxTimeTracker.reset();
        _message = "";
        _progressMeter.finished();
        _killPending.store(0);
        killCurrentOp.notifyAllWaiters();
        _numYields = 0;
        _expectedLatencyMs = 0;
        _lockStat.reset();
    }

    void CurOp::reset() {
        _reset();
        _start = 0;
        _opNum = _nextOpNum++;
        _ns[0] = 0;
        _debug.reset();
        _query.reset();
        _active = true; // this should be last for ui clarity
    }

    CurOp* CurOp::getOp(const BSONObj& criteria) {
        // Regarding Matcher: This is not quite the right hammer to use here.
        // Future: use an actual property of CurOp to flag index builds
        // and use that to filter.
        // This will probably need refactoring once we change index builds
        // to be a real command instead of an insert into system.indexes
        Matcher matcher(criteria);

        Client& me = cc();

        scoped_lock client_lock(Client::clientsMutex);
        for (std::set<Client*>::iterator it = Client::clients.begin();
             it != Client::clients.end();
             it++) {

            Client *client = *it;
            verify(client);

            CurOp* curop = client->curop();
            if (client == &me || curop == NULL) {
                continue;
            }

            if ( !curop->active() )
                continue;

            if ( curop->killPendingStrict() )
                continue;

            BSONObj info = curop->description();
            if (matcher.matches(info)) {
                return curop;
            }
        }

        return NULL;
    }

    void CurOp::reset( const HostAndPort& remote, int op ) {
        reset();
        if( _remote != remote ) {
            // todo : _remote is not thread safe yet is used as such!
            _remote = remote;
        }
        _op = op;
    }

    ProgressMeter& CurOp::setMessage(const char * msg,
                                     std::string name,
                                     unsigned long long progressMeterTotal,
                                     int secondsBetween) {
        if ( progressMeterTotal ) {
            if ( _progressMeter.isActive() ) {
                cout << "about to assert, old _message: " << _message << " new message:" << msg << endl;
                verify( ! _progressMeter.isActive() );
            }
            _progressMeter.reset( progressMeterTotal , secondsBetween );
            _progressMeter.setName(name);
        }
        else {
            _progressMeter.finished();
        }
        _message = msg;
        return _progressMeter;
    }

    CurOp::~CurOp() {
        killCurrentOp.notifyAllWaiters();

        if ( _wrapped ) {
            scoped_lock bl(Client::clientsMutex);
            _client->_curOp = _wrapped;
        }
        _client = 0;
    }

    void CurOp::ensureStarted() {
        if ( _start == 0 )
            _start = curTimeMicros64();
    }

    void CurOp::enter( Client::Context * context ) {
        ensureStarted();

        strncpy( _ns, context->ns(), Namespace::MaxNsLen);
        _ns[Namespace::MaxNsLen] = 0;

        _dbprofile = std::max( context->_db ? context->_db->getProfilingLevel() : 0 , _dbprofile );
    }

    void CurOp::leave( Client::Context * context ) {
    }

    void CurOp::recordGlobalTime( long long micros ) const {
        if ( _client ) {
            const LockState& ls = _client->lockState();
            verify( ls.threadState() );
            Top::global.record( _ns , _op , ls.hasAnyWriteLock() ? 1 : -1 , micros , _command );
        }
    }

    BSONObj CurOp::info() {
        BSONObjBuilder b;
        b.append("opid", _opNum);
        bool a = _active && _start;
        b.append("active", a);

        if( a ) {
            b.append("secs_running", elapsedSeconds() );
        }

        b.append( "op" , opToString( _op ) );

        b.append("ns", _ns);

        if (_op == dbInsert) {
            _query.append(b, "insert");
        }
        else {
            _query.append(b , "query");
        }

        if( !_remote.empty() ) {
            b.append("client", _remote.toString());
        }

        if ( _client ) {
            b.append( "desc" , _client->desc() );
            if ( _client->_threadId.size() )
                b.append( "threadId" , _client->_threadId );
            if ( _client->_connectionId )
                b.appendNumber( "connectionId" , _client->_connectionId );
            _client->_ls.reportState(b);
        }

        if ( ! _message.empty() ) {
            if ( _progressMeter.isActive() ) {
                StringBuilder buf;
                buf << _message.toString() << " " << _progressMeter.toString();
                b.append( "msg" , buf.str() );
                BSONObjBuilder sub( b.subobjStart( "progress" ) );
                sub.appendNumber( "done" , (long long)_progressMeter.done() );
                sub.appendNumber( "total" , (long long)_progressMeter.total() );
                sub.done();
            }
            else {
                b.append( "msg" , _message.toString() );
            }
        }

        if( killPending() )
            b.append("killPending", true);

        b.append( "numYields" , _numYields );
        b.append( "lockStats" , _lockStat.report() );

        return b.obj();
    }

    BSONObj CurOp::description() {
        BSONObjBuilder bob;
        bool a = _active && _start;
        bob.append("active", a);
        bob.append( "op" , opToString( _op ) );
        bob.append("ns", _ns);
        if (_op == dbInsert) {
            _query.append(bob, "insert");
        }
        else {
            _query.append(bob, "query");
        }
        if( killPending() )
            bob.append("killPending", true);
        return bob.obj();
    }

    void CurOp::setKillWaiterFlags() {
        for (size_t i = 0; i < _notifyList.size(); ++i)
            *(_notifyList[i]) = true;
        _notifyList.clear();
    }

    void CurOp::kill(bool* pNotifyFlag /* = NULL */) {
        _killPending.store(1);
        if (pNotifyFlag) {
            _notifyList.push_back(pNotifyFlag);
        }
    }

    void CurOp::setMaxTimeMicros(uint64_t maxTimeMicros) {
        if (maxTimeMicros == 0) {
            // 0 is "allow to run indefinitely".
            return;
        }

        // Note that calling startTime() will set CurOp::_start if it hasn't been set yet.
        _maxTimeTracker.setTimeLimit(startTime(), maxTimeMicros);
    }

    bool CurOp::maxTimeHasExpired() {
        return _maxTimeTracker.checkTimeLimit();
    }

    uint64_t CurOp::getRemainingMaxTimeMicros() const {
        return _maxTimeTracker.getRemainingMicros();
    }

    AtomicUInt CurOp::_nextOpNum;

    static Counter64 returnedCounter;
    static Counter64 insertedCounter;
    static Counter64 updatedCounter;
    static Counter64 deletedCounter;
    static Counter64 scannedCounter;

    static ServerStatusMetricField<Counter64> displayReturned( "document.returned", &returnedCounter );
    static ServerStatusMetricField<Counter64> displayUpdated( "document.updated", &updatedCounter );
    static ServerStatusMetricField<Counter64> displayInserted( "document.inserted", &insertedCounter );
    static ServerStatusMetricField<Counter64> displayDeleted( "document.deleted", &deletedCounter );
    static ServerStatusMetricField<Counter64> displayScanned( "queryExecutor.scanned", &scannedCounter );

    static Counter64 idhackCounter;
    static Counter64 scanAndOrderCounter;
    static Counter64 fastmodCounter;

    static ServerStatusMetricField<Counter64> displayIdhack( "operation.idhack", &idhackCounter );
    static ServerStatusMetricField<Counter64> displayScanAndOrder( "operation.scanAndOrder", &scanAndOrderCounter );
    static ServerStatusMetricField<Counter64> displayFastMod( "operation.fastmod", &fastmodCounter );

    void OpDebug::recordStats() {
        if ( nreturned > 0 )
            returnedCounter.increment( nreturned );
        if ( ninserted > 0 )
            insertedCounter.increment( ninserted );
        if ( nupdated > 0 )
            updatedCounter.increment( nupdated );
        if ( ndeleted > 0 )
            deletedCounter.increment( ndeleted );
        if ( nscanned > 0 )
            scannedCounter.increment( nscanned );

        if ( idhack )
            idhackCounter.increment();
        if ( scanAndOrder )
            scanAndOrderCounter.increment();
        if ( fastmod )
            fastmodCounter.increment();
    }

    CurOp::MaxTimeTracker::MaxTimeTracker() {
        reset();
    }

    void CurOp::MaxTimeTracker::reset() {
        _enabled = false;
        _targetEpochMicros = 0;
        _approxTargetServerMillis = 0;
    }

    void CurOp::MaxTimeTracker::setTimeLimit(uint64_t startEpochMicros, uint64_t durationMicros) {
        dassert(durationMicros != 0);

        _enabled = true;

        _targetEpochMicros = startEpochMicros + durationMicros;

        uint64_t now = curTimeMicros64();
        // If our accurate time source thinks time is not up yet, calculate the next target for
        // our approximate time source.
        if (_targetEpochMicros > now) {
            _approxTargetServerMillis = Listener::getElapsedTimeMillis() +
                                        static_cast<int64_t>((_targetEpochMicros - now) / 1000);
        }
        // Otherwise, set our approximate time source target such that it thinks time is already
        // up.
        else {
            _approxTargetServerMillis = Listener::getElapsedTimeMillis();
        }
    }

    bool CurOp::MaxTimeTracker::checkTimeLimit() {
        if (!_enabled) {
            return false;
        }

        // Does our approximate time source think time is not up yet?  If so, return early.
        if (_approxTargetServerMillis > Listener::getElapsedTimeMillis()) {
            return false;
        }

        uint64_t now = curTimeMicros64();
        // Does our accurate time source think time is not up yet?  If so, readjust the target for
        // our approximate time source and return early.
        if (_targetEpochMicros > now) {
            _approxTargetServerMillis = Listener::getElapsedTimeMillis() +
                                        static_cast<int64_t>((_targetEpochMicros - now) / 1000);
            return false;
        }

        // Otherwise, time is up.
        return true;
    }

    uint64_t CurOp::MaxTimeTracker::getRemainingMicros() const {
        if (!_enabled) {
            // 0 is "allow to run indefinitely".
            return 0;
        }

        // Does our accurate time source think time is up?  If so, claim there is 1 microsecond
        // left for this operation.
        uint64_t now = curTimeMicros64();
        if (_targetEpochMicros <= now) {
            return 1;
        }

        // Otherwise, calculate remaining time.
        return _targetEpochMicros - now;
    }

}
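
MaxTimeTracker above keeps two targets: a precise one in microseconds and an approximate one against a cheap coarse clock, so most expiry checks avoid the expensive clock read. A sketch of that two-clock scheme; the clock functions are stand-ins for curTimeMicros64() and Listener::getElapsedTimeMillis():

#include <chrono>
#include <cstdint>

// Stand-in clocks: the precise one plays the role of curTimeMicros64(),
// the coarse one the role of Listener::getElapsedTimeMillis().
static std::uint64_t preciseNowMicros() {
    return static_cast<std::uint64_t>(
        std::chrono::duration_cast<std::chrono::microseconds>(
            std::chrono::steady_clock::now().time_since_epoch()).count());
}
static std::int64_t coarseNowMillis() {
    return static_cast<std::int64_t>(preciseNowMicros() / 1000);
}

class TimeLimit {
public:
    void set(std::uint64_t startMicros, std::uint64_t durationMicros) {
        _enabled = true;
        _targetMicros = startMicros + durationMicros;
        retargetCoarse(preciseNowMicros());
    }
    bool expired() {
        if (!_enabled)
            return false;
        if (coarseNowMillis() < _coarseTargetMillis)
            return false;                      // cheap path: clearly not up yet
        const std::uint64_t now = preciseNowMicros();
        if (now < _targetMicros) {             // accurate clock says not up:
            retargetCoarse(now);               // push the coarse target out
            return false;
        }
        return true;
    }
private:
    void retargetCoarse(std::uint64_t nowMicros) {
        const std::uint64_t left =
            _targetMicros > nowMicros ? _targetMicros - nowMicros : 0;
        _coarseTargetMillis = coarseNowMillis() + static_cast<std::int64_t>(left / 1000);
    }
    bool _enabled = false;
    std::uint64_t _targetMicros = 0;
    std::int64_t _coarseTargetMillis = 0;
};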
    DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents( OperationContext* txn,
                                                            int lenToAlloc ) {
        // align size up to a multiple of 4
        lenToAlloc = (lenToAlloc + (4-1)) & ~(4-1);

        freelistAllocs.increment();
        DiskLoc loc;
        {
            DiskLoc *prev = 0;
            DiskLoc *bestprev = 0;
            DiskLoc bestmatch;
            int bestmatchlen = INT_MAX; // sentinel meaning we haven't found a record big enough
            int b = bucket(lenToAlloc);
            DiskLoc cur = _details->deletedListEntry(b);
            
            int extra = 5; // look for a better fit, a little.
            int chain = 0;
            while ( 1 ) {
                { // defensive check
                    int fileNumber = cur.a();
                    int fileOffset = cur.getOfs();
                    if (fileNumber < -1 || fileNumber >= 100000 || fileOffset < 0) {
                        StringBuilder sb;
                        sb << "Deleted record list corrupted in collection " << _ns
                           << ", bucket " << b
                           << ", link number " << chain
                           << ", invalid link is " << cur.toString()
                           << ", throwing Fatal Assertion";
                        log() << sb.str() << endl;
                        fassertFailed(16469);
                    }
                }
                if ( cur.isNull() ) {
                    // move to next bucket.  if we were doing "extra", just break
                    if ( bestmatchlen < INT_MAX )
                        break;

                    if ( chain > 0 ) {
                        // if we looked at things in the right bucket, but they were not suitable
                        freelistBucketExhausted.increment();
                    }

                    b++;
                    if ( b > MaxBucket ) {
                        // out of space. alloc a new extent.
                        freelistIterations.increment( 1 + chain );
                        return DiskLoc();
                    }
                    cur = _details->deletedListEntry(b);
                    prev = 0;
                    continue;
                }
                DeletedRecord *r = drec(cur);
                if ( r->lengthWithHeaders() >= lenToAlloc &&
                     r->lengthWithHeaders() < bestmatchlen ) {
                    bestmatchlen = r->lengthWithHeaders();
                    bestmatch = cur;
                    bestprev = prev;
                    if (r->lengthWithHeaders() == lenToAlloc)
                        // exact match, stop searching
                        break;
                }
                if ( bestmatchlen < INT_MAX && --extra <= 0 )
                    break;
                if ( ++chain > 30 && b <= MaxBucket ) {
                    // too slow, force move to next bucket to grab a big chunk
                    //b++;
                    freelistIterations.increment( chain );
                    chain = 0;
                    cur.Null();
                }
                else {
                    cur = r->nextDeleted();
                    prev = &r->nextDeleted();
                }
            }

            // unlink ourself from the deleted list
            DeletedRecord *bmr = drec(bestmatch);
            if ( bestprev ) {
                *txn->recoveryUnit()->writing(bestprev) = bmr->nextDeleted();
            }
            else {
                // should be the front of a free-list
                int myBucket = bucket(bmr->lengthWithHeaders());
                invariant( _details->deletedListEntry(myBucket) == bestmatch );
                _details->setDeletedListEntry(txn, myBucket, bmr->nextDeleted());
            }
            *txn->recoveryUnit()->writing(&bmr->nextDeleted()) = DiskLoc().setInvalid(); // defensive.
            invariant(bmr->extentOfs() < bestmatch.getOfs());

            freelistIterations.increment( 1 + chain );
            loc = bestmatch;
        }

        if ( loc.isNull() )
            return loc;

        // determine if we should chop up

        DeletedRecord *r = drec(loc);

        /* note we want to grab from the front so our next pointers on disk tend
        to go in a forward direction which is important for performance. */
        int regionlen = r->lengthWithHeaders();
        invariant( r->extentOfs() < loc.getOfs() );

        int left = regionlen - lenToAlloc;
        if ( left < 24 || left < (lenToAlloc / 8) ) {
            // you get the whole thing.
            return loc;
        }

        // don't quantize:
        //   - $ collections (indexes) as we already have those aligned the way we want SERVER-8425
        if ( _normalCollection ) {
            // we quantize here so that it only impacts newly sized records
            // this prevents oddities with older records and space re-use SERVER-8435
            lenToAlloc = std::min( r->lengthWithHeaders(),
                                   quantizeAllocationSpace( lenToAlloc ) );
            left = regionlen - lenToAlloc;

            if ( left < 24 ) {
                // you get the whole thing.
                return loc;
            }
        }

        /* split off some for further use. */
        txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc;
        DiskLoc newDelLoc = loc;
        newDelLoc.inc(lenToAlloc);
        DeletedRecord* newDel = drec(newDelLoc);
        DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel);
        newDelW->extentOfs() = r->extentOfs();
        newDelW->lengthWithHeaders() = left;
        newDelW->nextDeleted().Null();

        addDeletedRec( txn, newDelLoc );
        return loc;
    }
namespace mongo {

    static Counter64 freelistAllocs;
    static Counter64 freelistBucketExhausted;
    static Counter64 freelistIterations;

    static ServerStatusMetricField<Counter64> dFreelist1( "storage.freelist.search.requests",
                                                          &freelistAllocs );

    static ServerStatusMetricField<Counter64> dFreelist2( "storage.freelist.search.bucketExhausted",
                                                          &freelistBucketExhausted );

    static ServerStatusMetricField<Counter64> dFreelist3( "storage.freelist.search.scanned",
                                                          &freelistIterations );

    SimpleRecordStoreV1::SimpleRecordStoreV1( OperationContext* txn,
                                              const StringData& ns,
                                              RecordStoreV1MetaData* details,
                                              ExtentManager* em,
                                              bool isSystemIndexes )
        : RecordStoreV1Base( ns, details, em, isSystemIndexes ) {

        invariant( !details->isCapped() );
        _normalCollection = NamespaceString::normal( ns );
        if ( _details->paddingFactor() == 0 ) {
            warning() << "implicit updgrade of paddingFactor of very old collection" << endl;
            WriteUnitOfWork wunit(txn);
            _details->setPaddingFactor(txn, 1.0);
            wunit.commit();
        }

    }

    SimpleRecordStoreV1::~SimpleRecordStoreV1() {
    }

    DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents( OperationContext* txn,
                                                            int lenToAlloc ) {
        // align size up to a multiple of 4
        lenToAlloc = (lenToAlloc + (4-1)) & ~(4-1);
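        // Illustrative arithmetic: lenToAlloc == 13 becomes (13 + 3) & ~3 == 16,
        // while an already-aligned 16 stays 16: (16 + 3) & ~3 == 16.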

        freelistAllocs.increment();
        DiskLoc loc;
        {
            DiskLoc *prev = 0;
            DiskLoc *bestprev = 0;
            DiskLoc bestmatch;
            int bestmatchlen = INT_MAX; // sentinel meaning we haven't found a record big enough
            int b = bucket(lenToAlloc);
            DiskLoc cur = _details->deletedListEntry(b);
            
            int extra = 5; // look for a better fit, a little.
            int chain = 0;
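            // Bounded best-fit scan (see the loop below): keep the smallest record
            // that fits; stop on an exact match, or after examining 'extra' more
            // links once a fit has been found. After 30 links in one bucket, give
            // up on that bucket and move to the next, larger one.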
            while ( 1 ) {
                { // defensive check
                    int fileNumber = cur.a();
                    int fileOffset = cur.getOfs();
                    if (fileNumber < -1 || fileNumber >= 100000 || fileOffset < 0) {
                        StringBuilder sb;
                        sb << "Deleted record list corrupted in collection " << _ns
                           << ", bucket " << b
                           << ", link number " << chain
                           << ", invalid link is " << cur.toString()
                           << ", throwing Fatal Assertion";
                        log() << sb.str() << endl;
                        fassertFailed(16469);
                    }
                }
                if ( cur.isNull() ) {
                    // move to next bucket.  if we were doing "extra", just break
                    if ( bestmatchlen < INT_MAX )
                        break;

                    if ( chain > 0 ) {
                        // if we looked at things in the right bucket, but they were not suitable
                        freelistBucketExhausted.increment();
                    }

                    b++;
                    if ( b > MaxBucket ) {
                        // out of space. alloc a new extent.
                        freelistIterations.increment( 1 + chain );
                        return DiskLoc();
                    }
                    cur = _details->deletedListEntry(b);
                    prev = 0;
                    continue;
                }
                DeletedRecord *r = drec(cur);
                if ( r->lengthWithHeaders() >= lenToAlloc &&
                     r->lengthWithHeaders() < bestmatchlen ) {
                    bestmatchlen = r->lengthWithHeaders();
                    bestmatch = cur;
                    bestprev = prev;
                    if (r->lengthWithHeaders() == lenToAlloc)
                        // exact match, stop searching
                        break;
                }
                if ( bestmatchlen < INT_MAX && --extra <= 0 )
                    break;
                if ( ++chain > 30 && b <= MaxBucket ) {
                    // too slow, force move to next bucket to grab a big chunk
                    //b++;
                    freelistIterations.increment( chain );
                    chain = 0;
                    cur.Null();
                }
                else {
                    cur = r->nextDeleted();
                    prev = &r->nextDeleted();
                }
            }

            // unlink ourself from the deleted list
            DeletedRecord *bmr = drec(bestmatch);
            if ( bestprev ) {
                *txn->recoveryUnit()->writing(bestprev) = bmr->nextDeleted();
            }
            else {
                // should be the front of a free-list
                int myBucket = bucket(bmr->lengthWithHeaders());
                invariant( _details->deletedListEntry(myBucket) == bestmatch );
                _details->setDeletedListEntry(txn, myBucket, bmr->nextDeleted());
            }
            *txn->recoveryUnit()->writing(&bmr->nextDeleted()) = DiskLoc().setInvalid(); // defensive.
            invariant(bmr->extentOfs() < bestmatch.getOfs());

            freelistIterations.increment( 1 + chain );
            loc = bestmatch;
        }

        if ( loc.isNull() )
            return loc;

        // determine if we should chop up

        DeletedRecord *r = drec(loc);

        /* note we want to grab from the front so our next pointers on disk tend
        to go in a forward direction which is important for performance. */
        int regionlen = r->lengthWithHeaders();
        invariant( r->extentOfs() < loc.getOfs() );

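        // 'left' is what would remain of this deleted record after carving out
        // lenToAlloc bytes. Splitting only pays off if the remainder is at least
        // 24 bytes and at least 1/8 of the allocation; e.g. regionlen == 1000 and
        // lenToAlloc == 990 leaves left == 10, so the caller gets all 1000 bytes.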
        int left = regionlen - lenToAlloc;
        if ( left < 24 || left < (lenToAlloc / 8) ) {
            // you get the whole thing.
            return loc;
        }

        // don't quantize:
        //   - $ collections (indexes) as we already have those aligned the way we want SERVER-8425
        if ( _normalCollection ) {
            // we quantize here so that it only impacts newly sized records
            // this prevents oddities with older records and space re-use SERVER-8435
            lenToAlloc = std::min( r->lengthWithHeaders(),
                                   quantizeAllocationSpace( lenToAlloc ) );
            left = regionlen - lenToAlloc;

            if ( left < 24 ) {
                // you get the whole thing.
                return loc;
            }
        }

        /* split off some for further use. */
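        // Shape of the split within the extent:
        //   before: [ loc ................ regionlen bytes ................ ]
        //   after:  [ loc ... lenToAlloc ... ][ newDelLoc ..... left ..... ]
        // The remainder becomes a fresh DeletedRecord and is put back on a freelist.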
        txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc;
        DiskLoc newDelLoc = loc;
        newDelLoc.inc(lenToAlloc);
        DeletedRecord* newDel = drec(newDelLoc);
        DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel);
        newDelW->extentOfs() = r->extentOfs();
        newDelW->lengthWithHeaders() = left;
        newDelW->nextDeleted().Null();

        addDeletedRec( txn, newDelLoc );
        return loc;
    }

    StatusWith<DiskLoc> SimpleRecordStoreV1::allocRecord( OperationContext* txn,
                                                          int lengthWithHeaders,
                                                          bool enforceQuota ) {
        DiskLoc loc = _allocFromExistingExtents( txn, lengthWithHeaders );
        if ( !loc.isNull() )
            return StatusWith<DiskLoc>( loc );

        LOG(1) << "allocating new extent";

        increaseStorageSize( txn,
                             _extentManager->followupSize( lengthWithHeaders,
                                                           _details->lastExtentSize(txn)),
                             enforceQuota );

        loc = _allocFromExistingExtents( txn, lengthWithHeaders );
        if ( !loc.isNull() ) {
            // got on first try
            return StatusWith<DiskLoc>( loc );
        }

        log() << "warning: alloc() failed after allocating new extent. "
              << "lengthWithHeaders: " << lengthWithHeaders << " last extent size:"
              << _details->lastExtentSize(txn) << "; trying again";

        for ( int z = 0; z < 10 && lengthWithHeaders > _details->lastExtentSize(txn); z++ ) {
            log() << "try #" << z << endl;

            increaseStorageSize( txn,
                                 _extentManager->followupSize( lengthWithHeaders,
                                                               _details->lastExtentSize(txn)),
                                 enforceQuota );

            loc = _allocFromExistingExtents( txn, lengthWithHeaders );
            if ( ! loc.isNull() )
                return StatusWith<DiskLoc>( loc );
        }

        return StatusWith<DiskLoc>( ErrorCodes::InternalError, "cannot allocate space" );
    }

    Status SimpleRecordStoreV1::truncate(OperationContext* txn) {
        const DiskLoc firstExtLoc = _details->firstExtent(txn);
        if (firstExtLoc.isNull() || !firstExtLoc.isValid()) {
            // Already empty
            return Status::OK();
        }

        // Free all extents except the first.
        Extent* firstExt = _extentManager->getExtent(firstExtLoc);
        if (!firstExt->xnext.isNull()) {
            const DiskLoc extNextLoc = firstExt->xnext;
            const DiskLoc oldLastExtLoc = _details->lastExtent(txn);
            Extent* const nextExt = _extentManager->getExtent(extNextLoc);

            // Unlink the other extents.
            *txn->recoveryUnit()->writing(&nextExt->xprev) = DiskLoc();
            *txn->recoveryUnit()->writing(&firstExt->xnext) = DiskLoc();
            _details->setLastExtent(txn, firstExtLoc);
            _details->setLastExtentSize(txn, firstExt->length);

            _extentManager->freeExtents(txn, extNextLoc, oldLastExtLoc);
        }

        // Make the first (now only) extent a single large deleted record.
        *txn->recoveryUnit()->writing(&firstExt->firstRecord) = DiskLoc();
        *txn->recoveryUnit()->writing(&firstExt->lastRecord) = DiskLoc();
        _details->orphanDeletedList(txn);
        addDeletedRec(txn, _findFirstSpot(txn, firstExtLoc, firstExt));

        // Make stats reflect that there are now no documents in this record store.
        _details->setStats(txn, 0, 0);

        return Status::OK();
    }

    void SimpleRecordStoreV1::addDeletedRec( OperationContext* txn, const DiskLoc& dloc ) {
        DeletedRecord* d = drec( dloc );

        DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs() << endl;

        int b = bucket(d->lengthWithHeaders());
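        // Push-front onto bucket b's singly-linked freelist: the new record's
        // nextDeleted is pointed at the current head, then the head becomes dloc.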
        *txn->recoveryUnit()->writing(&d->nextDeleted()) = _details->deletedListEntry(b);
        _details->setDeletedListEntry(txn, b, dloc);
    }

    RecordIterator* SimpleRecordStoreV1::getIterator( OperationContext* txn,
                                                      const DiskLoc& start,
                                                      bool tailable,
                                                      const CollectionScanParams::Direction& dir) const {
        return new SimpleRecordStoreV1Iterator( txn, this, start, dir );
    }

    vector<RecordIterator*> SimpleRecordStoreV1::getManyIterators( OperationContext* txn ) const {
        OwnedPointerVector<RecordIterator> iterators;
        const Extent* ext;
        for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) {
            ext = _getExtent(txn, extLoc);
            if (ext->firstRecord.isNull())
                continue;
            iterators.push_back(
                new RecordStoreV1Base::IntraExtentIterator(txn, ext->firstRecord, this));
        }

        return iterators.release();
    }

    class CompactDocWriter : public DocWriter {
    public:
        /**
         * param allocationSize - allocation size WITH header
         */
        CompactDocWriter( const Record* rec, unsigned dataSize, size_t allocationSize )
            : _rec( rec ),
              _dataSize( dataSize ),
              _allocationSize( allocationSize ) {
        }

        virtual ~CompactDocWriter() {}

        virtual void writeDocument( char* buf ) const {
            memcpy( buf, _rec->data(), _dataSize );
        }

        virtual size_t documentSize() const {
            return _allocationSize - Record::HeaderSize;
        }

        virtual bool addPadding() const {
            return false;
        }

    private:
        const Record* _rec;
        size_t _dataSize;
        size_t _allocationSize;
    };

    void SimpleRecordStoreV1::_compactExtent(OperationContext* txn,
                                             const DiskLoc extentLoc,
                                             int extentNumber,
                                             RecordStoreCompactAdaptor* adaptor,
                                             const CompactOptions* compactOptions,
                                             CompactStats* stats ) {

        log() << "compact begin extent #" << extentNumber
              << " for namespace " << _ns << " " << extentLoc;

        unsigned oldObjSize = 0; // we'll report what the old padding was
        unsigned oldObjSizeWithPadding = 0;

        Extent* const sourceExtent = _extentManager->getExtent( extentLoc );
        sourceExtent->assertOk();
        fassert( 17437, sourceExtent->validates(extentLoc) );

        {
            // The next/prev Record pointers within the Extent might not be in order so we first
            // page in the whole Extent sequentially.
            // TODO benchmark on slow storage to verify this is measurably faster.
            log() << "compact paging in len=" << sourceExtent->length/1000000.0 << "MB" << endl;
            Timer t;
            size_t length = sourceExtent->length;

            touch_pages( reinterpret_cast<const char*>(sourceExtent), length );
            int ms = t.millis();
            if( ms > 1000 )
                log() << "compact end paging in " << ms << "ms "
                      << sourceExtent->length/1000000.0/t.seconds() << "MB/sec" << endl;
        }

        {
            // Move each Record out of this extent and insert it in to the "new" extents.
            log() << "compact copying records" << endl;
            long long totalNetSize = 0;
            long long nrecords = 0;
            DiskLoc nextSourceLoc = sourceExtent->firstRecord;
            while (!nextSourceLoc.isNull()) {
                txn->checkForInterrupt();

                WriteUnitOfWork wunit(txn);
                Record* recOld = recordFor(nextSourceLoc);
                RecordData oldData = recOld->toRecordData();
                nextSourceLoc = getNextRecordInExtent(txn, nextSourceLoc);

                if ( compactOptions->validateDocuments && !adaptor->isDataValid( oldData ) ) {
                    // object is corrupt!
                    log() << "compact removing corrupt document!";
                    stats->corruptDocuments++;
                }
                else {
                    // How much data is in the record. Excludes padding and Record headers.
                    const unsigned rawDataSize = adaptor->dataSize( oldData );

                    nrecords++;
                    oldObjSize += rawDataSize;
                    oldObjSizeWithPadding += recOld->netLength();

                    // Allocation sizes include the headers and possibly some padding.
                    const unsigned minAllocationSize = rawDataSize + Record::HeaderSize;
                    unsigned allocationSize = minAllocationSize;
                    switch( compactOptions->paddingMode ) {
                    case CompactOptions::NONE: // no padding, unless using powerOf2Sizes
                        if ( _details->isUserFlagSet(Flag_UsePowerOf2Sizes) )
                            allocationSize = quantizePowerOf2AllocationSpace(minAllocationSize);
                        else
                            allocationSize = minAllocationSize;
                        break;

                    case CompactOptions::PRESERVE: // keep original padding
                        allocationSize = recOld->lengthWithHeaders();
                        break;

                    case CompactOptions::MANUAL: // user specified how much padding to use
                        allocationSize = compactOptions->computeRecordSize(minAllocationSize);
                        if (allocationSize < minAllocationSize
                                || allocationSize > BSONObjMaxUserSize / 2 ) {
                            allocationSize = minAllocationSize;
                        }
                        break;
                    }
                    invariant(allocationSize >= minAllocationSize);

                    // Copy the data to a new record. Because we orphaned the record freelist at the
                    // start of the compact, this insert will allocate a record in a new extent.
                    // See the comment in compact() for more details.
                    CompactDocWriter writer( recOld, rawDataSize, allocationSize );
                    StatusWith<DiskLoc> status = insertRecord( txn, &writer, false );
                    uassertStatusOK( status.getStatus() );
                    const Record* newRec = recordFor(status.getValue());
                    invariant(unsigned(newRec->netLength()) >= rawDataSize);
                    totalNetSize += newRec->netLength();

                    // Tells the caller that the record has been moved, so it can do things such as
                    // add it to indexes.
                    adaptor->inserted(newRec->toRecordData(), status.getValue());
                }

                // Remove the old record from the linked list of records within the sourceExtent.
                // The old record is not added to the freelist as we will be freeing the whole
                // extent at the end.
                *txn->recoveryUnit()->writing(&sourceExtent->firstRecord) = nextSourceLoc;
                if (nextSourceLoc.isNull()) {
                    // Just moved the last record out of the extent. Mark extent as empty.
                    *txn->recoveryUnit()->writing(&sourceExtent->lastRecord) = DiskLoc();
                }
                else {
                    Record* newFirstRecord = recordFor(nextSourceLoc);
                    txn->recoveryUnit()->writingInt(newFirstRecord->prevOfs()) = DiskLoc::NullOfs;
                }

                // Adjust the stats to reflect the removal of the old record. The insert above
                // handled adjusting the stats for the new record.
                _details->incrementStats(txn, -(recOld->netLength()), -1);

                wunit.commit();
            }

            // The extent must now be empty.
            invariant(sourceExtent->firstRecord.isNull());
            invariant(sourceExtent->lastRecord.isNull());

            // We are still the first extent, but we must not be the only extent.
            invariant( _details->firstExtent(txn) == extentLoc );
            invariant( _details->lastExtent(txn) != extentLoc );

            // Remove the newly emptied sourceExtent from the extent linked list and return it to
            // the extent manager.
            WriteUnitOfWork wunit(txn);
            const DiskLoc newFirst = sourceExtent->xnext;
            _details->setFirstExtent( txn, newFirst );
            *txn->recoveryUnit()->writing(&_extentManager->getExtent( newFirst )->xprev) = DiskLoc();
            _extentManager->freeExtent( txn, extentLoc );
            wunit.commit();

            {
                const double oldPadding = oldObjSize ? double(oldObjSizeWithPadding) / oldObjSize
                                                     : 1.0; // defining 0/0 as 1 for this.

                log() << "compact finished extent #" << extentNumber << " containing " << nrecords
                      << " documents (" << totalNetSize / (1024*1024.0) << "MB)"
                      << " oldPadding: " << oldPadding;
            }
        }

    }

    Status SimpleRecordStoreV1::compact( OperationContext* txn,
                                         RecordStoreCompactAdaptor* adaptor,
                                         const CompactOptions* options,
                                         CompactStats* stats ) {

        std::vector<DiskLoc> extents;
        for( DiskLoc extLocation = _details->firstExtent(txn);
             !extLocation.isNull();
             extLocation = _extentManager->getExtent( extLocation )->xnext ) {
            extents.push_back( extLocation );
        }
        log() << "compact " << extents.size() << " extents";

        {
            WriteUnitOfWork wunit(txn);
            // Orphaning the deleted lists ensures that all inserts go to new extents rather than
            // the ones that existed before starting the compact. If we abort the operation before
            // completion, any free space in the old extents will be leaked and never reused unless
            // the collection is compacted again or dropped. This is considered an acceptable
            // failure mode as no data will be lost.
            log() << "compact orphan deleted lists" << endl;
            _details->orphanDeletedList(txn);

            // Start over from scratch with our extent sizing and growth
            _details->setLastExtentSize( txn, 0 );

            // create a new extent so new records go there
            increaseStorageSize( txn, _details->lastExtentSize(txn), true );
            wunit.commit();
        }

        ProgressMeterHolder pm(*txn->setMessage("compact extent",
                                                "Extent Compacting Progress",
                                                extents.size()));

        // Go through all old extents and move each record to a new set of extents.
        int extentNumber = 0;
        for( std::vector<DiskLoc>::iterator it = extents.begin(); it != extents.end(); it++ ) {
            txn->checkForInterrupt();
            invariant(_details->firstExtent(txn) == *it);
            // empties and removes the first extent
            _compactExtent(txn, *it, extentNumber++, adaptor, options, stats );
            invariant(_details->firstExtent(txn) != *it);
            pm.hit();
        }

        invariant( _extentManager->getExtent( _details->firstExtent(txn) )->xprev.isNull() );
        invariant( _extentManager->getExtent( _details->lastExtent(txn) )->xnext.isNull() );

        // indexes will do their own progress meter
        pm.finished();

        return Status::OK();
    }

}
Example #24
0
DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents(OperationContext* txn, int lenToAllocRaw) {
    // Slowly drain the deletedListLegacyGrabBag by popping one record off and putting it in the
    // correct deleted list each time we try to allocate a new record. This ensures we won't
    // orphan any data when upgrading from old versions, without needing a long upgrade phase.
    // This is done before we try to allocate the new record so we can take advantage of the new
    // space immediately.
    {
        const DiskLoc head = _details->deletedListLegacyGrabBag();
        if (!head.isNull()) {
            _details->setDeletedListLegacyGrabBag(txn, drec(head)->nextDeleted());
            addDeletedRec(txn, head);
        }
    }

    // align size up to a multiple of 4
    const int lenToAlloc = (lenToAllocRaw + (4 - 1)) & ~(4 - 1);

    freelistAllocs.increment();
    DiskLoc loc;
    DeletedRecord* dr = NULL;
    {
        int myBucket;
        for (myBucket = bucket(lenToAlloc); myBucket < Buckets; myBucket++) {
            // Only look at the first entry in each bucket. This works because we are either
            // quantizing or allocating fixed-size blocks.
            const DiskLoc head = _details->deletedListEntry(myBucket);
            if (head.isNull())
                continue;
            DeletedRecord* const candidate = drec(head);
            if (candidate->lengthWithHeaders() >= lenToAlloc) {
                loc = head;
                dr = candidate;
                break;
            }
        }

        if (!dr)
            return DiskLoc();  // no space

        // Unlink ourself from the deleted list
        _details->setDeletedListEntry(txn, myBucket, dr->nextDeleted());
        *txn->recoveryUnit()->writing(&dr->nextDeleted()) = DiskLoc().setInvalid();  // defensive
    }

    invariant(dr->extentOfs() < loc.getOfs());

    // Split the deleted record if it has at least as much left over space as our smallest
    // allocation size. Otherwise, just take the whole DeletedRecord.
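    // (Contrast with the older version of this function in the previous example,
    // which used the ad-hoc thresholds left < 24 || left < lenToAlloc / 8; here
    // the cutoff is simply bucketSizes[0], the smallest freelist bucket size.)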
    const int remainingLength = dr->lengthWithHeaders() - lenToAlloc;
    if (remainingLength >= bucketSizes[0]) {
        txn->recoveryUnit()->writingInt(dr->lengthWithHeaders()) = lenToAlloc;
        const DiskLoc newDelLoc = DiskLoc(loc.a(), loc.getOfs() + lenToAlloc);
        DeletedRecord* newDel = txn->recoveryUnit()->writing(drec(newDelLoc));
        newDel->extentOfs() = dr->extentOfs();
        newDel->lengthWithHeaders() = remainingLength;
        newDel->nextDeleted().Null();

        addDeletedRec(txn, newDelLoc);
    }

    return loc;
}
Example #25
0
long long get() const {
    return cursorStatsMultiTarget.get() + cursorStatsSingleTarget.get();
}
Example #26
0
namespace mongo {

using std::unique_ptr;
using std::endl;
using std::string;
using std::stringstream;

const int ShardedClientCursor::INIT_REPLY_BUFFER_SIZE = 32768;

// Note: There is no counter for shardedEver from cursorInfo since it is deprecated
static Counter64 cursorStatsMultiTarget;
static Counter64 cursorStatsSingleTarget;

// Simple class to report the sum total open cursors = sharded + refs
class CursorStatsSum {
public:
    operator long long() const {
        return get();
    }
    long long get() const {
        return cursorStatsMultiTarget.get() + cursorStatsSingleTarget.get();
    }
};

static CursorStatsSum cursorStatsTotalOpen;

// --------  ShardedCursor -----------

ShardedClientCursor::ShardedClientCursor(QueryMessage& q, ParallelSortClusteredCursor* cursor) {
    verify(cursor);
    _cursor = cursor;

    _skip = q.ntoskip;
    _ntoreturn = q.ntoreturn;

    _totalSent = 0;
    _done = false;

    _id = 0;

    if (q.queryOptions & QueryOption_NoCursorTimeout) {
        _lastAccessMillis = 0;
    } else
        _lastAccessMillis = Listener::getElapsedTimeMillis();

    cursorStatsMultiTarget.increment();
}

ShardedClientCursor::~ShardedClientCursor() {
    verify(_cursor);
    delete _cursor;
    _cursor = 0;
    cursorStatsMultiTarget.decrement();
}

long long ShardedClientCursor::getId() {
    if (_id <= 0) {
        _id = cursorCache.genId();
        verify(_id >= 0);
    }
    return _id;
}

int ShardedClientCursor::getTotalSent() const {
    return _totalSent;
}

void ShardedClientCursor::accessed() {
    if (_lastAccessMillis > 0)
        _lastAccessMillis = Listener::getElapsedTimeMillis();
}

long long ShardedClientCursor::idleTime(long long now) {
    if (_lastAccessMillis == 0)
        return 0;
    return now - _lastAccessMillis;
}

bool ShardedClientCursor::sendNextBatch(int batchSize, BufBuilder& buffer, int& docCount) {
    uassert(10191, "cursor already done", !_done);

    int maxSize = 1024 * 1024;
    if (_totalSent > 0)
        maxSize *= 3;
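    // The size cap is checked after each document is appended, so a batch may
    // overshoot maxSize by at most one document: ~1MB for the first batch,
    // ~3MB for later ones.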

    docCount = 0;

    // If batchSize is negative, it means that we should send up to -batchSize results
    // back to the client, and that we should only send a *single batch*. A batchSize of
    // 1 is also a special case which means "return up to 1 result in a single batch" (so
    // that +1 actually has the same meaning as -1). For all other values of batchSize, we
    // may have to return multiple batches.
    const bool sendMoreBatches = batchSize == 0 || batchSize > 1;
    batchSize = abs(batchSize);

    // Set the initial batch size to 101, just like mongoD.
    if (batchSize == 0 && _totalSent == 0)
        batchSize = 101;

    // Set batch size to batchSize requested by the current operation unconditionally.  This is
    // necessary because if the loop exited due to docCount == batchSize then setBatchSize(0) was
    // called, so the next _cursor->more() will be called with a batch size of 0 if the cursor
    // buffer was drained the previous run.  Unconditionally setting the batch size ensures that
    // we don't ask for a batch size of zero as a side effect.
    _cursor->setBatchSize(batchSize);

    bool cursorHasMore = true;
    while ((cursorHasMore = _cursor->more())) {
        BSONObj o = _cursor->next();
        buffer.appendBuf((void*)o.objdata(), o.objsize());
        ++docCount;

        // Ensure that the next batch will never wind up requesting more docs from the shard
        // than are remaining to satisfy the initial batchSize.
        if (batchSize != 0) {
            if (docCount == batchSize)
                break;
            _cursor->setBatchSize(batchSize - docCount);
        }

        if (buffer.len() > maxSize) {
            break;
        }
    }

    // We need to request another batch if the following two conditions hold:
    //
    //  1. batchSize is positive and not equal to 1 (see the comment above). This condition
    //  is stored in 'sendMoreBatches'.
    //
    //  2. The last call to _cursor->more() was true (i.e. we never explicitly got a false
    //  value from _cursor->more()). This condition is stored in 'cursorHasMore'. If the server
    //  hits EOF while executing a query or a getmore, it will pass a cursorId of 0 in the
    //  query response to indicate that there are no more results. In this case, _cursor->more()
    //  will be explicitly false, and we know for sure that we do not have to send more batches.
    //
    //  On the other hand, if _cursor->more() is true there may or may not be more results.
    //  Suppose that the mongod generates enough results to fill this batch. In this case it
    //  does not know whether or not there are more, because doing so would require requesting an
    //  extra result and seeing whether we get EOF. The mongod sends a valid cursorId to
    //  indicate that there may be more. We do the same here: we indicate that there may be
    //  more results to retrieve by setting 'hasMoreBatches' to true.
    bool hasMoreBatches = sendMoreBatches && cursorHasMore;

    LOG(5) << "\t hasMoreBatches: " << hasMoreBatches << " sendMoreBatches: " << sendMoreBatches
           << " cursorHasMore: " << cursorHasMore << " batchSize: " << batchSize
           << " num: " << docCount << " id:" << getId() << " totalSent: " << _totalSent << endl;

    _totalSent += docCount;
    _done = !hasMoreBatches;

    return hasMoreBatches;
}

// ---- CursorCache -----

unsigned getCCRandomSeed() {
    unique_ptr<SecureRandom> sr(SecureRandom::create());
    return sr->nextInt64();
}

CursorCache::CursorCache() : _random(getCCRandomSeed()), _shardedTotal(0) {}

CursorCache::~CursorCache() {
    // TODO: delete old cursors?
    bool print = shouldLog(logger::LogSeverity::Debug(1));
    if (_cursors.size() || _refs.size())
        print = true;
    verify(_refs.size() == _refsNS.size());

    if (print)
        log() << " CursorCache at shutdown - "
              << " sharded: " << _cursors.size() << " passthrough: " << _refs.size() << endl;
}

ShardedClientCursorPtr CursorCache::get(long long id) const {
    LOG(_myLogLevel) << "CursorCache::get id: " << id << endl;
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    MapSharded::const_iterator i = _cursors.find(id);
    if (i == _cursors.end()) {
        return ShardedClientCursorPtr();
    }
    i->second->accessed();
    return i->second;
}

int CursorCache::getMaxTimeMS(long long id) const {
    verify(id);
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    MapShardedInt::const_iterator i = _cursorsMaxTimeMS.find(id);
    return (i != _cursorsMaxTimeMS.end()) ? i->second : 0;
}

void CursorCache::store(ShardedClientCursorPtr cursor, int maxTimeMS) {
    LOG(_myLogLevel) << "CursorCache::store cursor "
                     << " id: " << cursor->getId()
                     << (maxTimeMS != kMaxTimeCursorNoTimeLimit
                             ? str::stream() << "maxTimeMS: " << maxTimeMS
                             : string("")) << endl;
    verify(cursor->getId());
    verify(maxTimeMS == kMaxTimeCursorTimeLimitExpired || maxTimeMS == kMaxTimeCursorNoTimeLimit ||
           maxTimeMS > 0);
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    _cursorsMaxTimeMS[cursor->getId()] = maxTimeMS;
    _cursors[cursor->getId()] = cursor;
    _shardedTotal++;
}

void CursorCache::updateMaxTimeMS(long long id, int maxTimeMS) {
    verify(id);
    verify(maxTimeMS == kMaxTimeCursorTimeLimitExpired || maxTimeMS == kMaxTimeCursorNoTimeLimit ||
           maxTimeMS > 0);
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    _cursorsMaxTimeMS[id] = maxTimeMS;
}

void CursorCache::remove(long long id) {
    verify(id);
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    _cursorsMaxTimeMS.erase(id);
    _cursors.erase(id);
}

void CursorCache::removeRef(long long id) {
    verify(id);
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    _refs.erase(id);
    _refsNS.erase(id);
    cursorStatsSingleTarget.decrement();
}

void CursorCache::storeRef(const std::string& server, long long id, const std::string& ns) {
    LOG(_myLogLevel) << "CursorCache::storeRef server: " << server << " id: " << id << endl;
    verify(id);
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    _refs[id] = server;
    _refsNS[id] = ns;
    cursorStatsSingleTarget.increment();
}

string CursorCache::getRef(long long id) const {
    verify(id);
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    MapNormal::const_iterator i = _refs.find(id);

    LOG(_myLogLevel) << "CursorCache::getRef id: " << id
                     << " out: " << (i == _refs.end() ? " NONE " : i->second) << endl;

    if (i == _refs.end())
        return "";
    return i->second;
}

std::string CursorCache::getRefNS(long long id) const {
    verify(id);
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    MapNormal::const_iterator i = _refsNS.find(id);

    LOG(_myLogLevel) << "CursorCache::getRefNs id: " << id
                     << " out: " << (i == _refsNS.end() ? " NONE " : i->second) << std::endl;

    if (i == _refsNS.end())
        return "";
    return i->second;
}


long long CursorCache::genId() {
    while (true) {
        stdx::lock_guard<stdx::mutex> lk(_mutex);

        long long x = Listener::getElapsedTimeMillis() << 32;
        x |= _random.nextInt32();
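        // Id layout: elapsed-time millis in the high 32 bits, a random value in
        // the low 32. The checks below reject 0 (a cursor id of 0 means "no
        // cursor"), force the id positive, and retry on collision with any id
        // already in _cursors or _refs.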

        if (x == 0)
            continue;

        if (x < 0)
            x *= -1;

        MapSharded::iterator i = _cursors.find(x);
        if (i != _cursors.end())
            continue;

        MapNormal::iterator j = _refs.find(x);
        if (j != _refs.end())
            continue;

        return x;
    }
}

void CursorCache::gotKillCursors(Message& m) {
    LastError::get(cc()).disable();
    DbMessage dbmessage(m);
    int n = dbmessage.pullInt();

    if (n > 2000) {
        (n < 30000 ? warning() : error()) << "receivedKillCursors, n=" << n << endl;
    }

    uassert(13286, "sent 0 cursors to kill", n >= 1);
    uassert(13287, "too many cursors to kill", n < 30000);
    massert(18632,
            str::stream() << "bad kill cursors size: " << m.dataSize(),
            m.dataSize() == 8 + (8 * n));


    ConstDataCursor cursors(dbmessage.getArray(n));

    ClientBasic* client = ClientBasic::getCurrent();
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    for (int i = 0; i < n; i++) {
        long long id = cursors.readAndAdvance<LittleEndian<int64_t>>();
        LOG(_myLogLevel) << "CursorCache::gotKillCursors id: " << id << endl;

        if (!id) {
            warning() << " got cursor id of 0 to kill" << endl;
            continue;
        }

        string server;
        {
            stdx::lock_guard<stdx::mutex> lk(_mutex);

            MapSharded::iterator i = _cursors.find(id);
            if (i != _cursors.end()) {
                Status authorizationStatus =
                    authSession->checkAuthForKillCursors(NamespaceString(i->second->getNS()), id);
                audit::logKillCursorsAuthzCheck(
                    client,
                    NamespaceString(i->second->getNS()),
                    id,
                    authorizationStatus.isOK() ? ErrorCodes::OK : ErrorCodes::Unauthorized);
                if (authorizationStatus.isOK()) {
                    _cursorsMaxTimeMS.erase(i->second->getId());
                    _cursors.erase(i);
                }
                continue;
            }

            MapNormal::iterator refsIt = _refs.find(id);
            MapNormal::iterator refsNSIt = _refsNS.find(id);
            if (refsIt == _refs.end()) {
                warning() << "can't find cursor: " << id << endl;
                continue;
            }
            verify(refsNSIt != _refsNS.end());
            Status authorizationStatus =
                authSession->checkAuthForKillCursors(NamespaceString(refsNSIt->second), id);
            audit::logKillCursorsAuthzCheck(client,
                                            NamespaceString(refsNSIt->second),
                                            id,
                                            authorizationStatus.isOK() ? ErrorCodes::OK
                                                                       : ErrorCodes::Unauthorized);
            if (!authorizationStatus.isOK()) {
                continue;
            }
            server = refsIt->second;
            _refs.erase(refsIt);
            _refsNS.erase(refsNSIt);
            cursorStatsSingleTarget.decrement();
        }

        LOG(_myLogLevel) << "CursorCache::found gotKillCursors id: " << id << " server: " << server
                         << endl;

        verify(server.size());
        ScopedDbConnection conn(server);
        conn->killCursor(id);
        conn.done();
    }
}

void CursorCache::appendInfo(BSONObjBuilder& result) const {
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    result.append("sharded", static_cast<int>(cursorStatsMultiTarget.get()));
    result.appendNumber("shardedEver", _shardedTotal);
    result.append("refs", static_cast<int>(cursorStatsSingleTarget.get()));
    result.append("totalOpen", static_cast<int>(cursorStatsTotalOpen.get()));
}

void CursorCache::doTimeouts() {
    long long now = Listener::getElapsedTimeMillis();
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    for (MapSharded::iterator i = _cursors.begin(); i != _cursors.end(); ++i) {
        // Note: cursors with no timeout will always have an idleTime of 0
        long long idleFor = i->second->idleTime(now);
        if (idleFor < ClusterCursorCleanupJob::cursorTimeoutMillis) {
            continue;
        }
        log() << "killing old cursor " << i->second->getId() << " idle for: " << idleFor << "ms"
              << endl;  // TODO: make LOG(1)
        _cursorsMaxTimeMS.erase(i->second->getId());
        _cursors.erase(i);
        i = _cursors.begin();  // the loop's ++i will skip this first entry; it gets examined on the next pass
        if (i == _cursors.end())
            break;
    }
}

CursorCache cursorCache;

const int CursorCache::_myLogLevel = 3;

class CursorTimeoutTask : public task::Task {
public:
    virtual string name() const {
        return "cursorTimeout";
    }
    virtual void doWork() {
        cursorCache.doTimeouts();
    }
};

void CursorCache::startTimeoutThread() {
    task::repeat(new CursorTimeoutTask, 4000);
}
}  // namespace mongo
Example #27
0
namespace mongo {

namespace {
const auto bannedExpressionsInValidators = std::set<StringData>{
    "$geoNear", "$near", "$nearSphere", "$text", "$where",
};
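// The recursive walk below also catches banned operators nested inside objects or
// arrays, e.g. {loc: {$near: [0, 0]}} or {$or: [{$where: "..."}]}.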

Status checkValidatorForBannedExpressions(const BSONObj& validator) {
    for (auto field : validator) {
        const auto name = field.fieldNameStringData();
        if (name[0] == '$' && bannedExpressionsInValidators.count(name)) {
            return {ErrorCodes::InvalidOptions,
                    str::stream() << name << " is not allowed in collection validators"};
        }

        if (field.type() == Object || field.type() == Array) {
            auto status = checkValidatorForBannedExpressions(field.Obj());
            if (!status.isOK())
                return status;
        }
    }

    return Status::OK();
}
}

using std::unique_ptr;
using std::endl;
using std::string;
using std::vector;

using logger::LogComponent;

std::string CompactOptions::toString() const {
    std::stringstream ss;
    ss << "paddingMode: ";
    switch (paddingMode) {
        case NONE:
            ss << "NONE";
            break;
        case PRESERVE:
            ss << "PRESERVE";
            break;
        case MANUAL:
            ss << "MANUAL (" << paddingBytes << " + ( doc * " << paddingFactor << ") )";
    }

    ss << " validateDocuments: " << validateDocuments;

    return ss.str();
}

//
// CappedInsertNotifier
//

CappedInsertNotifier::CappedInsertNotifier() : _version(0), _dead(false) {}

void CappedInsertNotifier::notifyAll() {
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    ++_version;
    _notifier.notify_all();
}

void CappedInsertNotifier::_wait(stdx::unique_lock<stdx::mutex>& lk,
                                 uint64_t prevVersion,
                                 Microseconds timeout) const {
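    // Versioned wait: callers capture _version before blocking, notifyAll() bumps
    // it under the mutex, and the loop re-checks so spurious wakeups are ignored.
    // A timed wait simply returns after one timeout, whether or not an insert
    // happened.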
    while (!_dead && prevVersion == _version) {
        if (timeout == Microseconds::max()) {
            _notifier.wait(lk);
        } else if (stdx::cv_status::timeout == _notifier.wait_for(lk, timeout)) {
            return;
        }
    }
}

void CappedInsertNotifier::wait(uint64_t prevVersion, Microseconds timeout) const {
    stdx::unique_lock<stdx::mutex> lk(_mutex);
    _wait(lk, prevVersion, timeout);
}

void CappedInsertNotifier::wait(Microseconds timeout) const {
    stdx::unique_lock<stdx::mutex> lk(_mutex);
    _wait(lk, _version, timeout);
}

void CappedInsertNotifier::wait() const {
    stdx::unique_lock<stdx::mutex> lk(_mutex);
    _wait(lk, _version, Microseconds::max());
}

void CappedInsertNotifier::kill() {
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    _dead = true;
    _notifier.notify_all();
}

bool CappedInsertNotifier::isDead() {
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    return _dead;
}

// ----

Collection::Collection(OperationContext* txn,
                       StringData fullNS,
                       CollectionCatalogEntry* details,
                       RecordStore* recordStore,
                       DatabaseCatalogEntry* dbce)
    : _ns(fullNS),
      _details(details),
      _recordStore(recordStore),
      _dbce(dbce),
      _needCappedLock(supportsDocLocking() && _recordStore->isCapped() && _ns.db() != "local"),
      _infoCache(this),
      _indexCatalog(this),
      _validatorDoc(_details->getCollectionOptions(txn).validator.getOwned()),
      _validator(uassertStatusOK(parseValidator(_validatorDoc))),
      _validationAction(uassertStatusOK(
          _parseValidationAction(_details->getCollectionOptions(txn).validationAction))),
      _validationLevel(uassertStatusOK(
          _parseValidationLevel(_details->getCollectionOptions(txn).validationLevel))),
      _cursorManager(fullNS),
      _cappedNotifier(_recordStore->isCapped() ? new CappedInsertNotifier() : nullptr),
      _mustTakeCappedLockOnInsert(isCapped() && !_ns.isSystemDotProfile() && !_ns.isOplog()) {
    _magic = 1357924;
    _indexCatalog.init(txn);
    if (isCapped())
        _recordStore->setCappedCallback(this);

    _infoCache.init(txn);
}

Collection::~Collection() {
    verify(ok());
    _magic = 0;
    if (_cappedNotifier) {
        _cappedNotifier->kill();
    }
}

bool Collection::requiresIdIndex() const {
    if (_ns.ns().find('$') != string::npos) {
        // no indexes on indexes
        return false;
    }

    if (_ns.isSystem()) {
        StringData shortName = _ns.coll().substr(_ns.coll().find('.') + 1);
        if (shortName == "indexes" || shortName == "namespaces" || shortName == "profile") {
            return false;
        }
    }

    if (_ns.db() == "local") {
        if (_ns.coll().startsWith("oplog."))
            return false;
    }

    if (!_ns.isSystem()) {
        // non system collections definitely have an _id index
        return true;
    }


    return true;
}

std::unique_ptr<SeekableRecordCursor> Collection::getCursor(OperationContext* txn,
                                                            bool forward) const {
    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IS));
    invariant(ok());

    return _recordStore->getCursor(txn, forward);
}

vector<std::unique_ptr<RecordCursor>> Collection::getManyCursors(OperationContext* txn) const {
    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IS));

    return _recordStore->getManyCursors(txn);
}

Snapshotted<BSONObj> Collection::docFor(OperationContext* txn, const RecordId& loc) const {
    return Snapshotted<BSONObj>(txn->recoveryUnit()->getSnapshotId(),
                                _recordStore->dataFor(txn, loc).releaseToBson());
}

bool Collection::findDoc(OperationContext* txn,
                         const RecordId& loc,
                         Snapshotted<BSONObj>* out) const {
    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IS));

    RecordData rd;
    if (!_recordStore->findRecord(txn, loc, &rd))
        return false;
    *out = Snapshotted<BSONObj>(txn->recoveryUnit()->getSnapshotId(), rd.releaseToBson());
    return true;
}

Status Collection::checkValidation(OperationContext* txn, const BSONObj& document) const {
    if (!_validator)
        return Status::OK();

    if (_validationLevel == OFF)
        return Status::OK();

    if (documentValidationDisabled(txn))
        return Status::OK();

    if (_validator->matchesBSON(document))
        return Status::OK();

    if (_validationAction == WARN) {
        warning() << "Document would fail validation"
                  << " collection: " << ns() << " doc: " << document;
        return Status::OK();
    }

    return {ErrorCodes::DocumentValidationFailure, "Document failed validation"};
}

StatusWithMatchExpression Collection::parseValidator(const BSONObj& validator) const {
    if (validator.isEmpty())
        return {nullptr};

    if (ns().isSystem()) {
        return {ErrorCodes::InvalidOptions,
                "Document validators not allowed on system collections."};
    }

    if (ns().isOnInternalDb()) {
        return {ErrorCodes::InvalidOptions,
                str::stream() << "Document validators are not allowed on collections in"
                              << " the " << ns().db() << " database"};
    }

    {
        auto status = checkValidatorForBannedExpressions(validator);
        if (!status.isOK())
            return status;
    }

    auto statusWithMatcher =
        MatchExpressionParser::parse(validator, ExtensionsCallbackDisallowExtensions());
    if (!statusWithMatcher.isOK())
        return statusWithMatcher.getStatus();

    return statusWithMatcher;
}

Status Collection::insertDocument(OperationContext* txn, const DocWriter* doc, bool enforceQuota) {
    invariant(!_validator || documentValidationDisabled(txn));
    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));
    invariant(!_indexCatalog.haveAnyIndexes());  // eventually can implement, just not done

    if (_mustTakeCappedLockOnInsert)
        synchronizeOnCappedInFlightResource(txn->lockState(), _ns);

    StatusWith<RecordId> loc = _recordStore->insertRecord(txn, doc, _enforceQuota(enforceQuota));
    if (!loc.isOK())
        return loc.getStatus();

    // we cannot call into the OpObserver here because the document being written is not present
    // fortunately, this is currently only used for adding entries to the oplog.

    txn->recoveryUnit()->onCommit([this]() { notifyCappedWaitersIfNeeded(); });

    return loc.getStatus();
}


Status Collection::insertDocuments(OperationContext* txn,
                                   const vector<BSONObj>::const_iterator begin,
                                   const vector<BSONObj>::const_iterator end,
                                   bool enforceQuota,
                                   bool fromMigrate) {
    // Should really be done in the collection object at creation and updated on index create.
    const bool hasIdIndex = _indexCatalog.findIdIndex(txn);

    for (auto it = begin; it != end; it++) {
        if (hasIdIndex && (*it)["_id"].eoo()) {
            return Status(ErrorCodes::InternalError,
                          str::stream() << "Collection::insertDocument got "
                                           "document without _id for ns:" << _ns.ns());
        }

        auto status = checkValidation(txn, *it);
        if (!status.isOK())
            return status;
    }

    const SnapshotId sid = txn->recoveryUnit()->getSnapshotId();

    if (_mustTakeCappedLockOnInsert)
        synchronizeOnCappedInFlightResource(txn->lockState(), _ns);

    Status status = _insertDocuments(txn, begin, end, enforceQuota);
    if (!status.isOK())
        return status;
    invariant(sid == txn->recoveryUnit()->getSnapshotId());

    getGlobalServiceContext()->getOpObserver()->onInserts(txn, ns(), begin, end, fromMigrate);

    txn->recoveryUnit()->onCommit([this]() { notifyCappedWaitersIfNeeded(); });

    return Status::OK();
}

Status Collection::insertDocument(OperationContext* txn,
                                  const BSONObj& docToInsert,
                                  bool enforceQuota,
                                  bool fromMigrate) {
    vector<BSONObj> docs;
    docs.push_back(docToInsert);
    return insertDocuments(txn, docs.begin(), docs.end(), enforceQuota, fromMigrate);
}

Status Collection::insertDocument(OperationContext* txn,
                                  const BSONObj& doc,
                                  MultiIndexBlock* indexBlock,
                                  bool enforceQuota) {
    {
        auto status = checkValidation(txn, doc);
        if (!status.isOK())
            return status;
    }

    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));

    if (_mustTakeCappedLockOnInsert)
        synchronizeOnCappedInFlightResource(txn->lockState(), _ns);

    StatusWith<RecordId> loc =
        _recordStore->insertRecord(txn, doc.objdata(), doc.objsize(), _enforceQuota(enforceQuota));

    if (!loc.isOK())
        return loc.getStatus();

    Status status = indexBlock->insert(doc, loc.getValue());
    if (!status.isOK())
        return status;

    vector<BSONObj> docs;
    docs.push_back(doc);
    getGlobalServiceContext()->getOpObserver()->onInserts(txn, ns(), docs.begin(), docs.end());

    txn->recoveryUnit()->onCommit([this]() { notifyCappedWaitersIfNeeded(); });

    return loc.getStatus();
}

Status Collection::_insertDocuments(OperationContext* txn,
                                    const vector<BSONObj>::const_iterator begin,
                                    const vector<BSONObj>::const_iterator end,
                                    bool enforceQuota) {
    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));

    if (isCapped() && _indexCatalog.haveAnyIndexes() && std::distance(begin, end) > 1) {
        // We require that inserts to indexed capped collections be done one-at-a-time to avoid the
        // possibility that a later document causes an earlier document to be deleted before it can
        // be indexed.
        // TODO SERVER-21512 It would be better to handle this here by just doing single inserts.
        return {ErrorCodes::OperationCannotBeBatched,
                "Can't batch inserts into indexed capped collections"};
    }

    if (_needCappedLock) {
        // X-lock the metadata resource for this capped collection until the end of the WUOW. This
        // prevents the primary from executing with more concurrency than secondaries.
        // See SERVER-21646.
        Lock::ResourceLock{txn->lockState(), ResourceId(RESOURCE_METADATA, _ns.ns()), MODE_X};
    }

    std::vector<Record> records;
    for (auto it = begin; it != end; it++) {
        Record record = {RecordId(), RecordData(it->objdata(), it->objsize())};
        records.push_back(record);
    }
    Status status = _recordStore->insertRecords(txn, &records, _enforceQuota(enforceQuota));
    if (!status.isOK())
        return status;

    std::vector<BsonRecord> bsonRecords;
    int recordIndex = 0;
    for (auto it = begin; it != end; it++) {
        RecordId loc = records[recordIndex++].id;
        invariant(RecordId::min() < loc);
        invariant(loc < RecordId::max());

        BsonRecord bsonRecord = {loc, &(*it)};
        bsonRecords.push_back(bsonRecord);
    }

    return _indexCatalog.indexRecords(txn, bsonRecords);
}

void Collection::notifyCappedWaitersIfNeeded() {
    // If there is a notifier object and another thread is waiting on it, then we notify
    // waiters of this document insert. Waiters keep a shared_ptr to '_cappedNotifier', so
    // there are waiters if this Collection's shared_ptr is not unique (use_count > 1).
    if (_cappedNotifier && !_cappedNotifier.unique())
        _cappedNotifier->notifyAll();
}

Status Collection::aboutToDeleteCapped(OperationContext* txn,
                                       const RecordId& loc,
                                       RecordData data) {
    /* check if any cursors point to us.  if so, advance them. */
    _cursorManager.invalidateDocument(txn, loc, INVALIDATION_DELETION);

    BSONObj doc = data.releaseToBson();
    _indexCatalog.unindexRecord(txn, doc, loc, false);

    return Status::OK();
}

void Collection::deleteDocument(
    OperationContext* txn, const RecordId& loc, bool fromMigrate, bool cappedOK, bool noWarn) {
    if (isCapped() && !cappedOK) {
        log() << "failing remove on a capped ns " << _ns << endl;
        uasserted(10089, "cannot remove from a capped collection");
        return;
    }

    Snapshotted<BSONObj> doc = docFor(txn, loc);

    auto opObserver = getGlobalServiceContext()->getOpObserver();
    OpObserver::DeleteState deleteState = opObserver->aboutToDelete(txn, ns(), doc.value());

    /* check if any cursors point to us.  if so, advance them. */
    _cursorManager.invalidateDocument(txn, loc, INVALIDATION_DELETION);

    _indexCatalog.unindexRecord(txn, doc.value(), loc, noWarn);

    _recordStore->deleteRecord(txn, loc);

    opObserver->onDelete(txn, ns(), std::move(deleteState), fromMigrate);
}

Counter64 moveCounter;
ServerStatusMetricField<Counter64> moveCounterDisplay("record.moves", &moveCounter);

StatusWith<RecordId> Collection::updateDocument(OperationContext* txn,
                                                const RecordId& oldLocation,
                                                const Snapshotted<BSONObj>& oldDoc,
                                                const BSONObj& newDoc,
                                                bool enforceQuota,
                                                bool indexesAffected,
                                                OpDebug* debug,
                                                oplogUpdateEntryArgs& args) {
    {
        auto status = checkValidation(txn, newDoc);
        if (!status.isOK()) {
            if (_validationLevel == STRICT_V) {
                return status;
            }
            // moderate means we have to check the old doc
            auto oldDocStatus = checkValidation(txn, oldDoc.value());
            if (oldDocStatus.isOK()) {
                // transitioning from good -> bad is not ok
                return status;
            }
            // bad -> bad is ok in moderate mode
        }
    }

    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));
    invariant(oldDoc.snapshotId() == txn->recoveryUnit()->getSnapshotId());

    if (_needCappedLock) {
        // X-lock the metadata resource for this capped collection until the end of the WUOW. This
        // prevents the primary from executing with more concurrency than secondaries.
        // See SERVER-21646.
        Lock::ResourceLock{txn->lockState(), ResourceId(RESOURCE_METADATA, _ns.ns()), MODE_X};
    }

    SnapshotId sid = txn->recoveryUnit()->getSnapshotId();

    BSONElement oldId = oldDoc.value()["_id"];
    if (!oldId.eoo() && (oldId != newDoc["_id"]))
        return StatusWith<RecordId>(
            ErrorCodes::InternalError, "in Collection::updateDocument _id mismatch", 13596);

    // The MMAPv1 storage engine implements capped collections in a way that does not allow records
    // to grow beyond their original size. If MMAPv1 is part of a replica set with storage engines
    // that do not have this limitation, replication could result in errors, so it is necessary to
    // set a uniform rule here. Similarly, it is not sufficient to disallow growing records, because
    // this happens when secondaries roll back an update that shrunk a record. Exactly replicating
    // legacy MMAPv1 behavior would require padding shrunk documents on all storage engines.
    // Instead, forbid all size changes.
    const auto oldSize = oldDoc.value().objsize();
    if (_recordStore->isCapped() && oldSize != newDoc.objsize())
        return {ErrorCodes::CannotGrowDocumentInCappedNamespace,
                str::stream() << "Cannot change the size of a document in a capped collection: "
                              << oldSize << " != " << newDoc.objsize()};

    // At the end of this step, we will have a map of UpdateTickets, one per index, which
    // represent the index updates needed to be done, based on the changes between oldDoc and
    // newDoc.
    OwnedPointerMap<IndexDescriptor*, UpdateTicket> updateTickets;
    if (indexesAffected) {
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, true);
        while (ii.more()) {
            IndexDescriptor* descriptor = ii.next();
            IndexCatalogEntry* entry = ii.catalogEntry(descriptor);
            IndexAccessMethod* iam = ii.accessMethod(descriptor);

            InsertDeleteOptions options;
            options.logIfError = false;
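            // Duplicate keys are allowed for any index that is neither the _id index nor
            // unique; for unique indexes they are allowed only when the replication
            // coordinator says to ignore the constraint (e.g. while a secondary applies
            // operations).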
            options.dupsAllowed =
                !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique()) ||
                repl::getGlobalReplicationCoordinator()->shouldIgnoreUniqueIndex(descriptor);
            UpdateTicket* updateTicket = new UpdateTicket();
            updateTickets.mutableMap()[descriptor] = updateTicket;
            Status ret = iam->validateUpdate(txn,
                                             oldDoc.value(),
                                             newDoc,
                                             oldLocation,
                                             options,
                                             updateTicket,
                                             entry->getFilterExpression());
            if (!ret.isOK()) {
                return StatusWith<RecordId>(ret);
            }
        }
    }

    // This can call back into Collection::recordStoreGoingToMove.  If that happens, the old
    // object is removed from all indexes.
    StatusWith<RecordId> newLocation = _recordStore->updateRecord(
        txn, oldLocation, newDoc.objdata(), newDoc.objsize(), _enforceQuota(enforceQuota), this);

    if (!newLocation.isOK()) {
        return newLocation;
    }

    // At this point, the old object may or may not still be indexed, depending on if it was
    // moved. If the object did move, we need to add the new location to all indexes.
    if (newLocation.getValue() != oldLocation) {
        if (debug) {
            if (debug->nmoved == -1)  // default of -1 rather than 0
                debug->nmoved = 1;
            else
                debug->nmoved += 1;
        }

        std::vector<BsonRecord> bsonRecords;
        BsonRecord bsonRecord = {newLocation.getValue(), &newDoc};
        bsonRecords.push_back(bsonRecord);
        Status s = _indexCatalog.indexRecords(txn, bsonRecords);
        if (!s.isOK())
            return StatusWith<RecordId>(s);
        invariant(sid == txn->recoveryUnit()->getSnapshotId());
        args.ns = ns().ns();
        getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);

        return newLocation;
    }

    // Object did not move.  We update each index with each respective UpdateTicket.

    if (debug)
        debug->keyUpdates = 0;

    if (indexesAffected) {
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, true);
        while (ii.more()) {
            IndexDescriptor* descriptor = ii.next();
            IndexAccessMethod* iam = ii.accessMethod(descriptor);

            int64_t updatedKeys;
            Status ret = iam->update(txn, *updateTickets.mutableMap()[descriptor], &updatedKeys);
            if (!ret.isOK())
                return StatusWith<RecordId>(ret);
            if (debug)
                debug->keyUpdates += updatedKeys;
        }
    }

    invariant(sid == txn->recoveryUnit()->getSnapshotId());
    args.ns = ns().ns();
    getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);

    return newLocation;
}
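
// Illustrative caller-side sketch (not part of the original source): how a caller
// might drive updateDocument. The WriteUnitOfWork makes the record-store write, the
// index maintenance, and the oplog notification commit or roll back together. The
// function name and its parameters are assumptions for illustration only.
namespace {
Status exampleReplaceDocument(OperationContext* txn,
                              Collection* coll,
                              const RecordId& loc,
                              const Snapshotted<BSONObj>& oldDoc,
                              const BSONObj& newDoc,
                              OpDebug* opDebug) {
    WriteUnitOfWork wuow(txn);
    oplogUpdateEntryArgs args;  // Real callers fill in update details for the oplog.
    StatusWith<RecordId> res = coll->updateDocument(
        txn, loc, oldDoc, newDoc, true /*enforceQuota*/, true /*indexesAffected*/, opDebug, args);
    if (!res.isOK())
        return res.getStatus();
    wuow.commit();
    return Status::OK();
}
}  // namespace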

Status Collection::recordStoreGoingToMove(OperationContext* txn,
                                          const RecordId& oldLocation,
                                          const char* oldBuffer,
                                          size_t oldSize) {
    moveCounter.increment();
    _cursorManager.invalidateDocument(txn, oldLocation, INVALIDATION_DELETION);
    _indexCatalog.unindexRecord(txn, BSONObj(oldBuffer), oldLocation, true);
    return Status::OK();
}

Status Collection::recordStoreGoingToUpdateInPlace(OperationContext* txn, const RecordId& loc) {
    // Broadcast the mutation so that query results stay correct.
    _cursorManager.invalidateDocument(txn, loc, INVALIDATION_MUTATION);
    return Status::OK();
}


bool Collection::updateWithDamagesSupported() const {
    if (_validator)
        return false;

    return _recordStore->updateWithDamagesSupported();
}

StatusWith<RecordData> Collection::updateDocumentWithDamages(
    OperationContext* txn,
    const RecordId& loc,
    const Snapshotted<RecordData>& oldRec,
    const char* damageSource,
    const mutablebson::DamageVector& damages,
    oplogUpdateEntryArgs& args) {
    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));
    invariant(oldRec.snapshotId() == txn->recoveryUnit()->getSnapshotId());
    invariant(updateWithDamagesSupported());

    // Broadcast the mutation so that query results stay correct.
    _cursorManager.invalidateDocument(txn, loc, INVALIDATION_MUTATION);

    auto newRecStatus =
        _recordStore->updateWithDamages(txn, loc, oldRec.value(), damageSource, damages);

    if (newRecStatus.isOK()) {
        args.ns = ns().ns();
        getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);
    }
    return newRecStatus;
}
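
// Illustrative sketch (not part of the original source) of how a caller might
// describe an in-place byte-level change. The DamageEvent field names below
// (sourceOffset, targetOffset, size) follow mutablebson::DamageVector and should
// be treated as an assumption here:
//
//   mutablebson::DamageVector damages;
//   mutablebson::DamageEvent event;
//   event.sourceOffset = 0;   // where the replacement bytes begin in damageSource
//   event.targetOffset = 16;  // byte offset of the region to overwrite in the record
//   event.size = 4;           // number of bytes to overwrite
//   damages.push_back(event);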

bool Collection::_enforceQuota(bool userEnforceQuota) const {
    if (!userEnforceQuota)
        return false;

    if (!mmapv1GlobalOptions.quota)
        return false;

    if (_ns.db() == "local")
        return false;

    if (_ns.isSpecial())
        return false;

    return true;
}

bool Collection::isCapped() const {
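    // The capped insert notifier is created only for capped collections, so its
    // presence doubles as the capped flag.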
    return _cappedNotifier.get();
}

std::shared_ptr<CappedInsertNotifier> Collection::getCappedInsertNotifier() const {
    invariant(isCapped());
    return _cappedNotifier;
}

uint64_t Collection::numRecords(OperationContext* txn) const {
    return _recordStore->numRecords(txn);
}

uint64_t Collection::dataSize(OperationContext* txn) const {
    return _recordStore->dataSize(txn);
}

uint64_t Collection::getIndexSize(OperationContext* opCtx, BSONObjBuilder* details, int scale) {
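    // Note: 'scale' only divides the per-index numbers appended to 'details'; the
    // returned total is always in bytes.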
    IndexCatalog* idxCatalog = getIndexCatalog();

    IndexCatalog::IndexIterator ii = idxCatalog->getIndexIterator(opCtx, true);

    uint64_t totalSize = 0;

    while (ii.more()) {
        IndexDescriptor* d = ii.next();
        IndexAccessMethod* iam = idxCatalog->getIndex(d);

        long long ds = iam->getSpaceUsedBytes(opCtx);

        totalSize += ds;
        if (details) {
            details->appendNumber(d->indexName(), ds / scale);
        }
    }

    return totalSize;
}

/**
 * order will be:
 * 1) store index specs
 * 2) drop indexes
 * 3) truncate record store
 * 4) re-create indexes
 */
Status Collection::truncate(OperationContext* txn) {
    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_X));
    BackgroundOperation::assertNoBgOpInProgForNs(ns());
    invariant(_indexCatalog.numIndexesInProgress(txn) == 0);

    // 1) store index specs
    vector<BSONObj> indexSpecs;
    {
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, false);
        while (ii.more()) {
            const IndexDescriptor* idx = ii.next();
            indexSpecs.push_back(idx->infoObj().getOwned());
        }
    }

    // 2) drop indexes
    Status status = _indexCatalog.dropAllIndexes(txn, true);
    if (!status.isOK())
        return status;
    _cursorManager.invalidateAll(false, "collection truncated");

    // 3) truncate record store
    status = _recordStore->truncate(txn);
    if (!status.isOK())
        return status;

    // 4) re-create indexes
    for (size_t i = 0; i < indexSpecs.size(); i++) {
        status = _indexCatalog.createIndexOnEmptyCollection(txn, indexSpecs[i]);
        if (!status.isOK())
            return status;
    }

    return Status::OK();
}

void Collection::temp_cappedTruncateAfter(OperationContext* txn, RecordId end, bool inclusive) {
    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));
    invariant(isCapped());
    BackgroundOperation::assertNoBgOpInProgForNs(ns());
    invariant(_indexCatalog.numIndexesInProgress(txn) == 0);

    _cursorManager.invalidateAll(false, "capped collection truncated");
    _recordStore->temp_cappedTruncateAfter(txn, end, inclusive);
}

Status Collection::setValidator(OperationContext* txn, BSONObj validatorDoc) {
    invariant(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_X));

    // Make owned early so that the parsed match expression refers to the owned object.
    if (!validatorDoc.isOwned())
        validatorDoc = validatorDoc.getOwned();

    auto statusWithMatcher = parseValidator(validatorDoc);
    if (!statusWithMatcher.isOK())
        return statusWithMatcher.getStatus();

    _details->updateValidator(txn, validatorDoc, getValidationLevel(), getValidationAction());

    _validator = std::move(statusWithMatcher.getValue());
    _validatorDoc = std::move(validatorDoc);
    return Status::OK();
}

StatusWith<Collection::ValidationLevel> Collection::_parseValidationLevel(StringData newLevel) {
    if (newLevel == "") {
        // default
        return STRICT_V;
    } else if (newLevel == "off") {
        return OFF;
    } else if (newLevel == "moderate") {
        return MODERATE;
    } else if (newLevel == "strict") {
        return STRICT_V;
    } else {
        return Status(ErrorCodes::BadValue,
                      str::stream() << "invalid validation level: " << newLevel);
    }
}

StatusWith<Collection::ValidationAction> Collection::_parseValidationAction(StringData newAction) {
    if (newAction == "") {
        // default
        return ERROR_V;
    } else if (newAction == "warn") {
        return WARN;
    } else if (newAction == "error") {
        return ERROR_V;
    } else {
        return Status(ErrorCodes::BadValue,
                      str::stream() << "invalid validation action: " << newAction);
    }
}

StringData Collection::getValidationLevel() const {
    switch (_validationLevel) {
        case STRICT_V:
            return "strict";
        case OFF:
            return "off";
        case MODERATE:
            return "moderate";
    }
    MONGO_UNREACHABLE;
}

StringData Collection::getValidationAction() const {
    switch (_validationAction) {
        case ERROR_V:
            return "error";
        case WARN:
            return "warn";
    }
    MONGO_UNREACHABLE;
}

Status Collection::setValidationLevel(OperationContext* txn, StringData newLevel) {
    invariant(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_X));

    StatusWith<ValidationLevel> status = _parseValidationLevel(newLevel);
    if (!status.isOK()) {
        return status.getStatus();
    }

    _validationLevel = status.getValue();

    _details->updateValidator(txn, _validatorDoc, getValidationLevel(), getValidationAction());

    return Status::OK();
}

Status Collection::setValidationAction(OperationContext* txn, StringData newAction) {
    invariant(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_X));

    StatusWith<ValidationAction> status = _parseValidationAction(newAction);
    if (!status.isOK()) {
        return status.getStatus();
    }

    _validationAction = status.getValue();

    _details->updateValidator(txn, _validatorDoc, getValidationLevel(), getValidationAction());

    return Status::OK();
}


namespace {
class MyValidateAdaptor : public ValidateAdaptor {
public:
    virtual ~MyValidateAdaptor() {}

    virtual Status validate(const RecordData& record, size_t* dataSize) {
        BSONObj obj = record.toBson();
        const Status status = validateBSON(obj.objdata(), obj.objsize());
        if (status.isOK())
            *dataSize = obj.objsize();
        return status;  // Propagate BSON validation failures so the record is flagged invalid.
    }
    }
};

void validateIndexKeyCount(OperationContext* txn,
                           const IndexDescriptor& idx,
                           int64_t numIdxKeys,
                           int64_t numRecs,
                           ValidateResults* results) {
    if (idx.isIdIndex() && numIdxKeys != numRecs) {
        string err = str::stream() << "number of _id index entries (" << numIdxKeys
                                   << ") does not match the number of documents (" << numRecs
                                   << ")";
        results->errors.push_back(err);
        results->valid = false;
        return;  // Avoid failing the next two checks, they just add redundant/confusing messages
    }
    if (!idx.isMultikey(txn) && numIdxKeys > numRecs) {
        string err = str::stream() << "index " << idx.indexName()
                                   << " is not multi-key, but has more entries (" << numIdxKeys
                                   << ") than documents (" << numRecs << ")";
        results->errors.push_back(err);
        results->valid = false;
    }
    //  If an access method name is given, the index may be a full text, geo or special
    //  index plugin with different semantics.
    if (!idx.isSparse() && !idx.isPartial() && idx.getAccessMethodName() == "" &&
        numIdxKeys < numRecs) {
        string err = str::stream() << "index " << idx.indexName()
                                   << " is not sparse or partial, but has fewer entries ("
                                   << numIdxKeys << ") than documents (" << numRecs << ")";
        results->errors.push_back(err);
        results->valid = false;
    }
}
}  // namespace

Status Collection::validate(OperationContext* txn,
                            bool full,
                            bool scanData,
                            ValidateResults* results,
                            BSONObjBuilder* output) {
    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IS));

    MyValidateAdaptor adaptor;
    Status status = _recordStore->validate(txn, full, scanData, &adaptor, results, output);
    if (!status.isOK())
        return status;

    {  // indexes
        output->append("nIndexes", _indexCatalog.numIndexesReady(txn));
        int idxn = 0;
        try {
            // Only applicable when 'full' validation is requested.
            std::unique_ptr<BSONObjBuilder> indexDetails(full ? new BSONObjBuilder() : NULL);
            BSONObjBuilder indexes;  // not using subObjStart to be exception safe

            IndexCatalog::IndexIterator i = _indexCatalog.getIndexIterator(txn, false);
            while (i.more()) {
                const IndexDescriptor* descriptor = i.next();
                log(LogComponent::kIndex) << "validating index " << descriptor->indexNamespace()
                                          << endl;
                IndexAccessMethod* iam = _indexCatalog.getIndex(descriptor);
                invariant(iam);

                std::unique_ptr<BSONObjBuilder> bob(
                    indexDetails.get() ? new BSONObjBuilder(indexDetails->subobjStart(
                                             descriptor->indexNamespace()))
                                       : NULL);

                int64_t keys;
                iam->validate(txn, full, &keys, bob.get());
                indexes.appendNumber(descriptor->indexNamespace(), static_cast<long long>(keys));

                validateIndexKeyCount(
                    txn, *descriptor, keys, _recordStore->numRecords(txn), results);

                if (bob) {
                    BSONObj obj = bob->done();
                    BSONElement valid = obj["valid"];
                    if (valid.ok() && !valid.trueValue()) {
                        results->valid = false;
                    }
                }
                idxn++;
            }

            output->append("keysPerIndex", indexes.done());
            if (indexDetails.get()) {
                output->append("indexDetails", indexDetails->done());
            }
        } catch (const DBException& exc) {
            string err = str::stream() << "exception during index validate idxn "
                                       << BSONObjBuilder::numStr(idxn) << ": " << exc.toString();
            results->errors.push_back(err);
            results->valid = false;
        }
    }

    return Status::OK();
}

Status Collection::touch(OperationContext* txn,
                         bool touchData,
                         bool touchIndexes,
                         BSONObjBuilder* output) const {
    if (touchData) {
        BSONObjBuilder b;
        Status status = _recordStore->touch(txn, &b);
        if (!status.isOK())
            return status;
        output->append("data", b.obj());
    }

    if (touchIndexes) {
        Timer t;
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, false);
        while (ii.more()) {
            const IndexDescriptor* desc = ii.next();
            const IndexAccessMethod* iam = _indexCatalog.getIndex(desc);
            Status status = iam->touch(txn);
            if (!status.isOK())
                return status;
        }

        output->append("indexes",
                       BSON("num" << _indexCatalog.numIndexesTotal(txn) << "millis" << t.millis()));
    }

    return Status::OK();
}
}
Example #28
0
void CursorCache::gotKillCursors(Message& m) {
    LastError::get(cc()).disable();
    DbMessage dbmessage(m);
    int n = dbmessage.pullInt();

    if (n > 2000) {
        (n < 30000 ? warning() : error()) << "receivedKillCursors, n=" << n << endl;
    }

    uassert(13286, "sent 0 cursors to kill", n >= 1);
    uassert(13287, "too many cursors to kill", n < 30000);
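    // Body layout of an OP_KILL_CURSORS message: a 4-byte reserved zero, a 4-byte
    // cursor count (already consumed by pullInt() above), then n little-endian
    // 8-byte cursor ids -- hence the expected size of 8 + (8 * n) bytes.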
    massert(18632,
            str::stream() << "bad kill cursors size: " << m.dataSize(),
            m.dataSize() == 8 + (8 * n));


    ConstDataCursor cursors(dbmessage.getArray(n));

    ClientBasic* client = ClientBasic::getCurrent();
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    for (int i = 0; i < n; i++) {
        long long id = cursors.readAndAdvance<LittleEndian<int64_t>>();
        LOG(_myLogLevel) << "CursorCache::gotKillCursors id: " << id << endl;

        if (!id) {
            warning() << " got cursor id of 0 to kill" << endl;
            continue;
        }

        string server;
        {
            stdx::lock_guard<stdx::mutex> lk(_mutex);

            MapSharded::iterator i = _cursors.find(id);
            if (i != _cursors.end()) {
                Status authorizationStatus =
                    authSession->checkAuthForKillCursors(NamespaceString(i->second->getNS()), id);
                audit::logKillCursorsAuthzCheck(
                    client,
                    NamespaceString(i->second->getNS()),
                    id,
                    authorizationStatus.isOK() ? ErrorCodes::OK : ErrorCodes::Unauthorized);
                if (authorizationStatus.isOK()) {
                    _cursorsMaxTimeMS.erase(i->second->getId());
                    _cursors.erase(i);
                }
                continue;
            }

            MapNormal::iterator refsIt = _refs.find(id);
            MapNormal::iterator refsNSIt = _refsNS.find(id);
            if (refsIt == _refs.end()) {
                warning() << "can't find cursor: " << id << endl;
                continue;
            }
            verify(refsNSIt != _refsNS.end());
            Status authorizationStatus =
                authSession->checkAuthForKillCursors(NamespaceString(refsNSIt->second), id);
            audit::logKillCursorsAuthzCheck(client,
                                            NamespaceString(refsNSIt->second),
                                            id,
                                            authorizationStatus.isOK() ? ErrorCodes::OK
                                                                       : ErrorCodes::Unauthorized);
            if (!authorizationStatus.isOK()) {
                continue;
            }
            server = refsIt->second;
            _refs.erase(refsIt);
            _refsNS.erase(refsNSIt);
            cursorStatsSingleTarget.decrement();
        }

        LOG(_myLogLevel) << "CursorCache::found gotKillCursors id: " << id << " server: " << server
                         << endl;

        verify(server.size());
        ScopedDbConnection conn(server);
        conn->killCursor(id);
        conn.done();
    }
}
Example #29
0
namespace mongo {

    // Enabling the maxTimeAlwaysTimeOut fail point will cause any query or command run with a valid
    // non-zero max time to fail immediately.  Any getmore operation on a cursor already created
    // with a valid non-zero max time will also fail immediately.
    //
    // This fail point cannot be used with the maxTimeNeverTimeOut fail point.
    MONGO_FP_DECLARE(maxTimeAlwaysTimeOut);

    // Enabling the maxTimeNeverTimeOut fail point will cause the server to never time out any
    // query, command, or getmore operation, regardless of whether a max time is set.
    //
    // This fail point cannot be used with the maxTimeAlwaysTimeOut fail point.
    MONGO_FP_DECLARE(maxTimeNeverTimeOut);
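
    // For example (illustrative), a fail point can be toggled at runtime with the
    // configureFailPoint command from the shell:
    //   db.adminCommand({configureFailPoint: "maxTimeAlwaysTimeOut", mode: "alwaysOn"})
    //   db.adminCommand({configureFailPoint: "maxTimeAlwaysTimeOut", mode: "off"})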

    // todo : move more here

    CurOp::CurOp( Client * client , CurOp * wrapped ) :
        _client(client),
        _wrapped(wrapped)
    {
        if ( _wrapped )
            _client->_curOp = this;
        _start = 0;
        _active = false;
        _reset();
        _op = 0;
        _opNum = _nextOpNum.fetchAndAdd(1);
        _command = NULL;
    }

    void CurOp::_reset() {
        _suppressFromCurop = false;
        _isCommand = false;
        _dbprofile = 0;
        _end = 0;
        _maxTimeMicros = 0;
        _maxTimeTracker.reset();
        _message = "";
        _progressMeter.finished();
        _killPending.store(0);
        _numYields = 0;
        _expectedLatencyMs = 0;
        _lockStat.reset();
    }

    void CurOp::reset() {
        _reset();
        _start = 0;
        _opNum = _nextOpNum.fetchAndAdd(1);
        _debug.reset();
        _query.reset();
        _active = true; // this should be last for UI clarity
    }

    void CurOp::reset( const HostAndPort& remote, int op ) {
        reset();
        if( _remote != remote ) {
            // todo : _remote is not thread safe yet is used as such!
            _remote = remote;
        }
        _op = op;
    }

    ProgressMeter& CurOp::setMessage(const char * msg,
                                     std::string name,
                                     unsigned long long progressMeterTotal,
                                     int secondsBetween) {
        if ( progressMeterTotal ) {
            if ( _progressMeter.isActive() ) {
                cout << "about to assert, old _message: " << _message << " new message:" << msg << endl;
                verify( ! _progressMeter.isActive() );
            }
            _progressMeter.reset( progressMeterTotal , secondsBetween );
            _progressMeter.setName(name);
        }
        else {
            _progressMeter.finished();
        }
        _message = msg;
        return _progressMeter;
    }

    CurOp::~CurOp() {
        if ( _wrapped ) {
            scoped_lock bl(Client::clientsMutex);
            _client->_curOp = _wrapped;
        }
        _client = 0;
    }

    void CurOp::setNS( const StringData& ns ) {
        // _ns copies the data in the null-terminated ptr it's given
        _ns = ns.toString().c_str();
    }

    void CurOp::ensureStarted() {
        if ( _start == 0 ) {
            _start = curTimeMicros64();

            // If ensureStarted() is invoked after setMaxTimeMicros(), then time limit tracking will
            // start here.  This is because time limit tracking can only commence after the
            // operation is assigned a start time.
            if (_maxTimeMicros > 0) {
                _maxTimeTracker.setTimeLimit(_start, _maxTimeMicros);
            }
        }
    }

    void CurOp::enter( Client::Context * context ) {
        ensureStarted();
        _ns = context->ns();
        _dbprofile = std::max( context->_db ? context->_db->getProfilingLevel() : 0 , _dbprofile );
    }

    void CurOp::recordGlobalTime(bool isWriteLocked, long long micros) const {
        string nsStr = _ns.toString();
        Top::global.record(nsStr, _op, isWriteLocked ? 1 : -1, micros, _isCommand);
    }

    void CurOp::reportState(BSONObjBuilder* builder) {
        builder->append("opid", _opNum);
        bool a = _active && _start;
        builder->append("active", a);

        if( a ) {
            builder->append("secs_running", elapsedSeconds() );
            builder->append("microsecs_running", static_cast<long long int>(elapsedMicros()) );
        }

        builder->append( "op" , opToString( _op ) );

        builder->append("ns", _ns.toString());

        if (_op == dbInsert) {
            _query.append(*builder, "insert");
        }
        else {
            _query.append(*builder, "query");
        }

        if ( !debug().planSummary.empty() ) {
            builder->append( "planSummary" , debug().planSummary.toString() );
        }

        if( !_remote.empty() ) {
            builder->append("client", _remote.toString());
        }

        if ( ! _message.empty() ) {
            if ( _progressMeter.isActive() ) {
                StringBuilder buf;
                buf << _message.toString() << " " << _progressMeter.toString();
                builder->append( "msg" , buf.str() );
                BSONObjBuilder sub( builder->subobjStart( "progress" ) );
                sub.appendNumber( "done" , (long long)_progressMeter.done() );
                sub.appendNumber( "total" , (long long)_progressMeter.total() );
                sub.done();
            }
            else {
                builder->append( "msg" , _message.toString() );
            }
        }

        if( killPending() )
            builder->append("killPending", true);

        builder->append( "numYields" , _numYields );
        builder->append( "lockStats" , _lockStat.report() );
    }

    BSONObj CurOp::description() {
        BSONObjBuilder bob;
        bool a = _active && _start;
        bob.append("active", a);
        bob.append( "op" , opToString( _op ) );
        bob.append("ns", _ns.toString());
        if (_op == dbInsert) {
            _query.append(bob, "insert");
        }
        else {
            _query.append(bob, "query");
        }
        if( killPending() )
            bob.append("killPending", true);
        return bob.obj();
    }

    void CurOp::kill() {
        _killPending.store(1);
    }

    void CurOp::setMaxTimeMicros(uint64_t maxTimeMicros) {
        _maxTimeMicros = maxTimeMicros;

        if (_maxTimeMicros == 0) {
            // 0 is "allow to run indefinitely".
            return;
        }

        // If the operation has a start time, then enable the tracker.
        //
        // If the operation has no start time yet, then ensureStarted() will take responsibility for
        // enabling the tracker.
        if (isStarted()) {
            _maxTimeTracker.setTimeLimit(startTime(), _maxTimeMicros);
        }
    }

    bool CurOp::maxTimeHasExpired() {
        if (MONGO_FAIL_POINT(maxTimeNeverTimeOut)) {
            return false;
        }
        if (_maxTimeMicros > 0 && MONGO_FAIL_POINT(maxTimeAlwaysTimeOut)) {
            return true;
        }
        return _maxTimeTracker.checkTimeLimit();
    }

    uint64_t CurOp::getRemainingMaxTimeMicros() const {
        return _maxTimeTracker.getRemainingMicros();
    }

    AtomicUInt32 CurOp::_nextOpNum;

    static Counter64 returnedCounter;
    static Counter64 insertedCounter;
    static Counter64 updatedCounter;
    static Counter64 deletedCounter;
    static Counter64 scannedCounter;
    static Counter64 scannedObjectCounter;

    static ServerStatusMetricField<Counter64> displayReturned( "document.returned", &returnedCounter );
    static ServerStatusMetricField<Counter64> displayUpdated( "document.updated", &updatedCounter );
    static ServerStatusMetricField<Counter64> displayInserted( "document.inserted", &insertedCounter );
    static ServerStatusMetricField<Counter64> displayDeleted( "document.deleted", &deletedCounter );
    static ServerStatusMetricField<Counter64> displayScanned( "queryExecutor.scanned", &scannedCounter );
    static ServerStatusMetricField<Counter64> displayScannedObjects( "queryExecutor.scannedObjects",
                                                                     &scannedObjectCounter );

    static Counter64 idhackCounter;
    static Counter64 scanAndOrderCounter;
    static Counter64 fastmodCounter;

    static ServerStatusMetricField<Counter64> displayIdhack( "operation.idhack", &idhackCounter );
    static ServerStatusMetricField<Counter64> displayScanAndOrder( "operation.scanAndOrder", &scanAndOrderCounter );
    static ServerStatusMetricField<Counter64> displayFastMod( "operation.fastmod", &fastmodCounter );

    void OpDebug::recordStats() {
        if ( nreturned > 0 )
            returnedCounter.increment( nreturned );
        if ( ninserted > 0 )
            insertedCounter.increment( ninserted );
        if ( nMatched > 0 )
            updatedCounter.increment( nMatched );
        if ( ndeleted > 0 )
            deletedCounter.increment( ndeleted );
        if ( nscanned > 0 )
            scannedCounter.increment( nscanned );
        if ( nscannedObjects > 0 )
            scannedObjectCounter.increment( nscannedObjects );

        if ( idhack )
            idhackCounter.increment();
        if ( scanAndOrder )
            scanAndOrderCounter.increment();
        if ( fastmod )
            fastmodCounter.increment();
    }

    CurOp::MaxTimeTracker::MaxTimeTracker() {
        reset();
    }

    void CurOp::MaxTimeTracker::reset() {
        _enabled = false;
        _targetEpochMicros = 0;
        _approxTargetServerMillis = 0;
    }

    void CurOp::MaxTimeTracker::setTimeLimit(uint64_t startEpochMicros, uint64_t durationMicros) {
        dassert(durationMicros != 0);

        _enabled = true;

        _targetEpochMicros = startEpochMicros + durationMicros;

        uint64_t now = curTimeMicros64();
        // If our accurate time source thinks time is not up yet, calculate the next target for
        // our approximate time source.
        if (_targetEpochMicros > now) {
            _approxTargetServerMillis = Listener::getElapsedTimeMillis() +
                                        static_cast<int64_t>((_targetEpochMicros - now) / 1000);
        }
        // Otherwise, set our approximate time source target such that it thinks time is already
        // up.
        else {
            _approxTargetServerMillis = Listener::getElapsedTimeMillis();
        }
    }
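
    // Worked example (illustrative): if curTimeMicros64() returns 10,000,000 and
    // _targetEpochMicros is 10,500,000, the approximate target becomes
    // Listener::getElapsedTimeMillis() + 500. checkTimeLimit() can then answer with
    // the cheap millisecond clock most of the time and only falls back to the
    // microsecond clock near the deadline.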

    bool CurOp::MaxTimeTracker::checkTimeLimit() {
        if (!_enabled) {
            return false;
        }

        // Does our approximate time source think time is not up yet?  If so, return early.
        if (_approxTargetServerMillis > Listener::getElapsedTimeMillis()) {
            return false;
        }

        uint64_t now = curTimeMicros64();
        // Does our accurate time source think time is not up yet?  If so, readjust the target for
        // our approximate time source and return early.
        if (_targetEpochMicros > now) {
            _approxTargetServerMillis = Listener::getElapsedTimeMillis() +
                                        static_cast<int64_t>((_targetEpochMicros - now) / 1000);
            return false;
        }

        // Otherwise, time is up.
        return true;
    }

    uint64_t CurOp::MaxTimeTracker::getRemainingMicros() const {
        if (!_enabled) {
            // 0 is "allow to run indefinitely".
            return 0;
        }

        // Does our accurate time source think time is up?  If so, claim there is 1 microsecond
        // left for this operation.
        uint64_t now = curTimeMicros64();
        if (_targetEpochMicros <= now) {
            return 1;
        }

        // Otherwise, calculate remaining time.
        return _targetEpochMicros - now;
    }

}
Example #30
0
ShardedClientCursor::~ShardedClientCursor() {
    verify(_cursor);
    delete _cursor;
    _cursor = 0;
    cursorStatsMultiTarget.decrement();
}