Example #1
        /** apply the writes back to the non-private MMF after they are known for certain to be in the redo log 

            (1) todo we don't need to write back everything every group commit.  we MUST write back
            that which is going to be remapped on its private view - but that might not be all 
            views.

            (2) todo should we do this using N threads?  would be quite easy
                see the Hackenberg paper, tables 5 and 6.  2 threads might be a good balance.

            locking: in read lock when called
        */
        static void WRITETODATAFILES() { 
            /* we go backwards as what is at the end is most likely in the cpu cache.  it won't be much, but we'll take it. */
            for( int i = commitJob.writes().size() - 1; i >= 0; i-- ) {
                const WriteIntent& intent = commitJob.writes()[i];
                char *dst = (char *) (intent.w_ptr);
                memcpy(dst, intent.p, intent.len);
            }

            debugValidateMapsMatch();
        }
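
The todo (2) above can be made concrete. Below is a minimal standalone sketch, not the project's code, of splitting the write-back across two threads as the note suggests; the WriteIntent struct here is a simplified stand-in for the real one, and the sketch assumes the intents target disjoint destination ranges so the two threads never race on the same bytes.

#include <cstring>
#include <functional>
#include <thread>
#include <vector>

struct WriteIntent {      // simplified stand-in for dur::WriteIntent
    void* w_ptr;          // destination in the shared (non-private) view
    const void* p;        // source bytes in the private view
    unsigned len;
};

// Copy intents [begin, end) from the private view back to the shared view.
static void applyRange(const std::vector<WriteIntent>& w, std::size_t begin, std::size_t end) {
    for (std::size_t i = begin; i < end; i++)
        std::memcpy(w[i].w_ptr, w[i].p, w[i].len);
}

// Split the vector in half and let a second thread take one half, per the
// "2 threads might be a good balance" note above.
static void writeToDataFilesTwoThreads(const std::vector<WriteIntent>& w) {
    const std::size_t mid = w.size() / 2;
    std::thread t(applyRange, std::cref(w), std::size_t(0), mid);
    applyRange(w, mid, w.size());
    t.join();
}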
Example #2
void DurableImpl::declareWriteIntents(const std::vector<std::pair<void*, unsigned>>& intents) {
    typedef std::vector<std::pair<void*, unsigned>> Intents;
    stdx::lock_guard<SimpleMutex> lk(commitJob.groupCommitMutex);
    for (Intents::const_iterator it(intents.begin()), end(intents.end()); it != end; ++it) {
        commitJob.note(it->first, it->second);
    }
}
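
A hypothetical call site for the batch interface above; recordPtr/recordLen and indexPtr/indexLen are illustrative names, not part of the real interface. Batching means the groupCommitMutex is taken once per batch rather than once per range:

void noteDirtyRanges(void* recordPtr, unsigned recordLen, void* indexPtr, unsigned indexLen) {
    std::vector<std::pair<void*, unsigned>> intents;
    intents.emplace_back(recordPtr, recordLen);
    intents.emplace_back(indexPtr, indexLen);
    getDur().declareWriteIntents(intents);  // one lock acquisition for the whole batch
}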
Example #3
/**
 * Remaps the private view from the shared view so that it does not consume too much
 * copy-on-write/swap space. Must only be called after the in-memory journal has been flushed
 * to disk and applied on top of the shared view.
 *
 * @param fraction Value between (0, 1] indicating what fraction of the memory to remap.
 *      Remapping too much or too frequently incurs copy-on-write page fault cost.
 */
static void remapPrivateView(double fraction) {
    // Remapping private views must occur after WRITETODATAFILES otherwise we wouldn't see any
    // newly written data on reads.
    invariant(!commitJob.hasWritten());

    try {
        Timer t;
        remapPrivateViewImpl(fraction);
        stats.curr()->_remapPrivateViewMicros += t.micros();

        LOG(4) << "remapPrivateView end";
        return;
    } catch (DBException& e) {
        severe() << "dbexception in remapPrivateView causing immediate shutdown: " << e.toString();
    } catch (std::ios_base::failure& e) {
        severe() << "ios_base exception in remapPrivateView causing immediate shutdown: "
                 << e.what();
    } catch (std::bad_alloc& e) {
        severe() << "bad_alloc exception in remapPrivateView causing immediate shutdown: "
                 << e.what();
    } catch (std::exception& e) {
        severe() << "exception in remapPrivateView causing immediate shutdown: " << e.what();
    } catch (...) {
        severe() << "unknown exception in remapPrivateView causing immediate shutdown: ";
    }

    invariant(false);
}
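
For a sense of where the fraction argument might come from: the legacy REMAPPRIVATEVIEW (Example #11) derives it from the time since the last remap, targeting a full pass over the private views roughly every 2 seconds. A minimal sketch of that pacing, assuming curTimeMicros64-style microsecond timestamps:

// Derive the (0, 1] remap fraction from elapsed time; after ~2 seconds the
// whole private map is due for a remap, so the fraction saturates at 1.0.
double remapFractionFor(unsigned long long nowMicros, unsigned long long lastRemapMicros) {
    double fraction = (nowMicros - lastRemapMicros) / 2000000.0;
    return fraction > 1.0 ? 1.0 : fraction;
}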
Example #4
        /** (SLOW) diagnostic to check that the private view and the non-private view are in sync.
        */
        static void debugValidateMapsMatch() {
            if( !DebugValidateMapsMatch ) 
                 return;

            Timer t;
            set<MongoFile*>& files = MongoFile::getAllFiles();
            for( set<MongoFile*>::iterator i = files.begin(); i != files.end(); i++ ) { 
                MongoFile *mf = *i;
                if( mf->isMongoMMF() ) { 
                    MongoMMF *mmf = (MongoMMF*) mf;
                    const char *p = (const char *) mmf->getView();
                    const char *w = (const char *) mmf->view_write();
                    unsigned low = 0xffffffff;
                    unsigned high = 0;
                    for( unsigned i = 0; i < mmf->length(); i++ ) {
                        if( p[i] != w[i] ) { 
                            log() << i << '\t' << (int) p[i] << '\t' << (int) w[i] << endl;
                            if( i < low ) low = i;
                            if( i > high ) high = i;
                        }
                    }
                    if( low != 0xffffffff ) { 
                        std::stringstream ss;
                        ss << "dur error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1;
                        log() << ss.str() << endl;
                        log() << "priv loc: " << (void*)(p+low) << endl;
                        vector<WriteIntent>& w = commitJob.writes();
                        (void)w; // mark as unused. Useful for inspection in debugger

                        breakpoint();
                    }
                }
            }
            log() << "debugValidateMapsMatch " << t.millis() << "ms " << endl;
        }
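
The byte-by-byte scan above costs a full pass plus a log line per mismatching byte. The later variant of this check (debugValidateFileMapsMatch in Example #20) short-circuits with a single memcmp over the whole mapping and only walks bytes on a mismatch; that fast path could be sketched as:

#include <cstring>

// Returns true when the private and shared views agree byte-for-byte; the
// slow loop is then only needed to locate and report a mismatching range.
static bool viewsMatch(const void* privView, const void* sharedView, std::size_t len) {
    return std::memcmp(privView, sharedView, len) == 0;
}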
Example #5
        /** indicate that a database is about to be dropped.  call before the actual drop. */
        void DurableImpl::droppingDb(string db) { 
            shared_ptr<DurOp> op( new DropDbOp(db) );
            commitJob.noteOp(op);

            // must commit now, before files are actually unlinked:
            groupCommit();
        }
Example #6
void DurableImpl::closingFileNotification() {
    if (commitJob.hasWritten()) {
        severe() << "journal warning files are closing outside locks with writes pending";

        // File is closing while there are unwritten changes
        invariant(false);
    }
}
Example #7
        /** locking: in read lock when called 
            @see MongoMMF::close()
        */
        static void groupCommit() {
            dbMutex.assertAtLeastReadLocked();

            if( !commitJob.hasWritten() )
                return;

            PREPLOGBUFFER();

            WRITETOJOURNAL(commitJob._ab);

            // data is now in the journal, which is sufficient for acknowledging getlasterror. 
            // (ok to crash after that)
            log() << "TEMP NOTIFYING COMMITTED" << endl;
            commitJob.notifyCommitted();

            // write the noted write intent entries to the data files.
            // this has to come after writing to the journal, obviously...
            MongoFile::markAllWritable(); // for _DEBUG. normally we don't write in a read lock
            WRITETODATAFILES();
            if (!dbMutex.isWriteLocked())
                MongoFile::unmarkAllWritable();

            commitJob.reset();

            // REMAPPRIVATEVIEW
            // 
            // remapping private views must occur after WRITETODATAFILES otherwise 
            // we wouldn't see newly written data on reads.
            // 
            DEV assert( !commitJob.hasWritten() );
            if( !dbMutex.isWriteLocked() ) { 
                // this needs to be done in a write lock, thus we do it on the next acquisition of 
                // one instead of here (there is no rush if you aren't writing anyway -- but when 
                // it is done, it must happen before any uncommitted writes occur).
                //
                dbMutex._remapPrivateViewRequested = true;
            }
            else { 
                // however, if we are already write locked, we must do it now -- up the call tree someone 
                // may do a write without a new lock acquisition.  this can happen when MongoMMF::close() calls
                // this method when a file (and its views) is about to go away.
                //
                REMAPPRIVATEVIEW();
            }
        }
Example #8
bool DurableImpl::commitIfNeeded() {
    if (MONGO_likely(commitJob.bytes() < UncommittedBytesLimit)) {
        return false;
    }

    // Just wake up the flush thread
    flushRequested.notify_one();
    return true;
}
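
A hypothetical write path pairs an intent declaration with this check, so a long-running operation cannot accumulate unbounded private-view changes; p and len here are illustrative:

void writeAndMaybeCommit(void* p, unsigned len) {
    getDur().declareWriteIntents({{p, len}});
    // Wakes the flush thread once UncommittedBytesLimit has been crossed;
    // returns true when a commit was requested.
    getDur().commitIfNeeded();
}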
Example #9
 /** called when a MongoMMF is closing -- we need to go ahead and group commit in that case before its 
     views disappear 
 */
 void closingFileNotification() {
     if( dbMutex.atLeastReadLocked() ) {
         groupCommit(); 
     }
     else {
         assert( inShutdown() );
         if( commitJob.hasWritten() ) { 
             log() << "dur warning files are closing outside locks with writes pending" << endl;
         }
     }
 }
Example #10
/** we will build an output buffer ourselves and then use O_DIRECT
    we could be in read lock for this
    caller handles locking
    @return partially populated sectheader and _ab set
*/
static void _PREPLOGBUFFER(JSectHeader& h, AlignedBuilder& bb) {
    // Add the JSectHeader

    // Invalidate the total length, we will fill it in later.
    h.setSectionLen(0xffffffff);
    h.seqNumber = getLastDataFileFlushTime();
    h.fileId = j.curFileId();

    // Ops other than basic writes (DurOp's) go first
    const std::vector<std::shared_ptr<DurOp>>& durOps = commitJob.ops();
    for (std::vector<std::shared_ptr<DurOp>>::const_iterator i = durOps.begin(); i != durOps.end();
         i++) {
        (*i)->serialize(bb);
    }

    // Write intents
    const std::vector<WriteIntent>& intents = commitJob.getIntentsSorted();
    if (!intents.empty()) {
        prepBasicWrites(bb, intents);
    }
}
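
For reference, the on-disk shape of the section assembled here can be read off the legacy PREPLOGBUFFER in Example #18: a JSectHeader (the "\nHH\n" magic plus a section length patched in at the end), then any DurOps, then one JEntry per write intent with JDbContext markers wherever the database changes, then a JSectFooter over the accumulated bytes, with the whole section padded up to the journal Alignment.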
Example #11
        /** We need to remap the private views periodically, otherwise they would become very large.
            Call within write lock.
        */
        void REMAPPRIVATEVIEW() { 
            static unsigned startAt;
            static unsigned long long lastRemap;

            dbMutex.assertWriteLocked();
            dbMutex._remapPrivateViewRequested = false;
            assert( !commitJob.hasWritten() );

            if( 0 ) { 
                log() << "TEMP remapprivateview disabled for testing - will eventually run oom in this mode if db bigger than ram" << endl;
                return;
            }

            // we want to remap all private views about every 2 seconds.  there could be ~1000 views so 
            // we do a little each pass; beyond the remap time, more significantly, there will be copy on write 
            // faults after remapping, so doing a little bit at a time will avoid big load spikes on 
            // remapping.
            unsigned long long now = curTimeMicros64();
            double fraction = (now-lastRemap)/2000000.0;
            lastRemap = now;

            set<MongoFile*>& files = MongoFile::getAllFiles();
            unsigned sz = files.size();
            if( sz == 0 ) 
                return;

            unsigned ntodo = (unsigned) (sz * fraction);
            if( ntodo < 1 ) ntodo = 1;
            if( ntodo > sz ) ntodo = sz;

            const set<MongoFile*>::iterator b = files.begin();
            const set<MongoFile*>::iterator e = files.end();
            set<MongoFile*>::iterator i = b;
            // skip to our starting position
            for( unsigned x = 0; x < startAt; x++ ) {
                i++;
                if( i == e ) i = b;
            }
            startAt = (startAt + ntodo) % sz; // mark where to start next time

            for( unsigned x = 0; x < ntodo; x++ ) {
                dassert( i != e );
                if( (*i)->isMongoMMF() ) {
                    MongoMMF *mmf = (MongoMMF*) *i;
                    assert(mmf);
                    if( mmf->willNeedRemap() ) {
                        mmf->willNeedRemap() = false;
                        mmf->remapThePrivateView();
                    }
                    i++;
                    if( i == e ) i = b;
                }
            }
        }
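
To make the pacing concrete: with ~1000 mapped files and a pass every 100 ms, fraction is 100000/2000000 = 0.05, so ntodo comes out to about 50 files per pass and the startAt cursor completes a full rotation in roughly 2 seconds, spreading the post-remap copy-on-write fault cost evenly instead of taking it all at once.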
Example #12
        static void go() {
            if( !commitJob.hasWritten() )
                return;

            {
                readlocktry lk("", 1000);
                if( lk.got() ) {
                    groupCommit();
                    return;
                }
            }

            // starvation on read locks could occur.  so if read lock acquisition is slow, try to get a 
            // write lock instead.  otherwise writes could use too much RAM.
            writelock lk;
            groupCommit();
        }
Example #13
        void DurableImpl::debugCheckLastDeclaredWrite() { 
            if( !DebugCheckLastDeclaredWrite )
                return;

            if( testIntent )
                return;

            static int n;
            ++n;

            assert(debug && cmdLine.dur);
            vector<WriteIntent>& w = commitJob.writes();
            if( w.size() == 0 ) 
                return;
            const WriteIntent &i = w[w.size()-1];
            size_t ofs;
            MongoMMF *mmf = privateViews.find(i.p, ofs);
            if( mmf == 0 ) 
                return;
            size_t past = ofs + i.len;
            if( mmf->length() < past + 8 ) 
                return; // too close to end of view
            char *priv = (char *) mmf->getView();
            char *writ = (char *) mmf->view_write();
            unsigned long long *a = (unsigned long long *) (priv+past);
            unsigned long long *b = (unsigned long long *) (writ+past);
            if( *a != *b ) { 
                for( unsigned z = 0; z < w.size() - 1; z++ ) { 
                    const WriteIntent& wi = w[z];
                    char *r1 = (char*) wi.p;
                    char *r2 = r1 + wi.len;
                    if( r1 <= (((char*)a)+8) && r2 > (char*)a ) { 
                        //log() << "it's ok " << wi.p << ' ' << wi.len << endl;
                        return;
                    }
                }
                log() << "dur data after write area " << i.p << " does not agree" << endl;
                log() << " was:  " << ((void*)b) << "  " << hexdump((char*)b, 8) << endl;
                log() << " now:  " << ((void*)a) << "  " << hexdump((char*)a, 8) << endl;
                log() << " n:    " << n << endl;
                log() << endl;
            }
        }
Example #14
 void DurableImpl::createdFile(const std::string& filename, unsigned long long len) {
     boost::shared_ptr<DurOp> op(new FileCreatedOp(filename, len));
     commitJob.noteOp(op);
 }
Example #15
 bool DurableImpl::awaitCommit() { 
     commitJob.awaitNextCommit();
     return true;
 }
Example #16
 /** Declare that a file has been created 
     Normally writes are applied only after journalling, for safety.  But here the file 
     is created first, and the journal will just replay the creation if the create didn't 
     happen because of a crash.
 */
 void DurableImpl::createdFile(string filename, unsigned long long len) { 
     shared_ptr<DurOp> op( new FileCreatedOp(filename, len) );
     commitJob.noteOp(op);
 }
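
As the comment implies, the recovery scenario this covers is: the process crashes after the FileCreatedOp reaches the journal but before the file materializes on disk. Replay at startup then re-executes the creation (with its recorded length) before applying any journaled writes, so those writes have a file to land in.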
Example #17
namespace dur {

extern Journal j;
extern CommitJob commitJob;

const RelativePath local = RelativePath::fromRelativePath("local");

static DurableMappedFile* findMMF_inlock(void* ptr, size_t& ofs) {
    DurableMappedFile* f = privateViews.find_inlock(ptr, ofs);
    if (f == 0) {
        error() << "findMMF_inlock failed " << privateViews.numberOfViews_inlock() << endl;

        // we want a stack trace and the assert below didn't print a trace once in the real world
        // - not sure why
        printStackTrace();
        stringstream ss;
        ss << "view pointer cannot be resolved " << std::hex << (size_t)ptr;
        journalingFailure(ss.str().c_str());  // asserts, which then abends
    }
    return f;
}

/** put the basic write operation into the buffer (bb) to be journaled */
static void prepBasicWrite_inlock(AlignedBuilder& bb,
                                  const WriteIntent* i,
                                  RelativePath& lastDbPath) {
    size_t ofs = 1;
    DurableMappedFile* mmf = findMMF_inlock(i->start(), /*out*/ ofs);

    if (MONGO_unlikely(!mmf->willNeedRemap())) {
        // tag this mmf as needing a remap of its private view later.
        // usually it will already be dirty/already set, so we do the if above first
        // to avoid the possibility of cpu cache line contention
        mmf->setWillNeedRemap();
    }

    // since we have already looked up the mmf, we go ahead and remember the write view location
    // so we don't have to find the DurableMappedFile again later in WRITETODATAFILES()
    //
    // this was for WRITETODATAFILES_Impl2 so commented out now
    //
    /*
    dassert( i->w_ptr == 0 );
    i->w_ptr = ((char*)mmf->view_write()) + ofs;
    */

    JEntry e;
    e.len = min(i->length(), (unsigned)(mmf->length() - ofs));  // don't write past end of file
    verify(ofs <= 0x80000000);
    e.ofs = (unsigned)ofs;
    e.setFileNo(mmf->fileSuffixNo());

    if (mmf->relativePath() == local) {
        e.setLocalDbContextBit();
    } else if (mmf->relativePath() != lastDbPath) {
        lastDbPath = mmf->relativePath();
        JDbContext c;
        bb.appendStruct(c);
        bb.appendStr(lastDbPath.toString());
    }

    bb.appendStruct(e);
    bb.appendBuf(i->start(), e.len);

    if (MONGO_unlikely(e.len != (unsigned)i->length())) {
        log() << "journal info splitting prepBasicWrite at boundary" << endl;

        // This only happens if we write to the last byte in a file and
        // the first byte in another file that is mapped adjacently. I
        // think most OSs leave at least a one page gap between
        // mappings, but better to be safe.

        WriteIntent next((char*)i->start() + e.len, i->length() - e.len);
        prepBasicWrite_inlock(bb, &next, lastDbPath);
    }
}

/** basic write ops / write intents.  note there is no particular order to these: if we have
    two writes to the same location during the group commit interval, it is likely
    (although not assured) that it is journaled here only once.
*/
static void prepBasicWrites(AlignedBuilder& bb, const std::vector<WriteIntent>& intents) {
    stdx::lock_guard<stdx::mutex> lk(privateViews._mutex());

    // Each time write intents switch to a different database we journal a JDbContext.
    // Switches will be rare as we sort by memory location first and we batch commit.
    RelativePath lastDbPath;

    invariant(!intents.empty());

    WriteIntent last;
    for (std::vector<WriteIntent>::const_iterator i = intents.begin(); i != intents.end(); i++) {
        if (i->start() < last.end()) {
            // overlaps
            last.absorb(*i);
        } else {
            // discontinuous
            if (i != intents.begin()) {
                prepBasicWrite_inlock(bb, &last, lastDbPath);
            }

            last = *i;
        }
    }

    prepBasicWrite_inlock(bb, &last, lastDbPath);
}

/** we will build an output buffer ourselves and then use O_DIRECT
    we could be in read lock for this
    caller handles locking
    @return partially populated sectheader and _ab set
*/
static void _PREPLOGBUFFER(JSectHeader& h, AlignedBuilder& bb) {
    // Add the JSectHeader

    // Invalidate the total length, we will fill it in later.
    h.setSectionLen(0xffffffff);
    h.seqNumber = getLastDataFileFlushTime();
    h.fileId = j.curFileId();

    // Ops other than basic writes (DurOp's) go first
    const std::vector<std::shared_ptr<DurOp>>& durOps = commitJob.ops();
    for (std::vector<std::shared_ptr<DurOp>>::const_iterator i = durOps.begin(); i != durOps.end();
         i++) {
        (*i)->serialize(bb);
    }

    // Write intents
    const std::vector<WriteIntent>& intents = commitJob.getIntentsSorted();
    if (!intents.empty()) {
        prepBasicWrites(bb, intents);
    }
}

void PREPLOGBUFFER(/*out*/ JSectHeader& outHeader, AlignedBuilder& outBuffer) {
    Timer t;
    j.assureLogFileOpen();  // so fileId is set
    _PREPLOGBUFFER(outHeader, outBuffer);
    stats.curr()->_prepLogBufferMicros += t.micros();
}
}
Example #18
    namespace dur {

#if defined(_DEBUG)
        const bool DebugValidateMapsMatch = false;
        const bool DebugCheckLastDeclaredWrite = false;
#else
        const bool DebugValidateMapsMatch = false;
        const bool DebugCheckLastDeclaredWrite = false;
#endif

        DurableInterface* DurableInterface::_impl = new NonDurableImpl();

#if !defined(_DURABLE)
        // called by startup/main
        void enableDurability() {}
#else
        void enableDurability() { // TODO: merge with startup() ?
            assert(typeid(*DurableInterface::_impl) == typeid(NonDurableImpl));
        // let the NonDurableImpl instance leak; it's tiny and this only happens once
            DurableInterface::_impl = new DurableImpl();
        }

        // later in this file
        static void groupCommit();

        static CommitJob commitJob;

        bool DurableImpl::awaitCommit() { 
            commitJob.awaitNextCommit();
            return true;
        }

        /** Declare that a file has been created 
            Normally writes are applied only after journalling, for safety.  But here the file 
            is created first, and the journal will just replay the creation if the create didn't 
            happen because of a crash.
        */
        void DurableImpl::createdFile(string filename, unsigned long long len) { 
            shared_ptr<DurOp> op( new FileCreatedOp(filename, len) );
            commitJob.noteOp(op);
        }

        /** indicate that a database is about to be dropped.  call before the actual drop. */
        void DurableImpl::droppingDb(string db) { 
            shared_ptr<DurOp> op( new DropDbOp(db) );
            commitJob.noteOp(op);

            // must commit now, before files are actually unlinked:
            groupCommit();
        }

        /** declare write intent.  p is already in the write view if testIntent is true. */
        void DurableImpl::declareWriteIntent(void *p, unsigned len) {
            WriteIntent w(p, len);
            commitJob.note(w);
        }

        void* DurableImpl::writingPtr(void *x, unsigned len) { 
            void *p = x;
            if( testIntent )
                p = MongoMMF::switchToPrivateView(x);
            declareWriteIntent(p, len);
            return p;
        }

        /** declare intent to write
            @param ofs offset within buf at which we will write
            @param len the length at ofs we will write
            @return new buffer pointer.  this is modified when testIntent is true.
        */
        void* DurableImpl::writingAtOffset(void *buf, unsigned ofs, unsigned len) {
            char *p = (char *) buf;
            if( testIntent )
                p = (char *) MongoMMF::switchToPrivateView(buf);
            declareWriteIntent(p+ofs, len);
            return p;
        }

        /** Used in _DEBUG builds to check that we didn't overwrite the last intent
            that was declared.  called just before writelock release.  we check a few
            bytes after the declared region to see if they changed.

            @see MongoMutex::_releasedWriteLock

            SLOW
        */
#if defined(_DEBUG)
        void DurableImpl::debugCheckLastDeclaredWrite() { 
            if( !DebugCheckLastDeclaredWrite )
                return;

            if( testIntent )
                return;

            static int n;
            ++n;

            assert(debug && cmdLine.dur);
            vector<WriteIntent>& w = commitJob.writes();
            if( w.size() == 0 ) 
                return;
            const WriteIntent &i = w[w.size()-1];
            size_t ofs;
            MongoMMF *mmf = privateViews.find(i.p, ofs);
            if( mmf == 0 ) 
                return;
            size_t past = ofs + i.len;
            if( mmf->length() < past + 8 ) 
                return; // too close to end of view
            char *priv = (char *) mmf->getView();
            char *writ = (char *) mmf->view_write();
            unsigned long long *a = (unsigned long long *) (priv+past);
            unsigned long long *b = (unsigned long long *) (writ+past);
            if( *a != *b ) { 
                for( unsigned z = 0; z < w.size() - 1; z++ ) { 
                    const WriteIntent& wi = w[z];
                    char *r1 = (char*) wi.p;
                    char *r2 = r1 + wi.len;
                    if( r1 <= (((char*)a)+8) && r2 > (char*)a ) { 
                        //log() << "it's ok " << wi.p << ' ' << wi.len << endl;
                        return;
                    }
                }
                log() << "dur data after write area " << i.p << " does not agree" << endl;
                log() << " was:  " << ((void*)b) << "  " << hexdump((char*)b, 8) << endl;
                log() << " now:  " << ((void*)a) << "  " << hexdump((char*)a, 8) << endl;
                log() << " n:    " << n << endl;
                log() << endl;
            }
        }
#endif

        /** we will build an output buffer ourselves and then use O_DIRECT
            we could be in read lock for this
            caller handles locking 
            */
        static void PREPLOGBUFFER() { 
            assert( cmdLine.dur );
            AlignedBuilder& bb = commitJob._ab;
            bb.reset();

            unsigned lenOfs;
            // JSectHeader
            {
                bb.appendStr("\nHH\n", false);
                lenOfs = bb.skip(4);
            }

            // ops other than basic writes
            {
                for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) { 
                    (*i)->serialize(bb);
                }
            }

            // write intents
            {
                scoped_lock lk(privateViews._mutex());
                string lastFilePath;
                for( vector<WriteIntent>::iterator i = commitJob.writes().begin(); i != commitJob.writes().end(); i++ ) {
                    size_t ofs;
                    MongoMMF *mmf = privateViews._find(i->p, ofs);
                    if( mmf == 0 ) {
                        string s = str::stream() << "view pointer cannot be resolved " << (size_t) i->p;
                        journalingFailure(s.c_str()); // asserts
                        return;
                    }

                    if( !mmf->willNeedRemap() ) {
                        mmf->willNeedRemap() = true; // usually it will already be dirty so don't bother writing then
                    }
                    //size_t ofs = ((char *)i->p) - ((char*)mmf->getView().p);
                    i->w_ptr = ((char*)mmf->view_write()) + ofs;
                    if( mmf->filePath() != lastFilePath ) { 
                        lastFilePath = mmf->filePath();
                        JDbContext c;
                        bb.appendStruct(c);
                        bb.appendStr(lastFilePath);
                    }
                    JEntry e;
                    e.len = i->len;
                    assert( ofs <= 0x80000000 );
                    e.ofs = (unsigned) ofs;
                    e.fileNo = mmf->fileSuffixNo();
                    bb.appendStruct(e);
                    bb.appendBuf(i->p, i->len);
                }
            }

            {
                JSectFooter f(bb.buf(), bb.len());
                bb.appendStruct(f);
            }

            {
                assert( 0xffffe000 == (~(Alignment-1)) );
                unsigned L = (bb.len() + Alignment-1) & (~(Alignment-1)); // fill to alignment
                dassert( L >= (unsigned) bb.len() );
                *((unsigned*)bb.atOfs(lenOfs)) = L;
                unsigned padding = L - bb.len();
                bb.skip(padding);
                dassert( bb.len() % Alignment == 0 );
            }

            return;
        }

        /** write the buffer we have built to the journal and fsync it.
            outside of lock as that could be slow.
        */
        static void WRITETOJOURNAL(AlignedBuilder& ab) { 
            journal(ab);
        }

        /** (SLOW) diagnostic to check that the private view and the non-private view are in sync.
        */
        static void debugValidateMapsMatch() {
            if( !DebugValidateMapsMatch ) 
                 return;

            Timer t;
            set<MongoFile*>& files = MongoFile::getAllFiles();
            for( set<MongoFile*>::iterator i = files.begin(); i != files.end(); i++ ) { 
                MongoFile *mf = *i;
                if( mf->isMongoMMF() ) { 
                    MongoMMF *mmf = (MongoMMF*) mf;
                    const char *p = (const char *) mmf->getView();
                    const char *w = (const char *) mmf->view_write();
                    unsigned low = 0xffffffff;
                    unsigned high = 0;
                    for( unsigned i = 0; i < mmf->length(); i++ ) {
                        if( p[i] != w[i] ) { 
                            log() << i << '\t' << (int) p[i] << '\t' << (int) w[i] << endl;
                            if( i < low ) low = i;
                            if( i > high ) high = i;
                        }
                    }
                    if( low != 0xffffffff ) { 
                        std::stringstream ss;
                        ss << "dur error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1;
                        log() << ss.str() << endl;
                        log() << "priv loc: " << (void*)(p+low) << endl;
                        vector<WriteIntent>& w = commitJob.writes();
                        (void)w; // mark as unused. Useful for inspection in debugger

                        breakpoint();
                    }
                }
            }
            log() << "debugValidateMapsMatch " << t.millis() << "ms " << endl;
        }

        /** apply the writes back to the non-private MMF after they are known for certain to be in the redo log 

            (1) todo we don't need to write back everything every group commit.  we MUST write back
            that which is going to be remapped on its private view - but that might not be all 
            views.

            (2) todo should we do this using N threads?  would be quite easy
                see the Hackenberg paper, tables 5 and 6.  2 threads might be a good balance.

            locking: in read lock when called
        */
        static void WRITETODATAFILES() { 
            /* we go backwards as what is at the end is most likely in the cpu cache.  it won't be much, but we'll take it. */
            for( int i = commitJob.writes().size() - 1; i >= 0; i-- ) {
                const WriteIntent& intent = commitJob.writes()[i];
                char *dst = (char *) (intent.w_ptr);
                memcpy(dst, intent.p, intent.len);
            }

            debugValidateMapsMatch();
        }

        /** We need to remap the private views periodically, otherwise they would become very large.
            Call within write lock.
        */
        void REMAPPRIVATEVIEW() { 
            static unsigned startAt;
            static unsigned long long lastRemap;

            dbMutex.assertWriteLocked();
            dbMutex._remapPrivateViewRequested = false;
            assert( !commitJob.hasWritten() );

            if( 0 ) { 
                log() << "TEMP remapprivateview disabled for testing - will eventually run oom in this mode if db bigger than ram" << endl;
                return;
            }

            // we want to remap all private views about every 2 seconds.  there could be ~1000 views so 
            // we do a little each pass; beyond the remap time, more significantly, there will be copy on write 
            // faults after remapping, so doing a little bit at a time will avoid big load spikes on 
            // remapping.
            unsigned long long now = curTimeMicros64();
            double fraction = (now-lastRemap)/2000000.0;
            lastRemap = now;

            set<MongoFile*>& files = MongoFile::getAllFiles();
            unsigned sz = files.size();
            if( sz == 0 ) 
                return;

            unsigned ntodo = (unsigned) (sz * fraction);
            if( ntodo < 1 ) ntodo = 1;
            if( ntodo > sz ) ntodo = sz;

            const set<MongoFile*>::iterator b = files.begin();
            const set<MongoFile*>::iterator e = files.end();
            set<MongoFile*>::iterator i = b;
            // skip to our starting position
            for( unsigned x = 0; x < startAt; x++ ) {
                i++;
                if( i == e ) i = b;
            }
            startAt = (startAt + ntodo) % sz; // mark where to start next time

            for( unsigned x = 0; x < ntodo; x++ ) {
                dassert( i != e );
                if( (*i)->isMongoMMF() ) {
                    MongoMMF *mmf = (MongoMMF*) *i;
                    assert(mmf);
                    if( mmf->willNeedRemap() ) {
                        mmf->willNeedRemap() = false;
                        mmf->remapThePrivateView();
                    }
                    i++;
                    if( i == e ) i = b;
                }
            }
        }

        /** locking: in read lock when called 
            @see MongoMMF::close()
        */
        static void groupCommit() {
            dbMutex.assertAtLeastReadLocked();

            if( !commitJob.hasWritten() )
                return;

            PREPLOGBUFFER();

            WRITETOJOURNAL(commitJob._ab);

            // data is now in the journal, which is sufficient for acknowledging getlasterror. 
            // (ok to crash after that)
            log() << "TEMP NOTIFYING COMMITTED" << endl;
            commitJob.notifyCommitted();

            // write the noted write intent entries to the data files.
            // this has to come after writing to the journal, obviously...
            MongoFile::markAllWritable(); // for _DEBUG. normally we don't write in a read lock
            WRITETODATAFILES();
            if (!dbMutex.isWriteLocked())
                MongoFile::unmarkAllWritable();

            commitJob.reset();

            // REMAPPRIVATEVIEW
            // 
            // remapping private views must occur after WRITETODATAFILES otherwise 
            // we wouldn't see newly written data on reads.
            // 
            DEV assert( !commitJob.hasWritten() );
            if( !dbMutex.isWriteLocked() ) { 
                // this needs to be done in a write lock, thus we do it on the next acquisition of 
                // one instead of here (there is no rush if you aren't writing anyway -- but when 
                // it is done, it must happen before any uncommitted writes occur).
                //
                dbMutex._remapPrivateViewRequested = true;
            }
            else { 
                // however, if we are already write locked, we must do it now -- up the call tree someone 
                // may do a write without a new lock acquisition.  this can happen when MongoMMF::close() calls
                // this method when a file (and its views) is about to go away.
                //
                REMAPPRIVATEVIEW();
            }
        }

        static void go() {
            if( !commitJob.hasWritten() )
                return;

            {
                readlocktry lk("", 1000);
                if( lk.got() ) {
                    groupCommit();
                    return;
                }
            }

            // starvation on read locks could occur.  so if read lock acquisition is slow, try to get a 
            // write lock instead.  otherwise writes could use too much RAM.
            writelock lk;
            groupCommit();
        }

        /** called when a MongoMMF is closing -- we need to go ahead and group commit in that case before its 
            views disappear 
        */
        void closingFileNotification() {
            if( dbMutex.atLeastReadLocked() ) {
                groupCommit(); 
            }
            else {
                assert( inShutdown() );
                if( commitJob.hasWritten() ) { 
                    log() << "dur warning files are closing outside locks with writes pending" << endl;
                }
            }
        }

        static void durThread() { 
            Client::initThread("dur");
            const int HowOftenToGroupCommitMs = 100;
            while( 1 ) { 
                try {
                    int millis = HowOftenToGroupCommitMs;
                    {
                        Timer t;
                        journalRotate(); // note we do this part outside of mongomutex
                        millis -= t.millis();
                        if( millis < 5 || millis > HowOftenToGroupCommitMs )
                            millis = 5;
                    }
                    sleepmillis(millis);
                    go();
                }
                catch(std::exception& e) { 
                    log() << "exception in durThread " << e.what() << endl;
                }
            }
        }

        void unlinkThread();
        void recover();
        void _debugCheckLastDeclaredWrite() { 
#if defined(_DEBUG)
            getDur().debugCheckLastDeclaredWrite(); 
#endif
        }

        void DurableImpl::startup() {
            if( !cmdLine.dur )
                return;
            if( testIntent )
                return;
            recover();
            journalMakeDir();
            boost::thread t(durThread);
            boost::thread t2(unlinkThread);
        }

#endif


    } // namespace dur
Example #19
 /** declare write intent.  p is already in the write view if testIntent is true. */
 void declareWriteIntent(void *p, unsigned len) {
     WriteIntent w(p, len);
     commitJob.note(w);
 }
Example #20
namespace dur {

namespace {

// Used to activate the flush thread
stdx::mutex flushMutex;
stdx::condition_variable flushRequested;

// This is waited on for getlasterror acknowledgements. It means that data has been written to
// the journal, but not necessarily applied to the shared view, so it is all right to
// acknowledge the user operation, but NOT all right to delete the journal files for example.
CommitNotifier commitNotify;

// This is waited on for complete flush. It means that data has been both written to journal
// and applied to the shared view, so it is allowed to delete the journal files. Used for
// fsync:true, close DB, shutdown acknowledgements.
CommitNotifier applyToDataFilesNotify;

// When set, the flush thread will exit
AtomicUInt32 shutdownRequested(0);

enum {
    // How many commit cycles to do before considering doing a remap
    NumCommitsBeforeRemap = 10,

    // How many outstanding journal flushes should be allowed before applying writer back
    // pressure. Size of 1 allows two journal blocks to be in the process of being written -
    // one on the journal writer's buffer and one blocked waiting to be picked up.
    NumAsyncJournalWrites = 1,
};

// Remap loop state
unsigned remapFileToStartAt;

// How frequently to reset the durability statistics
enum { DurStatsResetIntervalMillis = 3 * 1000 };

// Size sanity checks
static_assert(UncommittedBytesLimit > BSONObjMaxInternalSize * 3,
              "UncommittedBytesLimit > BSONObjMaxInternalSize * 3");
static_assert(sizeof(void*) == 4 || UncommittedBytesLimit > BSONObjMaxInternalSize * 6,
              "sizeof(void*) == 4 || UncommittedBytesLimit > BSONObjMaxInternalSize * 6");


/**
 * MMAP V1 durability server status section.
 */
class DurSSS : public ServerStatusSection {
public:
    DurSSS() : ServerStatusSection("dur") {}

    virtual bool includeByDefault() const {
        return true;
    }

    virtual BSONObj generateSection(OperationContext* txn, const BSONElement& configElement) const {
        if (!getDur().isDurable()) {
            return BSONObj();
        }

        return dur::stats.asObj();
    }

} durSSS;


/**
 * A no-op durability interface. Used for the case when journaling is not enabled.
 */
class NonDurableImpl : public DurableInterface {
public:
    NonDurableImpl() {}

    // DurableInterface virtual methods
    virtual void* writingPtr(void* x, unsigned len) {
        return x;
    }
    virtual void declareWriteIntent(void*, unsigned) {}
    virtual void declareWriteIntents(const std::vector<std::pair<void*, unsigned>>& intents) {}
    virtual void createdFile(const std::string& filename, unsigned long long len) {}
    virtual bool waitUntilDurable() {
        return false;
    }
    virtual bool commitNow(OperationContext* txn) {
        return false;
    }
    virtual bool commitIfNeeded() {
        return false;
    }
    virtual void syncDataAndTruncateJournal(OperationContext* txn) {}
    virtual bool isDurable() const {
        return false;
    }
    virtual void closingFileNotification() {}
    virtual void commitAndStopDurThread() {}
};


/**
 * The actual durability interface, when journaling is enabled.
 */
class DurableImpl : public DurableInterface {
public:
    DurableImpl() {}

    // DurableInterface virtual methods
    virtual void declareWriteIntents(const std::vector<std::pair<void*, unsigned>>& intents);
    virtual void createdFile(const std::string& filename, unsigned long long len);
    virtual bool waitUntilDurable();
    virtual bool commitNow(OperationContext* txn);
    virtual bool commitIfNeeded();
    virtual void syncDataAndTruncateJournal(OperationContext* txn);
    virtual bool isDurable() const {
        return true;
    }
    virtual void closingFileNotification();
    virtual void commitAndStopDurThread();

    void start(ClockSource* cs, int64_t serverStartMs);

private:
    stdx::thread _durThreadHandle;
};


/**
 * Diagnostic to check that the private view and the non-private view are in sync after
 * applying the journal changes. This function is very slow and only runs when paranoid checks
 * are enabled.
 *
 * Must be called under at least S flush lock to ensure that there are no concurrent writes
 * happening.
 */
void debugValidateFileMapsMatch(const DurableMappedFile* mmf) {
    const unsigned char* p = (const unsigned char*)mmf->getView();
    const unsigned char* w = (const unsigned char*)mmf->view_write();

    // Ignore pre-allocated files that are not fully created yet
    if (!p || !w) {
        return;
    }

    if (memcmp(p, w, (unsigned)mmf->length()) == 0) {
        return;
    }

    unsigned low = 0xffffffff;
    unsigned high = 0;

    log() << "DurParanoid mismatch in " << mmf->filename();

    int logged = 0;
    unsigned lastMismatch = 0xffffffff;

    for (unsigned i = 0; i < mmf->length(); i++) {
        if (p[i] != w[i]) {
            if (lastMismatch != 0xffffffff && lastMismatch + 1 != i) {
                // Separate blocks of mismatches
                log() << std::endl;
            }

            lastMismatch = i;

            if (++logged < 60) {
                if (logged == 1) {
                    // For .ns files to find offset in record
                    log() << "ofs % 628 = 0x" << hex << (i % 628) << endl;
                }

                stringstream ss;
                ss << "mismatch ofs:" << hex << i << "\tfilemap:" << setw(2) << (unsigned)w[i]
                   << "\tprivmap:" << setw(2) << (unsigned)p[i];

                if (p[i] > 32 && p[i] <= 126) {
                    ss << '\t' << p[i];
                }

                log() << ss.str() << endl;
            }

            if (logged == 60) {
                log() << "..." << endl;
            }

            if (i < low)
                low = i;
            if (i > high)
                high = i;
        }
    }

    if (low != 0xffffffff) {
        std::stringstream ss;
        ss << "journal error warning views mismatch " << mmf->filename() << ' ' << hex << low
           << ".." << high << " len:" << high - low + 1;

        log() << ss.str() << endl;
        log() << "priv loc: " << (void*)(p + low) << ' ' << endl;

        severe() << "Written data does not match in-memory view. Missing WriteIntent?";
        invariant(false);
    }
}


/**
 * Main code of the remap private view function.
 */
void remapPrivateViewImpl(double fraction) {
    LOG(4) << "journal REMAPPRIVATEVIEW" << endl;

// There is no way that the set of files can change while we are in this method, because
// we hold the flush lock in X mode. For files to go away, a database needs to be dropped,
// which means acquiring the flush lock in at least IX mode.
//
// However, the record fetcher logic unfortunately operates without any locks and on
// Windows and Solaris remap is not atomic and there is a window where the record fetcher
// might get an access violation. That's why we acquire the mongo files mutex here in X
// mode and the record fetcher takes it in S-mode (see MmapV1RecordFetcher for more
// detail).
//
// See SERVER-5723 for performance improvement.
// See SERVER-5680 to see why this code is necessary on Windows.
// See SERVER-8795 to see why this code is necessary on Solaris.
#if defined(_WIN32) || defined(__sun)
    LockMongoFilesExclusive lk;
#else
    LockMongoFilesShared lk;
#endif

    std::set<MongoFile*>& files = MongoFile::getAllFiles();

    const unsigned sz = files.size();
    if (sz == 0) {
        return;
    }

    unsigned ntodo = (unsigned)(sz * fraction);
    if (ntodo < 1)
        ntodo = 1;
    if (ntodo > sz)
        ntodo = sz;

    const set<MongoFile*>::iterator b = files.begin();
    const set<MongoFile*>::iterator e = files.end();
    set<MongoFile*>::iterator i = b;

    // Skip to our starting position as remembered from the last remap cycle
    for (unsigned x = 0; x < remapFileToStartAt; x++) {
        i++;
        if (i == e)
            i = b;
    }

    // Mark where to start on the next cycle
    const unsigned startedAt = remapFileToStartAt;
    remapFileToStartAt = (remapFileToStartAt + ntodo) % sz;

    Timer t;

    for (unsigned x = 0; x < ntodo; x++) {
        if ((*i)->isDurableMappedFile()) {
            DurableMappedFile* const mmf = (DurableMappedFile*)*i;

            // Sanity check that the contents of the shared and the private view match so we
            // don't end up overwriting data.
            if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalParanoid) {
                debugValidateFileMapsMatch(mmf);
            }

            if (mmf->willNeedRemap()) {
                mmf->remapThePrivateView();
            }

            i++;

            if (i == e)
                i = b;
        }
    }

    LOG(3) << "journal REMAPPRIVATEVIEW done startedAt: " << startedAt << " n:" << ntodo << ' '
           << t.millis() << "ms";
}


// One instance of each durability interface
DurableImpl durableImpl;
NonDurableImpl nonDurableImpl;

// Notified when we commit to the journal.
static JournalListener* journalListener = &NoOpJournalListener::instance;
// Protects journalListener.
static stdx::mutex journalListenerMutex;

}  // namespace


// Declared in dur_preplogbuffer.cpp
void PREPLOGBUFFER(JSectHeader& outHeader,
                   AlignedBuilder& outBuffer,
                   ClockSource* cs,
                   int64_t serverStartMs);

// Declared in dur_journal.cpp
boost::filesystem::path getJournalDir();
void preallocateFiles();

// Forward declaration
static void durThread(ClockSource* cs, int64_t serverStartMs);

// Durability activity statistics
Stats stats;

// Reference to the write intents tracking object
CommitJob commitJob;

// Reference to the active durability interface
DurableInterface* DurableInterface::_impl(&nonDurableImpl);


//
// Stats
//

Stats::Stats() : _currIdx(0) {}

void Stats::reset() {
    // Seal the current metrics
    _stats[_currIdx]._durationMillis = _stats[_currIdx].getCurrentDurationMillis();

    // Use a new metric
    const unsigned newCurrIdx = (_currIdx + 1) % (sizeof(_stats) / sizeof(_stats[0]));
    _stats[newCurrIdx].reset();

    _currIdx = newCurrIdx;
}

BSONObj Stats::asObj() const {
    // Use the previous statistic
    const S& stats = _stats[(_currIdx - 1) % (sizeof(_stats) / sizeof(_stats[0]))];

    BSONObjBuilder builder;
    stats._asObj(&builder);

    return builder.obj();
}

void Stats::S::reset() {
    memset(this, 0, sizeof(*this));
    _startTimeMicros = curTimeMicros64();
}

std::string Stats::S::_CSVHeader() const {
    return "cmts\t jrnMB\t wrDFMB\t cIWLk\t early\t prpLgB\t wrToJ\t wrToDF\t rmpPrVw";
}

std::string Stats::S::_asCSV() const {
    stringstream ss;
    ss << setprecision(2) << _commits << '\t' << _journaledBytes / 1000000.0 << '\t'
       << _writeToDataFilesBytes / 1000000.0 << '\t' << _commitsInWriteLock << '\t' << 0 << '\t'
       << (unsigned)(_prepLogBufferMicros / 1000) << '\t'
       << (unsigned)(_writeToJournalMicros / 1000) << '\t'
       << (unsigned)(_writeToDataFilesMicros / 1000) << '\t'
       << (unsigned)(_remapPrivateViewMicros / 1000) << '\t' << (unsigned)(_commitsMicros / 1000)
       << '\t' << (unsigned)(_commitsInWriteLockMicros / 1000) << '\t';

    return ss.str();
}

void Stats::S::_asObj(BSONObjBuilder* builder) const {
    BSONObjBuilder& b = *builder;
    b << "commits" << _commits << "journaledMB" << _journaledBytes / 1000000.0
      << "writeToDataFilesMB" << _writeToDataFilesBytes / 1000000.0 << "compression"
      << _journaledBytes / (_uncompressedBytes + 1.0) << "commitsInWriteLock" << _commitsInWriteLock
      << "earlyCommits" << 0 << "timeMs"
      << BSON("dt" << _durationMillis << "prepLogBuffer" << (unsigned)(_prepLogBufferMicros / 1000)
                   << "writeToJournal"
                   << (unsigned)(_writeToJournalMicros / 1000)
                   << "writeToDataFiles"
                   << (unsigned)(_writeToDataFilesMicros / 1000)
                   << "remapPrivateView"
                   << (unsigned)(_remapPrivateViewMicros / 1000)
                   << "commits"
                   << (unsigned)(_commitsMicros / 1000)
                   << "commitsInWriteLock"
                   << (unsigned)(_commitsInWriteLockMicros / 1000));

    if (storageGlobalParams.journalCommitIntervalMs != 0) {
        b << "journalCommitIntervalMs" << storageGlobalParams.journalCommitIntervalMs.load();
    }
}


//
// DurableInterface
//

DurableInterface::DurableInterface() {}

DurableInterface::~DurableInterface() {}


//
// DurableImpl
//

bool DurableImpl::commitNow(OperationContext* txn) {
    CommitNotifier::When when = commitNotify.now();

    AutoYieldFlushLockForMMAPV1Commit flushLockYield(txn->lockState());

    // There is always just one waiting anyways
    flushRequested.notify_one();

    // commitNotify.waitFor ensures that whatever was scheduled for journaling before this
    // call has been persisted to the journal file. This does not mean that this data has been
    // applied to the shared view yet though, that's why we wait for applyToDataFilesNotify.
    applyToDataFilesNotify.waitFor(when);

    return true;
}

bool DurableImpl::waitUntilDurable() {
    commitNotify.awaitBeyondNow();
    return true;
}

void DurableImpl::createdFile(const std::string& filename, unsigned long long len) {
    std::shared_ptr<DurOp> op(new FileCreatedOp(filename, len));
    commitJob.noteOp(op);
}


void DurableImpl::declareWriteIntents(const std::vector<std::pair<void*, unsigned>>& intents) {
    typedef std::vector<std::pair<void*, unsigned>> Intents;
    stdx::lock_guard<SimpleMutex> lk(commitJob.groupCommitMutex);
    for (Intents::const_iterator it(intents.begin()), end(intents.end()); it != end; ++it) {
        commitJob.note(it->first, it->second);
    }
}

bool DurableImpl::commitIfNeeded() {
    if (MONGO_likely(commitJob.bytes() < UncommittedBytesLimit)) {
        return false;
    }

    // Just wake up the flush thread
    flushRequested.notify_one();
    return true;
}

void DurableImpl::syncDataAndTruncateJournal(OperationContext* txn) {
    invariant(txn->lockState()->isW());

    // Once this returns, all the outstanding journal has been applied to the data files and
    // so it's safe to do the flushAll/journalCleanup below.
    commitNow(txn);

    // Flush the shared view to disk.
    MongoFile::flushAll(true);

    // Once the shared view has been flushed, we do not need the journal files anymore.
    journalCleanup(true);

    // Double check post-conditions
    invariant(!haveJournalFiles());
}

void DurableImpl::closingFileNotification() {
    if (commitJob.hasWritten()) {
        severe() << "journal warning files are closing outside locks with writes pending";

        // File is closing while there are unwritten changes
        invariant(false);
    }
}

void DurableImpl::commitAndStopDurThread() {
    CommitNotifier::When when = commitNotify.now();

    // There is always just one waiting anyways
    flushRequested.notify_one();

    // commitNotify.waitFor ensures that whatever was scheduled for journaling before this
    // call has been persisted to the journal file. This does not mean that this data has been
    // applied to the shared view yet though, that's why we wait for applyToDataFilesNotify.
    applyToDataFilesNotify.waitFor(when);

    // Flush the shared view to disk.
    MongoFile::flushAll(true);

    // Once the shared view has been flushed, we do not need the journal files anymore.
    journalCleanup(true);

    // Double check post-conditions
    invariant(!haveJournalFiles());

    shutdownRequested.store(1);

    // Wait for the durability thread to terminate
    log() << "Terminating durability thread ...";
    _durThreadHandle.join();
}

void DurableImpl::start(ClockSource* cs, int64_t serverStartMs) {
    // Start the durability thread
    stdx::thread t(durThread, cs, serverStartMs);
    _durThreadHandle.swap(t);
}


/**
 * Remaps the private view from the shared view so that it does not consume too much
 * copy-on-write/swap space. Must only be called after the in-memory journal has been flushed
 * to disk and applied on top of the shared view.
 *
 * @param fraction Value between (0, 1] indicating what fraction of the memory to remap.
 *      Remapping too much or too frequently incurs copy-on-write page fault cost.
 */
static void remapPrivateView(double fraction) {
    // Remapping private views must occur after WRITETODATAFILES otherwise we wouldn't see any
    // newly written data on reads.
    invariant(!commitJob.hasWritten());

    try {
        Timer t;
        remapPrivateViewImpl(fraction);
        stats.curr()->_remapPrivateViewMicros += t.micros();

        LOG(4) << "remapPrivateView end";
        return;
    } catch (DBException& e) {
        severe() << "dbexception in remapPrivateView causing immediate shutdown: " << e.toString();
    } catch (std::ios_base::failure& e) {
        severe() << "ios_base exception in remapPrivateView causing immediate shutdown: "
                 << e.what();
    } catch (std::bad_alloc& e) {
        severe() << "bad_alloc exception in remapPrivateView causing immediate shutdown: "
                 << e.what();
    } catch (std::exception& e) {
        severe() << "exception in remapPrivateView causing immediate shutdown: " << e.what();
    } catch (...) {
        severe() << "unknown exception in remapPrivateView causing immediate shutdown: ";
    }

    invariant(false);
}


/**
 * The main durability thread loop. There is a single instance of this function running.
 */
static void durThread(ClockSource* cs, int64_t serverStartMs) {
    Client::initThread("durability");

    log() << "Durability thread started";

    bool samePartition = true;
    try {
        const std::string dbpathDir = boost::filesystem::path(storageGlobalParams.dbpath).string();
        samePartition = onSamePartition(getJournalDir().string(), dbpathDir);
    } catch (...) {
    }

    // Spawn the journal writer thread
    JournalWriter journalWriter(&commitNotify, &applyToDataFilesNotify, NumAsyncJournalWrites);
    journalWriter.start();

    // Used as an estimate of how much / how fast to remap
    uint64_t commitCounter(0);
    uint64_t estimatedPrivateMapSize(0);
    uint64_t remapLastTimestamp(0);

    while (shutdownRequested.loadRelaxed() == 0) {
        unsigned ms = storageGlobalParams.journalCommitIntervalMs;
        if (ms == 0) {
            ms = samePartition ? 100 : 30;
        }

        // +1 so it never goes down to zero
        const int64_t oneThird = (ms / 3) + 1;

        // Reset the stats based on the reset interval
        if (stats.curr()->getCurrentDurationMillis() > DurStatsResetIntervalMillis) {
            stats.reset();
        }

        try {
            stdx::unique_lock<stdx::mutex> lock(flushMutex);

            for (unsigned i = 0; i <= 2; i++) {
                if (stdx::cv_status::no_timeout ==
                    flushRequested.wait_for(lock, Milliseconds(oneThird).toSystemDuration())) {
                    // Someone forced a flush
                    break;
                }

                if (commitNotify.nWaiting()) {
                    // One or more getLastError j:true is pending
                    break;
                }

                if (commitJob.bytes() > UncommittedBytesLimit / 2) {
                    // The number of written bytes is growing
                    break;
                }
            }

            // The commit logic itself
            LOG(4) << "groupCommit begin";

            Timer t;

            const ServiceContext::UniqueOperationContext txnPtr = cc().makeOperationContext();
            OperationContext& txn = *txnPtr;
            AutoAcquireFlushLockForMMAPV1Commit autoFlushLock(txn.lockState());

            // We need to snapshot the commitNumber after the flush lock has been obtained,
            // because at this point we know that we have a stable snapshot of the data.
            const CommitNotifier::When commitNumber(commitNotify.now());

            LOG(4) << "Processing commit number " << commitNumber;

            if (!commitJob.hasWritten()) {
                // We do not need the journal lock anymore. Free it here, for the really
                // unlikely possibility that the writeBuffer command below blocks.
                autoFlushLock.release();

                // A getlasterror request could have come after the data was already committed.
                // No need to call committingReset though, because we have not done any
                // writes (hasWritten == false).
                JournalWriter::Buffer* const buffer = journalWriter.newBuffer();
                buffer->setNoop();
                buffer->journalListenerToken = getJournalListener()->getToken();

                journalWriter.writeBuffer(buffer, commitNumber);
            } else {
                // This copies all the in-memory changes into the journal writer's buffer.
                JournalWriter::Buffer* const buffer = journalWriter.newBuffer();
                PREPLOGBUFFER(buffer->getHeader(), buffer->getBuilder(), cs, serverStartMs);

                estimatedPrivateMapSize += commitJob.bytes();
                commitCounter++;

                // Now that the write intents have been copied to the buffer, the commit job is
                // free to be reused. We need to reset the commit job's contents while under
                // the S flush lock, because otherwise someone might have done a write and this
                // would wipe out their changes without ever being committed.
                commitJob.committingReset();

                double systemMemoryPressurePercentage =
                    ProcessInfo::getSystemMemoryPressurePercentage();

                // Now that the in-memory modifications have been collected, we can potentially
                // release the flush lock if remap is not necessary.
                // When we remap due to memory pressure, we look at two criteria:
                // 1. Whether the amount of 4k pages touched exceeds 512 MB, which is
                //    a reasonable estimate of memory pressure on Linux.
                // 2. Whether the amount of free memory on the machine is running low,
                //    since #1 underestimates the memory pressure on Windows, where
                //    commits happen in 64MB chunks.
                const bool shouldRemap = (estimatedPrivateMapSize >= UncommittedBytesLimit) ||
                    (systemMemoryPressurePercentage > 0.0) ||
                    (commitCounter % NumCommitsBeforeRemap == 0) ||
                    (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalAlwaysRemap);

                double remapFraction = 0.0;

                if (shouldRemap) {
                    // We want to remap all private views about every 2 seconds. There could be
                    // ~1000 views so we do a little each pass. There will be copy on write
                    // faults after remapping, so doing a little bit at a time will avoid big
                    // load spikes when the pages are touched.
                    //
                    // TODO: Instead of the time-based logic above, consider using ProcessInfo
                    //       and watching for getResidentSize to drop, which is more precise.
                    remapFraction = (curTimeMicros64() - remapLastTimestamp) / 2000000.0;

                    if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalAlwaysRemap) {
                        remapFraction = 1;
                    } else {
                        // We don't want to get close to the UncommittedBytesLimit
                        const double remapMemFraction =
                            estimatedPrivateMapSize / ((double)UncommittedBytesLimit);

                        remapFraction = std::max(remapMemFraction, remapFraction);

                        remapFraction = std::max(systemMemoryPressurePercentage, remapFraction);
                    }
                } else {
                    LOG(4) << "Early release flush lock";

                    // We will not be doing a remap so drop the flush lock. That way we will be
                    // doing the journal I/O outside of lock, so other threads can proceed.
                    invariant(!shouldRemap);
                    autoFlushLock.release();
                }

                buffer->journalListenerToken = getJournalListener()->getToken();
                // Request async I/O to the journal. This may block.
                journalWriter.writeBuffer(buffer, commitNumber);

                // Data has now been written to the shared view. If remap was requested, we
                // would still be holding the S flush lock here, so just upgrade it and
                // perform the remap.
                if (shouldRemap) {
                    // Need to wait for the previously scheduled journal writes to complete
                    // before any remap is attempted.
                    journalWriter.flush();
                    journalWriter.assertIdle();

                    // Upgrading the flush lock to exclusive mode stops all activity on the
                    // system, because we will be remapping memory and we don't want readers
                    // to be accessing it. Technically this step could be avoided on systems
                    // that support atomic remap.
                    autoFlushLock.upgradeFlushLockToExclusive();
                    remapPrivateView(remapFraction);

                    autoFlushLock.release();

                    // Reset the private map estimate outside of the lock
                    estimatedPrivateMapSize = 0;
                    remapLastTimestamp = curTimeMicros64();

                    stats.curr()->_commitsInWriteLock++;
                    stats.curr()->_commitsInWriteLockMicros += t.micros();
                }
            }

            stats.curr()->_commits++;
            stats.curr()->_commitsMicros += t.micros();

            LOG(4) << "groupCommit end";
        } catch (DBException& e) {
            severe() << "dbexception in durThread causing immediate shutdown: " << e.toString();
            invariant(false);
        } catch (std::ios_base::failure& e) {
            severe() << "ios_base exception in durThread causing immediate shutdown: " << e.what();
            invariant(false);
        } catch (std::bad_alloc& e) {
            severe() << "bad_alloc exception in durThread causing immediate shutdown: " << e.what();
            invariant(false);
        } catch (std::exception& e) {
            severe() << "exception in durThread causing immediate shutdown: " << e.what();
            invariant(false);
        } catch (...) {
            severe() << "unhandled exception in durThread causing immediate shutdown";
            invariant(false);
        }
    }

    // Stops the journal thread and ensures everything was written
    invariant(!commitJob.hasWritten());

    journalWriter.flush();
    journalWriter.shutdown();

    log() << "Durability thread stopped";
}
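
/* Illustrative sketch only, not part of the original source: the
   remap-fraction policy used inside durThread, factored into a pure helper.
   It returns the maximum of the time-based target (full coverage roughly
   every 2 seconds), the fraction of UncommittedBytesLimit consumed by the
   private map estimate, and the reported system memory pressure. */
static double computeRemapFraction_sketch(uint64_t nowMicros,
                                          uint64_t remapLastTimestamp,
                                          uint64_t estimatedPrivateMapSize,
                                          double systemMemoryPressurePct,
                                          bool journalAlwaysRemap) {
    if (journalAlwaysRemap)
        return 1.0;

    // Aim to cover all private views about every 2 seconds.
    double fraction = (nowMicros - remapLastTimestamp) / 2000000.0;

    // Don't let the private map estimate get close to UncommittedBytesLimit.
    fraction = std::max(
        fraction,
        estimatedPrivateMapSize / static_cast<double>(UncommittedBytesLimit));

    // React to OS-reported memory pressure as well.
    return std::max(fraction, systemMemoryPressurePct);
}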


/**
 * Invoked at server startup. Recovers the database by replaying journal files and then
 * starts the durability thread.
 */
void startup(ClockSource* cs, int64_t serverStartMs) {
    if (!storageGlobalParams.dur) {
        return;
    }

    journalMakeDir(cs, serverStartMs);

    try {
        replayJournalFilesAtStartup();
    } catch (DBException& e) {
        severe() << "dbexception during recovery: " << e.toString();
        throw;
    } catch (std::exception& e) {
        severe() << "std::exception during recovery: " << e.what();
        throw;
    } catch (...) {
        severe() << "exception during recovery";
        throw;
    }

    preallocateFiles();

    durableImpl.start(cs, serverStartMs);
    DurableInterface::_impl = &durableImpl;
}

void setJournalListener(JournalListener* jl) {
    stdx::unique_lock<stdx::mutex> lk(journalListenerMutex);
    journalListener = jl;
}

JournalListener* getJournalListener() {
    stdx::unique_lock<stdx::mutex> lk(journalListenerMutex);
    return journalListener;
}

}  // namespace dur
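
On each group commit, durThread consults the registered listener: it calls getJournalListener()->getToken() while the flush lock still guarantees a stable snapshot, and attaches the token to the journal buffer. The sketch below is illustrative only; it assumes JournalListener exposes virtual getToken() and onDurable(Token) hooks and some Token snapshot type, none of which are shown in this excerpt.

class CountingJournalListener : public JournalListener {
public:
    // Captured under the flush lock, so the token names a stable snapshot.
    virtual Token getToken() {
        return Token();  // a real listener would return e.g. an opTime
    }

    // Assumed callback: invoked once the journal write carrying this token
    // is durable on disk.
    virtual void onDurable(const Token& token) {
        _durableCommits++;  // illustration only; not synchronized
    }

private:
    long long _durableCommits = 0;
};

// Registered once during startup:
//     dur::setJournalListener(new CountingJournalListener());
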
Example No. 22
0
 /** Declare a write intent: note the region [p, p+len) with the commit job so it is captured by the next group commit. */
 void DurableImpl::declareWriteIntent(void *p, unsigned len) {
     WriteIntent w(p, len);
     commitJob.note(w);
 }
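
The intended call pattern is to note the write first and then mutate the private view; the next group commit journals the region, and only afterwards is it copied to the shared view. A minimal sketch, assuming the usual getDur() accessor for the DurableInterface singleton:

void setRecordField(void* fieldPtr, int newValue) {
    // Register [fieldPtr, fieldPtr + sizeof(int)) with the commit job first...
    getDur().declareWriteIntent(fieldPtr, sizeof(int));

    // ...then write through the private view. The change reaches the data
    // files only after it is safely in the redo log.
    *static_cast<int*>(fieldPtr) = newValue;
}
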
Example No. 23
0
        /** We will build an output buffer ourselves and then use O_DIRECT.
            We could be in a read lock for this; the caller handles locking.
            */
        static void PREPLOGBUFFER() { 
            assert( cmdLine.dur );
            AlignedBuilder& bb = commitJob._ab;
            bb.reset();

            unsigned lenOfs;
            // JSectHeader
            {
                bb.appendStr("\nHH\n", false);
                lenOfs = bb.skip(4);
            }

            // ops other than basic writes
            {
                for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) { 
                    (*i)->serialize(bb);
                }
            }

            // write intents
            {
                scoped_lock lk(privateViews._mutex());
                string lastFilePath;
                for( vector<WriteIntent>::iterator i = commitJob.writes().begin(); i != commitJob.writes().end(); i++ ) {
                    size_t ofs;
                    MongoMMF *mmf = privateViews._find(i->p, ofs);
                    if( mmf == 0 ) {
                        string s = str::stream() << "view pointer cannot be resolved " << (size_t) i->p;
                        journalingFailure(s.c_str()); // asserts
                        return;
                    }

                    if( !mmf->willNeedRemap() ) {
                        mmf->willNeedRemap() = true; // usually the flag is already set; checking first avoids a redundant write
                    }
                    i->w_ptr = ((char*)mmf->view_write()) + ofs;
                    if( mmf->filePath() != lastFilePath ) { 
                        lastFilePath = mmf->filePath();
                        JDbContext c;
                        bb.appendStruct(c);
                        bb.appendStr(lastFilePath);
                    }
                    JEntry e;
                    e.len = i->len;
                    assert( ofs <= 0x80000000 );
                    e.ofs = (unsigned) ofs;
                    e.fileNo = mmf->fileSuffixNo();
                    bb.appendStruct(e);
                    bb.appendBuf(i->p, i->len);
                }
            }

            {
                JSectFooter f(bb.buf(), bb.len());
                bb.appendStruct(f);
            }

            {
                assert( 0xffffe000 == (~(Alignment-1)) );
                unsigned L = (bb.len() + Alignment-1) & (~(Alignment-1)); // fill to alignment
                dassert( L >= (unsigned) bb.len() );
                *((unsigned*)bb.atOfs(lenOfs)) = L;
                unsigned padding = L - bb.len();
                bb.skip(padding);
                dassert( bb.len() % Alignment == 0 );
            }

            return;
        }
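
The final block pads the section up to Alignment (8192 bytes, given the assert that ~(Alignment-1) equals 0xffffe000) using the standard power-of-two rounding mask. A self-contained sketch with worked values:

#include <cassert>

static const unsigned Alignment = 8192;  // 0x2000; ~(Alignment-1) == 0xffffe000

// Round len up to the next multiple of Alignment (a power of two).
unsigned alignSectionLen(unsigned len) {
    return (len + Alignment - 1) & ~(Alignment - 1);
}

int main() {
    assert(alignSectionLen(5000) == 8192);   // padded up by 3192 bytes
    assert(alignSectionLen(8192) == 8192);   // already aligned; unchanged
    assert(alignSectionLen(8193) == 16384);  // spills into the next block
    return 0;
}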