Example #1
/*********************************************************************
 *
 *  chkpt_m::take()
 *
 *  Take a checkpoint. A Checkpoint consists of:
 *    1. Checkpoint Begin Log    (chkpt_begin)
 *    2. Checkpoint Device Table Log(s) (chkpt_dev_tab)
 *        -- all mounted devices
 *    3. Checkpoint Buffer Table Log(s)  (chkpt_bf_tab)
 *        -- dirty page entries in bf and their recovery lsn
 *    4. Checkpoint Transaction Table Log(s) (chkpt_xct_tab)
 *        -- active transactions and their first lsn
 *    5. Checkpoint Prepared Transactions (optional)
 *        -- prepared transactions and their locks
 *         (using the same log records that prepare does)
 *    6. Checkpoint End Log (chkpt_end)
 *
 *********************************************************************/
void chkpt_m::take()
{
    FUNC(chkpt_m::take);
    if (! log)   {
        /*
         *  recovery facilities disabled ... do nothing
         */
        return;
    }
    INC_TSTAT(log_chkpt_cnt);
    
    /*
     *  Allocate a buffer for storing log records
     */
    w_auto_delete_t<logrec_t> logrec(new logrec_t);

    /*
     * checkpoints are fuzzy
     * but must be serialized wrt each other.
     *
     * Acquire the mutex to serialize prepares and
     * checkpoints. 
     *
     * NB: EVERYTHING BETWEEN HERE AND RELEASING THE MUTEX
     * MUST BE W_COERCE (not W_DO).
     */
    chkpt_serial_m::chkpt_acquire();
 retry:
    
    /*
     * FRJ: We must somehow guarantee that the log always has space to
     * accept checkpoints.  We impose two constraints to this end:
     *
     * 1. We cap the total space checkpoints are allowed to consume in
     *    any one log partition. This is a good idea anyway because
     *    checkpoint size is linear in the number of dirty buffer pool
     *    pages -- ~2MB per GB of dirty data -- and yet the utility of
     *    checkpoints drops off quickly as the dirty page count
     *    increases -- log analysis and recovery must start at the lsn
     *    of the oldest dirty page regardless of how recent the
     *    checkpoint was.
     *
     * 2. No checkpoint may depend on more than /max_openlog-1/ log
     *    partitions. In other words, every checkpoint completion must
     *    leave at least one log partition available.
     *
     * We use these two constraints, together with log reservations,
     * to guarantee the ability to reclaim log space if the log
     * becomes full. The log maintains, on our behalf, a reservation
     * big enough for two maximally-sized checkpoints (ie the dirty
     * page table lists every page in the buffer pool). Every time we
     * reclaim a log segment this reservation is topped up atomically.
     */
#define LOG_INSERT(constructor_call, rlsn)                          \
    do {                                                            \
        new (logrec) constructor_call;                              \
        W_COERCE( log->insert(*logrec, rlsn) );                     \
        if(!log->consume_chkpt_reservation(logrec->length())) {     \
            W_FATAL(eOUTOFLOGSPACE);                                \
        }                                                           \
    } while(0)
    
    /* if current partition is max_openlog then the oldest lsn we can
       tolerate is 2.0. We must flush all pages dirtied before that
       time and must wait until all transactions with an earlier
       start_lsn have ended (at worst they will abort if the log fills
       up before they can commit).

       TODO: use smlevel_0::log_warn_callback to notify the VAS in
       case old transactions aren't currently active for some reason.

       Also, remember the current checkpoint count so we can see
       whether we get raced...
     */
// #warning "TODO use log_warn_callback in case old transactions aren't logging right now"
    long curr_pnum = log->curr_lsn().file();
    long too_old_pnum = std::max(0l, curr_pnum - max_openlog+1);
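    // Worked example (numbers assumed for illustration): with
    // max_openlog = 8 and curr_lsn in partition 8, too_old_pnum =
    // max(0, 8 - 8 + 1) = 1, so the oldest tolerable lsn below is
    // log_m::first_lsn(2) -- the "2.0" from the comment above.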
    if(!log->verify_chkpt_reservation()) {
        /* Yikes! The log can't guarantee that we'll be able to
           complete any checkpoint after this one, so we must reclaim
           space even if the log doesn't seem to be full.
        */
        // assign, don't redeclare: a fresh 'long' here would shadow the
        // too_old_pnum above and the widened value would be lost
        too_old_pnum = log->global_min_lsn().file();
        if(too_old_pnum == curr_pnum) {
            // how/why did they reserve so much log space???
            W_FATAL(eOUTOFLOGSPACE);
        }
    }

    /* We cannot proceed if any transaction has a too-low start_lsn;
       wait for them to complete before continuing.

       WARNING: we have to wake any old transactions which are waiting
       on locks, or we risk deadlocks where the lock holder waits on a
       full log while the old transaction waits on the lock.
     */
    lsn_t oldest_valid_lsn = log_m::first_lsn(too_old_pnum+1);
    old_xct_tracker tracker;
    {
        xct_i it(true); // do acquire the xlist_mutex...
        while(xct_t* xd=it.next()) {
            lsn_t const &flsn = xd->first_lsn();
            if(flsn.valid() && flsn < oldest_valid_lsn) {
                // poison the transaction and add it to the list...
                xd->force_nonblocking();
                tracker.track(xd);
            }
        }
    }

    /* release the chkpt_serial to do expensive stuff

       We'll record the current checkpoint count so we can detect
       whether we get raced during the gap.
     */
    long chkpt_stamp = _chkpt_count;
    chkpt_serial_m::chkpt_release();

    
    // clear out all too-old pages
    W_COERCE(bf->force_until_lsn(oldest_valid_lsn, false));

    /* hopefully the page cleaning took long enough that the old
       transactions all ended...
     */
    if(!tracker.finished())
        tracker.wait_for_all();

    // raced?
    chkpt_serial_m::chkpt_acquire();
    if(_chkpt_count != chkpt_stamp)
        goto retry;
    
    /*
     *  Finally, we're ready to start the actual checkpoint!
     *
     *  Write a Checkpoint Begin Log and record its lsn in master
     */
    lsn_t master;
    LOG_INSERT(chkpt_begin_log(io->GetLastMountLSN()), &master);

    /*
     *  Checkpoint the buffer pool dirty page table, and record
     *  minimum of the recovery lsn of all dirty pages.
     *
     *  We could do this (very slow) operation before grabbing the
     *  checkpoint mutex because all pages can do is get younger by
     *  being flushed; no page can become older than the min_rec_lsn
     *  we record here ... however, we have to serialize checkpoints
     *  because, although they are fuzzy, they cannot intermingle.
     *  One must complete before another starts. Recovery relies
     *  on it.  Either everyone uses wakeup_and_take or they (dismount,
     *  mount, etc) wait on this.
     *
     *  The srv_log does wakeup_and_take() whenever a new partition is
     *  opened, and it might be that a checkpoint is spanning a 
     *  partition.
     */
    lsn_t min_rec_lsn = lsn_t::max;
    {
        int bfsz = bf->npages();
        const int chunk = chkpt_bf_tab_t::max;

        w_auto_delete_array_t<lpid_t> pid(new lpid_t[chunk]);
        w_auto_delete_array_t<lsn_t> rec_lsn(new lsn_t[chunk]);
        w_assert1(pid && rec_lsn);

        int total_count = 0;
        for (int i = 0; i < bfsz; )  {
            /*
             *  Loop over all buffer pages
             */
            int count = chunk;
            // Have the minimum rec_lsn of the bunch
            // returned iff it's less than the value passed in
            W_COERCE( bf->get_rec_lsn(i, count, pid, rec_lsn, min_rec_lsn) );
            if (count)  {
                total_count += count;

                /*
                 *  Write a Buffer Table Log
                 */
                LOG_INSERT(chkpt_bf_tab_log(count, pid, rec_lsn), 0);
            }
        }
        //fprintf(stderr, "Checkpoint found %d dirty pages\n", total_count);
    }


    /*
     *  Checkpoint the dev mount table
     */
    {
        /*
         *  Log the mount table in "max loggable size" chunks.
         */
        // XXX casts due to enums
        const int chunk = (int)max_vols > (int)chkpt_dev_tab_t::max
            ? (int)chkpt_dev_tab_t::max : (int)max_vols;
        int dev_cnt = io->num_vols();

        int i;
        char **devs;
        devs = new char *[chunk];
        if (!devs)
            W_FATAL(fcOUTOFMEMORY);
        for (i = 0; i < chunk; i++) {
            devs[i] = new char[max_devname+1];
            if (!devs[i])
                W_FATAL(fcOUTOFMEMORY);
        }
        vid_t *vids;
        vids = new vid_t[chunk];
        if (!vids)
            W_FATAL(fcOUTOFMEMORY);

        for (i = 0; i < dev_cnt; i += chunk)  {
            
            int ret;
            W_COERCE( io->get_vols(i, MIN(dev_cnt - i, chunk),
                          devs, vids, ret));
            if (ret)  {
                /*
                 *  Write a Checkpoint Device Table Log
                 */
                // XXX The bogus 'const char **' cast is for visual c++
                LOG_INSERT(chkpt_dev_tab_log(ret, (const char **) devs, vids), 0);
            }
        }
        delete [] vids;
        for (i = 0; i < chunk; i++)
            delete [] devs[i];
        delete [] devs;
    }


    /*
     *  Checkpoint the transaction table, and record
     *  minimum of first_lsn of all transactions.
     */
    lsn_t min_xct_lsn = lsn_t::max;
    {
        const int chunk = chkpt_xct_tab_t::max;
        tid_t youngest = xct_t::youngest_tid();
        w_auto_delete_array_t<tid_t> tid(new tid_t[chunk]);
        w_auto_delete_array_t<xct_state_t> state(new xct_state_t[chunk]);
        w_auto_delete_array_t<lsn_t> last_lsn(new lsn_t[chunk]);
        w_auto_delete_array_t<lsn_t> undo_nxt(new lsn_t[chunk]);

        /* Keep the transaction list static while we write the state of
           prepared transactions.  Without the lock the list could change
           underneath this checkpoint. Note that we are using the
           iterator without locking because we own the lock.

           FRJ: even though xct_i now locks in a fully safe way, we
           want to hold the mutex longer than it's in scope so we
           continue using manual locking.
        */
        xct_i x(true); // true -> acquire the mutex

        const xct_t* xd = 0;
        do {
            int i = 0;
            while (i < chunk && (xd = x.next()))  {
                /*
                 *  Loop over all transactions and record only
                 *  xcts that dirtied something.
                 *  Skip those that have ended but not yet
                 *  been destroyed.
                 */
                if( xd->state() == xct_t::xct_ended) {
                   continue;
                }
                if (xd->first_lsn().valid())  {
                    tid[i] = xd->tid();
                    state[i] = xd->state();
                    //
                    // NOTE: aborting xcts are installed as active -
                    // they will be aborted on restart if not ended
                    // by the time we restart.
                    if (state[i] == xct_t::xct_aborting) 
                        state[i] = xct_t::xct_active;
                    //

                    if (state[i] == xct_t::xct_prepared)  {
                        DBG(<< tid[i] <<" is prepared -- logging as active");
                        state[i] = xct_t::xct_active;
                    }
                    //  ^^^^^^^^^^^^^^^^^^^^^^^^^^^
                    // don't worry - it
                    // will be prepared in the next section.
                    // this just makes recovery debug-checking
                    // a little easier
                    /////////////////////////////////////////

                    last_lsn[i] = xd->last_lsn();
                    undo_nxt[i] = xd->undo_nxt();
                    
                    if (min_xct_lsn > xd->first_lsn())
                        min_xct_lsn = xd->first_lsn();

                    i++;
                }
            }

            /*
            // We *always* have to write this record, because we have
            // to record the youngest xct!!!! NEH
            // if (i)  
            */
            {
                /*
                 *  Write a Transaction Table Log
                 */
                LOG_INSERT(chkpt_xct_tab_log(youngest, i, tid, state,
                                             last_lsn, undo_nxt), 0);
            }
        } while (xd);
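
Both versions rely on the same chunking idiom: each checkpoint table is emitted as a sequence of log records holding at most chkpt_bf_tab_t::max (or chkpt_xct_tab_t::max, etc.) entries, because a single log record has a bounded payload. A minimal, self-contained sketch of that pattern follows; emit_chunk() is a hypothetical stand-in for the LOG_INSERT calls above, not part of the SM API:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Placeholder for LOG_INSERT: here it only reports the chunk size.
static void emit_chunk(const int* /*data*/, std::size_t count)
{
    std::printf("one log record with %zu entries\n", count);
}

// Walk `items` in windows of at most `chunk` entries, the way take()
// bounds each chkpt_* record by its maximum loggable size.
static void emit_in_chunks(const std::vector<int>& items, std::size_t chunk)
{
    for (std::size_t i = 0; i < items.size(); i += chunk) {
        std::size_t count = std::min(chunk, items.size() - i);
        emit_chunk(&items[i], count);
    }
}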
Example #2
void chkpt_t::serialize()
{
    // Allocate a buffer for storing log records
    logrec_t* logrec = new logrec_t;

    size_t chunk;

    // Serialize bkp_tab -- CS TODO
    if (!bkp_path.empty()) {
        vector<string> backup_paths;
        backup_paths.push_back(bkp_path);
        LOG_INSERT(chkpt_backup_tab_log(backup_paths.size(),
                    (const string*)(&backup_paths[0])), 0);
    }

    //LOG_INSERT(chkpt_restore_tab_log(vol->vid()), 0);

    // Serialize buf_tab
    chunk = chkpt_bf_tab_t::max;
    vector<PageID> pid;
    vector<lsn_t> rec_lsn;
    vector<lsn_t> page_lsn;
    for(buf_tab_t::const_iterator it = buf_tab.begin();
            it != buf_tab.end(); ++it)
    {
        DBGOUT1(<<"pid[]="<<it->first<< " , " <<
                  "rec_lsn[]="<<it->second.rec_lsn<< " , " <<
                  "page_lsn[]="<<it->second.page_lsn);
        pid.push_back(it->first);
        rec_lsn.push_back(it->second.rec_lsn);
        page_lsn.push_back(it->second.page_lsn);
        if(pid.size()==chunk || &*it==&*buf_tab.rbegin()) {
            LOG_INSERT(chkpt_bf_tab_log(pid.size(), (const PageID*)(&pid[0]),
                                        (const lsn_t*)(&rec_lsn[0]),
                                        (const lsn_t*)(&page_lsn[0])), 0);
            pid.clear();
            rec_lsn.clear();
            page_lsn.clear();
        }
    }

    chunk = chkpt_xct_tab_t::max;
    vector<tid_t> tid;
    vector<smlevel_0::xct_state_t> state;
    vector<lsn_t> last_lsn;
    vector<lsn_t> first_lsn;
    vector<okvl_mode> lock_mode;
    vector<uint32_t> lock_hash;
    for(xct_tab_t::const_iterator it=xct_tab.begin();
            it != xct_tab.end(); ++it) {
        DBGOUT1(<<"tid[]="<<it->first<<" , " <<
                  "state[]="<<it->second.state<< " , " <<
                  "last_lsn[]="<<it->second.last_lsn<<" , " <<
                  "first_lsn[]="<<it->second.first_lsn);

        tid.push_back(it->first);
        state.push_back(it->second.state);
        last_lsn.push_back(it->second.last_lsn);
        first_lsn.push_back(it->second.first_lsn);
        if(tid.size()==chunk || &*it==&*xct_tab.rbegin()) {
            LOG_INSERT(chkpt_xct_tab_log(get_highest_tid(), tid.size(),
                                        (const tid_t*)(&tid[0]),
                                        (const smlevel_0::xct_state_t*)(&state[0]),
                                        (const lsn_t*)(&last_lsn[0]),
                                        (const lsn_t*)(&first_lsn[0])), 0);
            tid.clear();
            state.clear();
            last_lsn.clear();
            first_lsn.clear();
        }

        // gather lock table (note: this reuses chunk == chkpt_xct_tab_t::max
        // as the batch bound; if chkpt_xct_lock_t defines its own max, that
        // would be the tighter, correct limit)
        for(vector<lock_info_t>::const_iterator jt = it->second.locks.begin();
                jt != it->second.locks.end(); ++jt)
        {
            DBGOUT1(<<"    lock_mode[]="<<jt->lock_mode<<" , lock_hash[]="<<jt->lock_hash);
            lock_mode.push_back(jt->lock_mode);
            lock_hash.push_back(jt->lock_hash);
            if(lock_mode.size() == chunk) {
                LOG_INSERT(chkpt_xct_lock_log(it->first,
                                      lock_mode.size(),
                                      (const okvl_mode*)(&lock_mode[0]),
                                      (const uint32_t*)(&lock_hash[0])), 0);
                lock_mode.clear();
                lock_hash.clear();
            }
        }
        if(lock_mode.size() > 0) {
            LOG_INSERT(chkpt_xct_lock_log(it->first,
                        lock_mode.size(),
                        (const okvl_mode*)(&lock_mode[0]),
                        (const uint32_t*)(&lock_hash[0])), 0);
            lock_mode.clear();
            lock_hash.clear();
        }
    }

    // In case the transaction table was empty, we insert a chkpt_xct_tab_log
    // anyway, because we want to save the highest tid. The vectors are empty
    // here, so use data() rather than &v[0], which is undefined behavior for
    // an empty vector.
    if(xct_tab.size() == 0) {
        LOG_INSERT(chkpt_xct_tab_log(get_highest_tid(), tid.size(),
                                     tid.data(),
                                     state.data(),
                                     last_lsn.data(),
                                     first_lsn.data()), 0);
    }

    delete logrec;
}
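
The &*it == &*xct_tab.rbegin() test above flushes the final partial chunk from inside the loop. An equivalent shape (a sketch under the same chunking assumption, not the code above) accumulates into a batch and flushes any leftover once after the loop, the way the lock-table section already handles its remainder:

#include <cstddef>
#include <cstdio>
#include <vector>

// Placeholder for the LOG_INSERT call: report the batch size and reset.
static void flush(std::vector<int>& batch)
{
    std::printf("log record with %zu entries\n", batch.size());
    batch.clear();
}

static void emit_all(const std::vector<int>& items, std::size_t chunk)
{
    std::vector<int> batch;
    for (int v : items) {
        batch.push_back(v);
        if (batch.size() == chunk)
            flush(batch);           // full chunk -> one log record
    }
    if (!batch.empty())
        flush(batch);               // final partial chunk
}

This also sidesteps the address comparison against rbegin(), which only works because map elements have stable addresses.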