void chkpt_m::take()
{
    chkpt_mutex.acquire_write();
    DBGOUT1(<<"BEGIN chkpt_m::take");
    INC_TSTAT(log_chkpt_cnt);

    // Insert chkpt_begin log record.
    logrec_t* logrec = new logrec_t;
    lsn_t begin_lsn;
    LOG_INSERT(chkpt_begin_log(lsn_t::null), &begin_lsn);
    W_COERCE(ss_m::log->flush_all());

    // Collect checkpoint information from log
    curr_chkpt.scan_log();

    // Serialize chkpt to file
    fs::path fpath = smlevel_0::log->get_storage()->make_chkpt_path(lsn_t::null);
    fs::path newpath = smlevel_0::log->get_storage()->make_chkpt_path(begin_lsn);
    ofstream ofs(fpath.string(), ios::binary | ios::trunc);
    curr_chkpt.serialize_binary(ofs);
    ofs.close();
    fs::rename(fpath, newpath);

    _min_rec_lsn = curr_chkpt.get_min_rec_lsn();
    _min_xct_lsn = curr_chkpt.get_min_xct_lsn();

    // Release the 'write' mutex so the next checkpoint request can come in
    chkpt_mutex.release_write();

    delete logrec;
}
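/* The take() above publishes the serialized checkpoint by writing it under a
 * scratch name (the lsn_t::null path) and renaming it to the path keyed by
 * begin_lsn once the stream is closed. Below is a minimal sketch of that
 * write-then-rename idiom; the function name and payload argument are
 * illustrative only, it stands in for (and is not) the real
 * chkpt_t/log_storage API, and fs::rename is only atomic when both paths
 * live on the same filesystem.
 */
static void install_chkpt_file_sketch(const fs::path& scratch,
                                      const fs::path& final_path,
                                      const std::string& payload)
{
    // Write the whole payload under the scratch name first...
    std::ofstream ofs(scratch.string(), std::ios::binary | std::ios::trunc);
    ofs << payload;   // stand-in for curr_chkpt.serialize_binary(ofs)
    ofs.close();
    // ...then rename, so a crash mid-write never leaves a truncated file
    // under the name that restart will look for.
    fs::rename(scratch, final_path);
}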
/*********************************************************************
 *
 *  chkpt_m::take()
 *
 *  Take a checkpoint. A Checkpoint consists of:
 *    1. Checkpoint Begin Log                  (chkpt_begin)
 *    2. Checkpoint Device Table Log(s)        (chkpt_dev_tab)
 *         -- all mounted devices
 *    3. Checkpoint Buffer Table Log(s)        (chkpt_bf_tab)
 *         -- dirty page entries in bf and their recovery lsn
 *    4. Checkpoint Transaction Table Log(s)   (chkpt_xct_tab)
 *         -- active transactions and their first lsn
 *    5. Checkpoint Prepared Transactions (optional)
 *         -- prepared transactions and their locks
 *            (using the same log records that prepare does)
 *    6. Checkpoint End Log                    (chkpt_end)
 *
 *********************************************************************/
void chkpt_m::take()
{
    FUNC(chkpt_m::take);
    if (! log) {
        /*
         *  Recovery facilities disabled ... do nothing.
         */
        return;
    }
    INC_TSTAT(log_chkpt_cnt);

    /*
     *  Allocate a buffer for storing log records.
     */
    w_auto_delete_t<logrec_t> logrec(new logrec_t);

    /*
     *  Checkpoints are fuzzy, but must be serialized w.r.t. each other.
     *
     *  Acquire the mutex to serialize prepares and checkpoints.
     *
     *  NB: EVERYTHING BETWEEN HERE AND RELEASING THE MUTEX
     *  MUST BE W_COERCE (not W_DO).
     */
    chkpt_serial_m::chkpt_acquire();
 retry:

    /*
     * FRJ: We must somehow guarantee that the log always has space to
     * accept checkpoints. We impose two constraints to this end:
     *
     * 1. We cap the total space checkpoints are allowed to consume in
     *    any one log partition. This is a good idea anyway because
     *    checkpoint size is linear in the number of dirty buffer pool
     *    pages -- ~2MB per GB of dirty data -- and yet the utility of
     *    checkpoints drops off quickly as the dirty page count
     *    increases -- log analysis and recovery must start at the lsn
     *    of the oldest dirty page regardless of how recent the
     *    checkpoint was.
     *
     * 2. No checkpoint may depend on more than /max_openlog-1/ log
     *    partitions. In other words, every checkpoint completion must
     *    leave at least one log partition available.
     *
     * We use these two constraints, together with log reservations,
     * to guarantee the ability to reclaim log space if the log
     * becomes full. The log maintains, on our behalf, a reservation
     * big enough for two maximally-sized checkpoints (i.e. the dirty
     * page table lists every page in the buffer pool). Every time we
     * reclaim a log segment this reservation is topped up atomically.
     */
#define LOG_INSERT(constructor_call, rlsn)                           \
    do {                                                             \
        new (logrec) constructor_call;                               \
        W_COERCE( log->insert(*logrec, rlsn) );                      \
        if (!log->consume_chkpt_reservation(logrec->length())) {     \
            W_FATAL(eOUTOFLOGSPACE);                                 \
        }                                                            \
    } while(0)

    /* If the current partition is max_openlog then the oldest lsn we
       can tolerate is 2.0. We must flush all pages dirtied before
       that time and must wait until all transactions with an earlier
       start_lsn have ended (at worst they will abort if the log fills
       up before they can commit).

       TODO: use smlevel_0::log_warn_callback to notify the VAS in
       case old transactions aren't currently active for some reason.

       Also, remember the current checkpoint count so we can see
       whether we get raced...
    */
    // #warning "TODO use log_warn_callback in case old transactions aren't logging right now"
    long curr_pnum = log->curr_lsn().file();
    long too_old_pnum = std::max(0l, curr_pnum - max_openlog + 1);
    if (!log->verify_chkpt_reservation()) {
        /* Yikes! The log can't guarantee that we'll be able to
           complete any checkpoint after this one, so we must reclaim
           space even if the log doesn't seem to be full.
         */
        too_old_pnum = log->global_min_lsn().file();
        if (too_old_pnum == curr_pnum) {
            // how/why did they reserve so much log space???
            W_FATAL(eOUTOFLOGSPACE);
        }
    }

    /* We cannot proceed if any transaction has a too-low start_lsn;
       wait for them to complete before continuing.

       WARNING: we have to wake any old transactions which are waiting
       on locks, or we risk deadlocks where the lock holder waits on a
       full log while the old transaction waits on the lock.
    */
    lsn_t oldest_valid_lsn = log_m::first_lsn(too_old_pnum + 1);
    old_xct_tracker tracker;
    {
        xct_i it(true); // do acquire the xlist_mutex...
        while (xct_t* xd = it.next()) {
            lsn_t const &flsn = xd->first_lsn();
            if (flsn.valid() && flsn < oldest_valid_lsn) {
                // poison the transaction and add it to the list...
                xd->force_nonblocking();
                tracker.track(xd);
            }
        }
    }

    /* Release the chkpt_serial to do expensive stuff.

       We'll record the current checkpoint count so we can detect
       whether we get raced during the gap.
    */
    long chkpt_stamp = _chkpt_count;
    chkpt_serial_m::chkpt_release();

    // clear out all too-old pages
    W_COERCE(bf->force_until_lsn(oldest_valid_lsn, false));

    /* Hopefully the page cleaning took long enough that the old
       transactions all ended...
    */
    if (!tracker.finished())
        tracker.wait_for_all();

    // raced?
    chkpt_serial_m::chkpt_acquire();
    if (_chkpt_count != chkpt_stamp)
        goto retry;

    /*
     *  Finally, we're ready to start the actual checkpoint!
     *
     *  Write a Checkpoint Begin Log and record its lsn in master.
     */
    lsn_t master;
    LOG_INSERT(chkpt_begin_log(io->GetLastMountLSN()), &master);

    /*
     *  Checkpoint the buffer pool dirty page table, and record
     *  the minimum of the recovery lsn of all dirty pages.
     *
     *  We could do this (very slow) operation before grabbing the
     *  checkpoint mutex because all pages can do is get younger by
     *  being flushed; no page can become older than the min_rec_lsn
     *  we record here ... however, we have to serialize checkpoints
     *  because, although they are fuzzy, they cannot intermingle.
     *  One must complete before another starts. Recovery relies
     *  on it. Either everyone uses wakeup_and_take or they (dismount,
     *  mount, etc) wait on this.
     *
     *  The srv_log does wakeup_and_take() whenever a new partition is
     *  opened, and it might be that a checkpoint is spanning a
     *  partition.
     */
    lsn_t min_rec_lsn = lsn_t::max;
    {
        int bfsz = bf->npages();
        const int chunk = chkpt_bf_tab_t::max;

        w_auto_delete_array_t<lpid_t> pid(new lpid_t[chunk]);
        w_auto_delete_array_t<lsn_t> rec_lsn(new lsn_t[chunk]);
        w_assert1(pid && rec_lsn);

        int total_count = 0;
        for (int i = 0; i < bfsz; ) {
            /*
             *  Loop over all buffer pages; get_rec_lsn advances i
             *  past the frames it examined.
             */
            int count = chunk;
            // Have the minimum rec_lsn of the bunch
            // returned iff it's less than the value passed in
            W_COERCE( bf->get_rec_lsn(i, count, pid, rec_lsn, min_rec_lsn) );
            if (count) {
                total_count += count;
                /*
                 *  Write a Buffer Table Log
                 */
                LOG_INSERT(chkpt_bf_tab_log(count, pid, rec_lsn), 0);
            }
        }
        //fprintf(stderr, "Checkpoint found %d dirty pages\n", total_count);
    }

    /*
     *  Checkpoint the dev mount table.
     */
    {
        /*
         *  Log the mount table in "max loggable size" chunks.
         */
        // XXX casts due to enums
        const int chunk = (int)max_vols > (int)chkpt_dev_tab_t::max
                              ? (int)chkpt_dev_tab_t::max : (int)max_vols;
        int dev_cnt = io->num_vols();

        int i;
        char **devs;
        devs = new char *[chunk];
        if (!devs)
            W_FATAL(fcOUTOFMEMORY);
        for (i = 0; i < chunk; i++) {
            devs[i] = new char[max_devname+1];
            if (!devs[i])
                W_FATAL(fcOUTOFMEMORY);
        }
        vid_t *vids;
        vids = new vid_t[chunk];
        if (!vids)
            W_FATAL(fcOUTOFMEMORY);

        for (i = 0; i < dev_cnt; i += chunk) {
            int ret;
            W_COERCE( io->get_vols(i, MIN(dev_cnt - i, chunk), devs, vids, ret));
            if (ret) {
                /*
                 *  Write a Checkpoint Device Table Log
                 */
                // XXX The bogus 'const char **' cast is for visual c++
                LOG_INSERT(chkpt_dev_tab_log(ret, (const char **) devs, vids), 0);
            }
        }
        delete [] vids;
        for (i = 0; i < chunk; i++)
            delete [] devs[i];
        delete [] devs;
    }

    /*
     *  Checkpoint the transaction table, and record
     *  the minimum of first_lsn of all transactions.
     */
    lsn_t min_xct_lsn = lsn_t::max;
    {
        const int chunk = chkpt_xct_tab_t::max;
        tid_t youngest = xct_t::youngest_tid();
        w_auto_delete_array_t<tid_t> tid(new tid_t[chunk]);
        w_auto_delete_array_t<xct_state_t> state(new xct_state_t[chunk]);
        w_auto_delete_array_t<lsn_t> last_lsn(new lsn_t[chunk]);
        w_auto_delete_array_t<lsn_t> undo_nxt(new lsn_t[chunk]);

        /* Keep the transaction list static while we write the state
           of prepared transactions. Without the lock the list could
           change underneath this checkpoint. Note that we are using
           the iterator without locking because we own the lock.

           FRJ: even though xct_i now locks in a fully safe way, we
           want to hold the mutex longer than it's in scope so we
           continue using manual locking.
        */
        xct_i x(true); // true -> acquire the mutex
        const xct_t* xd = 0;
        do {
            int i = 0;
            while (i < chunk && (xd = x.next())) {
                /*
                 *  Loop over all transactions and record only
                 *  xcts that dirtied something.
                 *  Skip those that have ended but not yet
                 *  been destroyed.
                 */
                if (xd->state() == xct_t::xct_ended) {
                    continue;
                }
                if (xd->first_lsn().valid()) {
                    tid[i] = xd->tid();
                    state[i] = xd->state();
                    //
                    // NOTE: aborting xcts are installed as active -
                    // they will be aborted on restart if not ended
                    // by the time we restart.
                    if (state[i] == xct_t::xct_aborting)
                        state[i] = xct_t::xct_active;
                    //
                    if (state[i] == xct_t::xct_prepared) {
                        DBG(<< tid[i] << " is prepared -- logging as active");
                        state[i] = xct_t::xct_active;
                    }
                    // ^^^^^^^^^^^^^^^^^^^^^^^^^^^
                    // Don't worry - it will be prepared in the next
                    // section. This just makes recovery debug-checking
                    // a little easier.
                    /////////////////////////////////////////

                    last_lsn[i] = xd->last_lsn();
                    undo_nxt[i] = xd->undo_nxt();

                    if (min_xct_lsn > xd->first_lsn())
                        min_xct_lsn = xd->first_lsn();
                    i++;
                }
            }

            /*
            // We *always* have to write this record, because we have
            // to record the youngest xct!!!!     NEH
            // if (i)
            */
            {
                /*
                 *  Write a Transaction Table Log
                 */
                LOG_INSERT(chkpt_xct_tab_log(youngest, i, tid, state,
                                             last_lsn, undo_nxt), 0);
            }
        } while (xd);
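/* The buffer-pool, device, and transaction tables above are all logged with
 * the same chunking pattern: each checkpoint record type has a fixed capacity
 * (chkpt_bf_tab_t::max, chkpt_dev_tab_t::max, chkpt_xct_tab_t::max), so a
 * table is emitted as a sequence of records holding at most that many entries.
 * The stand-alone sketch below illustrates just that pattern; log_in_chunks
 * and its emit callback are hypothetical names, not part of this code base,
 * and it assumes <vector> and <algorithm> are available.
 */
template <typename T, typename Emit>
static void log_in_chunks(const std::vector<T>& entries,
                          size_t max_per_record, Emit emit)
{
    for (size_t i = 0; i < entries.size(); i += max_per_record) {
        size_t count = std::min(max_per_record, entries.size() - i);
        // One checkpoint log record covers entries [i, i + count).
        emit(&entries[i], count);
    }
}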
void chkpt_t::serialize()
{
    // Allocate a buffer for storing log records
    logrec_t* logrec = new logrec_t;

    size_t chunk;

    // Serialize bkp_tab -- CS TODO
    if (!bkp_path.empty()) {
        vector<string> backup_paths;
        backup_paths.push_back(bkp_path);
        LOG_INSERT(chkpt_backup_tab_log(backup_paths.size(),
                                        (const string*)(&backup_paths[0])), 0);
    }
    //LOG_INSERT(chkpt_restore_tab_log(vol->vid()), 0);

    // Serialize buf_tab
    chunk = chkpt_bf_tab_t::max;
    vector<PageID> pid;
    vector<lsn_t> rec_lsn;
    vector<lsn_t> page_lsn;
    for (buf_tab_t::const_iterator it = buf_tab.begin();
            it != buf_tab.end(); ++it) {
        DBGOUT1(<<"pid[]="<<it->first<< " , " <<
                  "rec_lsn[]="<<it->second.rec_lsn<< " , " <<
                  "page_lsn[]="<<it->second.page_lsn);
        pid.push_back(it->first);
        rec_lsn.push_back(it->second.rec_lsn);
        page_lsn.push_back(it->second.page_lsn);
        // Emit a chkpt_bf_tab record when the chunk is full or this is
        // the last buf_tab entry.
        if (pid.size() == chunk || &*it == &*buf_tab.rbegin()) {
            LOG_INSERT(chkpt_bf_tab_log(pid.size(), (const PageID*)(&pid[0]),
                                        (const lsn_t*)(&rec_lsn[0]),
                                        (const lsn_t*)(&page_lsn[0])), 0);
            pid.clear();
            rec_lsn.clear();
            page_lsn.clear();
        }
    }

    // Serialize xct_tab
    chunk = chkpt_xct_tab_t::max;
    vector<tid_t> tid;
    vector<smlevel_0::xct_state_t> state;
    vector<lsn_t> last_lsn;
    vector<lsn_t> first_lsn;
    vector<okvl_mode> lock_mode;
    vector<uint32_t> lock_hash;
    for (xct_tab_t::const_iterator it = xct_tab.begin();
            it != xct_tab.end(); ++it) {
        DBGOUT1(<<"tid[]="<<it->first<<" , " <<
                  "state[]="<<it->second.state<< " , " <<
                  "last_lsn[]="<<it->second.last_lsn<<" , " <<
                  "first_lsn[]="<<it->second.first_lsn);
        tid.push_back(it->first);
        state.push_back(it->second.state);
        last_lsn.push_back(it->second.last_lsn);
        first_lsn.push_back(it->second.first_lsn);
        // Emit a chkpt_xct_tab record when the chunk is full or this is
        // the last xct_tab entry.
        if (tid.size() == chunk || &*it == &*xct_tab.rbegin()) {
            LOG_INSERT(chkpt_xct_tab_log(get_highest_tid(), tid.size(),
                                         (const tid_t*)(&tid[0]),
                                         (const smlevel_0::xct_state_t*)(&state[0]),
                                         (const lsn_t*)(&last_lsn[0]),
                                         (const lsn_t*)(&first_lsn[0])), 0);
            tid.clear();
            state.clear();
            last_lsn.clear();
            first_lsn.clear();
        }

        // Gather lock table of this transaction
        for (vector<lock_info_t>::const_iterator jt = it->second.locks.begin();
                jt != it->second.locks.end(); ++jt) {
            DBGOUT1(<<"    lock_mode[]="<<jt->lock_mode
                    <<" , lock_hash[]="<<jt->lock_hash);
            lock_mode.push_back(jt->lock_mode);
            lock_hash.push_back(jt->lock_hash);
            if (lock_mode.size() == chunk) {
                LOG_INSERT(chkpt_xct_lock_log(it->first, lock_mode.size(),
                                              (const okvl_mode*)(&lock_mode[0]),
                                              (const uint32_t*)(&lock_hash[0])), 0);
                lock_mode.clear();
                lock_hash.clear();
            }
        }
        // Flush any remaining locks of this transaction
        if (lock_mode.size() > 0) {
            LOG_INSERT(chkpt_xct_lock_log(it->first, lock_mode.size(),
                                          (const okvl_mode*)(&lock_mode[0]),
                                          (const uint32_t*)(&lock_hash[0])), 0);
            lock_mode.clear();
            lock_hash.clear();
        }
    }

    // In case the transaction table was empty, we insert a chkpt_xct_tab record
    // anyway, because we want to save the highest tid.
    if (xct_tab.size() == 0) {
        LOG_INSERT(chkpt_xct_tab_log(get_highest_tid(), tid.size(),
                                     (const tid_t*)(&tid[0]),
                                     (const smlevel_0::xct_state_t*)(&state[0]),
                                     (const lsn_t*)(&last_lsn[0]),
                                     (const lsn_t*)(&first_lsn[0])), 0);
    }

    delete logrec;
}
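/* For reference, the fields that serialize() reads out of buf_tab and xct_tab
 * imply per-entry structures roughly like the ones sketched below. This is a
 * hedged reconstruction from the accesses above (it->second.rec_lsn, .state,
 * jt->lock_mode, ...), not the actual definitions in the checkpoint header.
 */
struct buf_tab_entry_sketch {
    lsn_t rec_lsn;     // earliest lsn that dirtied the page; recovery starts from here
    lsn_t page_lsn;    // lsn of the last update applied to the page
};

struct lock_info_sketch {
    okvl_mode lock_mode;   // mode of the lock held by the transaction
    uint32_t  lock_hash;   // hash identifying the locked key
};

struct xct_tab_entry_sketch {
    smlevel_0::xct_state_t state;         // transaction state at checkpoint time
    lsn_t last_lsn;                       // last log record written by the xct
    lsn_t first_lsn;                      // first log record written by the xct
    std::vector<lock_info_sketch> locks;  // locks recorded with the checkpoint
};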