/* * __wt_log_scan -- * Scan the logs, calling a function on each record found. */ int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie) { WT_CONNECTION_IMPL *conn; WT_ITEM buf; WT_DECL_RET; WT_FH *log_fh; WT_LOG *log; WT_LOG_RECORD *logrec; WT_LSN end_lsn, rd_lsn, start_lsn; off_t log_size; uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen; u_int i, logcount; int eol; char **logfiles; conn = S2C(session); log = conn->log; log_fh = NULL; logcount = 0; logfiles = NULL; eol = 0; WT_CLEAR(buf); /* * If the caller did not give us a callback function there is nothing * to do. */ if (func == NULL) return (0); if (LF_ISSET(WT_LOGSCAN_RECOVER)) WT_RET(__wt_verbose(session, WT_VERB_LOG, "__wt_log_scan truncating to %u/%" PRIuMAX, log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset)); if (log != NULL) { allocsize = log->allocsize; if (lsnp == NULL) { if (LF_ISSET(WT_LOGSCAN_FIRST)) start_lsn = log->first_lsn; else if (LF_ISSET(WT_LOGSCAN_FROM_CKP)) start_lsn = log->ckpt_lsn; else return (WT_ERROR); /* Illegal usage */ } else { if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP)) WT_RET_MSG(session, WT_ERROR, "choose either a start LSN or a start flag"); /* Offsets must be on allocation boundaries. */ if (lsnp->offset % allocsize != 0 || lsnp->file > log->fileid) return (WT_NOTFOUND); /* * Log cursors may not know the starting LSN. If an * LSN pointer is passed in, but it is the INIT_LSN, * start from the first_lsn. */ start_lsn = *lsnp; if (IS_INIT_LSN(&start_lsn)) start_lsn = log->first_lsn; } end_lsn = log->alloc_lsn; } else { /* * If logging is not configured, we can still print out the log * if log files exist. We just need to set the LSNs from what * is in the files versus what is in the live connection. */ /* * Set allocsize to the minimum alignment it could be. Larger * records and larger allocation boundaries should always be * a multiple of this. */ allocsize = LOG_ALIGN; lastlog = 0; firstlog = UINT32_MAX; WT_RET(__wt_log_get_files(session, &logfiles, &logcount)); if (logcount == 0) /* * Return it is not supported if none don't exist. */ return (ENOTSUP); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); lastlog = WT_MAX(lastlog, lognum); firstlog = WT_MIN(firstlog, lognum); } start_lsn.file = firstlog; end_lsn.file = lastlog; start_lsn.offset = end_lsn.offset = 0; __wt_log_files_free(session, logfiles, logcount); logfiles = NULL; } WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file)); WT_ERR(__log_filesize(session, log_fh, &log_size)); rd_lsn = start_lsn; WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN)); for (;;) { if (rd_lsn.offset + allocsize > log_size) { advance: /* * If we read the last record, go to the next file. */ WT_ERR(__wt_close(session, log_fh)); log_fh = NULL; eol = 1; /* * Truncate this log file before we move to the next. */ if (LF_ISSET(WT_LOGSCAN_RECOVER)) WT_ERR(__log_truncate(session, &rd_lsn, 1)); rd_lsn.file++; rd_lsn.offset = 0; /* * Avoid an error message when we reach end of log * by checking here. */ if (rd_lsn.file > end_lsn.file) break; WT_ERR(__log_openfile( session, 0, &log_fh, rd_lsn.file)); WT_ERR(__log_filesize(session, log_fh, &log_size)); continue; } /* * Read the minimum allocation size a record could be. */ WT_ASSERT(session, buf.memsize >= allocsize); WT_ERR(__wt_read(session, log_fh, rd_lsn.offset, (size_t)allocsize, buf.mem)); /* * First 8 bytes is the real record length. See if we * need to read more than the allocation size. We expect * that we rarely will have to read more. Most log records * will be fairly small. */ reclen = *(uint32_t *)buf.mem; /* * Log files are pre-allocated. We never expect a zero length * unless we've reached the end of the log. The log can be * written out of order, so when recovery finds the end of * the log, truncate the file and remove any later log files * that may exist. */ if (reclen == 0) { /* This LSN is the end. */ break; } rdup_len = __wt_rduppo2(reclen, allocsize); if (reclen > allocsize) { /* * The log file end could be the middle of this * log record. */ if (rd_lsn.offset + rdup_len > log_size) goto advance; /* * We need to round up and read in the full padded * record, especially for direct I/O. */ WT_ERR(__wt_buf_grow(session, &buf, rdup_len)); WT_ERR(__wt_read(session, log_fh, rd_lsn.offset, (size_t)rdup_len, buf.mem)); WT_STAT_FAST_CONN_INCR(session, log_scan_rereads); } /* * We read in the record, verify checksum. */ buf.size = reclen; logrec = (WT_LOG_RECORD *)buf.mem; cksum = logrec->checksum; logrec->checksum = 0; logrec->checksum = __wt_cksum(logrec, logrec->len); if (logrec->checksum != cksum) { /* * A checksum mismatch means we have reached the end of * the useful part of the log. This should be found on * the first pass through recovery. In the second pass * where we truncate the log, this is where it should * end. */ if (log != NULL) log->trunc_lsn = rd_lsn; break; } /* * We have a valid log record. If it is not the log file * header, invoke the callback. */ WT_STAT_FAST_CONN_INCR(session, log_scan_records); if (rd_lsn.offset != 0) { WT_ERR((*func)(session, &buf, &rd_lsn, cookie)); if (LF_ISSET(WT_LOGSCAN_ONE)) break; } rd_lsn.offset += (off_t)rdup_len; } /* Truncate if we're in recovery. */ if (LF_ISSET(WT_LOGSCAN_RECOVER) && LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0) WT_ERR(__log_truncate(session, &rd_lsn, 0)); err: WT_STAT_FAST_CONN_INCR(session, log_scans); if (logfiles != NULL) __wt_log_files_free(session, logfiles, logcount); __wt_buf_free(session, &buf); /* * If the caller wants one record and it is at the end of log, * return WT_NOTFOUND. */ if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0) ret = WT_NOTFOUND; if (ret == ENOENT) ret = 0; if (log_fh != NULL) WT_TRET(__wt_close(session, log_fh)); return (ret); }
/* * __wt_txn_recover -- * Run recovery. */ int __wt_txn_recover(WT_CONNECTION_IMPL *conn) { WT_CURSOR *metac; WT_DECL_RET; WT_RECOVERY r; WT_SESSION_IMPL *session; struct WT_RECOVERY_FILE *metafile; char *config; int was_backup; WT_CLEAR(r); INIT_LSN(&r.ckpt_lsn); was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0; /* We need a real session for recovery. */ WT_RET(__wt_open_session(conn, NULL, NULL, &session)); F_SET(session, WT_SESSION_NO_LOGGING); r.session = session; WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); WT_ERR(__wt_metadata_cursor(session, NULL, &metac)); metafile = &r.files[WT_METAFILE_ID]; metafile->c = metac; /* * First, do a pass through the log to recover the metadata, and * establish the last checkpoint LSN. Skip this when opening a hot * backup: we already have the correct metadata in that case. */ if (!was_backup) { r.metadata_only = 1; if (IS_INIT_LSN(&metafile->ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r)); else WT_ERR(__wt_log_scan(session, &metafile->ckpt_lsn, 0, __txn_log_recover, &r)); WT_ASSERT(session, LOG_CMP(&r.ckpt_lsn, &conn->log->first_lsn) >= 0); } /* Scan the metadata to find the live files and their IDs. */ WT_ERR(__recovery_file_scan(&r)); /* * We no longer need the metadata cursor: close it to avoid pinning any * resources that could block eviction during recovery. */ r.files[0].c = NULL; WT_ERR(metac->close(metac)); /* * Now, recover all the files apart from the metadata. * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. */ r.metadata_only = 0; WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY, "Main recovery loop: starting at %u/%" PRIuMAX, r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset)); if (IS_INIT_LSN(&r.ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); else WT_ERR(__wt_log_scan(session, &r.ckpt_lsn, WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); conn->next_file_id = r.max_fileid; /* * If recovery ran successfully forcibly log a checkpoint so the next * open is fast and keep the metadata up to date with the checkpoint * LSN and archiving. */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); WT_TRET(session->iface.close(&session->iface, NULL)); return (ret); }