/* * __wt_log_vprintf -- * Write a message into the log. */ int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap) { WT_CONNECTION_IMPL *conn; WT_DECL_ITEM(logrec); WT_DECL_RET; va_list ap_copy; const char *rec_fmt = WT_UNCHECKED_STRING(I); uint32_t rectype = WT_LOGREC_MESSAGE; size_t header_size, len; conn = S2C(session); if (!conn->logging) return (0); va_copy(ap_copy, ap); len = (size_t)vsnprintf(NULL, 0, fmt, ap_copy) + 1; va_end(ap_copy); WT_RET( __wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec)); /* * We're writing a record with the type (an integer) followed by a * string (NUL-terminated data). To avoid writing the string into * a buffer before copying it, we write the header first, then the * raw bytes of the string. */ WT_ERR(__wt_struct_size(session, &header_size, rec_fmt, rectype)); WT_ERR(__wt_struct_pack(session, (uint8_t *)logrec->data + logrec->size, header_size, rec_fmt, rectype)); logrec->size += (uint32_t)header_size; (void)vsnprintf((char *)logrec->data + logrec->size, len, fmt, ap); WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_printf: %s", (char *)logrec->data + logrec->size)); logrec->size += len; WT_ERR(__wt_log_write(session, logrec, NULL, 0)); err: __wt_scr_free(&logrec); return (ret); }
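/*
 * Example (not part of the WiredTiger tree): a minimal sketch of the
 * two-pass vsnprintf sizing idiom used by __wt_log_vprintf above. The
 * first pass, given a NULL buffer, only measures the formatted length;
 * the va_list must be copied because it cannot be consumed twice. The
 * function name is illustrative.
 */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static char *
format_alloc(const char *fmt, ...)
{
	va_list ap, ap_copy;
	size_t len;
	char *buf;

	va_start(ap, fmt);
	va_copy(ap_copy, ap);
	/* Measure, then allocate len bytes including the trailing NUL. */
	len = (size_t)vsnprintf(NULL, 0, fmt, ap_copy) + 1;
	va_end(ap_copy);

	if ((buf = malloc(len)) != NULL)
		(void)vsnprintf(buf, len, fmt, ap);
	va_end(ap);
	return (buf);
}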
/* * __wt_log_open -- * Open the appropriate log file for the connection. The purpose is * to find the last log file that exists, open it and set our initial * LSNs to the end of that file. If none exist, call __wt_log_newfile * to create it. */ int __wt_log_open(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; uint32_t firstlog, lastlog, lognum; u_int i, logcount; char **logfiles; conn = S2C(session); log = conn->log; lastlog = 0; firstlog = UINT32_MAX; WT_RET(__wt_log_get_files(session, &logfiles, &logcount)); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); lastlog = WT_MAX(lastlog, lognum); firstlog = WT_MIN(firstlog, lognum); } log->fileid = lastlog; WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_open: first log %" PRIu32 " last log %" PRIu32, firstlog, lastlog)); log->first_lsn.file = firstlog; log->first_lsn.offset = 0; /* * Start logging at the beginning of the next log file, no matter * where the previous log file ends. */ WT_ERR(__wt_log_newfile(session, 1)); /* * If there were log files, run recovery. * XXX belongs at a higher level than this. */ if (logcount > 0) { log->trunc_lsn = log->alloc_lsn; WT_ERR(__wt_txn_recover(session)); } err: __wt_log_files_free(session, logfiles, logcount); return (ret); }
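/*
 * Example (not part of the WiredTiger tree): a sketch of deriving the
 * first/last log numbers from a list of file names, assuming names of
 * the form "WiredTigerLog.0000000001"; the prefix and the function name
 * are illustrative.
 */
#include <inttypes.h>
#include <stdio.h>

static void
log_range(char **names, unsigned count, uint32_t *firstp, uint32_t *lastp)
{
	uint32_t first, last, lognum;
	unsigned i;

	first = UINT32_MAX;
	last = 0;
	for (i = 0; i < count; i++)
		if (sscanf(names[i],
		    "WiredTigerLog.%10" SCNu32, &lognum) == 1) {
			if (lognum < first)
				first = lognum;
			if (lognum > last)
				last = lognum;
		}
	*firstp = first;
	*lastp = last;
}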
/* * __wt_fsync_async -- * Flush a file handle and don't wait for the result. */ int __wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh) { #ifdef HAVE_SYNC_FILE_RANGE WT_DECL_RET; WT_RET(__wt_verbose( session, WT_VERB_FILEOPS, "%s: sync_file_range", fh->name)); if ((ret = sync_file_range(fh->fd, (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) == 0) return (0); WT_RET_MSG(session, ret, "%s: sync_file_range", fh->name); #else WT_UNUSED(session); WT_UNUSED(fh); return (0); #endif }
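/*
 * Example (not part of the WiredTiger tree): the asynchronous flush in
 * isolation. sync_file_range(2) is Linux-specific; with offset and
 * nbytes both zero it covers the whole file, and SYNC_FILE_RANGE_WRITE
 * starts write-out without waiting for completion.
 */
#define _GNU_SOURCE
#include <fcntl.h>

static int
flush_async(int fd)
{
#ifdef SYNC_FILE_RANGE_WRITE
	return (sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE));
#else
	(void)fd;			/* Unsupported platform: no-op. */
	return (0);
#endif
}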
/* * __wt_block_truncate -- * Truncate the file. */ int __wt_block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t len) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; conn = S2C(session); __wt_verbose(session, WT_VERB_BLOCK, "truncate file to %" PRIuMAX, (uintmax_t)len); /* * Truncate requires serialization, we depend on our caller for that. * * Truncation isn't a requirement of the block manager, it's only used * to conserve disk space. Regardless of the underlying file system * call's result, the in-memory understanding of the file size changes. */ block->size = block->extend_size = len; /* * Backups are done by copying files outside of WiredTiger, potentially * by system utilities. We cannot truncate the file during the backup * window, we might surprise an application. * * This affects files that aren't involved in the backup (for example, * an incremental backup that only copies log files, or a targeted * backup, unnecessarily stops all block truncation). We may want a * more targeted solution at some point. */ if (!conn->hot_backup) { WT_WITH_HOTBACKUP_READ_LOCK(session, ret = __wt_ftruncate(session, block->fh, len), NULL); } /* * The truncate may fail temporarily or permanently (for example, there * may be a file mapping if there's an open checkpoint on the file on a * POSIX system, in which case the underlying function returns EBUSY). * It's OK, we don't have to be able to truncate files. */ return (ret == EBUSY || ret == ENOTSUP ? 0 : ret); }
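/*
 * Example (not part of the WiredTiger tree): a sketch of the tolerant
 * truncate policy above, with plain POSIX ftruncate standing in for
 * __wt_ftruncate. Truncation only reclaims disk space here, so EBUSY
 * and ENOTSUP are treated as benign.
 */
#include <sys/types.h>
#include <errno.h>
#include <unistd.h>

static int
truncate_if_possible(int fd, off_t len)
{
	int ret;

	ret = ftruncate(fd, len) == 0 ? 0 : errno;

	/* A mapped or non-truncatable file is OK: truncation is optional. */
	return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
}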
/* * __recovery_setup_file -- * Set up the recovery slot for a file. */ static int __recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config) { WT_CONFIG_ITEM cval; WT_LSN lsn; intmax_t offset; uint32_t fileid; WT_RET(__wt_config_getones(r->session, config, "id", &cval)); fileid = (uint32_t)cval.val; /* Track the largest file ID we have seen. */ if (fileid > r->max_fileid) r->max_fileid = fileid; if (r->nfiles <= fileid) { WT_RET(__wt_realloc_def( r->session, &r->file_alloc, fileid + 1, &r->files)); r->nfiles = fileid + 1; } WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri)); WT_RET( __wt_config_getones(r->session, config, "checkpoint_lsn", &cval)); /* If there is no checkpoint logged for the file, apply everything. */ if (cval.type != WT_CONFIG_ITEM_STRUCT) WT_INIT_LSN(&lsn); else if (sscanf(cval.str, "(%" SCNu32 ",%" SCNdMAX ")", &lsn.file, &offset) == 2) lsn.offset = offset; else WT_RET_MSG(r->session, EINVAL, "Failed to parse checkpoint LSN '%.*s'", (int)cval.len, cval.str); r->files[fileid].ckpt_lsn = lsn; WT_RET(__wt_verbose(r->session, WT_VERB_RECOVERY, "Recovering %s with id %u @ (%" PRIu32 ", %" PRIu64 ")", uri, fileid, lsn.file, lsn.offset)); return (0); }
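/*
 * Example (not part of the WiredTiger tree): parsing a checkpoint LSN
 * serialized as "(file,offset)", as the sscanf call above does. The
 * struct here is illustrative, not the real WT_LSN layout.
 */
#include <errno.h>
#include <inttypes.h>
#include <stdio.h>

struct lsn {
	uint32_t file;
	uint64_t offset;
};

static int
parse_lsn(const char *str, struct lsn *lsnp)
{
	intmax_t offset;

	if (sscanf(str,
	    "(%" SCNu32 ",%" SCNdMAX ")", &lsnp->file, &offset) != 2)
		return (EINVAL);
	lsnp->offset = (uint64_t)offset;
	return (0);
}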
/* * __wt_metadata_search -- * Return a copied row from the metadata. * The caller is responsible for freeing the allocated memory. */ int __wt_metadata_search(WT_SESSION_IMPL *session, const char *key, char **valuep) { WT_CURSOR *cursor; WT_DECL_RET; const char *value; *valuep = NULL; WT_RET(__wt_verbose(session, WT_VERB_METADATA, "Search: key: %s, tracking: %s, %s" "turtle", key, WT_META_TRACKING(session) ? "true" : "false", __metadata_turtle(key) ? "" : "not ")); if (__metadata_turtle(key)) return (__wt_turtle_read(session, key, valuep)); /* * All metadata reads are at read-uncommitted isolation. That's * because once a schema-level operation completes, subsequent * operations must see the current version of checkpoint metadata, or * they may try to read blocks that may have been freed from a file. * Metadata updates use non-transactional techniques (such as the * schema and metadata locks) to protect access to in-flight updates. */ WT_RET(__wt_metadata_cursor(session, &cursor)); cursor->set_key(cursor, key); WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, ret = cursor->search(cursor)); WT_ERR(ret); WT_ERR(cursor->get_value(cursor, &value)); WT_ERR(__wt_strdup(session, value, valuep)); err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); if (ret != 0) __wt_free(session, *valuep); return (ret); }
/* * __wt_fopen -- * Open a FILE handle. */ int __wt_fopen(WT_SESSION_IMPL *session, const char *name, WT_FHANDLE_MODE mode_flag, u_int flags, FILE **fpp) { WT_DECL_RET; const char *mode, *path; char *pathbuf; WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: fopen", name)); pathbuf = NULL; if (LF_ISSET(WT_FOPEN_FIXED)) path = name; else { WT_RET(__wt_filename(session, name, &pathbuf)); path = pathbuf; } mode = NULL; switch (mode_flag) { case WT_FHANDLE_APPEND: mode = WT_FOPEN_APPEND; break; case WT_FHANDLE_READ: mode = WT_FOPEN_READ; break; case WT_FHANDLE_WRITE: mode = WT_FOPEN_WRITE; break; } *fpp = fopen(path, mode); if (*fpp == NULL) ret = __wt_errno(); if (pathbuf != NULL) __wt_free(session, pathbuf); if (ret == 0) return (0); WT_RET_MSG(session, ret, "%s: fopen", name); }
/* * __wt_remove -- * Remove a file. */ int __wt_remove(WT_SESSION_IMPL *session, const char *name) { WT_DECL_RET; char *path; WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: remove", name)); __remove_file_check(session, name); WT_RET(__wt_filename(session, name, &path)); WT_SYSCALL_RETRY(remove(path), ret); __wt_free(session, path); if (ret == 0 || ret == ENOENT) return (0); WT_RET_MSG(session, ret, "%s: remove", name); }
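/*
 * Example (not part of the WiredTiger tree): a simplified sketch of the
 * retry pattern behind WT_SYSCALL_RETRY; the real macro retries a small
 * fixed list of transient errors, here reduced to EINTR, and the bound
 * of 10 attempts is illustrative.
 */
#include <errno.h>
#include <stdio.h>

static int
remove_retry(const char *path)
{
	int i, ret;

	for (ret = 0, i = 0; i < 10; i++) {
		if (remove(path) == 0)
			return (0);
		ret = errno;
		if (ret != EINTR)	/* Only retry interrupted calls. */
			break;
	}
	return (ret);
}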
/* * __ovfl_txnc_verbose -- * Dump information about a transaction-cached overflow record. */ static int __ovfl_txnc_verbose(WT_SESSION_IMPL *session, WT_PAGE *page, WT_OVFL_TXNC *txnc, const char *tag) { WT_DECL_ITEM(tmp); WT_DECL_RET; WT_RET(__wt_scr_alloc(session, 64, &tmp)); WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW, "txn-cache: %s%s%p %s %" PRIu64 " {%.*s}", tag == NULL ? "" : tag, tag == NULL ? "" : ": ", page, __wt_addr_string( session, WT_OVFL_TXNC_ADDR(txnc), txnc->addr_size, tmp), txnc->current, WT_MIN(txnc->value_size, 40), (char *)WT_OVFL_TXNC_VALUE(txnc))); err: __wt_scr_free(session, &tmp); return (ret); }
/* * __wt_log_truncate_files -- * Truncate log files via archive once. Requires that the server is not * currently running. */ int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; uint32_t backup_file; bool locked; WT_UNUSED(cfg); conn = S2C(session); if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); if (F_ISSET(conn, WT_CONN_SERVER_RUN) && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) WT_RET_MSG(session, EINVAL, "Attempt to archive manually while a server is running"); log = conn->log; backup_file = 0; if (cursor != NULL) backup_file = WT_CURSOR_BACKUP_ID(cursor); WT_ASSERT(session, backup_file <= log->alloc_lsn.l.file); WT_RET(__wt_verbose(session, WT_VERB_LOG, "log_truncate_files: Archive once up to %" PRIu32, backup_file)); WT_RET(__wt_writelock(session, log->log_archive_lock)); locked = true; WT_ERR(__log_archive_once(session, backup_file)); WT_ERR(__wt_writeunlock(session, log->log_archive_lock)); locked = false; err: if (locked) WT_TRET(__wt_writeunlock(session, log->log_archive_lock)); return (ret); }
/* * __wt_rename -- * Rename a file. */ int __wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to) { WT_DECL_RET; uint32_t lasterror; char *from_path, *to_path; WT_RET(__wt_verbose( session, WT_VERB_FILEOPS, "rename %s to %s", from, to)); from_path = to_path = NULL; WT_RET(__wt_filename(session, from, &from_path)); WT_TRET(__wt_filename(session, to, &to_path)); /* * Check if the file exists, since Windows does not overwrite the file * if it exists. */ if ((ret = GetFileAttributesA(to_path)) != INVALID_FILE_ATTRIBUTES) { if ((ret = DeleteFileA(to_path)) == FALSE) { lasterror = GetLastError(); goto err; } } if ((ret = MoveFileA(from_path, to_path)) == FALSE) lasterror = GetLastError(); err: __wt_free(session, from_path); __wt_free(session, to_path); if (ret != FALSE) return (0); WT_RET_MSG(session, lasterror, "MoveFile %s to %s", from, to); }
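/*
 * Example (not part of the WiredTiger tree): the check-delete-move
 * sequence above exists because MoveFileA won't replace an existing
 * target; MoveFileExA with MOVEFILE_REPLACE_EXISTING folds the three
 * steps into one call. Sketch only, Windows-specific.
 */
#include <windows.h>

static int
rename_replace(const char *from, const char *to)
{
	/* Replace the target if it already exists. */
	if (MoveFileExA(from, to, MOVEFILE_REPLACE_EXISTING) == 0)
		return ((int)GetLastError());
	return (0);
}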
/* * __ovfl_discard_verbose -- * Dump information about a discard overflow record. */ static int __ovfl_discard_verbose( WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, const char *tag) { WT_CELL_UNPACK *unpack, _unpack; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_RET(__wt_scr_alloc(session, 512, &tmp)); unpack = &_unpack; __wt_cell_unpack(cell, unpack); WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW, "discard: %s%s%p %s", tag == NULL ? "" : tag, tag == NULL ? "" : ": ", page, __wt_addr_string(session, unpack->data, unpack->size, tmp))); err: __wt_scr_free(session, &tmp); return (ret); }
/* * __wt_remove -- * Remove a file. */ int __wt_remove(WT_SESSION_IMPL *session, const char *name) { WT_DECL_RET; char *path; uint32_t lasterror; WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: remove", name)); __remove_file_check(session, name); WT_RET(__wt_filename(session, name, &path)); if ((ret = DeleteFileA(path)) == FALSE) lasterror = __wt_errno(); __wt_free(session, path); if (ret != FALSE) return (0); WT_RET_MSG(session, lasterror, "%s: remove", name); }
/* * __wt_thread_group_create -- * Create a new thread group, assumes incoming group structure is * zero initialized. */ int __wt_thread_group_create( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, const char *name, uint32_t min, uint32_t max, uint32_t flags, int (*run_func)(WT_SESSION_IMPL *session, WT_THREAD *context)) { WT_DECL_RET; bool cond_alloced; /* Check that the structure is initialized as expected */ WT_ASSERT(session, group->alloc == 0); cond_alloced = false; __wt_verbose(session, WT_VERB_THREAD_GROUP, "Creating thread group: %p", (void *)group); WT_RET(__wt_rwlock_alloc(session, &group->lock, "Thread group")); WT_ERR(__wt_cond_alloc( session, "Thread group cond", false, &group->wait_cond)); cond_alloced = true; __wt_writelock(session, group->lock); group->run_func = run_func; group->name = name; WT_TRET(__thread_group_resize(session, group, min, max, flags)); __wt_writeunlock(session, group->lock); /* Cleanup on error to avoid leaking resources */ err: if (ret != 0) { if (cond_alloced) WT_TRET(__wt_cond_destroy(session, &group->wait_cond)); __wt_rwlock_destroy(session, &group->lock); } return (ret); }
/* * __wt_mmap -- * Map a file into memory. */ int __wt_mmap(WT_SESSION_IMPL *session, WT_FH *fh, void *mapp, size_t *lenp) { void *map; WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: map %" PRIuMAX " bytes", fh->name, (uintmax_t)fh->size)); if ((map = mmap(NULL, (size_t)fh->size, PROT_READ, #ifdef MAP_NOCORE MAP_NOCORE | #endif MAP_PRIVATE, fh->fd, (off_t)0)) == MAP_FAILED) { WT_RET_MSG(session, __wt_errno(), "%s map error: failed to map %" PRIuMAX " bytes", fh->name, (uintmax_t)fh->size); } *(void **)mapp = map; *lenp = (size_t)fh->size; return (0); }
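/*
 * Example (not part of the WiredTiger tree): a minimal read-only,
 * private file mapping in the style of __wt_mmap; the caller is
 * responsible for munmap(map, len), and error strings are omitted.
 */
#include <sys/mman.h>
#include <sys/stat.h>
#include <errno.h>

static int
map_readonly(int fd, void **mapp, size_t *lenp)
{
	struct stat sb;
	void *map;

	if (fstat(fd, &sb) != 0)
		return (errno);
	if ((map = mmap(NULL, (size_t)sb.st_size,
	    PROT_READ, MAP_PRIVATE, fd, (off_t)0)) == MAP_FAILED)
		return (errno);
	*mapp = map;
	*lenp = (size_t)sb.st_size;
	return (0);
}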
/* * __wt_metadata_remove -- * Remove a row from the metadata. */ int __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key) { WT_CURSOR *cursor; WT_DECL_RET; WT_RET(__wt_verbose(session, WT_VERB_METADATA, "Remove: key: %s, tracking: %s, %s" "turtle", key, WT_META_TRACKING(session) ? "true" : "false", __metadata_turtle(key) ? "" : "not ")); if (__metadata_turtle(key)) WT_RET_MSG(session, EINVAL, "%s: remove not supported on the turtle file", key); WT_RET(__wt_metadata_cursor(session, &cursor)); cursor->set_key(cursor, key); WT_ERR(cursor->search(cursor)); if (WT_META_TRACKING(session)) WT_ERR(__wt_meta_track_update(session, key)); WT_ERR(cursor->remove(cursor)); err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); }
/* * __wt_rename -- * Rename a file, calling the rename system call. */ int __wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to) { WT_DECL_RET; char *from_path, *to_path; WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "rename %s to %s", from, to)); from_path = to_path = NULL; /* Build and validate the full paths; on failure, skip the rename. */ WT_RET(__wt_filename(session, from, &from_path)); WT_TRET(__wt_filename(session, to, &to_path)); if (ret == 0) WT_SYSCALL_RETRY(rename(from_path, to_path), ret); __wt_free(session, from_path); __wt_free(session, to_path); if (ret == 0) return (0); WT_RET_MSG(session, ret, "rename %s to %s", from, to); }
/* * __wt_block_close -- * Close a block handle. */ int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; if (block == NULL) /* Safety check */ return (0); conn = S2C(session); WT_TRET(__wt_verbose(session, WT_VERB_BLOCK, "close: %s", block->name == NULL ? "" : block->name)); __wt_spin_lock(session, &conn->block_lock); /* Reference count is initialized to 1. */ if (block->ref == 0 || --block->ref == 0) WT_TRET(__block_destroy(session, block)); __wt_spin_unlock(session, &conn->block_lock); return (ret); }
/* * __ckpt_process -- * Process the list of checkpoints. */ static int __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) { WT_BLOCK_CKPT *a, *b, *ci; WT_CKPT *ckpt, *next_ckpt; WT_DECL_ITEM(tmp); WT_DECL_RET; uint64_t ckpt_size; bool deleting, fatal, locked; ci = &block->live; fatal = locked = false; #ifdef HAVE_DIAGNOSTIC WT_RET(__ckpt_verify(session, ckptbase)); #endif /* * Checkpoints are a two-step process: first, write a new checkpoint to * disk (including all the new extent lists for modified checkpoints * and the live system). As part of this, create a list of file blocks * newly available for reallocation, based on checkpoints being deleted. * We then return the locations of the new checkpoint information to our * caller. Our caller has to write that information into some kind of * stable storage, and once that's done, we can actually allocate from * that list of newly available file blocks. (We can't allocate from * that list immediately because the allocation might happen before our * caller saves the new checkpoint information, and if we crashed before * the new checkpoint location was saved, we'd have overwritten blocks * still referenced by checkpoints in the system.) In summary, there is * a second step: after our caller saves the checkpoint information, we * are called to add the newly available blocks into the live system's * available list. * * This function is the first step, the second step is in the resolve * function. * * If we're called to checkpoint the same file twice (without the second * resolution step), or re-entered for any reason, it's an error in our * caller, and our choices are all bad: leak blocks or potentially crash * with our caller not yet having saved previous checkpoint information * to stable storage. */ __wt_spin_lock(session, &block->live_lock); if (block->ckpt_inprogress) ret = __wt_block_panic(session, EINVAL, "%s: unexpected checkpoint ordering", block->name); else block->ckpt_inprogress = true; __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); /* * Extents newly available as a result of deleting previous checkpoints * are added to a list of extents. The list should be empty, but as * described above, there is no "free the checkpoint information" call * into the block manager; if there was an error in an upper level that * resulted in some previous checkpoint never being resolved, the list * may not be empty. We should have caught that with the "checkpoint * in progress" test, but it doesn't cost us anything to be cautious. * * We free the checkpoint's allocation and discard extent lists as part * of the resolution step, not because they're needed at that time, but * because it's potentially a lot of work, and waiting allows the btree * layer to continue eviction sooner. As for the checkpoint-available * list, make sure they get cleaned out. */ __wt_block_extlist_free(session, &ci->ckpt_avail); WT_RET(__wt_block_extlist_init( session, &ci->ckpt_avail, "live", "ckpt_avail", true)); __wt_block_extlist_free(session, &ci->ckpt_alloc); __wt_block_extlist_free(session, &ci->ckpt_discard); /* * To delete a checkpoint, we'll need checkpoint information for it and * the subsequent checkpoint into which it gets rolled; read them from * disk before we lock things down. 
*/ deleting = false; WT_CKPT_FOREACH(ckptbase, ckpt) { if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; deleting = true; /* * Read the checkpoint and next checkpoint extent lists if we * haven't already read them (we may have already read these * extent blocks if there is more than one deleted checkpoint). */ if (ckpt->bpriv == NULL) WT_ERR(__ckpt_extlist_read(session, block, ckpt)); for (next_ckpt = ckpt + 1;; ++next_ckpt) if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) break; /* * The "next" checkpoint may be the live tree which has no * extent blocks to read. */ if (next_ckpt->bpriv == NULL && !F_ISSET(next_ckpt, WT_CKPT_ADD)) WT_ERR(__ckpt_extlist_read(session, block, next_ckpt)); } /* * Failures are now fatal: we can't currently back out the merge of any * deleted checkpoint extent lists into the live system's extent lists, * so continuing after error would leave the live system's extent lists * corrupted for any subsequent checkpoint (and potentially, should a * subsequent checkpoint succeed, for recovery). */ fatal = true; /* * Hold a lock so the live extent lists and the file size can't change * underneath us. I suspect we'll tighten this if checkpoints take too * much time away from real work: we read the historic checkpoint * information without a lock, but we could also merge and re-write the * deleted and merged checkpoint information without a lock, except for * the final merge of ranges into the live tree. */ __wt_spin_lock(session, &block->live_lock); locked = true; /* * We've allocated our last page, update the checkpoint size. We need * to calculate the live system's checkpoint size before merging * checkpoint allocation and discard information from the checkpoints * we're deleting, those operations change the underlying byte counts. */ ckpt_size = ci->ckpt_size; ckpt_size += ci->alloc.bytes; ckpt_size -= ci->discard.bytes; /* Skip the additional processing if we aren't deleting checkpoints. */ if (!deleting) goto live_update; /* * Delete any no-longer-needed checkpoints: we do this first as it frees * blocks to the live lists, and the freed blocks will then be included * when writing the live extent lists. */ WT_CKPT_FOREACH(ckptbase, ckpt) { if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; #ifdef HAVE_VERBOSE if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string( session, block, ckpt->raw.data, tmp)); __wt_verbose(session, WT_VERB_CHECKPOINT, "%s: delete-checkpoint: %s: %s", block->name, ckpt->name, (const char *)tmp->data); } #endif /* * Find the checkpoint into which we'll roll this checkpoint's * blocks: it's the next real checkpoint in the list, and it * better have been read in (if it's not the add slot). */ for (next_ckpt = ckpt + 1;; ++next_ckpt) if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) break; /* * Set the from/to checkpoint structures, where the "to" value * may be the live tree. */ a = ckpt->bpriv; if (F_ISSET(next_ckpt, WT_CKPT_ADD)) b = &block->live; else b = next_ckpt->bpriv; /* * Free the root page: there's nothing special about this free, * the root page is allocated using normal rules, that is, it * may have been taken from the avail list, and was entered on * the live system's alloc list at that time. We free it into * the checkpoint's discard list, however, not the live system's * list because it appears on the checkpoint's alloc list and so * must be paired in the checkpoint. 
*/ if (a->root_offset != WT_BLOCK_INVALID_OFFSET) WT_ERR(__wt_block_insert_ext(session, block, &a->discard, a->root_offset, a->root_size)); /* * Free the blocks used to hold the "from" checkpoint's extent * lists, including the avail list. */ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc)); WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail)); WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard)); /* * Roll the "from" alloc and discard extent lists into the "to" * checkpoint's lists. */ if (a->alloc.entries != 0) WT_ERR(__wt_block_extlist_merge( session, block, &a->alloc, &b->alloc)); if (a->discard.entries != 0) WT_ERR(__wt_block_extlist_merge( session, block, &a->discard, &b->discard)); /* * If the "to" checkpoint is also being deleted, we're done with * it, it's merged into some other checkpoint in the next loop. * This means the extent lists may aggregate over a number of * checkpoints, but that's OK, they're disjoint sets of ranges. */ if (F_ISSET(next_ckpt, WT_CKPT_DELETE)) continue; /* * Find blocks for re-use: wherever the "to" checkpoint's * allocate and discard lists overlap, move the range to * the live system's checkpoint available list. */ WT_ERR(__wt_block_extlist_overlap(session, block, b)); /* * If we're updating the live system's information, we're done. */ if (F_ISSET(next_ckpt, WT_CKPT_ADD)) continue; /* * We have to write the "to" checkpoint's extent lists out in * new blocks, and update its cookie. * * Free the blocks used to hold the "to" checkpoint's extent * lists; don't include the avail list, it's not changing. */ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc)); WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard)); F_SET(next_ckpt, WT_CKPT_UPDATE); } /* Update checkpoints marked for update. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_UPDATE)) WT_ERR(__ckpt_update( session, block, ckpt, ckpt->bpriv, false)); live_update: /* Truncate the file if that's possible. */ WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail)); /* Update the final, added checkpoint based on the live system. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_ADD)) { /* * !!! * Our caller wants the final checkpoint size. Setting * the size here violates layering, but the alternative * is a call for the btree layer to crack the checkpoint * cookie into its components, and that's a fair amount * of work. */ ckpt->ckpt_size = ckpt_size; /* * Set the rolling checkpoint size for the live system. * The current size includes the current checkpoint's * root page size (root pages are on the checkpoint's * block allocation list as root pages are allocated * with the usual block allocation functions). That's * correct, but we don't want to include it in the size * for the next checkpoint. */ ckpt_size -= ci->root_size; /* * Additionally, we had a bug for a while where the live * checkpoint size grew without bound. We can't sanity * check the value, that would require walking the tree * as part of the checkpoint. Bound any bug at the size * of the file. * It isn't practical to assert that the value is within * bounds since databases created with older versions * of WiredTiger (2.8.0) would likely see an error. */ ci->ckpt_size = WT_MIN(ckpt_size, (uint64_t)block->size); WT_ERR(__ckpt_update(session, block, ckpt, ci, true)); } /* * Reset the live system's alloc and discard extent lists, leave the * avail list alone. 
This includes freeing a lot of extents, so do it * outside of the system's lock by copying and resetting the original, * then doing the work later. */ ci->ckpt_alloc = ci->alloc; WT_ERR(__wt_block_extlist_init( session, &ci->alloc, "live", "alloc", false)); ci->ckpt_discard = ci->discard; WT_ERR(__wt_block_extlist_init( session, &ci->discard, "live", "discard", false)); #ifdef HAVE_DIAGNOSTIC /* * The first checkpoint in the system should always have an empty * discard list. If we've read that checkpoint and/or created it, * check. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (!F_ISSET(ckpt, WT_CKPT_DELETE)) break; if ((a = ckpt->bpriv) == NULL) a = &block->live; if (a->discard.entries != 0) WT_ERR_MSG(session, WT_ERROR, "first checkpoint incorrectly has blocks on the discard " "list"); #endif err: if (ret != 0 && fatal) ret = __wt_block_panic(session, ret, "%s: fatal checkpoint failure", block->name); if (locked) __wt_spin_unlock(session, &block->live_lock); /* Discard any checkpoint information we loaded. */ WT_CKPT_FOREACH(ckptbase, ckpt) if ((ci = ckpt->bpriv) != NULL) __wt_block_ckpt_destroy(session, ci); __wt_scr_free(session, &tmp); return (ret); }
/* * __wt_lsm_get_chunk_to_flush -- * Find and pin a chunk in the LSM tree that is likely to need flushing. */ int __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool force, WT_LSM_CHUNK **chunkp) { WT_DECL_RET; WT_LSM_CHUNK *chunk, *evict_chunk, *flush_chunk; u_int i; *chunkp = NULL; chunk = evict_chunk = flush_chunk = NULL; WT_ASSERT(session, lsm_tree->queue_ref > 0); WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE) || lsm_tree->nchunks == 0) return (__wt_lsm_tree_readunlock(session, lsm_tree)); /* Search for a chunk to evict and/or a chunk to flush. */ for (i = 0; i < lsm_tree->nchunks; i++) { chunk = lsm_tree->chunk[i]; if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { /* * Normally we don't want to force out the last chunk. * But if we're doing a forced flush on behalf of a * compact, then we want to include the final chunk. */ if (evict_chunk == NULL && !chunk->evicted && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE)) evict_chunk = chunk; } else if (flush_chunk == NULL && chunk->switch_txn != 0 && (force || i < lsm_tree->nchunks - 1)) flush_chunk = chunk; } /* * Don't be overly zealous about pushing old chunks from cache. * Attempting too many drops can interfere with checkpoints. * * If retrying a discard push an additional work unit so there are * enough to trigger checkpoints. */ if (evict_chunk != NULL && flush_chunk != NULL) { chunk = (__wt_random(&session->rnd) & 1) ? evict_chunk : flush_chunk; WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_FLUSH, 0, lsm_tree)); } else chunk = (evict_chunk != NULL) ? evict_chunk : flush_chunk; if (chunk != NULL) { WT_ERR(__wt_verbose(session, WT_VERB_LSM, "Flush%s: return chunk %u of %u: %s", force ? " w/ force" : "", i, lsm_tree->nchunks, chunk->uri)); (void)__wt_atomic_add32(&chunk->refcnt, 1); } err: WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree)); *chunkp = chunk; return (ret); }
/* * __lsm_bloom_create -- * Create a bloom filter for a chunk of the LSM tree that has been * checkpointed but not yet merged. */ static int __lsm_bloom_create(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off) { WT_BLOOM *bloom; WT_CURSOR *src; WT_DECL_RET; WT_ITEM key; uint64_t insert_count; WT_RET(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk)); bloom = NULL; /* * This is merge-like activity, and we don't want compacts to give up * because we are creating a bunch of bloom filters before merging. */ ++lsm_tree->merge_progressing; WT_RET(__wt_bloom_create(session, chunk->bloom_uri, lsm_tree->bloom_config, chunk->count, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom)); /* Open a special merge cursor just on this chunk. */ WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src)); F_SET(src, WT_CURSTD_RAW); WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1)); /* * Set up so that we don't hold pages we read into cache, and so * that we don't get stuck if the cache is full. If we allow * ourselves to get stuck creating bloom filters, the entire tree * can stall since there may be no worker threads available to flush. */ F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { WT_ERR(src->get_key(src, &key)); WT_ERR(__wt_bloom_insert(bloom, &key)); } WT_ERR_NOTFOUND_OK(ret); WT_TRET(src->close(src)); WT_TRET(__wt_bloom_finalize(bloom)); WT_ERR(ret); F_CLR(session, WT_SESSION_NO_CACHE); /* Load the new Bloom filter into cache. */ WT_CLEAR(key); WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key)); WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker created bloom filter %s. " "Expected %" PRIu64 " items, got %" PRIu64, chunk->bloom_uri, chunk->count, insert_count)); /* Ensure the bloom filter is in the metadata. */ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); F_SET(chunk, WT_LSM_CHUNK_BLOOM); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM bloom worker metadata write"); err: if (bloom != NULL) WT_TRET(__wt_bloom_close(bloom)); F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); return (ret); }
/* * __wt_lsm_checkpoint_chunk -- * Flush a single LSM chunk to disk. */ int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) { WT_DECL_RET; WT_TXN_ISOLATION saved_isolation; bool flush_set; flush_set = false; /* * If the chunk is already checkpointed, make sure it is also evicted. * Either way, there is no point trying to checkpoint it again. */ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) { WT_WITH_HANDLE_LIST_LOCK(session, ret = __lsm_discard_handle(session, chunk->uri, NULL)); if (ret == 0) chunk->evicted = 1; else if (ret == EBUSY) ret = 0; else WT_RET_MSG(session, ret, "discard handle"); } if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker %s already on disk", chunk->uri)); return (0); } /* Stop if a running transaction needs the chunk. */ __wt_txn_update_oldest(session, true); if (chunk->switch_txn == WT_TXN_NONE || !__wt_txn_visible_all(session, chunk->switch_txn)) { WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker %s: running transaction, return", chunk->uri)); return (0); } if (!__wt_atomic_cas8(&chunk->flushing, 0, 1)) return (0); flush_set = true; WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s", chunk->uri)); /* * Flush the file before checkpointing: this is the expensive part in * terms of I/O. * * !!! * We can wait here for checkpoints and fsyncs to complete, which can * take a long time. */ if ((ret = __wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)) == 0) { /* * Set read-uncommitted: we have already checked that all of the * updates in this chunk are globally visible, use the cheapest * possible check in reconciliation. */ saved_isolation = session->txn.isolation; session->txn.isolation = WT_ISO_READ_UNCOMMITTED; ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; WT_TRET(__wt_session_release_btree(session)); } WT_ERR(ret); WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s", chunk->uri)); /* * Turn on metadata tracking to ensure the checkpoint gets the * necessary handle locks. * * Ensure that we don't race with a running checkpoint: the checkpoint * lock protects against us racing with an application checkpoint in * this chunk. Don't wait for it, though: checkpoints can take a long * time, and our checkpoint operation should be very quick. */ WT_ERR(__wt_meta_track_on(session)); WT_WITH_CHECKPOINT_LOCK(session, ret, WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_schema_worker( session, chunk->uri, __wt_checkpoint, NULL, NULL, 0))); WT_TRET(__wt_meta_track_off(session, false, ret != 0)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM checkpoint"); /* Now the file is written, get the chunk size. */ WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); /* Update the flush timestamp to help track ongoing progress. */ WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts)); ++lsm_tree->chunks_flushed; /* Lock the tree, mark the chunk as on disk and update the metadata. */ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); F_SET(chunk, WT_LSM_CHUNK_ONDISK); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; /* Update the throttle time. 
*/ __wt_lsm_tree_throttle(session, lsm_tree, true); WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM metadata write"); WT_PUBLISH(chunk->flushing, 0); flush_set = false; /* * Clear the no-eviction flag so the primary can be evicted and * eventually closed. Only do this once the checkpoint has succeeded: * otherwise, accessing the leaf page during the checkpoint can trigger * forced eviction. */ WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0)); __wt_btree_evictable(session, true); WT_ERR(__wt_session_release_btree(session)); /* Make sure we aren't pinning a transaction ID. */ __wt_txn_release_snapshot(session); WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s", chunk->uri)); /* Schedule a bloom filter create for our newly flushed chunk. */ if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF)) WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_BLOOM, 0, lsm_tree)); else WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_MERGE, 0, lsm_tree)); err: if (flush_set) WT_PUBLISH(chunk->flushing, 0); return (ret); }
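/*
 * Example (not part of the WiredTiger tree): the claim/release pattern
 * used for chunk->flushing above, written with C11 atomics instead of
 * the __wt_atomic_cas8 and WT_PUBLISH macros.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_uchar flushing;

static bool
flush_claim(void)
{
	unsigned char expected = 0;

	/* Compare-and-swap 0 -> 1: exactly one thread wins the flush. */
	return (atomic_compare_exchange_strong(&flushing, &expected, 1));
}

static void
flush_release(void)
{
	/* A release store publishes the flush's side effects to readers. */
	atomic_store_explicit(&flushing, 0, memory_order_release);
}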
/* * __wt_txn_recover -- * Run recovery. */ int __wt_txn_recover(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_CURSOR *metac; WT_DECL_RET; WT_RECOVERY r; struct WT_RECOVERY_FILE *metafile; char *config; bool eviction_started, needs_rec, was_backup; conn = S2C(session); WT_CLEAR(r); WT_INIT_LSN(&r.ckpt_lsn); eviction_started = false; was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP); /* We need a real session for recovery. */ WT_RET(__wt_open_internal_session(conn, "txn-recover", false, WT_SESSION_NO_LOGGING, &session)); r.session = session; F_SET(conn, WT_CONN_RECOVERING); WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac)); metafile = &r.files[WT_METAFILE_ID]; metafile->c = metac; /* * If no log was found (including if logging is disabled), or if the * last checkpoint was done with logging disabled, recovery should not * run. Scan the metadata to figure out the largest file ID. */ if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) || WT_IS_MAX_LSN(&metafile->ckpt_lsn)) { WT_ERR(__recovery_file_scan(&r)); conn->next_file_id = r.max_fileid; goto done; } /* * First, do a pass through the log to recover the metadata, and * establish the last checkpoint LSN. Skip this when opening a hot * backup: we already have the correct metadata in that case. */ if (!was_backup) { r.metadata_only = true; /* * If this is a read-only connection, check if the checkpoint * LSN in the metadata file is up to date, indicating a clean * shutdown. */ if (F_ISSET(conn, WT_CONN_READONLY)) { WT_ERR(__wt_log_needs_recovery( session, &metafile->ckpt_lsn, &needs_rec)); if (needs_rec) WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery"); } if (WT_IS_INIT_LSN(&metafile->ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r)); else { /* * Start at the last checkpoint LSN referenced in the * metadata. If we see the end of a checkpoint while * scanning, we will change the full scan to start from * there. */ r.ckpt_lsn = metafile->ckpt_lsn; ret = __wt_log_scan(session, &metafile->ckpt_lsn, 0, __txn_log_recover, &r); if (ret == ENOENT) ret = 0; WT_ERR(ret); } } /* Scan the metadata to find the live files and their IDs. */ WT_ERR(__recovery_file_scan(&r)); /* * We no longer need the metadata cursor: close it to avoid pinning any * resources that could block eviction during recovery. */ r.files[0].c = NULL; WT_ERR(metac->close(metac)); /* * Now, recover all the files apart from the metadata. * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. */ r.metadata_only = false; WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY, "Main recovery loop: starting at %" PRIu32 "/%" PRIu32, r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset)); WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); /* * Check if the database was shut down cleanly. If not * return an error if the user does not want automatic * recovery. */ if (needs_rec && (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) || F_ISSET(conn, WT_CONN_READONLY))) { if (F_ISSET(conn, WT_CONN_READONLY)) WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery"); WT_ERR(WT_RUN_RECOVERY); } if (F_ISSET(conn, WT_CONN_READONLY)) goto done; /* * Recovery can touch more data than fits in cache, so it relies on * regular eviction to manage paging. Start eviction threads for * recovery without LAS cursors. 
*/ WT_ERR(__wt_evict_create(session)); eviction_started = true; /* * As long as this is not a read-only connection, always run recovery, * even after a clean shutdown. We can consider skipping it in the * future. */ if (WT_IS_INIT_LSN(&r.ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); else { ret = __wt_log_scan(session, &r.ckpt_lsn, WT_LOGSCAN_RECOVER, __txn_log_recover, &r); if (ret == ENOENT) ret = 0; WT_ERR(ret); } conn->next_file_id = r.max_fileid; /* * If recovery ran successfully, forcibly log a checkpoint so the next * open is fast, and keep the metadata up to date with the checkpoint * LSN and archiving. */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); done: FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE); err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); if (ret != 0) __wt_err(session, ret, "Recovery failed"); /* * Destroy the eviction threads that were started in support of * recovery. They will be restarted once the lookaside table is * created. */ if (eviction_started) WT_TRET(__wt_evict_destroy(session)); WT_TRET(session->iface.close(&session->iface, NULL)); F_CLR(conn, WT_CONN_RECOVERING); return (ret); }
/* * __wt_open -- * Open a file handle. */ int __wt_open(WT_SESSION_IMPL *session, const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp) { DWORD dwCreationDisposition; HANDLE filehandle, filehandle_secondary; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_FH *fh, *tfh; uint64_t bucket, hash; int direct_io, f, matched, share_mode; char *path; conn = S2C(session); fh = NULL; path = NULL; filehandle = INVALID_HANDLE_VALUE; filehandle_secondary = INVALID_HANDLE_VALUE; direct_io = 0; hash = __wt_hash_city64(name, strlen(name)); bucket = hash % WT_HASH_ARRAY_SIZE; WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name)); /* Increment the reference count if we already have the file open. */ matched = 0; __wt_spin_lock(session, &conn->fh_lock); SLIST_FOREACH(tfh, &conn->fhhash[bucket], l) if (strcmp(name, tfh->name) == 0) { ++tfh->ref; *fhp = tfh; matched = 1; break; } __wt_spin_unlock(session, &conn->fh_lock); if (matched) return (0); /* For directories, create empty file handles with invalid handles. */ if (dio_type == WT_FILE_TYPE_DIRECTORY) { goto setupfh; } WT_RET(__wt_filename(session, name, &path)); share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE; /* * Security: * The application may spawn a new process, and we don't want another * process to have access to our file handles. * * TODO: Set tighter file permissions but set bInheritHandle to false * to prevent inheritance */ f = FILE_ATTRIBUTE_NORMAL; dwCreationDisposition = 0; if (ok_create) { dwCreationDisposition = CREATE_NEW; if (exclusive) dwCreationDisposition = CREATE_ALWAYS; } else dwCreationDisposition = OPEN_EXISTING; if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) { f |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH; direct_io = 1; } if (dio_type == WT_FILE_TYPE_LOG && FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) { f |= FILE_FLAG_WRITE_THROUGH; } /* Disable read-ahead on trees: it slows down random read workloads. */ if (dio_type == WT_FILE_TYPE_DATA || dio_type == WT_FILE_TYPE_CHECKPOINT) f |= FILE_FLAG_RANDOM_ACCESS; filehandle = CreateFileA(path, (GENERIC_READ | GENERIC_WRITE), share_mode, NULL, dwCreationDisposition, f, NULL); if (filehandle == INVALID_HANDLE_VALUE) { if (GetLastError() == ERROR_FILE_EXISTS && ok_create) filehandle = CreateFileA(path, (GENERIC_READ | GENERIC_WRITE), share_mode, NULL, OPEN_EXISTING, f, NULL); if (filehandle == INVALID_HANDLE_VALUE) WT_ERR_MSG(session, __wt_errno(), direct_io ? "%s: open failed with direct I/O configured, some " "filesystem types do not support direct I/O" : "%s", path); } /* * Open a second handle to the file to support allocation/truncation * concurrently with reads on the file. Writes would also move the file * pointer. */ filehandle_secondary = CreateFileA(path, (GENERIC_READ | GENERIC_WRITE), share_mode, NULL, OPEN_EXISTING, f, NULL); if (filehandle_secondary == INVALID_HANDLE_VALUE) WT_ERR_MSG(session, __wt_errno(), "open failed for secondary handle: %s", path); setupfh: WT_ERR(__wt_calloc_one(session, &fh)); WT_ERR(__wt_strdup(session, name, &fh->name)); fh->name_hash = hash; fh->filehandle = filehandle; fh->filehandle_secondary = filehandle_secondary; fh->ref = 1; fh->direct_io = direct_io; /* Set the file's size. */ if (dio_type != WT_FILE_TYPE_DIRECTORY) WT_ERR(__wt_filesize(session, fh, &fh->size)); /* Configure file extension. */ if (dio_type == WT_FILE_TYPE_DATA || dio_type == WT_FILE_TYPE_CHECKPOINT) fh->extend_len = conn->data_extend_len; /* Configure fallocate/posix_fallocate calls. 
*/ __wt_fallocate_config(session, fh); /* * Repeat the check for a match, but then link onto the database's list * of files. */ matched = 0; __wt_spin_lock(session, &conn->fh_lock); SLIST_FOREACH(tfh, &conn->fhhash[bucket], l) if (strcmp(name, tfh->name) == 0) { ++tfh->ref; *fhp = tfh; matched = 1; break; } if (!matched) { WT_CONN_FILE_INSERT(conn, fh, bucket); WT_STAT_FAST_CONN_INCR(session, file_open); *fhp = fh; } __wt_spin_unlock(session, &conn->fh_lock); if (matched) { err: if (fh != NULL) { __wt_free(session, fh->name); __wt_free(session, fh); } if (filehandle != INVALID_HANDLE_VALUE) (void)CloseHandle(filehandle); if (filehandle_secondary != INVALID_HANDLE_VALUE) (void)CloseHandle(filehandle_secondary); } __wt_free(session, path); return (ret); }
/* * __log_archive_once -- * Perform one iteration of log archiving. Must be called with the * log archive lock held. */ static int __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; uint32_t lognum, min_lognum; u_int i, locked, logcount; char **logfiles; conn = S2C(session); log = conn->log; logcount = 0; logfiles = NULL; /* * If we're coming from a backup cursor we want the smaller of * the last full log file copied in backup or the checkpoint LSN. * Otherwise we want the minimum of the last log file written to * disk and the checkpoint LSN. */ if (backup_file != 0) min_lognum = WT_MIN(log->ckpt_lsn.file, backup_file); else min_lognum = WT_MIN(log->ckpt_lsn.file, log->sync_lsn.file); WT_RET(__wt_verbose(session, WT_VERB_LOG, "log_archive: archive to log number %" PRIu32, min_lognum)); /* * Main archive code. Get the list of all log files and * remove any earlier than the minimum log number. */ WT_RET(__wt_dirlist(session, conn->log_path, WT_LOG_FILENAME, WT_DIRLIST_INCLUDE, &logfiles, &logcount)); /* * We can only archive files if a hot backup is not in progress or * if we are the backup. */ WT_RET(__wt_readlock(session, conn->hot_backup_lock)); locked = 1; if (conn->hot_backup == 0 || backup_file != 0) { for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum( session, logfiles[i], &lognum)); if (lognum < min_lognum) WT_ERR(__wt_log_remove( session, WT_LOG_FILENAME, lognum)); } } WT_ERR(__wt_readunlock(session, conn->hot_backup_lock)); locked = 0; __wt_log_files_free(session, logfiles, logcount); logfiles = NULL; logcount = 0; /* * Indicate our new earliest LSN: the start of the log file * containing the last checkpoint. */ log->first_lsn.file = min_lognum; log->first_lsn.offset = 0; if (0) err: __wt_err(session, ret, "log archive server error"); if (locked) WT_TRET(__wt_readunlock(session, conn->hot_backup_lock)); if (logfiles != NULL) __wt_log_files_free(session, logfiles, logcount); return (ret); }
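/*
 * Example (not part of the WiredTiger tree): the archive threshold
 * selection above as a pure function; the parameter names are
 * illustrative. Log files numbered below the returned value are safe
 * to remove.
 */
#include <stdint.h>

static uint32_t
min_archive_lognum(
    uint32_t ckpt_file, uint32_t sync_file, uint32_t backup_file)
{
	uint32_t limit;

	/*
	 * With a backup cursor, keep everything after the last file the
	 * backup copied; otherwise, keep everything after the last file
	 * synced to disk. Never remove the checkpoint's log file.
	 */
	limit = backup_file != 0 ? backup_file : sync_file;
	return (ckpt_file < limit ? ckpt_file : limit);
}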
/* * __wt_block_checkpoint_load -- * Load a checkpoint. */ int __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint) { WT_BLOCK_CKPT *ci, _ci; WT_DECL_ITEM(tmp); WT_DECL_RET; uint8_t *endp; ci = NULL; /* * Sometimes we don't find a root page (we weren't given a checkpoint, * or the checkpoint was empty). In that case we return an empty root * address, set that up now. */ *root_addr_sizep = 0; #ifdef HAVE_VERBOSE if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { if (addr != NULL) { WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string(session, block, addr, tmp)); } __wt_verbose(session, WT_VERB_CHECKPOINT, "%s: load-checkpoint: %s", block->name, addr == NULL ? "[Empty]" : (const char *)tmp->data); } #endif /* * There's a single checkpoint in the file that can be written, all of * the others are read-only. We use the same initialization calls for * readonly checkpoints, but the information doesn't persist. */ if (checkpoint) { ci = &_ci; WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint")); } else { /* * We depend on the btree level for locking: things will go bad * fast if we open the live system in two handles, or salvage, * truncate or verify the live/running file. */ #ifdef HAVE_DIAGNOSTIC __wt_spin_lock(session, &block->live_lock); WT_ASSERT(session, block->live_open == false); block->live_open = true; __wt_spin_unlock(session, &block->live_lock); #endif ci = &block->live; WT_ERR(__wt_block_ckpt_init(session, ci, "live")); } /* * If the checkpoint has an on-disk root page, load it. Otherwise, size * the file past the description information. */ if (addr == NULL || addr_size == 0) ci->file_size = block->allocsize; else { /* Crack the checkpoint cookie. */ WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci)); /* Verify sets up next. */ if (block->verify) WT_ERR(__wt_verify_ckpt_load(session, block, ci)); /* Read any root page. */ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) { endp = root_addr; WT_ERR(__wt_block_addr_to_buffer(block, &endp, ci->root_offset, ci->root_size, ci->root_checksum)); *root_addr_sizep = WT_PTRDIFF(endp, root_addr); } /* * Rolling a checkpoint forward requires the avail list, the * blocks from which we can allocate. */ if (!checkpoint) WT_ERR(__wt_block_extlist_read_avail( session, block, &ci->avail, ci->file_size)); } /* * If the checkpoint can be written, that means anything written after * the checkpoint is no longer interesting, truncate the file. Don't * bother checking the avail list for a block at the end of the file, * that was done when the checkpoint was first written (re-writing the * checkpoint might possibly make it relevant here, but it's unlikely * enough I don't bother). */ if (!checkpoint) WT_ERR(__wt_block_truncate(session, block, ci->file_size)); if (0) { err: /* * Don't call checkpoint-unload: unload does real work including * file truncation. If we fail early enough that the checkpoint * information isn't correct, bad things would happen. The only * allocated memory was in the service of verify, clean that up. */ if (block->verify) WT_TRET(__wt_verify_ckpt_unload(session, block)); } /* Checkpoints don't need the original information, discard it. */ if (checkpoint && ci != NULL) __wt_block_ckpt_destroy(session, ci); __wt_scr_free(session, &tmp); return (ret); }
/* * __ckpt_update -- * Update a checkpoint. */ static int __ckpt_update(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci, bool is_live) { WT_DECL_ITEM(tmp); WT_DECL_RET; uint8_t *endp; #ifdef HAVE_DIAGNOSTIC /* Check the extent list combinations for overlaps. */ WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail)); WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail)); WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard)); #endif /* * Write the checkpoint's alloc and discard extent lists. After each * write, remove any allocated blocks from the system's allocation * list, checkpoint extent blocks don't appear on any extent lists. */ WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL)); WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL)); /* * We only write an avail list for the live system, other checkpoint's * avail lists are static and never change. * * Write the avail list last so it reflects changes due to allocating * blocks for the alloc and discard lists. Second, when we write the * live system's avail list, it's two lists: the current avail list * plus the list of blocks to be made available when the new checkpoint * completes. We can't merge that second list into the real list yet, * it's not truly available until the new checkpoint locations have been * saved to the metadata. */ if (is_live) WT_RET(__wt_block_extlist_write( session, block, &ci->avail, &ci->ckpt_avail)); /* * Set the file size for the live system. * * !!! * We do NOT set the file size when re-writing checkpoints because we * want to test the checkpoint's blocks against a reasonable maximum * file size during verification. This is bad: imagine a checkpoint * appearing early in the file, re-written, and then the checkpoint * requires blocks at the end of the file, blocks after the listed file * size. If the application opens that checkpoint for writing * (discarding subsequent checkpoints), we would truncate the file to * the early chunk, discarding the re-written checkpoint information. * The alternative, updating the file size has its own problems, in * that case we'd work correctly, but we'd lose all of the blocks * between the original checkpoint and the re-written checkpoint. * Currently, there's no API to roll-forward intermediate checkpoints, * if there ever is, this will need to be fixed. */ if (is_live) ci->file_size = block->size; /* * Copy the checkpoint information into the checkpoint array's address * cookie. */ WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE)); endp = ckpt->raw.mem; WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci)); ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem); if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp)); __wt_verbose(session, WT_VERB_CHECKPOINT, "%s: create-checkpoint: %s: %s", block->name, ckpt->name, (const char *)tmp->data); } err: __wt_scr_free(session, &tmp); return (ret); }
/* * __sync_file -- * Flush pages for a specific file. */ static int __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) { WT_BTREE *btree; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF *prev, *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; uint64_t oldest_id, saved_pinned_id, time_start, time_stop; uint32_t flags; bool timer, tried_eviction; conn = S2C(session); btree = S2BT(session); prev = walk = NULL; txn = &session->txn; tried_eviction = false; time_start = time_stop = 0; /* Only visit pages in cache and don't bump page read generations. */ flags = WT_READ_CACHE | WT_READ_NO_GEN; /* * Skip all deleted pages. For a page to be marked deleted, it must * have been evicted from cache and marked clean. Checkpoint should * never instantiate deleted pages: if a truncate is not visible to the * checkpoint, the on-disk version is correct. If the truncate is * visible, we skip over the child page when writing its parent. We * check whether a truncate is visible in the checkpoint as part of * reconciling internal pages (specifically in __rec_child_modify). */ LF_SET(WT_READ_DELETED_SKIP); internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT); if (timer) time_start = __wt_clock(session); switch (syncop) { case WT_SYNC_WRITE_LEAVES: /* * Write all immediately available, dirty in-cache leaf pages. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. */ if (!btree->modified) return (0); __wt_spin_lock(session, &btree->flush_lock); if (!btree->modified) { __wt_spin_unlock(session, &btree->flush_lock); return (0); } /* * Save the oldest transaction ID we need to keep around. * Otherwise, in a busy system, we could be updating pages so * fast that write leaves never catches up. We deliberately * have no transaction running at this point that would keep * the oldest ID from moving forwards as we walk the tree. */ oldest_id = __wt_txn_oldest_id(session); LF_SET(WT_READ_NO_WAIT | WT_READ_SKIP_INTL); for (;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* * Write dirty pages if nobody beat us to it. Don't * try to write hot pages (defined as pages that have * been updated since the write-leaves phase started): * checkpoint will have to visit them anyway. */ page = walk->page; if (__wt_page_is_modified(page) && WT_TXNID_LT(page->modify->update_txn, oldest_id)) { if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, WT_REC_CHECKPOINT, NULL)); } } break; case WT_SYNC_CHECKPOINT: /* * If we are flushing a file at read-committed isolation, which * is of particular interest for flushing the metadata to make * a schema-changing operation durable, get a transactional * snapshot now. * * All changes committed up to this point should be included. * We don't update the snapshot in between pages because the * metadata shouldn't have many pages. Instead, read-committed * isolation ensures that all metadata updates completed before * the checkpoint are included. */ if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); /* * We cannot check the tree modified flag in the case of a * checkpoint, the checkpoint code has already cleared it. 
* * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. We're holding the schema lock, but need the * lower-level lock as well. */ __wt_spin_lock(session, &btree->flush_lock); /* * In the final checkpoint pass, child pages cannot be evicted * from underneath internal pages nor can underlying blocks be * freed until the checkpoint's block lists are stable. Also, * we cannot split child pages into parents unless we know the * final pass will write a consistent view of that namespace. * Set the checkpointing flag to block such actions and wait for * any problematic eviction or page splits to complete. */ WT_ASSERT(session, btree->syncing == WT_BTREE_SYNC_OFF && btree->sync_session == NULL); btree->sync_session = session; btree->syncing = WT_BTREE_SYNC_WAIT; (void)__wt_gen_next_drain(session, WT_GEN_EVICT); btree->syncing = WT_BTREE_SYNC_RUNNING; /* Write all dirty in-cache pages. */ LF_SET(WT_READ_NO_EVICT); /* Read pages with lookaside entries and evict them asap. */ LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED); for (;;) { WT_ERR(__sync_dup_walk(session, walk, flags, &prev)); WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* * Skip clean pages, but need to make sure maximum * transaction ID is always updated. */ if (!__wt_page_is_modified(walk->page)) { if (((mod = walk->page->modify) != NULL) && mod->rec_max_txn > btree->rec_max_txn) btree->rec_max_txn = mod->rec_max_txn; if (mod != NULL && btree->rec_max_timestamp < mod->rec_max_timestamp) btree->rec_max_timestamp = mod->rec_max_timestamp; continue; } /* * Take a local reference to the page modify structure * now that we know the page is dirty. It needs to be * done in this order otherwise the page modify * structure could have been created between taking the * reference and checking modified. */ page = walk->page; /* * Write dirty pages, if we can't skip them. If we skip * a page, mark the tree dirty. The checkpoint marked it * clean and we can't skip future checkpoints until this * page is written. */ if (__sync_checkpoint_can_skip(session, page)) { __wt_tree_modify_set(session); continue; } if (WT_PAGE_IS_INTERNAL(page)) { internal_bytes += page->memory_footprint; ++internal_pages; } else { leaf_bytes += page->memory_footprint; ++leaf_pages; } /* * If the page was pulled into cache by our read, try * to evict it now. * * For eviction to have a chance, we first need to move * the walk point to the next page checkpoint will * visit. We want to avoid this code being too special * purpose, so try to reuse the ordinary eviction path. * * Regardless of whether eviction succeeds or fails, * the walk continues from the previous location. We * remember whether we tried eviction, and don't try * again. Even if eviction fails (the page may stay in * cache clean but with history that cannot be * discarded), that is not wasted effort because * checkpoint doesn't need to write the page again. */ if (!WT_PAGE_IS_INTERNAL(page) && page->read_gen == WT_READGEN_WONT_NEED && !tried_eviction) { WT_ERR_BUSY_OK( __wt_page_release_evict(session, walk)); walk = prev; prev = NULL; tried_eviction = true; continue; } tried_eviction = false; WT_ERR(__wt_reconcile( session, walk, NULL, WT_REC_CHECKPOINT, NULL)); /* * Update checkpoint IO tracking data if configured * to log verbose progress messages. 
*/ if (conn->ckpt_timer_start.tv_sec > 0) { conn->ckpt_write_bytes += page->memory_footprint; ++conn->ckpt_write_pages; /* Periodically log checkpoint progress. */ if (conn->ckpt_write_pages % 5000 == 0) __wt_checkpoint_progress( session, false); } } break; case WT_SYNC_CLOSE: case WT_SYNC_DISCARD: WT_ERR(__wt_illegal_value(session, syncop)); break; } if (timer) { time_stop = __wt_clock(session); __wt_verbose(session, WT_VERB_CHECKPOINT, "__sync_file WT_SYNC_%s wrote: %" PRIu64 " leaf pages (%" PRIu64 "B), %" PRIu64 " internal pages (%" PRIu64 "B), and took %" PRIu64 "ms", syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_pages, leaf_bytes, internal_pages, internal_bytes, WT_CLOCKDIFF_MS(time_stop, time_start)); } err: /* On error, clear any left-over tree walk. */ WT_TRET(__wt_page_release(session, walk, flags)); WT_TRET(__wt_page_release(session, prev, flags)); /* * If we got a snapshot in order to write pages, and there was no * snapshot active when we started, release it. */ if (txn->isolation == WT_ISO_READ_COMMITTED && saved_pinned_id == WT_TXN_NONE) __wt_txn_release_snapshot(session); /* Clear the checkpoint flag. */ btree->syncing = WT_BTREE_SYNC_OFF; btree->sync_session = NULL; __wt_spin_unlock(session, &btree->flush_lock); /* * Leaves are written before a checkpoint (or as part of a file close, * before checkpointing the file). Start a flush to stable storage, * but don't wait for it. */ if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC)) WT_RET(btree->bm->sync(btree->bm, session, false)); return (ret); }
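/*
 * A minimal stand-alone sketch of the walk/prev pattern used by the
 * checkpoint pass above: __sync_dup_walk saves the previous walk position
 * so that, after handing the current page to eviction, the walk can back
 * up to a stable reference and continue. The list walker below shows the
 * same idea on a singly-linked list; all of its names are hypothetical,
 * not WiredTiger code.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
    int value;
    bool evictable;		/* Stands in for "pulled in by our read". */
    struct node *next;
};

/*
 * list_walk --
 *	Advance the walk: return the head when the walk hasn't started,
 * otherwise the node after the current position (a cursor-style walk).
 */
static struct node *
list_walk(struct node *head, struct node *walk)
{
    return (walk == NULL ? head : walk->next);
}

/*
 * walk_and_evict --
 *	Visit every node; evict (free) evictable nodes, then back the walk
 * up to the saved previous position and continue, mirroring the
 * walk = prev; prev = NULL; step above.
 */
static void
walk_and_evict(struct node **headp)
{
    struct node *prev, *walk;

    prev = walk = NULL;
    for (;;) {
        prev = walk;		/* Save the current position. */
        walk = list_walk(*headp, walk);
        if (walk == NULL)
            break;

        if (walk->evictable) {
            /* Unlink and free the node, then back up to prev. */
            if (prev == NULL)
                *headp = walk->next;
            else
                prev->next = walk->next;
            free(walk);
            walk = prev;
            continue;
        }

        printf("visited %d\n", walk->value);
    }
}

int
main(void)
{
    struct node *head, *n;
    int i;

    /* Build the list 1..5, marking even values evictable. */
    head = NULL;
    for (i = 5; i > 0; --i) {
        if ((n = calloc(1, sizeof(*n))) == NULL)
            return (EXIT_FAILURE);
        n->value = i;
        n->evictable = i % 2 == 0;
        n->next = head;
        head = n;
    }

    walk_and_evict(&head);	/* Prints 1, 3, 5. */

    while ((n = head) != NULL) {
        head = n->next;
        free(n);
    }
    return (EXIT_SUCCESS);
}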
/*
 * __log_server --
 *	The log server thread.
 */
static WT_THREAD_RET
__log_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_SESSION_IMPL *session;
    int freq_per_sec, signalled;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    signalled = 0;

    /*
     * Set this to the number of times per second we want to force out the
     * log slot buffer.
     */
#define WT_FORCE_PER_SECOND 20
    freq_per_sec = WT_FORCE_PER_SECOND;

    /*
     * The log server thread does a variety of work: it forces out any
     * buffered log writes, it pre-allocates log files and it performs
     * log archiving. The wrlsn thread does not force out the buffered
     * writes because we want it to process and move the write_lsn
     * forward as quickly as possible. The log file server thread does
     * not force out the writes for a similar reason: it makes fsync
     * calls that can take a long time, and we don't want log records
     * sitting in the buffer for however long it takes to sync out an
     * earlier file.
     */
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * Slots depend on future activity. Force out buffered writes
         * in case we are idle. This cannot be part of the wrlsn
         * thread because it interacts with advancing the write_lsn:
         * a synchronous buffer may need to wait for the write_lsn to
         * advance, and we would end up with a hang.
         */
        WT_ERR_BUSY_OK(__wt_log_force_write(session, 0));

        /*
         * We don't want to archive or pre-allocate files as often as
         * we want to force out log buffers. Only do it once per
         * second or if the condition was signalled.
         */
        if (--freq_per_sec <= 0 || signalled != 0) {
            freq_per_sec = WT_FORCE_PER_SECOND;

            /* Perform log pre-allocation. */
            if (conn->log_prealloc > 0)
                WT_ERR(__log_prealloc_once(session));

            /* Perform the archive. */
            if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) {
                if (__wt_try_writelock(
                    session, log->log_archive_lock) == 0) {
                    ret = __log_archive_once(session, 0);
                    WT_TRET(__wt_writeunlock(
                        session, log->log_archive_lock));
                    WT_ERR(ret);
                } else
                    WT_ERR(__wt_verbose(session, WT_VERB_LOG,
                        "log_archive: Blocked due to open "
                        "log cursor holding archive lock"));
            }
        }

        /* Wait until the next event. */
        WT_ERR(__wt_cond_wait_signal(session, conn->log_cond,
            WT_MILLION / WT_FORCE_PER_SECOND, &signalled));
    }

    if (0) {
err:        __wt_err(session, ret, "log server error");
    }
    return (WT_THREAD_RET_VALUE);
}
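/*
 * A minimal stand-alone sketch of the decimation pattern used by
 * __log_server above: high-frequency work runs on every tick, while
 * low-frequency work runs when a tick countdown expires, avoiding the
 * need for a second timer. The tick count mirrors WT_FORCE_PER_SECOND;
 * the 5-second run limit and the printed messages are hypothetical.
 */
#include <stdio.h>
#include <unistd.h>

#define TICKS_PER_SECOND 20	/* Mirrors WT_FORCE_PER_SECOND. */

int
main(void)
{
    int countdown, seconds;

    countdown = TICKS_PER_SECOND;
    for (seconds = 0; seconds < 5;) {
        /* High-frequency work happens on every tick. */
        printf("tick: force out buffered writes\n");

        /* Low-frequency work happens when the countdown expires. */
        if (--countdown <= 0) {
            countdown = TICKS_PER_SECOND;
            ++seconds;
            printf("second %d: pre-allocate and archive\n", seconds);
        }

        /* Sleep for one tick (1/TICKS_PER_SECOND of a second). */
        (void)usleep(1000000 / TICKS_PER_SECOND);
    }
    return (0);
}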
/*
 * __wt_block_salvage_next --
 *	Return the address for the next potential block from the file.
 */
int
__wt_block_salvage_next(WT_SESSION_IMPL *session,
    WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, bool *eofp)
{
    WT_BLOCK_HEADER *blk;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    WT_FH *fh;
    wt_off_t max, offset;
    uint32_t allocsize, checksum, size;
    uint8_t *endp;

    *eofp = false;

    fh = block->fh;
    allocsize = block->allocsize;
    WT_ERR(__wt_scr_alloc(session, allocsize, &tmp));

    /* Read through the file, looking for pages. */
    for (max = block->size;;) {
        offset = block->slvg_off;
        if (offset >= max) {		/* Check eof. */
            *eofp = true;
            goto done;
        }

        /*
         * Read the start of a possible page (an allocation-size
         * block), and get a page length from it. Move to the next
         * allocation-sized boundary; we'll never consider this one
         * again.
         */
        WT_ERR(__wt_read(
            session, fh, offset, (size_t)allocsize, tmp->mem));
        blk = WT_BLOCK_HEADER_REF(tmp->mem);
        __wt_block_header_byteswap(blk);
        size = blk->disk_size;
        checksum = blk->checksum;

        /*
         * Check the block size: if it's not insane, read the block.
         * Reading the block validates any checksum; if reading the
         * block succeeds, return its address as a possible page,
         * otherwise, move past it.
         */
        if (!__wt_block_offset_invalid(block, offset, size) &&
            __wt_block_read_off(
            session, block, tmp, offset, size, checksum) == 0)
            break;

        /* Free the allocation-size block. */
        __wt_verbose(session, WT_VERB_SALVAGE,
            "skipping %" PRIu32 "B at file offset %" PRIuMAX,
            allocsize, (uintmax_t)offset);
        WT_ERR(__wt_block_off_free(
            session, block, offset, (wt_off_t)allocsize));
        block->slvg_off += allocsize;
    }

    /* Re-create the address cookie that should reference this block. */
    endp = addr;
    WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, checksum));
    *addr_sizep = WT_PTRDIFF(endp, addr);

done:
err:    __wt_scr_free(session, &tmp);
    return (ret);
}
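/*
 * A minimal stand-alone sketch of the salvage scan above: walk a "file"
 * one allocation-size unit at a time, read a candidate header,
 * sanity-check the size, and verify a checksum before trusting the
 * block. The header layout and the additive checksum here are
 * hypothetical and far weaker than WiredTiger's block header and
 * checksum; this is an illustration, not the block manager's format.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ALLOCSIZE 16		/* Toy allocation size. */

struct toy_header {
    uint32_t disk_size;		/* Total block size, in bytes. */
    uint32_t checksum;		/* Additive checksum of the payload. */
};

static uint32_t
toy_checksum(const uint8_t *p, size_t len)
{
    uint32_t sum;

    for (sum = 0; len > 0; --len)
        sum += *p++;
    return (sum);
}

/*
 * toy_salvage_next --
 *	Starting at *offp, find the next plausible block in the buffer;
 * return 0 and set *offp and *sizep on success, -1 at end of buffer.
 */
static int
toy_salvage_next(
    const uint8_t *file, size_t file_size, size_t *offp, uint32_t *sizep)
{
    struct toy_header hdr;
    size_t offset;

    for (offset = *offp; offset < file_size; offset += ALLOCSIZE) {
        memcpy(&hdr, file + offset, sizeof(hdr));

        /* Check the size: skip blocks that can't possibly fit. */
        if (hdr.disk_size < sizeof(hdr) ||
            hdr.disk_size % ALLOCSIZE != 0 ||
            offset + hdr.disk_size > file_size)
            continue;

        /* Verify the checksum before trusting the block. */
        if (hdr.checksum != toy_checksum(file + offset + sizeof(hdr),
            hdr.disk_size - sizeof(hdr)))
            continue;

        *offp = offset;
        *sizep = hdr.disk_size;
        return (0);
    }
    return (-1);		/* End of buffer. */
}

int
main(void)
{
    struct toy_header hdr;
    uint8_t file[4 * ALLOCSIZE];
    size_t off;
    uint32_t size;

    /* Fill the "file" with garbage, then plant one valid block. */
    memset(file, 0xa5, sizeof(file));
    hdr.disk_size = 2 * ALLOCSIZE;
    hdr.checksum = toy_checksum(
        file + ALLOCSIZE + sizeof(hdr), hdr.disk_size - sizeof(hdr));
    memcpy(file + ALLOCSIZE, &hdr, sizeof(hdr));

    off = 0;
    if (toy_salvage_next(file, sizeof(file), &off, &size) == 0)
        printf("block at offset %zu, size %" PRIu32 "B\n", off, size);
    return (0);
}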