/*
 * __log_wrlsn_server --
 *	The log wrlsn server thread.
 *
 *	Runs until WT_CONN_LOG_SERVER_RUN is cleared on the connection,
 *	repeatedly flushing written log record buffers and then sleeping on
 *	the wrlsn condition variable.  Any error jumps to the shared err
 *	label and is reported before the thread exits.
 */
static WT_THREAD_RET
__log_wrlsn_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    /* The thread argument is the session allocated for this server. */
    session = arg;
    conn = S2C(session);
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * Write out any log record buffers.
         */
        WT_ERR(__wt_log_wrlsn(session));
        /* Sleep up to 10ms, or until signalled, before the next pass. */
        WT_ERR(__wt_cond_wait(session, conn->log_wrlsn_cond, 10000));
    }
    /*
     * On close we need to do this one more time because there could
     * be straggling log writes that need to be written.
     */
    WT_ERR(__wt_log_force_write(session, 1));
    WT_ERR(__wt_log_wrlsn(session));
    if (0) {
err:        __wt_err(session, ret, "log wrlsn server error");
    }
    return (WT_THREAD_RET_VALUE);
}
/* * __ckpt_server -- * The checkpoint server thread. */ static void * __ckpt_server(void *arg) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION *wt_session; WT_SESSION_IMPL *session; session = arg; conn = S2C(session); wt_session = (WT_SESSION *)session; while (F_ISSET(conn, WT_CONN_SERVER_RUN) && F_ISSET(conn, WT_CONN_SERVER_CHECKPOINT)) { /* Checkpoint the database. */ WT_ERR(wt_session->checkpoint(wt_session, conn->ckpt_config)); /* Wait... */ WT_ERR_TIMEDOUT_OK( __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs)); } if (0) { err: __wt_err(session, ret, "checkpoint server error"); } return (NULL); }
/* * __sweep_server -- * The handle sweep server thread. */ static void * __sweep_server(void *arg) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; session = arg; conn = S2C(session); /* * Sweep for dead handles. */ while (F_ISSET(conn, WT_CONN_SERVER_RUN) && F_ISSET(conn, WT_CONN_SERVER_SWEEP)) { /* Wait until the next event. */ WT_ERR_TIMEDOUT_OK( __wt_cond_wait(session, conn->sweep_cond, 30 * WT_MILLION)); /* Sweep the handles. */ WT_ERR(__sweep(session)); } if (0) { err: __wt_err(session, ret, "handle sweep server error"); } return (NULL); }
/*
 * __log_wrlsn_server --
 *	The log wrlsn server thread.
 *
 *	Advances the write LSN by processing written slots under the slot
 *	lock.  Spins (yields) while work is being found, and falls back to
 *	a condition-variable wait once 1000 consecutive passes have made no
 *	progress.  The "locked" flag tracks whether the slot lock is held
 *	so the error path can release it.
 */
static WT_THREAD_RET
__log_wrlsn_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_SESSION_IMPL *session;
    int locked, yield;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    locked = yield = 0;
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        __wt_spin_lock(session, &log->log_slot_lock);
        locked = 1;
        /*
         * Process written slots; __wt_log_wrlsn resets "yield" when it
         * makes progress (NOTE(review): presumed from the &yield out
         * parameter -- confirm against __wt_log_wrlsn).
         */
        WT_ERR(__wt_log_wrlsn(session, NULL, &yield));
        locked = 0;
        __wt_spin_unlock(session, &log->log_slot_lock);
        /* Busy-yield while making progress, otherwise block briefly. */
        if (++yield < 1000)
            __wt_yield();
        else
            WT_ERR(__wt_cond_wait(
                session, conn->log_wrlsn_cond, 100000));
    }
    if (0) {
err:        __wt_err(session, ret, "log wrlsn server error");
    }
    /* Drop the slot lock if an error fired while it was held. */
    if (locked)
        __wt_spin_unlock(session, &log->log_slot_lock);
    return (WT_THREAD_RET_VALUE);
}
/*
 * __ckpt_server --
 *	The checkpoint server thread.
 *
 *	Checkpoints the database while the server flags remain set.  When a
 *	log-size trigger is configured, the written-log counter and the
 *	signalled flag are reset after each checkpoint.  Errors are fatal:
 *	the thread panics the connection rather than just logging.
 */
static void *
__ckpt_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_SESSION *wt_session;
    WT_SESSION_IMPL *session;

    session = arg;
    conn = S2C(session);
    wt_session = (WT_SESSION *)session;

    while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
        F_ISSET(conn, WT_CONN_SERVER_CHECKPOINT)) {
        /* Checkpoint the database. */
        WT_ERR(wt_session->checkpoint(wt_session, conn->ckpt_config));

        /* Reset. */
        if (conn->ckpt_logsize) {
            __wt_log_written_reset(session);
            conn->ckpt_signalled = 0;
            /*
             * In case we crossed the log limit during the
             * checkpoint and the condition variable was already
             * signalled, do a tiny wait to clear it so we don't do
             * another checkpoint immediately.
             */
            WT_ERR(__wt_cond_wait(session, conn->ckpt_cond, 1));
        }
        /*
         * Wait...
         * NOTE: If the user only configured logsize, then usecs
         * will be 0 and this wait won't return until signalled.
         */
        WT_ERR(
            __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs));
    }

    if (0) {
        /* A failed checkpoint server shuts down the connection. */
err:        WT_PANIC_MSG(session, ret, "checkpoint server error");
    }
    return (NULL);
}
/*
 * __wt_bt_cache_op --
 *	Cache operations: compaction, discard, sync/checkpoint.
 *
 *	Dispatches on "op": sync-style operations are run inline in the
 *	calling thread; discard operations are scheduled with the eviction
 *	server and this thread blocks until the server wakes it.  The
 *	checkpoint list is published into the btree handle for the duration
 *	of the call and cleared on exit.
 */
int
__wt_bt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op)
{
    WT_DECL_RET;
    WT_BTREE *btree;

    btree = session->btree;

    /*
     * Compaction and sync/checkpoint reconcile dirty pages from the cache
     * to the backing block manager. Reconciliation is just another reader
     * of the page, so with some care, it can be done in the current thread,
     * leaving the eviction thread to keep freeing spaces if the cache is
     * full. Sync and eviction cannot operate on the same page at the same
     * time, and there are different modes inside __wt_tree_walk to make
     * sure they don't trip over each other.
     *
     * The current thread cannot evict pages from the cache, so discard is
     * done by calling the eviction server for service.
     *
     * XXX
     * Set the checkpoint reference for reconciliation -- this is ugly, but
     * there's no data structure path from here to reconciliation.
     *
     * Publish: there must be a barrier to ensure the structure fields are
     * set before the eviction thread can see the request.
     */
    WT_PUBLISH(btree->ckpt, ckptbase);

    switch (op) {
    case WT_SYNC_CHECKPOINT:
    case WT_SYNC_COMPACT:
    case WT_SYNC_WRITE_LEAVES:
        /* Run inline: reconcile dirty pages in this thread. */
        WT_ERR(__wt_sync_file(session, op));
        break;
    case WT_SYNC_DISCARD:
    case WT_SYNC_DISCARD_NOWRITE:
        /*
         * Schedule and wake the eviction server, then wait for the
         * eviction server to wake us.
         */
        WT_ERR(__wt_sync_file_serial(session, op));
        WT_ERR(__wt_evict_server_wake(session));
        WT_ERR(__wt_cond_wait(session, session->cond, 0));
        /* The server leaves its result in the session. */
        ret = session->syncop_ret;
        /* If discarding the tree, the root page should be gone. */
        WT_ASSERT(session, ret != 0 || btree->root_page == NULL);
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

    /* Always clear the published checkpoint reference. */
err:    btree->ckpt = NULL;
    return (ret);
}
/* * __log_server -- * The log server thread. */ static WT_THREAD_RET __log_server(void *arg) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; WT_SESSION_IMPL *session; u_int locked; session = arg; conn = S2C(session); log = conn->log; locked = 0; while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { /* * Perform log pre-allocation. */ if (conn->log_prealloc > 0) WT_ERR(__log_prealloc_once(session)); /* * Perform the archive. */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) { if (__wt_try_writelock( session, log->log_archive_lock) == 0) { locked = 1; WT_ERR(__log_archive_once(session, 0)); WT_ERR( __wt_writeunlock( session, log->log_archive_lock)); locked = 0; } else WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_archive: Blocked due to open log " "cursor holding archive lock")); } /* Wait until the next event. */ WT_ERR(__wt_cond_wait(session, conn->log_cond, WT_MILLION)); } if (0) { err: __wt_err(session, ret, "log server error"); } if (locked) (void)__wt_writeunlock(session, log->log_archive_lock); return (WT_THREAD_RET_VALUE); }
/*
 * __log_file_server --
 *	The log file server thread.  This worker thread manages
 *	log file operations such as closing and syncing.
 *
 *	Each pass: (1) if a log file has been handed off for closing, wait
 *	for its writes to land, fsync and close it, then advance sync_lsn
 *	under the sync lock; (2) if a background sync was requested past the
 *	current sync_lsn, fsync and advance sync_lsn, or nudge the wrlsn
 *	thread if the requested LSN has not been written yet.  The "locked"
 *	flag tracks the sync spinlock for the error path.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *close_fh;
    WT_LOG *log;
    WT_LSN close_end_lsn, min_lsn;
    WT_SESSION_IMPL *session;
    uint32_t filenum;
    int locked;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    locked = 0;
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * If there is a log file to close, make sure any outstanding
         * write operations have completed, then fsync and close it.
         */
        if ((close_fh = log->log_close_fh) != NULL) {
            WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
                &filenum));
            /*
             * We update the close file handle before updating the
             * close LSN when changing files.  It is possible we
             * could see mismatched settings.  If we do, yield
             * until it is set.  This should rarely happen.
             */
            while (log->log_close_lsn.file < filenum)
                __wt_yield();
            if (__wt_log_cmp(
                &log->write_lsn, &log->log_close_lsn) >= 0) {
                /*
                 * We've copied the file handle, clear out the
                 * one in the log structure to allow it to be
                 * set again.  Copy the LSN before clearing
                 * the file handle.
                 * Use a barrier to make sure the compiler does
                 * not reorder the following two statements.
                 */
                close_end_lsn = log->log_close_lsn;
                WT_FULL_BARRIER();
                log->log_close_fh = NULL;
                /*
                 * Set the close_end_lsn to the LSN immediately
                 * after ours.  That is, the beginning of the
                 * next log file.  We need to know the LSN
                 * file number of our own close in case earlier
                 * calls are still in progress and the next one
                 * to move the sync_lsn into the next file for
                 * later syncs.
                 */
                close_end_lsn.file++;
                close_end_lsn.offset = 0;
                WT_ERR(__wt_fsync(session, close_fh));
                __wt_spin_lock(session, &log->log_sync_lock);
                locked = 1;
                WT_ERR(__wt_close(session, &close_fh));
                WT_ASSERT(session, __wt_log_cmp(
                    &close_end_lsn, &log->sync_lsn) >= 0);
                log->sync_lsn = close_end_lsn;
                WT_ERR(__wt_cond_signal(
                    session, log->log_sync_cond));
                locked = 0;
                __wt_spin_unlock(session, &log->log_sync_lock);
            }
        }
        /*
         * If a later thread asked for a background sync, do it now.
         */
        if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
            /*
             * Save the latest write LSN which is the minimum
             * we will have written to disk.
             */
            min_lsn = log->write_lsn;
            /*
             * We have to wait until the LSN we asked for is
             * written.  If it isn't signal the wrlsn thread
             * to get it written.
             */
            if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
                WT_ERR(__wt_fsync(session, log->log_fh));
                __wt_spin_lock(session, &log->log_sync_lock);
                locked = 1;
                /*
                 * The sync LSN could have advanced while we
                 * were writing to disk.
                 */
                if (__wt_log_cmp(
                    &log->sync_lsn, &min_lsn) <= 0) {
                    log->sync_lsn = min_lsn;
                    WT_ERR(__wt_cond_signal(
                        session, log->log_sync_cond));
                }
                locked = 0;
                __wt_spin_unlock(session, &log->log_sync_lock);
            } else {
                WT_ERR(__wt_cond_signal(
                    session, conn->log_wrlsn_cond));
                /*
                 * We do not want to wait potentially a second
                 * to process this.  Yield to give the wrlsn
                 * thread a chance to run and try again in
                 * this case.
                 */
                __wt_yield();
                continue;
            }
        }
        /* Wait until the next event. */
        WT_ERR(__wt_cond_wait(
            session, conn->log_file_cond, WT_MILLION));
    }

    if (0) {
err:        __wt_err(session, ret, "log close server error");
    }
    /* Drop the sync lock if an error fired while it was held. */
    if (locked)
        __wt_spin_unlock(session, &log->log_sync_lock);
    return (WT_THREAD_RET_VALUE);
}
/*
 * __lsm_worker --
 *	A thread that executes work units for all open LSM trees.
 *
 *	Processes the LSM work queues this worker is configured for, in
 *	priority order: switches first, then general operations, then
 *	merges.  EBUSY/WT_NOTFOUND from individual operations are treated
 *	as "no work" rather than errors.  Any real error panics the
 *	connection.
 */
static WT_THREAD_RET
__lsm_worker(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LSM_WORK_UNIT *entry;
    WT_LSM_WORKER_ARGS *cookie;
    WT_SESSION_IMPL *session;
    int progress, ran;

    cookie = (WT_LSM_WORKER_ARGS *)arg;
    session = cookie->session;
    conn = S2C(session);
    entry = NULL;
    while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
        F_ISSET(cookie, WT_LSM_WORKER_RUN)) {
        progress = 0;

        /*
         * Workers process the different LSM work queues.  Some workers
         * can handle several or all work unit types.  So the code is
         * prioritized so important operations happen first.
         * Switches are the highest priority.
         */
        while (FLD_ISSET(cookie->type, WT_LSM_WORK_SWITCH) &&
            (ret = __wt_lsm_manager_pop_entry(
            session, WT_LSM_WORK_SWITCH, &entry)) == 0 &&
            entry != NULL)
            WT_ERR(
                __wt_lsm_work_switch(session, &entry, &progress));
        /* Flag an error if the pop failed. */
        WT_ERR(ret);

        /*
         * Next the general operations.
         */
        ret = __lsm_worker_general_op(session, cookie, &ran);
        /* EBUSY/WT_NOTFOUND just mean there was nothing to do. */
        if (ret == EBUSY || ret == WT_NOTFOUND)
            ret = 0;
        WT_ERR(ret);
        progress = progress || ran;

        /*
         * Finally see if there is any merge work we can do.  This is
         * last because the earlier operations may result in adding
         * merge work to the queue.
         */
        if (FLD_ISSET(cookie->type, WT_LSM_WORK_MERGE) &&
            (ret = __wt_lsm_manager_pop_entry(
            session, WT_LSM_WORK_MERGE, &entry)) == 0 &&
            entry != NULL) {
            WT_ASSERT(session, entry->type == WT_LSM_WORK_MERGE);
            ret = __wt_lsm_merge(session,
                entry->lsm_tree, cookie->id);
            /* A failed merge just means no merge was possible. */
            if (ret == WT_NOTFOUND) {
                F_CLR(entry->lsm_tree, WT_LSM_TREE_COMPACTING);
                ret = 0;
            } else if (ret == EBUSY)
                ret = 0;
            /* Paranoia: clear session state. */
            session->dhandle = NULL;
            __wt_lsm_manager_free_work_unit(session, entry);
            entry = NULL;
            progress = 1;
        }
        /* Flag an error if the pop failed. */
        WT_ERR(ret);

        /* Don't busy wait if there was any work to do. */
        if (!progress) {
            WT_ERR(
                __wt_cond_wait(session, cookie->work_cond, 10000));
            continue;
        }
    }

    if (ret != 0) {
        /* Free any work unit still held, then take down the server. */
err:        __wt_lsm_manager_free_work_unit(session, entry);
        WT_PANIC_MSG(session, ret,
            "Error in LSM worker thread %d", cookie->id);
    }
    return (WT_THREAD_RET_VALUE);
}
/*
 * __log_wrlsn_server --
 *	The log wrlsn server thread.
 *
 *	Scans the static slot pool for slots in the WRITTEN state, sorts
 *	them by release LSN, and advances write_lsn through them in order,
 *	stopping at the first gap.  Signals the log-write waiters and the
 *	file-close thread as slots complete.  Yields while making progress
 *	and falls back to a timed wait after 1000 idle passes.
 */
static WT_THREAD_RET
__log_wrlsn_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL];
    WT_LOGSLOT *slot;
    WT_SESSION_IMPL *session;
    size_t written_i;
    uint32_t i, save_i;
    int yield;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    yield = 0;
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * No need to use the log_slot_lock because the slot pool
         * is statically allocated and any slot in the
         * WT_LOG_SLOT_WRITTEN state is exclusively ours for now.
         */
        i = 0;
        written_i = 0;
        /*
         * Walk the array once saving any slots that are in the
         * WT_LOG_SLOT_WRITTEN state.
         */
        while (i < WT_SLOT_POOL) {
            save_i = i;
            slot = &log->slot_pool[i++];
            if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
                continue;
            written[written_i].slot_index = save_i;
            written[written_i++].lsn = slot->slot_release_lsn;
        }
        /*
         * If we found any written slots process them.  We sort them
         * based on the release LSN, and then look for them in order.
         */
        if (written_i > 0) {
            yield = 0;
            WT_INSERTION_SORT(written, written_i,
                WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);
            /*
             * We know the written array is sorted by LSN.  Go
             * through them either advancing write_lsn or stop
             * as soon as one is not in order.
             */
            for (i = 0; i < written_i; i++) {
                if (WT_LOG_CMP(&log->write_lsn,
                    &written[i].lsn) != 0)
                    break;
                /*
                 * If we get here we have a slot to process.
                 * Advance the LSN and process the slot.
                 */
                slot = &log->slot_pool[written[i].slot_index];
                WT_ASSERT(session, WT_LOG_CMP(&written[i].lsn,
                    &slot->slot_release_lsn) == 0);
                log->write_start_lsn = slot->slot_start_lsn;
                log->write_lsn = slot->slot_end_lsn;
                WT_ERR(__wt_cond_signal(session,
                    log->log_write_cond));
                WT_STAT_FAST_CONN_INCR(session, log_write_lsn);
                /*
                 * Signal the close thread if needed.
                 */
                if (F_ISSET(slot, WT_SLOT_CLOSEFH))
                    WT_ERR(__wt_cond_signal(session,
                        conn->log_file_cond));
                WT_ERR(__wt_log_slot_free(session, slot));
            }
        }
        /*
         * If we saw a later write, we always want to yield because
         * we know something is in progress.
         */
        if (yield++ < 1000)
            __wt_yield();
        else
            /* Wait until the next event. */
            WT_ERR(__wt_cond_wait(
                session, conn->log_wrlsn_cond, 100000));
    }
    if (0)
err:        __wt_err(session, ret, "log wrlsn server error");
    return (WT_THREAD_RET_VALUE);
}
/*
 * __log_file_server --
 *	The log file server thread.  This worker thread manages
 *	log file operations such as closing and syncing.
 *
 *	Each pass: close a handed-off log file once the write LSN has moved
 *	past it (fsync, close, advance sync_lsn under the sync lock), then
 *	service any background sync request by fsyncing up to the latest
 *	write LSN.  "locked" tracks the sync spinlock for the error path.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *close_fh;
    WT_LOG *log;
    WT_LSN close_end_lsn, close_lsn, min_lsn;
    WT_SESSION_IMPL *session;
    int locked;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    locked = 0;
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * If there is a log file to close, make sure any outstanding
         * write operations have completed, then fsync and close it.
         */
        if ((close_fh = log->log_close_fh) != NULL &&
            (ret = __wt_log_extract_lognum(session, close_fh->name,
            &close_lsn.file)) == 0 &&
            close_lsn.file < log->write_lsn.file) {
            /*
             * We've copied the file handle, clear out the one in
             * log structure to allow it to be set again.
             */
            log->log_close_fh = NULL;
            /*
             * Set the close_end_lsn to the LSN immediately after
             * ours.  That is, the beginning of the next log file.
             * We need to know the LSN file number of our own close
             * in case earlier calls are still in progress and the
             * next one to move the sync_lsn into the next file for
             * later syncs.
             */
            close_lsn.offset = 0;
            close_end_lsn = close_lsn;
            close_end_lsn.file++;
            WT_ERR(__wt_fsync(session, close_fh));
            __wt_spin_lock(session, &log->log_sync_lock);
            locked = 1;
            WT_ERR(__wt_close(session, &close_fh));
            WT_ASSERT(session, WT_LOG_CMP(&close_end_lsn,
                &log->sync_lsn) >= 0);
            log->sync_lsn = close_end_lsn;
            WT_ERR(__wt_cond_signal(session, log->log_sync_cond));
            locked = 0;
            __wt_spin_unlock(session, &log->log_sync_lock);
        }
        /*
         * If a later thread asked for a background sync, do it now.
         */
        if (WT_LOG_CMP(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
            /*
             * Save the latest write LSN which is the minimum
             * we will have written to disk.
             */
            min_lsn = log->write_lsn;
            /*
             * The sync LSN we asked for better be smaller than
             * the current written LSN.
             */
            WT_ASSERT(session,
                WT_LOG_CMP(&log->bg_sync_lsn, &min_lsn) <= 0);
            WT_ERR(__wt_fsync(session, log->log_fh));
            __wt_spin_lock(session, &log->log_sync_lock);
            locked = 1;
            /*
             * The sync LSN could have advanced while we were
             * writing to disk.
             */
            if (WT_LOG_CMP(&log->sync_lsn, &min_lsn) <= 0) {
                log->sync_lsn = min_lsn;
                WT_ERR(__wt_cond_signal(
                    session, log->log_sync_cond));
            }
            locked = 0;
            __wt_spin_unlock(session, &log->log_sync_lock);
        }
        /* Wait until the next event. */
        WT_ERR(__wt_cond_wait(
            session, conn->log_file_cond, WT_MILLION));
    }

    if (0) {
err:        __wt_err(session, ret, "log close server error");
    }
    /* Drop the sync lock if an error fired while it was held. */
    if (locked)
        __wt_spin_unlock(session, &log->log_sync_lock);
    return (WT_THREAD_RET_VALUE);
}
/*
 * __wt_log_write --
 *	Write a record into the log.
 *
 *	Pads the caller's record to the log allocation size, fills in the
 *	WT_LOG_RECORD header and checksum, then either writes it directly
 *	or joins a consolidated slot.  On WT_LOG_FSYNC, waits until the
 *	record's LSN is durable.  Returns the record's LSN through "lsnp"
 *	on success when non-NULL.
 */
int
__wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
    uint32_t flags)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_LOG_RECORD *logrec;
    WT_LSN lsn;
    WT_MYSLOT myslot;
    uint32_t rdup_len;
    int locked;

    conn = S2C(session);
    log = conn->log;
    locked = 0;
    INIT_LSN(&lsn);
    myslot.slot = NULL;
    /*
     * Assume the WT_ITEM the user passed is a WT_LOG_RECORD, which has
     * a header at the beginning for us to fill in.
     *
     * If using direct_io, the caller should pass us an aligned record.
     * But we need to make sure it is big enough and zero-filled so
     * that we can write the full amount.  Do this whether or not
     * direct_io is in use because it makes the reading code cleaner.
     */
    WT_STAT_FAST_CONN_INCRV(session, log_bytes_user, record->size);
    rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize);
    WT_ERR(__wt_buf_grow(session, record, rdup_len));
    WT_ASSERT(session, record->data == record->mem);
    /*
     * If the caller's record only partially fills the necessary
     * space, we need to zero-fill the remainder.
     */
    if (record->size != rdup_len) {
        memset((uint8_t *)record->mem + record->size, 0,
            rdup_len - record->size);
        record->size = rdup_len;
    }
    /* Fill in the record header; checksum is computed over the whole. */
    logrec = (WT_LOG_RECORD *)record->mem;
    logrec->len = (uint32_t)record->size;
    logrec->checksum = 0;
    logrec->checksum = __wt_cksum(logrec, record->size);

    WT_STAT_FAST_CONN_INCR(session, log_writes);

    if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) {
        ret = __log_direct_write(session, record, lsnp, flags);
        if (ret == 0)
            return (0);
        if (ret != EAGAIN)
            WT_ERR(ret);
        /*
         * An EAGAIN return means we failed to get the try lock -
         * fall through to the consolidation code in that case.
         */
    }

    /*
     * As soon as we see contention for the log slot, disable direct
     * log writes.  We get better performance by forcing writes through
     * the consolidation code.  This is because individual writes flood
     * the I/O system faster than they contend on the log slot lock.
     */
    F_SET(log, WT_LOG_FORCE_CONSOLIDATE);
    if ((ret = __wt_log_slot_join(
        session, rdup_len, flags, &myslot)) == ENOMEM) {
        /*
         * If we couldn't find a consolidated slot for this record
         * write the record directly.
         */
        while ((ret = __log_direct_write(
            session, record, lsnp, flags)) == EAGAIN)
            ;
        WT_ERR(ret);
        /*
         * Increase the buffer size of any slots we can get access
         * to, so future consolidations are likely to succeed.
         */
        WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len));
        return (0);
    }
    WT_ERR(ret);
    /* Offset 0 means we are the slot leader and must close/acquire it. */
    if (myslot.offset == 0) {
        __wt_spin_lock(session, &log->log_slot_lock);
        locked = 1;
        WT_ERR(__wt_log_slot_close(session, myslot.slot));
        WT_ERR(__log_acquire(
            session, myslot.slot->slot_group_size, myslot.slot));
        __wt_spin_unlock(session, &log->log_slot_lock);
        locked = 0;
        WT_ERR(__wt_log_slot_notify(session, myslot.slot));
    } else
        WT_ERR(__wt_log_slot_wait(session, myslot.slot));
    WT_ERR(__log_fill(session, &myslot, 0, record, &lsn));
    /* The last releaser of the slot writes and frees it. */
    if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) {
        WT_ERR(__log_release(session, myslot.slot));
        WT_ERR(__wt_log_slot_free(myslot.slot));
    } else if (LF_ISSET(WT_LOG_FSYNC)) {
        /* Wait for our writes to reach disk */
        while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 &&
            myslot.slot->slot_error == 0)
            (void)__wt_cond_wait(
                session, log->log_sync_cond, 10000);
    }
err:
    if (locked)
        __wt_spin_unlock(session, &log->log_slot_lock);
    if (ret == 0 && lsnp != NULL)
        *lsnp = lsn;
    /*
     * If we're synchronous and some thread had an error, we don't know
     * if our write made it out to the file or not.  The error could be
     * before or after us.  So, if anyone got an error, we report it.
     * If we're not synchronous, only report if our own operation got
     * an error.
     */
    if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 &&
        myslot.slot != NULL)
        ret = myslot.slot->slot_error;
    return (ret);
}
/*
 * __log_release --
 *	Release a log slot.
 *
 *	Writes the slot's buffered records, waits for earlier slot groups
 *	to complete so the log has no holes, advances write_lsn, performs
 *	a consolidated fsync when the slot requires one, grows the slot
 *	buffer if requested, and closes any file handle handed off via the
 *	slot.  On error, the error is recorded in the slot for waiters.
 */
static int
__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *close_fh;
    WT_LOG *log;
    WT_LSN sync_lsn;
    size_t write_size;
    WT_DECL_SPINLOCK_ID(id);            /* Must appear last */

    conn = S2C(session);
    log = conn->log;

    /*
     * If we're going to have to close our log file, make a local copy
     * of the file handle structure.
     */
    close_fh = NULL;
    if (F_ISSET(slot, SLOT_CLOSEFH)) {
        close_fh = log->log_close_fh;
        log->log_close_fh = NULL;
        F_CLR(slot, SLOT_CLOSEFH);
    }

    /* Write the buffered records */
    if (F_ISSET(slot, SLOT_BUFFERED)) {
        write_size = (size_t)
            (slot->slot_end_lsn.offset - slot->slot_start_offset);
        WT_ERR(__wt_write(session, slot->slot_fh,
            slot->slot_start_offset, write_size, slot->slot_buf.mem));
    }

    /*
     * Wait for earlier groups to finish, otherwise there could be holes
     * in the log file.
     */
    while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0)
        __wt_yield();
    log->write_lsn = slot->slot_end_lsn;
    /*
     * Try to consolidate calls to fsync to wait less.  Acquire a spin lock
     * so that threads finishing writing to the log will wait while the
     * current fsync completes and advance log->write_lsn.
     */
    while (F_ISSET(slot, SLOT_SYNC) &&
        LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
        if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
            (void)__wt_cond_wait(
                session, log->log_sync_cond, 10000);
            continue;
        }
        /*
         * Record the current end of log after we grabbed the lock.
         * That is how far our fsync call will guarantee.
         */
        sync_lsn = log->write_lsn;
        if (LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
            WT_STAT_FAST_CONN_INCR(session, log_sync);
            ret = __wt_fsync(session, log->log_fh);
            if (ret == 0) {
                F_CLR(slot, SLOT_SYNC);
                log->sync_lsn = sync_lsn;
                ret = __wt_cond_signal(
                    session, log->log_sync_cond);
            }
        }
        __wt_spin_unlock(session, &log->log_sync_lock);
        WT_ERR(ret);
    }
    /* Double the slot buffer if a writer asked for more room. */
    if (F_ISSET(slot, SLOT_BUF_GROW)) {
        WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
        F_CLR(slot, SLOT_BUF_GROW);
        WT_STAT_FAST_CONN_INCRV(session,
            log_buffer_size, slot->slot_buf.memsize);
        WT_ERR(__wt_buf_grow(session,
            &slot->slot_buf, slot->slot_buf.memsize * 2));
    }
    /*
     * If we have a file to close, close it now.
     */
    if (close_fh)
        WT_ERR(__wt_close(session, close_fh));

    /* Record any error in the slot so waiters can see it. */
err:    if (ret != 0 && slot->slot_error == 0)
        slot->slot_error = ret;
    return (ret);
}
/* * __log_file_server -- * The log file server thread. This worker thread manages * log file operations such as closing and syncing. */ static WT_THREAD_RET __log_file_server(void *arg) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_FH *close_fh; WT_LOG *log; WT_LSN close_end_lsn, min_lsn; WT_SESSION_IMPL *session; uint32_t filenum; bool locked; session = arg; conn = S2C(session); log = conn->log; locked = false; while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { /* * If there is a log file to close, make sure any outstanding * write operations have completed, then fsync and close it. */ if ((close_fh = log->log_close_fh) != NULL) { WT_ERR(__wt_log_extract_lognum(session, close_fh->name, &filenum)); /* * We update the close file handle before updating the * close LSN when changing files. It is possible we * could see mismatched settings. If we do, yield * until it is set. This should rarely happen. */ while (log->log_close_lsn.l.file < filenum) __wt_yield(); if (__wt_log_cmp( &log->write_lsn, &log->log_close_lsn) >= 0) { /* * We've copied the file handle, clear out the * one in the log structure to allow it to be * set again. Copy the LSN before clearing * the file handle. * Use a barrier to make sure the compiler does * not reorder the following two statements. */ close_end_lsn = log->log_close_lsn; WT_FULL_BARRIER(); log->log_close_fh = NULL; /* * Set the close_end_lsn to the LSN immediately * after ours. That is, the beginning of the * next log file. We need to know the LSN * file number of our own close in case earlier * calls are still in progress and the next one * to move the sync_lsn into the next file for * later syncs. */ WT_ERR(__wt_fsync(session, close_fh, true)); /* * We want to have the file size reflect actual * data with minimal pre-allocated zeroed space. * We can't truncate the file during hot backup, * or the underlying file system may not support * truncate: both are OK, it's just more work * during cursor traversal. 
*/ if (!conn->hot_backup) { __wt_readlock( session, conn->hot_backup_lock); if (!conn->hot_backup) WT_ERR_ERROR_OK( __wt_ftruncate(session, close_fh, close_end_lsn.l.offset), ENOTSUP); __wt_readunlock( session, conn->hot_backup_lock); } WT_SET_LSN(&close_end_lsn, close_end_lsn.l.file + 1, 0); __wt_spin_lock(session, &log->log_sync_lock); locked = true; WT_ERR(__wt_close(session, &close_fh)); WT_ASSERT(session, __wt_log_cmp( &close_end_lsn, &log->sync_lsn) >= 0); log->sync_lsn = close_end_lsn; __wt_cond_signal(session, log->log_sync_cond); locked = false; __wt_spin_unlock(session, &log->log_sync_lock); } } /* * If a later thread asked for a background sync, do it now. */ if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) { /* * Save the latest write LSN which is the minimum * we will have written to disk. */ min_lsn = log->write_lsn; /* * We have to wait until the LSN we asked for is * written. If it isn't signal the wrlsn thread * to get it written. * * We also have to wait for the written LSN and the * sync LSN to be in the same file so that we know we * have synchronized all earlier log files. */ if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) { /* * If the sync file is behind either the one * wanted for a background sync or the write LSN * has moved to another file continue to let * this worker thread process that older file * immediately. */ if ((log->sync_lsn.l.file < log->bg_sync_lsn.l.file) || (log->sync_lsn.l.file < min_lsn.l.file)) continue; WT_ERR(__wt_fsync(session, log->log_fh, true)); __wt_spin_lock(session, &log->log_sync_lock); locked = true; /* * The sync LSN could have advanced while we * were writing to disk. 
*/ if (__wt_log_cmp( &log->sync_lsn, &min_lsn) <= 0) { WT_ASSERT(session, min_lsn.l.file == log->sync_lsn.l.file); log->sync_lsn = min_lsn; __wt_cond_signal( session, log->log_sync_cond); } locked = false; __wt_spin_unlock(session, &log->log_sync_lock); } else { __wt_cond_auto_signal( session, conn->log_wrlsn_cond); /* * We do not want to wait potentially a second * to process this. Yield to give the wrlsn * thread a chance to run and try again in * this case. */ __wt_yield(); continue; } } /* Wait until the next event. */ __wt_cond_wait(session, conn->log_file_cond, WT_MILLION / 10); } if (0) { err: __wt_err(session, ret, "log close server error"); } if (locked) __wt_spin_unlock(session, &log->log_sync_lock); return (WT_THREAD_RET_VALUE); }
/*
 * __wt_lsm_merge_worker --
 *	The merge worker thread for an LSM tree, responsible for merging
 *	on-disk trees.
 *
 *	Each pass: help with a pending chunk switch, try to create a Bloom
 *	filter, then try to merge, and (worker 0 only) free old chunks.
 *	When no progress is made, accumulates stall time and ramps up the
 *	"aggressive" merge level based on how many chunks should have been
 *	filled during the stall.
 */
void *
__wt_lsm_merge_worker(void *vargs)
{
    WT_DECL_RET;
    WT_LSM_WORKER_ARGS *args;
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;
    u_int aggressive, chunk_wait, id, old_aggressive, stallms;
    int progress;

    /* The args structure is only needed to find our session; free it. */
    args = vargs;
    lsm_tree = args->lsm_tree;
    id = args->id;
    session = lsm_tree->worker_sessions[id];
    __wt_free(session, args);

    aggressive = stallms = 0;

    while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
        /*
         * Help out with switching chunks in case the checkpoint worker
         * is busy.
         */
        if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
            WT_WITH_SCHEMA_LOCK(session,
                ret = __wt_lsm_tree_switch(session, lsm_tree));
            WT_ERR(ret);
        }
        progress = 0;

        /* Clear any state from previous worker thread iterations. */
        session->dhandle = NULL;

        /* Try to create a Bloom filter. */
        if (__lsm_bloom_work(session, lsm_tree) == 0)
            progress = 1;

        /* If we didn't create a Bloom filter, try to merge. */
        if (progress == 0 &&
            __wt_lsm_merge(session, lsm_tree, id, aggressive) == 0)
            progress = 1;

        /* Clear any state from previous worker thread iterations. */
        WT_CLEAR_BTREE_IN_SESSION(session);

        /*
         * Only have one thread freeing old chunks, and only if there
         * are chunks to free.
         */
        if (id == 0 && lsm_tree->nold_chunks > 0 &&
            __lsm_free_chunks(session, lsm_tree) == 0)
            progress = 1;

        if (progress)
            stallms = 0;
        else if (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
            !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
            /* Poll 10 times per second. */
            WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
                session, lsm_tree->work_cond, 100000));
            stallms += 100;

            /*
             * Get aggressive if more than enough chunks for a
             * merge should have been created while we waited.
             * Use 10 seconds as a default if we don't have an
             * estimate.
             */
            chunk_wait = stallms / (lsm_tree->chunk_fill_ms == 0 ?
                10000 : lsm_tree->chunk_fill_ms);
            old_aggressive = aggressive;
            aggressive = chunk_wait / lsm_tree->merge_min;

            if (aggressive > old_aggressive)
                WT_VERBOSE_ERR(session, lsm,
                    "LSM merge got aggressive (%u), "
                    "%u / %" PRIu64,
                    aggressive, stallms,
                    lsm_tree->chunk_fill_ms);
        }
    }

    if (0) {
err:        __wt_err(session, ret, "LSM merge worker failed");
    }

    return (NULL);
}
/* * __wt_lsm_checkpoint_worker -- * A worker thread for an LSM tree, responsible for flushing new chunks to * disk. */ void * __wt_lsm_checkpoint_worker(void *arg) { WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; WT_LSM_WORKER_COOKIE cookie; WT_SESSION_IMPL *session; WT_TXN_ISOLATION saved_isolation; u_int i, j; int locked; lsm_tree = arg; session = lsm_tree->ckpt_session; WT_CLEAR(cookie); while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) { if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) { WT_WITH_SCHEMA_LOCK(session, ret = __wt_lsm_tree_switch(session, lsm_tree)); WT_ERR(ret); } WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0)); /* Write checkpoints in all completed files. */ for (i = 0, j = 0; i < cookie.nchunks - 1; i++) { if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) goto err; if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) break; chunk = cookie.chunk_array[i]; /* Stop if a running transaction needs the chunk. */ __wt_txn_update_oldest(session); if (!__wt_txn_visible_all(session, chunk->txnid_max)) break; /* * If the chunk is already checkpointed, make sure it * is also evicted. Either way, there is no point * trying to checkpoint it again. */ if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK)) { if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_EVICTED)) continue; if ((ret = __lsm_discard_handle( session, chunk->uri, NULL)) == 0) F_SET_ATOMIC( chunk, WT_LSM_CHUNK_EVICTED); else if (ret == EBUSY) ret = 0; else WT_ERR_MSG(session, ret, "discard handle"); continue; } WT_VERBOSE_ERR(session, lsm, "LSM worker flushing %u", i); /* * Flush the file before checkpointing: this is the * expensive part in terms of I/O: do it without * holding the schema lock. * * Use the special eviction isolation level to avoid * interfering with an application checkpoint: we have * already checked that all of the updates in this * chunk are globally visible. * * !!! We can wait here for checkpoints and fsyncs to * complete, which can be a long time. 
* * Don't keep waiting for the lock if application * threads are waiting for a switch. Don't skip * flushing the leaves either: that just means we'll * hold the schema lock for (much) longer, which blocks * the world. */ WT_ERR(__wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)); for (locked = 0; !locked && ret == 0 && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);) { if ((ret = __wt_spin_trylock(session, &S2C(session)->checkpoint_lock)) == 0) locked = 1; else if (ret == EBUSY) { __wt_yield(); ret = 0; } } if (locked) { saved_isolation = session->txn.isolation; session->txn.isolation = TXN_ISO_EVICTION; ret = __wt_bt_cache_op( session, NULL, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; __wt_spin_unlock( session, &S2C(session)->checkpoint_lock); } WT_TRET(__wt_session_release_btree(session)); WT_ERR(ret); if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) break; WT_VERBOSE_ERR(session, lsm, "LSM worker checkpointing %u", i); WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)); if (ret != 0) { __wt_err(session, ret, "LSM checkpoint"); break; } WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); /* * Clear the "cache resident" flag so the primary can * be evicted and eventually closed. Only do this once * the checkpoint has succeeded: otherwise, accessing * the leaf page during the checkpoint can trigger * forced eviction. */ WT_ERR(__wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)); __wt_btree_evictable(session, 1); WT_ERR(__wt_session_release_btree(session)); ++j; WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1)); F_SET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; /* Update the throttle time. */ __wt_lsm_tree_throttle(session, lsm_tree); WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree)); /* Make sure we aren't pinning a transaction ID. 
*/ __wt_txn_release_snapshot(session); if (ret != 0) { __wt_err(session, ret, "LSM checkpoint metadata write"); break; } WT_VERBOSE_ERR(session, lsm, "LSM worker checkpointed %u", i); } __lsm_unpin_chunks(session, &cookie); if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) WT_ERR_TIMEDOUT_OK(__wt_cond_wait( session, lsm_tree->work_cond, 100000)); } err: __lsm_unpin_chunks(session, &cookie); __wt_free(session, cookie.chunk_array); /* * The thread will only exit with failure if we run out of memory or * there is some other system driven failure. We can't keep going * after such a failure - ensure WiredTiger shuts down. */ if (ret != 0 && ret != WT_NOTFOUND) WT_PANIC_ERR(session, ret, "Shutting down LSM checkpoint utility thread"); return (NULL); }