/*
 * __wt_async_op_enqueue --
 *    Enqueue an operation onto the work queue.
 */
int
__wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op)
{
    WT_ASYNC *async;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    uint64_t cur_head, cur_tail, my_alloc, my_slot;
#ifdef HAVE_DIAGNOSTIC
    WT_ASYNC_OP_IMPL *my_op;
#endif

    conn = S2C(session);
    async = conn->async;

    /*
     * If an application re-uses a WT_ASYNC_OP, we end up here with an
     * invalid object.
     */
    if (op->state != WT_ASYNCOP_READY)
        WT_RET_MSG(session, EINVAL,
            "application error: WT_ASYNC_OP already in use");

    /*
     * Enqueue op at the tail of the work queue.
     * We get our slot in the ring buffer to use.
     */
    my_alloc = WT_ATOMIC_ADD8(async->alloc_head, 1);
    my_slot = my_alloc % async->async_qsize;

    /*
     * Make sure we haven't wrapped around the queue.
     * If so, wait for the tail to advance off this slot.
     */
    WT_ORDERED_READ(cur_tail, async->tail_slot);
    while (cur_tail == my_slot) {
        __wt_yield();
        WT_ORDERED_READ(cur_tail, async->tail_slot);
    }

#ifdef HAVE_DIAGNOSTIC
    WT_ORDERED_READ(my_op, async->async_queue[my_slot]);
    if (my_op != NULL)
        return (__wt_panic(session));
#endif
    WT_PUBLISH(async->async_queue[my_slot], op);
    op->state = WT_ASYNCOP_ENQUEUED;
    if (WT_ATOMIC_ADD4(async->cur_queue, 1) > async->max_queue)
        WT_PUBLISH(async->max_queue, async->cur_queue);
    /*
     * Multiple threads may be adding ops to the queue.  We need to wait
     * our turn to make our slot visible to workers.
     */
    WT_ORDERED_READ(cur_head, async->head);
    while (cur_head != (my_alloc - 1)) {
        __wt_yield();
        WT_ORDERED_READ(cur_head, async->head);
    }
    WT_PUBLISH(async->head, my_alloc);
    return (ret);
}
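/*
 * The enqueue above combines three lock-free steps: claim a ticket with an
 * atomic increment, spin until the consumer's tail has left the target slot,
 * then publish tickets strictly in allocation order.  Below is a minimal
 * stand-alone sketch of the same protocol using C11 atomics; every name in
 * it is hypothetical and none of it is WiredTiger code.
 */
#include <sched.h>
#include <stdatomic.h>

#define RING_QSIZE 64                        /* Ring size. */

static _Atomic(void *) ring[RING_QSIZE];     /* One payload per slot. */
static atomic_ulong ring_alloc_head;         /* Next ticket to hand out. */
static atomic_ulong ring_head;               /* Last ticket made visible. */
static atomic_ulong ring_tail_slot;          /* Slot the consumer reads next. */

static void
ring_enqueue(void *payload)
{
    unsigned long my_alloc, my_slot;

    /* Claim a unique ticket; the slot is the ticket modulo ring size. */
    my_alloc = atomic_fetch_add(&ring_alloc_head, 1) + 1;
    my_slot = my_alloc % RING_QSIZE;

    /* If the ring has wrapped, wait for the consumer to leave our slot. */
    while (atomic_load(&ring_tail_slot) == my_slot)
        sched_yield();

    /* Store the payload with release semantics before publication. */
    atomic_store_explicit(&ring[my_slot], payload, memory_order_release);

    /* Publish tickets in order so consumers never observe a gap. */
    while (atomic_load(&ring_head) != my_alloc - 1)
        sched_yield();
    atomic_store(&ring_head, my_alloc);
}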
/*
 * fop --
 *    File operation function.
 */
static void *
fop(void *arg)
{
    STATS *s;
    uintptr_t id;
    WT_RAND_STATE rnd;
    u_int i;

    id = (uintptr_t)arg;
    __wt_yield();        /* Get all the threads created. */

    s = &run_stats[id];
    __wt_random_init(&rnd);

    for (i = 0; i < nops; ++i, __wt_yield())
        switch (__wt_random(&rnd) % 10) {
        case 0:
            ++s->bulk;
            obj_bulk();
            break;
        case 1:
            ++s->create;
            obj_create();
            break;
        case 2:
            ++s->cursor;
            obj_cursor();
            break;
        case 3:
            ++s->drop;
            obj_drop(__wt_random(&rnd) & 1);
            break;
        case 4:
            ++s->ckpt;
            obj_checkpoint();
            break;
        case 5:
            ++s->upgrade;
            obj_upgrade();
            break;
        case 6:
            ++s->rebalance;
            obj_rebalance();
            break;
        case 7:
            ++s->verify;
            obj_verify();
            break;
        case 8:
            ++s->bulk_unique;
            obj_bulk_unique(__wt_random(&rnd) & 1);
            break;
        case 9:
            ++s->create_unique;
            obj_create_unique(__wt_random(&rnd) & 1);
            break;
        }

    return (NULL);
}
static void
op(WT_SESSION *session, WT_RAND_STATE *rnd, WT_CURSOR **cpp)
{
    WT_CURSOR *cursor;
    WT_DECL_RET;
    u_int i, key;
    char buf[128];
    bool readonly;

    /* Close any open cursor in the slot we're about to reuse. */
    if (*cpp != NULL) {
        testutil_check((*cpp)->close(*cpp));
        *cpp = NULL;
    }

    cursor = NULL;
    readonly = __wt_random(rnd) % 2 == 0;

    /* Loop to open an object handle. */
    for (i = __wt_random(rnd) % uris; !done; __wt_yield()) {
        /* Use a checkpoint handle for 50% of reads. */
        ret = session->open_cursor(session, uri_list[i], NULL,
            readonly && (i % 2 == 0) ?
            "checkpoint=WiredTigerCheckpoint" : NULL, &cursor);
        if (ret != EBUSY) {
            testutil_check(ret);
            break;
        }
        (void)__wt_atomic_add64(&worker_busy, 1);
    }
    if (cursor == NULL)
        return;

    /* Operate on some number of key/value pairs. */
    for (key = 1;
        !done && key < MAXKEY; key += __wt_random(rnd) % 37, __wt_yield()) {
        testutil_check(
            __wt_snprintf(buf, sizeof(buf), "key:%020u", key));
        cursor->set_key(cursor, buf);
        if (readonly)
            testutil_check(cursor->search(cursor));
        else {
            cursor->set_value(cursor, buf);
            testutil_check(cursor->insert(cursor));
        }
    }

    /* Close the cursor half the time, otherwise cache it. */
    if (__wt_random(rnd) % 2 == 0)
        testutil_check(cursor->close(cursor));
    else
        *cpp = cursor;

    (void)__wt_atomic_add64(&worker, 1);
}
/*
 * __log_wrlsn_server --
 *    The log wrlsn server thread.
 */
static WT_THREAD_RET
__log_wrlsn_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_SESSION_IMPL *session;
    int locked, yield;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    locked = yield = 0;
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        __wt_spin_lock(session, &log->log_slot_lock);
        locked = 1;
        WT_ERR(__wt_log_wrlsn(session, NULL, &yield));
        locked = 0;
        __wt_spin_unlock(session, &log->log_slot_lock);
        if (++yield < 1000)
            __wt_yield();
        else
            WT_ERR(__wt_cond_wait(
                session, conn->log_wrlsn_cond, 100000));
    }
    if (0) {
err:        __wt_err(session, ret, "log wrlsn server error");
    }
    if (locked)
        __wt_spin_unlock(session, &log->log_slot_lock);
    return (WT_THREAD_RET_VALUE);
}
/*
 * __wt_txn_global_shutdown --
 *    Shut down the global transaction state.
 */
int
__wt_txn_global_shutdown(WT_SESSION_IMPL *session)
{
    bool txn_active;

    /*
     * We're shutting down.  Make sure everything gets freed.
     *
     * It's possible that the eviction server is in the middle of a long
     * operation, with a transaction ID pinned.  In that case, we will
     * loop here until the transaction ID is released, when the oldest
     * transaction ID will catch up with the current ID.
     */
    for (;;) {
        WT_RET(__wt_txn_activity_check(session, &txn_active));
        if (!txn_active)
            break;

        WT_STAT_CONN_INCR(session, txn_release_blocked);
        __wt_yield();
    }

#ifdef HAVE_TIMESTAMPS
    /*
     * Now that all transactions have completed, no timestamps should be
     * pinned.
     */
    __wt_timestamp_set_inf(&S2C(session)->txn_global.pinned_timestamp);
#endif

    return (0);
}
/*
 * __lsm_tree_close --
 *    Close an LSM tree structure.
 */
static int
__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    int i;

    /* Stop any active merges. */
    F_CLR(lsm_tree, WT_LSM_TREE_ACTIVE);

    /*
     * Wait for all LSM operations and work units that were in flight to
     * finish.
     */
    for (i = 0; lsm_tree->refcnt > 1 || lsm_tree->queue_ref > 0; ++i) {
        /*
         * Remove any work units from the manager queues.  Do this step
         * repeatedly in case a work unit was in the process of being
         * created when we cleared the active flag.
         *
         * !! Drop the schema lock whilst completing this step so that
         * we don't block any operations that require the schema
         * lock to complete.  This is safe because any operation that
         * is closing the tree should first have gotten exclusive
         * access to the LSM tree via __wt_lsm_tree_get, so other
         * schema level operations will return EBUSY, even though
         * we're dropping the schema lock here.
         */
        if (i % 1000 == 0) {
            WT_WITHOUT_SCHEMA_LOCK(session, ret =
                __wt_lsm_manager_clear_tree(session, lsm_tree));
            WT_RET(ret);
        }
        __wt_yield();
    }
    return (0);
}
/*
 * __spin_lock_next_id --
 *    Return the next spinlock caller ID.
 */
static int
__spin_lock_next_id(WT_SESSION_IMPL *session, int *idp)
{
    static int lock_id = 0, next_id = 0;
    WT_DECL_RET;

    /* If we've ever registered this location, we already have an ID. */
    if (*idp != WT_SPINLOCK_REGISTER)
        return (0);

    /*
     * We can't use the global spinlock to lock the ID allocation (duh!),
     * use a CAS instruction to serialize access to a local variable.
     * This work only gets done once per library instantiation, there
     * isn't a performance concern.
     */
    while (!WT_ATOMIC_CAS(lock_id, 0, 1))
        __wt_yield();

    /* Allocate a blocking ID for this location. */
    if (*idp == WT_SPINLOCK_REGISTER) {
        if (next_id < WT_SPINLOCK_MAX_LOCATION_ID)
            *idp = next_id++;
        else
            WT_ERR_MSG(session, ENOMEM,
                "spinlock caller location registry failed, "
                "increase the connection's blocking matrix size");
    }

err:    WT_PUBLISH(lock_id, 0);
    return (ret);
}
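/*
 * The registry above serializes a rare, one-time piece of work with a single
 * CAS-protected word rather than a real lock.  A minimal stand-alone sketch
 * of that pattern with C11 atomics follows; the names are hypothetical, not
 * WiredTiger APIs.
 */
#include <sched.h>
#include <stdatomic.h>

static atomic_int id_lock;    /* 0 = free, 1 = held. */
static int id_next;           /* Protected by id_lock. */

/* Allocate the next unique ID, spinning on the CAS word. */
static int
id_register(void)
{
    int expected, id;

    /* Spin until we swap the lock word from 0 to 1. */
    for (expected = 0;
        !atomic_compare_exchange_weak(&id_lock, &expected, 1);
        expected = 0)
        sched_yield();

    id = id_next++;               /* Critical section. */

    atomic_store(&id_lock, 0);    /* Release; publishes id_next. */
    return (id);
}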
void
obj_bulk(void)
{
    WT_CURSOR *c;
    WT_SESSION *session;
    int ret;

    if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
        testutil_die(ret, "conn.session");

    if ((ret = session->create(session, uri, config)) != 0)
        if (ret != EEXIST && ret != EBUSY)
            testutil_die(ret, "session.create");

    if (ret == 0) {
        __wt_yield();
        if ((ret = session->open_cursor(
            session, uri, NULL, "bulk", &c)) == 0) {
            if ((ret = c->close(c)) != 0)
                testutil_die(ret, "cursor.close");
        } else if (ret != ENOENT && ret != EBUSY && ret != EINVAL)
            testutil_die(ret, "session.open_cursor bulk");
    }
    if ((ret = session->close(session, NULL)) != 0)
        testutil_die(ret, "session.close");
}
void
obj_create_unique(int force)
{
    WT_SESSION *session;
    int ret;
    char new_uri[64];

    if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
        testutil_die(ret, "conn.session");

    /* Generate a unique object name. */
    if ((ret = pthread_rwlock_wrlock(&single)) != 0)
        testutil_die(ret, "pthread_rwlock_wrlock single");
    testutil_check(__wt_snprintf(
        new_uri, sizeof(new_uri), "%s.%u", uri, ++uid));
    if ((ret = pthread_rwlock_unlock(&single)) != 0)
        testutil_die(ret, "pthread_rwlock_unlock single");

    if ((ret = session->create(session, new_uri, config)) != 0)
        testutil_die(ret, "session.create");

    __wt_yield();
    while ((ret = session->drop(
        session, new_uri, force ? "force" : NULL)) != 0)
        if (ret != EBUSY)
            testutil_die(ret, "session.drop: %s", new_uri);

    if ((ret = session->close(session, NULL)) != 0)
        testutil_die(ret, "session.close");
}
/*
 * __log_slot_find_free --
 *    Find and return a free log slot.
 */
static int
__log_slot_find_free(WT_SESSION_IMPL *session, WT_LOGSLOT **slot)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    uint32_t pool_i;

    conn = S2C(session);
    log = conn->log;
    WT_ASSERT(session, slot != NULL);

    /*
     * Encourage processing and moving the write LSN forward.
     * That process has to walk the slots anyway, so do that
     * work and let it give us the index of a free slot along
     * the way.
     */
    WT_RET(__wt_log_wrlsn(session, &pool_i, NULL));
    while (pool_i == WT_SLOT_POOL) {
        __wt_yield();
        WT_RET(__wt_log_wrlsn(session, &pool_i, NULL));
    }
    *slot = &log->slot_pool[pool_i];
    WT_ASSERT(session, (*slot)->slot_state == WT_LOG_SLOT_FREE);
    return (0);
}
/*
 * __wt_log_slot_switch --
 *    Switch out the current slot and set up a new one.
 */
int
__wt_log_slot_switch(WT_SESSION_IMPL *session,
    WT_MYSLOT *myslot, bool retry, bool forced, bool *did_work)
{
    WT_DECL_RET;
    WT_LOG *log;

    log = S2C(session)->log;

    /*
     * !!! Since the WT_WITH_SLOT_LOCK macro is a do-while loop, the
     * compiler does not like it combined directly with the while loop
     * here.
     *
     * The loop conditional is a bit complex.  We have to retry if we
     * closed the slot but were unable to set up a new slot.  In that
     * case the flag indicating we have closed the slot will still be
     * set.  We have to retry in that case regardless of the retry
     * setting because we are responsible for setting up the new slot.
     */
    do {
        WT_WITH_SLOT_LOCK(session, log,
            ret = __log_slot_switch_internal(
            session, myslot, forced, did_work));
        if (ret == EBUSY) {
            WT_STAT_CONN_INCR(session, log_slot_switch_busy);
            __wt_yield();
        }
        WT_RET(WT_SESSION_CHECK_PANIC(session));
        if (F_ISSET(S2C(session), WT_CONN_CLOSING))
            break;
    } while (F_ISSET(myslot, WT_MYSLOT_CLOSE) || (retry && ret == EBUSY));
    return (ret);
}
/*
 * Create a table and open a bulk cursor on it.
 */
void
op_bulk(void *arg)
{
    TEST_OPTS *opts;
    TEST_PER_THREAD_OPTS *args;
    WT_CURSOR *c;
    WT_SESSION *session;
    int ret;

    args = (TEST_PER_THREAD_OPTS *)arg;
    opts = args->testopts;

    testutil_check(
        opts->conn->open_session(opts->conn, NULL, NULL, &session));

    if ((ret = session->create(session,
        opts->uri, DEFAULT_TABLE_SCHEMA)) != 0)
        if (ret != EEXIST && ret != EBUSY)
            testutil_die(ret, "session.create");

    if (ret == 0) {
        __wt_yield();
        if ((ret = session->open_cursor(session,
            opts->uri, NULL, "bulk,checkpoint_wait=false", &c)) == 0) {
            testutil_check(c->close(c));
        } else if (ret != ENOENT && ret != EBUSY && ret != EINVAL)
            testutil_die(ret, "session.open_cursor bulk");
    }

    testutil_check(session->close(session, NULL));
    args->thread_counter++;
}
/*
 * __log_wrlsn_server --
 *    The log wrlsn server thread.
 */
static WT_THREAD_RET
__log_wrlsn_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_LSN prev;
    WT_SESSION_IMPL *session;
    int yield;
    bool did_work;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    yield = 0;
    WT_INIT_LSN(&prev);
    did_work = false;
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * Write out any log record buffers if anything was done
         * since last time.  Only call the function to walk the
         * slots if the system is not idle.  On an idle system
         * the alloc_lsn will not advance and the written lsn will
         * match the alloc_lsn.
         */
        if (__wt_log_cmp(&prev, &log->alloc_lsn) != 0 ||
            __wt_log_cmp(&log->write_lsn, &log->alloc_lsn) != 0)
            WT_ERR(__wt_log_wrlsn(session, &yield));
        else
            WT_STAT_FAST_CONN_INCR(session, log_write_lsn_skip);
        prev = log->alloc_lsn;
        if (yield == 0)
            did_work = true;
        else
            did_work = false;
        /*
         * If __wt_log_wrlsn did work we want to yield instead of
         * sleep.
         */
        if (yield++ < WT_THOUSAND)
            __wt_yield();
        else
            /*
             * Send in false because if we did any work we would
             * not be on this path.
             */
            WT_ERR(__wt_cond_auto_wait(
                session, conn->log_wrlsn_cond, did_work));
    }
    /*
     * On close we need to do this one more time because there could
     * be straggling log writes that need to be written.
     */
    WT_ERR(__wt_log_force_write(session, 1, NULL));
    WT_ERR(__wt_log_wrlsn(session, NULL));
    if (0) {
err:        __wt_err(session, ret, "log wrlsn server error");
    }
    return (WT_THREAD_RET_VALUE);
}
/*
 * __wt_delete_page_rollback --
 *    Abort pages that were deleted without being instantiated.
 */
void
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
{
    WT_UPDATE **upd;

    /*
     * If the page is still "deleted", it's as we left it, reset the state
     * to on-disk and we're done.  Otherwise, we expect the page is either
     * instantiated or being instantiated.  Loop because it's possible for
     * the page to return to the deleted state if instantiation fails.
     */
    for (;; __wt_yield())
        switch (ref->state) {
        case WT_REF_DISK:
        case WT_REF_READING:
            WT_ASSERT(session, 0);        /* Impossible, assert */
            break;
        case WT_REF_DELETED:
            /*
             * If the page is still "deleted", it's as we left it,
             * reset the state.
             */
            if (__wt_atomic_casv32(
                &ref->state, WT_REF_DELETED, WT_REF_DISK))
                return;
            break;
        case WT_REF_LOCKED:
            /*
             * A possible state, the page is being instantiated.
             */
            break;
        case WT_REF_MEM:
        case WT_REF_SPLIT:
            /*
             * We can't use the normal read path to get a copy of
             * the page because the session may have closed the
             * cursor, we no longer have the reference to the tree
             * required for a hazard pointer.  We're safe because
             * with unresolved transactions, the page isn't going
             * anywhere.
             *
             * The page is in an in-memory state, walk the list of
             * update structures and abort them.
             */
            for (upd =
                ref->page_del->update_list; *upd != NULL; ++upd)
                (*upd)->txnid = WT_TXN_ABORTED;

            /*
             * Discard the memory, the transaction can't abort
             * twice.
             */
            __wt_free(session, ref->page_del->update_list);
            __wt_free(session, ref->page_del);
            return;
        }
}
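/*
 * The rollback loop above is a small state machine driven by compare-and-
 * swap: each pass either wins the DELETED -> DISK transition, handles the
 * in-memory cases, or yields while instantiation is in flight.  A stripped-
 * down sketch of that shape using C11 atomics, with hypothetical state names
 * and none of the WiredTiger semantics:
 */
#include <sched.h>
#include <stdatomic.h>

enum ref_state { STATE_DISK, STATE_DELETED, STATE_LOCKED, STATE_MEM };

static void
rollback_deleted(_Atomic(enum ref_state) *state)
{
    enum ref_state expected;

    for (;; sched_yield()) {
        expected = STATE_DELETED;
        /* Still deleted: atomically reset to on-disk, done. */
        if (atomic_compare_exchange_strong(state, &expected, STATE_DISK))
            return;
        /* Instantiated: the caller aborts the in-memory updates instead. */
        if (expected == STATE_MEM)
            return;
        /* STATE_LOCKED: instantiation in flight, wait and retry. */
    }
}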
/*
 * __wt_log_slot_new --
 *    Find a free slot and switch it as the new active slot.
 *    Must be called holding the slot lock.
 */
int
__wt_log_slot_new(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    WT_LOGSLOT *slot;
    int32_t i;

    WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
    conn = S2C(session);
    log = conn->log;
    /*
     * Although this function is single threaded, multiple threads could
     * be trying to set a new active slot sequentially.  If we find an
     * active slot that is valid, return.
     */
    if ((slot = log->active_slot) != NULL &&
        WT_LOG_SLOT_OPEN(slot->slot_state))
        return (0);

    /*
     * Keep trying until we can find a free slot.
     */
    for (;;) {
        /*
         * For now just restart at 0.  We could use log->pool_index
         * if that is inefficient.
         */
        for (i = 0; i < WT_SLOT_POOL; i++) {
            slot = &log->slot_pool[i];
            if (slot->slot_state == WT_LOG_SLOT_FREE) {
                /*
                 * Acquire our starting position in the
                 * log file.  Assume the full buffer size.
                 */
                WT_RET(__wt_log_acquire(session,
                    log->slot_buf_size, slot));
                /*
                 * We have a new, initialized slot to use.
                 * Set it as the active slot.
                 */
                WT_STAT_FAST_CONN_INCR(session,
                    log_slot_transitions);
                log->active_slot = slot;
                return (0);
            }
        }
        /*
         * If we didn't find any free slots signal the worker thread.
         */
        (void)__wt_cond_signal(session, conn->log_wrlsn_cond);
        __wt_yield();
    }
    /* NOTREACHED */
}
/*
 * __page_refp --
 *    Return the page's index and slot for a reference.
 */
static inline void
__page_refp(WT_SESSION_IMPL *session,
    WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
{
    WT_PAGE_INDEX *pindex;
    uint32_t i;

    /*
     * Copy the parent page's index value: the page can split at any time,
     * but the index's value is always valid, even if it's not up-to-date.
     */
retry:    WT_INTL_INDEX_GET(session, ref->home, pindex);

    /*
     * Use the page's reference hint: it should be correct unless the page
     * split before our slot.  If the page splits after our slot, the hint
     * will point earlier in the array than our actual slot, so the first
     * loop is from the hint to the end of the list, and the second loop
     * is from the start of the list to the end of the list.  (The second
     * loop overlaps the first, but that only happens in cases where we've
     * deepened the tree and aren't going to find our slot at all, that's
     * not worth optimizing.)
     *
     * It's not an error for the reference hint to be wrong, it just means
     * the first retrieval (which sets the hint for subsequent retrievals)
     * is slower.
     */
    i = ref->pindex_hint;
    if (i < pindex->entries && pindex->index[i]->page == ref->page) {
        *pindexp = pindex;
        *slotp = i;
        return;
    }
    while (++i < pindex->entries)
        if (pindex->index[i]->page == ref->page) {
            *pindexp = pindex;
            *slotp = ref->pindex_hint = i;
            return;
        }
    for (i = 0; i < pindex->entries; ++i)
        if (pindex->index[i]->page == ref->page) {
            *pindexp = pindex;
            *slotp = ref->pindex_hint = i;
            return;
        }

    /*
     * If we don't find our reference, the page split into a new level and
     * our home pointer references the wrong page.  After internal pages
     * deepen, their reference structure home values are updated; yield
     * and wait for that to happen.
     */
    __wt_yield();
    goto retry;
}
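/*
 * The hint-then-scan lookup above generalizes to any array where a cached
 * hint is usually exact and entries can move.  A minimal stand-alone sketch
 * with hypothetical names and no WiredTiger types:
 */
#include <stdbool.h>
#include <stdint.h>

static bool
hint_find(void *const *array,
    uint32_t entries, void *target, uint32_t *hintp, uint32_t *slotp)
{
    uint32_t i;

    /* Try the cached hint first: the common, O(1) case. */
    i = *hintp;
    if (i < entries && array[i] == target) {
        *slotp = i;
        return (true);
    }
    /* The hint can lag behind the entry: scan forward from the hint. */
    while (++i < entries)
        if (array[i] == target) {
            *slotp = *hintp = i;
            return (true);
        }
    /* Fall back to a full scan before giving up. */
    for (i = 0; i < entries; ++i)
        if (array[i] == target) {
            *slotp = *hintp = i;
            return (true);
        }
    return (false);        /* Caller yields and retries. */
}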
/*
 * Create a guaranteed unique table and open and close a bulk cursor on it.
 */
void
op_bulk_unique(void *arg)
{
    TEST_OPTS *opts;
    TEST_PER_THREAD_OPTS *args;
    WT_CURSOR *c;
    WT_RAND_STATE rnd;
    WT_SESSION *session;
    int ret;
    char new_uri[64];

    args = (TEST_PER_THREAD_OPTS *)arg;
    opts = args->testopts;
    __wt_random_init_seed(NULL, &rnd);

    testutil_check(
        opts->conn->open_session(opts->conn, NULL, NULL, &session));

    /* Generate a unique object name. */
    testutil_check(__wt_snprintf(new_uri, sizeof(new_uri),
        "%s.%" PRIu64, opts->uri, __wt_atomic_add64(&opts->unique_id, 1)));

    testutil_check(session->create(session, new_uri, DEFAULT_TABLE_SCHEMA));

    __wt_yield();
    /*
     * Opening a bulk cursor may have raced with a forced checkpoint
     * which created a checkpoint of the empty file, and triggers an
     * EINVAL.
     */
    if ((ret = session->open_cursor(
        session, new_uri, NULL, "bulk,checkpoint_wait=false", &c)) == 0) {
        testutil_check(c->close(c));
    } else if (ret != EINVAL && ret != EBUSY)
        testutil_die(ret,
            "session.open_cursor bulk unique: %s", new_uri);

    while ((ret = session->drop(session, new_uri, __wt_random(&rnd) & 1 ?
        "force,checkpoint_wait=false" : "checkpoint_wait=false")) != 0)
        if (ret != EBUSY)
            testutil_die(ret, "session.drop: %s", new_uri);
        else
            /*
             * The EBUSY is expected when we run with
             * checkpoint_wait set to false, so we increment the
             * counter while in this loop to avoid false positives.
             */
            args->thread_counter++;

    testutil_check(session->close(session, NULL));
    args->thread_counter++;
}
/*
 * __wt_log_slot_wait --
 *    Wait for slot leader to allocate log area and tell us our log offset.
 */
int
__wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
    int yield_count;

    yield_count = 0;
    WT_UNUSED(session);

    while (slot->slot_state > WT_LOG_SLOT_DONE)
        if (++yield_count < 1000)
            __wt_yield();
        else
            __wt_sleep(0, 200);
    return (0);
}
void
obj_bulk_unique(int force)
{
    WT_CURSOR *c;
    WT_SESSION *session;
    int ret;
    char new_uri[64];

    if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
        testutil_die(ret, "conn.session");

    /* Generate a unique object name. */
    if ((ret = pthread_rwlock_wrlock(&single)) != 0)
        testutil_die(ret, "pthread_rwlock_wrlock single");
    testutil_check(__wt_snprintf(
        new_uri, sizeof(new_uri), "%s.%u", uri, ++uid));
    if ((ret = pthread_rwlock_unlock(&single)) != 0)
        testutil_die(ret, "pthread_rwlock_unlock single");

    if ((ret = session->create(session, new_uri, config)) != 0)
        testutil_die(ret, "session.create: %s", new_uri);

    __wt_yield();
    /*
     * Opening a bulk cursor may have raced with a forced checkpoint
     * which created a checkpoint of the empty file, and triggers an
     * EINVAL.
     */
    if ((ret = session->open_cursor(
        session, new_uri, NULL, "bulk", &c)) == 0) {
        if ((ret = c->close(c)) != 0)
            testutil_die(ret, "cursor.close");
    } else if (ret != EINVAL)
        testutil_die(ret,
            "session.open_cursor bulk unique: %s", new_uri);

    while ((ret = session->drop(
        session, new_uri, force ? "force" : NULL)) != 0)
        if (ret != EBUSY)
            testutil_die(ret, "session.drop: %s", new_uri);

    if ((ret = session->close(session, NULL)) != 0)
        testutil_die(ret, "session.close");
}
/*
 * __sync_dup_walk --
 *    Duplicate a tree walk point.
 */
static inline int
__sync_dup_walk(
    WT_SESSION_IMPL *session, WT_REF *walk, uint32_t flags, WT_REF **dupp)
{
    WT_REF *old;
    bool busy;

    if ((old = *dupp) != NULL) {
        *dupp = NULL;
        WT_RET(__wt_page_release(session, old, flags));
    }

    /* It is okay to duplicate a walk before it starts. */
    if (walk == NULL || __wt_ref_is_root(walk)) {
        *dupp = walk;
        return (0);
    }

    /* Get a duplicate hazard pointer. */
    for (;;) {
#ifdef HAVE_DIAGNOSTIC
        WT_RET(
            __wt_hazard_set(session, walk, &busy, __func__, __LINE__));
#else
        WT_RET(__wt_hazard_set(session, walk, &busy));
#endif
        /*
         * We already have a hazard pointer, we should generally be
         * able to get another one.  We can get spurious busy errors
         * (e.g., if eviction is attempting to lock the page).  Keep
         * trying: we have one hazard pointer so we should be able to
         * get another one.
         */
        if (!busy)
            break;
        __wt_yield();
    }

    *dupp = walk;
    return (0);
}
/*
 * Create and drop a guaranteed unique table.
 */
void
op_create_unique(void *arg)
{
    TEST_OPTS *opts;
    TEST_PER_THREAD_OPTS *args;
    WT_RAND_STATE rnd;
    WT_SESSION *session;
    int ret;
    char new_uri[64];

    args = (TEST_PER_THREAD_OPTS *)arg;
    opts = args->testopts;
    __wt_random_init_seed(NULL, &rnd);

    testutil_check(
        opts->conn->open_session(opts->conn, NULL, NULL, &session));

    /* Generate a unique object name. */
    testutil_check(__wt_snprintf(new_uri, sizeof(new_uri),
        "%s.%" PRIu64, opts->uri, __wt_atomic_add64(&opts->unique_id, 1)));
    testutil_check(session->create(session, new_uri, DEFAULT_TABLE_SCHEMA));

    __wt_yield();
    while ((ret = session->drop(session, new_uri, __wt_random(&rnd) & 1 ?
        "force,checkpoint_wait=false" : "checkpoint_wait=false")) != 0)
        if (ret != EBUSY)
            testutil_die(ret, "session.drop: %s", new_uri);
        else
            /*
             * The EBUSY is expected when we run with
             * checkpoint_wait set to false, so we increment the
             * counter while in this loop to avoid false positives.
             */
            args->thread_counter++;

    testutil_check(session->close(session, NULL));
    args->thread_counter++;
}
/*
 * __wt_txn_update_oldest --
 *    Sweep the running transactions to update the oldest ID required.
 * !!!
 * If a data-source is calling the WT_EXTENSION_API.transaction_oldest
 * method (for the oldest transaction ID not yet visible to a running
 * transaction), and then comparing that oldest ID against committed
 * transactions to see if updates for a committed transaction are still
 * visible to running transactions, the oldest transaction ID may be
 * the same as the last committed transaction ID, if the transaction
 * state wasn't refreshed after the last transaction committed.  Push
 * past the last committed transaction.
 */
void
__wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force)
{
    WT_CONNECTION_IMPL *conn;
    WT_SESSION_IMPL *oldest_session;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *s;
    uint64_t current_id, id, last_running, oldest_id, prev_oldest_id;
    uint32_t i, session_cnt;
    int32_t count;
    bool last_running_moved;

    conn = S2C(session);
    txn_global = &conn->txn_global;

retry:    current_id = last_running = txn_global->current;
    oldest_session = NULL;
    prev_oldest_id = txn_global->oldest_id;

    /*
     * For pure read-only workloads, or if the update isn't forced and the
     * oldest ID isn't too far behind, avoid scanning.
     */
    if (prev_oldest_id == current_id ||
        (!force && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
        return;

    /*
     * We're going to scan.  Increment the count of scanners to prevent
     * the oldest ID from moving forwards.  Spin if the count is negative,
     * which indicates that some thread is moving the oldest ID forwards.
     */
    do {
        if ((count = txn_global->scan_count) < 0)
            WT_PAUSE();
    } while (count < 0 ||
        !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1));

    /* The oldest ID cannot change until the scan count goes to zero. */
    prev_oldest_id = txn_global->oldest_id;
    current_id = oldest_id = last_running = txn_global->current;

    /* Walk the array of concurrent transactions. */
    WT_ORDERED_READ(session_cnt, conn->session_cnt);
    for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
        /*
         * Update the oldest ID.
         *
         * Ignore: IDs older than the oldest ID we saw.  This can
         * happen if we race with a thread that is allocating an ID
         * -- the ID will not be used because the thread will keep
         * spinning until it gets a valid one.
         */
        if ((id = s->id) != WT_TXN_NONE &&
            WT_TXNID_LE(prev_oldest_id, id) &&
            WT_TXNID_LT(id, last_running))
            last_running = id;

        /*
         * !!!
         * Note: Don't ignore snap_min values older than the previous
         * oldest ID.  Read-uncommitted operations publish snap_min
         * values without incrementing scan_count to protect the
         * global table.  See the comment in __wt_txn_cursor_op for
         * more details.
         */
        if ((id = s->snap_min) != WT_TXN_NONE &&
            WT_TXNID_LT(id, oldest_id)) {
            oldest_id = id;
            oldest_session = &conn->sessions[i];
        }
    }

    if (WT_TXNID_LT(last_running, oldest_id))
        oldest_id = last_running;

    /* The oldest ID can't move past any named snapshots. */
    if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE &&
        WT_TXNID_LT(id, oldest_id))
        oldest_id = id;

    /* Update the last running ID. */
    last_running_moved =
        WT_TXNID_LT(txn_global->last_running, last_running);

    /* Update the oldest ID. */
    if (WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) {
        /*
         * We know we want to update.  Check if we're racing.
         */
        if (__wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) {
            WT_ORDERED_READ(session_cnt, conn->session_cnt);
            for (i = 0, s = txn_global->states;
                i < session_cnt; i++, s++) {
                if ((id = s->id) != WT_TXN_NONE &&
                    WT_TXNID_LT(id, last_running))
                    last_running = id;
                if ((id = s->snap_min) != WT_TXN_NONE &&
                    WT_TXNID_LT(id, oldest_id))
                    oldest_id = id;
            }

            if (WT_TXNID_LT(last_running, oldest_id))
                oldest_id = last_running;

#ifdef HAVE_DIAGNOSTIC
            /*
             * Make sure the ID doesn't move past any named
             * snapshots.
             *
             * Don't include the read/assignment in the assert
             * statement.  Coverity complains if there are
             * assignments only done in diagnostic builds, and
             * when the read is from a volatile.
             */
            id = txn_global->nsnap_oldest_id;
            WT_ASSERT(session,
                id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
#endif
            if (WT_TXNID_LT(txn_global->last_running, last_running))
                txn_global->last_running = last_running;
            if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
                txn_global->oldest_id = oldest_id;
            WT_ASSERT(session, txn_global->scan_count == -1);
            txn_global->scan_count = 0;
        } else {
            /*
             * We wanted to update the oldest ID but we're racing
             * another thread.  Retry if this is a forced update.
             */
            WT_ASSERT(session, txn_global->scan_count > 0);
            (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
            if (force) {
                __wt_yield();
                goto retry;
            }
        }
    } else {
        if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
            current_id - oldest_id > 10000 && oldest_session != NULL) {
            (void)__wt_verbose(session, WT_VERB_TRANSACTION,
                "old snapshot %" PRIu64
                " pinned in session %d [%s]"
                " with snap_min %" PRIu64 "\n",
                oldest_id, oldest_session->id,
                oldest_session->lastop,
                oldest_session->txn.snap_min);
        }
        WT_ASSERT(session, txn_global->scan_count > 0);
        (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
    }
}
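/*
 * The scan_count handshake above is a small asymmetric reader/writer
 * protocol: scanners atomically increment a counter that must be non-
 * negative, and the updater may only flip the counter from 1 (itself as the
 * sole scanner) to -1, locking new scanners out while it publishes the new
 * oldest ID.  A compact stand-alone sketch with C11 atomics; all names are
 * hypothetical:
 */
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int scan_count;    /* > 0: scanners active; -1: updater active. */

/* Enter a shared scan, spinning while an exclusive update runs. */
static void
scan_enter(void)
{
    int count;

    do {
        while ((count = atomic_load(&scan_count)) < 0)
            sched_yield();
    } while (!atomic_compare_exchange_weak(&scan_count, &count, count + 1));
}

/* Leave a shared scan. */
static void
scan_leave(void)
{
    (void)atomic_fetch_sub(&scan_count, 1);
}

/* Try to upgrade: succeeds only if we are the sole scanner (1 -> -1). */
static bool
scan_try_upgrade(void)
{
    int expected = 1;

    return (atomic_compare_exchange_strong(&scan_count, &expected, -1));
}

/* Drop exclusive access, letting scanners back in. */
static void
scan_downgrade(void)
{
    atomic_store(&scan_count, 0);
}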
/*
 * __ref_descend_prev --
 *    Descend the tree one level, during a previous-cursor walk.
 */
static inline void
__ref_descend_prev(
    WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
{
    WT_PAGE_INDEX *pindex;
    uint64_t yield_count;

    /*
     * We're passed a child page into which we're descending, and on which
     * we have a hazard pointer.
     */
    for (yield_count = 0;; yield_count++, __wt_yield()) {
        /*
         * There's a split race when a cursor moving backwards through
         * the tree descends the tree.  If we're splitting an internal
         * page into its parent, we move the WT_REF structures and
         * update the parent's page index before updating the split
         * page's page index, and it's not an atomic update.  A thread
         * can read the parent page's replacement page index and then
         * read the split page's original index.
         *
         * This can create a race for previous-cursor movements.
         *
         * For example, imagine an internal page with 3 child pages,
         * with the namespaces a-f, g-h and i-j; the first child page
         * splits.  The parent starts out with the following
         * page-index:
         *
         *    | ... | a | g | i | ... |
         *
         * The split page starts out with the following page-index:
         *
         *    | a | b | c | d | e | f |
         *
         * The first step is to move the c-f ranges into a new subtree,
         * so, for example we might have two new internal pages 'c' and
         * 'e', where the new 'c' page references the c-d namespace and
         * the new 'e' page references the e-f namespace.  The top of
         * the subtree references the parent page, but until the
         * parent's page index is updated, any threads in the subtree
         * won't be able to ascend out of the subtree.  However, once
         * the parent page's page index is updated to this:
         *
         *    | ... | a | c | e | g | i | ... |
         *
         * threads in the subtree can ascend into the parent.  Imagine
         * a cursor in the c-d part of the namespace that ascends to
         * the parent's 'c' slot.  It would then decrement to the slot
         * before the 'c' slot, the 'a' slot.
         *
         * The previous-cursor movement selects the last slot in the
         * 'a' page; if the split page's page-index hasn't been updated
         * yet, it will select the 'f' slot, which is incorrect.  Once
         * the split page's page index is updated to this:
         *
         *    | a | b |
         *
         * the previous-cursor movement will select the 'b' slot, which
         * is correct.
         *
         * This function takes an argument which is the internal page
         * from which we're descending.  If the last slot on the page
         * no longer points to the current page as its "home", the page
         * is being split and part of its namespace moved.  We have the
         * correct page and we don't have to move, all we have to do is
         * wait until the split page's page index is updated.
         */
        WT_INTL_INDEX_GET(session, ref->page, pindex);
        if (pindex->index[pindex->entries - 1]->home == ref->page)
            break;
    }
    *pindexp = pindex;
    WT_STAT_CONN_INCRV(session, tree_descend_blocked, yield_count);
}
/*
 * __wt_page_in_func --
 *    Acquire a hazard pointer to a page; if the page is not in-memory,
 *    read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_PAGE *page;
    u_int sleep_cnt, wait_cnt;
    int busy, cache_work, force_attempts, oldgen, stalled;

    btree = S2BT(session);
    stalled = 0;

    for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) {
        switch (ref->state) {
        case WT_REF_DISK:
        case WT_REF_DELETED:
            if (LF_ISSET(WT_READ_CACHE))
                return (WT_NOTFOUND);

            /*
             * The page isn't in memory, read it.  If this thread is
             * allowed to do eviction work, check for space in the
             * cache.
             */
            if (!LF_ISSET(WT_READ_NO_EVICT))
                WT_RET(__wt_cache_eviction_check(
                    session, 1, NULL));
            WT_RET(__page_read(session, ref));
            oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
                F_ISSET(session, WT_SESSION_NO_CACHE);
            continue;
        case WT_REF_READING:
            if (LF_ISSET(WT_READ_CACHE))
                return (WT_NOTFOUND);
            if (LF_ISSET(WT_READ_NO_WAIT))
                return (WT_NOTFOUND);

            /* Waiting on another thread's read, stall. */
            WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
            stalled = 1;
            break;
        case WT_REF_LOCKED:
            if (LF_ISSET(WT_READ_NO_WAIT))
                return (WT_NOTFOUND);

            /* Waiting on eviction, stall. */
            WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
            stalled = 1;
            break;
        case WT_REF_SPLIT:
            return (WT_RESTART);
        case WT_REF_MEM:
            /*
             * The page is in memory.
             *
             * Get a hazard pointer if one is required.  We cannot
             * be evicting if no hazard pointer is required, we're
             * done.
             */
            if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
                goto skip_evict;

            /*
             * The expected reason we can't get a hazard pointer is
             * because the page is being evicted, yield, try again.
             */
#ifdef HAVE_DIAGNOSTIC
            WT_RET(
                __wt_hazard_set(session, ref, &busy, file, line));
#else
            WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
            if (busy) {
                WT_STAT_FAST_CONN_INCR(
                    session, page_busy_blocked);
                break;
            }

            /*
             * If eviction is configured for this file, check to see
             * if the page qualifies for forced eviction and update
             * the page's generation number.  If eviction isn't
             * being done on this file, we're done.
             */
            if (LF_ISSET(WT_READ_NO_EVICT) ||
                F_ISSET(session, WT_SESSION_NO_EVICTION) ||
                F_ISSET(btree, WT_BTREE_NO_EVICTION))
                goto skip_evict;

            /*
             * Forcibly evict pages that are too big.
             */
            page = ref->page;
            if (force_attempts < 10 &&
                __evict_force_check(session, page)) {
                ++force_attempts;
                ret = __wt_page_release_evict(session, ref);
                /* If forced eviction fails, stall. */
                if (ret == EBUSY) {
                    ret = 0;
                    WT_STAT_FAST_CONN_INCR(session,
                        page_forcible_evict_blocked);
                    stalled = 1;
                    break;
                }
                WT_RET(ret);

                /*
                 * The result of a successful forced eviction
                 * is a page-state transition (potentially to
                 * an in-memory page we can use, or a restart
                 * return for our caller), continue the outer
                 * page-acquisition loop.
                 */
                continue;
            }

            /*
             * If we read the page and we are configured to not
             * trash the cache, set the oldest read generation so
             * the page is forcibly evicted as soon as possible.
             *
             * Otherwise, update the page's read generation.
             */
            if (oldgen && page->read_gen == WT_READGEN_NOTSET)
                __wt_page_evict_soon(page);
            else if (!LF_ISSET(WT_READ_NO_GEN) &&
                page->read_gen != WT_READGEN_OLDEST &&
                page->read_gen < __wt_cache_read_gen(session))
                page->read_gen =
                    __wt_cache_read_gen_bump(session);
skip_evict:
            /*
             * Check if we need an autocommit transaction.
             * Starting a transaction can trigger eviction, so skip
             * it if eviction isn't permitted.
             */
            return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
                __wt_txn_autocommit_check(session));
        WT_ILLEGAL_VALUE(session);
        }

        /*
         * We failed to get the page -- yield before retrying, and if
         * we've yielded enough times, start sleeping so we don't burn
         * CPU to no purpose.
         */
        if (stalled)
            wait_cnt += 1000;
        else if (++wait_cnt < 1000) {
            __wt_yield();
            continue;
        }

        /*
         * If stalling and this thread is allowed to do eviction work,
         * check if the cache needs help.  If we do work for the cache,
         * substitute that for a sleep.
         */
        if (!LF_ISSET(WT_READ_NO_EVICT)) {
            WT_RET(
                __wt_cache_eviction_check(session, 1, &cache_work));
            if (cache_work)
                continue;
        }
        sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000);
        WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
        __wt_sleep(0, sleep_cnt);
    }
}
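/*
 * The wait logic at the bottom of the loop above escalates from free yields
 * to bounded sleeps so a stalled thread stops burning CPU.  A minimal
 * stand-alone sketch of that escalation; the names are hypothetical and the
 * thresholds are the ones used in the code above:
 */
#include <sched.h>
#include <stdint.h>
#include <unistd.h>

static void
backoff(uint64_t *wait_cntp, uint64_t *sleep_usp)
{
    /* Yield for the first thousand retries: contention is usually brief. */
    if (++*wait_cntp < 1000) {
        sched_yield();
        return;
    }
    /* After that, sleep increasing amounts, capped at 10ms per retry. */
    *sleep_usp = *sleep_usp + 1000 > 10000 ? 10000 : *sleep_usp + 1000;
    (void)usleep((useconds_t)*sleep_usp);
}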
/*
 * __log_wrlsn_server --
 *    The log wrlsn server thread.
 */
static WT_THREAD_RET
__log_wrlsn_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL];
    WT_LOGSLOT *slot;
    WT_SESSION_IMPL *session;
    size_t written_i;
    uint32_t i, save_i;
    int yield;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    yield = 0;
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * No need to use the log_slot_lock because the slot pool
         * is statically allocated and any slot in the
         * WT_LOG_SLOT_WRITTEN state is exclusively ours for now.
         */
        i = 0;
        written_i = 0;
        /*
         * Walk the array once saving any slots that are in the
         * WT_LOG_SLOT_WRITTEN state.
         */
        while (i < WT_SLOT_POOL) {
            save_i = i;
            slot = &log->slot_pool[i++];
            if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
                continue;
            written[written_i].slot_index = save_i;
            written[written_i++].lsn = slot->slot_release_lsn;
        }
        /*
         * If we found any written slots, process them.  We sort them
         * based on the release LSN, and then look for them in order.
         */
        if (written_i > 0) {
            yield = 0;
            WT_INSERTION_SORT(written, written_i,
                WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);

            /*
             * We know the written array is sorted by LSN.  Go
             * through them either advancing write_lsn or stop
             * as soon as one is not in order.
             */
            for (i = 0; i < written_i; i++) {
                if (WT_LOG_CMP(&log->write_lsn,
                    &written[i].lsn) != 0)
                    break;
                /*
                 * If we get here we have a slot to process.
                 * Advance the LSN and process the slot.
                 */
                slot = &log->slot_pool[written[i].slot_index];
                WT_ASSERT(session, WT_LOG_CMP(&written[i].lsn,
                    &slot->slot_release_lsn) == 0);
                log->write_start_lsn = slot->slot_start_lsn;
                log->write_lsn = slot->slot_end_lsn;
                WT_ERR(__wt_cond_signal(
                    session, log->log_write_cond));
                WT_STAT_FAST_CONN_INCR(session, log_write_lsn);

                /*
                 * Signal the close thread if needed.
                 */
                if (F_ISSET(slot, WT_SLOT_CLOSEFH))
                    WT_ERR(__wt_cond_signal(
                        session, conn->log_file_cond));
                WT_ERR(__wt_log_slot_free(session, slot));
            }
        }
        /*
         * If we saw a later write, we always want to yield because
         * we know something is in progress.
         */
        if (yield++ < 1000)
            __wt_yield();
        else
            /* Wait until the next event. */
            WT_ERR(__wt_cond_wait(
                session, conn->log_wrlsn_cond, 100000));
    }

    if (0) {
err:        __wt_err(session, ret, "log wrlsn server error");
    }
    return (WT_THREAD_RET_VALUE);
}
/*
 * __wt_connection_close --
 *    Close a connection handle.
 */
int
__wt_connection_close(WT_CONNECTION_IMPL *conn)
{
    WT_CONNECTION *wt_conn;
    WT_DECL_RET;
    WT_DLH *dlh;
    WT_SESSION_IMPL *s, *session;
    WT_TXN_GLOBAL *txn_global;
    u_int i;

    wt_conn = &conn->iface;
    txn_global = &conn->txn_global;
    session = conn->default_session;

    /*
     * We're shutting down.  Make sure everything gets freed.
     *
     * It's possible that the eviction server is in the middle of a long
     * operation, with a transaction ID pinned.  In that case, we will
     * loop here until the transaction ID is released, when the oldest
     * transaction ID will catch up with the current ID.
     */
    for (;;) {
        WT_TRET(__wt_txn_update_oldest(session,
            WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
        if (txn_global->oldest_id == txn_global->current)
            break;
        __wt_yield();
    }

    /* Clear any pending async ops. */
    WT_TRET(__wt_async_flush(session));

    /*
     * Shut down server threads other than the eviction server, which is
     * needed later to close btree handles.  Some of these threads access
     * btree handles, so take care in ordering shutdown to make sure they
     * exit before files are closed.
     */
    F_CLR(conn, WT_CONN_SERVER_RUN);
    WT_TRET(__wt_async_destroy(session));
    WT_TRET(__wt_lsm_manager_destroy(session));
    WT_TRET(__wt_sweep_destroy(session));

    F_SET(conn, WT_CONN_CLOSING);

    WT_TRET(__wt_checkpoint_server_destroy(session));
    WT_TRET(__wt_statlog_destroy(session, true));
    WT_TRET(__wt_evict_destroy(session));

    /* Shut down the lookaside table, after all eviction is complete. */
    WT_TRET(__wt_las_destroy(session));

    /* Close open data handles. */
    WT_TRET(__wt_conn_dhandle_discard(session));

    /* Shut down metadata tracking, required before creating tables. */
    WT_TRET(__wt_meta_track_destroy(session));

    /*
     * Now that all data handles are closed, tell logging that a
     * checkpoint has completed then shut down the log manager (only after
     * closing data handles).  The call to destroy the log manager is
     * outside the conditional because we allocate the log path so that
     * printlog can run without running logging or recovery.
     */
    if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
        FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE))
        WT_TRET(__wt_txn_checkpoint_log(
            session, true, WT_TXN_LOG_CKPT_STOP, NULL));
    F_CLR(conn, WT_CONN_LOG_SERVER_RUN);
    WT_TRET(__wt_logmgr_destroy(session));

    /* Free memory for collators, compressors, data sources. */
    WT_TRET(__wt_conn_remove_collator(session));
    WT_TRET(__wt_conn_remove_compressor(session));
    WT_TRET(__wt_conn_remove_data_source(session));
    WT_TRET(__wt_conn_remove_encryptor(session));
    WT_TRET(__wt_conn_remove_extractor(session));

    /* Disconnect from shared cache - must be before cache destroy. */
    WT_TRET(__wt_conn_cache_pool_destroy(session));

    /* Discard the cache. */
    WT_TRET(__wt_cache_destroy(session));

    /* Discard transaction state. */
    WT_TRET(__wt_txn_global_destroy(session));

    /* Close extensions, first calling any unload entry point. */
    while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) {
        TAILQ_REMOVE(&conn->dlhqh, dlh, q);

        if (dlh->terminate != NULL)
            WT_TRET(dlh->terminate(wt_conn));
        WT_TRET(__wt_dlclose(session, dlh));
    }

    /* Close the lock file, opening up the database to other connections. */
    if (conn->lock_fh != NULL)
        WT_TRET(__wt_close(session, &conn->lock_fh));

    /* Close any file handles left open. */
    WT_TRET(__wt_close_connection_close(session));

    /*
     * Close the internal (default) session, and switch back to the dummy
     * session in case of any error messages from the remaining operations
     * while destroying the connection handle.
     */
    if (session != &conn->dummy_session) {
        WT_TRET(session->iface.close(&session->iface, NULL));
        session = conn->default_session = &conn->dummy_session;
    }

    /*
     * The session's split stash isn't discarded during normal session
     * close because it may persist past the life of the session.  Discard
     * it now.
     */
    if ((s = conn->sessions) != NULL)
        for (i = 0; i < conn->session_size; ++s, ++i)
            __wt_split_stash_discard_all(session, s);

    /*
     * The session's hazard pointer memory isn't discarded during normal
     * session close because access to it isn't serialized.  Discard it
     * now.
     */
    if ((s = conn->sessions) != NULL)
        for (i = 0; i < conn->session_size; ++s, ++i) {
            /*
             * If hash arrays were allocated, free them now.
             */
            __wt_free(session, s->dhhash);
            __wt_free(session, s->tablehash);
            __wt_free(session, s->hazard);
        }

    /* Destroy the handle. */
    WT_TRET(__wt_connection_destroy(conn));

    return (ret);
}
/*
 * __log_release --
 *    Release a log slot.
 */
static int
__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *close_fh;
    WT_LOG *log;
    WT_LSN sync_lsn;
    size_t write_size;
    WT_DECL_SPINLOCK_ID(id);        /* Must appear last */

    conn = S2C(session);
    log = conn->log;

    /*
     * If we're going to have to close our log file, make a local copy
     * of the file handle structure.
     */
    close_fh = NULL;
    if (F_ISSET(slot, SLOT_CLOSEFH)) {
        close_fh = log->log_close_fh;
        log->log_close_fh = NULL;
        F_CLR(slot, SLOT_CLOSEFH);
    }

    /* Write the buffered records. */
    if (F_ISSET(slot, SLOT_BUFFERED)) {
        write_size = (size_t)
            (slot->slot_end_lsn.offset - slot->slot_start_offset);
        WT_ERR(__wt_write(session, slot->slot_fh,
            slot->slot_start_offset, write_size, slot->slot_buf.mem));
    }

    /*
     * Wait for earlier groups to finish, otherwise there could be holes
     * in the log file.
     */
    while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0)
        __wt_yield();
    log->write_lsn = slot->slot_end_lsn;
    /*
     * Try to consolidate calls to fsync to wait less.  Acquire a spin
     * lock so that threads finishing writing to the log will wait while
     * the current fsync completes and advance log->write_lsn.
     */
    while (F_ISSET(slot, SLOT_SYNC) &&
        LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
        if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
            (void)__wt_cond_wait(
                session, log->log_sync_cond, 10000);
            continue;
        }
        /*
         * Record the current end of log after we grabbed the lock.
         * That is how far our fsync call will guarantee.
         */
        sync_lsn = log->write_lsn;
        if (LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
            WT_STAT_FAST_CONN_INCR(session, log_sync);
            ret = __wt_fsync(session, log->log_fh);
            if (ret == 0) {
                F_CLR(slot, SLOT_SYNC);
                log->sync_lsn = sync_lsn;
                ret = __wt_cond_signal(
                    session, log->log_sync_cond);
            }
        }
        __wt_spin_unlock(session, &log->log_sync_lock);
        WT_ERR(ret);
    }
    if (F_ISSET(slot, SLOT_BUF_GROW)) {
        WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
        F_CLR(slot, SLOT_BUF_GROW);
        WT_STAT_FAST_CONN_INCRV(session,
            log_buffer_size, slot->slot_buf.memsize);
        WT_ERR(__wt_buf_grow(session,
            &slot->slot_buf, slot->slot_buf.memsize * 2));
    }
    /*
     * If we have a file to close, close it now.
     */
    if (close_fh)
        WT_ERR(__wt_close(session, close_fh));

err:    if (ret != 0 && slot->slot_error == 0)
        slot->slot_error = ret;
    return (ret);
}
/*
 * Child process creates the database and table, and then writes data into
 * the table until it is killed by the parent.
 */
static void
fill_db(void)
{
    FILE *fp;
    WT_CONNECTION *conn;
    WT_CURSOR *cursor;
    WT_ITEM data;
    WT_RAND_STATE rnd;
    WT_SESSION *session;
    uint64_t i;
    int ret;
    uint8_t buf[MAX_VAL];

    __wt_random_init(&rnd);
    memset(buf, 0, sizeof(buf));
    /*
     * Initialize the first 25% to random values.  Leave a bunch of data
     * space at the end to emphasize zero data.
     */
    for (i = 0; i < MAX_VAL / 4; i++)
        buf[i] = (uint8_t)__wt_random(&rnd);

    /*
     * Run in the home directory so that the records file is in there too.
     */
    if (chdir(home) != 0)
        testutil_die(errno, "chdir: %s", home);
    if ((ret = wiredtiger_open(NULL, NULL, ENV_CONFIG, &conn)) != 0)
        testutil_die(ret, "wiredtiger_open");
    if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
        testutil_die(ret, "WT_CONNECTION:open_session");
    if ((ret = session->create(session,
        uri, "key_format=Q,value_format=u")) != 0)
        testutil_die(ret, "WT_SESSION.create: %s", uri);
    if ((ret =
        session->open_cursor(session, uri, NULL, NULL, &cursor)) != 0)
        testutil_die(ret, "WT_SESSION.open_cursor: %s", uri);

    /*
     * Keep a separate file with the records we wrote for checking.
     */
    (void)unlink(RECORDS_FILE);
    if ((fp = fopen(RECORDS_FILE, "w")) == NULL)
        testutil_die(errno, "fopen");
    /*
     * Set to no buffering.
     */
    setvbuf(fp, NULL, _IONBF, 0);

    /*
     * Write data into the table until we are killed by the parent.
     * The data in the buffer is already set to random content.
     */
    data.data = buf;
    for (i = 0;; ++i) {
        data.size = __wt_random(&rnd) % MAX_VAL;
        cursor->set_key(cursor, i);
        cursor->set_value(cursor, &data);
        if ((ret = cursor->insert(cursor)) != 0)
            testutil_die(ret, "WT_CURSOR.insert");
        /*
         * Save the key separately for checking later.
         */
        if (fprintf(fp, "%" PRIu64 "\n", i) == -1)
            testutil_die(errno, "fprintf");

        /* Yield every few thousand inserts, don't spin flat out. */
        if (i % 5000 == 0)
            __wt_yield();
    }
}
/*
 * __log_file_server --
 *    The log file server thread.  This worker thread manages
 *    log file operations such as closing and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *close_fh;
    WT_LOG *log;
    WT_LSN close_end_lsn, min_lsn;
    WT_SESSION_IMPL *session;
    uint32_t filenum;
    int locked;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    locked = 0;
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * If there is a log file to close, make sure any outstanding
         * write operations have completed, then fsync and close it.
         */
        if ((close_fh = log->log_close_fh) != NULL) {
            WT_ERR(__wt_log_extract_lognum(
                session, close_fh->name, &filenum));
            /*
             * We update the close file handle before updating the
             * close LSN when changing files.  It is possible we
             * could see mismatched settings.  If we do, yield
             * until it is set.  This should rarely happen.
             */
            while (log->log_close_lsn.file < filenum)
                __wt_yield();

            if (__wt_log_cmp(
                &log->write_lsn, &log->log_close_lsn) >= 0) {
                /*
                 * We've copied the file handle, clear out the
                 * one in the log structure to allow it to be
                 * set again.  Copy the LSN before clearing
                 * the file handle.  Use a barrier to make
                 * sure the compiler does not reorder the
                 * following two statements.
                 */
                close_end_lsn = log->log_close_lsn;
                WT_FULL_BARRIER();
                log->log_close_fh = NULL;
                /*
                 * Set the close_end_lsn to the LSN immediately
                 * after ours.  That is, the beginning of the
                 * next log file.  We need to know the LSN
                 * file number of our own close in case earlier
                 * calls are still in progress and the next one
                 * to move the sync_lsn into the next file for
                 * later syncs.
                 */
                close_end_lsn.file++;
                close_end_lsn.offset = 0;
                WT_ERR(__wt_fsync(session, close_fh));
                __wt_spin_lock(session, &log->log_sync_lock);
                locked = 1;
                WT_ERR(__wt_close(session, &close_fh));
                WT_ASSERT(session, __wt_log_cmp(
                    &close_end_lsn, &log->sync_lsn) >= 0);
                log->sync_lsn = close_end_lsn;
                WT_ERR(__wt_cond_signal(
                    session, log->log_sync_cond));
                locked = 0;
                __wt_spin_unlock(session, &log->log_sync_lock);
            }
        }
        /*
         * If a later thread asked for a background sync, do it now.
         */
        if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
            /*
             * Save the latest write LSN which is the minimum
             * we will have written to disk.
             */
            min_lsn = log->write_lsn;
            /*
             * We have to wait until the LSN we asked for is
             * written.  If it isn't, signal the wrlsn thread
             * to get it written.
             */
            if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
                WT_ERR(__wt_fsync(session, log->log_fh));
                __wt_spin_lock(session, &log->log_sync_lock);
                locked = 1;
                /*
                 * The sync LSN could have advanced while we
                 * were writing to disk.
                 */
                if (__wt_log_cmp(
                    &log->sync_lsn, &min_lsn) <= 0) {
                    log->sync_lsn = min_lsn;
                    WT_ERR(__wt_cond_signal(
                        session, log->log_sync_cond));
                }
                locked = 0;
                __wt_spin_unlock(session, &log->log_sync_lock);
            } else {
                WT_ERR(__wt_cond_signal(
                    session, conn->log_wrlsn_cond));
                /*
                 * We do not want to wait potentially a second
                 * to process this.  Yield to give the wrlsn
                 * thread a chance to run and try again in
                 * this case.
                 */
                __wt_yield();
                continue;
            }
        }
        /* Wait until the next event. */
        WT_ERR(__wt_cond_wait(
            session, conn->log_file_cond, WT_MILLION));
    }

    if (0) {
err:        __wt_err(session, ret, "log close server error");
    }
    if (locked)
        __wt_spin_unlock(session, &log->log_sync_lock);
    return (WT_THREAD_RET_VALUE);
}
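/*
 * The WT_FULL_BARRIER call above exists only to order "copy the LSN" before
 * "clear the handle": once the handle is cleared, another thread may reuse
 * both fields.  The same copy-then-clear handoff in portable C11, with
 * hypothetical names:
 */
#include <stdatomic.h>

struct close_handoff {
    _Atomic(void *) fh;       /* Handle; non-NULL means a close is pending. */
    unsigned long end_lsn;    /* Written by the producer before fh is set. */
};

/* Consume a pending close: read the LSN strictly before clearing the handle. */
static void *
handoff_take(struct close_handoff *h, unsigned long *lsnp)
{
    *lsnp = h->end_lsn;
    /* Full fence: the LSN load may not be reordered past the clear. */
    atomic_thread_fence(memory_order_seq_cst);
    return (atomic_exchange(&h->fh, NULL));
}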
/*
 * __wt_page_in_func --
 *    Acquire a hazard pointer to a page; if the page is not in-memory,
 *    read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
    WT_DECL_RET;
    WT_PAGE *page;
    int busy, force_attempts, oldgen;

    for (force_attempts = oldgen = 0;;) {
        switch (ref->state) {
        case WT_REF_DISK:
        case WT_REF_DELETED:
            if (LF_ISSET(WT_READ_CACHE))
                return (WT_NOTFOUND);

            /*
             * The page isn't in memory, attempt to read it.
             * Make sure there is space in the cache.
             */
            WT_RET(__wt_cache_full_check(session));
            WT_RET(__wt_cache_read(session, ref));
            oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
                F_ISSET(session, WT_SESSION_NO_CACHE);
            continue;
        case WT_REF_READING:
            if (LF_ISSET(WT_READ_CACHE))
                return (WT_NOTFOUND);
            /* FALLTHROUGH */
        case WT_REF_LOCKED:
            if (LF_ISSET(WT_READ_NO_WAIT))
                return (WT_NOTFOUND);
            /* The page is busy -- wait. */
            break;
        case WT_REF_SPLIT:
            return (WT_RESTART);
        case WT_REF_MEM:
            /*
             * The page is in memory: get a hazard pointer, update
             * the page's LRU and return.  The expected reason we
             * can't get a hazard pointer is because the page is
             * being evicted; yield and try again.
             */
#ifdef HAVE_DIAGNOSTIC
            WT_RET(
                __wt_hazard_set(session, ref, &busy, file, line));
#else
            WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
            if (busy)
                break;

            page = ref->page;
            WT_ASSERT(session, page != NULL);

            /* Forcibly evict pages that are too big. */
            if (!LF_ISSET(WT_READ_NO_EVICT) &&
                force_attempts < 10 &&
                __evict_force_check(session, page)) {
                ++force_attempts;
                WT_RET(__wt_page_release(session, ref, flags));
                break;
            }

            /* Check if we need an autocommit transaction. */
            if ((ret = __wt_txn_autocommit_check(session)) != 0) {
                WT_TRET(__wt_hazard_clear(session, page));
                return (ret);
            }

            /*
             * If we read the page and we are configured to not
             * trash the cache, set the oldest read generation so
             * the page is forcibly evicted as soon as possible.
             *
             * Otherwise, update the page's read generation.
             */
            if (oldgen && page->read_gen == WT_READGEN_NOTSET)
                __wt_page_evict_soon(page);
            else if (!LF_ISSET(WT_READ_NO_GEN) &&
                page->read_gen < __wt_cache_read_gen(session))
                page->read_gen =
                    __wt_cache_read_gen_set(session);

            return (0);
        WT_ILLEGAL_VALUE(session);
        }

        /* We failed to get the page -- yield before retrying. */
        __wt_yield();
    }
}