/*
 * __wt_spin_lock_register_lock --
 *     Add a lock to the connection's list.
 */
int
__wt_spin_lock_register_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
{
    WT_CONNECTION_IMPL *conn;
    u_int i;

    /*
     * There is a spinlock we initialize before we have a connection, the
     * global library lock.  In that case, the session will be NULL and
     * we can't track the lock.
     */
    if (session == NULL)
        return (0);

    conn = S2C(session);

    for (i = 0; i < WT_SPINLOCK_MAX; i++)
        if (conn->spinlock_list[i] == NULL &&
            WT_ATOMIC_CAS(conn->spinlock_list[i], NULL, t))
            return (0);

    WT_RET_MSG(session, ENOMEM,
        "spinlock connection registry failed, increase the connection's "
        "spinlock list size");
}
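/*
 * A standalone sketch (not WiredTiger code) of the slot-claiming pattern
 * above, modeling WT_ATOMIC_CAS(v, old, new) with GCC's
 * __sync_bool_compare_and_swap(&v, old, new).  The names "registry" and
 * "registry_claim" are illustrative.  Concurrent callers race to install
 * their pointer into the first empty slot; the CAS guarantees each slot is
 * claimed by exactly one caller, so no latch is needed.
 */
#include <stddef.h>

#define REGISTRY_MAX 16

static void *registry[REGISTRY_MAX];

/* Return 0 on success, -1 if the registry is full. */
static int
registry_claim(void *t)
{
    int i;

    for (i = 0; i < REGISTRY_MAX; i++)
        if (registry[i] == NULL &&
            __sync_bool_compare_and_swap(&registry[i], NULL, t))
            return (0);
    return (-1);
}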
/*
 * __spin_lock_next_id --
 *     Return the next spinlock caller ID.
 */
static int
__spin_lock_next_id(WT_SESSION_IMPL *session, int *idp)
{
    static int lock_id = 0, next_id = 0;
    WT_DECL_RET;

    /* If we've ever registered this location, we already have an ID. */
    if (*idp != WT_SPINLOCK_REGISTER)
        return (0);

    /*
     * We can't use the global spinlock to lock the ID allocation (duh!),
     * use a CAS instruction to serialize access to a local variable.
     * This work only gets done once per library instantiation, there
     * isn't a performance concern.
     */
    while (!WT_ATOMIC_CAS(lock_id, 0, 1))
        __wt_yield();

    /* Allocate a blocking ID for this location. */
    if (*idp == WT_SPINLOCK_REGISTER) {
        if (next_id < WT_SPINLOCK_MAX_LOCATION_ID)
            *idp = next_id++;
        else
            WT_ERR_MSG(session, ENOMEM,
                "spinlock caller location registry failed, "
                "increase the connection's blocking matrix size");
    }

err:    WT_PUBLISH(lock_id, 0);
    return (ret);
}
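/*
 * A standalone sketch (not WiredTiger code) of the CAS-built latch above:
 * a static flag is swung 0 -> 1 to serialize a rarely-executed critical
 * section, then published back to 0 on exit.  GCC builtins stand in for
 * WT_ATOMIC_CAS and WT_PUBLISH; "allocate_location_id" is illustrative.
 */
#include <sched.h>

static int lock_id;            /* 0 free, 1 held */
static int next_location_id;

static int
allocate_location_id(void)
{
    int id;

    /* Spin, yielding the CPU, until we swing the latch from 0 to 1. */
    while (!__sync_bool_compare_and_swap(&lock_id, 0, 1))
        sched_yield();        /* Stands in for __wt_yield(). */

    id = next_location_id++;    /* The once-only critical section. */

    /* Release the latch with a release store, like WT_PUBLISH. */
    __sync_lock_release(&lock_id);
    return (id);
}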
/*
 * __wt_cond_signal --
 *     Signal a waiting thread.
 */
int
__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
{
    WT_DECL_RET;
    int locked;

    locked = 0;

    /*
     * !!!
     * This function MUST handle a NULL session handle.
     */
    if (session != NULL && WT_VERBOSE_ISSET(session, mutex))
        WT_RET(__wt_verbose(
            session, "signal %s cond (%p)", cond->name, cond));

    /* Fast path if already signalled. */
    if (cond->waiters == -1)
        return (0);

    if (cond->waiters > 0 || !WT_ATOMIC_CAS(cond->waiters, 0, -1)) {
        WT_ERR(pthread_mutex_lock(&cond->mtx));
        locked = 1;
        WT_ERR(pthread_cond_broadcast(&cond->cond));
    }

err:    if (locked)
        WT_TRET(pthread_mutex_unlock(&cond->mtx));
    if (ret == 0)
        return (0);
    WT_RET_MSG(session, ret, "pthread_cond_broadcast");
}
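/*
 * A standalone sketch (not WiredTiger code) of the fast path above: the
 * waiter count doubles as a "signal pending" flag, where -1 means a signal
 * was delivered with no thread waiting.  A signaller first tries to post
 * the signal with a CAS swinging 0 -> -1; only when a waiter may actually
 * be blocked does it pay for the mutex and broadcast.  The struct and
 * function names are illustrative.
 */
#include <pthread.h>

struct condvar {
    pthread_mutex_t mtx;
    pthread_cond_t cond;
    int waiters;        /* >0 waiting, 0 idle, -1 signal pending */
};

static int
condvar_signal(struct condvar *cv)
{
    int ret;

    /* Fast path: a signal is already pending and nobody is waiting. */
    if (cv->waiters == -1)
        return (0);

    /* Try to post the signal without the mutex: swing waiters 0 -> -1. */
    if (cv->waiters == 0 &&
        __sync_bool_compare_and_swap(&cv->waiters, 0, -1))
        return (0);

    /* Slow path: a waiter may be blocked, broadcast under the mutex. */
    if ((ret = pthread_mutex_lock(&cv->mtx)) != 0)
        return (ret);
    ret = pthread_cond_broadcast(&cv->cond);
    (void)pthread_mutex_unlock(&cv->mtx);
    return (ret);
}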
/*
 * __wt_update_obsolete_check --
 *     Check for obsolete updates.
 */
WT_UPDATE *
__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
    WT_UPDATE *first, *next;

    /*
     * This function identifies obsolete updates, and truncates them from
     * the rest of the chain; because this routine is called from inside
     * a serialization function, the caller has responsibility for actually
     * freeing the memory.
     *
     * Walk the list of updates, looking for obsolete updates at the end.
     */
    for (first = NULL; upd != NULL; upd = upd->next)
        if (__wt_txn_visible_all(session, upd->txnid)) {
            if (first == NULL)
                first = upd;
        } else if (upd->txnid != WT_TXN_ABORTED)
            first = NULL;

    /*
     * We cannot discard this WT_UPDATE structure, we can only discard
     * WT_UPDATE structures subsequent to it, other threads of control will
     * terminate their walk in this element.  Save a reference to the list
     * we will discard, and terminate the list.
     */
    if (first != NULL &&
        (next = first->next) != NULL &&
        WT_ATOMIC_CAS(first->next, next, NULL))
        return (next);

    return (NULL);
}
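/*
 * A standalone sketch (not WiredTiger code) of the truncation step above:
 * the CAS swings first->next from the saved "next" pointer to NULL, so a
 * concurrent walker sees either the full chain or the truncated one, never
 * a partially-detached tail, and only one thread ends up owning the tail
 * for freeing.  The struct and function names are illustrative.
 */
#include <stddef.h>

struct update {
    struct update *next;
    /* ... payload ... */
};

/*
 * Detach and return the tail following "first", or NULL if there is no
 * tail or another thread won the race to detach it.
 */
static struct update *
update_truncate(struct update *first)
{
    struct update *next;

    if (first == NULL || (next = first->next) == NULL)
        return (NULL);
    return (__sync_bool_compare_and_swap(&first->next, next, NULL) ?
        next : NULL);
}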
/*
 * __hazard_exclusive --
 *     Request exclusive access to a page.
 */
static int
__hazard_exclusive(WT_SESSION_IMPL *session, WT_REF *ref, int top)
{
    /*
     * Make sure there is space to track exclusive access so we can unlock
     * to clean up.
     */
    WT_RET(__wt_realloc_def(session, &session->excl_allocated,
        session->excl_next + 1, &session->excl));

    /*
     * Hazard pointers are acquired down the tree, which means we can't
     * deadlock.
     *
     * Request exclusive access to the page.  The top-level page should
     * already be in the locked state, lock child pages in memory.
     * If another thread already has this page, give up.
     */
    if (!top && !WT_ATOMIC_CAS(ref->state, WT_REF_MEM, WT_REF_LOCKED))
        return (EBUSY);    /* We couldn't change the state. */
    WT_ASSERT(session, ref->state == WT_REF_LOCKED);

    session->excl[session->excl_next++] = ref;

    /* Check for a matching hazard pointer. */
    if (__wt_page_hazard_check(session, ref->page) == NULL)
        return (0);

    WT_STAT_FAST_DATA_INCR(session, cache_eviction_hazard);
    WT_STAT_FAST_CONN_INCR(session, cache_eviction_hazard);

    WT_VERBOSE_RET(
        session, evict, "page %p hazard request failed", ref->page);
    return (EBUSY);
}
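/*
 * A standalone sketch (not WiredTiger code) of the state-transition lock
 * above: a reference is "locked" by atomically swinging its state field
 * from MEM to LOCKED, so at most one thread wins and losers fail fast with
 * EBUSY rather than blocking.  The enum, struct and names are illustrative.
 */
#include <errno.h>

enum ref_state { REF_DISK, REF_MEM, REF_LOCKED };

struct ref {
    enum ref_state state;
};

/* Return 0 if we now hold the page exclusively, EBUSY otherwise. */
static int
ref_trylock(struct ref *ref)
{
    return (__sync_bool_compare_and_swap(
        &ref->state, REF_MEM, REF_LOCKED) ? 0 : EBUSY);
}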
/*
 * __wt_txn_begin --
 *     Begin a transaction.
 */
int
__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *txn_state;

    conn = S2C(session);
    txn = &session->txn;
    txn_global = &conn->txn_global;
    txn_state = &txn_global->states[session->id];

    WT_ASSERT(session, txn_state->id == WT_TXN_NONE);

    WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
    if (cval.len == 0)
        txn->isolation = session->isolation;
    else
        txn->isolation =
            WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
            TXN_ISO_SNAPSHOT :
            WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
            TXN_ISO_READ_COMMITTED : TXN_ISO_READ_UNCOMMITTED;

    /*
     * Allocate a transaction ID.
     *
     * We use an atomic compare and swap to ensure that we get a
     * unique ID that is published before the global counter is
     * updated.
     *
     * If two threads race to allocate an ID, only the latest ID
     * will proceed.  The winning thread can be sure its snapshot
     * contains all of the earlier active IDs.  Threads that race
     * and get an earlier ID may not appear in the snapshot, but
     * they will loop and allocate a new ID before proceeding to
     * make any updates.
     *
     * This potentially wastes transaction IDs when threads race to
     * begin transactions: that is the price we pay to keep this
     * path latch free.
     */
    do {
        txn_state->id = txn->id = txn_global->current;
    } while (!WT_ATOMIC_CAS(txn_global->current, txn->id, txn->id + 1));

    /*
     * If we have used 64-bits of transaction IDs, there is nothing
     * more we can do.
     */
    if (txn->id == WT_TXN_ABORTED)
        WT_RET_MSG(session, ENOMEM, "Out of transaction IDs");
    F_SET(txn, TXN_RUNNING);

    if (txn->isolation == TXN_ISO_SNAPSHOT)
        __wt_txn_refresh(session, WT_TXN_NONE, 1);
    return (0);
}
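/*
 * A standalone sketch (not WiredTiger code) of the ID allocation loop
 * above: each thread copies the shared counter, publishes that copy as its
 * own ID, then tries to CAS the counter forward by one.  A thread that
 * loses the race simply retries with the newer value, so IDs are unique
 * and the path stays latch free.  Names are illustrative; starting the
 * counter at 1 assumes 0 is reserved as a "none" value.
 */
#include <stdint.h>

static uint64_t global_txn_id = 1;

static uint64_t
txn_id_alloc(void)
{
    uint64_t id;

    do {
        id = global_txn_id;
    } while (!__sync_bool_compare_and_swap(&global_txn_id, id, id + 1));
    return (id);
}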
/*
 * __lsm_bloom_work --
 *     Try to create a Bloom filter for the newest on-disk chunk.
 */
static int
__lsm_bloom_work(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_WORKER_COOKIE cookie;
    u_int i;

    WT_CLEAR(cookie);
    /* If no work is done, tell our caller by returning WT_NOTFOUND. */
    ret = WT_NOTFOUND;

    WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));

    /* Create bloom filters in all checkpointed chunks. */
    for (i = 0; i < cookie.nchunks; i++) {
        chunk = cookie.chunk_array[i];

        /*
         * Skip if a thread is still active in the chunk or it
         * isn't suitable.
         */
        if (!F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK) ||
            F_ISSET_ATOMIC(chunk,
                WT_LSM_CHUNK_BLOOM | WT_LSM_CHUNK_MERGING) ||
            chunk->generation > 0 ||
            chunk->count == 0)
            continue;

        /* See if we win the race to switch on the "busy" flag. */
        if (WT_ATOMIC_CAS(chunk->bloom_busy, 0, 1)) {
            ret = __lsm_bloom_create(
                session, lsm_tree, chunk, (u_int)i);
            chunk->bloom_busy = 0;
            break;
        }
    }

    __lsm_unpin_chunks(session, &cookie);
    __wt_free(session, cookie.chunk_array);
    return (ret);
}
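/*
 * A standalone sketch (not WiredTiger code) of the busy-flag election
 * above: WT_ATOMIC_CAS(chunk->bloom_busy, 0, 1) elects exactly one worker
 * per chunk, and losers skip the chunk instead of queueing.  Since only
 * the winner ever clears the flag, a plain store suffices on release.
 * The struct and function names are illustrative.
 */
struct chunk {
    int busy;        /* 1 while a worker owns the chunk */
};

/* Return 1 if this thread now owns the chunk, 0 if it lost the race. */
static int
chunk_trybusy(struct chunk *chunk)
{
    return (__sync_bool_compare_and_swap(&chunk->busy, 0, 1));
}

static void
chunk_release(struct chunk *chunk)
{
    chunk->busy = 0;    /* Single owner, a plain store suffices. */
}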
/*
 * __wt_update_obsolete_check --
 *     Check for obsolete updates.
 */
WT_UPDATE *
__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
    WT_TXN *txn;
    WT_UPDATE *next;

    /*
     * This function identifies obsolete updates, and truncates them from
     * the rest of the chain; because this routine is called from inside
     * a serialization function, the caller has responsibility for actually
     * freeing the memory.
     */
    txn = &session->txn;
    if (txn->isolation != TXN_ISO_SNAPSHOT &&
        txn->isolation != TXN_ISO_READ_COMMITTED)
        return (NULL);

    /*
     * Walk the list of updates, looking for obsolete updates.  If we find
     * an update no session will ever move past, we can discard any updates
     * that appear after it.
     */
    for (; upd != NULL; upd = upd->next)
        if (__wt_txn_visible_all(session, upd->txnid)) {
            /*
             * We cannot discard this WT_UPDATE structure, we can
             * only discard WT_UPDATE structures subsequent to it,
             * other threads of control will terminate their walk
             * in this element.  Save a reference to the list we
             * will discard, and terminate the list.
             */
            if ((next = upd->next) == NULL)
                return (NULL);
            if (!WT_ATOMIC_CAS(upd->next, next, NULL))
                return (NULL);
            return (next);
        }

    return (NULL);
}
/*
 * __wt_txn_refresh --
 *     Allocate a transaction ID and/or a snapshot.
 */
void
__wt_txn_refresh(WT_SESSION_IMPL *session, uint64_t max_id, int get_snapshot)
{
    WT_CONNECTION_IMPL *conn;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *s, *txn_state;
    uint64_t current_id, id, snap_min, oldest_id, prev_oldest_id;
    uint32_t i, n, session_cnt;
    int32_t count;

    conn = S2C(session);
    txn = &session->txn;
    txn_global = &conn->txn_global;
    txn_state = &txn_global->states[session->id];

    prev_oldest_id = txn_global->oldest_id;
    current_id = snap_min = txn_global->current;

    /* For pure read-only workloads, avoid updates to shared state. */
    if (!get_snapshot) {
        /*
         * If we are trying to update the oldest ID and it is already
         * equal to the current ID, there is no point scanning.
         */
        if (prev_oldest_id == current_id)
            return;
    } else if (txn->id == max_id &&
        txn->snapshot_count == 0 &&
        txn->snap_min == snap_min &&
        TXNID_LE(prev_oldest_id, snap_min)) {
        txn_state->snap_min = txn->snap_min;
        /* If nothing has changed in the meantime, we're done. */
        if (txn_global->scan_count == 0 &&
            txn_global->oldest_id == prev_oldest_id)
            return;
    }

    /*
     * We're going to scan.  Increment the count of scanners to prevent the
     * oldest ID from moving forwards.  Spin if the count is negative,
     * which indicates that some thread is moving the oldest ID forwards.
     */
    do {
        if ((count = txn_global->scan_count) < 0)
            WT_PAUSE();
    } while (count < 0 ||
        !WT_ATOMIC_CAS(txn_global->scan_count, count, count + 1));

    /* The oldest ID cannot change until the scan count goes to zero. */
    prev_oldest_id = txn_global->oldest_id;
    current_id = snap_min = txn_global->current;

    /* If the maximum ID is constrained, so is the oldest. */
    oldest_id = (max_id != WT_TXN_NONE) ? max_id : snap_min;

    /* Walk the array of concurrent transactions. */
    WT_ORDERED_READ(session_cnt, conn->session_cnt);
    for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) {
        /*
         * Ignore the ID if we are committing (indicated by max_id
         * being set): it is about to be released.
         *
         * Also ignore the ID if it is older than the oldest ID we saw.
         * This can happen if we race with a thread that is allocating
         * an ID -- the ID will not be used because the thread will
         * keep spinning until it gets a valid one.
         */
        if ((id = s->id) != WT_TXN_NONE &&
            id + 1 != max_id &&
            TXNID_LE(prev_oldest_id, id)) {
            if (get_snapshot)
                txn->snapshot[n++] = id;
            if (TXNID_LT(id, snap_min))
                snap_min = id;
        }

        /*
         * Ignore the session's own snap_min if we are in the process
         * of updating it.
         */
        if (get_snapshot && s == txn_state)
            continue;

        /*
         * !!!
         * Note: Don't ignore snap_min values older than the previous
         * oldest ID.  Read-uncommitted operations publish snap_min
         * values without incrementing scan_count to protect the global
         * table.  See the comment in __wt_txn_cursor_op for
         * more details.
         */
        if ((id = s->snap_min) != WT_TXN_NONE &&
            TXNID_LT(id, oldest_id))
            oldest_id = id;
    }

    if (TXNID_LT(snap_min, oldest_id))
        oldest_id = snap_min;

    if (get_snapshot) {
        WT_ASSERT(session, TXNID_LE(prev_oldest_id, snap_min));
        WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
        txn_state->snap_min = snap_min;
    }

    /*
     * Update the last running ID if we have a much newer value or we are
     * forcing an update.
     */
    if (!get_snapshot || snap_min > txn_global->last_running + 100)
        txn_global->last_running = snap_min;

    /*
     * Update the oldest ID if we have a newer ID and we can get exclusive
     * access.  During normal snapshot refresh, only do this if we have a
     * much newer value.  Once we get exclusive access, do another pass to
     * make sure nobody else is using an earlier ID.
     */
    if (max_id == WT_TXN_NONE &&
        TXNID_LT(prev_oldest_id, oldest_id) &&
        (!get_snapshot || oldest_id - prev_oldest_id > 100) &&
        WT_ATOMIC_CAS(txn_global->scan_count, 1, -1)) {
        WT_ORDERED_READ(session_cnt, conn->session_cnt);
        for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
            if ((id = s->id) != WT_TXN_NONE &&
                TXNID_LT(id, oldest_id))
                oldest_id = id;
            if ((id = s->snap_min) != WT_TXN_NONE &&
                TXNID_LT(id, oldest_id))
                oldest_id = id;
        }
        if (TXNID_LT(txn_global->oldest_id, oldest_id))
            txn_global->oldest_id = oldest_id;
        txn_global->scan_count = 0;
    } else {
        WT_ASSERT(session, txn_global->scan_count > 0);
        (void)WT_ATOMIC_SUB(txn_global->scan_count, 1);
    }

    if (get_snapshot)
        __txn_sort_snapshot(session, n, current_id);
}
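/*
 * A standalone sketch (not WiredTiger code) of the scan_count protocol in
 * __wt_txn_refresh: scanners increment a shared count while it is
 * non-negative; the thread that wants to advance the oldest ID converts a
 * shared hold into exclusive access by swinging the count from 1 (it is
 * the sole scanner) to -1, which makes new scanners spin until it resets
 * the count to zero.  Names and the busy-wait are illustrative.
 */
static int scan_count;    /* >0 scanners active, -1 exclusive, 0 idle */

static void
scan_enter(void)
{
    int count;

    do {
        while ((count = scan_count) < 0)
            ;    /* An updater holds exclusive access. */
    } while (!__sync_bool_compare_and_swap(&scan_count, count, count + 1));
}

/* Try to convert a shared hold into exclusive access. */
static int
scan_try_exclusive(void)
{
    /* Succeeds only if we are the sole scanner. */
    return (__sync_bool_compare_and_swap(&scan_count, 1, -1));
}

static void
scan_leave_exclusive(void)
{
    scan_count = 0;        /* Sole owner, a plain store suffices. */
}

static void
scan_leave(void)
{
    (void)__sync_fetch_and_sub(&scan_count, 1);
}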
/*
 * __clsm_enter --
 *     Start an operation on an LSM cursor, update if the tree has changed.
 */
static inline int
__clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update)
{
    WT_CURSOR *c;
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_SESSION_IMPL *session;
    uint64_t *txnid_maxp;
    uint64_t id, myid, snap_min;

    session = (WT_SESSION_IMPL *)clsm->iface.session;

    /* Merge cursors never update. */
    if (F_ISSET(clsm, WT_CLSM_MERGE))
        return (0);

    if (reset) {
        c = &clsm->iface;
        /* Copy out data before resetting chunk cursors. */
        if (F_ISSET(c, WT_CURSTD_KEY_INT) &&
            !WT_DATA_IN_ITEM(&c->key))
            WT_RET(__wt_buf_set(
                session, &c->key, c->key.data, c->key.size));
        if (F_ISSET(c, WT_CURSTD_VALUE_INT) &&
            !WT_DATA_IN_ITEM(&c->value))
            WT_RET(__wt_buf_set(
                session, &c->value, c->value.data, c->value.size));
        WT_RET(__clsm_reset_cursors(clsm, NULL));
    }

    for (;;) {
        /*
         * If the cursor looks up-to-date, check if the cache is full.
         * In case this call blocks, the check will be repeated before
         * proceeding.
         */
        if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
            goto open;
        WT_RET(__wt_cache_full_check(session));
        if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
            goto open;

        /* Update the maximum transaction ID in the primary chunk. */
        if (update && (chunk = clsm->primary_chunk) != NULL) {
            WT_RET(__wt_txn_autocommit_check(session));
            for (id = chunk->txnid_max, myid = session->txn.id;
                !TXNID_LE(myid, id);
                id = chunk->txnid_max) {
                WT_ASSERT(session, myid != WT_TXN_NONE);
                (void)WT_ATOMIC_CAS(
                    chunk->txnid_max, id, myid);
            }
        }

        /*
         * Figure out how many updates are required for snapshot
         * isolation.
         *
         * This is not a normal visibility check on the maximum
         * transaction ID in each chunk: any transaction ID that
         * overlaps with our snapshot is a potential conflict.
         */
        clsm->nupdates = 1;
        if (session->txn.isolation == TXN_ISO_SNAPSHOT &&
            F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
            snap_min = session->txn.snap_min;
            for (txnid_maxp = &clsm->txnid_max[clsm->nchunks - 2];
                clsm->nupdates < clsm->nchunks;
                clsm->nupdates++, txnid_maxp--)
                if (TXNID_LT(*txnid_maxp, snap_min))
                    break;
        }

        /*
         * Stop when we are up-to-date, as long as this is:
         *   - a snapshot isolation update and the cursor is set up
         *     for that;
         *   - an update operation with a primary chunk, or
         *   - a read operation and the cursor is open for reading.
         */
        if ((!update ||
            session->txn.isolation != TXN_ISO_SNAPSHOT ||
            F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) &&
            ((update && clsm->primary_chunk != NULL) ||
            (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
            break;

open:        WT_WITH_SCHEMA_LOCK(session,
            ret = __clsm_open_cursors(clsm, update, 0, 0));
        WT_RET(ret);
    }

    if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
        WT_RET(__cursor_enter(session));
        F_SET(clsm, WT_CLSM_ACTIVE);
    }

    return (0);
}
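/*
 * A standalone sketch (not WiredTiger code) of the txnid_max update loop
 * in __clsm_enter: a shared maximum is raised with a CAS loop that retries
 * only while our ID is still larger than the published value, so the field
 * increases monotonically under concurrent updates.  This sketch uses a
 * plain > comparison and so ignores the wraparound-aware ordering that
 * TXNID_LE provides; names are illustrative.
 */
#include <stdint.h>

static void
txnid_max_update(uint64_t *maxp, uint64_t myid)
{
    uint64_t id;

    /* Reread after a failed CAS; stop once the maximum covers our ID. */
    for (id = *maxp; myid > id; id = *maxp)
        if (__sync_bool_compare_and_swap(maxp, id, myid))
            break;
}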