/*
 * __split_stash_add --
 *     Add a new entry into the session's split stash list.
 */
static int
__split_stash_add(WT_SESSION_IMPL *session, void *p, size_t len)
{
    WT_SPLIT_STASH *stash;

    WT_ASSERT(session, p != NULL);

    /* Grow the list as necessary. */
    WT_RET(__wt_realloc_def(session, &session->split_stash_alloc,
        session->split_stash_cnt + 1, &session->split_stash));

    stash = session->split_stash + session->split_stash_cnt++;

    stash->split_gen = WT_ATOMIC_ADD(S2C(session)->split_gen, 1);
    stash->p = p;
    stash->len = len;

    WT_STAT_FAST_CONN_ATOMIC_INCRV(session, rec_split_stashed_bytes, len);
    WT_STAT_FAST_CONN_ATOMIC_INCR(session, rec_split_stashed_objects);

    /* See if we can free any previous entries. */
    if (session->split_stash_cnt > 1)
        __wt_split_stash_discard(session);

    return (0);
}
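/*
 * A minimal, self-contained sketch of the same idea using C11 atomics.
 * The names and layout here are illustrative only, not the WiredTiger
 * implementation: each retired allocation is stamped with the global
 * split generation current at the time it was stashed, so a later pass
 * can free entries once no reader can still be in that generation.
 */
#include <errno.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>

struct stash_entry {
    uint64_t gen;    /* generation when the memory was retired */
    void *p;         /* retired allocation */
    size_t len;      /* size, kept for statistics */
};

struct stash {
    atomic_uint_fast64_t split_gen;    /* global generation counter */
    struct stash_entry *list;
    size_t cnt, alloc;
};

/* Retire an allocation: grow the list if needed, stamp the generation. */
static int
stash_add(struct stash *s, void *p, size_t len)
{
    struct stash_entry *tmp;
    size_t nalloc;

    if (s->cnt == s->alloc) {
        nalloc = s->alloc == 0 ? 8 : 2 * s->alloc;
        if ((tmp = realloc(s->list, nalloc * sizeof(*tmp))) == NULL)
            return (ENOMEM);
        s->list = tmp;
        s->alloc = nalloc;
    }

    /* atomic_fetch_add returns the old value; add one for the new one. */
    s->list[s->cnt].gen =
        (uint64_t)atomic_fetch_add(&s->split_gen, 1) + 1;
    s->list[s->cnt].p = p;
    s->list[s->cnt].len = len;
    ++s->cnt;
    return (0);
}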
/*
 * __wt_lsm_tree_switch --
 *     Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    uint32_t new_id;

    new_id = WT_ATOMIC_ADD(lsm_tree->last, 1);

    WT_VERBOSE_RET(session, lsm, "Tree switch to: %d", new_id);

    if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) >
        lsm_tree->chunk_alloc)
        WT_ERR(__wt_realloc(session, &lsm_tree->chunk_alloc,
            WT_MAX(10 * sizeof(*lsm_tree->chunk),
            2 * lsm_tree->chunk_alloc),
            &lsm_tree->chunk));

    WT_ERR(__wt_calloc_def(session, 1, &chunk));
    chunk->id = new_id;
    lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
    WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

    ++lsm_tree->dsk_gen;
    WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

err:    /* TODO: mark lsm_tree bad on error(?) */
    return (ret);
}
/*
 * __wt_lsm_tree_switch --
 *     Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk, **cp;
    uint32_t in_memory, new_id;

    new_id = WT_ATOMIC_ADD(lsm_tree->last, 1);

    if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) >
        lsm_tree->chunk_alloc)
        WT_ERR(__wt_realloc(session, &lsm_tree->chunk_alloc,
            WT_MAX(10 * sizeof(*lsm_tree->chunk),
            2 * lsm_tree->chunk_alloc),
            &lsm_tree->chunk));

    /*
     * In the steady state, we expect that the checkpoint worker thread
     * will keep up with inserts.  If not, we throttle the insert rate to
     * avoid filling the cache with in-memory chunks.  Threads sleep every
     * 100 operations, so take that into account in the calculation.
     */
    for (in_memory = 1, cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
        in_memory < lsm_tree->nchunks &&
        !F_ISSET(*cp, WT_LSM_CHUNK_ONDISK);
        ++in_memory, --cp)
        ;
    if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 2)
        lsm_tree->throttle_sleep = 0;
    else if (in_memory == lsm_tree->nchunks ||
        F_ISSET(*cp, WT_LSM_CHUNK_STABLE)) {
        /*
         * No checkpoint has completed this run.  Keep slowing down
         * inserts until one does.
         */
        lsm_tree->throttle_sleep =
            WT_MAX(20, 2 * lsm_tree->throttle_sleep);
    } else {
        chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
        lsm_tree->throttle_sleep = (long)((in_memory - 2) *
            WT_TIMEDIFF(chunk->create_ts, (*cp)->create_ts) /
            (20 * in_memory * chunk->count));
    }

    WT_VERBOSE_ERR(session, lsm, "Tree switch to: %d, throttle %d",
        new_id, (int)lsm_tree->throttle_sleep);

    WT_ERR(__wt_calloc_def(session, 1, &chunk));
    chunk->id = new_id;
    lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
    WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

    ++lsm_tree->dsk_gen;

    F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
    WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

err:    /* TODO: mark lsm_tree bad on error(?) */
    return (ret);
}
/*
 * __wt_lsm_tree_truncate --
 *     Truncate an LSM tree.
 */
int
__wt_lsm_tree_truncate(
    WT_SESSION_IMPL *session, const char *name, const char *cfg[])
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    int locked;

    WT_UNUSED(cfg);

    chunk = NULL;
    locked = 0;

    /* Get the LSM tree. */
    WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));

    /* Shut down the LSM worker. */
    WT_RET(__lsm_tree_close(session, lsm_tree));

    /* Prevent any new opens. */
    WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 1));
    locked = 1;

    /* Create the new chunk. */
    WT_ERR(__wt_calloc_def(session, 1, &chunk));
    chunk->id = WT_ATOMIC_ADD(lsm_tree->last, 1);
    WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

    /* Mark all chunks old. */
    WT_ERR(__wt_lsm_merge_update_tree(
        session, lsm_tree, 0, lsm_tree->nchunks, chunk));

    WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

    WT_ERR(__lsm_tree_start_worker(session, lsm_tree));

    locked = 0;
    WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
    __wt_lsm_tree_release(session, lsm_tree);

err:    if (locked)
        WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
    if (ret != 0) {
        if (chunk != NULL) {
            (void)__wt_schema_drop(session, chunk->uri, NULL);
            __wt_free(session, chunk);
        }
        /*
         * Discard the LSM tree structure on error.  This will force the
         * LSM tree to be re-opened the next time it is accessed and the
         * last good version of the metadata will be used, resulting in
         * a valid (not truncated) tree.
         */
        WT_TRET(__lsm_tree_discard(session, lsm_tree));
    }
    return (ret);
}
/*
 * __wt_cond_wait --
 *     Wait on a mutex, optionally timing out.
 */
int
__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs)
{
    struct timespec ts;
    WT_DECL_RET;
    int locked;

    locked = 0;

    WT_ASSERT(session, usecs >= 0);

    /* Fast path if already signalled. */
    if (WT_ATOMIC_ADD(cond->waiters, 1) == 0)
        return (0);

    /*
     * !!!
     * This function MUST handle a NULL session handle.
     */
    if (session != NULL) {
        WT_VERBOSE_RET(
            session, mutex, "wait %s cond (%p)", cond->name, cond);
        WT_STAT_FAST_CONN_INCR(session, cond_wait);
    }

    WT_ERR(pthread_mutex_lock(&cond->mtx));
    locked = 1;

    if (usecs > 0) {
        WT_ERR(__wt_epoch(session, &ts));
        ts.tv_sec += (ts.tv_nsec + 1000 * usecs) / WT_BILLION;
        ts.tv_nsec = (ts.tv_nsec + 1000 * usecs) % WT_BILLION;
        ret = pthread_cond_timedwait(&cond->cond, &cond->mtx, &ts);
    } else
        ret = pthread_cond_wait(&cond->cond, &cond->mtx);

    /*
     * Check the pthread_cond_wait() return for EINTR, ETIME and
     * ETIMEDOUT; some systems return these errors.
     */
    if (ret == EINTR ||
#ifdef ETIME
        ret == ETIME ||
#endif
        ret == ETIMEDOUT)
        ret = 0;

    (void)WT_ATOMIC_SUB(cond->waiters, 1);

err:    if (locked)
        WT_TRET(pthread_mutex_unlock(&cond->mtx));
    if (ret == 0)
        return (0);
    WT_RET_MSG(session, ret, "pthread_cond_wait");
}
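/*
 * A standalone sketch of the timed-wait pattern above, using plain POSIX
 * calls and illustrative names (it omits the waiters fast path and the
 * statistics).  The key details are converting a relative microsecond
 * timeout into the absolute timespec pthread_cond_timedwait() expects,
 * and treating EINTR/ETIMEDOUT as a normal wakeup rather than an error.
 */
#include <errno.h>
#include <pthread.h>
#include <time.h>

#define NSEC_PER_SEC    1000000000L

static int
cond_wait_usecs(pthread_cond_t *cond, pthread_mutex_t *mtx, long usecs)
{
    struct timespec ts;
    int ret;

    if ((ret = pthread_mutex_lock(mtx)) != 0)
        return (ret);

    if (usecs > 0) {
        /* Build an absolute deadline from the current time. */
        clock_gettime(CLOCK_REALTIME, &ts);
        ts.tv_sec += (ts.tv_nsec + 1000 * usecs) / NSEC_PER_SEC;
        ts.tv_nsec = (ts.tv_nsec + 1000 * usecs) % NSEC_PER_SEC;
        ret = pthread_cond_timedwait(cond, mtx, &ts);
    } else
        ret = pthread_cond_wait(cond, mtx);

    /* A timeout or interrupted wait is not an error for the caller. */
    if (ret == EINTR || ret == ETIMEDOUT)
        ret = 0;

    (void)pthread_mutex_unlock(mtx);
    return (ret);
}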
/*
 * __wt_lsm_tree_truncate --
 *     Truncate an LSM tree.
 */
int
__wt_lsm_tree_truncate(
    WT_SESSION_IMPL *session, const char *name, const char *cfg[])
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    int locked;

    WT_UNUSED(cfg);

    locked = 0;

    /* Get the LSM tree. */
    WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));

    /* Shut down the LSM worker. */
    WT_RET(__lsm_tree_close(session, lsm_tree));

    /* Prevent any new opens. */
    WT_RET(__wt_try_writelock(session, lsm_tree->rwlock));
    locked = 1;

    /* Create the new chunk. */
    WT_ERR(__wt_calloc_def(session, 1, &chunk));
    chunk->id = WT_ATOMIC_ADD(lsm_tree->last, 1);
    WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

    /* Mark all chunks old. */
    WT_ERR(__wt_lsm_merge_update_tree(
        session, lsm_tree, 0, lsm_tree->nchunks, chunk));

    WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

    WT_ERR(__lsm_tree_start_worker(session, lsm_tree));

    ret = __wt_rwunlock(session, lsm_tree->rwlock);
    locked = 0;
    if (ret == 0)
        __wt_lsm_tree_release(session, lsm_tree);

err:    if (locked)
        WT_TRET(__wt_rwunlock(session, lsm_tree->rwlock));
    /*
     * Don't discard the LSM tree structure unless there has been an
     * error.  The handle remains valid for future operations.
     */
    if (ret != 0)
        WT_TRET(__lsm_tree_discard(session, lsm_tree));
    return (ret);
}
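/*
 * A sketch of the cleanup idiom the truncate functions rely on, written
 * against a plain pthread_rwlock_t with hypothetical names (do_work() is
 * a stand-in, not a WiredTiger call): track whether the lock is held in
 * a local flag so the single error path releases it exactly once, and
 * fold any unlock failure into the return value without overwriting an
 * earlier error.
 */
#include <pthread.h>

/* Stand-in for the real work; always succeeds in this sketch. */
static int
do_work(void)
{
    return (0);
}

static int
locked_op(pthread_rwlock_t *lock)
{
    int locked, ret, tret;

    locked = 0;

    /* Prevent concurrent access for the duration of the operation. */
    if ((ret = pthread_rwlock_wrlock(lock)) != 0)
        return (ret);
    locked = 1;

    if ((ret = do_work()) != 0)
        goto err;

    /* Success path: release the lock, then clear the flag. */
    ret = pthread_rwlock_unlock(lock);
    locked = 0;

err:    /* Error path: release the lock at most once, keep the first error. */
    if (locked && (tret = pthread_rwlock_unlock(lock)) != 0 && ret == 0)
        ret = tret;
    return (ret);
}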
/*
 * __wt_lsm_tree_switch --
 *     Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    uint32_t nchunks, new_id;

    WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 1));

    /*
     * Check if a switch is still needed: we may have raced while waiting
     * for a lock.
     */
    if ((nchunks = lsm_tree->nchunks) != 0 &&
        (chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
        !F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK) &&
        !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
        goto err;

    /* Update the throttle time. */
    __wt_lsm_tree_throttle(session, lsm_tree);

    new_id = WT_ATOMIC_ADD(lsm_tree->last, 1);

    WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
        nchunks + 1, &lsm_tree->chunk));

    WT_VERBOSE_ERR(session, lsm,
        "Tree switch to: %" PRIu32 ", throttle %ld",
        new_id, lsm_tree->throttle_sleep);

    WT_ERR(__wt_calloc_def(session, 1, &chunk));
    chunk->id = new_id;
    chunk->txnid_max = WT_TXN_NONE;
    lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
    WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

    WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
    F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
    ++lsm_tree->dsk_gen;

    lsm_tree->modified = 1;

err:    /* TODO: mark lsm_tree bad on error(?) */
    WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
    return (ret);
}
/*
 * __lsm_copy_chunks --
 *     Take a copy of part of the LSM tree chunk array so that we can work
 *     on the contents without holding the LSM tree handle lock long term.
 */
static int
__lsm_copy_chunks(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_WORKER_COOKIE *cookie, int old_chunks)
{
    WT_DECL_RET;
    u_int i, nchunks;
    size_t alloc;

    /* Always return zero chunks on error. */
    cookie->nchunks = 0;

    WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0));
    if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING))
        return (__wt_lsm_tree_unlock(session, lsm_tree));

    /* Take a copy of the current state of the LSM tree. */
    nchunks = old_chunks ? lsm_tree->nold_chunks : lsm_tree->nchunks;
    alloc = old_chunks ? lsm_tree->old_alloc : lsm_tree->chunk_alloc;

    /*
     * If the tree's array of active chunks is larger than our current
     * buffer, increase the size of our current buffer to match.
     */
    if (cookie->chunk_alloc < alloc)
        WT_ERR(__wt_realloc(session,
            &cookie->chunk_alloc, alloc, &cookie->chunk_array));
    if (nchunks > 0)
        memcpy(cookie->chunk_array,
            old_chunks ? lsm_tree->old_chunks : lsm_tree->chunk,
            nchunks * sizeof(*cookie->chunk_array));

    /*
     * Mark each chunk as active, so we don't drop it until after we know
     * it's safe.
     */
    for (i = 0; i < nchunks; i++)
        (void)WT_ATOMIC_ADD(cookie->chunk_array[i]->refcnt, 1);

err:    WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

    if (ret == 0)
        cookie->nchunks = nchunks;
    return (ret);
}
/*
 * __wt_lsm_tree_get --
 *     Get an LSM tree structure for the given name.
 */
int
__wt_lsm_tree_get(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, WT_LSM_TREE **treep)
{
    WT_LSM_TREE *lsm_tree;

    /* See if the tree is already open. */
    TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
        if (strcmp(uri, lsm_tree->name) == 0) {
            if (exclusive && lsm_tree->refcnt)
                return (EBUSY);

            (void)WT_ATOMIC_ADD(lsm_tree->refcnt, 1);
            *treep = lsm_tree;
            return (0);
        }

    /* Open a new tree. */
    return (__lsm_tree_open(session, uri, treep));
}
/*
 * __wt_lsm_tree_get --
 *     Get an LSM tree structure for the given name.
 */
int
__wt_lsm_tree_get(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, WT_LSM_TREE **treep)
{
    WT_LSM_TREE *lsm_tree;

    TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
        if (strcmp(uri, lsm_tree->name) == 0) {
            if (exclusive && lsm_tree->refcnt)
                return (EBUSY);

            (void)WT_ATOMIC_ADD(lsm_tree->refcnt, 1);
            *treep = lsm_tree;
            return (0);
        }

    /*
     * If we don't already hold the schema lock, get it now so that we
     * can find and/or open the handle.
     */
    return (__lsm_tree_open(session, uri, treep));
}
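/*
 * A sketch of the handle-cache lookup pattern above, using hypothetical
 * types and a simple singly-linked list rather than the WiredTiger queue
 * macros: walk the list of open handles, refuse to share a handle that
 * is requested exclusively while it has users, and otherwise bump its
 * reference count before returning it.
 */
#include <errno.h>
#include <stdatomic.h>
#include <stddef.h>
#include <string.h>

struct handle {
    const char *name;
    atomic_uint refcnt;
    struct handle *next;
};

/* Return an existing handle by name, or ENOENT if it is not open. */
static int
handle_get(struct handle *head,
    const char *name, int exclusive, struct handle **hp)
{
    struct handle *h;

    *hp = NULL;
    for (h = head; h != NULL; h = h->next)
        if (strcmp(name, h->name) == 0) {
            /* Exclusive access requires no other users. */
            if (exclusive && atomic_load(&h->refcnt) != 0)
                return (EBUSY);

            (void)atomic_fetch_add(&h->refcnt, 1);
            *hp = h;
            return (0);
        }
    return (ENOENT);
}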
/*
 * __wt_page_alloc --
 *     Create or read a page into the cache.
 */
int
__wt_page_alloc(WT_SESSION_IMPL *session,
    uint8_t type, uint32_t alloc_entries, WT_PAGE **pagep)
{
    WT_CACHE *cache;
    WT_PAGE *page;
    size_t size;
    void *p;

    *pagep = NULL;

    cache = S2C(session)->cache;

    /*
     * Allocate a page, and for most page types, the additional
     * information it needs to describe the disk image.
     */
    size = sizeof(WT_PAGE);
    switch (type) {
    case WT_PAGE_COL_FIX:
        break;
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        size += alloc_entries * sizeof(WT_REF);
        break;
    case WT_PAGE_COL_VAR:
        size += alloc_entries * sizeof(WT_COL);
        break;
    case WT_PAGE_ROW_LEAF:
        size += alloc_entries * sizeof(WT_ROW);
        break;
    WT_ILLEGAL_VALUE(session);
    }

    WT_RET(__wt_calloc(session, 1, size, &page));
    p = (uint8_t *)page + sizeof(WT_PAGE);

    switch (type) {
    case WT_PAGE_COL_FIX:
        break;
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        page->u.intl.t = p;
        break;
    case WT_PAGE_COL_VAR:
        page->u.col_var.d = p;
        break;
    case WT_PAGE_ROW_LEAF:
        page->u.row.d = p;
        break;
    WT_ILLEGAL_VALUE(session);
    }

    /* Increment the cache statistics. */
    __wt_cache_page_inmem_incr(session, page, size);
    (void)WT_ATOMIC_ADD(cache->pages_inmem, 1);

    /* The one page field we set is the type. */
    page->type = type;

    *pagep = page;
    return (0);
}
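/*
 * A small sketch of the allocation pattern used above, with hypothetical
 * page/row types standing in for the WiredTiger structures: size a single
 * calloc() for the page header plus its entry array, then point the array
 * field just past the header, so the whole page can be freed with a
 * single call.
 */
#include <stdint.h>
#include <stdlib.h>

struct row {
    void *key, *value;
};

struct page {
    uint8_t type;
    uint32_t entries;
    struct row *rows;    /* points into the same allocation */
};

static struct page *
page_alloc(uint8_t type, uint32_t entries)
{
    struct page *page;
    size_t size;

    /* One allocation covers the header and the trailing entry array. */
    size = sizeof(struct page) + (size_t)entries * sizeof(struct row);
    if ((page = calloc(1, size)) == NULL)
        return (NULL);

    page->type = type;
    page->entries = entries;

    /*
     * sizeof(struct page) is a multiple of the pointer alignment here,
     * so the trailing array is correctly aligned for struct row.
     */
    page->rows = (struct row *)((uint8_t *)page + sizeof(struct page));
    return (page);
}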
/*
 * __wt_txn_begin --
 *     Begin a transaction.
 */
int
__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *s, *txn_state;
    wt_txnid_t id, oldest_snap_min;
    uint32_t i, n, session_cnt;

    conn = S2C(session);
    txn = &session->txn;
    txn_global = &conn->txn_global;
    txn_state = &txn_global->states[session->id];

    WT_ASSERT(session, txn_state->id == WT_TXN_NONE);

    WT_RET(__wt_config_gets_defno(session, cfg, "isolation", &cval));
    if (cval.len == 0)
        txn->isolation = session->isolation;
    else
        txn->isolation =
            WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
            TXN_ISO_SNAPSHOT :
            WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
            TXN_ISO_READ_COMMITTED : TXN_ISO_READ_UNCOMMITTED;

    F_SET(txn, TXN_RUNNING);

    do {
        /*
         * Allocate a transaction ID.
         *
         * We use an atomic increment to ensure that we get a unique
         * ID, then publish that to the global state table.
         *
         * If two threads race to allocate an ID, only the latest ID
         * will proceed.  The winning thread can be sure its snapshot
         * contains all of the earlier active IDs.  Threads that race
         * and get an earlier ID may not appear in the snapshot, but
         * they will loop and allocate a new ID before proceeding to
         * make any updates.
         *
         * This potentially wastes transaction IDs when threads race
         * to begin transactions, but that is the price we pay to keep
         * this path latch free.
         */
        do {
            txn->id = WT_ATOMIC_ADD(txn_global->current, 1);
        } while (txn->id == WT_TXN_NONE || txn->id == WT_TXN_ABORTED);
        WT_PUBLISH(txn_state->id, txn->id);

        /*
         * If we are starting a snapshot isolation transaction, get
         * a snapshot of the running transactions.
         *
         * If we already have a snapshot (e.g., for an auto-commit
         * operation), update it so that the newly-allocated ID is
         * visible.
         */
        if (txn->isolation == TXN_ISO_SNAPSHOT) {
            txn->last_gen = txn->last_oldest_gen = txn_global->gen;
            oldest_snap_min = txn->id;

            /* Copy the array of concurrent transactions. */
            WT_ORDERED_READ(session_cnt, conn->session_cnt);
            for (i = n = 0, s = txn_global->states;
                i < session_cnt;
                i++, s++) {
                if ((id = s->snap_min) != WT_TXN_NONE)
                    if (TXNID_LT(id, oldest_snap_min))
                        oldest_snap_min = id;
                if ((id = s->id) == WT_TXN_NONE)
                    continue;
                else
                    txn->snapshot[n++] = id;
            }

            __txn_sort_snapshot(
                session, n, txn->id, oldest_snap_min);
            txn_state->snap_min = txn->snap_min;
        }

        /*
         * Ensure the snapshot reads are complete before re-checking
         * the global current ID.
         */
        WT_READ_BARRIER();
    } while (txn->id != txn_global->current);

    return (0);
}
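/*
 * A minimal sketch of the lock-free ID allocation described in the
 * comments above, written with C11 atomics and illustrative sentinel
 * values (TXN_NONE/TXN_ABORTED here are placeholders, not the WiredTiger
 * definitions): atomically increment a global counter and retry if the
 * result collides with a reserved value.
 */
#include <stdatomic.h>
#include <stdint.h>

#define TXN_NONE     0            /* reserved: no transaction */
#define TXN_ABORTED  UINT64_MAX   /* reserved: aborted transaction */

static atomic_uint_fast64_t txn_current;

static uint64_t
txn_id_alloc(void)
{
    uint64_t id;

    do {
        /* atomic_fetch_add returns the old value; add one for the new ID. */
        id = (uint64_t)atomic_fetch_add(&txn_current, 1) + 1;
    } while (id == TXN_NONE || id == TXN_ABORTED);
    return (id);
}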