/*
 * __curfile_remove --
 *     WT_CURSOR->remove method for the btree cursor type.
 */
static int
__curfile_remove(WT_CURSOR *cursor)
{
    WT_CURSOR_BTREE *cbt;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    cbt = (WT_CURSOR_BTREE *)cursor;
    CURSOR_UPDATE_API_CALL(cursor, session, remove, cbt->btree);

    WT_CURSOR_NEEDKEY(cursor);
    WT_CURSOR_NOVALUE(cursor);

    WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_remove(cbt), ret);

    /*
     * After a successful remove, copy the key: the value is not available.
     */
    if (ret == 0) {
        if (F_ISSET(cursor, WT_CURSTD_KEY_INT) &&
            !WT_DATA_IN_ITEM(&(cursor)->key)) {
            WT_ERR(__wt_buf_set(session, &cursor->key,
                cursor->key.data, cursor->key.size));
            F_CLR(cursor, WT_CURSTD_KEY_INT);
            F_SET(cursor, WT_CURSTD_KEY_EXT);
        }
        F_CLR(cursor, WT_CURSTD_VALUE_SET);
    }

err:    CURSOR_UPDATE_API_END(session, ret);
    return (ret);
}
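The key copy above hinges on WT_DATA_IN_ITEM(): it is true only when an item's data pointer falls inside the item's own allocated memory. If the pointer refers elsewhere (for example, into a btree page that may be evicted), the bytes must be copied into memory the cursor owns. Below is a minimal standalone sketch of that check, not WiredTiger code; the struct and macro are simplified stand-ins for WT_ITEM and WT_DATA_IN_ITEM.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct item {
    const void *data;    /* current data pointer */
    size_t size;         /* data length */
    void *mem;           /* memory owned by the item */
    size_t memsize;      /* owned memory length */
};

/* True if data points into the item's own allocation. */
#define DATA_IN_ITEM(i)                                                 \
    ((i)->mem != NULL &&                                                \
    (uint8_t *)(i)->data >= (uint8_t *)(i)->mem &&                      \
    (uint8_t *)(i)->data < (uint8_t *)(i)->mem + (i)->memsize)

/* Copy external bytes into the item's own buffer. */
static int
item_own(struct item *i)
{
    void *p;

    if (DATA_IN_ITEM(i))
        return (0);             /* already self-contained */
    if ((p = malloc(i->size)) == NULL)
        return (-1);
    memcpy(p, i->data, i->size);
    free(i->mem);
    i->mem = p;
    i->memsize = i->size;
    i->data = p;
    return (0);
}

int
main(void)
{
    static const char page_bytes[] = "key-on-page";
    struct item key = { page_bytes, sizeof(page_bytes), NULL, 0 };

    printf("in item before: %d\n", DATA_IN_ITEM(&key));    /* 0 */
    if (item_own(&key) != 0)
        return (1);
    printf("in item after: %d\n", DATA_IN_ITEM(&key));     /* 1 */
    free(key.mem);
    return (0);
}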
/*
 * __async_set_key --
 *     WT_ASYNC_OP->set_key implementation for op handles.
 */
static void
__async_set_key(WT_ASYNC_OP *asyncop, ...)
{
    WT_CURSOR *c;
    va_list ap;

    c = &asyncop->c;
    va_start(ap, asyncop);
    __wt_cursor_set_keyv(c, c->flags, ap);
    if (!WT_DATA_IN_ITEM(&c->key) && !WT_CURSOR_RECNO(c))
        c->saved_err = __wt_buf_set(
            O2S((WT_ASYNC_OP_IMPL *)asyncop),
            &c->key, c->key.data, c->key.size);
    va_end(ap);
}
/*
 * __async_set_value --
 *     WT_ASYNC_OP->set_value implementation for op handles.
 */
static void
__async_set_value(WT_ASYNC_OP *asyncop, ...)
{
    WT_CURSOR *c;
    va_list ap;

    c = &asyncop->c;
    va_start(ap, asyncop);
    __wt_cursor_set_valuev(c, ap);
    /* Copy the data, if it is pointing at data elsewhere. */
    if (!WT_DATA_IN_ITEM(&c->value))
        c->saved_err = __wt_buf_set(
            O2S((WT_ASYNC_OP_IMPL *)asyncop),
            &c->value, c->value.data, c->value.size);
    va_end(ap);
}
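Both setters follow the same varargs-forwarding shape: the public entry point takes "...", builds a va_list, hands it to a shared "v" helper, and then copies the result because the caller's buffer may not outlive the queued async operation. A hedged sketch of that pattern follows; set_rawv() and buf_copy() are hypothetical helpers, not WiredTiger functions.

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct buf {
    const void *data;
    size_t size;
    void *mem;       /* owned copy, if any */
};

/* Shared helper: consume a (pointer, length) pair from the va_list. */
static void
set_rawv(struct buf *b, va_list ap)
{
    b->data = va_arg(ap, const void *);
    b->size = va_arg(ap, size_t);
}

/* Copy the data if it is pointing at memory elsewhere. */
static int
buf_copy(struct buf *b)
{
    void *p;

    if (b->data == b->mem)
        return (0);
    if ((p = malloc(b->size)) == NULL)
        return (-1);
    memcpy(p, b->data, b->size);
    free(b->mem);
    b->mem = p;
    b->data = p;
    return (0);
}

static int
set_key(struct buf *b, ...)
{
    va_list ap;

    va_start(ap, b);
    set_rawv(b, ap);
    va_end(ap);
    return (buf_copy(b));    /* detach from the caller's memory */
}

int
main(void)
{
    struct buf key = { NULL, 0, NULL };
    char stack_key[] = "transient";

    if (set_key(&key, (const void *)stack_key, sizeof(stack_key)) != 0)
        return (1);
    memset(stack_key, 0, sizeof(stack_key));    /* caller reuses it */
    printf("still have: %s\n", (const char *)key.data);
    free(key.mem);
    return (0);
}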
/*
 * __wt_buf_grow_worker --
 *     Try to reallocate the memory underlying a WT_ITEM.
 */
int
__wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
{
    size_t offset;
    int copy_data;    /* whether the data must be copied in */

    /*
     * If buf->data points into buf->mem, remember the offset so the data
     * pointer can be re-established after reallocation; no copy is
     * needed. Otherwise, the data lives elsewhere and must be copied
     * into the (re)allocated memory.
     */
    if (WT_DATA_IN_ITEM(buf)) {
        offset = WT_PTRDIFF(buf->data, buf->mem);
        copy_data = 0;
    } else {
        offset = 0;
        copy_data = buf->size ? 1 : 0;
    }

    /* Reallocate the underlying memory if it's too small. */
    if (size > buf->memsize) {
        if (F_ISSET(buf, WT_ITEM_ALIGNED))
            WT_RET(__wt_realloc_aligned(
                session, &buf->memsize, size, &buf->mem));
        else
            WT_RET(__wt_realloc(
                session, &buf->memsize, size, &buf->mem));
    }

    if (buf->data == NULL) {
        buf->data = buf->mem;
        buf->size = 0;
    } else {
        if (copy_data)
            /* Copy the external data into the buffer. */
            memcpy(buf->mem, buf->data, buf->size);
        buf->data = (uint8_t *)buf->mem + offset;
    }

    return (0);
}
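A minimal illustration (plain stdlib, not WiredTiger) of why the function saves an offset rather than the data pointer itself: realloc is free to move the allocation, so any pointer into the old block is invalid afterwards and must be rebuilt from the new base address.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
    uint8_t *mem, *data, *p;
    size_t offset;

    if ((mem = malloc(16)) == NULL)
        return (1);
    memcpy(mem + 4, "payload", 8);
    data = mem + 4;                 /* points into the allocation */

    offset = (size_t)(data - mem);  /* save the offset, not the pointer */
    if ((p = realloc(mem, 1024)) == NULL) {
        free(mem);
        return (1);
    }
    mem = p;                        /* the block may have moved */
    data = mem + offset;            /* rebuild the interior pointer */

    printf("%s\n", (char *)data);   /* prints "payload" */
    free(mem);
    return (0);
}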
/*
 * __page_read --
 *     Read a page from the file.
 */
static int
__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
    const WT_PAGE_HEADER *dsk;
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_ITEM tmp;
    WT_PAGE *page;
    size_t addr_size;
    uint32_t previous_state;
    const uint8_t *addr;

    btree = S2BT(session);
    page = NULL;

    /*
     * Don't pass an allocated buffer to the underlying block read
     * function, force allocation of new memory of the appropriate size.
     */
    WT_CLEAR(tmp);

    /*
     * Attempt to set the state to WT_REF_READING for normal reads, or
     * WT_REF_LOCKED, for deleted pages. If successful, we've won the
     * race, read the page.
     */
    if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING))
        previous_state = WT_REF_DISK;
    else if (__wt_atomic_casv32(
        &ref->state, WT_REF_DELETED, WT_REF_LOCKED))
        previous_state = WT_REF_DELETED;
    else
        return (0);

    /*
     * Get the address: if there is no address, the page was deleted, but
     * a subsequent search or insert is forcing re-creation of the name
     * space.
     */
    WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
    if (addr == NULL) {
        WT_ASSERT(session, previous_state == WT_REF_DELETED);
        WT_ERR(__wt_btree_new_leaf_page(session, &page));
        ref->page = page;
        goto done;
    }

    /*
     * There's an address, read or map the backing disk page and build an
     * in-memory version of the page.
     */
    WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
    WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
        WT_DATA_IN_ITEM(&tmp) ?
        WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

    /*
     * Clear the local reference to an allocated copy of the disk image
     * on return; the page steals it, errors in this code should not free
     * it.
     */
    tmp.mem = NULL;

    /*
     * If reading for a checkpoint, there's no additional work to do, the
     * page on disk is correct as written.
     */
    if (session->dhandle->checkpoint != NULL)
        goto done;

    /* If the page was deleted, instantiate that information. */
    if (previous_state == WT_REF_DELETED)
        WT_ERR(__wt_delete_page_instantiate(session, ref));

    /*
     * Instantiate updates from the database's lookaside table. The page
     * flag was set when the page was written, potentially a long time
     * ago. We only care if the lookaside table is currently active,
     * check that before doing any work.
     */
    dsk = tmp.data;
    if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) {
        WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside);
        WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside);
        WT_ERR(__las_page_instantiate(
            session, ref, btree->id, addr, addr_size));
    }

done:   WT_PUBLISH(ref->state, WT_REF_MEM);
    return (0);

err:    /*
     * If the function building an in-memory version of the page failed,
     * it discarded the page, but not the disk image. Discard the page
     * and separately discard the disk image in all cases.
     */
    if (ref->page != NULL)
        __wt_ref_out(session, ref);
    WT_PUBLISH(ref->state, previous_state);

    __wt_buf_free(session, &tmp);

    return (ret);
}
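The opening compare-and-swap pair is the whole read-race protocol: whichever thread swings the state from WT_REF_DISK to WT_REF_READING (or WT_REF_DELETED to WT_REF_LOCKED) owns the read; everyone else returns immediately and retries their traversal later. A hedged sketch of the same protocol using C11 atomics instead of WiredTiger's wrappers, with illustrative enum and function names:

#include <stdatomic.h>
#include <stdio.h>

enum ref_state { REF_DISK, REF_DELETED, REF_READING, REF_LOCKED, REF_MEM };

struct ref {
    _Atomic enum ref_state state;
};

/* Returns 1 if this caller won the right to read the page. */
static int
try_begin_read(struct ref *ref, enum ref_state *previous_statep)
{
    enum ref_state expected;

    expected = REF_DISK;
    if (atomic_compare_exchange_strong(&ref->state, &expected,
        REF_READING)) {
        *previous_statep = REF_DISK;
        return (1);
    }
    expected = REF_DELETED;
    if (atomic_compare_exchange_strong(&ref->state, &expected,
        REF_LOCKED)) {
        *previous_statep = REF_DELETED;
        return (1);
    }
    return (0);    /* someone else is reading, or it's already in memory */
}

int
main(void)
{
    struct ref ref;
    enum ref_state prev;

    atomic_init(&ref.state, REF_DISK);
    printf("first caller wins: %d\n", try_begin_read(&ref, &prev));
    printf("second caller loses: %d\n", try_begin_read(&ref, &prev));
    /* On success the reader publishes REF_MEM; on error, prev. */
    atomic_store(&ref.state, REF_MEM);
    return (0);
}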
/*
 * __wt_las_sweep --
 *     Sweep the lookaside table.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_CURSOR *cursor;
    WT_DECL_ITEM(las_addr);
    WT_DECL_ITEM(las_key);
    WT_DECL_RET;
    WT_ITEM *key;
    uint64_t cnt, las_counter, las_txnid;
    int64_t remove_cnt;
    uint32_t las_id, session_flags;
    int notused;

    conn = S2C(session);
    cursor = NULL;
    key = &conn->las_sweep_key;
    remove_cnt = 0;
    session_flags = 0;    /* [-Werror=maybe-uninitialized] */

    WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
    WT_ERR(__wt_scr_alloc(session, 0, &las_key));

    WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

    /*
     * If we're not starting a new sweep, position the cursor using the
     * key from the last call (we don't care if we're before or after the
     * key, just roughly in the same spot is fine).
     */
    if (key->size != 0) {
        __wt_cursor_set_raw_key(cursor, key);
        ret = cursor->search_near(cursor, &notused);

        /*
         * Don't search for the same key twice; if we don't set a new
         * key below, it's because we've reached the end of the table
         * and we want the next pass to start at the beginning of the
         * table. Searching for the same key could leave us stuck at
         * the end of the table, repeatedly checking the same rows.
         */
        key->size = 0;
        if (ret != 0)
            goto srch_notfound;
    }

    /*
     * The sweep server wakes up every 10 seconds (by default), it's a
     * slow moving thread. Try to review the entire lookaside table once
     * every 5 minutes, or every 30 calls.
     *
     * The reason is because the lookaside table exists because we're
     * seeing cache/eviction pressure (it allows us to trade performance
     * and disk space for cache space), and it's likely lookaside blocks
     * are being evicted, and reading them back in doesn't help things. A
     * trickier, but possibly better, alternative might be to review all
     * lookaside blocks in the cache in order to get rid of them, and
     * slowly review lookaside blocks that have already been evicted.
     */
    cnt = (uint64_t)WT_MAX(100, conn->las_record_cnt / 30);

    /* Discard pages we read as soon as we're done with them. */
    F_SET(session, WT_SESSION_NO_CACHE);

    /* Walk the file. */
    for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
        /*
         * If the loop terminates after completing a work unit, we
         * will continue the table sweep next time. Get a local copy
         * of the sweep key, we're going to reset the cursor; do so
         * before calling cursor.remove, cursor.remove can discard our
         * hazard pointer and the page could be evicted from
         * underneath us.
         */
        if (cnt == 1) {
            WT_ERR(__wt_cursor_get_raw_key(cursor, key));
            if (!WT_DATA_IN_ITEM(key))
                WT_ERR(__wt_buf_set(
                    session, key, key->data, key->size));
        }

        WT_ERR(cursor->get_key(cursor,
            &las_id, las_addr, &las_counter, &las_txnid, las_key));

        /*
         * If the on-page record transaction ID associated with the
         * record is globally visible, the record can be discarded.
         *
         * Cursor opened overwrite=true: won't return WT_NOTFOUND
         * should another thread remove the record before we do, and
         * the cursor remains positioned in that case.
         */
        if (__wt_txn_visible_all(session, las_txnid)) {
            WT_ERR(cursor->remove(cursor));
            ++remove_cnt;
        }
    }

srch_notfound:
    WT_ERR_NOTFOUND_OK(ret);

    if (0) {
err:        __wt_buf_free(session, key);
    }

    WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

    /*
     * If there were races to remove records, we can over-count. All
     * arithmetic is signed, so underflow isn't fatal, but check anyway
     * so we don't skew low over time.
     */
    if (remove_cnt > S2C(session)->las_record_cnt)
        S2C(session)->las_record_cnt = 0;
    else if (remove_cnt > 0)
        (void)__wt_atomic_subi64(&conn->las_record_cnt, remove_cnt);

    F_CLR(session, WT_SESSION_NO_CACHE);

    __wt_scr_free(session, &las_addr);
    __wt_scr_free(session, &las_key);

    return (ret);
}
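The clamp at the end is worth isolating: concurrent removers can each count the same record, so the shared total is pinned at zero instead of being allowed to go negative and skew low forever. A standalone sketch of just that guard, with the types simplified (WiredTiger uses its atomic wrappers for the subtraction):

#include <stdio.h>

static long long record_cnt = 250;    /* shared total (simplified) */

static void
sweep_done(long long remove_cnt)
{
    if (remove_cnt > record_cnt)
        record_cnt = 0;        /* clamp: we over-counted */
    else if (remove_cnt > 0)
        record_cnt -= remove_cnt;
}

int
main(void)
{
    sweep_done(100);    /* 250 -> 150 */
    sweep_done(300);    /* over-count: clamp to 0, not -150 */
    printf("record_cnt = %lld\n", record_cnt);
    return (0);
}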
/*
 * __wt_las_sweep --
 *     Sweep the lookaside table.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_CURSOR *cursor;
    WT_DECL_ITEM(las_addr);
    WT_DECL_ITEM(las_key);
    WT_DECL_RET;
    WT_ITEM *key;
    uint64_t cnt, las_counter, las_txnid;
    uint32_t las_id, session_flags;
    int notused;

    conn = S2C(session);
    cursor = NULL;
    key = &conn->las_sweep_key;
    session_flags = 0;    /* [-Werror=maybe-uninitialized] */

    WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
    WT_ERR(__wt_scr_alloc(session, 0, &las_key));

    WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

    /*
     * If we're not starting a new sweep, position the cursor using the
     * key from the last call (we don't care if we're before or after the
     * key, just roughly in the same spot is fine).
     */
    if (conn->las_sweep_call != 0 && key->data != NULL) {
        __wt_cursor_set_raw_key(cursor, key);
        if ((ret = cursor->search_near(cursor, &notused)) != 0)
            goto srch_notfound;
    }

    /*
     * The sweep server wakes up every 10 seconds (by default), it's a
     * slow moving thread. Try to review the entire lookaside table once
     * every 5 minutes, or every 30 calls.
     *
     * The reason is because the lookaside table exists because we're
     * seeing cache/eviction pressure (it allows us to trade performance
     * and disk space for cache space), and it's likely lookaside blocks
     * are being evicted, and reading them back in doesn't help things. A
     * trickier, but possibly better, alternative might be to review all
     * lookaside blocks in the cache in order to get rid of them, and
     * slowly review lookaside blocks that have already been evicted.
     *
     * We can't know for sure how many records are in the lookaside
     * table, the cursor insert and remove statistics aren't updated
     * atomically. Start with reviewing 100 rows, and if it takes more
     * than the target number of calls to finish, increase the number of
     * rows checked on each call; if it takes less than the target calls
     * to finish, then decrease the number of rows reviewed on each call
     * (but never less than 100).
     */
#define WT_SWEEP_LOOKASIDE_MIN_CNT     100
#define WT_SWEEP_LOOKASIDE_PASS_TARGET  30

    ++conn->las_sweep_call;
    if ((cnt = conn->las_sweep_cnt) < WT_SWEEP_LOOKASIDE_MIN_CNT)
        cnt = conn->las_sweep_cnt = WT_SWEEP_LOOKASIDE_MIN_CNT;

    /* Walk the file. */
    for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
        /*
         * If the loop terminates after completing a work unit, we
         * will continue the table sweep next time. Get a local copy
         * of the sweep key, we're going to reset the cursor; do so
         * before calling cursor.remove, cursor.remove can discard our
         * hazard pointer and the page could be evicted from
         * underneath us.
         */
        if (cnt == 1) {
            WT_ERR(__wt_cursor_get_raw_key(cursor, key));
            if (!WT_DATA_IN_ITEM(key))
                WT_ERR(__wt_buf_set(
                    session, key, key->data, key->size));
        }

        WT_ERR(cursor->get_key(cursor,
            &las_id, las_addr, &las_counter, &las_txnid, las_key));

        /*
         * If the on-page record transaction ID associated with the
         * record is globally visible, the record can be discarded.
         *
         * Cursor opened overwrite=true: won't return WT_NOTFOUND
         * should another thread remove the record before we do, and
         * the cursor remains positioned in that case.
         */
        if (__wt_txn_visible_all(session, las_txnid))
            WT_ERR(cursor->remove(cursor));
    }

    /*
     * When reaching the lookaside table end or the target number of
     * calls, adjust the row count. Decrease/increase the row count
     * depending on if the number of calls is less/more than the target.
     */
    if (ret == WT_NOTFOUND ||
        conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) {
        if (conn->las_sweep_call < WT_SWEEP_LOOKASIDE_PASS_TARGET &&
            conn->las_sweep_cnt > WT_SWEEP_LOOKASIDE_MIN_CNT)
            conn->las_sweep_cnt -= WT_SWEEP_LOOKASIDE_MIN_CNT;
        if (conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET)
            conn->las_sweep_cnt += WT_SWEEP_LOOKASIDE_MIN_CNT;
    }

srch_notfound:
    if (ret == WT_NOTFOUND)
        conn->las_sweep_call = 0;
    WT_ERR_NOTFOUND_OK(ret);

    if (0) {
err:        __wt_buf_free(session, key);
    }

    WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

    __wt_scr_free(session, &las_addr);
    __wt_scr_free(session, &las_key);

    return (ret);
}
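A quick simulation of the adaptive row-count tuning this version adds: finishing a full pass in fewer calls than the target shrinks the per-call batch, taking more calls grows it, and the batch never drops below the minimum. The constants mirror the #defines above; the loop driving it is illustrative.

#include <stdio.h>

#define MIN_CNT         100
#define PASS_TARGET      30

int
main(void)
{
    unsigned sweep_cnt = MIN_CNT;
    unsigned calls[] = { 10, 45, 45, 10 };    /* calls per full pass */
    unsigned i, c;

    for (i = 0; i < sizeof(calls) / sizeof(calls[0]); i++) {
        c = calls[i];
        if (c < PASS_TARGET && sweep_cnt > MIN_CNT)
            sweep_cnt -= MIN_CNT;    /* finished early: shrink */
        if (c > PASS_TARGET)
            sweep_cnt += MIN_CNT;    /* finished late: grow */
        printf("pass took %u calls -> next batch %u rows\n",
            c, sweep_cnt);
    }
    return (0);
}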
/*
 * __wt_cache_read --
 *     Read a page from the file.
 */
int
__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
    WT_DECL_RET;
    WT_ITEM tmp;
    WT_PAGE *page;
    WT_PAGE_STATE previous_state;
    size_t addr_size;
    const uint8_t *addr;

    page = NULL;

    /*
     * Don't pass an allocated buffer to the underlying block read
     * function, force allocation of new memory of the appropriate size.
     */
    WT_CLEAR(tmp);

    /*
     * Attempt to set the state to WT_REF_READING for normal reads, or
     * WT_REF_LOCKED, for deleted pages. If successful, we've won the
     * race, read the page.
     */
    if (WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_READING))
        previous_state = WT_REF_DISK;
    else if (WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
        previous_state = WT_REF_DELETED;
    else
        return (0);

    /*
     * Get the address: if there is no address, the page was deleted, but
     * a subsequent search or insert is forcing re-creation of the name
     * space. Otherwise, there's an address, read the backing disk page
     * and build an in-memory version of the page.
     */
    WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
    if (addr == NULL) {
        WT_ASSERT(session, previous_state == WT_REF_DELETED);

        WT_ERR(__wt_btree_new_leaf_page(session, &page));
        ref->page = page;
    } else {
        /* Read the backing disk page. */
        WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));

        /* Build the in-memory version of the page. */
        WT_ERR(__wt_page_inmem(session, ref, tmp.data,
            WT_DATA_IN_ITEM(&tmp) ?
            WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

        /* If the page was deleted, instantiate that information. */
        if (previous_state == WT_REF_DELETED)
            WT_ERR(__wt_delete_page_instantiate(session, ref));
    }

    WT_ERR(__wt_verbose(session, WT_VERB_READ,
        "page %p: %s", page, __wt_page_type_string(page->type)));

    WT_PUBLISH(ref->state, WT_REF_MEM);
    return (0);

err:    /*
     * If the function building an in-memory version of the page failed,
     * it discarded the page, but not the disk image. Discard the page
     * and separately discard the disk image in all cases.
     */
    if (ref->page != NULL)
        __wt_ref_out(session, ref);
    WT_PUBLISH(ref->state, previous_state);

    __wt_buf_free(session, &tmp);

    return (ret);
}
/*
 * __clsm_enter --
 *     Start an operation on an LSM cursor, update if the tree has changed.
 */
static inline int
__clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update)
{
    WT_CURSOR *c;
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_SESSION_IMPL *session;
    uint64_t *txnid_maxp;
    uint64_t id, myid, snap_min;

    session = (WT_SESSION_IMPL *)clsm->iface.session;

    /* Merge cursors never update. */
    if (F_ISSET(clsm, WT_CLSM_MERGE))
        return (0);

    if (reset) {
        c = &clsm->iface;
        /* Copy out data before resetting chunk cursors. */
        if (F_ISSET(c, WT_CURSTD_KEY_INT) &&
            !WT_DATA_IN_ITEM(&c->key))
            WT_RET(__wt_buf_set(
                session, &c->key, c->key.data, c->key.size));
        if (F_ISSET(c, WT_CURSTD_VALUE_INT) &&
            !WT_DATA_IN_ITEM(&c->value))
            WT_RET(__wt_buf_set(session,
                &c->value, c->value.data, c->value.size));
        WT_RET(__clsm_reset_cursors(clsm, NULL));
    }

    for (;;) {
        /*
         * If the cursor looks up-to-date, check if the cache is full.
         * In case this call blocks, the check will be repeated before
         * proceeding.
         */
        if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
            goto open;

        WT_RET(__wt_cache_full_check(session));

        if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
            goto open;

        /* Update the maximum transaction ID in the primary chunk. */
        if (update && (chunk = clsm->primary_chunk) != NULL) {
            WT_RET(__wt_txn_autocommit_check(session));
            for (id = chunk->txnid_max, myid = session->txn.id;
                !TXNID_LE(myid, id);
                id = chunk->txnid_max) {
                WT_ASSERT(session, myid != WT_TXN_NONE);
                (void)WT_ATOMIC_CAS(
                    chunk->txnid_max, id, myid);
            }
        }

        /*
         * Figure out how many updates are required for snapshot
         * isolation.
         *
         * This is not a normal visibility check on the maximum
         * transaction ID in each chunk: any transaction ID that
         * overlaps with our snapshot is a potential conflict.
         */
        clsm->nupdates = 1;
        if (session->txn.isolation == TXN_ISO_SNAPSHOT &&
            F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
            snap_min = session->txn.snap_min;
            for (txnid_maxp = &clsm->txnid_max[clsm->nchunks - 2];
                clsm->nupdates < clsm->nchunks;
                clsm->nupdates++, txnid_maxp--)
                if (TXNID_LT(*txnid_maxp, snap_min))
                    break;
        }

        /*
         * Stop when we are up-to-date, as long as this is:
         *   - a snapshot isolation update and the cursor is set up
         *     for that;
         *   - an update operation with a primary chunk, or
         *   - a read operation and the cursor is open for reading.
         */
        if ((!update ||
            session->txn.isolation != TXN_ISO_SNAPSHOT ||
            F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) &&
            ((update && clsm->primary_chunk != NULL) ||
            (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
            break;

open:        WT_WITH_SCHEMA_LOCK(session,
            ret = __clsm_open_cursors(clsm, update, 0, 0));
        WT_RET(ret);
    }

    if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
        WT_RET(__cursor_enter(session));
        F_SET(clsm, WT_CLSM_ACTIVE);
    }

    return (0);
}
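The txnid_max loop above is a lock-free "advance the maximum" idiom: retry the compare-and-swap until either it succeeds or another thread has already published a value at least as large. A hedged sketch in C11 atomics follows; it ignores the transaction-ID wraparound that TXNID_LE handles, and the names are illustrative.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static void
advance_max(_Atomic uint64_t *maxp, uint64_t myid)
{
    uint64_t id;

    for (id = atomic_load(maxp); myid > id; id = atomic_load(maxp))
        /* On failure, someone else moved it; re-check and retry. */
        (void)atomic_compare_exchange_weak(maxp, &id, myid);
}

int
main(void)
{
    _Atomic uint64_t txnid_max;

    atomic_init(&txnid_max, 10);
    advance_max(&txnid_max, 42);    /* raises the maximum */
    advance_max(&txnid_max, 17);    /* no-op: already larger */
    printf("txnid_max = %llu\n",
        (unsigned long long)atomic_load(&txnid_max));
    return (0);
}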
/*
 * __clsm_open_cursors --
 *     Open cursors for the current set of files.
 */
static int
__clsm_open_cursors(
    WT_CURSOR_LSM *clsm, int update, u_int start_chunk, uint32_t start_id)
{
    WT_CURSOR *c, **cp, *primary;
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;
    WT_TXN *txn;
    const char *checkpoint, *ckpt_cfg[3];
    uint64_t saved_gen;
    u_int i, nchunks, ngood, nupdates;
    int locked;

    c = &clsm->iface;
    session = (WT_SESSION_IMPL *)c->session;
    txn = &session->txn;
    lsm_tree = clsm->lsm_tree;
    chunk = NULL;

    ckpt_cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
    ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
    ckpt_cfg[2] = NULL;

    /* Copy the key, so we don't lose the cursor position. */
    if (F_ISSET(c, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&c->key))
        WT_RET(__wt_buf_set(
            session, &c->key, c->key.data, c->key.size));

    F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

    if (update) {
        if (txn->isolation == TXN_ISO_SNAPSHOT)
            F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);
    } else
        F_SET(clsm, WT_CLSM_OPEN_READ);

    WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0));
    locked = 1;

    /*
     * If there is no in-memory chunk in the tree for an update operation,
     * create one.
     *
     * !!!
     * It is exceedingly unlikely that we get here at all, but if there is
     * a transaction in progress and it rolls back, it would leave the
     * metadata inconsistent.
     */
    if (update && (lsm_tree->nchunks == 0 ||
        (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) == NULL ||
        F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))) {
        /* Release our lock because switch will get a write lock. */
        locked = 0;
        WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
        WT_ERR(__wt_lsm_tree_switch(session, lsm_tree));
        WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
        locked = 1;
    }
    F_SET(session, WT_SESSION_NO_CACHE_CHECK);

    /* Merge cursors have already figured out how many chunks they need. */
retry:  if (F_ISSET(clsm, WT_CLSM_MERGE)) {
        nchunks = clsm->nchunks;
        ngood = 0;

        /*
         * We may have raced with another merge completing. Check that
         * we're starting at the right offset in the chunk array.
         */
        if (start_chunk >= lsm_tree->nchunks ||
            lsm_tree->chunk[start_chunk]->id != start_id) {
            for (start_chunk = 0;
                start_chunk < lsm_tree->nchunks;
                start_chunk++) {
                chunk = lsm_tree->chunk[start_chunk];
                if (chunk->id == start_id)
                    break;
            }
            /* We have to find the start chunk: merge locked it. */
            WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
        }

        WT_ASSERT(session,
            start_chunk + nchunks <= lsm_tree->nchunks);
    } else {
        nchunks = lsm_tree->nchunks;

        /*
         * If we are only opening the cursor for updates, only open
         * the primary chunk, plus any other chunks that might be
         * required to detect snapshot isolation conflicts.
         */
        if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
            WT_ERR(__wt_realloc_def(session,
                &clsm->txnid_alloc, nchunks, &clsm->txnid_max));
        if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
            ngood = nupdates = 0;
        else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
            /*
             * Keep going until all updates in the next chunk are
             * globally visible. Copy the maximum transaction IDs
             * into the cursor as we go.
             */
            for (ngood = nchunks - 1, nupdates = 1;
                ngood > 0;
                ngood--, nupdates++) {
                chunk = lsm_tree->chunk[ngood - 1];
                clsm->txnid_max[ngood - 1] =
                    chunk->txnid_max;
                if (__wt_txn_visible_all(
                    session, chunk->txnid_max))
                    break;
            }
        } else {
            nupdates = 1;
            ngood = nchunks - 1;
        }

        /* Check how many cursors are already open. */
        for (cp = clsm->cursors + ngood;
            ngood < clsm->nchunks && ngood < nchunks;
            cp++, ngood++) {
            chunk = lsm_tree->chunk[ngood];

            /* If the cursor isn't open yet, we're done. */
            if (*cp == NULL)
                break;

            /* Easy case: the URIs don't match. */
            if (strcmp((*cp)->uri, chunk->uri) != 0)
                break;

            /* Make sure the checkpoint config matches. */
            checkpoint = ((WT_CURSOR_BTREE *)*cp)->
                btree->dhandle->checkpoint;
            if (checkpoint == NULL &&
                F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
                !chunk->empty)
                break;

            /* Make sure the Bloom config matches. */
            if (clsm->blooms[ngood] == NULL &&
                F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
                break;
        }

        /* Spurious generation bump? */
        if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
            clsm->dsk_gen = lsm_tree->dsk_gen;
            goto err;
        }

        /*
         * Close any cursors we no longer need.
         *
         * Drop the LSM tree lock while we do this: if the cache is
         * full, we may block while closing a cursor. Save the
         * generation number and retry if it has changed under us.
         */
        if (clsm->cursors != NULL &&
            (ngood < clsm->nchunks ||
            (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0))) {
            saved_gen = lsm_tree->dsk_gen;
            locked = 0;
            WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
            if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) &&
                nupdates > 0)
                WT_ERR(__clsm_close_cursors(
                    clsm, 0, nchunks - nupdates));
            WT_ERR(__clsm_close_cursors(
                clsm, ngood, clsm->nchunks));
            WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
            locked = 1;
            if (lsm_tree->dsk_gen != saved_gen)
                goto retry;
        }

        /* Detach from our old primary. */
        clsm->primary_chunk = NULL;
        clsm->current = NULL;
    }

    WT_ERR(__wt_realloc_def(session,
        &clsm->bloom_alloc, nchunks, &clsm->blooms));
    WT_ERR(__wt_realloc_def(session,
        &clsm->cursor_alloc, nchunks, &clsm->cursors));

    clsm->nchunks = nchunks;

    /* Open the cursors for chunks that have changed. */
    for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
        chunk = lsm_tree->chunk[i + start_chunk];

        /* Copy the maximum transaction ID. */
        if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
            clsm->txnid_max[i] = chunk->txnid_max;

        /*
         * Read from the checkpoint if the file has been written.
         * Once all cursors switch, the in-memory tree can be
         * evicted.
         */
        WT_ASSERT(session, *cp == NULL);
        ret = __wt_open_cursor(session, chunk->uri, c,
            (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
            ckpt_cfg : NULL, cp);

        /*
         * XXX kludge: we may have an empty chunk where no checkpoint
         * was written. If so, try to open the ordinary handle on
         * that chunk instead.
         */
        if (ret == WT_NOTFOUND &&
            F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
            ret = __wt_open_cursor(
                session, chunk->uri, c, NULL, cp);
            if (ret == 0)
                chunk->empty = 1;
        }
        WT_ERR(ret);

        if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
            !F_ISSET(clsm, WT_CLSM_MERGE))
            WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
                lsm_tree->bloom_bit_count,
                lsm_tree->bloom_hash_count,
                c, &clsm->blooms[i]));

        /* Child cursors always use overwrite and raw mode. */
        F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
    }

    /* The last chunk is our new primary. */
    if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
        clsm->primary_chunk = chunk;
        primary = clsm->cursors[clsm->nchunks - 1];
        WT_WITH_BTREE(session,
            ((WT_CURSOR_BTREE *)(primary))->btree,
            __wt_btree_evictable(session, 0));
    }

    clsm->dsk_gen = lsm_tree->dsk_gen;

err:    F_CLR(session, WT_SESSION_NO_CACHE_CHECK);
#ifdef HAVE_DIAGNOSTIC
    /* Check that all cursors are open as expected. */
    if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
        for (i = 0, cp = clsm->cursors;
            i != clsm->nchunks; cp++, i++) {
            chunk = lsm_tree->chunk[i + start_chunk];

            /* Make sure the cursor is open. */
            WT_ASSERT(session, *cp != NULL);

            /* Easy case: the URIs should match. */
            WT_ASSERT(session,
                strcmp((*cp)->uri, chunk->uri) == 0);

            /* Make sure the checkpoint config matches. */
            checkpoint = ((WT_CURSOR_BTREE *)*cp)->
                btree->dhandle->checkpoint;
            WT_ASSERT(session,
                (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
                !chunk->empty) ?
                checkpoint != NULL : checkpoint == NULL);

            /* Make sure the Bloom config matches. */
            WT_ASSERT(session,
                (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
                !F_ISSET(clsm, WT_CLSM_MERGE)) ?
                clsm->blooms[i] != NULL :
                clsm->blooms[i] == NULL);
        }
    }
#endif
    if (locked)
        WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
    return (ret);
}
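The cursor-close section follows a generation-based optimistic retry discipline: record the tree's generation, release the lock to do blocking work, reacquire it, and start over if the generation moved. A compact sketch of that shape, where everything (types, lock functions) is a simplified stand-in for the WiredTiger equivalents:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct tree {
    pthread_mutex_t lock;
    uint64_t dsk_gen;    /* bumped whenever the chunk set changes */
};

static void
open_cursors(struct tree *t)
{
    uint64_t saved_gen;

retry:  pthread_mutex_lock(&t->lock);
    saved_gen = t->dsk_gen;

    /* Blocking work (e.g. closing cursors) happens unlocked. */
    pthread_mutex_unlock(&t->lock);
    /* ... close cursors, possibly blocking on cache pressure ... */
    pthread_mutex_lock(&t->lock);

    if (t->dsk_gen != saved_gen) {
        /* The tree changed underneath us: start over. */
        pthread_mutex_unlock(&t->lock);
        goto retry;
    }
    printf("opened against generation %llu\n",
        (unsigned long long)t->dsk_gen);
    pthread_mutex_unlock(&t->lock);
}

int
main(void)
{
    struct tree t = { PTHREAD_MUTEX_INITIALIZER, 7 };

    open_cursors(&t);
    return (0);
}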