/*
 * __wt_lsm_compact --
 *     Compact an LSM tree called via __wt_schema_worker.
 *
 *     Sources whose name does not match the "lsm:" prefix are ignored: 0 is
 *     returned and *skip is left untouched. For an LSM source, *skip is set
 *     to 1, the tree's WT_LSM_TREE_COMPACTING flag is set, the tree's work
 *     condition is signalled, and the call polls until merge_progressing
 *     stops changing between two polls or the tree has at most one chunk.
 *
 *     Returns 0 on success, EINVAL if merging is not enabled on the
 *     connection or the tree has no merge threads, ETIMEDOUT if
 *     session->compact->max_time is non-zero and has been exceeded, or the
 *     error returned by a failing helper.
 *
 *     NOTE(review): the tree obtained from __wt_lsm_tree_get is not released
 *     in this function; confirm whether the getter takes a reference that
 *     must be dropped.
 */
int
__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
{
    WT_DECL_RET;
    WT_LSM_TREE *lsm_tree;
    uint64_t last_merge_progressing;
    time_t begin, end;

    /*
     * This function is applied to all matching sources: ignore anything
     * that is not an LSM tree.
     */
    if (!WT_PREFIX_MATCH(name, "lsm:"))
        return (0);

    /* Tell __wt_schema_worker not to look inside the LSM tree. */
    *skip = 1;

    WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));

    if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE) ||
        lsm_tree->merge_threads == 0)
        WT_RET_MSG(session, EINVAL,
            "LSM compaction requires active merge threads");

    WT_RET(__wt_seconds(session, &begin));

    F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);

    /*
     * From here on every failure must leave through the err label so the
     * compacting flag is cleared again; returning directly would leave the
     * flag set on the tree.
     */

    /* Wake up the merge threads. */
    WT_ERR(__wt_cond_signal(session, lsm_tree->work_cond));

    /* Now wait for merge activity to stop. */
    do {
        last_merge_progressing = lsm_tree->merge_progressing;
        __wt_sleep(1, 0);
        WT_ERR(__wt_seconds(session, &end));
        /* A max_time of zero means no time limit is enforced. */
        if (session->compact->max_time > 0 &&
            session->compact->max_time < (uint64_t)(end - begin))
            WT_ERR(ETIMEDOUT);
    } while (lsm_tree->merge_progressing != last_merge_progressing &&
        lsm_tree->nchunks > 1);

err:
    F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);

    return (ret);
}
/*
 * __wt_meta_ckptlist_set --
 *     Set a file's checkpoint value from the WT_CKPT list.
 *
 *     Builds a "checkpoint=(...)" string with one entry per checkpoint in
 *     ckptbase that is not flagged WT_CKPT_DELETE, optionally followed by a
 *     "checkpoint_lsn=(...)" entry when ckptlsn is non-NULL, and hands the
 *     result to __ckpt_set for fname. Checkpoints flagged WT_CKPT_ADD or
 *     WT_CKPT_UPDATE have their address, timestamp and (for additions) order
 *     refreshed in place before being formatted.
 *
 *     Returns 0 on success or the first error from a helper; the scratch
 *     buffer is freed on every path.
 */
int
__wt_meta_ckptlist_set(WT_SESSION_IMPL *session,
    const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn)
{
    WT_CKPT *entry;
    WT_DECL_ITEM(cfg);
    WT_DECL_RET;
    time_t now;
    int64_t order_max;
    const char *delim;

    order_max = 0;
    delim = "";

    WT_ERR(__wt_scr_alloc(session, 0, &cfg));
    WT_ERR(__wt_buf_fmt(session, cfg, "checkpoint=("));

    WT_CKPT_FOREACH(ckptbase, entry) {
        /*
         * Track the largest order seen so far, including on deleted
         * checkpoints, so a newly added checkpoint always receives an
         * order greater than every entry that preceded it in the list.
         * Internal checkpoint names get this order appended to keep
         * them unique even when two checkpoints share a timestamp.
         */
        if (entry->order > order_max)
            order_max = entry->order;

        /* Deleted checkpoints are not written back. */
        if (F_ISSET(entry, WT_CKPT_DELETE))
            continue;

        if (F_ISSET(entry, WT_CKPT_ADD | WT_CKPT_UPDATE)) {
            /*
             * An empty raw cookie yields an empty address; otherwise
             * the raw cookie is converted to a hex string.
             */
            if (entry->raw.size != 0) {
                WT_ERR(__wt_raw_to_hex(session,
                    entry->raw.data, entry->raw.size, &entry->addr));
            } else {
                entry->addr.size = 0;
            }

            /* Only newly added checkpoints get a fresh order. */
            if (F_ISSET(entry, WT_CKPT_ADD))
                entry->order = ++order_max;

            /*
             * XXX
             * The cast assumes a time_t fits into a uintmax_t, which
             * is not guaranteed: time_t only has to be an arithmetic
             * type, not an integral one.
             */
            WT_ERR(__wt_seconds(session, &now));
            entry->sec = (uintmax_t)now;
        }

        /*
         * The internal checkpoint name is suffixed with ".<order>"; any
         * other name is written as-is.
         *
         * NOTE(review): entry->order is printed with both PRId64 and
         * PRIu64 below; confirm its declared type matches.
         */
        if (strcmp(entry->name, WT_CHECKPOINT) != 0) {
            WT_ERR(__wt_buf_catfmt(session, cfg,
                "%s%s=(addr=\"%.*s\",order=%" PRIu64
                ",time=%" PRIuMAX ",size=%" PRIu64
                ",write_gen=%" PRIu64 ")",
                delim, entry->name,
                (int)entry->addr.size, (char *)entry->addr.data,
                entry->order, entry->sec, entry->ckpt_size,
                entry->write_gen));
        } else {
            WT_ERR(__wt_buf_catfmt(session, cfg,
                "%s%s.%" PRId64 "=(addr=\"%.*s\",order=%" PRIu64
                ",time=%" PRIuMAX ",size=%" PRIu64
                ",write_gen=%" PRIu64 ")",
                delim, entry->name, entry->order,
                (int)entry->addr.size, (char *)entry->addr.data,
                entry->order, entry->sec, entry->ckpt_size,
                entry->write_gen));
        }

        /* Every entry after the first is comma-separated. */
        delim = ",";
    }
    WT_ERR(__wt_buf_catfmt(session, cfg, ")"));

    if (ckptlsn != NULL) {
        WT_ERR(__wt_buf_catfmt(session, cfg,
            ",checkpoint_lsn=(%" PRIu32 ",%" PRIuMAX ")",
            ckptlsn->file, (uintmax_t)ckptlsn->offset));
    }

    WT_ERR(__ckpt_set(session, fname, cfg->mem));

err:
    __wt_scr_free(&cfg);
    return (ret);
}
/*
 * __sweep --
 *     Close unused dhandles on the connection dhandle list.
 *
 *     Walks every handle on the connection's list. Handles that still have a
 *     session reference, or whose timeofdeath is no more than
 *     WT_DHANDLE_SWEEP_WAIT before the current time, are left alone. Open
 *     candidates are synced and closed under the handle's write lock, then
 *     every candidate is offered for discard.
 *
 *     EBUSY from the lock attempt, the close or the discard is not an error:
 *     the handle is simply left for a later pass. Any other error is
 *     returned immediately. Returns 0 otherwise.
 */
static int
__sweep(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DATA_HANDLE *dh, *dh_next;
    WT_DECL_RET;
    time_t now;

    conn = S2C(session);

    WT_RET(__wt_seconds(session, &now));

    for (dh = SLIST_FIRST(&conn->dhlh); dh != NULL; dh = dh_next) {
        /* Fetch the successor first: the handle may be discarded below. */
        dh_next = SLIST_NEXT(dh, l);

        /* Handles still referenced by a session are not candidates. */
        if (dh->session_ref != 0)
            continue;

        /* Neither are handles that died too recently. */
        if (now - dh->timeofdeath <= WT_DHANDLE_SWEEP_WAIT)
            continue;

        /*
         * An open candidate is closed under the handle's write lock. The
         * lock blocks threads trying to open the handle; that may last a
         * while (the close can involve disk I/O), but the handle has been
         * quiescent. WT_DHANDLE_EXCLUSIVE is deliberately not set, so
         * opens block on us rather than returning EBUSY to the
         * application.
         */
        if (F_ISSET(dh, WT_DHANDLE_OPEN)) {
            if ((ret = __wt_try_writelock(session, dh->rwlock)) != 0) {
                if (ret != EBUSY)
                    return (ret);
                /* Someone else holds the lock: try again next pass. */
                ret = 0;
                continue;
            }

            /*
             * The close can fail with EBUSY if an update cannot be
             * written yet (updates in a no-longer-referenced file might
             * not be globally visible if sessions have disjoint sets of
             * files open). That is retried on a later pass.
             */
            WT_WITH_DHANDLE(session, dh,
                ret = __wt_conn_btree_sync_and_close(session));
            if (ret == EBUSY)
                ret = 0;
            WT_TRET(__wt_rwunlock(session, dh->rwlock));
            WT_RET(ret);
        }

        /*
         * Attempt to discard the handle. The called function checks the
         * handle-open flag after acquiring appropriate locks, so an EBUSY
         * close above (which never cleared that flag) needs no special
         * handling here.
         */
        ret = __wt_conn_dhandle_discard_single(session, dh, 0);
        if (ret != 0 && ret != EBUSY)
            return (ret);
        ret = 0;
    }
    return (0);
}
/*
 * On each pass of the wait loop in __wt_lsm_compact, merge work units are
 * pushed while a counter starting at the tree's queue_ref is below this value.
 */
#define COMPACT_PARALLEL_MERGES 5

/*
 * __wt_lsm_compact --
 *     Compact an LSM tree called via __wt_schema_worker.
 *
 *     Sources whose name does not match the "lsm:" prefix are ignored: 0 is
 *     returned and *skip is left untouched. For an LSM source, *skip is set
 *     to 1 and compaction runs in two phases:
 *       1. Flush: if the tree has a last chunk, a forced flush work unit is
 *          pushed repeatedly until that chunk is flagged on disk.
 *       2. Merge: WT_LSM_TREE_COMPACTING is set on the tree and merge work
 *          units are pushed until merging stops making progress.
 *
 *     Returns 0 on success (including when another compaction is already
 *     flagged on the tree), EINVAL if merging is not enabled on the
 *     connection, ETIMEDOUT if session->compact->max_time is non-zero and
 *     has been exceeded, or the error returned by a failing helper.
 */
int
__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    time_t start, now;
    uint64_t last_progress;
    int in_compact, in_flush, have_lock, have_ref, queued;

    chunk = NULL;
    in_compact = in_flush = have_lock = have_ref = 0;

    /*
     * This function is applied to all matching sources: anything that is
     * not an LSM tree is ignored.
     */
    if (!WT_PREFIX_MATCH(name, "lsm:"))
        return (0);

    /* Tell __wt_schema_worker not to look inside the LSM tree. */
    *skip = 1;

    WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));

    if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
        WT_ERR_MSG(session, EINVAL,
            "LSM compaction requires active merge threads");

    WT_ERR(__wt_seconds(session, &start));

    /* Lock the tree: the setup below is done by one thread at a time. */
    WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
    have_lock = 1;

    /* Clear any merge throttle: compact throws out that calculation. */
    lsm_tree->merge_throttle = 0;
    lsm_tree->merge_aggressiveness = 0;
    last_progress = lsm_tree->merge_progressing;

    /* If another thread started a compact on this tree, we're done. */
    if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
        goto err;

    if (lsm_tree->nchunks > 0) {
        chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
        if (chunk != NULL) {
            /*
             * Set the switch transaction on the current chunk if it has
             * not been set before. This prevents further writes, so the
             * chunk can be flushed by the checkpoint worker.
             */
            if (chunk->switch_txn == WT_TXN_NONE)
                chunk->switch_txn = __wt_txn_new_id(session);

            /*
             * We will be polling this chunk for its on-disk flag, so
             * take a reference to keep it available.
             */
            (void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
            have_ref = 1;
        }
    }

    /*
     * Mark the lock as dropped before unlocking so the error path does not
     * attempt a second unlock if this one fails.
     */
    have_lock = 0;
    WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));

    if (chunk == NULL) {
        /* Nothing to flush: go straight to the compacting phase. */
        in_compact = 1;
        last_progress = lsm_tree->merge_progressing;
        F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
        WT_ERR(__wt_verbose(session, WT_VERB_LSM,
            "COMPACT: Start compacting %s", lsm_tree->name));
    } else {
        WT_ERR(__wt_verbose(session, WT_VERB_LSM,
            "Compact force flush %s flags 0x%" PRIx32
            " chunk %u flags 0x%" PRIx32,
            name, lsm_tree->flags, chunk->id, chunk->flags));
        in_flush = 1;

        /*
         * Make sure the in-memory chunk gets flushed, but do not push a
         * switch: we don't want to create a new in-memory chunk if the
         * tree is being used read-only now.
         */
        WT_ERR(__wt_lsm_manager_push_entry(session,
            WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
    }

    /* Poll once a second while the tree remains active. */
    while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
        /*
         * Flush phase: keep pushing forced flushes until the chunk is on
         * disk, then drop our reference and move to the compacting phase.
         */
        if (in_flush) {
            WT_ASSERT(session, chunk != NULL);
            if (!F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
                WT_ERR(__wt_verbose(session, WT_VERB_LSM,
                    "Compact flush retry %s chunk %u",
                    name, chunk->id));
                WT_ERR(__wt_lsm_manager_push_entry(session,
                    WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
            } else {
                WT_ERR(__wt_verbose(session, WT_VERB_LSM,
                    "Compact flush done %s chunk %u. "
                    "Start compacting progress %" PRIu64,
                    name, chunk->id, lsm_tree->merge_progressing));
                (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
                in_flush = have_ref = 0;
                in_compact = 1;
                F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
                last_progress = lsm_tree->merge_progressing;
            }
        }

        /*
         * Compacting phase: the tree's compacting flag has been cleared
         * (presumably by the merge code when no merge can be done). Stop
         * only once aggressiveness has reached 10, no progress has been
         * made since the last check and no merge is syncing; otherwise
         * set the flag again at full aggressiveness, so merges spanning
         * chunks of different generations still get a chance to run.
         */
        if (in_compact && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) {
            if (lsm_tree->merge_aggressiveness >= 10 &&
                !(last_progress < lsm_tree->merge_progressing) &&
                !lsm_tree->merge_syncing)
                break;

            last_progress = lsm_tree->merge_progressing;
            F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
            lsm_tree->merge_aggressiveness = 10;
        }

        __wt_sleep(1, 0);
        WT_ERR(__wt_seconds(session, &now));

        /* A max_time of zero means no time limit is enforced. */
        if (session->compact->max_time > 0 &&
            session->compact->max_time < (uint64_t)(now - start)) {
            WT_ERR(ETIMEDOUT);
        }

        /*
         * Push merge operations while they are still getting work done,
         * and make them aggressive to avoid duplicating effort.
         */
        if (in_compact) {
            for (queued = lsm_tree->queue_ref;
                queued < COMPACT_PARALLEL_MERGES; queued++) {
                lsm_tree->merge_aggressiveness = 10;
                WT_ERR(__wt_lsm_manager_push_entry(
                    session, WT_LSM_WORK_MERGE, 0, lsm_tree));
            }
        }
    }

err:
    /* Ensure anything we set is cleared. */
    if (have_ref)
        (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
    if (in_compact) {
        F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
        lsm_tree->merge_aggressiveness = 0;
    }
    if (have_lock) {
        WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
    }
    WT_TRET(__wt_verbose(session, WT_VERB_LSM,
        "Compact %s complete, return %d", name, ret));

    __wt_lsm_tree_release(session, lsm_tree);
    return (ret);
}
/*
 * __sweep --
 *     Close unused dhandles on the connection dhandle list.
 *
 *     Walks every handle on the connection's list, skipping the metadata
 *     handle, handles in use by a session, and handles whose timeofdeath
 *     plus WT_DHANDLE_SWEEP_WAIT has not yet passed. A handle with no
 *     timeofdeath recorded gets the current time stamped on it and is
 *     revisited on a later pass. Remaining candidates are write-locked,
 *     closed if open, and discarded if no session references them.
 *
 *     EBUSY from the lock attempt, the close or the discard leaves the
 *     handle for a later pass; any other error is returned immediately.
 *     Returns 0 otherwise.
 */
static int
__sweep(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DATA_HANDLE *dh, *dh_next;
    WT_DECL_RET;
    time_t now;
    int locked;

    conn = S2C(session);

    /* The current time decides which handles have been idle long enough. */
    WT_RET(__wt_seconds(session, &now));

    WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps);

    for (dh = SLIST_FIRST(&conn->dhlh); dh != NULL; dh = dh_next) {
        /* Fetch the successor first: the handle may be discarded below. */
        dh_next = SLIST_NEXT(dh, l);

        if (WT_IS_METADATA(dh))
            continue;

        /* Skip handles in use by a session. */
        if (dh->session_inuse != 0)
            continue;

        /* Skip handles that have not been dead long enough. */
        if (now <= dh->timeofdeath + WT_DHANDLE_SWEEP_WAIT)
            continue;

        /* No time of death recorded yet: record it and revisit later. */
        if (dh->timeofdeath == 0) {
            dh->timeofdeath = now;
            WT_STAT_FAST_CONN_INCR(session, dh_conn_tod);
            continue;
        }

        /*
         * We have a candidate for closing: take the handle's write lock.
         * We might block opens for a long time (over disk I/O), but the
         * handle was quiescent for a while. WT_DHANDLE_EXCLUSIVE is
         * deliberately not set, so opens block on us rather than
         * returning an EBUSY error to the application.
         */
        ret = __wt_try_writelock(session, dh->rwlock);
        if (ret == EBUSY)
            continue;
        WT_RET(ret);
        locked = 1;

        /*
         * If the handle is open, try to close it. The close can fail if
         * an update cannot be written (updates in a no-longer-referenced
         * file might not yet be globally visible if sessions have
         * disjoint sets of files open); a busy handle is retried on a
         * later pass, after the transaction state has progressed.
         */
        if (F_ISSET(dh, WT_DHANDLE_OPEN)) {
            WT_WITH_DHANDLE(session, dh,
                ret = __wt_conn_btree_sync_and_close(session, 0));
            if (ret == 0) {
                /* We closed the btree handle, bump the statistic. */
                WT_STAT_FAST_CONN_INCR(session, dh_conn_handles);
            }
        }

        /*
         * If the close did not fail and no session references the handle
         * any longer, attempt to discard it. The called function
         * re-checks that the handle is not in use.
         */
        if (ret == 0) {
            if (dh->session_inuse == 0 && dh->session_ref == 0) {
                WT_WITH_DHANDLE(session, dh,
                    ret = __wt_conn_dhandle_discard_single(session, 0));
                /* If the handle was discarded, it isn't locked. */
                if (ret == 0)
                    locked = 0;
            } else {
                WT_STAT_FAST_CONN_INCR(session, dh_conn_ref);
            }
        }

        if (locked) {
            WT_TRET(__wt_writeunlock(session, dh->rwlock));
        }

        /* EBUSY is tolerated; any other error ends the sweep. */
        WT_RET_BUSY_OK(ret);
    }
    return (0);
}