/* Returns number of items removed, expired, or evicted.
 * Callable from worker threads or the LRU maintainer thread */
static int lru_pull_tail(const int orig_id, const int cur_lru,
        const uint64_t total_bytes, uint8_t flags) {
    item *it = NULL;
    int id = orig_id;
    int removed = 0;
    if (id == 0)
        return 0;

    int tries = 5;
    item *search;
    item *next_it;
    void *hold_lock = NULL;
    unsigned int move_to_lru = 0;
    uint64_t limit = 0;

    id |= cur_lru;
    pthread_mutex_lock(&lru_locks[id]);
    search = tails[id];
    /* We walk up *only* for locked items, and if bottom is expired. */
    for (; tries > 0 && search != NULL; tries--, search=next_it) {
        /* we might relink search mid-loop, so search->prev isn't reliable */
        next_it = search->prev;
        if (search->nbytes == 0 && search->nkey == 0 && search->it_flags == 1) {
            /* We are a crawler, ignore it. */
            if (flags & LRU_PULL_CRAWL_BLOCKS) {
                pthread_mutex_unlock(&lru_locks[id]);
                return 0;
            }
            tries++;
            continue;
        }
        uint32_t hv = hash(ITEM_key(search), search->nkey);
        /* Attempt to hash item lock the "search" item. If locked, no
         * other callers can incr the refcount. Also skip ourselves. */
        if ((hold_lock = item_trylock(hv)) == NULL)
            continue;
        /* Now see if the item is refcount locked */
        if (refcount_incr(&search->refcount) != 2) {
            /* Note pathological case with ref'ed items in tail.
             * Can still unlink the item, but it won't be reusable yet */
            itemstats[id].lrutail_reflocked++;
            /* In case of refcount leaks, enable for quick workaround. */
            /* WARNING: This can cause terrible corruption */
            if (settings.tail_repair_time &&
                    search->time + settings.tail_repair_time < current_time) {
                itemstats[id].tailrepairs++;
                search->refcount = 1;
                /* This will call item_remove -> item_free since refcnt is 1 */
                do_item_unlink_nolock(search, hv);
                item_trylock_unlock(hold_lock);
                continue;
            }
        }

        /* Expired or flushed */
        if ((search->exptime != 0 && search->exptime < current_time)
            || item_is_flushed(search)) {
            itemstats[id].reclaimed++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                itemstats[id].expired_unfetched++;
            }
            /* refcnt 2 -> 1 */
            do_item_unlink_nolock(search, hv);
            /* refcnt 1 -> 0 -> item_free */
            do_item_remove(search);
            item_trylock_unlock(hold_lock);
            removed++;

            /* If all we're finding are expired, can keep going */
            continue;
        }

        /* If we're HOT_LRU or WARM_LRU and over size limit, send to COLD_LRU.
         * If we're COLD_LRU, send to WARM_LRU unless we need to evict */
        switch (cur_lru) {
            case HOT_LRU:
                limit = total_bytes * settings.hot_lru_pct / 100;
                /* intentional fall through: HOT and WARM share the logic below,
                 * each with its own byte limit */
            case WARM_LRU:
                if (limit == 0)
                    limit = total_bytes * settings.warm_lru_pct / 100;
                if (sizes_bytes[id] > limit) {
                    itemstats[id].moves_to_cold++;
                    move_to_lru = COLD_LRU;
                    do_item_unlink_q(search);
                    it = search;
                    removed++;
                    break;
                } else if ((search->it_flags & ITEM_ACTIVE) != 0) {
                    /* Only allow ACTIVE relinking if we're not too large. */
                    itemstats[id].moves_within_lru++;
                    search->it_flags &= ~ITEM_ACTIVE;
                    do_item_update_nolock(search);
                    do_item_remove(search);
                    item_trylock_unlock(hold_lock);
                } else {
                    /* Don't want to move to COLD, not active, bail out */
                    it = search;
                }
                break;
            case COLD_LRU:
                it = search; /* No matter what, we're stopping */
                if (flags & LRU_PULL_EVICT) {
                    if (settings.evict_to_free == 0) {
                        /* Don't think we need a counter for this. It'll OOM. */
                        break;
                    }
                    itemstats[id].evicted++;
                    itemstats[id].evicted_time = current_time - search->time;
                    if (search->exptime != 0)
                        itemstats[id].evicted_nonzero++;
                    if ((search->it_flags & ITEM_FETCHED) == 0) {
                        itemstats[id].evicted_unfetched++;
                    }
                    LOGGER_LOG(NULL, LOG_EVICTIONS, LOGGER_EVICTION, search);
                    do_item_unlink_nolock(search, hv);
                    removed++;
                    if (settings.slab_automove == 2) {
                        slabs_reassign(-1, orig_id);
                    }
                } else if ((search->it_flags & ITEM_ACTIVE) != 0
                        && settings.lru_maintainer_thread) {
                    itemstats[id].moves_to_warm++;
                    search->it_flags &= ~ITEM_ACTIVE;
                    move_to_lru = WARM_LRU;
                    do_item_unlink_q(search);
                    removed++;
                }
                break;
        }
        if (it != NULL)
            break;
    }

    pthread_mutex_unlock(&lru_locks[id]);

    if (it != NULL) {
        if (move_to_lru) {
            it->slabs_clsid = ITEM_clsid(it);
            it->slabs_clsid |= move_to_lru;
            item_link_q(it);
        }
        do_item_remove(it);
        item_trylock_unlock(hold_lock);
    }

    return removed;
}
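/* Illustrative sketch, not part of the source above: the byte-limit math that
 * the HOT_LRU/WARM_LRU switch in lru_pull_tail performs, pulled out as a
 * standalone helper. The helper name lru_byte_limit() is hypothetical; the
 * settings fields are the ones referenced above. Worked example: with
 * total_bytes = 1,000,000 and hot_lru_pct = 20, a HOT sub-LRU holding more
 * than 200,000 bytes starts shuffling items down to COLD. */
static inline uint64_t lru_byte_limit(const int cur_lru, const uint64_t total_bytes) {
    switch (cur_lru) {
        case HOT_LRU:
            return total_bytes * settings.hot_lru_pct / 100;
        case WARM_LRU:
            return total_bytes * settings.warm_lru_pct / 100;
        default:
            return 0; /* COLD has no byte cap; it evicts or demotes to WARM */
    }
}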
/** wrapper around assoc_find which does the lazy expiration logic */
item *do_item_get(const char *key, const size_t nkey, const uint32_t hv, conn *c) {
    item *it = assoc_find(key, nkey, hv);
    if (it != NULL) {
        refcount_incr(&it->refcount);
        /* Optimization for slab reassignment. prevents popular items from
         * jamming in busy wait. Can only do this here to satisfy lock order
         * of item_lock, slabs_lock. */
        /* This was made unsafe by removal of the cache_lock:
         * slab_rebalance_signal and slab_rebal.* are modified in a separate
         * thread under slabs_lock. If slab_rebalance_signal = 1, slab_start =
         * NULL (0), but slab_end is still equal to some value, this would end
         * up unlinking every item fetched.
         * This is either an acceptable loss, or if slab_rebalance_signal is
         * true, slab_start/slab_end should be put behind the slabs_lock.
         * Which would cause a huge potential slowdown.
         * Could also use a specific lock for slab_rebal.* and
         * slab_rebalance_signal (shorter lock?)
         */
        /*if (slab_rebalance_signal &&
            ((void *)it >= slab_rebal.slab_start && (void *)it < slab_rebal.slab_end)) {
            do_item_unlink(it, hv);
            do_item_remove(it);
            it = NULL;
        }*/
    }
    int was_found = 0;

    if (settings.verbose > 2) {
        int ii;
        if (it == NULL) {
            fprintf(stderr, "> NOT FOUND ");
        } else {
            fprintf(stderr, "> FOUND KEY ");
        }
        for (ii = 0; ii < nkey; ++ii) {
            fprintf(stderr, "%c", key[ii]);
        }
    }

    if (it != NULL) {
        was_found = 1;
        if (item_is_flushed(it)) {
            do_item_unlink(it, hv);
            do_item_remove(it);
            it = NULL;
            pthread_mutex_lock(&c->thread->stats.mutex);
            c->thread->stats.get_flushed++;
            pthread_mutex_unlock(&c->thread->stats.mutex);
            if (settings.verbose > 2) {
                fprintf(stderr, " -nuked by flush");
            }
            was_found = 2;
        } else if (it->exptime != 0 && it->exptime <= current_time) {
            do_item_unlink(it, hv);
            do_item_remove(it);
            it = NULL;
            pthread_mutex_lock(&c->thread->stats.mutex);
            c->thread->stats.get_expired++;
            pthread_mutex_unlock(&c->thread->stats.mutex);
            if (settings.verbose > 2) {
                fprintf(stderr, " -nuked by expire");
            }
            was_found = 3;
        } else {
            it->it_flags |= ITEM_FETCHED|ITEM_ACTIVE;
            DEBUG_REFCNT(it, '+');
        }
    }

    if (settings.verbose > 2)
        fprintf(stderr, "\n");
    /* For now this is in addition to the above verbose logging. */
    LOGGER_LOG(c->thread->l, LOG_FETCHERS, LOGGER_ITEM_GET, NULL, was_found, key, nkey);

    return it;
}
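/* Illustrative sketch, not from the source: the lazy-expiration predicate that
 * do_item_get and lru_pull_tail both apply, written as a single hypothetical
 * helper. Note the call sites differ slightly on the boundary: do_item_get
 * treats exptime <= current_time as expired, lru_pull_tail uses <. */
static inline bool item_is_dead(item *it) {
    return item_is_flushed(it) ||
           (it->exptime != 0 && it->exptime <= current_time);
}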
static void storage_compact_readback(void *storage, logger *l,
        bool drop_unread, char *readback_buf,
        uint32_t page_id, uint64_t page_version, uint64_t read_size) {
    uint64_t offset = 0;
    unsigned int rescues = 0;
    unsigned int lost = 0;
    unsigned int skipped = 0;

    while (offset < read_size) {
        item *hdr_it = NULL;
        item_hdr *hdr = NULL;
        item *it = (item *)(readback_buf+offset);
        unsigned int ntotal;
        // probably zeroed out junk at the end of the wbuf
        if (it->nkey == 0) {
            break;
        }

        ntotal = ITEM_ntotal(it);
        uint32_t hv = (uint32_t)it->time;
        item_lock(hv);

        // We don't have a conn and don't need to do most of do_item_get
        hdr_it = assoc_find(ITEM_key(it), it->nkey, hv);
        if (hdr_it != NULL) {
            bool do_write = false;
            refcount_incr(hdr_it);

            // Check validity but don't bother removing it.
            if ((hdr_it->it_flags & ITEM_HDR) && !item_is_flushed(hdr_it) &&
                    (hdr_it->exptime == 0 || hdr_it->exptime > current_time)) {
                hdr = (item_hdr *)ITEM_data(hdr_it);
                if (hdr->page_id == page_id && hdr->page_version == page_version) {
                    // Item header is still completely valid.
                    extstore_delete(storage, page_id, page_version, 1, ntotal);
                    // drop inactive items.
                    if (drop_unread && GET_LRU(hdr_it->slabs_clsid) == COLD_LRU) {
                        do_write = false;
                        skipped++;
                    } else {
                        do_write = true;
                    }
                }
            }

            if (do_write) {
                bool do_update = false;
                int tries;
                obj_io io;
                io.len = ntotal;
                io.mode = OBJ_IO_WRITE;
                for (tries = 10; tries > 0; tries--) {
                    if (extstore_write_request(storage, PAGE_BUCKET_COMPACT,
                                PAGE_BUCKET_COMPACT, &io) == 0) {
                        memcpy(io.buf, it, io.len);
                        extstore_write(storage, &io);
                        do_update = true;
                        break;
                    } else {
                        usleep(1000);
                    }
                }

                if (do_update) {
                    if (it->refcount == 2) {
                        hdr->page_version = io.page_version;
                        hdr->page_id = io.page_id;
                        hdr->offset = io.offset;
                        rescues++;
                    } else {
                        lost++;
                        // TODO: re-alloc and replace header.
                    }
                } else {
                    lost++;
                }
            }

            do_item_remove(hdr_it);
        }

        item_unlock(hv);
        offset += ntotal;
        if (read_size - offset < sizeof(struct _stritem))
            break;
    }

    STATS_LOCK();
    stats.extstore_compact_lost += lost;
    stats.extstore_compact_rescues += rescues;
    stats.extstore_compact_skipped += skipped;
    STATS_UNLOCK();
    LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_COMPACT_READ_END,
            NULL, page_id, offset, rescues, lost, skipped);
}
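/* Illustrative sketch (assumption, not part of the source): the bounded
 * retry-with-backoff used above when the compactor cannot get a free extstore
 * write buffer, factored into a standalone hypothetical helper. It mirrors the
 * loop in storage_compact_readback: up to ten attempts, sleeping 1ms between
 * them, copying the object and queueing the write on success. */
static bool compact_try_write(void *storage, obj_io *io, item *it) {
    int tries;
    for (tries = 10; tries > 0; tries--) {
        if (extstore_write_request(storage, PAGE_BUCKET_COMPACT,
                    PAGE_BUCKET_COMPACT, io) == 0) {
            memcpy(io->buf, it, io->len); /* copy the object into the write buffer */
            extstore_write(storage, io);  /* queue it for the destination page */
            return true;
        }
        usleep(1000); /* no free write buffer yet; back off briefly */
    }
    return false;
}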
/* refcount == 0 is safe since nobody can incr while item_lock is held.
 * refcount != 0 is impossible since flags/etc can be modified in other
 * threads. instead, note we found a busy one and bail. logic in do_item_get
 * will prevent busy items from continuing to be busy
 * NOTE: This is checking it_flags outside of an item lock. I believe this
 * works since it_flags is 8 bits, and we're only ever comparing a single bit
 * regardless. ITEM_SLABBED bit will always be correct since we're holding the
 * lock which modifies that bit. ITEM_LINKED won't exist if we're between an
 * item having ITEM_SLABBED removed, and the key hasn't been added to the item
 * yet. The memory barrier from the slabs lock should order the key write and the
 * flags to the item?
 * If ITEM_LINKED did exist and was just removed, but we still see it, that's
 * still safe since it will have a valid key, which we then lock, and then
 * recheck everything.
 * This may not be safe on all platforms; If not, slabs_alloc() will need to
 * seed the item key while holding slabs_lock.
 */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    uint32_t hv;
    void *hold_lock;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        hv = 0;
        hold_lock = NULL;
        item *it = slab_rebal.slab_pos;
        status = MOVE_PASS;
        /* ITEM_FETCHED when ITEM_SLABBED is overloaded to mean we've cleared
         * the chunk for move. Only these two flags should exist.
         */
        if (it->it_flags != (ITEM_SLABBED|ITEM_FETCHED)) {
            /* ITEM_SLABBED can only be added/removed under the slabs_lock */
            if (it->it_flags & ITEM_SLABBED) {
                /* remove from slab freelist */
                if (s_cls->slots == it) {
                    s_cls->slots = it->next;
                }
                if (it->next) it->next->prev = it->prev;
                if (it->prev) it->prev->next = it->next;
                s_cls->sl_curr--;
                status = MOVE_FROM_SLAB;
            } else if ((it->it_flags & ITEM_LINKED) != 0) {
                /* If it doesn't have ITEM_SLABBED, the item could be in any
                 * state on its way to being freed or written to. If no
                 * ITEM_SLABBED, but it's had ITEM_LINKED, it must be active
                 * and have the key written to it already. */
                hv = hash(ITEM_key(it), it->nkey);
                if ((hold_lock = item_trylock(hv)) == NULL) {
                    status = MOVE_LOCKED;
                } else {
                    refcount = refcount_incr(&it->refcount);
                    if (refcount == 2) { /* item is linked but not busy */
                        /* Double check ITEM_LINKED flag here, since we're
                         * past a memory barrier from the mutex. */
                        if ((it->it_flags & ITEM_LINKED) != 0) {
                            status = MOVE_FROM_LRU;
                        } else {
                            /* refcount == 1 + !ITEM_LINKED means the item is being
                             * uploaded to, or was just unlinked but hasn't been freed
                             * yet. Let it bleed off on its own and try again later */
                            status = MOVE_BUSY;
                        }
                    } else {
                        if (settings.verbose > 2) {
                            fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                                it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                        }
                        status = MOVE_BUSY;
                    }
                    /* Item lock must be held while modifying refcount */
                    if (status == MOVE_BUSY) {
                        refcount_decr(&it->refcount);
                        item_trylock_unlock(hold_lock);
                    }
                }
            } else {
                /* See above comment. No ITEM_SLABBED or ITEM_LINKED. Mark
                 * busy and wait for item to complete its upload. */
                status = MOVE_BUSY;
            }
        }

        int save_item = 0;
        item *new_it = NULL;
        size_t ntotal = 0;
        switch (status) {
            case MOVE_FROM_LRU:
                /* Lock order is LRU locks -> slabs_lock. unlink uses LRU lock.
                 * We only need to hold the slabs_lock while initially looking
                 * at an item, and at this point we have an exclusive refcount
                 * (2) + the item is locked. Drop slabs lock, drop item to
                 * refcount 1 (just our own), then fall through and wipe it. */
                /* Check if expired or flushed */
                ntotal = ITEM_ntotal(it);
                /* REQUIRES slabs_lock: CHECK FOR cls->sl_curr > 0 */
                if ((it->exptime != 0 && it->exptime < current_time)
                    || item_is_flushed(it)) {
                    /* TODO: maybe we only want to save if item is in HOT or
                     * WARM LRU? */
                    save_item = 0;
                } else if ((new_it = slab_rebalance_alloc(ntotal, slab_rebal.s_clsid)) == NULL) {
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else {
                    save_item = 1;
                }
                pthread_mutex_unlock(&slabs_lock);
                if (save_item) {
                    /* if free memory, memcpy. clear prev/next/h_bucket */
                    memcpy(new_it, it, ntotal);
                    new_it->prev = 0;
                    new_it->next = 0;
                    new_it->h_next = 0;
                    /* These are definitely required. else fails assert */
                    new_it->it_flags &= ~ITEM_LINKED;
                    new_it->refcount = 0;
                    do_item_replace(it, new_it, hv);
                    slab_rebal.rescues++;
                } else {
                    do_item_unlink(it, hv);
                }
                item_trylock_unlock(hold_lock);
                pthread_mutex_lock(&slabs_lock);
                /* Always remove the ntotal, as we added it in during
                 * do_slabs_alloc() when copying the item.
                 */
                s_cls->requested -= ntotal;
            case MOVE_FROM_SLAB:
                it->refcount = 0;
                it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                break;
            case MOVE_BUSY:
            case MOVE_LOCKED:
                slab_rebal.busy_items++;
                was_busy++;
                break;
            case MOVE_PASS:
                break;
        }

        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            STATS_LOCK();
            stats.slab_reassign_busy_items += slab_rebal.busy_items;
            STATS_UNLOCK();
            slab_rebal.busy_items = 0;
        } else {
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);

    return was_busy;
}
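/* Illustrative sketch, not in the source: the "linked but not busy" test that
 * slab_rebalance_move relies on, spelled out as a hypothetical predicate.
 * After the mover's own refcount_incr, a count of exactly 2 means one
 * reference from the hash table/LRU link plus the mover's temporary one, so
 * no worker thread currently holds the item and it can be relocated. */
static inline bool rebal_item_is_idle(const item *it, const int refcount_after_incr) {
    return refcount_after_incr == 2 && (it->it_flags & ITEM_LINKED) != 0;
}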
/* refcount == 0 is safe since nobody can incr while item_lock is held.
 * refcount != 0 is impossible since flags/etc can be modified in other
 * threads. instead, note we found a busy one and bail. logic in do_item_get
 * will prevent busy items from continuing to be busy
 * NOTE: This is checking it_flags outside of an item lock. I believe this
 * works since it_flags is 8 bits, and we're only ever comparing a single bit
 * regardless. ITEM_SLABBED bit will always be correct since we're holding the
 * lock which modifies that bit. ITEM_LINKED won't exist if we're between an
 * item having ITEM_SLABBED removed, and the key hasn't been added to the item
 * yet. The memory barrier from the slabs lock should order the key write and the
 * flags to the item?
 * If ITEM_LINKED did exist and was just removed, but we still see it, that's
 * still safe since it will have a valid key, which we then lock, and then
 * recheck everything.
 * This may not be safe on all platforms; If not, slabs_alloc() will need to
 * seed the item key while holding slabs_lock.
 */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    uint32_t hv;
    void *hold_lock;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        hv = 0;
        hold_lock = NULL;
        item *it = slab_rebal.slab_pos;
        item_chunk *ch = NULL;
        status = MOVE_PASS;
        if (it->it_flags & ITEM_CHUNK) {
            /* This chunk is a chained part of a larger item. */
            ch = (item_chunk *) it;
            /* Instead, we use the head chunk to find the item and effectively
             * lock the entire structure. If a chunk has ITEM_CHUNK flag, its
             * head cannot be slabbed, so the normal routine is safe. */
            it = ch->head;
            assert(it->it_flags & ITEM_CHUNKED);
        }

        /* ITEM_FETCHED when ITEM_SLABBED is overloaded to mean we've cleared
         * the chunk for move. Only these two flags should exist.
         */
        if (it->it_flags != (ITEM_SLABBED|ITEM_FETCHED)) {
            /* ITEM_SLABBED can only be added/removed under the slabs_lock */
            if (it->it_flags & ITEM_SLABBED) {
                assert(ch == NULL);
                slab_rebalance_cut_free(s_cls, it);
                status = MOVE_FROM_SLAB;
            } else if ((it->it_flags & ITEM_LINKED) != 0) {
                /* If it doesn't have ITEM_SLABBED, the item could be in any
                 * state on its way to being freed or written to. If no
                 * ITEM_SLABBED, but it's had ITEM_LINKED, it must be active
                 * and have the key written to it already. */
                hv = hash(ITEM_key(it), it->nkey);
                if ((hold_lock = item_trylock(hv)) == NULL) {
                    status = MOVE_LOCKED;
                } else {
                    refcount = refcount_incr(it);
                    if (refcount == 2) { /* item is linked but not busy */
                        /* Double check ITEM_LINKED flag here, since we're
                         * past a memory barrier from the mutex. */
                        if ((it->it_flags & ITEM_LINKED) != 0) {
                            status = MOVE_FROM_LRU;
                        } else {
                            /* refcount == 1 + !ITEM_LINKED means the item is being
                             * uploaded to, or was just unlinked but hasn't been freed
                             * yet. Let it bleed off on its own and try again later */
                            status = MOVE_BUSY;
                        }
                    } else {
                        if (settings.verbose > 2) {
                            fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                                it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                        }
                        status = MOVE_BUSY;
                    }
                    /* Item lock must be held while modifying refcount */
                    if (status == MOVE_BUSY) {
                        refcount_decr(it);
                        item_trylock_unlock(hold_lock);
                    }
                }
            } else {
                /* See above comment. No ITEM_SLABBED or ITEM_LINKED. Mark
                 * busy and wait for item to complete its upload. */
                status = MOVE_BUSY;
            }
        }

        int save_item = 0;
        item *new_it = NULL;
        size_t ntotal = 0;
        switch (status) {
            case MOVE_FROM_LRU:
                /* Lock order is LRU locks -> slabs_lock. unlink uses LRU lock.
                 * We only need to hold the slabs_lock while initially looking
                 * at an item, and at this point we have an exclusive refcount
                 * (2) + the item is locked. Drop slabs lock, drop item to
                 * refcount 1 (just our own), then fall through and wipe it. */
                /* Check if expired or flushed */
                ntotal = ITEM_ntotal(it);
                /* REQUIRES slabs_lock: CHECK FOR cls->sl_curr > 0 */
                if (ch == NULL && (it->it_flags & ITEM_CHUNKED)) {
                    /* Chunked should be identical to non-chunked, except we need
                     * to swap out ntotal for the head-chunk-total. */
                    ntotal = s_cls->size;
                }
                if ((it->exptime != 0 && it->exptime < current_time)
                    || item_is_flushed(it)) {
                    /* Expired, don't save. */
                    save_item = 0;
                } else if (ch == NULL &&
                        (new_it = slab_rebalance_alloc(ntotal, slab_rebal.s_clsid)) == NULL) {
                    /* Not a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else if (ch != NULL &&
                        (new_it = slab_rebalance_alloc(s_cls->size, slab_rebal.s_clsid)) == NULL) {
                    /* Is a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else {
                    /* Was whatever it was, and we have memory for it. */
                    save_item = 1;
                }
                pthread_mutex_unlock(&slabs_lock);
                unsigned int requested_adjust = 0;
                if (save_item) {
                    if (ch == NULL) {
                        assert((new_it->it_flags & ITEM_CHUNKED) == 0);
                        /* if free memory, memcpy. clear prev/next/h_bucket */
                        memcpy(new_it, it, ntotal);
                        new_it->prev = 0;
                        new_it->next = 0;
                        new_it->h_next = 0;
                        /* These are definitely required. else fails assert */
                        new_it->it_flags &= ~ITEM_LINKED;
                        new_it->refcount = 0;
                        do_item_replace(it, new_it, hv);
                        /* Need to walk the chunks and repoint head */
                        if (new_it->it_flags & ITEM_CHUNKED) {
                            item_chunk *fch = (item_chunk *) ITEM_data(new_it);
                            fch->next->prev = fch;
                            while (fch) {
                                fch->head = new_it;
                                fch = fch->next;
                            }
                        }
                        it->refcount = 0;
                        it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                        slab_rebal.rescues++;
                        requested_adjust = ntotal;
                    } else {
                        item_chunk *nch = (item_chunk *) new_it;
                        /* Chunks always have head chunk (the main it) */
                        ch->prev->next = nch;
                        if (ch->next)
                            ch->next->prev = nch;
                        memcpy(nch, ch, ch->used + sizeof(item_chunk));
                        ch->refcount = 0;
                        ch->it_flags = ITEM_SLABBED|ITEM_FETCHED;
                        slab_rebal.chunk_rescues++;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key((item *)ch), "deadbeef", 8);
#endif
                        refcount_decr(it);
                        requested_adjust = s_cls->size;
                    }
                } else {
                    /* restore ntotal in case we tried saving a head chunk. */
                    ntotal = ITEM_ntotal(it);
                    do_item_unlink(it, hv);
                    slabs_free(it, ntotal, slab_rebal.s_clsid);
                    /* Swing around again later to remove it from the freelist. */
                    slab_rebal.busy_items++;
                    was_busy++;
                }
                item_trylock_unlock(hold_lock);
                pthread_mutex_lock(&slabs_lock);
                /* Always remove the ntotal, as we added it in during
                 * do_slabs_alloc() when copying the item.
                 */
                s_cls->requested -= requested_adjust;
                break;
            case MOVE_FROM_SLAB:
                it->refcount = 0;
                it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                break;
            case MOVE_BUSY:
            case MOVE_LOCKED:
                slab_rebal.busy_items++;
                was_busy++;
                break;
            case MOVE_PASS:
                break;
        }

        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            STATS_LOCK();
            stats.slab_reassign_busy_items += slab_rebal.busy_items;
            STATS_UNLOCK();
            slab_rebal.busy_items = 0;
        } else {
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);

    return was_busy;
}
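/* Illustrative sketch (hypothetical helper, not in the source): the head
 * resolution step that the chunked-item version above adds at the top of the
 * scan loop. Given a raw slab position, return the item whose hash lock and
 * refcount govern it; an interior chunk of a chained item is reached through
 * its head chunk. */
static item *rebal_resolve_head(void *slab_pos, item_chunk **chp) {
    item *it = (item *)slab_pos;
    *chp = NULL;
    if (it->it_flags & ITEM_CHUNK) {
        *chp = (item_chunk *)it;  /* interior chunk of a chained item */
        it = (*chp)->head;        /* the head item carries the key and flags */
        assert(it->it_flags & ITEM_CHUNKED);
    }
    return it;
}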