/* Loop up to N times:
 * If too many items are in HOT_LRU, push to COLD_LRU
 * If too many items are in WARM_LRU, push to COLD_LRU
 * If too many items are in COLD_LRU, poke COLD_LRU tail
 * 1000 loops with 1ms min sleep gives us under 1m items shifted/sec. The
 * locks can't handle much more than that. Leaving a TODO for how to
 * autoadjust in the future.
 */
static int lru_maintainer_juggle(const int slabs_clsid) {
    int i;
    int did_moves = 0;
    bool mem_limit_reached = false;
    uint64_t total_bytes = 0;
    unsigned int chunks_perslab = 0;
    unsigned int chunks_free = 0;
    /* TODO: if free_chunks below high watermark, increase aggressiveness */
    chunks_free = slabs_available_chunks(slabs_clsid, &mem_limit_reached,
            &total_bytes, &chunks_perslab);
    if (settings.expirezero_does_not_evict)
        total_bytes -= noexp_lru_size(slabs_clsid);

    /* If slab automove is enabled on any level, and we have more than 2.5
     * pages worth of chunks free in this class, ask (gently) to reassign a
     * page from this class back into the global pool (0) */
    if (settings.slab_automove > 0 && chunks_free > (chunks_perslab * 2.5)) {
        slabs_reassign(slabs_clsid, SLAB_GLOBAL_PAGE_POOL);
    }

    /* Juggle HOT/WARM up to N times */
    for (i = 0; i < 1000; i++) {
        int do_more = 0;
        if (lru_pull_tail(slabs_clsid, HOT_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS) ||
            lru_pull_tail(slabs_clsid, WARM_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS)) {
            do_more++;
        }
        do_more += lru_pull_tail(slabs_clsid, COLD_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS);
        if (do_more == 0)
            break;
        did_moves++;
    }
    return did_moves;
}
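/* Illustrative sketch (not from the source tree): one way a maintainer thread
 * could drive lru_maintainer_juggle() across every slab class, backing off
 * toward a longer sleep when nothing moves and snapping back to the 1ms
 * floor mentioned above when work appears. The real lru_maintainer_thread()
 * is more involved; do_run_lru_maintainer_thread, POWER_SMALLEST, and
 * MAX_NUMBER_OF_SLAB_CLASSES are assumed here, and usleep() needs
 * <unistd.h>. */
static void *lru_maintainer_sketch(void *arg) {
    useconds_t to_sleep = 1000; /* 1ms minimum, per the comment above */
    while (do_run_lru_maintainer_thread) {
        int did_moves = 0;
        for (int clsid = POWER_SMALLEST; clsid < MAX_NUMBER_OF_SLAB_CLASSES; clsid++) {
            did_moves += lru_maintainer_juggle(clsid);
        }
        if (did_moves == 0) {
            if (to_sleep < 1000000) /* back off toward 1s when idle */
                to_sleep += 1000;
        } else {
            to_sleep = 1000;
        }
        usleep(to_sleep);
    }
    return NULL;
}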
item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
                    const rel_time_t exptime, const int nbytes) {
    int i;
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    unsigned int id = slabs_clsid(ntotal);
    if (id == 0)
        return 0;

    /* If no memory is available, attempt a direct LRU juggle/eviction */
    /* This is a race in order to simplify lru_pull_tail; in cases where
     * locked items are on the tail, you want them to fall out and cause
     * occasional OOM's, rather than internally work around them.
     * This also gives one fewer code path for slab alloc/free
     */
    /* TODO: if power_largest, try a lot more times? or a number of times
     * based on how many chunks the new object should take up?
     * or based on the size of an object lru_pull_tail() says it evicted?
     * This is a classical GC problem if "large items" vary too widely in
     * size. It is actually okay here since the larger the data, the more
     * bandwidth it takes, the more time we can loop in comparison to serving
     * and replacing small items.
     */
    for (i = 0; i < 10; i++) {
        uint64_t total_bytes;
        /* Try to reclaim memory first */
        if (!settings.lru_maintainer_thread) {
            lru_pull_tail(id, COLD_LRU, 0, 0);
        }
        it = slabs_alloc(ntotal, id, &total_bytes, 0);
        if (settings.expirezero_does_not_evict)
            total_bytes -= noexp_lru_size(id);
        if (it == NULL) {
            if (settings.lru_maintainer_thread) {
                lru_pull_tail(id, HOT_LRU, total_bytes, 0);
                lru_pull_tail(id, WARM_LRU, total_bytes, 0);
                if (lru_pull_tail(id, COLD_LRU, total_bytes, LRU_PULL_EVICT) <= 0)
                    break;
            } else {
                if (lru_pull_tail(id, COLD_LRU, 0, LRU_PULL_EVICT) <= 0)
                    break;
            }
        } else {
            break;
        }
    }

    if (i > 0) {
        pthread_mutex_lock(&lru_locks[id]);
        itemstats[id].direct_reclaims += i;
        pthread_mutex_unlock(&lru_locks[id]);
    }

    if (it == NULL) {
        pthread_mutex_lock(&lru_locks[id]);
        itemstats[id].outofmemory++;
        pthread_mutex_unlock(&lru_locks[id]);
        return NULL;
    }

    assert(it->slabs_clsid == 0);
    //assert(it != heads[id]);

    /* Refcount is seeded to 1 by slabs_alloc() */
    it->next = it->prev = 0;

    /* Items are initially loaded into the HOT_LRU. This is '0' but I want at
     * least a note here. Compiler (hopefully?) optimizes this out. */
    if (settings.lru_maintainer_thread) {
        if (exptime == 0 && settings.expirezero_does_not_evict) {
            id |= NOEXP_LRU;
        } else {
            id |= HOT_LRU;
        }
    } else {
        /* There is only COLD in compat-mode */
        id |= COLD_LRU;
    }
    it->slabs_clsid = id;

    DEBUG_REFCNT(it, '*');
    it->it_flags |= settings.use_cas ? ITEM_CAS : 0;
    it->nkey = nkey;
    it->nbytes = nbytes;
    memcpy(ITEM_key(it), key, nkey);
    it->exptime = exptime;
    memcpy(ITEM_suffix(it), suffix, (size_t) nsuffix);
    it->nsuffix = nsuffix;

    /* Need to shuffle the pointer stored in h_next into it->data. */
    if (it->it_flags & ITEM_CHUNKED) {
        item_chunk *chunk = (item_chunk *) ITEM_data(it);

        chunk->next = (item_chunk *) it->h_next;
        chunk->prev = 0;
        chunk->head = it;
        /* Need to chain back into the head's chunk */
        chunk->next->prev = chunk;
        chunk->size = chunk->next->size - ((char *)chunk - (char *)it);
        chunk->used = 0;
        assert(chunk->size > 0);
    }
    it->h_next = 0;

    return it;
}
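/* Illustrative sketch (not from the source tree): a minimal store path built
 * on do_item_alloc(). ITEM_data() is the real accessor for the value region;
 * item_link() and item_remove() are assumed here to be the locking wrappers
 * around do_item_link()/do_item_remove(), and store_value() itself is a
 * hypothetical helper. nbytes includes the trailing "\r\n" the text protocol
 * stores with each value. */
static bool store_value(char *key, size_t nkey, uint32_t flags,
                        rel_time_t exptime, const char *value, int vlen) {
    /* +2 leaves room for the "\r\n" terminator. */
    item *it = do_item_alloc(key, nkey, flags, exptime, vlen + 2);
    if (it == NULL) {
        return false; /* still OOM after the direct reclaim loop above */
    }
    memcpy(ITEM_data(it), value, vlen);
    memcpy(ITEM_data(it) + vlen, "\r\n", 2);
    item_link(it);   /* hash table + LRU insert; sets ITEM_LINKED */
    item_remove(it); /* drop the refcount seeded by slabs_alloc() */
    return true;
}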
static int storage_write(void *storage, const int clsid, const int item_age) {
    int did_moves = 0;
    struct lru_pull_tail_return it_info;

    it_info.it = NULL;
    lru_pull_tail(clsid, COLD_LRU, 0, LRU_PULL_RETURN_ITEM, 0, &it_info);
    /* Item is locked, and we have a reference to it. */
    if (it_info.it == NULL) {
        return did_moves;
    }

    obj_io io;
    item *it = it_info.it;
    /* First, storage for the header object */
    size_t orig_ntotal = ITEM_ntotal(it);
    uint32_t flags;
    if ((it->it_flags & ITEM_HDR) == 0 &&
            (item_age == 0 || current_time - it->time > item_age)) {
        FLAGS_CONV(it, flags);
        item *hdr_it = do_item_alloc(ITEM_key(it), it->nkey, flags, it->exptime, sizeof(item_hdr));
        /* Run the storage write understanding the start of the item is dirty.
         * We will fill it (time/exptime/etc) from the header item on read.
         */
        if (hdr_it != NULL) {
            int bucket = (it->it_flags & ITEM_CHUNKED) ?
                PAGE_BUCKET_CHUNKED : PAGE_BUCKET_DEFAULT;
            // Compress soon-to-expire items into similar pages.
            if (it->exptime - current_time < settings.ext_low_ttl) {
                bucket = PAGE_BUCKET_LOWTTL;
            }
            hdr_it->it_flags |= ITEM_HDR;
            io.len = orig_ntotal;
            io.mode = OBJ_IO_WRITE;
            // NOTE: when the item is read back in, the slab mover
            // may see it. Important to have refcount >= 2 or ~ITEM_LINKED
            assert(it->refcount >= 2);
            // NOTE: write bucket vs free page bucket will disambiguate once
            // lowttl feature is better understood.
            if (extstore_write_request(storage, bucket, bucket, &io) == 0) {
                // cuddle the hash value into the time field so we don't have
                // to recalculate it.
                item *buf_it = (item *) io.buf;
                buf_it->time = it_info.hv;
                // copy from past the headers + time headers.
                // TODO: should be in items.c
                if (it->it_flags & ITEM_CHUNKED) {
                    // Need to loop through the item and copy
                    item_chunk *sch = (item_chunk *) ITEM_schunk(it);
                    int remain = orig_ntotal;
                    int copied = 0;
                    // copy original header
                    int hdrtotal = ITEM_ntotal(it) - it->nbytes;
                    memcpy((char *)io.buf+STORE_OFFSET, (char *)it+STORE_OFFSET,
                            hdrtotal - STORE_OFFSET);
                    copied = hdrtotal;
                    // copy data in as if it were one large object.
                    while (sch && remain) {
                        assert(remain >= sch->used);
                        memcpy((char *)io.buf+copied, sch->data, sch->used);
                        // FIXME: use one variable?
                        remain -= sch->used;
                        copied += sch->used;
                        sch = sch->next;
                    }
                } else {
                    memcpy((char *)io.buf+STORE_OFFSET, (char *)it+STORE_OFFSET,
                            io.len-STORE_OFFSET);
                }
                // crc what we copied so we can do it sequentially.
                buf_it->it_flags &= ~ITEM_LINKED;
                buf_it->exptime = crc32c(0, (char*)io.buf+STORE_OFFSET, orig_ntotal-STORE_OFFSET);
                extstore_write(storage, &io);
                item_hdr *hdr = (item_hdr *) ITEM_data(hdr_it);
                hdr->page_version = io.page_version;
                hdr->page_id = io.page_id;
                hdr->offset = io.offset;
                // overload nbytes for the header it
                hdr_it->nbytes = it->nbytes;
                /* success! Now we need to fill relevant data into the new
                 * header and replace. Most of this requires the item lock */
                /* CAS gets set while linking. Copy post-replace */
                item_replace(it, hdr_it, it_info.hv);
                ITEM_set_cas(hdr_it, ITEM_get_cas(it));
                do_item_remove(hdr_it);
                did_moves = 1;
                LOGGER_LOG(NULL, LOG_EVICTIONS, LOGGER_EXTSTORE_WRITE, it, bucket);
            } else {
                /* Failed to write for some reason, can't continue. */
                slabs_free(hdr_it, ITEM_ntotal(hdr_it), ITEM_clsid(hdr_it));
            }
        }
    }
    do_item_remove(it);
    item_unlock(it_info.hv);
    return did_moves;
}
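/* Illustrative sketch (not from the source tree): how a flusher loop might
 * drive storage_write(), draining each slab class until no COLD tail item
 * old enough to flush remains. storage_write_sweep() and its min_item_age
 * parameter are hypothetical; POWER_SMALLEST and MAX_NUMBER_OF_SLAB_CLASSES
 * are assumed class bounds. */
static void storage_write_sweep(void *storage, const int min_item_age) {
    for (int clsid = POWER_SMALLEST; clsid < MAX_NUMBER_OF_SLAB_CLASSES; clsid++) {
        /* storage_write() returns 1 when it moved an item to extstore and
         * 0 when the tail was empty, too young, or already an ITEM_HDR stub,
         * so the inner loop stops exactly when the class is drained. */
        while (storage_write(storage, clsid, min_item_age) != 0)
            ;
    }
}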