/*
 * Hand an item's memory back to the slab allocator.
 *
 * The asserts require that the item is no longer linked, is neither the
 * head nor the tail of its class's list, and has no outstanding references.
 * Any kdtree attached to the item is released before the memory is freed.
 */
void item_free(item *it) {
    /* Size is captured up front, before the header fields are rewritten. */
    const size_t item_bytes = ITEM_ntotal(it);
    unsigned int home_class;

    assert((it->it_flags & ITEM_LINKED) == 0);
    assert(it != heads[it->slabs_clsid]);
    assert(it != tails[it->slabs_clsid]);
    assert(it->refcount == 0);

    /* Remember the owning class, then zero it and set ITEM_SLABBED so the
     * slab size changer can tell later whether the item is already free. */
    home_class = it->slabs_clsid;
    it->slabs_clsid = 0;
    it->it_flags |= ITEM_SLABBED;
    DEBUG_REFCNT(it, 'F');

    /* Release the attached kdtree, if any, and clear the stale pointer. */
    if (it->kdtree) {
        kd_free(it->kdtree);
        it->kdtree = 0;
    }

    slabs_free(it, item_bytes, home_class);
}
/*
 * Release an item.
 *
 * With MOXI_ITEM_MALLOC defined, one reference is dropped and the item is
 * passed to free() once the count reaches zero.  Otherwise the item must
 * already be unlinked and unreferenced (see the asserts), and its memory is
 * returned to the slab class it was allocated from.
 */
void item_free(item *it) {
#ifdef MOXI_ITEM_MALLOC
    assert(it->refcount > 0);
    /* Drop our reference; the last one out frees the memory. */
    if (--it->refcount == 0) {
        free(it);
    }
#else
    /* Size is captured up front, before the header fields are rewritten. */
    const size_t item_bytes = ITEM_ntotal(it);
    unsigned int home_class;

    assert((it->it_flags & ITEM_LINKED) == 0);
    assert(it != heads[it->slabs_clsid]);
    assert(it != tails[it->slabs_clsid]);
    assert(it->refcount == 0);

    /* Remember the owning class, then zero it and set ITEM_SLABBED so the
     * slab size changer can tell later whether the item is already free. */
    home_class = it->slabs_clsid;
    it->slabs_clsid = 0;
    it->it_flags |= ITEM_SLABBED;
    DEBUG_REFCNT(it, 'F');

    slabs_free(it, item_bytes, home_class);
#endif
}
END_TEST START_TEST(slab_it) { // The freed slab exist at both(slots, end_page) slab.pool_freelist = NULL; void* p1 = slabs_alloc(&slab, 2000000); void* p2 = slabs_alloc(&slab, 2000000); slabs_free(&slab, p1, 2000000); fail_unless(slab.pool_freelist == NULL); slabclass_t* psct = &slab.slabclass[15]; fail_unless(psct->slab_list != NULL); fail_unless(psct->slab_list->next == NULL); fail_unless(psct->sl_curr == 1); fail_unless(psct->end_page_ptr != NULL); fail_unless(psct->end_page_free == 1); slabs_free(&slab, p2, 2000000); fail_unless(slab.pool_freelist != NULL); fail_unless(psct->slab_list == NULL); fail_unless(psct->sl_curr == 0); fail_unless(psct->end_page_ptr == NULL); fail_unless(psct->end_page_free == 0); // The freed slab exist at end_page void* p3 = slabs_alloc(&slab, 2000000); fail_unless(p1 == p3); fail_unless(slab.pool_freelist == NULL); fail_unless(psct->slots == NULL); slabs_free(&slab, p3, 2000000); fail_unless(slab.pool_freelist != NULL); slabheader_t *shp = (slabheader_t*)slab.pool_freelist; fail_unless(shp->next == NULL); fail_unless(psct->slab_list == NULL); fail_unless(psct->sl_curr == 0); fail_unless(psct->end_page_ptr == NULL); fail_unless(psct->end_page_free == 0); // The freed slab exist at slots void* p11 = slabs_alloc(&slab, 2000000); void* p12 = slabs_alloc(&slab, 2000000); void* p13 = slabs_alloc(&slab, 2000000); void* p14 = slabs_alloc(&slab, 2000000); // second slab start void* p15 = slabs_alloc(&slab, 2000000); void* p16 = slabs_alloc(&slab, 2000000); fail_unless(psct->end_page_ptr == NULL); fail_unless(psct->end_page_free == 0); fail_unless(psct->slab_list != NULL); fail_unless(psct->slab_list->next != NULL); fail_unless(psct->slab_list->next->next == NULL); slabs_free(&slab, p11, 2000000); slabs_free(&slab, p13, 2000000); slabs_free(&slab, p14, 2000000); slabs_free(&slab, p15, 2000000); fail_unless(psct->slots != NULL); slabs_free(&slab, p16, 2000000); shp = (slabheader_t*)psct->slots; fail_unless((shp + 1) == p13); 
fail_unless((shp->next + 1) == p11); fail_unless(shp->next->next == NULL); fail_unless(psct->end_page_ptr == NULL); fail_unless(psct->end_page_free == 0); shp = (slabheader_t*)slab.pool_freelist; fail_unless(shp->next == NULL); }
static int storage_write(void *storage, const int clsid, const int item_age) { int did_moves = 0; struct lru_pull_tail_return it_info; it_info.it = NULL; lru_pull_tail(clsid, COLD_LRU, 0, LRU_PULL_RETURN_ITEM, 0, &it_info); /* Item is locked, and we have a reference to it. */ if (it_info.it == NULL) { return did_moves; } obj_io io; item *it = it_info.it; /* First, storage for the header object */ size_t orig_ntotal = ITEM_ntotal(it); uint32_t flags; if ((it->it_flags & ITEM_HDR) == 0 && (item_age == 0 || current_time - it->time > item_age)) { FLAGS_CONV(it, flags); item *hdr_it = do_item_alloc(ITEM_key(it), it->nkey, flags, it->exptime, sizeof(item_hdr)); /* Run the storage write understanding the start of the item is dirty. * We will fill it (time/exptime/etc) from the header item on read. */ if (hdr_it != NULL) { int bucket = (it->it_flags & ITEM_CHUNKED) ? PAGE_BUCKET_CHUNKED : PAGE_BUCKET_DEFAULT; // Compress soon to expire items into similar pages. if (it->exptime - current_time < settings.ext_low_ttl) { bucket = PAGE_BUCKET_LOWTTL; } hdr_it->it_flags |= ITEM_HDR; io.len = orig_ntotal; io.mode = OBJ_IO_WRITE; // NOTE: when the item is read back in, the slab mover // may see it. Important to have refcount>=2 or ~ITEM_LINKED assert(it->refcount >= 2); // NOTE: write bucket vs free page bucket will disambiguate once // lowttl feature is better understood. if (extstore_write_request(storage, bucket, bucket, &io) == 0) { // cuddle the hash value into the time field so we don't have // to recalculate it. item *buf_it = (item *) io.buf; buf_it->time = it_info.hv; // copy from past the headers + time headers. 
// TODO: should be in items.c if (it->it_flags & ITEM_CHUNKED) { // Need to loop through the item and copy item_chunk *sch = (item_chunk *) ITEM_schunk(it); int remain = orig_ntotal; int copied = 0; // copy original header int hdrtotal = ITEM_ntotal(it) - it->nbytes; memcpy((char *)io.buf+STORE_OFFSET, (char *)it+STORE_OFFSET, hdrtotal - STORE_OFFSET); copied = hdrtotal; // copy data in like it were one large object. while (sch && remain) { assert(remain >= sch->used); memcpy((char *)io.buf+copied, sch->data, sch->used); // FIXME: use one variable? remain -= sch->used; copied += sch->used; sch = sch->next; } } else { memcpy((char *)io.buf+STORE_OFFSET, (char *)it+STORE_OFFSET, io.len-STORE_OFFSET); } // crc what we copied so we can do it sequentially. buf_it->it_flags &= ~ITEM_LINKED; buf_it->exptime = crc32c(0, (char*)io.buf+STORE_OFFSET, orig_ntotal-STORE_OFFSET); extstore_write(storage, &io); item_hdr *hdr = (item_hdr *) ITEM_data(hdr_it); hdr->page_version = io.page_version; hdr->page_id = io.page_id; hdr->offset = io.offset; // overload nbytes for the header it hdr_it->nbytes = it->nbytes; /* success! Now we need to fill relevant data into the new * header and replace. Most of this requires the item lock */ /* CAS gets set while linking. Copy post-replace */ item_replace(it, hdr_it, it_info.hv); ITEM_set_cas(hdr_it, ITEM_get_cas(it)); do_item_remove(hdr_it); did_moves = 1; LOGGER_LOG(NULL, LOG_EVICTIONS, LOGGER_EXTSTORE_WRITE, it, bucket); } else { /* Failed to write for some reason, can't continue. */ slabs_free(hdr_it, ITEM_ntotal(hdr_it), ITEM_clsid(hdr_it)); } } } do_item_remove(it); item_unlock(it_info.hv); return did_moves; }
/* slawek - reclaim patch */ char *do_item_cacheremove(const unsigned int slabs_clsid, const unsigned int limit, const unsigned int limit_remove, unsigned int *bytes) { uint32_t hv; void *hold_lock = NULL; const short clean_stats_size = 255; unsigned short stats_times[clean_stats_size]; unsigned short stats_expired[clean_stats_size]; memset(&stats_times, 0, sizeof(short) * clean_stats_size); memset(&stats_expired, 0, sizeof(short) * clean_stats_size); char* buffer = malloc((size_t) 16 * 1024); unsigned int bufcurr = 0; char temp[1024]; slabclass_t* slab = &slabclass[slabs_clsid]; if (slab->list_size == 0 || slabs_clsid > power_largest /*slabclass[power_largest] is defined*/ ) { // return as slab is currently empty (0 items, nothing even allocated!) int len = snprintf(temp, sizeof(temp), "Slab %d is currently empty (not allocated), no cleaning done\r\n", slabs_clsid); memcpy(buffer + bufcurr, temp, len); bufcurr += len; memcpy(buffer + bufcurr, "END\r\n", 6); bufcurr += 5; *bytes = bufcurr; return buffer; } slab->reclaimed_slab_item_num++; bool slab_changed = false; if (slab->reclaimed_slab_item_num >= slab->perslab) { // we're switching slabs slab->reclaimed_slab_num = (slab->reclaimed_slab_num + 1) % (slab->slabs); slab->reclaimed_slab_item_num = 0; slab_changed = true; } unsigned int _ipos_start = slab->reclaimed_slab_item_num; item *it_first = NULL; int i=0; for (; i < slab->slabs; i++) { it_first = (void *)(slab->slab_list[slab->reclaimed_slab_num]); if (it_first != NULL) break; // we're switching slabs slab->reclaimed_slab_num = (slab->reclaimed_slab_num + 1) % slab->slabs; slab->reclaimed_slab_item_num = 0; } if (it_first == NULL) { // oh crap, no items found in whole slab class, return the good news! 
int len = snprintf(temp, sizeof(temp), "Slab %d is currently empty (no items), no cleaning done\r\n", slabs_clsid); memcpy(buffer + bufcurr, temp, len); bufcurr += len; memcpy(buffer + bufcurr, "END\r\n", 6); bufcurr += 5; *bytes = bufcurr; return buffer; } int start = slab->reclaimed_slab_item_num; if (start >= slab->perslab) start = 0; int end = start + limit; if (end >= slab->perslab || limit == 0) end = slab->perslab; // run the cleaning, taking limits into account! uint64_t items_removed = 0; uint64_t bytes_removed = 0; item *it = (void*) ((char*)it_first + (slab->size*start)); i=start; int items_found = 0; int _ttl_left = 0; for (; i<end; i++) { // iterate over items at start, so we won't have to iterate near each continue if (i!=start) { it = (void*) (((char*)it) + slab->size); } // dirty reads, prevent locking! if ( (it->it_flags & ITEM_SLABBED) == 0 && it->refcount > 0 /* from remove items */ && it->slabs_clsid > 0 ) { items_found ++; if (it->exptime == 0) continue; } else continue; // check if we can expire the item _ttl_left = it->exptime - current_time; if (_ttl_left > 0) { if (_ttl_left >= clean_stats_size) _ttl_left = clean_stats_size-1; stats_times[_ttl_left] ++; continue; } else { _ttl_left = 0-_ttl_left; if (_ttl_left >= clean_stats_size) _ttl_left = clean_stats_size-1; stats_expired[_ttl_left]++; } // dirty reads ends here, proceed with locking! hv = hash(ITEM_key(it), it->nkey, 0); /* Attempt to hash item lock the item. If locked, no * other callers can incr the refcount */ if ((hold_lock = item_trylock(hv)) == NULL) { continue; } /* Now see if the item is refcount locked */ if (refcount_incr(&it->refcount) != 2) { refcount_decr(&it->refcount); if (hold_lock) { item_trylock_unlock(hold_lock); } continue; } // LOCKED, proceed with removing! 
assert(it->nkey <= KEY_MAX_LENGTH); _ttl_left = it->exptime - current_time; if ( (it->it_flags & ITEM_SLABBED) == 0 && it->refcount > 0 /* from remove items */ && it->slabs_clsid > 0 && _ttl_left <= 0) { //do_item_remove(it); item will get removed automatically in unlink function! do_item_unlink_nolock_nostat(it, hv, &items_removed, &bytes_removed); /* Initialize the item block: */ it->slabs_clsid = 0; slabs_free(it, ITEM_ntotal(it), slabs_clsid); } else { refcount_decr(&it->refcount); } if (hold_lock) { item_trylock_unlock(hold_lock); } } // let's update position! slab->reclaimed_slab_item_num = i; // update stats STATS_LOCK(); stats.curr_bytes -= bytes_removed; stats.curr_items -= items_removed; stats.reclaimed_fast += items_removed; stats.reclaimed_fast_bytes += bytes_removed; stats.reclaim_item_passes += (i - start); stats.reclaim_item_found += items_found; stats.reclaim_slab_memory_passes += (slab_changed ? 1 : 0); STATS_UNLOCK(); int len = snprintf(temp, sizeof(temp), "Expiring items in SLAB: %d, Memory region: %d / %d (max: %d), Item pos: S:%d / E:%d\r\n" "Scanned items: %d (Found: %d) / Expired: %llu (%llu KB)\r\n" "Limits ... 
Items: %u, Max Remove: %u (0 - none)\r\n" ,slabs_clsid, slab->reclaimed_slab_num, slab->slabs, slab->list_size, _ipos_start, slab->reclaimed_slab_item_num, (i - start), items_found, (unsigned long long) items_removed, (unsigned long long) ( (bytes_removed+1023) / 1024), limit, limit_remove); memcpy(buffer + bufcurr, temp, len); bufcurr += len; len = snprintf(temp, sizeof(temp), "TTLs: "); memcpy(buffer + bufcurr, temp, len); bufcurr += len; for (i=0; i<clean_stats_size; i++) { int len = snprintf(temp, sizeof(temp), "%d,", stats_times[i]); memcpy(buffer + bufcurr, temp, len); bufcurr += len; } bufcurr --; *(buffer + bufcurr) = 0; len = snprintf(temp, sizeof(temp), "\r\nExpired Age: "); memcpy(buffer + bufcurr, temp, len); bufcurr += len; for (i=0; i<clean_stats_size; i++) { int len = snprintf(temp, sizeof(temp), "%d,", stats_expired[i]); memcpy(buffer + bufcurr, temp, len); bufcurr += len; } bufcurr --; *(buffer + bufcurr) = 0; memcpy(buffer + bufcurr, "\r\nEND\r\n", 8); bufcurr += 7; *bytes = bufcurr; return buffer; }
/* refcount == 0 is safe since nobody can incr while item_lock is held.
 * refcount != 0 is impossible since flags/etc can be modified in other
 * threads. instead, note we found a busy one and bail. logic in do_item_get
 * will prevent busy items from continuing to be busy
 * NOTE: This is checking it_flags outside of an item lock. I believe this
 * works since it_flags is 8 bits, and we're only ever comparing a single bit
 * regardless. ITEM_SLABBED bit will always be correct since we're holding the
 * lock which modifies that bit. ITEM_LINKED won't exist if we're between an
 * item having ITEM_SLABBED removed, and the key hasn't been added to the item
 * yet. The memory barrier from the slabs lock should order the key write and the
 * flags to the item?
 * If ITEM_LINKED did exist and was just removed, but we still see it, that's
 * still safe since it will have a valid key, which we then lock, and then
 * recheck everything.
 * This may not be safe on all platforms; If not, slabs_alloc() will need to
 * seed the item key while holding slabs_lock.
 *
 * Summary of the function below:
 * Examines up to slab_bulk_check consecutive item slots of the page being
 * rebalanced, starting at slab_rebal.slab_pos, and tries to clear each one.
 * Every slot is classified into a move_status and then handled by the switch.
 * When the end of the page is reached, the scan either restarts from
 * slab_rebal.slab_start (if some slots were busy) or bumps slab_rebal.done.
 *
 * Returns the number of slots that could not be cleared during this call.
 */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    uint32_t hv;
    void *hold_lock;
    enum move_status status = MOVE_PASS;

    /* slabs_lock is held for the whole call, except while an item is being
     * copied or unlinked in the MOVE_FROM_LRU case. */
    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        /* Per-slot state; hv and hold_lock are only set once the item lock
         * has been attempted. */
        hv = 0;
        hold_lock = NULL;
        item *it = slab_rebal.slab_pos;
        item_chunk *ch = NULL;
        status = MOVE_PASS;
        if (it->it_flags & ITEM_CHUNK) {
            /* This chunk is a chained part of a larger item. */
            ch = (item_chunk *) it;
            /* Instead, we use the head chunk to find the item and effectively
             * lock the entire structure. If a chunk has ITEM_CHUNK flag, its
             * head cannot be slabbed, so the normal routine is safe. */
            it = ch->head;
            assert(it->it_flags & ITEM_CHUNKED);
        }

        /* ITEM_FETCHED when ITEM_SLABBED is overloaded to mean we've cleared
         * the chunk for move. Only these two flags should exist.
         * A slot already in that state keeps status == MOVE_PASS.
         */
        if (it->it_flags != (ITEM_SLABBED|ITEM_FETCHED)) {
            /* ITEM_SLABBED can only be added/removed under the slabs_lock */
            if (it->it_flags & ITEM_SLABBED) {
                /* Free slot: take it out of the class's free list. */
                assert(ch == NULL);
                slab_rebalance_cut_free(s_cls, it);
                status = MOVE_FROM_SLAB;
            } else if ((it->it_flags & ITEM_LINKED) != 0) {
                /* If it doesn't have ITEM_SLABBED, the item could be in any
                 * state on its way to being freed or written to. If no
                 * ITEM_SLABBED, but it's had ITEM_LINKED, it must be active
                 * and have the key written to it already.
                 */
                hv = hash(ITEM_key(it), it->nkey);
                if ((hold_lock = item_trylock(hv)) == NULL) {
                    /* Someone else holds the item lock; retry on a later pass. */
                    status = MOVE_LOCKED;
                } else {
                    refcount = refcount_incr(it);
                    if (refcount == 2) { /* item is linked but not busy */
                        /* Double check ITEM_LINKED flag here, since we're
                         * past a memory barrier from the mutex. */
                        if ((it->it_flags & ITEM_LINKED) != 0) {
                            status = MOVE_FROM_LRU;
                        } else {
                            /* refcount == 1 (before our increment) + !ITEM_LINKED
                             * means the item is being uploaded to, or was just
                             * unlinked but hasn't been freed yet. Let it bleed
                             * off on its own and try again later */
                            status = MOVE_BUSY;
                        }
                    } else {
                        /* Another holder has a reference besides ours. */
                        if (settings.verbose > 2) {
                            fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                                it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                        }
                        status = MOVE_BUSY;
                    }
                    /* Item lock must be held while modifying refcount */
                    if (status == MOVE_BUSY) {
                        refcount_decr(it);
                        item_trylock_unlock(hold_lock);
                    }
                }
            } else {
                /* See above comment. No ITEM_SLABBED or ITEM_LINKED. Mark
                 * busy and wait for item to complete its upload. */
                status = MOVE_BUSY;
            }
        }

        int save_item = 0;
        item *new_it = NULL;
        size_t ntotal = 0;
        switch (status) {
            case MOVE_FROM_LRU:
                /* Lock order is LRU locks -> slabs_lock. unlink uses LRU lock.
                 * We only need to hold the slabs_lock while initially looking
                 * at an item, and at this point we have an exclusive refcount
                 * (2) + the item is locked. Drop slabs lock, drop item to
                 * refcount 1 (just our own), then fall through and wipe it.
                 */
                /* Check if expired or flushed */
                ntotal = ITEM_ntotal(it);
                /* REQUIRES slabs_lock: CHECK FOR cls->sl_curr > 0 */
                if (ch == NULL && (it->it_flags & ITEM_CHUNKED)) {
                    /* Chunked should be identical to non-chunked, except we need
                     * to swap out ntotal for the head-chunk-total. */
                    ntotal = s_cls->size;
                }
                if ((it->exptime != 0 && it->exptime < current_time)
                    || item_is_flushed(it)) {
                    /* Expired, don't save. */
                    save_item = 0;
                } else if (ch == NULL &&
                        (new_it = slab_rebalance_alloc(ntotal, slab_rebal.s_clsid)) == NULL) {
                    /* Not a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else if (ch != NULL &&
                        (new_it = slab_rebalance_alloc(s_cls->size, slab_rebal.s_clsid)) == NULL) {
                    /* Is a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else {
                    /* Was whatever it was, and we have memory for it. */
                    save_item = 1;
                }
                /* The copy/unlink below runs without slabs_lock; the item
                 * lock taken above is still held. */
                pthread_mutex_unlock(&slabs_lock);
                unsigned int requested_adjust = 0;
                if (save_item) {
                    if (ch == NULL) {
                        /* Rescue a whole item (or the head of a chunked one). */
                        assert((new_it->it_flags & ITEM_CHUNKED) == 0);
                        /* if free memory, memcpy. clear prev/next/h_bucket */
                        memcpy(new_it, it, ntotal);
                        new_it->prev = 0;
                        new_it->next = 0;
                        new_it->h_next = 0;
                        /* These are definitely required. else fails assert */
                        new_it->it_flags &= ~ITEM_LINKED;
                        new_it->refcount = 0;
                        do_item_replace(it, new_it, hv);
                        /* Need to walk the chunks and repoint head  */
                        if (new_it->it_flags & ITEM_CHUNKED) {
                            item_chunk *fch = (item_chunk *) ITEM_data(new_it);
                            /* NOTE(review): assumes a chunked head always has
                             * a following chunk (fch->next != NULL) - confirm. */
                            fch->next->prev = fch;
                            while (fch) {
                                fch->head = new_it;
                                fch = fch->next;
                            }
                        }
                        /* Mark the old slot as cleared for the move. */
                        it->refcount = 0;
                        it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                        slab_rebal.rescues++;
                        requested_adjust = ntotal;
                    } else {
                        /* Rescue a single non-head chunk: splice the copy into
                         * the chain in place of the old chunk. */
                        item_chunk *nch = (item_chunk *) new_it;
                        /* Chunks always have head chunk (the main it) */
                        ch->prev->next = nch;
                        if (ch->next)
                            ch->next->prev = nch;
                        memcpy(nch, ch, ch->used + sizeof(item_chunk));
                        /* Mark the old chunk slot as cleared for the move. */
                        ch->refcount = 0;
                        ch->it_flags = ITEM_SLABBED|ITEM_FETCHED;
                        slab_rebal.chunk_rescues++;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key((item *)ch), "deadbeef", 8);
#endif
                        /* Drop the reference taken on the head item above. */
                        refcount_decr(it);
                        requested_adjust = s_cls->size;
                    }
                } else {
                    /* restore ntotal in case we tried saving a head chunk. */
                    ntotal = ITEM_ntotal(it);
                    do_item_unlink(it, hv);
                    slabs_free(it, ntotal, slab_rebal.s_clsid);
                    /* Swing around again later to remove it from the freelist. */
                    slab_rebal.busy_items++;
                    was_busy++;
                }
                item_trylock_unlock(hold_lock);
                pthread_mutex_lock(&slabs_lock);
                /* Always remove the ntotal, as we added it in during
                 * do_slabs_alloc() when copying the item.
                 */
                s_cls->requested -= requested_adjust;
                break;
            case MOVE_FROM_SLAB:
                /* Slot was free; mark it as cleared for the move. */
                it->refcount = 0;
                it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                break;
            case MOVE_BUSY:
            case MOVE_LOCKED:
                /* Could not clear this slot now; count it so the page is
                 * scanned again. */
                slab_rebal.busy_items++;
                was_busy++;
                break;
            case MOVE_PASS:
                /* Nothing to do for this slot. */
                break;
        }

        /* Step to the next slot of the page; stop at the end of the page. */
        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            STATS_LOCK();
            stats.slab_reassign_busy_items += slab_rebal.busy_items;
            STATS_UNLOCK();
            slab_rebal.busy_items = 0;
        } else {
            /* Whole page scanned with nothing busy. */
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);

    return was_busy;
}