/* refcount == 0 is safe since nobody can incr while cache_lock is held.
 * refcount != 0 is impossible since flags/etc can be modified in other
 * threads. instead, note we found a busy one and bail. logic in do_item_get
 * will prevent busy items from continuing to be busy */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        item *it = slab_rebal.slab_pos;
        status = MOVE_PASS;
        if (it->slabs_clsid != 255) {
            void *hold_lock = NULL;
            uint32_t hv = hash(ITEM_key(it), it->nkey, 0);
            if ((hold_lock = item_trylock(hv)) == NULL) {
                status = MOVE_LOCKED;
            } else {
                refcount = refcount_incr(&it->refcount);
                if (refcount == 1) { /* item is unlinked, unused */
                    if (it->it_flags & ITEM_SLABBED) {
                        /* remove from slab freelist */
                        if (s_cls->slots == it) {
                            s_cls->slots = it->next;
                        }
                        if (it->next) it->next->prev = it->prev;
                        if (it->prev) it->prev->next = it->next;
                        s_cls->sl_curr--;
                        status = MOVE_DONE;
                    } else {
                        status = MOVE_BUSY;
                    }
                } else if (refcount == 2) { /* item is linked but not busy */
                    if ((it->it_flags & ITEM_LINKED) != 0) {
                        do_item_unlink_nolock(it, hv);
                        status = MOVE_DONE;
                    } else {
                        /* refcount == 1 + !ITEM_LINKED means the item is being
                         * uploaded to, or was just unlinked but hasn't been freed
                         * yet. Let it bleed off on its own and try again later */
                        status = MOVE_BUSY;
                    }
                } else {
                    if (settings.verbose > 2) {
                        fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                            it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                    }
                    status = MOVE_BUSY;
                }
                item_trylock_unlock(hold_lock);
            }
        }

        switch (status) {
            case MOVE_DONE:
                it->refcount = 0;
                it->it_flags = 0;
                it->slabs_clsid = 255;
                break;
            case MOVE_BUSY:
                refcount_decr(&it->refcount);
            case MOVE_LOCKED:
                slab_rebal.busy_items++;
                was_busy++;
                break;
            case MOVE_PASS:
                break;
        }

        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            slab_rebal.busy_items = 0;
        } else {
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    return was_busy;
}
static void *assoc_maintenance_thread(void *arg) {

    mutex_lock(&maintenance_lock);
    while (do_run_maintenance_thread) {
        int ii = 0;

        /* There is only one expansion thread, so no need to global lock. */
        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            item *it, *next;
            int bucket;
            void *item_lock = NULL;

            /* bucket = hv & hashmask(hashpower) => the bucket of hash table
             * is the lowest N bits of the hv, and the bucket of item_locks is
             * also the lowest M bits of hv, and N is greater than M.
             * So we can process expanding with only one item_lock. cool! */
            if ((item_lock = item_trylock(expand_bucket))) {
                for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                    next = it->h_next;
                    bucket = hash(ITEM_key(it), it->nkey) & hashmask(hashpower);
                    it->h_next = primary_hashtable[bucket];
                    primary_hashtable[bucket] = it;
                }

                old_hashtable[expand_bucket] = NULL;

                expand_bucket++;
                if (expand_bucket == hashsize(hashpower - 1)) {
                    expanding = false;
                    free(old_hashtable);
                    STATS_LOCK();
                    stats.hash_bytes -= hashsize(hashpower - 1) * sizeof(void *);
                    stats.hash_is_expanding = 0;
                    STATS_UNLOCK();
                    if (settings.verbose > 1)
                        fprintf(stderr, "Hash table expansion done\n");
                }
            } else {
                usleep(10*1000);
            }

            if (item_lock) {
                item_trylock_unlock(item_lock);
                item_lock = NULL;
            }
        }

        if (!expanding) {
            /* We are done expanding.. just wait for next invocation */
            started_expanding = false;
            pthread_cond_wait(&maintenance_cond, &maintenance_lock);
            /* assoc_expand() swaps out the hash table entirely, so we need
             * all threads to not hold any references related to the hash
             * table while this happens.
             * This is instead of a more complex, possibly slower algorithm to
             * allow dynamic hash table expansion without causing significant
             * wait times. */
            pause_threads(PAUSE_ALL_THREADS);
            assoc_expand();
            pause_threads(RESUME_ALL_THREADS);
        }
    }
    return NULL;
}
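The bucket-masking claim in the comment above ("we can process expanding with only one item_lock") can be checked with a small standalone sketch. hashsize/hashmask mirror memcached's macros; item_lock_hashpower is assumed here to be the smaller power of two used for the item-lock table, so this is an illustration of the argument, not memcached code.

#include <stdint.h>
#include <assert.h>

#define hashsize(n) ((uint32_t)1 << (n))
#define hashmask(n) (hashsize(n) - 1)

int main(void) {
    unsigned int hashpower = 16;            /* N: hash-table buckets = 2^16   */
    unsigned int item_lock_hashpower = 13;  /* M: item locks = 2^13, M < N    */
    uint32_t expand_bucket = 0x0000beefU & hashmask(hashpower);

    /* Any hv that lands in old bucket `expand_bucket` shares its low N bits
     * with expand_bucket, hence also its low M bits -- so every item in that
     * bucket is guarded by the same item lock. */
    for (uint32_t high = 0; high < 4; high++) {
        uint32_t hv = (high << hashpower) | expand_bucket;
        assert((hv & hashmask(item_lock_hashpower)) ==
               (expand_bucket & hashmask(item_lock_hashpower)));
    }
    return 0;
}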
/*@null@*/
item *do_item_alloc(char *key, const size_t nkey, const int flags,
                    const rel_time_t exptime, const int nbytes,
                    const uint32_t cur_hv) {
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    /* compute the total space this item needs */
    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    /* pick the slab class this size belongs to */
    unsigned int id = slabs_clsid(ntotal);
    if (id == 0)
        return 0;

    mutex_lock(&cache_lock);
    /* do a quick check if we have any expired items in the tail.. */
    /* try at most 5 items from the LRU tail; if no suitable space is found,
     * fall back to allocating new memory */
    int tries = 5;
    int tried_alloc = 0;
    item *search;
    void *hold_lock = NULL;
    rel_time_t oldest_live = settings.oldest_live;

    search = tails[id];
    /* We walk up *only* for locked items. Never searching for expired.
     * Waste of CPU for almost all deployments */
    for (; tries > 0 && search != NULL; tries--, search=search->prev) {
        uint32_t hv = hash(ITEM_key(search), search->nkey, 0);
        /* Attempt to hash item lock the "search" item. If locked, no
         * other callers can incr the refcount */
        /* FIXME: I think we need to mask the hv here for comparison? */
        if (hv != cur_hv && (hold_lock = item_trylock(hv)) == NULL)
            continue;
        /* Now see if the item is refcount locked */
        if (refcount_incr(&search->refcount) != 2) {
            refcount_decr(&search->refcount);
            /* Old rare bug could cause a refcount leak. We haven't seen
             * it in years, but we leave this code in to prevent failures
             * just in case */
            if (search->time + TAIL_REPAIR_TIME < current_time) {
                itemstats[id].tailrepairs++;
                search->refcount = 1;
                do_item_unlink_nolock(search, hv);
            }
            if (hold_lock)
                item_trylock_unlock(hold_lock);
            continue;
        }

        /* Expired or flushed */
        /* the item search points at has expired, so reuse its memory directly */
        if ((search->exptime != 0 && search->exptime < current_time)
            || (search->time <= oldest_live && oldest_live <= current_time)) {
            itemstats[id].reclaimed++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                itemstats[id].expired_unfetched++;
            }
            it = search;
            slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
            do_item_unlink_nolock(it, hv);
            /* Initialize the item block: */
            it->slabs_clsid = 0;
        }
        /* at this point no expired item was found and the allocation below may
         * also fail; if so, evict an item via LRU even though it has not expired */
        else if ((it = slabs_alloc(ntotal, id)) == NULL) {
            tried_alloc = 1;
            if (settings.evict_to_free == 0) {
                itemstats[id].outofmemory++;
            } else {
                itemstats[id].evicted++;
                itemstats[id].evicted_time = current_time - search->time;
                if (search->exptime != 0)
                    itemstats[id].evicted_nonzero++;
                if ((search->it_flags & ITEM_FETCHED) == 0) {
                    itemstats[id].evicted_unfetched++;
                }
                it = search;
                slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
                do_item_unlink_nolock(it, hv);
                /* Initialize the item block: */
                it->slabs_clsid = 0;

                /* If we've just evicted an item, and the automover is set to
                 * angry bird mode, attempt to rip memory into this slab class.
                 * TODO: Move valid object detection into a function, and on a
                 * "successful" memory pull, look behind and see if the next alloc
                 * would be an eviction. Then kick off the slab mover before the
                 * eviction happens.
                 */
                if (settings.slab_automove == 2)
                    slabs_reassign(-1, id);
            }
        }

        refcount_decr(&search->refcount);
        /* If hash values were equal, we don't grab a second lock */
        if (hold_lock)
            item_trylock_unlock(hold_lock);
        break;
    }

    /* allocate memory from the slab allocator */
    if (!tried_alloc && (tries == 0 || search == NULL))
        it = slabs_alloc(ntotal, id);

    if (it == NULL) {
        itemstats[id].outofmemory++;
        mutex_unlock(&cache_lock);
        return NULL;
    }

    assert(it->slabs_clsid == 0);
    assert(it != heads[id]);

    /* Item initialization can happen outside of the lock; the item's already
     * been removed from the slab LRU. */
    it->refcount = 1;     /* the caller will have a reference */
    mutex_unlock(&cache_lock);
    it->next = it->prev = it->h_next = 0;
    it->slabs_clsid = id;

    DEBUG_REFCNT(it, '*');
    it->it_flags = settings.use_cas ? ITEM_CAS : 0;
    it->nkey = nkey;
    it->nbytes = nbytes;
    memcpy(ITEM_key(it), key, nkey);
    it->exptime = exptime;
    memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
    it->nsuffix = nsuffix;
    return it;
}
static void *item_crawler_thread(void *arg) {
    int i;

    pthread_mutex_lock(&lru_crawler_lock);
    if (settings.verbose > 2)
        fprintf(stderr, "Starting LRU crawler background thread\n");
    while (do_run_lru_crawler_thread) {
        pthread_cond_wait(&lru_crawler_cond, &lru_crawler_lock);

        while (crawler_count) {
            item *search = NULL;
            void *hold_lock = NULL;

            for (i = 0; i < LARGEST_ID; i++) {
                if (crawlers[i].it_flags != 1) {
                    continue;
                }
                pthread_mutex_lock(&cache_lock);
                search = crawler_crawl_q((item *)&crawlers[i]);
                if (search == NULL ||
                    (crawlers[i].remaining && --crawlers[i].remaining < 1)) {
                    if (settings.verbose > 2)
                        fprintf(stderr, "Nothing left to crawl for %d\n", i);
                    crawlers[i].it_flags = 0;
                    crawler_count--;
                    crawler_unlink_q((item *)&crawlers[i]);
                    pthread_mutex_unlock(&cache_lock);
                    continue;
                }
                uint32_t hv = hash(ITEM_key(search), search->nkey);
                /* Attempt to hash item lock the "search" item. If locked, no
                 * other callers can incr the refcount */
                if ((hold_lock = item_trylock(hv)) == NULL) {
                    pthread_mutex_unlock(&cache_lock);
                    continue;
                }
                /* Now see if the item is refcount locked */
                if (refcount_incr(&search->refcount) != 2) {
                    refcount_decr(&search->refcount);
                    if (hold_lock)
                        item_trylock_unlock(hold_lock);
                    pthread_mutex_unlock(&cache_lock);
                    continue;
                }

                /* Frees the item or decrements the refcount. */
                /* Interface for this could improve: do the free/decr here
                 * instead? */
                item_crawler_evaluate(search, hv, i);

                if (hold_lock)
                    item_trylock_unlock(hold_lock);
                pthread_mutex_unlock(&cache_lock);

                if (settings.lru_crawler_sleep)
                    usleep(settings.lru_crawler_sleep);
            }
        }
        if (settings.verbose > 2)
            fprintf(stderr, "LRU crawler thread sleeping\n");
        STATS_LOCK();
        stats.lru_crawler_running = false;
        STATS_UNLOCK();
    }
    pthread_mutex_unlock(&lru_crawler_lock);
    if (settings.verbose > 2)
        fprintf(stderr, "LRU crawler thread stopping\n");

    return NULL;
}
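The crawler above claims an item with the same idiom every function in this section uses: take the per-bucket try-lock, then bump the refcount and insist on seeing exactly 2 (one reference held by the hash/LRU link plus our own). Below is a minimal sketch of that idiom written against the item_trylock/refcount_incr calls already used in these listings; the helper name try_claim_item is hypothetical and only summarizes the pattern.

#include <stdbool.h>

/* Returns true when we hold the bucket lock and are the only other holder of
 * a reference; on failure everything taken is released again. */
static bool try_claim_item(item *it, uint32_t hv, void **hold_lock) {
    if ((*hold_lock = item_trylock(hv)) == NULL)
        return false;                   /* another thread owns the bucket lock */
    if (refcount_incr(&it->refcount) != 2) {
        refcount_decr(&it->refcount);   /* someone else still references it   */
        item_trylock_unlock(*hold_lock);
        *hold_lock = NULL;
        return false;
    }
    return true;                        /* linked but otherwise idle           */
}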
/* Returns number of items removed, expired, or evicted.
 * Callable from worker threads or the LRU maintainer thread */
static int lru_pull_tail(const int orig_id, const int cur_lru,
        const uint64_t total_bytes, uint8_t flags) {
    item *it = NULL;
    int id = orig_id;
    int removed = 0;
    if (id == 0)
        return 0;

    int tries = 5;
    item *search;
    item *next_it;
    void *hold_lock = NULL;
    unsigned int move_to_lru = 0;
    uint64_t limit = 0;

    id |= cur_lru;
    pthread_mutex_lock(&lru_locks[id]);
    search = tails[id];
    /* We walk up *only* for locked items, and if bottom is expired. */
    for (; tries > 0 && search != NULL; tries--, search=next_it) {
        /* we might relink search mid-loop, so search->prev isn't reliable */
        next_it = search->prev;
        if (search->nbytes == 0 && search->nkey == 0 && search->it_flags == 1) {
            /* We are a crawler, ignore it. */
            if (flags & LRU_PULL_CRAWL_BLOCKS) {
                pthread_mutex_unlock(&lru_locks[id]);
                return 0;
            }
            tries++;
            continue;
        }
        uint32_t hv = hash(ITEM_key(search), search->nkey);
        /* Attempt to hash item lock the "search" item. If locked, no
         * other callers can incr the refcount. Also skip ourselves. */
        if ((hold_lock = item_trylock(hv)) == NULL)
            continue;
        /* Now see if the item is refcount locked */
        if (refcount_incr(&search->refcount) != 2) {
            /* Note pathological case with ref'ed items in tail.
             * Can still unlink the item, but it won't be reusable yet */
            itemstats[id].lrutail_reflocked++;
            /* In case of refcount leaks, enable for quick workaround. */
            /* WARNING: This can cause terrible corruption */
            if (settings.tail_repair_time &&
                    search->time + settings.tail_repair_time < current_time) {
                itemstats[id].tailrepairs++;
                search->refcount = 1;
                /* This will call item_remove -> item_free since refcnt is 1 */
                do_item_unlink_nolock(search, hv);
                item_trylock_unlock(hold_lock);
                continue;
            }
        }

        /* Expired or flushed */
        if ((search->exptime != 0 && search->exptime < current_time)
            || item_is_flushed(search)) {
            itemstats[id].reclaimed++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                itemstats[id].expired_unfetched++;
            }
            /* refcnt 2 -> 1 */
            do_item_unlink_nolock(search, hv);
            /* refcnt 1 -> 0 -> item_free */
            do_item_remove(search);
            item_trylock_unlock(hold_lock);
            removed++;

            /* If all we're finding are expired, can keep going */
            continue;
        }

        /* If we're HOT_LRU or WARM_LRU and over size limit, send to COLD_LRU.
         * If we're COLD_LRU, send to WARM_LRU unless we need to evict */
        switch (cur_lru) {
            case HOT_LRU:
                limit = total_bytes * settings.hot_lru_pct / 100;
            case WARM_LRU:
                if (limit == 0)
                    limit = total_bytes * settings.warm_lru_pct / 100;
                if (sizes_bytes[id] > limit) {
                    itemstats[id].moves_to_cold++;
                    move_to_lru = COLD_LRU;
                    do_item_unlink_q(search);
                    it = search;
                    removed++;
                    break;
                } else if ((search->it_flags & ITEM_ACTIVE) != 0) {
                    /* Only allow ACTIVE relinking if we're not too large. */
                    itemstats[id].moves_within_lru++;
                    search->it_flags &= ~ITEM_ACTIVE;
                    do_item_update_nolock(search);
                    do_item_remove(search);
                    item_trylock_unlock(hold_lock);
                } else {
                    /* Don't want to move to COLD, not active, bail out */
                    it = search;
                }
                break;
            case COLD_LRU:
                it = search; /* No matter what, we're stopping */
                if (flags & LRU_PULL_EVICT) {
                    if (settings.evict_to_free == 0) {
                        /* Don't think we need a counter for this. It'll OOM. */
                        break;
                    }
                    itemstats[id].evicted++;
                    itemstats[id].evicted_time = current_time - search->time;
                    if (search->exptime != 0)
                        itemstats[id].evicted_nonzero++;
                    if ((search->it_flags & ITEM_FETCHED) == 0) {
                        itemstats[id].evicted_unfetched++;
                    }
                    LOGGER_LOG(NULL, LOG_EVICTIONS, LOGGER_EVICTION, search);
                    do_item_unlink_nolock(search, hv);
                    removed++;
                    if (settings.slab_automove == 2) {
                        slabs_reassign(-1, orig_id);
                    }
                } else if ((search->it_flags & ITEM_ACTIVE) != 0
                        && settings.lru_maintainer_thread) {
                    itemstats[id].moves_to_warm++;
                    search->it_flags &= ~ITEM_ACTIVE;
                    move_to_lru = WARM_LRU;
                    do_item_unlink_q(search);
                    removed++;
                }
                break;
        }
        if (it != NULL)
            break;
    }

    pthread_mutex_unlock(&lru_locks[id]);

    if (it != NULL) {
        if (move_to_lru) {
            it->slabs_clsid = ITEM_clsid(it);
            it->slabs_clsid |= move_to_lru;
            item_link_q(it);
        }
        do_item_remove(it);
        item_trylock_unlock(hold_lock);
    }

    return removed;
}
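lru_pull_tail packs the sub-LRU into the same 8-bit id that names the slab class (`id |= cur_lru` on entry, and `it->slabs_clsid = ITEM_clsid(it); it->slabs_clsid |= move_to_lru;` when relinking). The sketch below shows that packing in isolation; the concrete constants are assumptions (memcached keeps HOT/WARM/COLD in the two high bits of the byte), the point is only that one byte indexes both the class and the queue.

#include <stdint.h>
#include <stdio.h>

enum { HOT_LRU = 0, WARM_LRU = 64, COLD_LRU = 128 };    /* assumed values   */
#define LRU_BITS    (3 << 6)                            /* two high bits    */
#define ITEM_clsid(cls) ((cls) & ~LRU_BITS)             /* strip LRU bits   */

int main(void) {
    unsigned int slab_class = 5;                /* hypothetical class id    */
    unsigned int id = slab_class | COLD_LRU;    /* index into tails[] etc.  */
    printf("queue index %u -> class %u, sub-LRU %u\n",
           id, ITEM_clsid(id), id & LRU_BITS);
    return 0;
}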
/*@null@*/
item *do_item_alloc(char *key, const size_t nkey, const int flags,
                    const rel_time_t exptime, const int nbytes,
                    const uint32_t cur_hv) {
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    unsigned int id = slabs_clsid(ntotal);
    if (id == 0)
        return 0;

    mutex_lock(&cache_lock);
    /* do a quick check if we have any expired items in the tail.. */
    int tries = 5;
    /* Avoid hangs if a slab has nothing but refcounted stuff in it. */
    int tries_lrutail_reflocked = 1000;
    int tried_alloc = 0;
    item *search;
    item *next_it;
    void *hold_lock = NULL;
    rel_time_t oldest_live = settings.oldest_live;

    search = tails[id];
    /* We walk up *only* for locked items. Never searching for expired.
     * Waste of CPU for almost all deployments */
    for (; tries > 0 && search != NULL; tries--, search=next_it) {
        /* we might relink search mid-loop, so search->prev isn't reliable */
        next_it = search->prev;
        if (search->nbytes == 0 && search->nkey == 0 && search->it_flags == 1) {
            /* We are a crawler, ignore it. */
            tries++;
            continue;
        }
        uint32_t hv = hash(ITEM_key(search), search->nkey);
        /* Attempt to hash item lock the "search" item. If locked, no
         * other callers can incr the refcount */
        /* Don't accidentally grab ourselves, or bail if we can't quicklock */
        if (hv == cur_hv || (hold_lock = item_trylock(hv)) == NULL)
            continue;
        /* Now see if the item is refcount locked */
        if (refcount_incr(&search->refcount) != 2) {
            /* Avoid pathological case with ref'ed items in tail */
            do_item_update_nolock(search);
            tries_lrutail_reflocked--;
            tries++;
            refcount_decr(&search->refcount);
            itemstats[id].lrutail_reflocked++;
            /* Old rare bug could cause a refcount leak. We haven't seen
             * it in years, but we leave this code in to prevent failures
             * just in case */
            if (settings.tail_repair_time &&
                    search->time + settings.tail_repair_time < current_time) {
                itemstats[id].tailrepairs++;
                search->refcount = 1;
                do_item_unlink_nolock(search, hv);
            }
            if (hold_lock)
                item_trylock_unlock(hold_lock);

            if (tries_lrutail_reflocked < 1)
                break;

            continue;
        }

        /* Expired or flushed */
        if ((search->exptime != 0 && search->exptime < current_time)
            || (search->time <= oldest_live && oldest_live <= current_time)) {
            itemstats[id].reclaimed++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                itemstats[id].expired_unfetched++;
            }
            it = search;
            slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
            do_item_unlink_nolock(it, hv);
            /* Initialize the item block: */
            it->slabs_clsid = 0;
        } else if ((it = slabs_alloc(ntotal, id)) == NULL) {
            tried_alloc = 1;
            if (settings.evict_to_free == 0) {
                itemstats[id].outofmemory++;
            } else {
                itemstats[id].evicted++;
                itemstats[id].evicted_time = current_time - search->time;
                if (search->exptime != 0)
                    itemstats[id].evicted_nonzero++;
                if ((search->it_flags & ITEM_FETCHED) == 0) {
                    itemstats[id].evicted_unfetched++;
                }

                shadow_item* new_shadow_it = create_shadow_item(search);
                hv = hash(new_shadow_it->key, new_shadow_it->nkey);
                shadow_assoc_insert(new_shadow_it, hv);
                insert_shadowq_item(new_shadow_it, new_shadow_it->slabs_clsid);

                it = search;
                slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
                do_item_unlink_nolock(it, hv);
                /* Initialize the item block: */
                it->slabs_clsid = 0;

                /* If we've just evicted an item, and the automover is set to
                 * angry bird mode, attempt to rip memory into this slab class.
                 * TODO: Move valid object detection into a function, and on a
                 * "successful" memory pull, look behind and see if the next alloc
                 * would be an eviction. Then kick off the slab mover before the
                 * eviction happens. */
                if (settings.slab_automove == 2)
                    slabs_reassign(-1, id);
            }
        }

        refcount_decr(&search->refcount);
        /* If hash values were equal, we don't grab a second lock */
        if (hold_lock)
            item_trylock_unlock(hold_lock);
        break;
    }

    if (!tried_alloc && (tries == 0 || search == NULL))
        it = slabs_alloc(ntotal, id);

    if (it == NULL) {
        itemstats[id].outofmemory++;
        mutex_unlock(&cache_lock);
        return NULL;
    }

    assert(it->slabs_clsid == 0);
    assert(it != heads[id]);

    /* Item initialization can happen outside of the lock; the item's already
     * been removed from the slab LRU. */
    it->refcount = 1;     /* the caller will have a reference */
    mutex_unlock(&cache_lock);
    it->next = it->prev = it->h_next = 0;
    it->slabs_clsid = id;

    DEBUG_REFCNT(it, '*');
    it->it_flags = settings.use_cas ? ITEM_CAS : 0;
    it->nkey = nkey;
    it->nbytes = nbytes;
    memcpy(ITEM_key(it), key, nkey);
    it->exptime = exptime;
    memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
    it->nsuffix = nsuffix;
    return it;
}
/* refcount == 0 is safe since nobody can incr while cache_lock is held.
 * refcount != 0 is impossible since flags/etc can be modified in other
 * threads. instead, note we found a busy one and bail. logic in do_item_get
 * will prevent busy items from continuing to be busy */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    /* slab_bulk_check is set from an environment variable read in
     * start_slab_maintenance_thread and defaults to 1. As elsewhere, the
     * items of a page are processed a batch at a time. */
    for (x = 0; x < slab_bulk_check; x++) {
        item *it = slab_rebal.slab_pos;
        status = MOVE_PASS;
        if (it->slabs_clsid != 255) {
            void *hold_lock = NULL;
            uint32_t hv = hash(ITEM_key(it), it->nkey);
            if ((hold_lock = item_trylock(hv)) == NULL) {
                status = MOVE_LOCKED;
            } else {
                refcount = refcount_incr(&it->refcount);
                if (refcount == 1) { /* item is unlinked, unused */
                    /* If it_flags & ITEM_SLABBED is true, this item was never
                     * handed out at all. If it is false, the item was handed
                     * out but is on its way back to the freelist; see the
                     * check in do_item_get that uses slab_rebalance_signal as
                     * one of its conditions. */
                    if (it->it_flags & ITEM_SLABBED) { /* never handed out */
                        /* remove from slab freelist */
                        if (s_cls->slots == it) {
                            s_cls->slots = it->next;
                        }
                        if (it->next) it->next->prev = it->prev;
                        if (it->prev) it->prev->next = it->next;
                        s_cls->sl_curr--;
                        status = MOVE_DONE; /* this item was handled successfully */
                    } else {
                        /* another worker thread is still returning this item */
                        status = MOVE_BUSY;
                    }
                } else if (refcount == 2) { /* item is linked but not busy */
                    /* no worker thread holds a reference to this item */
                    if ((it->it_flags & ITEM_LINKED) != 0) {
                        /* remove the item from the hash table and LRU queue directly */
                        do_item_unlink_nolock(it, hv);
                        status = MOVE_DONE;
                    } else {
                        /* refcount == 1 + !ITEM_LINKED means the item is being
                         * uploaded to, or was just unlinked but hasn't been freed
                         * yet. Let it bleed off on its own and try again later */
                        /* a worker thread is currently referencing this item */
                        status = MOVE_BUSY;
                    }
                } else {
                    if (settings.verbose > 2) {
                        fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                            it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                    }
                    status = MOVE_BUSY;
                }
                item_trylock_unlock(hold_lock);
            }
        }

        switch (status) {
            case MOVE_DONE:
                it->refcount = 0;      /* reset the refcount */
                it->it_flags = 0;      /* clear all flags */
                it->slabs_clsid = 255;
                break;
            case MOVE_BUSY:
                refcount_decr(&it->refcount); /* note: no break, falls through */
            case MOVE_LOCKED:
                slab_rebal.busy_items++;
                was_busy++;            /* record that an item could not be handled now */
                break;
            case MOVE_PASS:
                break;
        }

        /* move on to the next item in this page */
        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end) /* reached the end of the page */
            break;
    }

    /* all items in this page have been visited */
    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        /* some items were skipped because worker threads were referencing them */
        if (slab_rebal.busy_items) { /* rescan the page from the start */
            slab_rebal.slab_pos = slab_rebal.slab_start;
            slab_rebal.busy_items = 0;
        } else {
            slab_rebal.done++; /* mark that every item in this page has been handled */
        }
    }

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    return was_busy; /* report whether busy items were seen */
}
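The comment above notes that slab_bulk_check is configured from an environment variable in start_slab_maintenance_thread and defaults to 1. A minimal sketch of that setup step follows; it is a simplified reconstruction, with only the variable name MEMCACHED_SLAB_BULK_CHECK taken from memcached's slabs.c.

#include <stdlib.h>

#define DEFAULT_SLAB_BULK_CHECK 1
static int slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;

/* Read the batch size once at thread start-up; fall back to the default
 * when the variable is unset or parses to 0. */
static void init_slab_bulk_check(void) {
    char *env = getenv("MEMCACHED_SLAB_BULK_CHECK");
    if (env != NULL) {
        slab_bulk_check = atoi(env);
        if (slab_bulk_check == 0)
            slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
    }
}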
/* Allocate an item; this function contains memcached's item-allocation logic. */
item *do_item_alloc(char *key, const size_t nkey, const int flags,
                    const rel_time_t exptime, const int nbytes,
                    const uint32_t cur_hv) {
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    /* total size of the item */
    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t); /* with CAS enabled the item also carries a uint64_t */
    }

    unsigned int id = slabs_clsid(ntotal); /* pick the slab class that fits this size */
    if (id == 0)
        return 0;

    mutex_lock(&cache_lock); /* cache lock */
    /* do a quick check if we have any expired items in the tail.. */
    int tries = 5;
    int tried_alloc = 0;
    item *search;
    void *hold_lock = NULL;
    rel_time_t oldest_live = settings.oldest_live;

    search = tails[id]; /* global: tails[x] is the tail of the LRU list for slab class x */
    /* We walk up *only* for locked items. Never searching for expired.
     * Waste of CPU for almost all deployments */
    /* First look for an expired item starting from the LRU tail, with at most
     * tries = 5 iterations. These are at most 5 probes: as soon as an item
     * that nobody else references is found, the search stops; if that item
     * has expired its space is reused, otherwise new slab memory is allocated. */
    for (; tries > 0 && search != NULL; tries--, search=search->prev) {
        if (search->nbytes == 0 && search->nkey == 0 && search->it_flags == 1) {
            /* We are a crawler, ignore it. */
            /* we only search for expired items here; skip special (crawler)
             * items and keep looking */
            tries++;
            continue;
        }
        /* Compute the item's hash. hv serves two purposes: 1. it locates the
         * item in the hash table; 2. it selects which lock in the item-lock
         * table guards this item. Different items may hash to the same value;
         * the hash table resolves collisions by chaining, and several items
         * share a single item lock. */
        uint32_t hv = hash(ITEM_key(search), search->nkey);
        /* Attempt to hash item lock the "search" item. If locked, no
         * other callers can incr the refcount */
        /* Don't accidentally grab ourselves, or bail if we can't quicklock */
        /* lock the current item */
        if (hv == cur_hv || (hold_lock = item_trylock(hv)) == NULL)
            continue;
        /* Now see if the item is refcount locked */
        /* check whether this item is referenced elsewhere; if so, keep walking */
        if (refcount_incr(&search->refcount) != 2) {
            refcount_decr(&search->refcount);
            /* Old rare bug could cause a refcount leak. We haven't seen
             * it in years, but we leave this code in to prevent failures
             * just in case */
            if (settings.tail_repair_time &&
                    search->time + settings.tail_repair_time < current_time) {
                itemstats[id].tailrepairs++;
                search->refcount = 1;
                do_item_unlink_nolock(search, hv);
            }
            if (hold_lock)
                item_trylock_unlock(hold_lock);
            continue;
        }

        /* Expired or flushed */
        /* an expired item was found */
        if ((search->exptime != 0 && search->exptime < current_time)
            || (search->time <= oldest_live && oldest_live <= current_time)) {
            itemstats[id].reclaimed++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                itemstats[id].expired_unfetched++;
            }
            it = search;
            /* update accounting */
            slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
            /* remove the old item from the hash table and the LRU list */
            do_item_unlink_nolock(it, hv);
            /* Initialize the item block: */
            it->slabs_clsid = 0;
        }
        /* No expired item was found, so call slabs_alloc to allocate space.
         * If slabs_alloc returns NULL the allocation failed (memory is full)
         * and an item must be evicted via LRU. */
        else if ((it = slabs_alloc(ntotal, id)) == NULL) {
            tried_alloc = 1; /* mark that slabs_alloc was attempted */
            /* record information about the evicted item; the evicted_time stat
             * frequently inspected by memcached users is set here */
            if (settings.evict_to_free == 0) {
                itemstats[id].outofmemory++;
            } else {
                itemstats[id].evicted++;
                /* time since the evicted item was last used */
                itemstats[id].evicted_time = current_time - search->time;
                if (search->exptime != 0)
                    itemstats[id].evicted_nonzero++;
                if ((search->it_flags & ITEM_FETCHED) == 0) {
                    itemstats[id].evicted_unfetched++;
                }
                it = search;
                /* update accounting */
                slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
                /* remove from the hash table and LRU list */
                do_item_unlink_nolock(it, hv);
                /* Initialize the item block: */
                it->slabs_clsid = 0;

                /* If we've just evicted an item, and the automover is set to
                 * angry bird mode, attempt to rip memory into this slab class.
                 * TODO: Move valid object detection into a function, and on a
                 * "successful" memory pull, look behind and see if the next alloc
                 * would be an eviction. Then kick off the slab mover before the
                 * eviction happens. */
                /* By default slab_automove=1, which uses the eviction statistics
                 * to decide how to redistribute slab class memory. With
                 * slab_automove=2, memory is reassigned between slab classes
                 * as soon as an allocation fails. */
                if (settings.slab_automove == 2)
                    slabs_reassign(-1, id);
            }
        }

        refcount_decr(&search->refcount);
        /* If hash values were equal, we don't grab a second lock */
        if (hold_lock)
            item_trylock_unlock(hold_lock);
        break;
    }

    /* All 5 probes failed to find an expired item, and no unexpired item was
     * evicted either: allocate fresh memory. */
    if (!tried_alloc && (tries == 0 || search == NULL))
        it = slabs_alloc(ntotal, id);

    /* allocation failed, return NULL */
    if (it == NULL) {
        itemstats[id].outofmemory++;
        mutex_unlock(&cache_lock);
        return NULL;
    }

    assert(it->slabs_clsid == 0);
    assert(it != heads[id]);

    /* Item initialization can happen outside of the lock; the item's already
     * been removed from the slab LRU. */
    /* the item's memory was allocated successfully; initialize it */
    it->refcount = 1;     /* the caller will have a reference */
    mutex_unlock(&cache_lock);
    it->next = it->prev = it->h_next = 0;
    it->slabs_clsid = id;

    DEBUG_REFCNT(it, '*');
    it->it_flags = settings.use_cas ? ITEM_CAS : 0;
    it->nkey = nkey;
    it->nbytes = nbytes;
    memcpy(ITEM_key(it), key, nkey);
    it->exptime = exptime;
    memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
    it->nsuffix = nsuffix;
    return it;
}
/* slawek - reclaim patch */
char *do_item_cacheremove(const unsigned int slabs_clsid,
                          const unsigned int limit,
                          const unsigned int limit_remove,
                          unsigned int *bytes) {
    uint32_t hv;
    void *hold_lock = NULL;

    const short clean_stats_size = 255;
    unsigned short stats_times[clean_stats_size];
    unsigned short stats_expired[clean_stats_size];
    memset(&stats_times, 0, sizeof(short) * clean_stats_size);
    memset(&stats_expired, 0, sizeof(short) * clean_stats_size);

    char* buffer = malloc((size_t) 16 * 1024);
    unsigned int bufcurr = 0;
    char temp[1024];

    slabclass_t* slab = &slabclass[slabs_clsid];
    if (slab->list_size == 0 ||
        slabs_clsid > power_largest /* slabclass[power_largest] is defined */ ) {
        // return as slab is currently empty (0 items, nothing even allocated!)
        int len = snprintf(temp, sizeof(temp),
            "Slab %d is currently empty (not allocated), no cleaning done\r\n",
            slabs_clsid);
        memcpy(buffer + bufcurr, temp, len);
        bufcurr += len;

        memcpy(buffer + bufcurr, "END\r\n", 6);
        bufcurr += 5;

        *bytes = bufcurr;
        return buffer;
    }

    slab->reclaimed_slab_item_num++;

    bool slab_changed = false;
    if (slab->reclaimed_slab_item_num >= slab->perslab) {
        // we're switching slabs
        slab->reclaimed_slab_num = (slab->reclaimed_slab_num + 1) % (slab->slabs);
        slab->reclaimed_slab_item_num = 0;
        slab_changed = true;
    }

    unsigned int _ipos_start = slab->reclaimed_slab_item_num;

    item *it_first = NULL;
    int i = 0;
    for (; i < slab->slabs; i++) {
        it_first = (void *)(slab->slab_list[slab->reclaimed_slab_num]);
        if (it_first != NULL)
            break;

        // we're switching slabs
        slab->reclaimed_slab_num = (slab->reclaimed_slab_num + 1) % slab->slabs;
        slab->reclaimed_slab_item_num = 0;
    }

    if (it_first == NULL) {
        // oh crap, no items found in whole slab class, return the good news!
        int len = snprintf(temp, sizeof(temp),
            "Slab %d is currently empty (no items), no cleaning done\r\n",
            slabs_clsid);
        memcpy(buffer + bufcurr, temp, len);
        bufcurr += len;

        memcpy(buffer + bufcurr, "END\r\n", 6);
        bufcurr += 5;

        *bytes = bufcurr;
        return buffer;
    }

    int start = slab->reclaimed_slab_item_num;
    if (start >= slab->perslab)
        start = 0;

    int end = start + limit;
    if (end >= slab->perslab || limit == 0)
        end = slab->perslab;

    // run the cleaning, taking limits into account!
    uint64_t items_removed = 0;
    uint64_t bytes_removed = 0;

    item *it = (void*) ((char*)it_first + (slab->size*start));

    i = start;
    int items_found = 0;
    int _ttl_left = 0;
    for (; i < end; i++) {
        // iterate over items at start, so we won't have to iterate near each continue
        if (i != start) {
            it = (void*) (((char*)it) + slab->size);
        }

        // dirty reads, prevent locking!
        if ( (it->it_flags & ITEM_SLABBED) == 0 &&
             it->refcount > 0 /* from remove items */ &&
             it->slabs_clsid > 0 ) {
            items_found ++;
            if (it->exptime == 0)
                continue;
        } else
            continue;

        // check if we can expire the item
        _ttl_left = it->exptime - current_time;
        if (_ttl_left > 0) {
            if (_ttl_left >= clean_stats_size)
                _ttl_left = clean_stats_size-1;
            stats_times[_ttl_left] ++;
            continue;
        } else {
            _ttl_left = 0-_ttl_left;
            if (_ttl_left >= clean_stats_size)
                _ttl_left = clean_stats_size-1;
            stats_expired[_ttl_left]++;
        }

        // dirty reads end here, proceed with locking!
        hv = hash(ITEM_key(it), it->nkey, 0);

        /* Attempt to hash item lock the item. If locked, no
         * other callers can incr the refcount */
        if ((hold_lock = item_trylock(hv)) == NULL) {
            continue;
        }

        /* Now see if the item is refcount locked */
        if (refcount_incr(&it->refcount) != 2) {
            refcount_decr(&it->refcount);
            if (hold_lock) {
                item_trylock_unlock(hold_lock);
            }
            continue;
        }

        // LOCKED, proceed with removing!
        assert(it->nkey <= KEY_MAX_LENGTH);

        _ttl_left = it->exptime - current_time;
        if ( (it->it_flags & ITEM_SLABBED) == 0 &&
             it->refcount > 0 /* from remove items */ &&
             it->slabs_clsid > 0 &&
             _ttl_left <= 0) {
            // do_item_remove(it); item will get removed automatically in unlink function!
            do_item_unlink_nolock_nostat(it, hv, &items_removed, &bytes_removed);

            /* Initialize the item block: */
            it->slabs_clsid = 0;
            slabs_free(it, ITEM_ntotal(it), slabs_clsid);
        } else {
            refcount_decr(&it->refcount);
        }

        if (hold_lock) {
            item_trylock_unlock(hold_lock);
        }
    }

    // let's update position!
    slab->reclaimed_slab_item_num = i;

    // update stats
    STATS_LOCK();
    stats.curr_bytes -= bytes_removed;
    stats.curr_items -= items_removed;
    stats.reclaimed_fast += items_removed;
    stats.reclaimed_fast_bytes += bytes_removed;
    stats.reclaim_item_passes += (i - start);
    stats.reclaim_item_found += items_found;
    stats.reclaim_slab_memory_passes += (slab_changed ? 1 : 0);
    STATS_UNLOCK();

    int len = snprintf(temp, sizeof(temp),
        "Expiring items in SLAB: %d, Memory region: %d / %d (max: %d), Item pos: S:%d / E:%d\r\n"
        "Scanned items: %d (Found: %d) / Expired: %llu (%llu KB)\r\n"
        "Limits ... Items: %u, Max Remove: %u (0 - none)\r\n",
        slabs_clsid, slab->reclaimed_slab_num, slab->slabs, slab->list_size,
        _ipos_start, slab->reclaimed_slab_item_num,
        (i - start), items_found,
        (unsigned long long) items_removed,
        (unsigned long long) ( (bytes_removed+1023) / 1024),
        limit, limit_remove);
    memcpy(buffer + bufcurr, temp, len);
    bufcurr += len;

    len = snprintf(temp, sizeof(temp), "TTLs: ");
    memcpy(buffer + bufcurr, temp, len);
    bufcurr += len;
    for (i = 0; i < clean_stats_size; i++) {
        int len = snprintf(temp, sizeof(temp), "%d,", stats_times[i]);
        memcpy(buffer + bufcurr, temp, len);
        bufcurr += len;
    }
    bufcurr --;
    *(buffer + bufcurr) = 0;

    len = snprintf(temp, sizeof(temp), "\r\nExpired Age: ");
    memcpy(buffer + bufcurr, temp, len);
    bufcurr += len;
    for (i = 0; i < clean_stats_size; i++) {
        int len = snprintf(temp, sizeof(temp), "%d,", stats_expired[i]);
        memcpy(buffer + bufcurr, temp, len);
        bufcurr += len;
    }
    bufcurr --;
    *(buffer + bufcurr) = 0;

    memcpy(buffer + bufcurr, "\r\nEND\r\n", 8);
    bufcurr += 7;

    *bytes = bufcurr;
    return buffer;
}
/* refcount == 0 is safe since nobody can incr while item_lock is held.
 * refcount != 0 is impossible since flags/etc can be modified in other
 * threads. instead, note we found a busy one and bail. logic in do_item_get
 * will prevent busy items from continuing to be busy
 * NOTE: This is checking it_flags outside of an item lock. I believe this
 * works since it_flags is 8 bits, and we're only ever comparing a single bit
 * regardless. ITEM_SLABBED bit will always be correct since we're holding the
 * lock which modifies that bit. ITEM_LINKED won't exist if we're between an
 * item having ITEM_SLABBED removed, and the key hasn't been added to the item
 * yet. The memory barrier from the slabs lock should order the key write and the
 * flags to the item?
 * If ITEM_LINKED did exist and was just removed, but we still see it, that's
 * still safe since it will have a valid key, which we then lock, and then
 * recheck everything.
 * This may not be safe on all platforms; If not, slabs_alloc() will need to
 * seed the item key while holding slabs_lock.
 */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    uint32_t hv;
    void *hold_lock;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        hv = 0;
        hold_lock = NULL;
        item *it = slab_rebal.slab_pos;
        status = MOVE_PASS;
        /* ITEM_FETCHED when ITEM_SLABBED is overloaded to mean we've cleared
         * the chunk for move. Only these two flags should exist.
         */
        if (it->it_flags != (ITEM_SLABBED|ITEM_FETCHED)) {
            /* ITEM_SLABBED can only be added/removed under the slabs_lock */
            if (it->it_flags & ITEM_SLABBED) {
                /* remove from slab freelist */
                if (s_cls->slots == it) {
                    s_cls->slots = it->next;
                }
                if (it->next) it->next->prev = it->prev;
                if (it->prev) it->prev->next = it->next;
                s_cls->sl_curr--;
                status = MOVE_FROM_SLAB;
            } else if ((it->it_flags & ITEM_LINKED) != 0) {
                /* If it doesn't have ITEM_SLABBED, the item could be in any
                 * state on its way to being freed or written to. If no
                 * ITEM_SLABBED, but it's had ITEM_LINKED, it must be active
                 * and have the key written to it already. */
                hv = hash(ITEM_key(it), it->nkey);
                if ((hold_lock = item_trylock(hv)) == NULL) {
                    status = MOVE_LOCKED;
                } else {
                    refcount = refcount_incr(&it->refcount);
                    if (refcount == 2) { /* item is linked but not busy */
                        /* Double check ITEM_LINKED flag here, since we're
                         * past a memory barrier from the mutex. */
                        if ((it->it_flags & ITEM_LINKED) != 0) {
                            status = MOVE_FROM_LRU;
                        } else {
                            /* refcount == 1 + !ITEM_LINKED means the item is being
                             * uploaded to, or was just unlinked but hasn't been freed
                             * yet. Let it bleed off on its own and try again later */
                            status = MOVE_BUSY;
                        }
                    } else {
                        if (settings.verbose > 2) {
                            fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                                it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                        }
                        status = MOVE_BUSY;
                    }
                    /* Item lock must be held while modifying refcount */
                    if (status == MOVE_BUSY) {
                        refcount_decr(&it->refcount);
                        item_trylock_unlock(hold_lock);
                    }
                }
            } else {
                /* See above comment. No ITEM_SLABBED or ITEM_LINKED. Mark
                 * busy and wait for item to complete its upload. */
                status = MOVE_BUSY;
            }
        }

        int save_item = 0;
        item *new_it = NULL;
        size_t ntotal = 0;
        switch (status) {
            case MOVE_FROM_LRU:
                /* Lock order is LRU locks -> slabs_lock. unlink uses LRU lock.
                 * We only need to hold the slabs_lock while initially looking
                 * at an item, and at this point we have an exclusive refcount
                 * (2) + the item is locked. Drop slabs lock, drop item to
                 * refcount 1 (just our own, then fall through and wipe it
                 */
                /* Check if expired or flushed */
                ntotal = ITEM_ntotal(it);
                /* REQUIRES slabs_lock: CHECK FOR cls->sl_curr > 0 */
                if ((it->exptime != 0 && it->exptime < current_time)
                    || item_is_flushed(it)) {
                    /* TODO: maybe we only want to save if item is in HOT or
                     * WARM LRU?
                     */
                    save_item = 0;
                } else if ((new_it = slab_rebalance_alloc(ntotal, slab_rebal.s_clsid)) == NULL) {
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else {
                    save_item = 1;
                }
                pthread_mutex_unlock(&slabs_lock);
                if (save_item) {
                    /* if free memory, memcpy. clear prev/next/h_bucket */
                    memcpy(new_it, it, ntotal);
                    new_it->prev = 0;
                    new_it->next = 0;
                    new_it->h_next = 0;
                    /* These are definitely required. else fails assert */
                    new_it->it_flags &= ~ITEM_LINKED;
                    new_it->refcount = 0;
                    do_item_replace(it, new_it, hv);
                    slab_rebal.rescues++;
                } else {
                    do_item_unlink(it, hv);
                }
                item_trylock_unlock(hold_lock);
                pthread_mutex_lock(&slabs_lock);
                /* Always remove the ntotal, as we added it in during
                 * do_slabs_alloc() when copying the item.
                 */
                s_cls->requested -= ntotal;
            case MOVE_FROM_SLAB:
                it->refcount = 0;
                it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                break;
            case MOVE_BUSY:
            case MOVE_LOCKED:
                slab_rebal.busy_items++;
                was_busy++;
                break;
            case MOVE_PASS:
                break;
        }

        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            STATS_LOCK();
            stats.slab_reassign_busy_items += slab_rebal.busy_items;
            STATS_UNLOCK();
            slab_rebal.busy_items = 0;
        } else {
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);

    return was_busy;
}
/* refcount == 0 is safe since nobody can incr while item_lock is held.
 * refcount != 0 is impossible since flags/etc can be modified in other
 * threads. instead, note we found a busy one and bail. logic in do_item_get
 * will prevent busy items from continuing to be busy
 * NOTE: This is checking it_flags outside of an item lock. I believe this
 * works since it_flags is 8 bits, and we're only ever comparing a single bit
 * regardless. ITEM_SLABBED bit will always be correct since we're holding the
 * lock which modifies that bit. ITEM_LINKED won't exist if we're between an
 * item having ITEM_SLABBED removed, and the key hasn't been added to the item
 * yet. The memory barrier from the slabs lock should order the key write and the
 * flags to the item?
 * If ITEM_LINKED did exist and was just removed, but we still see it, that's
 * still safe since it will have a valid key, which we then lock, and then
 * recheck everything.
 * This may not be safe on all platforms; If not, slabs_alloc() will need to
 * seed the item key while holding slabs_lock.
 */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    uint32_t hv;
    void *hold_lock;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        hv = 0;
        hold_lock = NULL;
        item *it = slab_rebal.slab_pos;
        item_chunk *ch = NULL;
        status = MOVE_PASS;
        if (it->it_flags & ITEM_CHUNK) {
            /* This chunk is a chained part of a larger item. */
            ch = (item_chunk *) it;
            /* Instead, we use the head chunk to find the item and effectively
             * lock the entire structure. If a chunk has ITEM_CHUNK flag, its
             * head cannot be slabbed, so the normal routine is safe. */
            it = ch->head;
            assert(it->it_flags & ITEM_CHUNKED);
        }

        /* ITEM_FETCHED when ITEM_SLABBED is overloaded to mean we've cleared
         * the chunk for move. Only these two flags should exist. */
        if (it->it_flags != (ITEM_SLABBED|ITEM_FETCHED)) {
            /* ITEM_SLABBED can only be added/removed under the slabs_lock */
            if (it->it_flags & ITEM_SLABBED) {
                assert(ch == NULL);
                slab_rebalance_cut_free(s_cls, it);
                status = MOVE_FROM_SLAB;
            } else if ((it->it_flags & ITEM_LINKED) != 0) {
                /* If it doesn't have ITEM_SLABBED, the item could be in any
                 * state on its way to being freed or written to. If no
                 * ITEM_SLABBED, but it's had ITEM_LINKED, it must be active
                 * and have the key written to it already. */
                hv = hash(ITEM_key(it), it->nkey);
                if ((hold_lock = item_trylock(hv)) == NULL) {
                    status = MOVE_LOCKED;
                } else {
                    refcount = refcount_incr(it);
                    if (refcount == 2) { /* item is linked but not busy */
                        /* Double check ITEM_LINKED flag here, since we're
                         * past a memory barrier from the mutex. */
                        if ((it->it_flags & ITEM_LINKED) != 0) {
                            status = MOVE_FROM_LRU;
                        } else {
                            /* refcount == 1 + !ITEM_LINKED means the item is being
                             * uploaded to, or was just unlinked but hasn't been freed
                             * yet. Let it bleed off on its own and try again later */
                            status = MOVE_BUSY;
                        }
                    } else {
                        if (settings.verbose > 2) {
                            fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                                it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                        }
                        status = MOVE_BUSY;
                    }
                    /* Item lock must be held while modifying refcount */
                    if (status == MOVE_BUSY) {
                        refcount_decr(it);
                        item_trylock_unlock(hold_lock);
                    }
                }
            } else {
                /* See above comment. No ITEM_SLABBED or ITEM_LINKED. Mark
                 * busy and wait for item to complete its upload. */
                status = MOVE_BUSY;
            }
        }

        int save_item = 0;
        item *new_it = NULL;
        size_t ntotal = 0;
        switch (status) {
            case MOVE_FROM_LRU:
                /* Lock order is LRU locks -> slabs_lock. unlink uses LRU lock.
                 * We only need to hold the slabs_lock while initially looking
                 * at an item, and at this point we have an exclusive refcount
                 * (2) + the item is locked. Drop slabs lock, drop item to
                 * refcount 1 (just our own, then fall through and wipe it
                 */
                /* Check if expired or flushed */
                ntotal = ITEM_ntotal(it);
                /* REQUIRES slabs_lock: CHECK FOR cls->sl_curr > 0 */
                if (ch == NULL && (it->it_flags & ITEM_CHUNKED)) {
                    /* Chunked should be identical to non-chunked, except we need
                     * to swap out ntotal for the head-chunk-total. */
                    ntotal = s_cls->size;
                }
                if ((it->exptime != 0 && it->exptime < current_time)
                    || item_is_flushed(it)) {
                    /* Expired, don't save. */
                    save_item = 0;
                } else if (ch == NULL &&
                        (new_it = slab_rebalance_alloc(ntotal, slab_rebal.s_clsid)) == NULL) {
                    /* Not a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else if (ch != NULL &&
                        (new_it = slab_rebalance_alloc(s_cls->size, slab_rebal.s_clsid)) == NULL) {
                    /* Is a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else {
                    /* Was whatever it was, and we have memory for it. */
                    save_item = 1;
                }
                pthread_mutex_unlock(&slabs_lock);
                unsigned int requested_adjust = 0;
                if (save_item) {
                    if (ch == NULL) {
                        assert((new_it->it_flags & ITEM_CHUNKED) == 0);
                        /* if free memory, memcpy. clear prev/next/h_bucket */
                        memcpy(new_it, it, ntotal);
                        new_it->prev = 0;
                        new_it->next = 0;
                        new_it->h_next = 0;
                        /* These are definitely required. else fails assert */
                        new_it->it_flags &= ~ITEM_LINKED;
                        new_it->refcount = 0;
                        do_item_replace(it, new_it, hv);
                        /* Need to walk the chunks and repoint head  */
                        if (new_it->it_flags & ITEM_CHUNKED) {
                            item_chunk *fch = (item_chunk *) ITEM_data(new_it);
                            fch->next->prev = fch;
                            while (fch) {
                                fch->head = new_it;
                                fch = fch->next;
                            }
                        }
                        it->refcount = 0;
                        it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                        slab_rebal.rescues++;
                        requested_adjust = ntotal;
                    } else {
                        item_chunk *nch = (item_chunk *) new_it;
                        /* Chunks always have head chunk (the main it) */
                        ch->prev->next = nch;
                        if (ch->next)
                            ch->next->prev = nch;
                        memcpy(nch, ch, ch->used + sizeof(item_chunk));
                        ch->refcount = 0;
                        ch->it_flags = ITEM_SLABBED|ITEM_FETCHED;
                        slab_rebal.chunk_rescues++;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key((item *)ch), "deadbeef", 8);
#endif
                        refcount_decr(it);
                        requested_adjust = s_cls->size;
                    }
                } else {
                    /* restore ntotal in case we tried saving a head chunk. */
                    ntotal = ITEM_ntotal(it);
                    do_item_unlink(it, hv);
                    slabs_free(it, ntotal, slab_rebal.s_clsid);
                    /* Swing around again later to remove it from the freelist. */
                    slab_rebal.busy_items++;
                    was_busy++;
                }
                item_trylock_unlock(hold_lock);
                pthread_mutex_lock(&slabs_lock);
                /* Always remove the ntotal, as we added it in during
                 * do_slabs_alloc() when copying the item.
                 */
                s_cls->requested -= requested_adjust;
                break;
            case MOVE_FROM_SLAB:
                it->refcount = 0;
                it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                break;
            case MOVE_BUSY:
            case MOVE_LOCKED:
                slab_rebal.busy_items++;
                was_busy++;
                break;
            case MOVE_PASS:
                break;
        }

        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            STATS_LOCK();
            stats.slab_reassign_busy_items += slab_rebal.busy_items;
            STATS_UNLOCK();
            slab_rebal.busy_items = 0;
        } else {
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);

    return was_busy;
}
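The chunked branch above works because every chunk carries a head pointer back to its parent item, so taking the head's hash-bucket lock effectively locks the whole chain. Below is a simplified sketch of the chunk fields the mover touches (ch->head, ch->next, ch->prev, ch->used, refcount, it_flags); it is not the full memcached definition, and the field layout is illustrative only.

#include <stdint.h>

struct _sketch_item;                        /* stands in for memcached's item */

typedef struct _sketch_item_chunk {
    struct _sketch_item_chunk *next;        /* next chunk in the chain        */
    struct _sketch_item_chunk *prev;        /* previous chunk                 */
    struct _sketch_item       *head;        /* owning item: what ch->head is  */
    unsigned int used;                      /* payload bytes held by chunk    */
    unsigned short refcount;
    uint8_t it_flags;                       /* carries ITEM_CHUNK             */
    char data[];                            /* payload follows in memory      */
} sketch_item_chunk;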
/* Expansion thread function. The expansion strategy is:
 * The expansion thread is created in main(); when assoc_insert finds the item
 * count has grown past 1.5x the hash table capacity, it wakes this thread.
 * The thread first creates a new hash table with twice the capacity, then
 * migrates the data from the old table to the new one.
 * Migration starts at index 0 of the old table and moves one bucket at a time
 * (the granularity could be raised, but since migration takes a lock, doing so
 * may increase how long worker threads wait for that lock).
 * Once migration finishes, the old table is freed. */
static void *assoc_maintenance_thread(void *arg) {

    mutex_lock(&maintenance_lock);
    while (do_run_maintenance_thread) {
        int ii = 0;

        /* There is only one expansion thread, so no need to global lock. */
        /* the loop body only runs while expanding is true, so the thread does
         * nothing right after it is created */
        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            item *it, *next;
            int bucket;
            void *item_lock = NULL;

            /* bucket = hv & hashmask(hashpower) => the bucket of hash table
             * is the lowest N bits of the hv, and the bucket of item_locks is
             * also the lowest M bits of hv, and N is greater than M.
             * So we can process expanding with only one item_lock. cool! */
            /* take the lock covering this one bucket */
            if ((item_lock = item_trylock(expand_bucket))) {
                /* migrate every item in the bucket */
                for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                    next = it->h_next;
                    /* recompute the hash to find the bucket in the new table */
                    bucket = hash(ITEM_key(it), it->nkey) & hashmask(hashpower);
                    it->h_next = primary_hashtable[bucket];
                    primary_hashtable[bucket] = it;
                }

                old_hashtable[expand_bucket] = NULL;

                expand_bucket++;
                /* migration finished */
                if (expand_bucket == hashsize(hashpower - 1)) {
                    expanding = false;   /* clear the expansion flag */
                    free(old_hashtable); /* free the old table */
                    STATS_LOCK();
                    stats.hash_bytes -= hashsize(hashpower - 1) * sizeof(void *);
                    stats.hash_is_expanding = 0;
                    STATS_UNLOCK();
                    if (settings.verbose > 1)
                        fprintf(stderr, "Hash table expansion done\n");
                }
            } else {
                usleep(10*1000);
            }

            if (item_lock) {
                item_trylock_unlock(item_lock);
                item_lock = NULL;
            }
        }

        if (!expanding) {
            /* We are done expanding.. just wait for next invocation */
            /* Nothing to migrate: block this thread until a worker, after
             * inserting data, finds the item count has reached 1.5x the hash
             * table size and calls assoc_start_expand, which signals the
             * condition variable to wake the migration thread. */
            started_expanding = false;
            pthread_cond_wait(&maintenance_cond, &maintenance_lock);
            /* assoc_expand() swaps out the hash table entirely, so we need
             * all threads to not hold any references related to the hash
             * table while this happens.
             * This is instead of a more complex, possibly slower algorithm to
             * allow dynamic hash table expansion without causing significant
             * wait times. */
            pause_threads(PAUSE_ALL_THREADS);
            assoc_expand(); /* allocate a larger hash table and set expanding to true */
            pause_threads(RESUME_ALL_THREADS);
        }
    }
    return NULL;
}
/* Allocate a free item from the slab system */
item *do_item_alloc(char *key, const size_t nkey, const int flags,
                    const rel_time_t exptime, const int nbytes,
                    const uint32_t cur_hv) {
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    unsigned int id = slabs_clsid(ntotal);
    if (id == 0)
        return 0;

    mutex_lock(&cache_lock);
    /* do a quick check if we have any expired items in the tail.. */
    int tries = 5;
    int tried_alloc = 0;
    item *search;
    void *hold_lock = NULL;
    rel_time_t oldest_live = settings.oldest_live;

    search = tails[id];
    /* We walk up *only* for locked items. Never searching for expired.
     * Waste of CPU for almost all deployments */
    for (; tries > 0 && search != NULL; tries--, search=search->prev) {
        uint32_t hv = hash(ITEM_key(search), search->nkey, 0);
        /* Attempt to hash item lock the "search" item. If locked, no
         * other callers can incr the refcount */
        /* FIXME: I think we need to mask the hv here for comparison? */
        if (hv != cur_hv && (hold_lock = item_trylock(hv)) == NULL)
            continue;
        /* Now see if the item is refcount locked */
        if (refcount_incr(&search->refcount) != 2) {
            refcount_decr(&search->refcount);
            /* Old rare bug could cause a refcount leak. We haven't seen
             * it in years, but we leave this code in to prevent failures
             * just in case */
            if (search->time + TAIL_REPAIR_TIME < current_time) {
                itemstats[id].tailrepairs++;
                search->refcount = 1;
                do_item_unlink_nolock(search, hv);
            }
            if (hold_lock)
                item_trylock_unlock(hold_lock);
            continue;
        }

        /* First check whether the last item in the LRU queue has expired;
         * if it has, hand that item back to the caller */
        if ((search->exptime != 0 && search->exptime < current_time)
            || (search->time <= oldest_live && oldest_live <= current_time)) {
            itemstats[id].reclaimed++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                itemstats[id].expired_unfetched++;
            }
            it = search;
            slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
            /* remove this item from the LRU queue and the hash table */
            do_item_unlink_nolock(it, hv);
            /* Initialize the item block: */
            it->slabs_clsid = 0;
        }
        /* No expired item, so try allocating from the slab class; if we are
         * unlucky and that fails, evict the last item of the LRU queue and
         * hand it to the caller */
        else if ((it = slabs_alloc(ntotal, id)) == NULL) {
            tried_alloc = 1;
            if (settings.evict_to_free == 0) {
                itemstats[id].outofmemory++; /* reported in stats output */
            } else {
                /* bump this slab class's eviction count; the thread that
                 * analyzes statistics uses it later */
                itemstats[id].evicted++;
                /* reported in stats output */
                itemstats[id].evicted_time = current_time - search->time;
                if (search->exptime != 0)
                    itemstats[id].evicted_nonzero++;
                if ((search->it_flags & ITEM_FETCHED) == 0) {
                    itemstats[id].evicted_unfetched++;
                }
                it = search;
                /* no new memory needs to be requested; adjust the accounting */
                slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
                /* remove the old item from the hash table and the LRU queue */
                do_item_unlink_nolock(it, hv);
                /* Initialize the item block: */
                it->slabs_clsid = 0;

                /* If we've just evicted an item, and the automover is set to
                 * angry bird mode, attempt to rip memory into this slab class.
                 * TODO: Move valid object detection into a function, and on a
                 * "successful" memory pull, look behind and see if the next alloc
                 * would be an eviction. Then kick off the slab mover before the
                 * eviction happens.
                 * Note: with slab_automove=2 (the default is 1) this is the
                 * angry mode: whenever an allocation fails, a slab freed from
                 * another slab class is immediately moved into this slab class,
                 * rather than choosing which slab to move based on statistics. */
                if (settings.slab_automove == 2)
                    slabs_reassign(-1, id);
            }
        }

        refcount_decr(&search->refcount);
        /* If hash values were equal, we don't grab a second lock */
        if (hold_lock)
            item_trylock_unlock(hold_lock);
        break;
    }

    if (!tried_alloc && (tries == 0 || search == NULL))
        it = slabs_alloc(ntotal, id);

    if (it == NULL) {
        itemstats[id].outofmemory++;
        mutex_unlock(&cache_lock);
        return NULL;
    }

    assert(it->slabs_clsid == 0);
    assert(it != heads[id]);

    /* Item initialization can happen outside of the lock; the item's already
     * been removed from the slab LRU. */
    it->refcount = 1;     /* the caller will have a reference */
    mutex_unlock(&cache_lock);
    it->next = it->prev = it->h_next = 0;
    it->slabs_clsid = id;

    DEBUG_REFCNT(it, '*');
    it->it_flags = settings.use_cas ? ITEM_CAS : 0;
    it->nkey = nkey;
    it->nbytes = nbytes;
    memcpy(ITEM_key(it), key, nkey);
    it->exptime = exptime;
    memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
    it->nsuffix = nsuffix;
    return it;
}
/* refcount == 0 is safe since nobody can incr while item_lock is held.
 * refcount != 0 is impossible since flags/etc can be modified in other
 * threads. instead, note we found a busy one and bail. logic in do_item_get
 * will prevent busy items from continuing to be busy
 * NOTE: This is checking it_flags outside of an item lock. I believe this
 * works since it_flags is 8 bits, and we're only ever comparing a single bit
 * regardless. ITEM_SLABBED bit will always be correct since we're holding the
 * lock which modifies that bit. ITEM_LINKED won't exist if we're between an
 * item having ITEM_SLABBED removed, and the key hasn't been added to the item
 * yet. The memory barrier from the slabs lock should order the key write and the
 * flags to the item?
 * If ITEM_LINKED did exist and was just removed, but we still see it, that's
 * still safe since it will have a valid key, which we then lock, and then
 * recheck everything.
 * This may not be safe on all platforms; If not, slabs_alloc() will need to
 * seed the item key while holding slabs_lock.
 */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    uint32_t hv;
    void *hold_lock;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        hv = 0;
        hold_lock = NULL;
        item *it = slab_rebal.slab_pos;
        status = MOVE_PASS;
        if (it->slabs_clsid != 255) {
            /* ITEM_SLABBED can only be added/removed under the slabs_lock */
            if (it->it_flags & ITEM_SLABBED) {
                /* remove from slab freelist */
                if (s_cls->slots == it) {
                    s_cls->slots = it->next;
                }
                if (it->next) it->next->prev = it->prev;
                if (it->prev) it->prev->next = it->next;
                s_cls->sl_curr--;
                status = MOVE_FROM_SLAB;
            } else if ((it->it_flags & ITEM_LINKED) != 0) {
                /* If it doesn't have ITEM_SLABBED, the item could be in any
                 * state on its way to being freed or written to. If no
                 * ITEM_SLABBED, but it's had ITEM_LINKED, it must be active
                 * and have the key written to it already. */
                hv = hash(ITEM_key(it), it->nkey);
                if ((hold_lock = item_trylock(hv)) == NULL) {
                    status = MOVE_LOCKED;
                } else {
                    refcount = refcount_incr(&it->refcount);
                    if (refcount == 2) { /* item is linked but not busy */
                        /* Double check ITEM_LINKED flag here, since we're
                         * past a memory barrier from the mutex. */
                        if ((it->it_flags & ITEM_LINKED) != 0) {
                            status = MOVE_FROM_LRU;
                        } else {
                            /* refcount == 1 + !ITEM_LINKED means the item is being
                             * uploaded to, or was just unlinked but hasn't been freed
                             * yet. Let it bleed off on its own and try again later */
                            status = MOVE_BUSY;
                        }
                    } else {
                        if (settings.verbose > 2) {
                            fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                                it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                        }
                        status = MOVE_BUSY;
                    }
                    /* Item lock must be held while modifying refcount */
                    if (status == MOVE_BUSY) {
                        refcount_decr(&it->refcount);
                        item_trylock_unlock(hold_lock);
                    }
                }
            }
        }

        switch (status) {
            case MOVE_FROM_LRU:
                /* Lock order is LRU locks -> slabs_lock. unlink uses LRU lock.
                 * We only need to hold the slabs_lock while initially looking
                 * at an item, and at this point we have an exclusive refcount
                 * (2) + the item is locked. Drop slabs lock, drop item to
                 * refcount 1 (just our own, then fall through and wipe it
                 */
                pthread_mutex_unlock(&slabs_lock);
                do_item_unlink(it, hv);
                item_trylock_unlock(hold_lock);
                pthread_mutex_lock(&slabs_lock);
            case MOVE_FROM_SLAB:
                it->refcount = 0;
                it->it_flags = 0;
                it->slabs_clsid = 255;
                break;
            case MOVE_BUSY:
            case MOVE_LOCKED:
                slab_rebal.busy_items++;
                was_busy++;
                break;
            case MOVE_PASS:
                break;
        }

        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            slab_rebal.busy_items = 0;
        } else {
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);

    return was_busy;
}
static void *assoc_maintenance_thread(void *arg) {

    mutex_lock(&maintenance_lock);
    /* do_run_maintenance_thread is a global that starts at 1;
     * stop_assoc_maintenance_thread sets it to 0 to stop the migration thread */
    while (do_run_maintenance_thread) {
        int ii = 0;

        /* There is only one expansion thread, so no need to global lock. */
        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            item *it, *next;
            int bucket;
            void *item_lock = NULL;

            /* bucket = hv & hashmask(hashpower) => the bucket of hash table
             * is the lowest N bits of the hv, and the bucket of item_locks is
             * also the lowest M bits of hv, and N is greater than M.
             * So we can process expanding with only one item_lock. cool! */
            /* hash_bulk_move controls how many buckets' worth of items are
             * moved per pass; the default is one. The loop body only runs
             * while expanding is true, so nothing happens right after the
             * thread is created. */
            if ((item_lock = item_trylock(expand_bucket))) {
                /* expand_bucket is reset to 0 in assoc_expand. Walk the
                 * old-table bucket named by expand_bucket and migrate all of
                 * its items into the new hash table. */
                for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                    next = it->h_next;
                    /* recompute the hash to find the item's bucket in the new table */
                    bucket = hash(ITEM_key(it), it->nkey) & hashmask(hashpower);
                    /* insert the item into the new hash table */
                    it->h_next = primary_hashtable[bucket];
                    primary_hashtable[bucket] = it;
                }

                /* no need to clear the old bucket; just set the head of its
                 * collision chain to NULL */
                old_hashtable[expand_bucket] = NULL;
                /* one bucket done; point expand_bucket at the next bucket to migrate */
                expand_bucket++;
                if (expand_bucket == hashsize(hashpower - 1)) { /* all data migrated */
                    expanding = false; /* clear the expansion flag */
                    free(old_hashtable);
                    STATS_LOCK();
                    stats.hash_bytes -= hashsize(hashpower - 1) * sizeof(void *);
                    stats.hash_is_expanding = 0;
                    STATS_UNLOCK();
                    if (settings.verbose > 1)
                        fprintf(stderr, "Hash table expansion done\n");
                }
            } else {
                usleep(10*1000);
            }

            if (item_lock) {
                /* after handling the items of hash_bulk_move buckets, release the lock */
                item_trylock_unlock(item_lock);
                item_lock = NULL;
            }
        }

        if (!expanding) { /* no (more) data needs migrating */
            /* We are done expanding.. just wait for next invocation */
            started_expanding = false; /* reset */
            /* Block the migration thread until a worker, after inserting an
             * item, finds the item count has reached 1.5x the hash table size
             * and calls assoc_start_expand, which uses pthread_cond_signal to
             * wake this thread. */
            pthread_cond_wait(&maintenance_cond, &maintenance_lock);
            /* assoc_expand() swaps out the hash table entirely, so we need
             * all threads to not hold any references related to the hash
             * table while this happens.
             * This is instead of a more complex, possibly slower algorithm to
             * allow dynamic hash table expansion without causing significant
             * wait times. */
            pause_threads(PAUSE_ALL_THREADS);
            assoc_expand(); /* allocate a larger hash table and set expanding to true */
            pause_threads(RESUME_ALL_THREADS);
        }
    }
    return NULL;
}
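The translated comments above describe the wake-up path: a worker thread inserting an item notices the table holds more than 1.5x hashsize(hashpower) entries and calls assoc_start_expand, which signals maintenance_cond. The sketch below reconstructs that path from the behavior described here; treat the exact threshold expression and the hash_items counter name as assumptions of the sketch.

static void assoc_start_expand(void) {
    if (started_expanding)
        return;
    started_expanding = true;
    pthread_cond_signal(&maintenance_cond);   /* wake assoc_maintenance_thread */
}

/* ...inside assoc_insert(), after linking the new item into its bucket: */
hash_items++;
if (!expanding && hash_items > (hashsize(hashpower) * 3) / 2) {
    assoc_start_expand();                     /* 1.5x threshold reached */
}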