/* Slab rebalancer thread. * Does not use spinlocks since it is not timing sensitive. Burn less CPU and * go to sleep if locks are contended */ static void *slab_maintenance_thread(void *arg) { int was_busy = 0; int src, dest; while (do_run_slab_thread) { if (slab_rebalance_signal == 1) { if (slab_rebalance_start() < 0) { /* Handle errors with more specifity as required. */ slab_rebalance_signal = 0; } } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) { /* If we have a decision to continue, continue it */ was_busy = slab_rebalance_move(); } else if (settings.slab_automove && slab_automove_decision(&src, &dest) == 1) { /* Blind to the return codes. It will retry on its own */ slabs_reassign(src, dest); } if (slab_rebal.done) { slab_rebalance_finish(); } /* Sleep a bit if no work to do, or waiting on busy objects */ if (was_busy || !slab_rebalance_signal) sleep(1); } return NULL; }
/* Loop up to N times: * If too many items are in HOT_LRU, push to COLD_LRU * If too many items are in WARM_LRU, push to COLD_LRU * If too many items are in COLD_LRU, poke COLD_LRU tail * 1000 loops with 1ms min sleep gives us under 1m items shifted/sec. The * locks can't handle much more than that. Leaving a TODO for how to * autoadjust in the future. */ static int lru_maintainer_juggle(const int slabs_clsid) { int i; int did_moves = 0; bool mem_limit_reached = false; uint64_t total_bytes = 0; unsigned int chunks_perslab = 0; unsigned int chunks_free = 0; /* TODO: if free_chunks below high watermark, increase aggressiveness */ chunks_free = slabs_available_chunks(slabs_clsid, &mem_limit_reached, &total_bytes, &chunks_perslab); if (settings.expirezero_does_not_evict) total_bytes -= noexp_lru_size(slabs_clsid); /* If slab automove is enabled on any level, and we have more than 2 pages * worth of chunks free in this class, ask (gently) to reassign a page * from this class back into the global pool (0) */ if (settings.slab_automove > 0 && chunks_free > (chunks_perslab * 2.5)) { slabs_reassign(slabs_clsid, SLAB_GLOBAL_PAGE_POOL); } /* Juggle HOT/WARM up to N times */ for (i = 0; i < 1000; i++) { int do_more = 0; if (lru_pull_tail(slabs_clsid, HOT_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS) || lru_pull_tail(slabs_clsid, WARM_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS)) { do_more++; } do_more += lru_pull_tail(slabs_clsid, COLD_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS); if (do_more == 0) break; did_moves++; } return did_moves; }
/* Slab rebalancer thread. * Does not use spinlocks since it is not timing sensitive. Burn less CPU and * go to sleep if locks are contended */ static void *slab_maintenance_thread(void *arg) { int src, dest; while (do_run_slab_thread) { if (settings.slab_automove == 1) { if (slab_automove_decision(&src, &dest) == 1) { /* Blind to the return codes. It will retry on its own */ slabs_reassign(src, dest); } sleep(1); } else { /* Don't wake as often if we're not enabled. * This is lazier than setting up a condition right now. */ sleep(5); } } return NULL; }
/* Slab rebalancer thread. * Does not use spinlocks since it is not timing sensitive. Burn less CPU and * go to sleep if locks are contended */ static void *slab_maintenance_thread(void *arg) { int src, dest; //src是失败次数很少,或者很平均的那个slab-class id,dest是那些分配失败次数骤增的slab-class id while (do_run_slab_thread) { if (settings.slab_automove == 1)//如果是前面的激烈模式,slab_automove=2,就不用该线程去选择移动的slab,每次失败主动会去移动那些slab { if (slab_automove_decision(&src, &dest) == 1)//这里是具体的选择src和dest的处理函数 { /* Blind to the return codes. It will retry on its own */ slabs_reassign(src, dest); } sleep(1); } else { /* Don't wake as often if we're not enabled. * This is lazier than setting up a condition right now. */ sleep(5); } } return NULL; }
/*@null@*/ item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_time_t exptime, const int nbytes, const uint32_t cur_hv) { uint8_t nsuffix; item *it = NULL; char suffix[40]; //计算这个item的空间 size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix); if (settings.use_cas) { ntotal += sizeof(uint64_t); } //根据大小判断从属于哪个slab unsigned int id = slabs_clsid(ntotal); if (id == 0) return 0; mutex_lock(&cache_lock); /* do a quick check if we have any expired items in the tail.. */ //在LRU中尝试5次还没合适的空间,则执行申请空间的操作 int tries = 5; int tried_alloc = 0; item *search; void *hold_lock = NULL; rel_time_t oldest_live = settings.oldest_live; search = tails[id]; /* We walk up *only* for locked items. Never searching for expired. * Waste of CPU for almost all deployments */ for (; tries > 0 && search != NULL; tries--, search=search->prev) { uint32_t hv = hash(ITEM_key(search), search->nkey, 0); /* Attempt to hash item lock the "search" item. If locked, no * other callers can incr the refcount */ /* FIXME: I think we need to mask the hv here for comparison? */ if (hv != cur_hv && (hold_lock = item_trylock(hv)) == NULL) continue; /* Now see if the item is refcount locked */ if (refcount_incr(&search->refcount) != 2) { refcount_decr(&search->refcount); /* Old rare bug could cause a refcount leak. We haven't seen * it in years, but we leave this code in to prevent failures * just in case */ if (search->time + TAIL_REPAIR_TIME < current_time) { itemstats[id].tailrepairs++; search->refcount = 1; do_item_unlink_nolock(search, hv); } if (hold_lock) item_trylock_unlock(hold_lock); continue; } /* Expired or flushed */ // search指向的item过期了,则直接复用这块内存 if ((search->exptime != 0 && search->exptime < current_time) || (search->time <= oldest_live && oldest_live <= current_time)) { itemstats[id].reclaimed++; if ((search->it_flags & ITEM_FETCHED) == 0) { itemstats[id].expired_unfetched++; } it = search; slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal); do_item_unlink_nolock(it, hv); /* Initialize the item block: */ it->slabs_clsid = 0; } //此刻,过期失效的item没有找到,申请内存又失败了。看来只能使用 //LRU淘汰一个item(即使这个item并没有过期失效) else if ((it = slabs_alloc(ntotal, id)) == NULL) { tried_alloc = 1; if (settings.evict_to_free == 0) { itemstats[id].outofmemory++; } else { itemstats[id].evicted++; itemstats[id].evicted_time = current_time - search->time; if (search->exptime != 0) itemstats[id].evicted_nonzero++; if ((search->it_flags & ITEM_FETCHED) == 0) { itemstats[id].evicted_unfetched++; } it = search; slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal); do_item_unlink_nolock(it, hv); /* Initialize the item block: */ it->slabs_clsid = 0; /* If we've just evicted an item, and the automover is set to * angry bird mode, attempt to rip memory into this slab class. * TODO: Move valid object detection into a function, and on a * "successful" memory pull, look behind and see if the next alloc * would be an eviction. Then kick off the slab mover before the * eviction happens. */ if (settings.slab_automove == 2) slabs_reassign(-1, id); } } refcount_decr(&search->refcount); /* If hash values were equal, we don't grab a second lock */ if (hold_lock) item_trylock_unlock(hold_lock); break; } //从slab分配器中申请内存 if (!tried_alloc && (tries == 0 || search == NULL)) it = slabs_alloc(ntotal, id); if (it == NULL) { itemstats[id].outofmemory++; mutex_unlock(&cache_lock); return NULL; } assert(it->slabs_clsid == 0); assert(it != heads[id]); /* Item initialization can happen outside of the lock; the item's already * been removed from the slab LRU. */ it->refcount = 1; /* the caller will have a reference */ mutex_unlock(&cache_lock); it->next = it->prev = it->h_next = 0; it->slabs_clsid = id; DEBUG_REFCNT(it, '*'); it->it_flags = settings.use_cas ? ITEM_CAS : 0; it->nkey = nkey; it->nbytes = nbytes; memcpy(ITEM_key(it), key, nkey); it->exptime = exptime; memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix); it->nsuffix = nsuffix; return it; }
/*@null@*/ item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_time_t exptime, const int nbytes, const uint32_t cur_hv) { uint8_t nsuffix; item *it = NULL; char suffix[40]; size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix); if (settings.use_cas) { ntotal += sizeof(uint64_t); } unsigned int id = slabs_clsid(ntotal); if (id == 0) return 0; mutex_lock(&cache_lock); /* do a quick check if we have any expired items in the tail.. */ int tries = 5; /* Avoid hangs if a slab has nothing but refcounted stuff in it. */ int tries_lrutail_reflocked = 1000; int tried_alloc = 0; item *search; item *next_it; void *hold_lock = NULL; rel_time_t oldest_live = settings.oldest_live; search = tails[id]; /* We walk up *only* for locked items. Never searching for expired. * Waste of CPU for almost all deployments */ for (; tries > 0 && search != NULL; tries--, search=next_it) { /* we might relink search mid-loop, so search->prev isn't reliable */ next_it = search->prev; if (search->nbytes == 0 && search->nkey == 0 && search->it_flags == 1) { /* We are a crawler, ignore it. */ tries++; continue; } uint32_t hv = hash(ITEM_key(search), search->nkey); /* Attempt to hash item lock the "search" item. If locked, no * other callers can incr the refcount */ /* Don't accidentally grab ourselves, or bail if we can't quicklock */ if (hv == cur_hv || (hold_lock = item_trylock(hv)) == NULL) continue; /* Now see if the item is refcount locked */ if (refcount_incr(&search->refcount) != 2) { /* Avoid pathological case with ref'ed items in tail */ do_item_update_nolock(search); tries_lrutail_reflocked--; tries++; refcount_decr(&search->refcount); itemstats[id].lrutail_reflocked++; /* Old rare bug could cause a refcount leak. We haven't seen * it in years, but we leave this code in to prevent failures * just in case */ if (settings.tail_repair_time && search->time + settings.tail_repair_time < current_time) { itemstats[id].tailrepairs++; search->refcount = 1; do_item_unlink_nolock(search, hv); } if (hold_lock) item_trylock_unlock(hold_lock); if (tries_lrutail_reflocked < 1) break; continue; } /* Expired or flushed */ if ((search->exptime != 0 && search->exptime < current_time) || (search->time <= oldest_live && oldest_live <= current_time)) { itemstats[id].reclaimed++; if ((search->it_flags & ITEM_FETCHED) == 0) { itemstats[id].expired_unfetched++; } it = search; slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal); do_item_unlink_nolock(it, hv); /* Initialize the item block: */ it->slabs_clsid = 0; } else if ((it = slabs_alloc(ntotal, id)) == NULL) { tried_alloc = 1; if (settings.evict_to_free == 0) { itemstats[id].outofmemory++; } else { itemstats[id].evicted++; itemstats[id].evicted_time = current_time - search->time; if (search->exptime != 0) itemstats[id].evicted_nonzero++; if ((search->it_flags & ITEM_FETCHED) == 0) { itemstats[id].evicted_unfetched++; } shadow_item* new_shadow_it = create_shadow_item(search); hv = hash(new_shadow_it->key, new_shadow_it->nkey); shadow_assoc_insert(new_shadow_it, hv); insert_shadowq_item(new_shadow_it,new_shadow_it->slabs_clsid); it = search; slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal); do_item_unlink_nolock(it, hv); /* Initialize the item block: */ it->slabs_clsid = 0; /* If we've just evicted an item, and the automover is set to * angry bird mode, attempt to rip memory into this slab class. * TODO: Move valid object detection into a function, and on a * "successful" memory pull, look behind and see if the next alloc * would be an eviction. Then kick off the slab mover before the * eviction happens. */ if (settings.slab_automove == 2) slabs_reassign(-1, id); } } refcount_decr(&search->refcount); /* If hash values were equal, we don't grab a second lock */ if (hold_lock) item_trylock_unlock(hold_lock); break; } if (!tried_alloc && (tries == 0 || search == NULL)) it = slabs_alloc(ntotal, id); if (it == NULL) { itemstats[id].outofmemory++; mutex_unlock(&cache_lock); return NULL; } assert(it->slabs_clsid == 0); assert(it != heads[id]); /* Item initialization can happen outside of the lock; the item's already * been removed from the slab LRU. */ it->refcount = 1; /* the caller will have a reference */ mutex_unlock(&cache_lock); it->next = it->prev = it->h_next = 0; it->slabs_clsid = id; DEBUG_REFCNT(it, '*'); it->it_flags = settings.use_cas ? ITEM_CAS : 0; it->nkey = nkey; it->nbytes = nbytes; memcpy(ITEM_key(it), key, nkey); it->exptime = exptime; memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix); it->nsuffix = nsuffix; return it; }
/* Returns number of items remove, expired, or evicted. * Callable from worker threads or the LRU maintainer thread */ static int lru_pull_tail(const int orig_id, const int cur_lru, const uint64_t total_bytes, uint8_t flags) { item *it = NULL; int id = orig_id; int removed = 0; if (id == 0) return 0; int tries = 5; item *search; item *next_it; void *hold_lock = NULL; unsigned int move_to_lru = 0; uint64_t limit = 0; id |= cur_lru; pthread_mutex_lock(&lru_locks[id]); search = tails[id]; /* We walk up *only* for locked items, and if bottom is expired. */ for (; tries > 0 && search != NULL; tries--, search=next_it) { /* we might relink search mid-loop, so search->prev isn't reliable */ next_it = search->prev; if (search->nbytes == 0 && search->nkey == 0 && search->it_flags == 1) { /* We are a crawler, ignore it. */ if (flags & LRU_PULL_CRAWL_BLOCKS) { pthread_mutex_unlock(&lru_locks[id]); return 0; } tries++; continue; } uint32_t hv = hash(ITEM_key(search), search->nkey); /* Attempt to hash item lock the "search" item. If locked, no * other callers can incr the refcount. Also skip ourselves. */ if ((hold_lock = item_trylock(hv)) == NULL) continue; /* Now see if the item is refcount locked */ if (refcount_incr(&search->refcount) != 2) { /* Note pathological case with ref'ed items in tail. * Can still unlink the item, but it won't be reusable yet */ itemstats[id].lrutail_reflocked++; /* In case of refcount leaks, enable for quick workaround. */ /* WARNING: This can cause terrible corruption */ if (settings.tail_repair_time && search->time + settings.tail_repair_time < current_time) { itemstats[id].tailrepairs++; search->refcount = 1; /* This will call item_remove -> item_free since refcnt is 1 */ do_item_unlink_nolock(search, hv); item_trylock_unlock(hold_lock); continue; } } /* Expired or flushed */ if ((search->exptime != 0 && search->exptime < current_time) || item_is_flushed(search)) { itemstats[id].reclaimed++; if ((search->it_flags & ITEM_FETCHED) == 0) { itemstats[id].expired_unfetched++; } /* refcnt 2 -> 1 */ do_item_unlink_nolock(search, hv); /* refcnt 1 -> 0 -> item_free */ do_item_remove(search); item_trylock_unlock(hold_lock); removed++; /* If all we're finding are expired, can keep going */ continue; } /* If we're HOT_LRU or WARM_LRU and over size limit, send to COLD_LRU. * If we're COLD_LRU, send to WARM_LRU unless we need to evict */ switch (cur_lru) { case HOT_LRU: limit = total_bytes * settings.hot_lru_pct / 100; case WARM_LRU: if (limit == 0) limit = total_bytes * settings.warm_lru_pct / 100; if (sizes_bytes[id] > limit) { itemstats[id].moves_to_cold++; move_to_lru = COLD_LRU; do_item_unlink_q(search); it = search; removed++; break; } else if ((search->it_flags & ITEM_ACTIVE) != 0) { /* Only allow ACTIVE relinking if we're not too large. */ itemstats[id].moves_within_lru++; search->it_flags &= ~ITEM_ACTIVE; do_item_update_nolock(search); do_item_remove(search); item_trylock_unlock(hold_lock); } else { /* Don't want to move to COLD, not active, bail out */ it = search; } break; case COLD_LRU: it = search; /* No matter what, we're stopping */ if (flags & LRU_PULL_EVICT) { if (settings.evict_to_free == 0) { /* Don't think we need a counter for this. It'll OOM. */ break; } itemstats[id].evicted++; itemstats[id].evicted_time = current_time - search->time; if (search->exptime != 0) itemstats[id].evicted_nonzero++; if ((search->it_flags & ITEM_FETCHED) == 0) { itemstats[id].evicted_unfetched++; } LOGGER_LOG(NULL, LOG_EVICTIONS, LOGGER_EVICTION, search); do_item_unlink_nolock(search, hv); removed++; if (settings.slab_automove == 2) { slabs_reassign(-1, orig_id); } } else if ((search->it_flags & ITEM_ACTIVE) != 0 && settings.lru_maintainer_thread) { itemstats[id].moves_to_warm++; search->it_flags &= ~ITEM_ACTIVE; move_to_lru = WARM_LRU; do_item_unlink_q(search); removed++; } break; } if (it != NULL) break; } pthread_mutex_unlock(&lru_locks[id]); if (it != NULL) { if (move_to_lru) { it->slabs_clsid = ITEM_clsid(it); it->slabs_clsid |= move_to_lru; item_link_q(it); } do_item_remove(it); item_trylock_unlock(hold_lock); } return removed; }
//分配一个item,这个函数包含了memcached具体item分配的逻辑 item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_time_t exptime, const int nbytes, const uint32_t cur_hv) { uint8_t nsuffix; item *it = NULL; char suffix[40]; size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix); //item总大小 if (settings.use_cas) { ntotal += sizeof(uint64_t); //如果有用到cas,那么item大小还要加上unit64_t的size } unsigned int id = slabs_clsid(ntotal); //根据item的大小,找到适合的slabclass if (id == 0) return 0; mutex_lock(&cache_lock); //cache锁 /* do a quick check if we have any expired items in the tail.. */ int tries = 5; int tried_alloc = 0; item *search; void *hold_lock = NULL; rel_time_t oldest_live = settings.oldest_live; search = tails[id]; //全局变量,tails[x]是id为x的slabclass lru链表的尾部 /* We walk up *only* for locked items. Never searching for expired. * Waste of CPU for almost all deployments */ //首先从lru链表尾部查找有没有过期的item,tries = 5,最多循环5次 //注意这里是最多查找5次,只要找到一个没有被其他地方引用的item,那么就不再继续查找,如果这个item过期,就使用这个item的空间,否则创建新的slab for (; tries > 0 && search != NULL; tries--, search=search->prev) { if (search->nbytes == 0 && search->nkey == 0 && search->it_flags == 1) { /* We are a crawler, ignore it. */ //这里只是搜索过期的item,对于异常的item,直接忽略继续查找 tries++; continue; } //计算item的hash值,hv有两个作用:1.用于hash表保存item 2.用于item lock表中锁住item,通过hv计算出item_lock中哪个锁对当前item加锁 //不同item的hash值可能相同,hash表中用链表的方式解决冲突;item lock中多个item共享一个锁 uint32_t hv = hash(ITEM_key(search), search->nkey); /* Attempt to hash item lock the "search" item. If locked, no * other callers can incr the refcount */ /* Don't accidentally grab ourselves, or bail if we can't quicklock */ //锁住当前item if (hv == cur_hv || (hold_lock = item_trylock(hv)) == NULL) continue; /* Now see if the item is refcount locked */ //检查这个指向的这个item是否被其他地方引用,如果是的话,继续向前查找 if (refcount_incr(&search->refcount) != 2) { refcount_decr(&search->refcount); /* Old rare bug could cause a refcount leak. We haven't seen * it in years, but we leave this code in to prevent failures * just in case */ if (settings.tail_repair_time && search->time + settings.tail_repair_time < current_time) { itemstats[id].tailrepairs++; search->refcount = 1; do_item_unlink_nolock(search, hv); } if (hold_lock) item_trylock_unlock(hold_lock); continue; } /* Expired or flushed */ //如果找到过期的item if ((search->exptime != 0 && search->exptime < current_time) || (search->time <= oldest_live && oldest_live <= current_time)) { itemstats[id].reclaimed++; if ((search->it_flags & ITEM_FETCHED) == 0) { itemstats[id].expired_unfetched++; } it = search; //更新统计数据 slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal); //把旧的item从hash表和LRU链表中移除 do_item_unlink_nolock(it, hv); /* Initialize the item block: */ it->slabs_clsid = 0; } //如果没有找到过期的item,则调用slabs_alloc分配空间 //如果slabs_alloc返回null,表示分配失败,内存空间已满 //需要按LRU进行淘汰 else if ((it = slabs_alloc(ntotal, id)) == NULL) { tried_alloc = 1; //标记一下,表示有尝试调用slabs_alloc分配空间 //记录被淘汰item的信息, 使用memcached经常会查看的evicted_time就是在这里赋值的 if (settings.evict_to_free == 0) { itemstats[id].outofmemory++; } else { itemstats[id].evicted++; itemstats[id].evicted_time = current_time - search->time; //被淘汰item距离上次使用的时间 if (search->exptime != 0) itemstats[id].evicted_nonzero++; if ((search->it_flags & ITEM_FETCHED) == 0) { itemstats[id].evicted_unfetched++; } it = search; slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal); //更新统计数据 do_item_unlink_nolock(it, hv); //从hash表和LRU链表中移除 /* Initialize the item block: */ it->slabs_clsid = 0; /* If we've just evicted an item, and the automover is set to * angry bird mode, attempt to rip memory into this slab class. * TODO: Move valid object detection into a function, and on a * "successful" memory pull, look behind and see if the next alloc * would be an eviction. Then kick off the slab mover before the * eviction happens. */ //默认情况下,slab_automove=1,会合理地更具淘汰统计数据来分析怎么进行slabclass空间的分配 //如果slab_automove=2,只要分配失败了,马上进行slabclass空间的重分配 if (settings.slab_automove == 2) slabs_reassign(-1, id); } } refcount_decr(&search->refcount); /* If hash values were equal, we don't grab a second lock */ if (hold_lock) item_trylock_unlock(hold_lock); break; } //查找5次过期的item都失败,并且也没有淘汰可用且没有过期的item //分配新的内存空间 if (!tried_alloc && (tries == 0 || search == NULL)) it = slabs_alloc(ntotal, id); //分配失败,返回null if (it == NULL) { itemstats[id].outofmemory++; mutex_unlock(&cache_lock); return NULL; } assert(it->slabs_clsid == 0); assert(it != heads[id]); /* Item initialization can happen outside of the lock; the item's already * been removed from the slab LRU. */ //item内存空间分配成功,做一些初始化工作 it->refcount = 1; /* the caller will have a reference */ mutex_unlock(&cache_lock); it->next = it->prev = it->h_next = 0; it->slabs_clsid = id; DEBUG_REFCNT(it, '*'); it->it_flags = settings.use_cas ? ITEM_CAS : 0; it->nkey = nkey; it->nbytes = nbytes; memcpy(ITEM_key(it), key, nkey); it->exptime = exptime; memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix); it->nsuffix = nsuffix; return it; }
/*@null@*/ item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_time_t exptime, const int nbytes) { uint8_t nsuffix; item *it = NULL; char suffix[40]; size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix); if (settings.use_cas) { ntotal += sizeof(uint64_t); } unsigned int id = slabs_clsid(ntotal); if (id == 0) return 0; mutex_lock(&cache_lock); /* do a quick check if we have any expired items in the tail.. */ item *search; rel_time_t oldest_live = settings.oldest_live; search = tails[id]; if (search != NULL && (refcount_incr(&search->refcount) == 2)) { if ((search->exptime != 0 && search->exptime < current_time) || (search->time <= oldest_live && oldest_live <= current_time)) { // dead by flush STATS_LOCK(); stats.reclaimed++; STATS_UNLOCK(); itemstats[id].reclaimed++; if ((search->it_flags & ITEM_FETCHED) == 0) { STATS_LOCK(); stats.expired_unfetched++; STATS_UNLOCK(); itemstats[id].expired_unfetched++; } it = search; slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal); do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0)); /* Initialize the item block: */ it->slabs_clsid = 0; } else if ((it = slabs_alloc(ntotal, id)) == NULL) { if (settings.evict_to_free == 0) { itemstats[id].outofmemory++; mutex_unlock(&cache_lock); return NULL; } itemstats[id].evicted++; itemstats[id].evicted_time = current_time - search->time; if (search->exptime != 0) itemstats[id].evicted_nonzero++; if ((search->it_flags & ITEM_FETCHED) == 0) { STATS_LOCK(); stats.evicted_unfetched++; STATS_UNLOCK(); itemstats[id].evicted_unfetched++; } STATS_LOCK(); stats.evictions++; STATS_UNLOCK(); it = search; slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal); do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0)); /* Initialize the item block: */ it->slabs_clsid = 0; /* If we've just evicted an item, and the automover is set to * angry bird mode, attempt to rip memory into this slab class. * TODO: Move valid object detection into a function, and on a * "successful" memory pull, look behind and see if the next alloc * would be an eviction. Then kick off the slab mover before the * eviction happens. */ if (settings.slab_automove == 2) slabs_reassign(-1, id, 1); } else { refcount_decr(&search->refcount); } } else { /* If the LRU is empty or locked, attempt to allocate memory */ it = slabs_alloc(ntotal, id); if (search != NULL) refcount_decr(&search->refcount); } if (it == NULL) { itemstats[id].outofmemory++; /* Last ditch effort. There was a very rare bug which caused * refcount leaks. We leave this just in case they ever happen again. * We can reasonably assume no item can stay locked for more than * three hours, so if we find one in the tail which is that old, * free it anyway. */ if (search != NULL && search->refcount != 2 && search->time + TAIL_REPAIR_TIME < current_time) { itemstats[id].tailrepairs++; search->refcount = 1; do_item_unlink_nolock(search, hash(ITEM_key(search), search->nkey, 0)); } mutex_unlock(&cache_lock); return NULL; } assert(it->slabs_clsid == 0); assert(it != heads[id]); /* Item initialization can happen outside of the lock; the item's already * been removed from the slab LRU. */ it->refcount = 1; /* the caller will have a reference */ mutex_unlock(&cache_lock); it->next = it->prev = it->h_next = 0; it->slabs_clsid = id; DEBUG_REFCNT(it, '*'); it->it_flags = settings.use_cas ? ITEM_CAS : 0; it->nkey = nkey; it->nbytes = nbytes; memcpy(ITEM_key(it), key, nkey); it->exptime = exptime; memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix); it->nsuffix = nsuffix; return it; }
void process_command(conn *c, char *command) { int comm = 0; int incr = 0; /* * for commands set/add/replace, we build an item and read the data * directly into it, then continue in nread_complete(). */ if (settings.verbose > 1) fprintf(stderr, "<%d %s\n", c->sfd, command); /* All incoming commands will require a response, so we cork at the beginning, and uncork at the very end (usually by means of out_string) */ set_cork(c, 1); if ((strncmp(command, "add ", 4) == 0 && (comm = NREAD_ADD)) || (strncmp(command, "set ", 4) == 0 && (comm = NREAD_SET)) || (strncmp(command, "replace ", 8) == 0 && (comm = NREAD_REPLACE))) { char key[251]; int flags; time_t expire; int len, res; item *it; res = sscanf(command, "%*s %250s %u %ld %d\n", key, &flags, &expire, &len); if (res!=4 || strlen(key)==0 ) { out_string(c, "CLIENT_ERROR bad command line format"); return; } expire = realtime(expire); it = item_alloc(key, flags, expire, len+2); if (it == 0) { out_string(c, "SERVER_ERROR out of memory"); /* swallow the data line */ c->write_and_go = conn_swallow; c->sbytes = len+2; return; } c->item_comm = comm; c->item = it; c->rcurr = ITEM_data(it); c->rlbytes = it->nbytes; c->state = conn_nread; return; } if ((strncmp(command, "incr ", 5) == 0 && (incr = 1)) || (strncmp(command, "decr ", 5) == 0)) { char temp[32]; unsigned int value; item *it; unsigned int delta; char key[251]; int res; char *ptr; time_t now = time(0); res = sscanf(command, "%*s %250s %u\n", key, &delta); if (res!=2 || strlen(key)==0 ) { out_string(c, "CLIENT_ERROR bad command line format"); return; } it = assoc_find(key); if (it && (it->it_flags & ITEM_DELETED)) { it = 0; } if (it && it->exptime && it->exptime < now) { item_unlink(it); it = 0; } if (!it) { out_string(c, "NOT_FOUND"); return; } ptr = ITEM_data(it); while (*ptr && (*ptr<'0' && *ptr>'9')) ptr++; value = atoi(ptr); if (incr) value+=delta; else { if (delta >= value) value = 0; else value-=delta; } sprintf(temp, "%u", value); res = strlen(temp); if (res + 2 > it->nbytes) { /* need to realloc */ item *new_it; new_it = item_alloc(ITEM_key(it), it->flags, it->exptime, res + 2 ); if (new_it == 0) { out_string(c, "SERVER_ERROR out of memory"); return; } memcpy(ITEM_data(new_it), temp, res); memcpy(ITEM_data(new_it) + res, "\r\n", 2); item_replace(it, new_it); } else { /* replace in-place */ memcpy(ITEM_data(it), temp, res); memset(ITEM_data(it) + res, ' ', it->nbytes-res-2); } out_string(c, temp); return; } if (strncmp(command, "get ", 4) == 0) { char *start = command + 4; char key[251]; int next; int i = 0; item *it; time_t now = time(0); while(sscanf(start, " %250s%n", key, &next) >= 1) { start+=next; stats.get_cmds++; it = assoc_find(key); if (it && (it->it_flags & ITEM_DELETED)) { it = 0; } if (settings.oldest_live && it && it->time <= settings.oldest_live) { item_unlink(it); it = 0; } if (it && it->exptime && it->exptime < now) { item_unlink(it); it = 0; } if (it) { if (i >= c->isize) { item **new_list = realloc(c->ilist, sizeof(item *)*c->isize*2); if (new_list) { c->isize *= 2; c->ilist = new_list; } else break; } stats.get_hits++; it->refcount++; item_update(it); *(c->ilist + i) = it; i++; } else stats.get_misses++; } c->icurr = c->ilist; c->ileft = i; if (c->ileft) { c->ipart = 0; c->state = conn_mwrite; c->ibytes = 0; return; } else { out_string(c, "END"); return; } } if (strncmp(command, "delete ", 7) == 0) { char key[251]; item *it; int res; time_t exptime = 0; res = sscanf(command, "%*s %250s %ld", key, &exptime); it = assoc_find(key); if (!it) { out_string(c, "NOT_FOUND"); return; } if (exptime == 0) { item_unlink(it); out_string(c, "DELETED"); return; } if (delcurr >= deltotal) { item **new_delete = realloc(todelete, sizeof(item *) * deltotal * 2); if (new_delete) { todelete = new_delete; deltotal *= 2; } else { /* * can't delete it immediately, user wants a delay, * but we ran out of memory for the delete queue */ out_string(c, "SERVER_ERROR out of memory"); return; } } exptime = realtime(exptime); it->refcount++; /* use its expiration time as its deletion time now */ it->exptime = exptime; it->it_flags |= ITEM_DELETED; todelete[delcurr++] = it; out_string(c, "DELETED"); return; } if (strncmp(command, "stats", 5) == 0) { process_stat(c, command); return; } if (strcmp(command, "flush_all") == 0) { settings.oldest_live = time(0); out_string(c, "OK"); return; } if (strcmp(command, "version") == 0) { out_string(c, "VERSION " VERSION); return; } if (strcmp(command, "quit") == 0) { c->state = conn_closing; return; } if (strncmp(command, "slabs reassign ", 15) == 0) { int src, dst; char *start = command+15; if (sscanf(start, "%u %u\r\n", &src, &dst) == 2) { int rv = slabs_reassign(src, dst); if (rv == 1) { out_string(c, "DONE"); return; } if (rv == 0) { out_string(c, "CANT"); return; } if (rv == -1) { out_string(c, "BUSY"); return; } } out_string(c, "CLIENT_ERROR bogus command"); return; } out_string(c, "ERROR"); return; }
/* 从 slab 系统分配一个空闲 item */ item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_time_t exptime, const int nbytes, const uint32_t cur_hv) { uint8_t nsuffix; item *it = NULL; char suffix[40]; size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix); if (settings.use_cas) { ntotal += sizeof(uint64_t); } unsigned int id = slabs_clsid(ntotal); if (id == 0) return 0; mutex_lock(&cache_lock); /* do a quick check if we have any expired items in the tail.. */ int tries = 5; int tried_alloc = 0; item *search; void *hold_lock = NULL; rel_time_t oldest_live = settings.oldest_live; search = tails[id]; /* We walk up *only* for locked items. Never searching for expired. * Waste of CPU for almost all deployments */ for (; tries > 0 && search != NULL; tries--, search=search->prev) { uint32_t hv = hash(ITEM_key(search), search->nkey, 0); /* Attempt to hash item lock the "search" item. If locked, no * other callers can incr the refcount */ /* FIXME: I think we need to mask the hv here for comparison? */ if (hv != cur_hv && (hold_lock = item_trylock(hv)) == NULL) continue; /* Now see if the item is refcount locked */ if (refcount_incr(&search->refcount) != 2) { refcount_decr(&search->refcount); /* Old rare bug could cause a refcount leak. We haven't seen * it in years, but we leave this code in to prevent failures * just in case */ if (search->time + TAIL_REPAIR_TIME < current_time) { itemstats[id].tailrepairs++; search->refcount = 1; do_item_unlink_nolock(search, hv); } if (hold_lock) item_trylock_unlock(hold_lock); continue; } /* 先检查 LRU 队列最后一个 item 是否超时, 超时的话就把这个 item 分配给用户 */ if ((search->exptime != 0 && search->exptime < current_time) || (search->time <= oldest_live && oldest_live <= current_time)) { itemstats[id].reclaimed++; if ((search->it_flags & ITEM_FETCHED) == 0) { itemstats[id].expired_unfetched++; } it = search; slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal); /* 把这个 item 从 LRU 队列和哈希表中移除 */ do_item_unlink_nolock(it, hv); /* Initialize the item block: */ it->slabs_clsid = 0; } /* 没有超时的item, 那就尝试从slabclass分配, 运气不好的话, 分配失败, 那就把 LRU 队列最后一个 item 剔除, 然后分配给用户 */ else if ((it = slabs_alloc(ntotal, id)) == NULL) { tried_alloc = 1; if (settings.evict_to_free == 0) { itemstats[id].outofmemory++;//显示出的统计信息 } else { itemstats[id].evicted++;//这个slab的分配失败次数加1,后面的分析统计信息的线程会用到这个统计信息 itemstats[id].evicted_time = current_time - search->time;//显示的统计信息 if (search->exptime != 0) itemstats[id].evicted_nonzero++; if ((search->it_flags & ITEM_FETCHED) == 0) { itemstats[id].evicted_unfetched++; } it = search; slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);//不用请求新的item了,减少相关的统计信息 /* 把老的item从hash表和lru队列中删除 */ do_item_unlink_nolock(it, hv); /* Initialize the item block: */ it->slabs_clsid = 0; /* If we've just 回收 an item, and the automover is set to angry bird mode, attempt to rip memory into this slab class. TODO: Move valid object detection into a function, and on a "successful" memory pull, look behind and see if the next alloc would be an eviction. Then kick off the slab mover before the eviction happens.可以看到如果slab_automove=2(默认是1),这样会导致angry模式,就是只要分配失败了,马上就选择一个slab(旧的slagclass 释放的),把这个slab移动到当前slab-class中(不会有通过统计信息有选择的移动slab)*/ if (settings.slab_automove == 2) slabs_reassign(-1, id); } } refcount_decr(&search->refcount); /* If hash values were equal, we don't grab a second lock */ if (hold_lock) item_trylock_unlock(hold_lock); break; } if (!tried_alloc && (tries == 0 || search == NULL)) it = slabs_alloc(ntotal, id); if (it == NULL) { itemstats[id].outofmemory++; mutex_unlock(&cache_lock); return NULL; } assert(it->slabs_clsid == 0); assert(it != heads[id]); /* Item initialization can happen outside of the lock; the item's already been removed from the slab LRU. */ it->refcount = 1; /* the caller will have a reference */ mutex_unlock(&cache_lock); it->next = it->prev = it->h_next = 0; it->slabs_clsid = id; DEBUG_REFCNT(it, '*'); it->it_flags = settings.use_cas ? ITEM_CAS : 0; it->nkey = nkey; it->nbytes = nbytes; memcpy(ITEM_key(it), key, nkey); it->exptime = exptime; memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix); it->nsuffix = nsuffix; return it; }