/* Note: this isn't an assoc_update.  The key must not already exist to call this */
int assoc_insert(struct default_engine *engine, uint32_t hash, hash_item *it) {
    unsigned int oldbucket;

    assert(assoc_find(engine, hash, item_get_key(it), it->nkey) == 0);  /* shouldn't have duplicately named things defined */

    // inserting actual hash_item to appropriate assoc_t
    if (engine->assoc.expanding &&
        (oldbucket = (hash & hashmask(engine->assoc.hashpower - 1))) >= engine->assoc.expand_bucket)
    {
        it->h_next = engine->assoc.old_hashtable[oldbucket];
        engine->assoc.old_hashtable[oldbucket] = it;
    } else {
        it->h_next = engine->assoc.primary_hashtable[hash & hashmask(engine->assoc.hashpower)];
        engine->assoc.primary_hashtable[hash & hashmask(engine->assoc.hashpower)] = it;
    }

    engine->assoc.hash_items++;
    if (! engine->assoc.expanding &&
        engine->assoc.hash_items > (hashsize(engine->assoc.hashpower) * 3) / 2)
    {
        assoc_expand(engine);
    }

    MEMCACHED_ASSOC_INSERT(item_get_key(it), it->nkey, engine->assoc.hash_items);
    return 1;
}
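For reference, the hashsize() and hashmask() helpers used throughout these functions are plain bit-twiddling macros. A minimal sketch, consistent with how memcached's assoc.c defines them (the exact integer type varies by version; upstream uses a ub4 typedef):

/* Sketch of the sizing macros these functions depend on. */
#include <stdint.h>
#define hashsize(n) ((uint64_t)1 << (n))   /* 2^n buckets */
#define hashmask(n) (hashsize(n) - 1)      /* low n bits select a bucket */

With these definitions the growth check above reads naturally: the table is expanded once hash_items exceeds 1.5 times hashsize(hashpower), i.e. a load factor of 3/2.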
/* Note: this isn't an assoc_update.  The key must not already exist to call this */
int assoc_insert(struct default_engine *engine, uint32_t hash, hash_item *it) {
    struct assoc *assoc = &engine->assoc;
    uint32_t bucket = GET_HASH_BUCKET(hash, assoc->hashmask);
    uint32_t tabidx;

    assert(assoc_find(engine, hash, item_get_key(it), it->nkey) == 0);  /* shouldn't have duplicately named things defined */

    if (assoc->infotable[bucket].curpower != assoc->rootpower &&
        assoc->infotable[bucket].refcount == 0) {
        redistribute(engine, bucket);
    }
    tabidx = GET_HASH_TABIDX(hash, assoc->hashpower,
                             hashmask(assoc->infotable[bucket].curpower));

    // inserting actual hash_item to appropriate assoc_t
    it->h_next = assoc->roottable[tabidx].hashtable[bucket];
    assoc->roottable[tabidx].hashtable[bucket] = it;

    assoc->hash_items++;
    if (assoc->hash_items > (hashsize(assoc->hashpower + assoc->rootpower) * 3) / 2) {
        assoc_expand(engine);
    }
    MEMCACHED_ASSOC_INSERT(item_get_key(it), it->nkey, assoc->hash_items);
    return 1;
}
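This variant spreads one logical bucket space across multiple root tables, so growth and per-bucket redistribute() replace the single big rehash of the other versions. The GET_HASH_BUCKET and GET_HASH_TABIDX macros are not shown in this excerpt; the following are hypothetical definitions, inferred only from the call sites above (low bits pick the bucket, the bits above hashpower pick which root table currently holds that bucket's chain):

/* Hypothetical definitions, inferred from usage; the real macros in this
 * engine's source may differ. */
#define GET_HASH_BUCKET(hash, mask)          ((hash) & (mask))
#define GET_HASH_TABIDX(hash, shift, mask)   (((hash) >> (shift)) & (mask))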
/* Note: this isn't an assoc_update.  The key must not already exist to call this */
int assoc_insert(item *it) {
    uint32_t hv;
    unsigned int oldbucket;

    assert(assoc_find(ITEM_key(it), it->nkey) == 0);  /* shouldn't have duplicately named things defined */

    hv = hash(ITEM_key(it), it->nkey, 0);
    if (expanding &&
        (oldbucket = (hv & hashmask(hashpower - 1))) >= expand_bucket)
    {
        it->h_next = old_hashtable[oldbucket];
        old_hashtable[oldbucket] = it;
    } else {
        it->h_next = primary_hashtable[hv & hashmask(hashpower)];
        primary_hashtable[hv & hashmask(hashpower)] = it;
    }

    hash_items++;
    if (! expanding && hash_items > (hashsize(hashpower) * 3) / 2) {
        assoc_expand();
    }

    MEMCACHED_ASSOC_INSERT(ITEM_key(it), it->nkey, hash_items);
    return 1;
}
static void *assoc_maintenance_thread(void *arg) {

    while (do_run_maintenance_thread) {
        int ii = 0;

        /* Lock the cache, and bulk move multiple buckets to the new
         * hash table. */
        item_lock_global();
        mutex_lock(&cache_lock);

        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            item *it, *next;
            int bucket;

            for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                next = it->h_next;

                bucket = hash(ITEM_key(it), it->nkey, 0) & hashmask(hashpower);
                it->h_next = primary_hashtable[bucket];
                primary_hashtable[bucket] = it;
            }

            old_hashtable[expand_bucket] = NULL;

            expand_bucket++;
            if (expand_bucket == hashsize(hashpower - 1)) {
                expanding = false;
                free(old_hashtable);
                STATS_LOCK();
                stats.hash_bytes -= hashsize(hashpower - 1) * sizeof(void *);
                stats.hash_is_expanding = 0;
                STATS_UNLOCK();
                if (settings.verbose > 1)
                    fprintf(stderr, "Hash table expansion done\n");
            }
        }

        mutex_unlock(&cache_lock);
        item_unlock_global();

        if (!expanding) {
            /* finished expanding. tell all threads to use fine-grained locks */
            switch_item_lock_type(ITEM_LOCK_GRANULAR);
            slabs_rebalancer_resume();
            /* We are done expanding.. just wait for next invocation */
            mutex_lock(&cache_lock);
            started_expanding = false;
            pthread_cond_wait(&maintenance_cond, &cache_lock);
            /* Before doing anything, tell threads to use a global lock */
            mutex_unlock(&cache_lock);
            slabs_rebalancer_pause();
            switch_item_lock_type(ITEM_LOCK_GLOBAL);
            mutex_lock(&cache_lock);
            assoc_expand();
            mutex_unlock(&cache_lock);
        }
    }
    return NULL;
}
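The pthread_cond_wait() above is woken from the insert path. A sketch of that counterpart, based on assoc_start_expand() in memcached's assoc.c:

/* Worker-side wakeup (sketch): called from the insert path once the item
 * count passes the 1.5x load factor.  started_expanding keeps multiple
 * workers from signaling the maintenance thread more than once per cycle. */
static void assoc_start_expand(void) {
    if (started_expanding)
        return;
    started_expanding = true;
    pthread_cond_signal(&maintenance_cond);
}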
/* Note: this isn't an assoc_update.  The key must not already exist to call this */
int assoc_insert(struct persistent_engine *engine, uint32_t hash, hash_item *it) {
    unsigned int oldbucket;

    assert(assoc_find(engine, hash, item_get_key(&it->item), it->item.nkey) == 0);  /* shouldn't have duplicately named things defined */

    if (engine->assoc.expanding &&
        (oldbucket = (hash & hashmask(engine->assoc.hashpower - 1))) >= engine->assoc.expand_bucket)
    {
        it->h_next = engine->assoc.old_hashtable[oldbucket];
        engine->assoc.old_hashtable[oldbucket] = it;
    } else {
        it->h_next = engine->assoc.primary_hashtable[hash & hashmask(engine->assoc.hashpower)];
        engine->assoc.primary_hashtable[hash & hashmask(engine->assoc.hashpower)] = it;
    }

    engine->assoc.hash_items++;
    if (! engine->assoc.expanding &&
        engine->assoc.hash_items > (hashsize(engine->assoc.hashpower) * 3) / 2)
    {
        assoc_expand(engine);
    }

    return 1;
}
static void *assoc_maintenance_thread(void *arg) {

    mutex_lock(&maintenance_lock);
    while (do_run_maintenance_thread) {
        int ii = 0;

        /* There is only one expansion thread, so no need to global lock. */
        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            item *it, *next;
            int bucket;
            void *item_lock = NULL;

            /* bucket = hv & hashmask(hashpower) => the bucket of the hash table
             * is the lowest N bits of the hv, and the bucket of item_locks is
             * also the lowest M bits of hv, and N is greater than M.
             * So we can process expanding with only one item_lock. cool! */
            if ((item_lock = item_trylock(expand_bucket))) {
                for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                    next = it->h_next;
                    bucket = hash(ITEM_key(it), it->nkey) & hashmask(hashpower);
                    it->h_next = primary_hashtable[bucket];
                    primary_hashtable[bucket] = it;
                }

                old_hashtable[expand_bucket] = NULL;

                expand_bucket++;
                if (expand_bucket == hashsize(hashpower - 1)) {
                    expanding = false;
                    free(old_hashtable);
                    STATS_LOCK();
                    stats.hash_bytes -= hashsize(hashpower - 1) * sizeof(void *);
                    stats.hash_is_expanding = 0;
                    STATS_UNLOCK();
                    if (settings.verbose > 1)
                        fprintf(stderr, "Hash table expansion done\n");
                }
            } else {
                usleep(10*1000);
            }

            if (item_lock) {
                item_trylock_unlock(item_lock);
                item_lock = NULL;
            }
        }

        if (!expanding) {
            /* We are done expanding.. just wait for next invocation */
            started_expanding = false;
            pthread_cond_wait(&maintenance_cond, &maintenance_lock);
            /* assoc_expand() swaps out the hash table entirely, so we need
             * all threads to not hold any references related to the hash
             * table while this happens.
             * This is instead of a more complex, possibly slower algorithm to
             * allow dynamic hash table expansion without causing significant
             * wait times. */
            pause_threads(PAUSE_ALL_THREADS);
            assoc_expand();
            pause_threads(RESUME_ALL_THREADS);
        }
    }
    return NULL;
}
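assoc_expand() itself only swaps the tables; the migration loops above do the real work. A sketch of it, based on memcached's assoc.c with the stats and verbose-logging bookkeeping trimmed:

/* Sketch of assoc_expand() (stats/logging trimmed): the primary table
 * becomes old_hashtable, a table of twice the size is allocated, and the
 * maintenance loop then migrates it one bucket chain at a time. */
static void assoc_expand(void) {
    old_hashtable = primary_hashtable;
    primary_hashtable = calloc(hashsize(hashpower + 1), sizeof(void *));
    if (primary_hashtable) {
        hashpower++;
        expanding = true;
        expand_bucket = 0;
    } else {
        /* Allocation failed: keep serving from the old table. */
        primary_hashtable = old_hashtable;
    }
}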
// The expansion (maintenance) thread is started from main().  After one pass it
// blocks on the condition variable maintenance_cond; when inserts push the item
// count past the threshold, the condition variable is signaled and it runs again.
static void *assoc_maintenance_thread(void *arg) {

    // do_run_maintenance_thread is a global variable with initial value 1; it is
    // set to 0 in stop_assoc_maintenance_thread() to terminate the migration thread.
    while (do_run_maintenance_thread) {
        int ii = 0;

        /* Lock the cache, and bulk move multiple buckets to the new
         * hash table. */
        item_lock_global();
        mutex_lock(&cache_lock);

        // hash_bulk_move controls how many buckets' items are moved per pass
        // (default: one).  The body runs only while expanding is true, so the
        // freshly created migration thread does not enter the loop.
        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            item *it, *next;
            int bucket;

            // Walk the old-table bucket named by expand_bucket and migrate all
            // of its items into the newly enlarged hash table.
            for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                next = it->h_next;
                bucket = hash(ITEM_key(it), it->nkey, 0) & hashmask(hashpower);
                it->h_next = primary_hashtable[bucket];
                primary_hashtable[bucket] = it;
            }

            old_hashtable[expand_bucket] = NULL;
            // One bucket migrated; point expand_bucket at the next one.
            expand_bucket++;
            // All data has been migrated.
            if (expand_bucket == hashsize(hashpower - 1)) {
                expanding = false;
                free(old_hashtable);
                STATS_LOCK();
                stats.hash_bytes -= hashsize(hashpower - 1) * sizeof(void *);
                stats.hash_is_expanding = 0;
                STATS_UNLOCK();
                if (settings.verbose > 1)
                    fprintf(stderr, "Hash table expansion done\n");
            }
        }

        // Release the locks.
        mutex_unlock(&cache_lock);
        item_unlock_global();

        // No more data to migrate.
        if (!expanding) {
            /* finished expanding. tell all threads to use fine-grained locks */
            switch_item_lock_type(ITEM_LOCK_GRANULAR);
            slabs_rebalancer_resume();
            /* We are done expanding.. just wait for next invocation */
            mutex_lock(&cache_lock);
            started_expanding = false;
            // Suspend the migration thread.  When a worker thread inserts an item
            // and finds the item count has reached 1.5x the table size, it calls
            // assoc_start_expand(), which calls pthread_cond_signal() to wake
            // this thread.
            pthread_cond_wait(&maintenance_cond, &cache_lock);
            /* Before doing anything, tell threads to use a global lock */
            mutex_unlock(&cache_lock);
            slabs_rebalancer_pause();
            switch_item_lock_type(ITEM_LOCK_GLOBAL);
            mutex_lock(&cache_lock);
            assoc_expand();  // allocate a larger table and set expanding to true
            mutex_unlock(&cache_lock);
        }
    }
    return NULL;
}
/* Expansion thread function.  The strategy:
 * The thread is created in main().  When, after an assoc_insert, the item count
 * exceeds 1.5x the hash table capacity, the thread is woken.
 * It first allocates a new table of twice the capacity, then migrates the data
 * from the old table to the new one.
 * Migration starts at old-table index 0 and moves one bucket at a time (the
 * granularity could be increased, but since migration takes a lock, a bigger
 * batch may lengthen the time worker threads wait for that lock).
 * When migration finishes, the old table is freed. */
static void *assoc_maintenance_thread(void *arg) {

    mutex_lock(&maintenance_lock);
    while (do_run_maintenance_thread) {
        int ii = 0;

        /* There is only one expansion thread, so no need to global lock. */
        // The body runs only while expanding is true, so the freshly created
        // migration thread does not enter the loop.
        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            item *it, *next;
            int bucket;
            void *item_lock = NULL;

            /* bucket = hv & hashmask(hashpower) => the bucket of the hash table
             * is the lowest N bits of the hv, and the bucket of item_locks is
             * also the lowest M bits of hv, and N is greater than M.
             * So we can process expanding with only one item_lock. cool! */
            // Acquire the lock covering this single bucket.
            if ((item_lock = item_trylock(expand_bucket))) {
                // Migrate every item in the bucket.
                for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                    next = it->h_next;
                    // Recompute the bucket index in the new table.
                    bucket = hash(ITEM_key(it), it->nkey) & hashmask(hashpower);
                    it->h_next = primary_hashtable[bucket];
                    primary_hashtable[bucket] = it;
                }

                old_hashtable[expand_bucket] = NULL;
                expand_bucket++;
                // Migration complete.
                if (expand_bucket == hashsize(hashpower - 1)) {
                    expanding = false;    // clear the expansion flag
                    free(old_hashtable);  // free the old table
                    STATS_LOCK();
                    stats.hash_bytes -= hashsize(hashpower - 1) * sizeof(void *);
                    stats.hash_is_expanding = 0;
                    STATS_UNLOCK();
                    if (settings.verbose > 1)
                        fprintf(stderr, "Hash table expansion done\n");
                }
            } else {
                usleep(10*1000);
            }

            if (item_lock) {
                item_trylock_unlock(item_lock);
                item_lock = NULL;
            }
        }

        if (!expanding) {
            /* We are done expanding.. just wait for next invocation */
            // Nothing to migrate: suspend this thread.  When a worker thread
            // inserts an item and finds the item count has reached 1.5x the
            // table size, it calls assoc_start_expand(), which calls
            // pthread_cond_signal() to wake this thread.
            started_expanding = false;
            pthread_cond_wait(&maintenance_cond, &maintenance_lock);
            /* assoc_expand() swaps out the hash table entirely, so we need
             * all threads to not hold any references related to the hash
             * table while this happens.
             * This is instead of a more complex, possibly slower algorithm to
             * allow dynamic hash table expansion without causing significant
             * wait times. */
            pause_threads(PAUSE_ALL_THREADS);
            assoc_expand();  // allocate a larger hash table and set expanding to true
            pause_threads(RESUME_ALL_THREADS);
        }
    }
    return NULL;
}
static void *assoc_maintenance_thread(void *arg) {

    mutex_lock(&maintenance_lock);
    // do_run_maintenance_thread is a global variable with initial value 1; it is
    // set to 0 in stop_assoc_maintenance_thread() to terminate the migration thread.
    while (do_run_maintenance_thread) {
        int ii = 0;

        /* There is only one expansion thread, so no need to global lock. */
        // hash_bulk_move controls how many buckets' items are moved per pass
        // (default: one).  The body runs only while expanding is true, so the
        // freshly created migration thread does not enter the loop.
        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            item *it, *next;
            int bucket;
            void *item_lock = NULL;

            /* bucket = hv & hashmask(hashpower) => the bucket of the hash table
             * is the lowest N bits of the hv, and the bucket of item_locks is
             * also the lowest M bits of hv, and N is greater than M.
             * So we can process expanding with only one item_lock. cool! */
            if ((item_lock = item_trylock(expand_bucket))) {
                // expand_bucket is reset to 0 in assoc_expand().  Walk the
                // old-table bucket named by expand_bucket and migrate all of
                // its items into the new hash table.
                for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                    next = it->h_next;
                    // Recompute the bucket to find the item's slot in the new table.
                    bucket = hash(ITEM_key(it), it->nkey) & hashmask(hashpower);
                    // Insert the item into the new table.
                    it->h_next = primary_hashtable[bucket];
                    primary_hashtable[bucket] = it;
                }
                // No need to clear the old bucket item by item; setting the
                // head of its collision chain to NULL is enough.
                old_hashtable[expand_bucket] = NULL;
                // One bucket migrated; point expand_bucket at the next one.
                expand_bucket++;
                if (expand_bucket == hashsize(hashpower - 1)) { // all data migrated
                    expanding = false;  // clear the expansion flag
                    free(old_hashtable);
                    STATS_LOCK();
                    stats.hash_bytes -= hashsize(hashpower - 1) * sizeof(void *);
                    stats.hash_is_expanding = 0;
                    STATS_UNLOCK();
                    if (settings.verbose > 1)
                        fprintf(stderr, "Hash table expansion done\n");
                }
            } else {
                usleep(10*1000);
            }

            if (item_lock) {
                // Release the per-bucket lock once this bucket has been processed.
                item_trylock_unlock(item_lock);
                item_lock = NULL;
            }
        }

        if (!expanding) { // no (more) data to migrate
            /* We are done expanding.. just wait for next invocation */
            started_expanding = false; // reset
            // Suspend the migration thread.  When a worker thread inserts an item
            // and finds the item count has reached 1.5x the table size, it calls
            // assoc_start_expand(), which calls pthread_cond_signal() to wake
            // this thread.
            pthread_cond_wait(&maintenance_cond, &maintenance_lock);
            /* assoc_expand() swaps out the hash table entirely, so we need
             * all threads to not hold any references related to the hash
             * table while this happens.
             * This is instead of a more complex, possibly slower algorithm to
             * allow dynamic hash table expansion without causing significant
             * wait times. */
            pause_threads(PAUSE_ALL_THREADS);
            assoc_expand(); // allocate a larger hash table and set expanding to true
            pause_threads(RESUME_ALL_THREADS);
        }
    }
    return NULL;
}
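The "only one item_lock" comment deserves a worked example. Every item in old_hashtable[expand_bucket] shares the low hashpower-1 bits of its hash, and the item-lock index is an even shorter low-bit prefix, so one trylock covers the whole chain and both possible destination buckets. A small self-contained check (the bit widths N and M are hypothetical):

#include <assert.h>
#include <stdint.h>

#define hashsize(n) ((uint64_t)1 << (n))
#define hashmask(n) (hashsize(n) - 1)

int main(void) {
    /* Hypothetical sizes: the new table uses N = 16 index bits, the old
     * table N-1 = 15, and item locks are indexed by the low M = 13 bits. */
    const unsigned N = 16, M = 13;
    uint32_t expand_bucket = 0x1234;  /* old-table bucket being migrated */
    /* Any hv that hashed into expand_bucket shares its low N-1 bits: */
    uint32_t hv = (0xABCDu << (N - 1)) | expand_bucket;
    assert((hv & hashmask(N - 1)) == expand_bucket);              /* same old bucket */
    assert((hv & hashmask(M)) == (expand_bucket & hashmask(M)));  /* same item lock  */
    /* The new bucket (low N bits) also shares those low M bits, so the
     * same lock guards the destination chain as well. */
    assert(((hv & hashmask(N)) & hashmask(M)) == (expand_bucket & hashmask(M)));
    return 0;
}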
// Entry function of the data-migration thread.
static void *assoc_maintenance_thread(void *arg) {

    // do_run_maintenance_thread is a global variable with initial value 1; it is
    // set to 0 in stop_assoc_maintenance_thread() to terminate the migration thread.
    while (do_run_maintenance_thread) {
        int ii = 0;

        /* Lock the cache, and bulk move multiple buckets to the new
         * hash table. */
        // Take the global-level lock: every item is under its control.
        item_lock_global();
        // Lock the items in the hash table; otherwise concurrent inserts and
        // deletes by other threads would leave the data inconsistent.  In item.c,
        // do_item_link and do_item_unlink also take cache_lock internally.
        mutex_lock(&cache_lock);

        // Migrate items.
        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            item *it, *next;
            int bucket;

            for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                next = it->h_next;
                bucket = hash(ITEM_key(it), it->nkey) & hashmask(hashpower);
                it->h_next = primary_hashtable[bucket];
                primary_hashtable[bucket] = it;
            }

            old_hashtable[expand_bucket] = NULL;
            expand_bucket++;
            if (expand_bucket == hashsize(hashpower - 1)) {
                expanding = false;
                free(old_hashtable);
                STATS_LOCK();
                stats.hash_bytes -= hashsize(hashpower - 1) * sizeof(void *);
                stats.hash_is_expanding = 0;
                STATS_UNLOCK();
                if (settings.verbose > 1)
                    fprintf(stderr, "Hash table expansion done\n");
            }
        }

        // The pass is done; release the locks.
        mutex_unlock(&cache_lock);
        item_unlock_global();

        // No more data to migrate.
        if (!expanding) {
            /* Why does the migration thread switch the workers' lock type in such a
             * roundabout way?  Couldn't it simply overwrite the item_lock_type member
             * of every thread's LIBEVENT_THREAD structure?  The problem is that the
             * migration thread has no idea what a worker is doing at that moment.  If
             * a worker is accessing an item and holds a segment-level lock, and its
             * lock type is switched to global underneath it, then when the worker
             * unlocks it will unlock the global lock (see the earlier item_lock and
             * item_unlock code), and the program crashes.  So the migration thread
             * cannot do the switch itself; it can only notify the workers, and each
             * worker switches on its own, once it has finished whatever it is doing.
             * After notifying all workers, the migration thread therefore calls
             * wait_for_thread_registration() and sleeps until every worker has
             * switched to the requested lock type. */
            /* finished expanding. tell all threads to use fine-grained locks */
            // Reaching here means no data needs migrating (expansion has stopped).
            // Tell all worker threads to switch to segment-level locks for item
            // access; this blocks until every worker has switched.
            switch_item_lock_type(ITEM_LOCK_GRANULAR);
            slabs_rebalancer_resume();
            /* We are done expanding.. just wait for next invocation */
            mutex_lock(&cache_lock);
            started_expanding = false; // reset
            // Suspend the migration thread.  When a worker thread inserts an item
            // and finds the item count has reached 1.5x the table size, it calls
            // assoc_start_expand(), which calls pthread_cond_signal() to wake
            // this thread.
            pthread_cond_wait(&maintenance_cond, &cache_lock);
            /* Before doing anything, tell threads to use a global lock */
            mutex_unlock(&cache_lock);
            slabs_rebalancer_pause();
            // Waking from maintenance_cond means it is time to expand the table and
            // migrate data again.  While migrating a bucket this thread holds the
            // global-level lock, so workers must not use segment-level locks: all
            // workers and the migration thread contend for the global lock, and
            // only the winner may touch items.  The next line tells all workers to
            // switch their item-access lock to the global level;
            // switch_item_lock_type() sleeps on a condition variable until every
            // worker has switched.
            switch_item_lock_type(ITEM_LOCK_GLOBAL);
            mutex_lock(&cache_lock);
            // Allocate a larger hash table and set expanding to true.
            assoc_expand();
            mutex_unlock(&cache_lock);
        }
    }
    return NULL;
}
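For completeness, here is a sketch of the notify-and-wait mechanism the long comment above describes, modeled on switch_item_lock_type() in memcached's thread.c of that era; names such as init_lock, init_count, and notify_send_fd are as I recall them and may differ by version:

/* Sketch: each worker is told through its notify pipe which lock type to
 * adopt, and the caller sleeps in wait_for_thread_registration() until
 * every worker has acknowledged the switch. */
void switch_item_lock_type(enum item_lock_types type) {
    char buf[1];
    int i;

    switch (type) {
        case ITEM_LOCK_GRANULAR:
            buf[0] = 'l';  /* ask workers for segment-level locks */
            break;
        case ITEM_LOCK_GLOBAL:
            buf[0] = 'g';  /* ask workers for the global lock */
            break;
        default:
            fprintf(stderr, "Unknown lock type: %d\n", type);
            assert(1 == 0);
            break;
    }

    pthread_mutex_lock(&init_lock);
    init_count = 0;
    for (i = 0; i < settings.num_threads; i++) {
        if (write(threads[i].notify_send_fd, buf, 1) != 1) {
            perror("Failed writing to notify pipe");
        }
    }
    /* Blocks until every worker has registered the new lock type. */
    wait_for_thread_registration(settings.num_threads);
    pthread_mutex_unlock(&init_lock);
}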