ENGINE_ERROR_CODE assoc_init(struct default_engine *engine) { engine->assoc.primary_hashtable = calloc(hashsize(engine->assoc.hashpower), sizeof(void *)); return (engine->assoc.primary_hashtable != NULL) ? ENGINE_SUCCESS : ENGINE_ENOMEM; }
/*
 * Maintenance thread for the shadow hash table.  While "expanding" is true it
 * incrementally re-chains buckets from old_hashtable into primary_hashtable,
 * hash_bulk_move buckets per pass; otherwise it sleeps on
 * maintenance_shadow_cond until the next expansion is requested.
 */
static void *shadow_assoc_maintenance_thread(void *arg) {
    mutex_lock(&maintenance_shadow_lock);
    /* do_run_maintenance_thread stays non-zero until shutdown asks this
     * thread to exit. */
    while (do_run_maintenance_thread) {
        int ii = 0;

        /* There is only one expansion thread, so no need to global lock. */
        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            shadow_item *it, *next;
            int bucket;
            void *item_lock = NULL;

            /* bucket = hv & hashmask(shadow_hashpower) =>the bucket of hash table
             * is the lowest N bits of the hv, and the bucket of item_locks is
             * also the lowest M bits of hv, and N is greater than M.
             * So we can process expanding with only one item_lock. cool!
             */
            if ((item_lock = item_trylock(expand_bucket))) {
                /* Move every item of the current old bucket onto the head of
                 * its recomputed bucket chain in the grown table. */
                for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                    next = it->h_next;
                    bucket = hash(it->key, it->nkey) & hashmask(shadow_hashpower);
                    it->h_next = primary_hashtable[bucket];
                    primary_hashtable[bucket] = it;
                }
                old_hashtable[expand_bucket] = NULL;
                expand_bucket++;
                /* The old table had half the buckets of the new one, so this
                 * test means the last old bucket has been migrated. */
                if (expand_bucket == hashsize(shadow_hashpower - 1)) {
                    expanding = false;
                    free(old_hashtable);
                    STATS_LOCK();
                    stats.hash_bytes -= hashsize(shadow_hashpower - 1) * sizeof(void *);
                    stats.hash_is_expanding = 0;
                    STATS_UNLOCK();
                    if (settings.verbose > 1)
                        fprintf(stderr, "Hash table expansion done\n");
                }
            } else {
                /* Bucket lock is busy; back off 10 ms and retry. */
                usleep(10*1000);
            }

            if (item_lock) {
                /* Release the bucket lock after finishing this batch. */
                item_trylock_unlock(item_lock);
                item_lock = NULL;
            }
        }

        if (!expanding) {
            /* We are done expanding.. just wait for next invocation */
            started_expanding = false;
            pthread_cond_wait(&maintenance_shadow_cond, &maintenance_shadow_lock);
            /* shadow_assoc_expand() swaps out the hash table entirely, so we need
             * all threads to not hold any references related to the hash
             * table while this happens.
             * This is instead of a more complex, possibly slower algorithm to
             * allow dynamic hash table expansion without causing significant
             * wait times. */
            pause_threads(PAUSE_ALL_THREADS);
            shadow_assoc_expand();
            pause_threads(RESUME_ALL_THREADS);
        }
    }
    return NULL;
}
/*
 * Hash-table maintenance (migration) thread, per-bucket-trylock variant.
 * While "expanding" is set it moves items, bucket by bucket, from
 * old_hashtable to primary_hashtable; otherwise it sleeps on maintenance_cond
 * until assoc_start_expand() signals that another expansion is needed.
 */
static void *assoc_maintenance_thread(void *arg) {
    mutex_lock(&maintenance_lock);
    /* do_run_maintenance_thread is a global flag, initially 1; it is set to 0
     * by stop_assoc_maintenance_thread() to terminate this migration thread. */
    while (do_run_maintenance_thread) {
        int ii = 0;

        /* There is only one expansion thread, so no need to global lock. */
        /* hash_bulk_move controls how many buckets are migrated per pass
         * (default: one).  The loop body only runs while "expanding" is true,
         * so a freshly created thread skips straight to the wait below. */
        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            item *it, *next;
            int bucket;
            void *item_lock = NULL;

            /* bucket = hv & hashmask(hashpower) =>the bucket of hash table
             * is the lowest N bits of the hv, and the bucket of item_locks is
             * also the lowest M bits of hv, and N is greater than M.
             * So we can process expanding with only one item_lock. cool!
             */
            if ((item_lock = item_trylock(expand_bucket))) {
                /* expand_bucket is reset to 0 in assoc_expand().  Walk the
                 * bucket it names in the old table and migrate every item
                 * into the new table. */
                for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                    next = it->h_next;
                    /* Recompute the hash to find this item's slot in the new,
                     * larger table. */
                    bucket = hash(ITEM_key(it), it->nkey) & hashmask(hashpower);
                    /* Push the item onto the head of the new bucket's chain. */
                    it->h_next = primary_hashtable[bucket];
                    primary_hashtable[bucket] = it;
                }
                /* No per-item cleanup needed; just drop the old chain head. */
                old_hashtable[expand_bucket] = NULL;
                /* Advance to the next bucket awaiting migration. */
                expand_bucket++;
                if (expand_bucket == hashsize(hashpower - 1)) {
                    /* All buckets migrated: expansion is complete. */
                    expanding = false;
                    free(old_hashtable);
                    STATS_LOCK();
                    stats.hash_bytes -= hashsize(hashpower - 1) * sizeof(void *);
                    stats.hash_is_expanding = 0;
                    STATS_UNLOCK();
                    if (settings.verbose > 1)
                        fprintf(stderr, "Hash table expansion done\n");
                }
            } else {
                /* Bucket lock busy; back off 10 ms and retry. */
                usleep(10*1000);
            }

            if (item_lock) {
                /* Release the lock after finishing this batch of buckets. */
                item_trylock_unlock(item_lock);
                item_lock = NULL;
            }
        }

        if (!expanding) {
            /* We are done expanding.. just wait for next invocation */
            started_expanding = false;
            /* Park this thread until a worker notices the item count has
             * reached 1.5x the table size and calls assoc_start_expand(),
             * which wakes us via pthread_cond_signal on maintenance_cond. */
            pthread_cond_wait(&maintenance_cond, &maintenance_lock);
            /* assoc_expand() swaps out the hash table entirely, so we need
             * all threads to not hold any references related to the hash
             * table while this happens.
             * This is instead of a more complex, possibly slower algorithm to
             * allow dynamic hash table expansion without causing significant
             * wait times. */
            pause_threads(PAUSE_ALL_THREADS);
            /* Allocates the bigger hash table and sets expanding = true. */
            assoc_expand();
            pause_threads(RESUME_ALL_THREADS);
        }
    }
    return NULL;
}
/*
 * Hash-table maintenance (migration) thread callback — global-lock variant.
 * Migrates buckets from old_hashtable to primary_hashtable while holding the
 * global item lock plus cache_lock, and coordinates the worker threads' item
 * lock type (granular vs. global) around each expansion.
 */
static void *assoc_maintenance_thread(void *arg) {
    /* do_run_maintenance_thread is a global flag, initially 1; it is set to 0
     * by stop_assoc_maintenance_thread() to terminate this migration thread. */
    while (do_run_maintenance_thread) {
        int ii = 0;

        /* Lock the cache, and bulk move multiple buckets to the new
         * hash table. */
        /* Take the global item lock: all items are under its control, so no
         * worker may touch an item while a bucket is being migrated. */
        item_lock_global();
        /* Also hold cache_lock so concurrent add/remove on the hash table
         * stays consistent — do_item_link() and do_item_unlink() in item.c
         * acquire the same cache_lock internally. */
        mutex_lock(&cache_lock);

        /* Migrate up to hash_bulk_move buckets this pass. */
        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            item *it, *next;
            int bucket;

            for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                next = it->h_next;
                bucket = hash(ITEM_key(it), it->nkey) & hashmask(hashpower);
                it->h_next = primary_hashtable[bucket];
                primary_hashtable[bucket] = it;
            }
            old_hashtable[expand_bucket] = NULL;
            expand_bucket++;
            if (expand_bucket == hashsize(hashpower - 1)) {
                expanding = false;
                free(old_hashtable);
                STATS_LOCK();
                stats.hash_bytes -= hashsize(hashpower - 1) * sizeof(void *);
                stats.hash_is_expanding = 0;
                STATS_UNLOCK();
                if (settings.verbose > 1)
                    fprintf(stderr, "Hash table expansion done\n");
            }
        }

        /* Batch done: release both locks. */
        mutex_unlock(&cache_lock);
        item_unlock_global();

        /* No (more) data to migrate. */
        if (!expanding) {
            /* Why does the migration thread switch the workers' lock type so
             * indirectly instead of just rewriting each LIBEVENT_THREAD's
             * item_lock_type field itself?  Because it cannot know what a
             * worker is doing at that instant.  If a worker is mid-access
             * holding a segment-level lock and its type were flipped to the
             * global lock, its later unlock would release the wrong lock
             * (see item_lock/item_unlock) and the program would crash.  So
             * the migration thread only *notifies* the workers; each worker
             * switches itself once it finishes its current work.  After
             * notifying everyone, switch_item_lock_type() sleeps in
             * wait_for_thread_registration() until every worker has switched
             * to the requested lock type. */
            /* finished expanding. tell all threads to use fine-grained locks */
            /* Expansion has stopped: tell all workers to go back to
             * segment-level (granular) item locks.  Blocks until every
             * worker has switched. */
            switch_item_lock_type(ITEM_LOCK_GRANULAR);
            slabs_rebalancer_resume();
            /* We are done expanding.. just wait for next invocation */
            mutex_lock(&cache_lock);
            /* Reset for the next expansion request. */
            started_expanding = false;
            /* Sleep until a worker finds the item count has reached 1.5x the
             * table size and calls assoc_start_expand(), which signals
             * maintenance_cond via pthread_cond_signal. */
            pthread_cond_wait(&maintenance_cond, &cache_lock);
            /* Before doing anything, tell threads to use a global lock */
            mutex_unlock(&cache_lock);
            slabs_rebalancer_pause();
            /* Woken from maintenance_cond: another expansion/migration is
             * starting.  While migrating a bucket this thread holds the
             * global item lock, so workers must contend on that same global
             * lock rather than their segment locks — whoever wins it gets to
             * access items.  switch_item_lock_type() sleeps on a condition
             * variable until every worker has switched over. */
            switch_item_lock_type(ITEM_LOCK_GLOBAL);
            mutex_lock(&cache_lock);
            /* Allocate the bigger hash table and set expanding = true. */
            assoc_expand();
            mutex_unlock(&cache_lock);
        }
    }
    return NULL;
}
static ENGINE_ERROR_CODE do_assoc_get_prefix_stats(struct default_engine *engine, const char *prefix, const int nprefix, void *prefix_data) { struct assoc *assoc = &engine->assoc; prefix_t *pt; if (nprefix < 0) { // all prefix information char *buf; struct tm *t; const char *format = "PREFIX %s itm %llu kitm %llu litm %llu sitm %llu bitm %llu " "tsz %llu ktsz %llu ltsz %llu stsz %llu btsz %llu " "time %04d%02d%02d%02d%02d%02d\r\n"; uint32_t i, hsize = hashsize(DEFAULT_PREFIX_HASHPOWER); uint32_t num_prefixes = assoc->tot_prefix_items; uint32_t tot_prefix_name_len = 0; uint32_t msize, pos, written; pt = root_pt; if (pt != NULL && (pt->hash_items > 0 || pt->list_hash_items > 0 || pt->set_hash_items > 0 || pt->btree_hash_items > 0)) { /* including null prefix */ num_prefixes += 1; tot_prefix_name_len = strlen("<null>"); } for (i = 0; i < hsize; i++) { pt = assoc->prefix_hashtable[i]; while (pt) { tot_prefix_name_len += pt->nprefix; pt = pt->h_next; } } msize = sizeof(uint32_t) + strlen(format) + tot_prefix_name_len + num_prefixes * (strlen(format) - 2 /* %s */ + (10 * (20 - 4))) /* %llu replaced by 20-digit num */ - (5 * (4 - 2)) /* %02d replaced by 2-digit num */ + sizeof("END\r\n"); buf = malloc(msize); if (buf == NULL) { return ENGINE_ENOMEM; } pos = sizeof(uint32_t); pt = root_pt; if (pt != NULL && (pt->hash_items > 0 || pt->list_hash_items > 0 || pt->set_hash_items > 0 || pt->btree_hash_items > 0)) { /* including null prefix */ t = localtime(&pt->create_time); written = snprintf(buf+pos, msize-pos, format, "<null>", pt->hash_items+pt->list_hash_items+pt->set_hash_items+pt->btree_hash_items, pt->hash_items,pt->list_hash_items,pt->set_hash_items,pt->btree_hash_items, pt->hash_items_bytes+pt->list_hash_items_bytes+pt->set_hash_items_bytes+pt->btree_hash_items_bytes, pt->hash_items_bytes,pt->list_hash_items_bytes,pt->set_hash_items_bytes,pt->btree_hash_items_bytes, t->tm_year+1900, t->tm_mon+1, t->tm_mday, t->tm_hour, t->tm_min, t->tm_sec); pos += written; } for (i = 
0; i < hsize; i++) { pt = assoc->prefix_hashtable[i]; while (pt) { t = localtime(&pt->create_time); written = snprintf(buf+pos, msize-pos, format, _get_prefix(pt), pt->hash_items+pt->list_hash_items+pt->set_hash_items+pt->btree_hash_items, pt->hash_items,pt->list_hash_items,pt->set_hash_items,pt->btree_hash_items, pt->hash_items_bytes+pt->list_hash_items_bytes+pt->set_hash_items_bytes+pt->btree_hash_items_bytes, pt->hash_items_bytes,pt->list_hash_items_bytes,pt->set_hash_items_bytes,pt->btree_hash_items_bytes, t->tm_year+1900, t->tm_mon+1, t->tm_mday, t->tm_hour, t->tm_min, t->tm_sec); pos += written; assert(pos < msize); pt = pt->h_next; } } memcpy(buf+pos, "END\r\n", 6); *(uint32_t*)buf = pos + 5 - sizeof(uint32_t); *(char**)prefix_data = buf; } else { prefix_engine_stats *prefix_stats = (prefix_engine_stats*)prefix_data; if (prefix != NULL) { pt = assoc_prefix_find(engine, engine->server.core->hash(prefix,nprefix,0), prefix, nprefix); } else { pt = root_pt; } if (pt == NULL) { return ENGINE_PREFIX_ENOENT; } prefix_stats->hash_items = pt->hash_items; prefix_stats->hash_items_bytes = pt->hash_items_bytes; prefix_stats->prefix_items = pt->prefix_items; if (prefix != NULL) prefix_stats->tot_prefix_items = pt->prefix_items; else prefix_stats->tot_prefix_items = assoc->tot_prefix_items; } return ENGINE_SUCCESS; }
/* * Initializes the thread subsystem, creating various worker threads. * * nthreads Number of worker event handler threads to spawn * main_base Event base for main thread */ void thread_init(int nthreads, struct event_base *main_base, void *(*join_request_listener_thread_routine)(void *), void *(*joining_thread_routine)(void *), void *(*node_removal_listener_thread_routine)(void *),void *(*node_propagation_thread_routine)(void *)) { int i; int power; pthread_mutex_init(&cache_lock, NULL); pthread_mutex_init(&stats_lock, NULL); pthread_mutex_init(&init_lock, NULL); pthread_cond_init(&init_cond, NULL); pthread_mutex_init(&cqi_freelist_lock, NULL); cqi_freelist = NULL; /* Want a wide lock table, but don't waste memory */ if (nthreads < 3) { power = 10; } else if (nthreads < 4) { power = 11; } else if (nthreads < 5) { power = 12; } else { /* 8192 buckets, and central locks don't scale much past 5 threads */ power = 13; } item_lock_count = hashsize(power); item_locks = calloc(item_lock_count, sizeof(pthread_mutex_t)); if (! item_locks) { perror("Can't allocate item locks"); exit(1); } for (i = 0; i < item_lock_count; i++) { pthread_mutex_init(&item_locks[i], NULL); } pthread_key_create(&item_lock_type_key, NULL); pthread_mutex_init(&item_global_lock, NULL); threads = calloc(nthreads, sizeof(LIBEVENT_THREAD)); if (! threads) { perror("Can't allocate thread descriptors"); exit(1); } dispatcher_thread.base = main_base; dispatcher_thread.thread_id = pthread_self(); for (i = 0; i < nthreads; i++) { int fds[2]; if (pipe(fds)) { perror("Can't create notify pipe"); exit(1); } threads[i].notify_receive_fd = fds[0]; threads[i].notify_send_fd = fds[1]; setup_thread(&threads[i]); /* Reserve three fds for the libevent base, and two for the pipe */ stats.reserved_fds += 5; } /* Create threads after we've done all the libevent setup. */ for (i = 0; i < nthreads; i++) { create_worker(worker_libevent, &threads[i]); } /* Wait for all the threads to set themselves up before returning. 
*/ pthread_mutex_lock(&init_lock); wait_for_thread_registration(nthreads); pthread_mutex_unlock(&init_lock); if(joining_thread_routine != NULL) connect_to_join_server(joining_thread_routine); else start_listening_on_join_port(join_request_listener_thread_routine); usleep(1000); start_listening_on_node_propagation_port(node_propagation_thread_routine); usleep(1000); start_listening_on_node_removal_port(node_removal_listener_thread_routine); ///we may want to comment this pthread_join(connect_and_split_thread,NULL); }
/*
 * Initializes the thread subsystem, creating various worker threads.
 *
 * nthreads Number of worker event handler threads to spawn
 * main_base Event base for main thread (the dispatcher)
 */
void thread_init(int nthreads, struct event_base *main_base) {
    int i;
    int power;

    /* Initialize the mutexes/condition shared by the thread subsystem. */
    pthread_mutex_init(&cache_lock, NULL);
    pthread_mutex_init(&stats_lock, NULL);
    pthread_mutex_init(&init_lock, NULL);
    pthread_cond_init(&init_cond, NULL);
    pthread_mutex_init(&cqi_freelist_lock, NULL);
    cqi_freelist = NULL;

    /* Want a wide lock table, but don't waste memory */
    /* Pick the item-lock table size as a power of two based on nthreads. */
    if (nthreads < 3) {
        power = 10;
    } else if (nthreads < 4) {
        power = 11;
    } else if (nthreads < 5) {
        power = 12;
    } else {
        /* 2^13 */
        /* 8192 buckets, and central locks don't scale much past 5 threads */
        power = 13;
    }

    /* hashsize(n) == 2^n: pre-allocate one mutex per item-lock bucket so
     * item accesses can be locked at segment granularity. */
    item_lock_count = hashsize(power);
    item_locks = calloc(item_lock_count, sizeof(pthread_mutex_t));
    if (! item_locks) {
        perror("Can't allocate item locks");
        exit(1);
    }
    /* Initialize every bucket mutex. */
    for (i = 0; i < item_lock_count; i++) {
        pthread_mutex_init(&item_locks[i], NULL);
    }
    pthread_key_create(&item_lock_type_key, NULL);
    pthread_mutex_init(&item_global_lock, NULL);

    /* LIBEVENT_THREAD bundles per-worker libevent state: an event_base plus
     * the notify read/write pipe.  Allocate one descriptor per worker. */
    threads = calloc(nthreads, sizeof(LIBEVENT_THREAD));
    if (! threads) {
        perror("Can't allocate thread descriptors");
        exit(1);
    }

    /* main_base belongs to the dispatching (main) thread. */
    dispatcher_thread.base = main_base;
    dispatcher_thread.thread_id = pthread_self();

    /* Create the pipe libevent uses to notify each worker. */
    for (i = 0; i < nthreads; i++) {
        int fds[2];
        if (pipe(fds)) {
            perror("Can't create notify pipe");
            exit(1);
        }
        /* read end */
        threads[i].notify_receive_fd = fds[0];
        /* write end */
        threads[i].notify_send_fd = fds[1];
        /* setup_thread() initializes the per-thread structures and sets the
         * event callback to thread_libevent_process(). */
        setup_thread(&threads[i]);
        /* Reserve three fds for the libevent base, and two for the pipe */
        stats.reserved_fds += 5;
    }

    /* Create threads after we've done all the libevent setup. */
    /* Each worker runs worker_libevent(); create_worker() wraps
     * pthread_attr_init() and pthread_create(). */
    for (i = 0; i < nthreads; i++) {
        create_worker(worker_libevent, &threads[i]);
    }

    /* Wait for all the threads to set themselves up before returning. */
    pthread_mutex_lock(&init_lock);
    /* wait_for_thread_registration() blocks in pthread_cond_wait until all
     * workers have registered. */
    wait_for_thread_registration(nthreads);
    pthread_mutex_unlock(&init_lock);
}