Example No. 1
ENGINE_ERROR_CODE assoc_init(struct default_engine *engine) {
    /* Allocate the primary hash table: hashsize(hashpower) zeroed bucket pointers */
    engine->assoc.primary_hashtable = calloc(hashsize(engine->assoc.hashpower), sizeof(void *));
    return (engine->assoc.primary_hashtable != NULL) ? ENGINE_SUCCESS : ENGINE_ENOMEM;
}
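For reference, assoc_init() leans on the usual power-of-two hash helpers used throughout these examples. A minimal sketch consistent with how they are invoked above (the exact definitions vary slightly between codebases, so treat these as assumptions):

#include <stdint.h>

/* hashpower is the table-size exponent; both macros assume power-of-two sizing. */
#define hashsize(n) ((uint32_t)1 << (n))  /* number of buckets: 2^n */
#define hashmask(n) (hashsize(n) - 1)     /* mask selecting the low n bits of a hash */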
Example No. 2
static void *shadow_assoc_maintenance_thread(void *arg) {

    mutex_lock(&maintenance_shadow_lock);
    while (do_run_maintenance_thread) {
        int ii = 0;

        /* There is only one expansion thread, so no need for a global lock. */
        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            shadow_item *it, *next;
            int bucket;
            void *item_lock = NULL;

            /* bucket = hv & hashmask(shadow_hashpower): the hash-table bucket
             * is the lowest N bits of hv, and the item_locks bucket is the
             * lowest M bits of hv, with N greater than M. So we can perform
             * the expansion while holding only one item_lock. Cool! */
            if ((item_lock = item_trylock(expand_bucket))) {
                    for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                        next = it->h_next;
                        bucket = hash(it->key, it->nkey) & hashmask(shadow_hashpower);
                        it->h_next = primary_hashtable[bucket];
                        primary_hashtable[bucket] = it;
                    }

                    old_hashtable[expand_bucket] = NULL;

                    expand_bucket++;
                    if (expand_bucket == hashsize(shadow_hashpower - 1)) {
                        expanding = false;
                        free(old_hashtable);
                        STATS_LOCK();
                        stats.hash_bytes -= hashsize(shadow_hashpower - 1) * sizeof(void *);
                        stats.hash_is_expanding = 0;
                        STATS_UNLOCK();
                        if (settings.verbose > 1)
                            fprintf(stderr, "Hash table expansion done\n");
                    }

            } else {
                usleep(10*1000);
            }

            if (item_lock) {
                item_trylock_unlock(item_lock);
                item_lock = NULL;
            }
        }

        if (!expanding) {
            /* We are done expanding.. just wait for next invocation */
            started_expanding = false;
            pthread_cond_wait(&maintenance_shadow_cond, &maintenance_shadow_lock);
            /* shadow_assoc_expand() swaps out the hash table entirely, so we need
             * all threads to not hold any references related to the hash
             * table while this happens.
             * This is instead of a more complex, possibly slower algorithm to
             * allow dynamic hash table expansion without causing significant
             * wait times.
             */
            pause_threads(PAUSE_ALL_THREADS);
            shadow_assoc_expand();
            pause_threads(RESUME_ALL_THREADS);
        }
    }
    return NULL;
}
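The "only one item_lock" claim in the comment above can be checked directly: every hash value that lands in a given old-table bucket agrees on its low M bits, so all of that bucket's items map to the same item lock. A self-contained sketch with assumed table powers (N and M here are illustrative, not taken from the source):

#include <assert.h>
#include <stdint.h>

#define hashsize(n) ((uint32_t)1 << (n))
#define hashmask(n) (hashsize(n) - 1)

int main(void) {
    const int N = 16;                /* hash-table power (assumed) */
    const int M = 13;                /* item-lock table power (assumed), M < N */
    const uint32_t bucket = 0x0ABC;  /* an arbitrary old-table bucket */

    for (uint32_t hi = 0; hi < 8; hi++) {
        uint32_t hv = (hi << N) | bucket;      /* any hash that falls in that bucket */
        assert((hv & hashmask(N)) == bucket);  /* same table bucket ... */
        assert((hv & hashmask(M)) == (bucket & hashmask(M))); /* ... same item lock */
    }
    return 0;
}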
Example No. 3
static void *assoc_maintenance_thread(void *arg) {

    mutex_lock(&maintenance_lock);

    // do_run_maintenance_thread is a global flag, initially 1; it is set to 0 in
    // stop_assoc_maintenance_thread() to terminate the migration thread
    while (do_run_maintenance_thread) {
        int ii = 0;

        /* There is only one expansion thread, so no need for a global lock. */
        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            item *it, *next;
            int bucket;
            void *item_lock = NULL;

            /* bucket = hv & hashmask(hashpower): the hash-table bucket is the
             * lowest N bits of hv, and the item_locks bucket is the lowest M
             * bits of hv, with N greater than M. So we can perform the
             * expansion while holding only one item_lock. Cool! */
            // hash_bulk_move controls how many buckets are migrated per pass; the default is one.
            // The loop body is entered only while expanding is true, so the freshly
            // created migration thread skips it until an expansion actually starts.
            if ((item_lock = item_trylock(expand_bucket))) {
                    // assoc_expand() resets expand_bucket to 0. Walk the old-table
                    // bucket indicated by expand_bucket and migrate every item in
                    // it to the new hash table.
                    for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                        next = it->h_next;
                        // Recompute the hash to find the item's bucket in the new table
                        bucket = hash(ITEM_key(it), it->nkey) & hashmask(hashpower);
                        // Insert the item at the head of the new bucket's chain
                        it->h_next = primary_hashtable[bucket];
                        primary_hashtable[bucket] = it;
                    }

                    // No need to clear the old bucket item by item; just NULL out the chain head
                    old_hashtable[expand_bucket] = NULL;

                    // One bucket migrated; advance expand_bucket to the next one
                    expand_bucket++;
                    if (expand_bucket == hashsize(hashpower - 1)) { // all buckets migrated
                        expanding = false; // clear the expansion flag
                        free(old_hashtable);
                        STATS_LOCK();
                        stats.hash_bytes -= hashsize(hashpower - 1) * sizeof(void *);
                        stats.hash_is_expanding = 0;
                        STATS_UNLOCK();
                        if (settings.verbose > 1)
                            fprintf(stderr, "Hash table expansion done\n");
                    }

            } else {
                usleep(10*1000);
            }

            if (item_lock) {
                // Release the item lock once this bucket's items have been migrated
                item_trylock_unlock(item_lock);
                item_lock = NULL;
            }
        }

        if (!expanding) { // no more data to migrate
            /* We are done expanding.. just wait for next invocation */
            started_expanding = false; // reset

            // Suspend the migration thread here. When a worker inserts an item and
            // finds the item count has reached 1.5x the hash table size, it calls
            // assoc_start_expand(), which uses pthread_cond_signal to wake this thread.
            pthread_cond_wait(&maintenance_cond, &maintenance_lock);
            /* assoc_expand() swaps out the hash table entirely, so we need
             * all threads to not hold any references related to the hash
             * table while this happens.
             * This is instead of a more complex, possibly slower algorithm to
             * allow dynamic hash table expansion without causing significant
             * wait times.
             */
            pause_threads(PAUSE_ALL_THREADS);
            assoc_expand(); // allocate a larger hash table and set expanding to true
            pause_threads(RESUME_ALL_THREADS);
        }
    }
    return NULL;
}
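The wake-up path mentioned in the comments, where a worker notices the item count has reached 1.5x the table size and signals maintenance_cond, looks roughly like this. A sketch modeled on memcached's assoc_start_expand(); the globals are assumed to be the same ones used in the listing above:

/* Sketch, not verbatim source: called by a worker after a successful insert
 * once hash_items > (hashsize(hashpower) * 3) / 2. */
static void assoc_start_expand(void) {
    if (started_expanding)
        return;                  /* an expansion request is already pending */
    started_expanding = true;
    pthread_cond_signal(&maintenance_cond);  /* wake the maintenance thread */
}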
Example No. 4
// Entry function of the data-migration (hash table expansion) thread
static void *assoc_maintenance_thread(void *arg) {

    // do_run_maintenance_thread is a global flag, initially 1; it is set to 0 in
    // stop_assoc_maintenance_thread() to terminate the migration thread
    while (do_run_maintenance_thread) {
        int ii = 0;

        /* Lock the cache, and bulk move multiple buckets to the new
         * hash table. */
        // Take the global item lock: every item is under its control while a
        // bucket is being migrated
        item_lock_global();
        // Also lock the hash table; otherwise concurrent inserts/deletes by other
        // threads would leave it inconsistent. do_item_link and do_item_unlink in
        // item.c take cache_lock internally for the same reason.
        mutex_lock(&cache_lock);
        // Migrate items
        for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
            item *it, *next;
            int bucket;

            for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
                next = it->h_next;

                bucket = hash(ITEM_key(it), it->nkey) & hashmask(hashpower);
                it->h_next = primary_hashtable[bucket];
                primary_hashtable[bucket] = it;
            }

            old_hashtable[expand_bucket] = NULL;

            expand_bucket++;
            if (expand_bucket == hashsize(hashpower - 1)) {
                expanding = false;
                free(old_hashtable);
                STATS_LOCK();
                stats.hash_bytes -= hashsize(hashpower - 1) * sizeof(void *);
                stats.hash_is_expanding = 0;
                STATS_UNLOCK();
                if (settings.verbose > 1)
                    fprintf(stderr, "Hash table expansion done\n");
            }
        }

        // This pass is done; release the locks
        mutex_unlock(&cache_lock);
        item_unlock_global();

        // No more data to migrate
        if (!expanding) { 
            /*
               Why does the migration thread switch the workers' lock type in such
               a roundabout way, instead of simply writing the item_lock_type
               member of each thread's LIBEVENT_THREAD struct directly?
               Mainly because the migration thread cannot know what a worker is
               doing at that moment. If a worker is accessing an item and holds a
               segment-level lock, and you switch its lock type to the global lock,
               then when the worker unlocks it will release the global lock instead
               (see the item_lock and item_unlock code), and the program crashes.
               So the migration thread must not do the switch itself; it can only
               notify the workers, and each worker switches on its own once it has
               finished its current work. After notifying all workers, the
               migration thread calls wait_for_thread_registration() and sleeps
               until every worker has switched to the requested lock type.
            */
            /* finished expanding. tell all threads to use fine-grained locks */

            // Reaching here means no more data needs migrating (expansion stopped).
            // Tell all worker threads to switch to segment-level (granular) locks
            // for item access; this blocks until every worker has switched.
            switch_item_lock_type(ITEM_LOCK_GRANULAR);
            slabs_rebalancer_resume();
            /* We are done expanding.. just wait for next invocation */
            mutex_lock(&cache_lock);
            // Reset
            started_expanding = false;

            // Suspend the migration thread. When a worker inserts an item and finds
            // the item count has reached 1.5x the hash table size, it calls
            // assoc_start_expand(), which uses pthread_cond_signal to wake this thread.
            pthread_cond_wait(&maintenance_cond, &cache_lock);
            /* Before doing anything, tell threads to use a global lock */
            mutex_unlock(&cache_lock);
            slabs_rebalancer_pause();

            // Woken from maintenance_cond: time to grow the hash table and migrate
            // data again. While migrating a bucket, the migration thread holds the
            // global item lock, so the workers cannot use segment-level locks;
            // instead all workers and the migration thread contend for the global
            // lock, and only the holder may access items.
            // The next line tells every worker to switch its item lock to the
            // global lock. switch_item_lock_type() sleeps on a condition variable
            // and wakes only once all workers have completed the switch.
            switch_item_lock_type(ITEM_LOCK_GLOBAL);
            mutex_lock(&cache_lock);
            // Allocate a larger hash table and set expanding to true
            assoc_expand();
            mutex_unlock(&cache_lock);
        }
    }
    return NULL;
}
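The crash scenario in the long comment above stems from the per-thread lock-type dispatch inside item_lock()/item_unlock(): both re-read the thread-local lock type on every call, so flipping it mid-critical-section would make lock and unlock act on different mutexes. A paraphrased sketch of that dispatch, modeled on memcached 1.4.x (names follow that codebase and the listing above; details are approximate, not verbatim):

void item_lock(uint32_t hv) {
    uint8_t *lock_type = pthread_getspecific(item_lock_type_key);
    if (*lock_type == ITEM_LOCK_GRANULAR) {
        /* segment-level: pick one of the item_locks by the low bits of the hash */
        mutex_lock(&item_locks[hv & hashmask(item_lock_hashpower)]);
    } else {
        /* global: everyone contends for one mutex during expansion */
        mutex_lock(&item_global_lock);
    }
}

void item_unlock(uint32_t hv) {
    uint8_t *lock_type = pthread_getspecific(item_lock_type_key);
    if (*lock_type == ITEM_LOCK_GRANULAR) {
        mutex_unlock(&item_locks[hv & hashmask(item_lock_hashpower)]);
    } else {
        mutex_unlock(&item_global_lock);
    }
}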
Example No. 5
static ENGINE_ERROR_CODE
do_assoc_get_prefix_stats(struct default_engine *engine,
                          const char *prefix, const int  nprefix,
                          void *prefix_data)
{
    struct assoc *assoc = &engine->assoc;
    prefix_t *pt;

    if (nprefix < 0) { // all prefix information
        char *buf;
        struct tm *t;
        const char *format = "PREFIX %s itm %llu kitm %llu litm %llu sitm %llu bitm %llu "
                                       "tsz %llu ktsz %llu ltsz %llu stsz %llu btsz %llu "
                                       "time %04d%02d%02d%02d%02d%02d\r\n";
        uint32_t i, hsize = hashsize(DEFAULT_PREFIX_HASHPOWER);
        uint32_t num_prefixes = assoc->tot_prefix_items;
        uint32_t tot_prefix_name_len = 0;
        uint32_t msize, pos, written;

        pt = root_pt;
        if (pt != NULL && (pt->hash_items > 0 || pt->list_hash_items > 0 ||
                           pt->set_hash_items > 0 || pt->btree_hash_items > 0)) {
            /* including null prefix */
            num_prefixes += 1;
            tot_prefix_name_len = strlen("<null>");
        }
        for (i = 0; i < hsize; i++) {
            pt = assoc->prefix_hashtable[i];
            while (pt) {
                tot_prefix_name_len += pt->nprefix;
                pt = pt->h_next;
            }
        }

        msize = sizeof(uint32_t) + strlen(format) + tot_prefix_name_len
                + num_prefixes * (strlen(format) - 2 /* %s */
                                  + (10 * (20 - 4))) /* %llu replaced by 20-digit num */
                - (5 * (4 - 2)) /* %02d replaced by 2-digit num */
                + sizeof("END\r\n");

        buf = malloc(msize);
        if (buf == NULL) {
            return ENGINE_ENOMEM;
        }
        pos = sizeof(uint32_t);

        pt = root_pt;
        if (pt != NULL && (pt->hash_items > 0 || pt->list_hash_items > 0 ||
                           pt->set_hash_items > 0 || pt->btree_hash_items > 0)) {
            /* including null prefix */
            t = localtime(&pt->create_time);
            written = snprintf(buf+pos, msize-pos, format, "<null>",
                               pt->hash_items+pt->list_hash_items+pt->set_hash_items+pt->btree_hash_items,
                               pt->hash_items,pt->list_hash_items,pt->set_hash_items,pt->btree_hash_items,
                               pt->hash_items_bytes+pt->list_hash_items_bytes+pt->set_hash_items_bytes+pt->btree_hash_items_bytes,
                               pt->hash_items_bytes,pt->list_hash_items_bytes,pt->set_hash_items_bytes,pt->btree_hash_items_bytes,
                               t->tm_year+1900, t->tm_mon+1, t->tm_mday, t->tm_hour, t->tm_min, t->tm_sec);
            pos += written;
        }

        for (i = 0; i < hsize; i++) {
            pt = assoc->prefix_hashtable[i];
            while (pt) {
                t = localtime(&pt->create_time);
                written = snprintf(buf+pos, msize-pos, format, _get_prefix(pt),
                               pt->hash_items+pt->list_hash_items+pt->set_hash_items+pt->btree_hash_items,
                               pt->hash_items,pt->list_hash_items,pt->set_hash_items,pt->btree_hash_items,
                               pt->hash_items_bytes+pt->list_hash_items_bytes+pt->set_hash_items_bytes+pt->btree_hash_items_bytes,
                               pt->hash_items_bytes,pt->list_hash_items_bytes,pt->set_hash_items_bytes,pt->btree_hash_items_bytes,
                               t->tm_year+1900, t->tm_mon+1, t->tm_mday, t->tm_hour, t->tm_min, t->tm_sec);
                pos += written;
                assert(pos < msize);
                pt = pt->h_next;
            }
        }
        memcpy(buf+pos, "END\r\n", 6); /* copies the terminating NUL as well */
        *(uint32_t*)buf = pos + 5 - sizeof(uint32_t); /* payload length, excluding this header */

        *(char**)prefix_data = buf;
    } else {
        prefix_engine_stats *prefix_stats = (prefix_engine_stats*)prefix_data;

        if (prefix != NULL) {
            pt = assoc_prefix_find(engine, engine->server.core->hash(prefix,nprefix,0), prefix, nprefix);
        } else {
            pt = root_pt;
        }
        if (pt == NULL) {
            return ENGINE_PREFIX_ENOENT;
        }

        prefix_stats->hash_items = pt->hash_items;
        prefix_stats->hash_items_bytes = pt->hash_items_bytes;
        prefix_stats->prefix_items = pt->prefix_items;
        if (prefix != NULL)
            prefix_stats->tot_prefix_items = pt->prefix_items;
        else
            prefix_stats->tot_prefix_items = assoc->tot_prefix_items;
    }
    return ENGINE_SUCCESS;
}
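When nprefix is negative, the function hands back a malloc'd buffer whose leading uint32_t holds the payload length (the stats lines plus "END\r\n"). A hypothetical caller sketch, assuming only what the code above establishes (engine is the handle from the listing):

char *buf = NULL;
if (do_assoc_get_prefix_stats(engine, NULL, -1, &buf) == ENGINE_SUCCESS) {
    uint32_t len;
    memcpy(&len, buf, sizeof(len));                  /* length header at buf[0] */
    fwrite(buf + sizeof(uint32_t), 1, len, stdout);  /* stats text + "END\r\n" */
    free(buf);                                       /* caller owns the buffer */
}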
Example No. 6
/*
 * Initializes the thread subsystem, creating various worker threads.
 *
 * nthreads  Number of worker event handler threads to spawn
 * main_base Event base for main thread
 * The remaining arguments are the entry routines for the cluster join,
 * node-removal, and node-propagation listener threads started below.
 */
void thread_init(int nthreads, struct event_base *main_base,
                 void *(*join_request_listener_thread_routine)(void *),
                 void *(*joining_thread_routine)(void *),
                 void *(*node_removal_listener_thread_routine)(void *),
                 void *(*node_propagation_thread_routine)(void *)) {
    int         i;
    int         power;

    pthread_mutex_init(&cache_lock, NULL);
    pthread_mutex_init(&stats_lock, NULL);

    pthread_mutex_init(&init_lock, NULL);
    pthread_cond_init(&init_cond, NULL);

    pthread_mutex_init(&cqi_freelist_lock, NULL);
    cqi_freelist = NULL;


    /* Want a wide lock table, but don't waste memory */
    if (nthreads < 3) {
        power = 10;
    } else if (nthreads < 4) {
        power = 11;
    } else if (nthreads < 5) {
        power = 12;
    } else {
        /* 8192 buckets, and central locks don't scale much past 5 threads */
        power = 13;
    }

    item_lock_count = hashsize(power);

    item_locks = calloc(item_lock_count, sizeof(pthread_mutex_t));
    if (! item_locks) {
        perror("Can't allocate item locks");
        exit(1);
    }
    for (i = 0; i < item_lock_count; i++) {
        pthread_mutex_init(&item_locks[i], NULL);
    }
    pthread_key_create(&item_lock_type_key, NULL);
    pthread_mutex_init(&item_global_lock, NULL);

    threads = calloc(nthreads, sizeof(LIBEVENT_THREAD));
    if (! threads) {
        perror("Can't allocate thread descriptors");
        exit(1);
    }

    dispatcher_thread.base = main_base;
    dispatcher_thread.thread_id = pthread_self();

    for (i = 0; i < nthreads; i++) {
        int fds[2];
        if (pipe(fds)) {
            perror("Can't create notify pipe");
            exit(1);
        }

        threads[i].notify_receive_fd = fds[0];
        threads[i].notify_send_fd = fds[1];

        setup_thread(&threads[i]);
        /* Reserve three fds for the libevent base, and two for the pipe */
        stats.reserved_fds += 5;
    }

    /* Create threads after we've done all the libevent setup. */
    for (i = 0; i < nthreads; i++) {
        create_worker(worker_libevent, &threads[i]);
    }

    /* Wait for all the threads to set themselves up before returning. */
    pthread_mutex_lock(&init_lock);
    wait_for_thread_registration(nthreads);
    pthread_mutex_unlock(&init_lock);


    if (joining_thread_routine != NULL)
        connect_to_join_server(joining_thread_routine);
    else
        start_listening_on_join_port(join_request_listener_thread_routine);

    usleep(1000);
    start_listening_on_node_propagation_port(node_propagation_thread_routine);
    usleep(1000);
    start_listening_on_node_removal_port(node_removal_listener_thread_routine);

    // We may want to comment this out
    pthread_join(connect_and_split_thread, NULL);
}
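Both thread_init() variants block in wait_for_thread_registration() until every worker has checked in under init_lock/init_cond. A sketch of that helper, modeled on memcached's; init_count is an assumed global that each worker increments (and signals) once its setup completes:

static int init_count = 0;  /* bumped by each worker when it is ready */

/* Caller must hold init_lock; workers signal init_cond after bumping init_count. */
static void wait_for_thread_registration(int nthreads) {
    while (init_count < nthreads)
        pthread_cond_wait(&init_cond, &init_lock);
}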
Example No. 7
/*
 * Initializes the thread subsystem, creating various worker threads.
 *
 * nthreads  Number of worker event handler threads to spawn
 * main_base Event base for the main (dispatch) thread
 */
void thread_init(int nthreads, struct event_base *main_base) {
    int         i;
    int         power;

    // Initialize the mutexes
    pthread_mutex_init(&cache_lock, NULL);
    pthread_mutex_init(&stats_lock, NULL);

    pthread_mutex_init(&init_lock, NULL);
    pthread_cond_init(&init_cond, NULL);

    pthread_mutex_init(&cqi_freelist_lock, NULL);
    cqi_freelist = NULL;

    /* Want a wide lock table, but don't waste memory */
    // Size the item-lock table based on the worker thread count
    if (nthreads < 3) {
        power = 10;
    } else if (nthreads < 4) {
        power = 11;
    } else if (nthreads < 5) {
        power = 12;
    } else {
        // 2^13
        /* 8192 buckets, and central locks don't scale much past 5 threads */
        power = 13;
    }

    // Pre-allocate this many item locks; hashsize(n) = 2^n
    item_lock_count = hashsize(power);

    item_locks = calloc(item_lock_count, sizeof(pthread_mutex_t));
    if (! item_locks) {
        perror("Can't allocate item locks");
        exit(1);
    }
    // Initialize each lock
    for (i = 0; i < item_lock_count; i++) {
        pthread_mutex_init(&item_locks[i], NULL);
    }
    pthread_key_create(&item_lock_type_key, NULL);
    pthread_mutex_init(&item_global_lock, NULL);


    // LIBEVENT_THREAD is the per-thread struct used with libevent: it holds the
    // event_base and the notify pipe fds
    threads = calloc(nthreads, sizeof(LIBEVENT_THREAD)); // descriptors for nthreads worker threads
    if (! threads) {
        perror("Can't allocate thread descriptors");
        exit(1);
    }

    // main_base belongs to the dispatcher, i.e. the main thread
    dispatcher_thread.base = main_base;
    dispatcher_thread.thread_id = pthread_self();

    // Pipes used by libevent for cross-thread notification
    for (i = 0; i < nthreads; i++) {
        int fds[2];
        if (pipe(fds)) {
            perror("Can't create notify pipe");
            exit(1);
        }

        // read end of the notify pipe
        threads[i].notify_receive_fd = fds[0];
        // write end of the notify pipe
        threads[i].notify_send_fd = fds[1];

        // Initialize the thread's data structures; this installs thread_libevent_process() as the event callback
        setup_thread(&threads[i]);
        /* Reserve three fds for the libevent base, and two for the pipe */
        stats.reserved_fds += 5;
    }

    /* Create threads after we've done all the libevent setup. */
    // Create and start the worker threads; each runs worker_libevent()
    for (i = 0; i < nthreads; i++) {
        // create_worker() calls pthread_attr_init() and pthread_create()
        create_worker(worker_libevent, &threads[i]);
    }

    /* Wait for all the threads to set themselves up before returning. */
    pthread_mutex_lock(&init_lock);
    // wait_for_thread_registration() blocks in pthread_cond_wait until every worker has registered
    wait_for_thread_registration(nthreads);
    pthread_mutex_unlock(&init_lock);
}
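Finally, the create_worker() referenced in the comments is a thin wrapper, consistent with the note above that it calls pthread_attr_init() and pthread_create(). A sketch modeled on memcached's helper (error handling is approximate):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void create_worker(void *(*func)(void *), void *arg) {
    pthread_t      thread;
    pthread_attr_t attr;
    int            ret;

    pthread_attr_init(&attr);
    if ((ret = pthread_create(&thread, &attr, func, arg)) != 0) {
        fprintf(stderr, "Can't create thread: %s\n", strerror(ret));
        exit(1);
    }
}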