Code example #1
0
File: RBT.c — Project: Nesokas/sampleProject
/*
 * Remove and return one thread with the given vruntime from the red-black
 * tree `tree`.
 *
 * Each tree node carries a FIFO queue of threads sharing the same vruntime.
 * If the node holds more than one thread, a thread is dequeued and the node
 * stays in the tree; otherwise the node itself is unlinked (classic
 * CLRS-style RB-delete with a fixup pass) and destroyed.
 *
 * Returns the removed thread, or NULL (with a message on stdout) when no
 * node with that vruntime exists.
 *
 * NOTE(review): assumes `tree->nil` is the sentinel leaf and `tree->first`
 * tracks the minimum node — inferred from usage; confirm against rbt.h.
 */
struct _sthread *rbt_remove(struct rbt *tree, int vruntime)
{
	struct node *delete_node = rbt_find(tree, vruntime);
	struct node *y;	/* node that is physically unlinked from the tree */
	struct node *x;	/* child of y that takes y's place */

	struct _sthread *thread;

	if(delete_node == NULL){
		printf("Node with vruntime = %d doesn't exist\n", vruntime);
		return NULL;
	}

	/* More than one thread queued on this node: dequeue one and keep
	 * the node in the tree — no structural change needed. */
	if (delete_node->queue->first->next != NULL)
		return queue_remove(delete_node->queue);

	/* Choose y, the node to splice out. With at most one child we can
	 * remove delete_node directly; otherwise take its successor, falling
	 * back to the predecessor when the successor would complicate the
	 * fixup (red, or not a "black leaf" per black_leef). */
	if(delete_node->left == tree->nil || delete_node->right == tree->nil)
		y = delete_node;
	else {	
		y = sucessor(tree, delete_node);	
		if(!(y->color == RED || !black_leef(tree, y)))
			y = predecessor(tree, delete_node);
	}

	/* x is y's only child (possibly the nil sentinel). */
	if (y->left != tree->nil)
		x = y->left;
	else x = y->right;
	
	/* Splice y out: x takes y's position under y's parent.
	 * Note x may be the nil sentinel; its parent is still set so the
	 * fixup pass can walk upward from it (standard CLRS trick). */
	x->parent = y->parent;

	if (y->parent == tree->nil)
		tree->root = x;
	else { 
		if (y == y->parent->left)
			y->parent->left = x;
		else y->parent->right = x;
	}

	/* If we removed the successor/predecessor rather than delete_node
	 * itself, move y into delete_node's position (substitute relinks
	 * pointers rather than copying payload, hence the root check). */
	if(y != delete_node){
		substitute(tree, delete_node, y);
		if (isRoot(tree, y))
			tree->root = y;
	}

	/* Maintain the cached minimum pointer when the minimum was removed. */
	if (y == tree->first){
		if (y->parent == tree->nil && y->right != tree->nil)
			tree->first = minimum(tree, y->right);
		else tree->first = x->parent; 
	}

	/* Removing a black node may violate RB properties — rebalance. */
	if(y->color == BLACK)
		delete_fixup(tree, x);

	treeRoot(tree);
	lower(tree);

	/* The node held exactly one thread; take it before freeing the node. */
	thread = queue_remove(y->queue);

	destroy_node(y);

	return thread;
}
Code example #2
0
File: blasx_dgemm.c — Project: 529038378/BLASX
/*
 * Per-GPU worker thread body for the tiled DGEMM.
 *
 * arg_data is a `struct gpu_thread_data *` carrying the matrix
 * configuration, the shared task queue, and the per-GPU LRU caches.
 * The thread binds to its GPU, drains tile tasks from the shared queue
 * in batches of up to STREAMNUM, launches the tile kernels on its
 * streams, and finally harvests the C results back to the host.
 *
 * Double buffering: `switcher` flips between the two halves of
 * tasks_rs / C_dev each batch, so batch k+1 can be staged while batch k
 * finishes.
 */
void blasx_gpu_dgemm(void *arg_data)
{
    int i;
    //----------GPU Argument Prepare------------//
    struct gpu_thread_data *arg = (struct gpu_thread_data *) arg_data;
    const int GPU_id = arg->GPU_id;
    cuda_err = cudaSetDevice(GPU_id);
    assert(cuda_err == cudaSuccess);
    //matrix configuration
    // Tracks every device buffer whose LRU reader count was bumped during
    // one batch, so the counts can be released after the event sync.
    // NOTE(review): fixed capacity 1300 with no bounds check — assumes a
    // batch never registers more entries than this; confirm upper bound.
    reader_tracker addr_track[1300]; //CRITICAL
    int x                       = arg->mat_conf->x;
    int y                       = arg->mat_conf->y;
    int z                       = arg->mat_conf->z;
    double *A                    = (double*) arg->mat_conf->A;
    double *B                    = (double*) arg->mat_conf->B;
    double *C                    = (double*) arg->mat_conf->C;
    int lda                     = arg->mat_conf->lda;
    int ldb                     = arg->mat_conf->ldb;
    int ldc                     = arg->mat_conf->ldc;
    double beta                  = arg->mat_conf->beta;
    double alpha                 = arg->mat_conf->alpha;
    int nrowa                   = arg->mat_conf->nrowa;
    int nrowb                   = arg->mat_conf->nrowb;
    int nrowc                   = arg->mat_conf->nrowc;
    int ncola                   = arg->mat_conf->ncola;
    int ncolb                   = arg->mat_conf->ncolb;
    int ncolc                   = arg->mat_conf->ncolc;
    enum CBLAS_TRANSPOSE TransA = arg->mat_conf->TransA;
    enum CBLAS_TRANSPOSE TransB = arg->mat_conf->TransB;
    int block_dim               = arg->mat_conf->block_dim;
    //GPU configuration
    const int GPUs              = arg->GPUs;
    LRU_t   **LRUs              = arg->LRUs;
    cublasHandle_t  handle      = handles_DGEMM[GPU_id];
    queue_t *tasks_queue        = arg->tasks_queue;
    //------------------------------------------//
    //hook C_dev
    // Bind this GPU's slice of the globally pre-allocated C tile buffers
    // (2 per stream for double buffering).
    double *C_dev[STREAMNUM*2];
    for (i = 0; i < STREAMNUM*2; i++) {
        C_dev[i] = C_dev_DGEMM[i+STREAMNUM*GPU_id*2];
    }
    // Likewise bind this GPU's pre-created streams and events.
    cudaStream_t stream[STREAMNUM];
    cudaEvent_t task_event[STREAMNUM];
    for (i = 0 ; i < STREAMNUM; i++) {
        //hook event
        task_event[i] = event_DGEMM[i+GPU_id*STREAMNUM];
        //hook stream
        stream[i]     = streams_DGEMM[i+GPU_id*STREAMNUM];
    }
    
#ifdef affinity
    //thread setup: pin this worker to a CPU near its GPU
    assert( blasx_set_affinity(GPU_id) == 0);
#endif
#ifdef thread_barrier
    // Wait until every GPU worker reaches this point before starting.
    pthread_barrier_t* barr     = arg->barr;
    int rc = pthread_barrier_wait(barr);
    assert(!(rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD));
#endif
#ifdef thread_profile
    printf("thread%d start@%f\n", GPU_id, get_cur_time());
#endif
    //------------------------------------------//


    //----------------GPU-START-----------------//
    int tasks_rs[STREAMNUM*2]; // mimic reseravation station
    int tasks_rs_size[2] = { 0, 0 };   // always tracking the first unused
    int switcher = 0;                  // selects the active half of tasks_rs/C_dev
    int task_batch_counter = 0;
    int mem_cpy_counter = 0;           // entries used in addr_track this batch

    while (tasks_queue->TAIL >= 0) {
        /*------RS------*/
        // Fill the active half of the reservation station with up to
        // STREAMNUM task ids pulled from the shared queue.
        int rs_counter          = 0;
        tasks_rs_size[switcher] = 0;
        for (rs_counter = 0; rs_counter < STREAMNUM; rs_counter++) {
            int task_id = dequeue(tasks_queue);
#ifdef task_tracker
            printf("==>GPU%d %d\n", GPU_id, task_id);
#endif
            if (task_id >= 0) {
                tasks_rs[tasks_rs_size[switcher]+STREAMNUM*switcher] = task_id;
                tasks_rs_size[switcher]++;
            }
        }
        
        /*--event_sync---*/
        // Busy-wait until the previous batch's work (recorded on
        // task_event[0]) has completed before releasing its readers.
        while (cudaEventQuery(task_event[0]) != cudaSuccess);
        
        /*--reduce_reader--*/
        // Previous batch is done on the device: drop the reader pins taken
        // on its cached blocks, and mark cross-GPU transfers as finished.
        int addr_counter = 0;
        for (addr_counter = 0; addr_counter < mem_cpy_counter; addr_counter++) {
            void *key          = addr_track[addr_counter].addr;
            int target_GPU_id  = addr_track[addr_counter].GPU_id;
            int is_trans_done  = addr_track[addr_counter].is_trans_done;
            rbt_node *n        = rbt_find(key, &(LRUs[target_GPU_id]->hash_map));
            assert(n != NULL);
            if (is_trans_done == 0 && (target_GPU_id == GPU_id)) {
                assert(target_GPU_id == GPU_id);
                n->associated_LRU_elem->is_trans_done = 1;
            }
            atomic_reader_minus(n);
        }

        /*--kernel_exe---*/
        // Launch the tile kernels: one stream per task, iterating over the
        // z (inner/k) dimension. mem_cpy_counter is rebuilt by the kernel
        // helper as it pins blocks for this batch.
        mem_cpy_counter = 0;
        int j = 0;
        for(j = 0; j <= z; j++){
            for (rs_counter = 0; rs_counter < tasks_rs_size[switcher]; rs_counter++) {
                int current_stream   = rs_counter;
                int current_task   = tasks_rs[rs_counter+STREAMNUM*switcher];
                int prior_task     = tasks_rs[rs_counter+(1-switcher)*STREAMNUM];
                cudaStream_t *curt_stream = &stream[current_stream];
                blasx_gpu_dgemm_kernel(j,
                                       nrowa, ncola,
                                       nrowb, ncolb,
                                       nrowc, ncolc,
                                       current_task, prior_task,
                                       TransA, TransB,
                                       A, B, C,
                                       lda, ldb, ldc,
                                       x, y, z,
                                       C_dev,
                                       curt_stream, &handle,
                                       current_stream,
                                       alpha, beta, block_dim,
                                       switcher, &task_batch_counter,
                                       LRUs, GPUs,
                                       &mem_cpy_counter,
                                       addr_track,
                                       GPU_id);
                // Record the batch-completion event after the very last
                // launch of this batch (last z step, last task).
                if ( j == z && rs_counter == tasks_rs_size[switcher]-1) {
                    /*--event_record--*/
                    cudaError_t err = cudaEventRecord(task_event[0], stream[0]);
                    if(err != cudaSuccess) printf("event record fail\n");
                }
            }
        }
        switcher = 1 - switcher;  // flip double buffer for the next batch
        task_batch_counter++;
    }
    //------------------------------------------//

    //---------------RESULT-HARVEST-------------//
    // Copy the completed C tiles from both buffer halves back to host C.
    collect_final_result_dgemm(tasks_rs, tasks_rs_size, switcher, stream, C_dev, block_dim, STREAMNUM, x, y, z, nrowc, ncolc, ldc, C);
    //------------------------------------------//
#ifdef thread_profile
    printf("thread%d end@%f\n", GPU_id, get_cur_time());
#endif
}
Code example #3
0
File: blasx_mem_control.c — Project: 529038378/BLASX
/*
 * Ensure the host tile at `starting_point_A` (block_dim x block_dim floats)
 * is resident on GPU `GPU_id`, writing the device pointer to *A_dev.
 *
 * Resolution order:
 *   1. Cache hit on this GPU  -> pin it (reader count) and refresh its
 *      LRU position.
 *   2. Tile cached on a peer GPU (nearest neighbour search, left/right of
 *      GPU_id) with the transfer complete and peer access available
 *      -> allocate locally via LRU_in and cudaMemcpyPeerAsync from the peer.
 *   3. Otherwise -> allocate locally and upload from the host with
 *      cublasSetMatrixAsync.
 *
 * Every block pinned or newly scheduled for transfer is appended to
 * addr_track (and *mem_cpy_counter advanced) so the caller can release
 * the reader pins once the batch's event has completed.
 * NOTE(review): the caller owns bounds-checking of addr_track — no
 * capacity check happens here.
 */
void mem_control_kernel_float(float *starting_point_A, float **A_dev,
                        LRU_t **LRUs, const int GPUs, const int GPU_id, int block_dim,
                        int *mem_cpy_counter, reader_tracker *addr_track,
                        cudaStream_t *stream,
                        int nrowa_dev, int ncola_dev, int lda) {
    rbt_node* block_A = rbt_find(starting_point_A, &(LRUs[GPU_id]->hash_map));
    if( block_A == NULL ) { //new element
        //fprintf(stderr, "==========new element========\n");
        //traverse_LRU_se(LRU);
        // Search outward from GPU_id for a peer GPU that already caches
        // this tile. A candidate whose transfer is still in flight is only
        // usable if peer access is NOT available (peer_access_check == 1
        // rejects it) — copying an incomplete buffer over the peer path
        // would read garbage.
        int search_l_GPU = GPU_id-1;
        int search_r_GPU = GPU_id+1;
        rbt_node *block_A_l = NULL;
        rbt_node *block_A_r = NULL;
        while (block_A_l == NULL && block_A_r == NULL) {
            if (search_l_GPU >= 0) {
                block_A_l = rbt_find(starting_point_A, &(LRUs[search_l_GPU]->hash_map));
                if (block_A_l != NULL) {
                    if (block_A_l->associated_LRU_elem->is_trans_done == 0) {
                        int peer_access_check = 0;
                        cudaDeviceCanAccessPeer(&peer_access_check, GPU_id, search_l_GPU);
                        if(peer_access_check == 1) block_A_l = NULL;
                    }
                }
                search_l_GPU--;
            }
            if (search_r_GPU < GPUs) {
                block_A_r = rbt_find(starting_point_A, &(LRUs[search_r_GPU]->hash_map));
                if (block_A_r != NULL) {
                    if (block_A_r->associated_LRU_elem->is_trans_done == 0) {
                        int peer_access_check = 0;
                        cudaDeviceCanAccessPeer(&peer_access_check, GPU_id, search_r_GPU);
                        if(peer_access_check == 1) block_A_r = NULL;
                    }
                }
                search_r_GPU++;
            }
            if (search_l_GPU < 0 && search_r_GPU >= GPUs) {
                break;
            }
        }
        //rectitfication: undo the final post-increment/decrement so the
        //indices point at the GPUs actually inspected last.
        search_l_GPU++; search_r_GPU--;
        assert(search_l_GPU >= 0 && search_l_GPU < GPUs);
        assert(search_r_GPU >= 0 && search_r_GPU < GPUs);
        
        if ( !(block_A_l == NULL && block_A_r == NULL) ) {
            //inter GPU communication
            // Pick the closer of the two candidate GPUs; break ties randomly
            // to spread peer-copy load.
            int target_GPU_id = 0;
            if (block_A_l != NULL && block_A_r != NULL) {
                if (ABS(search_l_GPU - GPU_id) > ABS(search_r_GPU - GPU_id)) {
                    target_GPU_id = search_r_GPU;
                    block_A       = block_A_r;
                } else if(ABS(search_l_GPU - GPU_id) < ABS(search_r_GPU - GPU_id)) {
                    target_GPU_id = search_l_GPU;
                    block_A       = block_A_l;
                } else {
                    int rand_select = rand()%10;
                    if (rand_select < 5) {
                        target_GPU_id = search_l_GPU;
                        block_A       = block_A_l;
                    } else {
                        target_GPU_id = search_r_GPU;
                        block_A       = block_A_r;
                    }
                }
                // Source must be fully transferred before a peer copy.
                if(block_A->associated_LRU_elem->is_trans_done != 1)
                goto new_block;
                //fprintf(stderr, "==>3  block on GPUs:(%d, %d), but chose %d(done:%d) as curt GPU is %d (block_A_l:%p, block_A_r:%p)\n", search_l_GPU, search_r_GPU, target_GPU_id, block_A->associated_LRU_elem->is_trans_done, GPU_id, block_A_l, block_A_r);
            } else {
                if (block_A_l != NULL && block_A_r == NULL) {
                    target_GPU_id = search_l_GPU;
                    block_A       = block_A_l;
                } else if(block_A_r != NULL && block_A_l == NULL) {
                    target_GPU_id = search_r_GPU;
                    block_A       = block_A_r;
                }
                if(block_A->associated_LRU_elem->is_trans_done != 1)
                goto new_block;
                //printf("==>2  block on GPUs:%d, and curt GPU is %d (done:%d)\n", target_GPU_id, GPU_id, block_A->associated_LRU_elem->is_trans_done);
            }
            // Re-check the peer still caches the tile (it may have been
            // evicted concurrently); fall back to a host upload if gone.
            if (rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map)) == NULL)
            goto new_block;
            // Pin the source block so the peer's LRU cannot evict it while
            // the async copy is in flight.
            atomic_reader_plus(block_A);
            *A_dev = (float*) LRU_in(starting_point_A, LRUs[GPU_id], sizeof(float)*block_dim*block_dim, GPU_id);
            assert( rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map)) != NULL);
            assert( rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map))->associated_LRU_elem->is_trans_done == 1);
            assert( cudaMemcpyPeerAsync(*A_dev, GPU_id, block_A->associated_LRU_elem->GPU_p, target_GPU_id, sizeof(float)*block_dim*block_dim, *stream) == cudaSuccess );
            //cannot dequeue the GPU mem at the target GPU
            addr_track[*mem_cpy_counter].addr   = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id = target_GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 1;
            (*mem_cpy_counter) += 1;
            //cannnot dequeue the current new GPU mem
            // is_trans_done == 0 tells the batch-completion pass to flip
            // this local copy's flag to done.
            addr_track[*mem_cpy_counter].addr   = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id = GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 0;
            (*mem_cpy_counter) += 1;
        } else {
        new_block:
            //(block_A_r == NULL && block_A_l == NULL) {
            //bring new blocks
            //printf("==>1  bring new block to GPU:%d\n", GPU_id);
            (*A_dev) = (float*) LRU_in(starting_point_A, LRUs[GPU_id], sizeof(float)*block_dim*block_dim, GPU_id);
            assert( cublasSetMatrixAsync(nrowa_dev, ncola_dev, sizeof(float), starting_point_A, lda, *A_dev, block_dim, *stream) == CUBLAS_STATUS_SUCCESS );
            addr_track[*mem_cpy_counter].addr          = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id        = GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 0;
            (*mem_cpy_counter) += 1;
        }
    } else {
        // Cache hit on this GPU: pin and move to the front of the LRU.
        // NOTE(review): unlike every other branch, is_trans_done is left
        // unset in this addr_track entry — likely relies on a prior value
        // or defaults; verify against the reader-release loop's consumer.
        atomic_reader_plus(block_A);
        assert( rbt_find(starting_point_A, &(LRUs[GPU_id]->hash_map)) != NULL);
        *A_dev = (float*) LRU_reorder(starting_point_A, LRUs[GPU_id]);
        addr_track[*mem_cpy_counter].addr   = starting_point_A;
        addr_track[*mem_cpy_counter].GPU_id = GPU_id;
        (*mem_cpy_counter) += 1;
    }
}