/*
 * Remove and return one thread with the given vruntime from the
 * red-black tree. Each tree node holds a FIFO queue of threads that
 * share the same vruntime key:
 *   - if the node's queue holds more than one thread, just dequeue one
 *     and leave the tree untouched;
 *   - otherwise physically delete the node (CLRS-style delete with a
 *     nil sentinel) and rebalance.
 * Returns NULL (after printing a diagnostic) when no node matches.
 */
struct _sthread *rbt_remove(struct rbt *tree, int vruntime)
{
    struct node *delete_node = rbt_find(tree, vruntime);
    struct node *y;   /* node actually spliced out of the tree */
    struct node *x;   /* y's single child (possibly tree->nil)  */
    struct _sthread *thread;

    if(delete_node == NULL){
        printf("Node with vruntime = %d doesn't exist\n", vruntime);
        return NULL;
    }

    /* More than one thread queued under this vruntime: dequeue one and
     * keep the node in place. */
    if (delete_node->queue->first->next != NULL)
        return queue_remove(delete_node->queue);

    /* Choose y, the node to unlink. For an internal node, prefer the
     * successor; fall back to the predecessor when the successor is a
     * black leaf-like node.
     * NOTE(review): presumably this choice eases delete_fixup — confirm
     * against sucessor()/black_leef() semantics. */
    if(delete_node->left == tree->nil || delete_node->right == tree->nil)
        y = delete_node;
    else {
        y = sucessor(tree, delete_node);
        if(!(y->color == RED || !black_leef(tree, y)))
            y = predecessor(tree, delete_node);
    }

    if (y->left != tree->nil)
        x = y->left;
    else
        x = y->right;

    /* Splice y out of the tree; x takes its place (x may be the nil
     * sentinel, whose parent pointer is safe to write). */
    x->parent = y->parent;
    if (y->parent == tree->nil)
        tree->root = x;
    else {
        if (y == y->parent->left)
            y->parent->left = x;
        else
            y->parent->right = x;
    }

    /* y was a surrogate (successor/predecessor): move it into
     * delete_node's structural position. */
    if(y != delete_node){
        substitute(tree, delete_node, y);
        if (isRoot(tree, y))
            tree->root = y;
    }

    /* Maintain the cached leftmost node (minimum vruntime). */
    if (y == tree->first){
        if (y->parent == tree->nil && y->right != tree->nil)
            tree->first = minimum(tree, y->right);
        else
            tree->first = x->parent;
    }

    /* Removing a black node may violate the black-height invariant. */
    if(y->color == BLACK)
        delete_fixup(tree, x);

    treeRoot(tree);
    lower(tree);

    /* NOTE(review): after substitute(), y is presumed to own the queue
     * holding the single thread being removed — confirm substitute()'s
     * swap semantics. */
    thread = queue_remove(y->queue);
    destroy_node(y);
    return thread;
}
/*
 * Per-GPU worker thread body for the tiled multi-GPU DGEMM.
 *
 * arg_data is a struct gpu_thread_data* carrying the GPU id, the matrix
 * configuration, the shared task queue, and the per-GPU LRU caches.
 * The worker:
 *   1) binds itself to its GPU and hooks the pre-created global streams,
 *      events and C device buffers for that GPU;
 *   2) repeatedly pulls up to STREAMNUM task ids into one half of a
 *      double-buffered "reservation station" (switcher selects halves),
 *      waits for the previous batch's event, releases reader references
 *      recorded by the previous batch, then launches the tile kernels;
 *   3) after the queue drains, harvests the final C tiles back to host.
 */
void blasx_gpu_dgemm(void *arg_data)
{
    int i;
    //----------GPU Argument Prepare------------//
    struct gpu_thread_data *arg = (struct gpu_thread_data *) arg_data;
    const int GPU_id = arg->GPU_id;
    cuda_err = cudaSetDevice(GPU_id);  // pin this thread to its GPU
    assert(cuda_err == cudaSuccess);
    //matrix configuration
    reader_tracker addr_track[1300]; //CRITICAL: fixed capacity; mem_cpy_counter must stay below it
    int x = arg->mat_conf->x;
    int y = arg->mat_conf->y;
    int z = arg->mat_conf->z;
    double *A = (double*) arg->mat_conf->A;
    double *B = (double*) arg->mat_conf->B;
    double *C = (double*) arg->mat_conf->C;
    int lda = arg->mat_conf->lda;
    int ldb = arg->mat_conf->ldb;
    int ldc = arg->mat_conf->ldc;
    double beta = arg->mat_conf->beta;
    double alpha = arg->mat_conf->alpha;
    int nrowa = arg->mat_conf->nrowa;
    int nrowb = arg->mat_conf->nrowb;
    int nrowc = arg->mat_conf->nrowc;
    int ncola = arg->mat_conf->ncola;
    int ncolb = arg->mat_conf->ncolb;
    int ncolc = arg->mat_conf->ncolc;
    enum CBLAS_TRANSPOSE TransA = arg->mat_conf->TransA;
    enum CBLAS_TRANSPOSE TransB = arg->mat_conf->TransB;
    int block_dim = arg->mat_conf->block_dim;
    //GPU configuration
    const int GPUs = arg->GPUs;
    LRU_t **LRUs = arg->LRUs;
    cublasHandle_t handle = handles_DGEMM[GPU_id];
    queue_t *tasks_queue = arg->tasks_queue;
    //------------------------------------------//
    //hook C_dev: this GPU's slice of the global C device buffers
    double *C_dev[STREAMNUM*2];
    for (i = 0; i < STREAMNUM*2; i++) {
        C_dev[i] = C_dev_DGEMM[i+STREAMNUM*GPU_id*2];
    }
    cudaStream_t stream[STREAMNUM];
    cudaEvent_t task_event[STREAMNUM];
    for (i = 0 ; i < STREAMNUM; i++) {
        //hook event
        task_event[i] = event_DGEMM[i+GPU_id*STREAMNUM];
        //hook stream
        stream[i] = streams_DGEMM[i+GPU_id*STREAMNUM];
    }
#ifdef affinity
    //thread setup
    assert( blasx_set_affinity(GPU_id) == 0);
#endif
#ifdef thread_barrier
    // All workers start the GPU phase together.
    pthread_barrier_t* barr = arg->barr;
    int rc = pthread_barrier_wait(barr);
    assert(!(rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD));
#endif
#ifdef thread_profile
    printf("thread%d start@%f\n", GPU_id, get_cur_time());
#endif
    //------------------------------------------//
    //----------------GPU-START-----------------//
    int tasks_rs[STREAMNUM*2]; // mimic reseravation station
    int tasks_rs_size[2] = { 0, 0 }; // always tracking the first unused
    int switcher = 0;              // selects the active half of tasks_rs
    int task_batch_counter = 0;
    int mem_cpy_counter = 0;       // entries recorded in addr_track this batch
    while (tasks_queue->TAIL >= 0) {
        /*------RS------*/
        // Pull up to STREAMNUM task ids for this batch; dequeue() returns
        // a negative id when the queue is empty.
        int rs_counter = 0;
        tasks_rs_size[switcher] = 0;
        for (rs_counter = 0; rs_counter < STREAMNUM; rs_counter++) {
            int task_id = dequeue(tasks_queue);
#ifdef task_tracker
            printf("==>GPU%d %d\n", GPU_id, task_id);
#endif
            if (task_id >= 0) {
                tasks_rs[tasks_rs_size[switcher]+STREAMNUM*switcher] = task_id;
                tasks_rs_size[switcher]++;
            }
        }
        /*--event_sync---*/
        // Spin until the event recorded at the end of the previous batch
        // has completed on the device.
        while (cudaEventQuery(task_event[0]) != cudaSuccess);
        /*--reduce_reader--*/
        // Release the reader references taken by the previous batch's
        // memory staging, and publish completed transfers on this GPU.
        int addr_counter = 0;
        for (addr_counter = 0; addr_counter < mem_cpy_counter; addr_counter++) {
            void *key = addr_track[addr_counter].addr;
            int target_GPU_id = addr_track[addr_counter].GPU_id;
            int is_trans_done = addr_track[addr_counter].is_trans_done;
            rbt_node *n = rbt_find(key, &(LRUs[target_GPU_id]->hash_map));
            assert(n != NULL);
            if (is_trans_done == 0 && (target_GPU_id == GPU_id)) {
                assert(target_GPU_id == GPU_id);
                n->associated_LRU_elem->is_trans_done = 1;
            }
            atomic_reader_minus(n);
        }
        /*--kernel_exe---*/
        mem_cpy_counter = 0;
        int j = 0;
        // j walks the z (inner-product) dimension; one stream per task.
        for(j = 0; j <= z; j++){
            for (rs_counter = 0; rs_counter < tasks_rs_size[switcher]; rs_counter++) {
                int current_stream = rs_counter;
                int current_task = tasks_rs[rs_counter+STREAMNUM*switcher];
                int prior_task = tasks_rs[rs_counter+(1-switcher)*STREAMNUM];
                cudaStream_t *curt_stream = &stream[current_stream];
                blasx_gpu_dgemm_kernel(j,
                                       nrowa, ncola,
                                       nrowb, ncolb,
                                       nrowc, ncolc,
                                       current_task, prior_task,
                                       TransA, TransB,
                                       A, B, C,
                                       lda, ldb, ldc,
                                       x, y, z,
                                       C_dev,
                                       curt_stream, &handle,
                                       current_stream,
                                       alpha, beta, block_dim,
                                       switcher, &task_batch_counter,
                                       LRUs, GPUs,
                                       &mem_cpy_counter, addr_track,
                                       GPU_id);
                // Record the batch-done event after the very last launch.
                if ( j == z && rs_counter == tasks_rs_size[switcher]-1) {
                    /*--event_record--*/
                    cudaError_t err = cudaEventRecord(task_event[0], stream[0]);
                    if(err != cudaSuccess) printf("event record fail\n");
                }
            }
        }
        switcher = 1 - switcher;  // flip reservation-station halves
        task_batch_counter++;
    }
    //------------------------------------------//
    //---------------RESULT-HARVEST-------------//
    collect_final_result_dgemm(tasks_rs, tasks_rs_size, switcher, stream,
                               C_dev, block_dim, STREAMNUM, x, y, z,
                               nrowc, ncolc, ldc, C);
    //------------------------------------------//
#ifdef thread_profile
    printf("thread%d end@%f\n", GPU_id, get_cur_time());
#endif
}
/*
 * Ensure the tile of A starting at starting_point_A is resident in
 * GPU_id's LRU-managed device memory; return the device pointer
 * through *A_dev.
 *
 * Resolution order:
 *   1) cache hit on this GPU        -> pin, refresh LRU order, reuse;
 *   2) completed copy on a peer GPU -> allocate here, peer-to-peer copy;
 *   3) nowhere on any GPU           -> allocate here, async host upload.
 *
 * Every device copy touched is recorded in addr_track[*mem_cpy_counter]
 * (host address, owning GPU id, transfer-done flag) so the caller's
 * reduce_reader pass can later drop the reader references taken with
 * atomic_reader_plus(); *mem_cpy_counter is advanced per record.
 */
void mem_control_kernel_float(float *starting_point_A, float **A_dev,
                              LRU_t **LRUs, const int GPUs, const int GPU_id,
                              int block_dim, int *mem_cpy_counter,
                              reader_tracker *addr_track, cudaStream_t *stream,
                              int nrowa_dev, int ncola_dev, int lda)
{
    rbt_node *block_A = rbt_find(starting_point_A, &(LRUs[GPU_id]->hash_map));
    if (block_A == NULL) { /* not cached on this GPU */
        /* Widen a left/right search over neighboring GPUs, one GPU per
         * side per iteration, until a candidate copy is found or both
         * directions are exhausted. */
        int search_l_GPU = GPU_id - 1;
        int search_r_GPU = GPU_id + 1;
        rbt_node *block_A_l = NULL;
        rbt_node *block_A_r = NULL;
        while (block_A_l == NULL && block_A_r == NULL) {
            if (search_l_GPU >= 0) {
                block_A_l = rbt_find(starting_point_A, &(LRUs[search_l_GPU]->hash_map));
                if (block_A_l != NULL) {
                    if (block_A_l->associated_LRU_elem->is_trans_done == 0) {
                        /* Copy to that GPU is still in flight; discard the
                         * candidate when a direct peer path exists.
                         * NOTE(review): condition kept as-is — confirm the
                         * intended peer-access policy. */
                        int peer_access_check = 0;
                        cudaDeviceCanAccessPeer(&peer_access_check, GPU_id, search_l_GPU);
                        if (peer_access_check == 1) block_A_l = NULL;
                    }
                }
                search_l_GPU--;
            }
            if (search_r_GPU < GPUs) {
                block_A_r = rbt_find(starting_point_A, &(LRUs[search_r_GPU]->hash_map));
                if (block_A_r != NULL) {
                    if (block_A_r->associated_LRU_elem->is_trans_done == 0) {
                        int peer_access_check = 0;
                        cudaDeviceCanAccessPeer(&peer_access_check, GPU_id, search_r_GPU);
                        if (peer_access_check == 1) block_A_r = NULL;
                    }
                }
                search_r_GPU++;
            }
            if (search_l_GPU < 0 && search_r_GPU >= GPUs) {
                break;
            }
        }
        /* Rectification: undo the trailing ++/-- so the indices name the
         * GPUs actually inspected last. */
        search_l_GPU++;
        search_r_GPU--;
        assert(search_l_GPU >= 0 && search_l_GPU < GPUs);
        assert(search_r_GPU >= 0 && search_r_GPU < GPUs);
        if (!(block_A_l == NULL && block_A_r == NULL)) {
            /* Inter-GPU communication: pick the closer source GPU,
             * breaking ties randomly to spread load. */
            int target_GPU_id = 0;
            if (block_A_l != NULL && block_A_r != NULL) {
                if (ABS(search_l_GPU - GPU_id) > ABS(search_r_GPU - GPU_id)) {
                    target_GPU_id = search_r_GPU;
                    block_A = block_A_r;
                } else if (ABS(search_l_GPU - GPU_id) < ABS(search_r_GPU - GPU_id)) {
                    target_GPU_id = search_l_GPU;
                    block_A = block_A_l;
                } else {
                    int rand_select = rand() % 10;
                    if (rand_select < 5) {
                        target_GPU_id = search_l_GPU;
                        block_A = block_A_l;
                    } else {
                        target_GPU_id = search_r_GPU;
                        block_A = block_A_r;
                    }
                }
                if (block_A->associated_LRU_elem->is_trans_done != 1) goto new_block;
            } else {
                if (block_A_l != NULL && block_A_r == NULL) {
                    target_GPU_id = search_l_GPU;
                    block_A = block_A_l;
                } else if (block_A_r != NULL && block_A_l == NULL) {
                    target_GPU_id = search_r_GPU;
                    block_A = block_A_r;
                }
                if (block_A->associated_LRU_elem->is_trans_done != 1) goto new_block;
            }
            /* Source may have been evicted between find and use. */
            if (rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map)) == NULL)
                goto new_block;
            atomic_reader_plus(block_A); /* pin source for the copy */
            *A_dev = (float*) LRU_in(starting_point_A, LRUs[GPU_id],
                                     sizeof(float)*block_dim*block_dim, GPU_id);
            assert( rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map)) != NULL);
            assert( rbt_find(starting_point_A, &(LRUs[target_GPU_id]->hash_map))->associated_LRU_elem->is_trans_done == 1);
            /* BUGFIX: this copy used to run *inside* assert(), so building
             * with -DNDEBUG silently removed the data transfer. Perform it
             * unconditionally and assert on the saved status. */
            {
                const int peer_cpy_ok =
                    (cudaMemcpyPeerAsync(*A_dev, GPU_id,
                                         block_A->associated_LRU_elem->GPU_p,
                                         target_GPU_id,
                                         sizeof(float)*block_dim*block_dim,
                                         *stream) == cudaSuccess);
                assert(peer_cpy_ok);
                (void) peer_cpy_ok; /* silence unused warning under NDEBUG */
            }
            /* cannot dequeue the GPU mem at the target GPU */
            addr_track[*mem_cpy_counter].addr = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id = target_GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 1;
            (*mem_cpy_counter) += 1;
            /* cannot dequeue the current new GPU mem */
            addr_track[*mem_cpy_counter].addr = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id = GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 0;
            (*mem_cpy_counter) += 1;
        } else {
new_block:
            /* No usable copy anywhere: stage the tile on this GPU and
             * upload it asynchronously from host memory. */
            (*A_dev) = (float*) LRU_in(starting_point_A, LRUs[GPU_id],
                                       sizeof(float)*block_dim*block_dim, GPU_id);
            /* BUGFIX: same NDEBUG hazard as above — keep the upload outside
             * the assert. */
            {
                const int upload_ok =
                    (cublasSetMatrixAsync(nrowa_dev, ncola_dev, sizeof(float),
                                          starting_point_A, lda, *A_dev,
                                          block_dim, *stream) == CUBLAS_STATUS_SUCCESS);
                assert(upload_ok);
                (void) upload_ok;
            }
            addr_track[*mem_cpy_counter].addr = starting_point_A;
            addr_track[*mem_cpy_counter].GPU_id = GPU_id;
            addr_track[*mem_cpy_counter].is_trans_done = 0;
            (*mem_cpy_counter) += 1;
        }
    } else {
        /* Cache hit on this GPU: pin, refresh LRU position, reuse. */
        atomic_reader_plus(block_A);
        assert( rbt_find(starting_point_A, &(LRUs[GPU_id]->hash_map)) != NULL);
        *A_dev = (float*) LRU_reorder(starting_point_A, LRUs[GPU_id]);
        addr_track[*mem_cpy_counter].addr = starting_point_A;
        addr_track[*mem_cpy_counter].GPU_id = GPU_id;
        /* BUGFIX: is_trans_done was left unassigned here, so the caller's
         * reduce_reader loop read an indeterminate value (UB). The block is
         * already resident on this GPU, so mark the transfer complete. */
        addr_track[*mem_cpy_counter].is_trans_done = 1;
        (*mem_cpy_counter) += 1;
    }
}