/*
 * Launch kernel 1 in the First-Central-Then-Halo mode (FCTH).
 *
 * d_in, d_out : bound as kernel arguments 0 and 1.
 * radius      : added twice to each work-group dimension to size the tile;
 *               also forwarded to the kernel.
 * w, h        : 2D launch range; also forwarded to the kernel.
 */
void g2l_FCTH(cl_mem d_in, cl_mem d_out, uint radius, uint w, uint h){
    // Fixed 16 x 16 work-groups over a w x h range.
    uint wg_cols = 16;
    uint wg_rows = 16;
    uint global_cols = w;
    uint global_rows = h;

    // Tile extent: the work-group size plus `radius` on both sides.
    uint length_x = wg_cols + 2 * radius;
    uint length_y = wg_rows + 2 * radius;

    uint kernel_id = 1;
    uint next_arg = 0;

    _clSetArgs(kernel_id, next_arg++, d_in);
    _clSetArgs(kernel_id, next_arg++, d_out);
    // Argument 2 carries no host pointer, only a byte size covering one tile
    // (presumably a local-memory scratch tile -- confirm against the kernel).
    _clSetArgs(kernel_id, next_arg++, NULL, length_x * length_y * sizeof(uint));
    _clSetArgs(kernel_id, next_arg++, &w, sizeof(uint));
    _clSetArgs(kernel_id, next_arg++, &h, sizeof(uint));
    _clSetArgs(kernel_id, next_arg++, &radius, sizeof(uint));
    _clSetArgs(kernel_id, next_arg++, &length_x, sizeof(uint));
    _clSetArgs(kernel_id, next_arg++, &length_y, sizeof(uint));

    _clInvokeKernel2D(kernel_id, global_cols, global_rows, wg_cols, wg_rows);
}
//---------------------------------------------------------- //--breadth first search on GPUs //---------------------------------------------------------- void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ int *h_graph_edges, bool *h_graph_mask, bool *h_updating_graph_mask, \ bool *h_graph_visited, int *h_cost) throw(std::string){ //int number_elements = height*width; bool h_over; cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ d_graph_visited, d_cost, d_over; // try{ //--1 transfer data from host to device //printf("initializing\n"); _clInit(); //printf("allocating\n"); d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_mask = _clMalloc(no_of_nodes*sizeof(bool), h_graph_mask); d_updating_graph_mask = _clMalloc(no_of_nodes*sizeof(bool), h_updating_graph_mask); d_graph_visited = _clMalloc(no_of_nodes*sizeof(bool), h_graph_visited); d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); d_over = _clMallocRW(sizeof(bool), &h_over); //printf("copyin\n"); _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(bool), h_graph_mask); _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(bool), h_updating_graph_mask); _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(bool), h_graph_visited); _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); //--2 invoke kernel #ifdef PROFILING timer kernel_timer; double kernel_time = 0.0; kernel_timer.reset(); kernel_timer.start(); #endif int kerId=0; // printf("launching kernel\n"); do{ // printf("copy in\n"); h_over = false; _clMemcpyH2D(d_over, sizeof(bool), &h_over); //--kernel 0 int kernel_id = 0; int kernel_idx = 0; // printf("set arg 1\n"); _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); 
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //int work_items = no_of_nodes; // printf("invoke 1\n"); _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); //--kernel 1 kernel_id = 1; kernel_idx = 0; // printf("set arg 2\n"); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //work_items = no_of_nodes; // printf("invoke 2\n"); _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); // printf("copy back\n"); _clMemcpyD2H(d_over,sizeof(bool), &h_over); // printf("done\n"); // printf("K%d\n",kerId++); }while(h_over); // printf("done!"); _clFinish(); #ifdef PROFILING kernel_timer.stop(); kernel_time = kernel_timer.getTimeInSeconds(); #endif //--3 transfer data from device to host _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); //--statistics #ifdef PROFILING std::cout<<"kernel time(s):"<<kernel_time<<std::endl; #endif //--4 release cl resources. _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); // } // catch(std::string msg){ // _clFree(d_graph_nodes); // _clFree(d_graph_edges); // _clFree(d_graph_mask); // _clFree(d_updating_graph_mask); // _clFree(d_graph_visited); // _clFree(d_cost); // _clFree(d_over); // _clRelease(); // std::string e_str = "in run_transpose_gpu -> "; // e_str += msg; // throw(e_str); // } return ; }
//---------------------------------------------------------- //--breadth first search on GPUs //---------------------------------------------------------- void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ char *h_graph_visited, int *h_cost) throw(std::string){ //int number_elements = height*width; char h_over; cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ d_graph_visited, d_cost, d_over; try{ //--1 transfer data from host to device _clInit(); d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask); d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask); d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited); d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); d_over = _clMallocRW(sizeof(char), &h_over); _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask); _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask); _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited); _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); //--2 invoke kernel #ifdef PROFILING timer kernel_timer; double kernel_time = 0.0; kernel_timer.reset(); kernel_timer.start(); #endif do{ h_over = false; _clMemcpyH2D(d_over, sizeof(char), &h_over); //--kernel 0 int kernel_id = 0; int kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); 
_clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //int work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); //--kernel 1 kernel_id = 1; kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clMemcpyD2H(d_over,sizeof(char), &h_over); }while(h_over); _clFinish(); #ifdef PROFILING kernel_timer.stop(); kernel_time = kernel_timer.getTimeInSeconds(); #endif //--3 transfer data from device to host _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); //--statistics #ifdef PROFILING std::cout<<"kernel time(s):"<<kernel_time<<std::endl; #endif //--4 release cl resources. _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); } catch(std::string msg){ _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); std::string e_str = "in run_transpose_gpu -> "; e_str += msg; throw(e_str); } return ; }
double run_gpu(datatype *h_imatrix_a, datatype *h_imatrix_b, datatype *h_omatrix_c, datatype *h_omatrix_ref, int size, int kernel_id, bool verify) throw(std::string){ int number_elements = size * size; cl_mem d_imatrix_a, d_imatrix_b, d_omatrix_c; try{ //--1 transfer data from host to device d_imatrix_a = _clMalloc(number_elements*sizeof(datatype)); d_imatrix_b = _clMalloc(number_elements*sizeof(datatype)); d_omatrix_c = _clMalloc(number_elements*sizeof(datatype)); _clMemcpyH2D(d_imatrix_a, h_imatrix_a, number_elements*sizeof(datatype)); _clMemcpyH2D(d_imatrix_b, h_imatrix_b, number_elements*sizeof(datatype)); //--2 invoke kernel int iterations = 1; int kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_imatrix_a); _clSetArgs(kernel_id, kernel_idx++, d_imatrix_b); _clSetArgs(kernel_id, kernel_idx++, d_omatrix_c); _clSetArgs(kernel_id, kernel_idx++, &size, sizeof(int)); _clSetArgs(kernel_id, kernel_idx++, &size, sizeof(int)); int work_group_unit = 64; int range_x, range_y; int vf = 1; switch(kernel_id){ case 0: range_x = size, range_y = size; vf = 1; break; case 1: range_x = size/2, range_y = size; vf = 2; break; case 2: range_x = size/4, range_y = size; vf = 4; break; case 3: range_x = size/8, range_y = size; vf = 8; break; case 4: range_x = size/16, range_y = size; vf = 16; break; default: throw("->unknown kernle id->"); break; } int group_x = work_group_unit, group_y = 1; unsigned long deltaT = 0; unsigned long totalT = 0; for(int i=-1; i<iterations; i++){ _clInvokeKernel2D(kernel_id, range_x, range_y, group_x, group_y, &deltaT); if(i==0) totalT += deltaT; } //totalT = totalT/iterations; //--3 transfer data from device to host _clMemcpyD2H(h_omatrix_c,d_omatrix_c, number_elements*sizeof(datatype)); //--statistics //--4 release cl resources. 
if(verify){ verify_array<datatype>(h_omatrix_ref, h_omatrix_c, number_elements); } _clFree(d_imatrix_a); _clFree(d_imatrix_b); _clFree(d_omatrix_c); return totalT*(1e-9); } catch(std::string msg){ std::string e_str = "in run_gpu -> "; e_str += msg; throw(e_str); } return 0.0; }
//---------------------------------------------------------- //--breadth first search on GPUs //---------------------------------------------------------- void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ int *h_graph_edges, int *h_graph_mask, int *h_updating_graph_mask, \ int *h_graph_visited, int *h_cost) throw(std::string){ //int number_elements = height*width; int h_over; cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ d_graph_visited, d_cost, d_over; try{ //--1 transfer data from host to device _clInit(); d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_mask = _clMallocRW(no_of_nodes*sizeof(int), h_graph_mask); d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(int), h_updating_graph_mask); d_graph_visited = _clMallocRW(no_of_nodes*sizeof(int), h_graph_visited); d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); d_over = _clMallocRW(sizeof(int), &h_over); _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(int), h_graph_mask); _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(int), h_updating_graph_mask); _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(int), h_graph_visited); _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); //--2 invoke kernel #ifdef PROFILING timer kernel_timer; double kernel_time = 0.0; kernel_timer.reset(); kernel_timer.start(); #endif struct timespec startT, endT; clock_gettime(CLOCK_MONOTONIC, &startT); do{ h_over = false; _clMemcpyH2D(d_over, sizeof(int), &h_over); //--kernel 0 int kernel_id = 0; int kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); 
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //int work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); //--kernel 1 kernel_id = 1; kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clMemcpyD2H(d_over,sizeof(int), &h_over); }while(h_over); _clFinish(); clock_gettime(CLOCK_MONOTONIC, &endT); uint64_t diff = 1000000000 * (endT.tv_sec - startT.tv_sec); uint64_t nanodiff = endT.tv_nsec - startT.tv_nsec; //printf("elapsed accelerator time = %llu nanoseconds\n", (long long unsigned int) diff); //printf("start time seconds%u \n", startT.tv_sec); //printf("end time seconds %u \n", endT.tv_sec); //printf("difference %u \n", diff); //printf("start time nanoseconds %u \n", startT.tv_nsec); //printf("end time nanoseconds %u \n", endT.tv_nsec); printf(" accelerator time %u \n", nanodiff + diff); #ifdef PROFILING kernel_timer.stop(); kernel_time = kernel_timer.getTimeInSeconds(); #endif //--3 transfer data from device to host _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); //--statistics #ifdef PROFILING std::cout<<"kernel time(s):"<<kernel_time<<std::endl; #endif //--4 release cl resources. 
_clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); } catch(std::string msg){ _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); std::string e_str = "in run_transpose_gpu -> "; e_str += msg; throw(e_str); } return ; }
double run_gpu(datatype *h_i_vector, datatype *h_o_vector, datatype *h_o_vector_ref,\ int w, int h, int kernel_id, bool verify)throw(std::string){ cl_mem d_i_vector, d_o_vector; int number_elements_out = w * h; int number_elements_in = w * h; try{ //--1 transfer data from host to device d_i_vector = _clMalloc(number_elements_in * sizeof(datatype)); d_o_vector = _clMalloc(number_elements_out * sizeof(datatype)); _clMemcpyH2D(d_i_vector, h_i_vector, number_elements_in*sizeof(datatype)); _clFinish(); //--2 invoke kernel int args_idx = 0; _clSetArgs(kernel_id, args_idx++, d_i_vector); _clSetArgs(kernel_id, args_idx++, d_o_vector); _clSetArgs(kernel_id, args_idx++, &w, sizeof(int)); _clSetArgs(kernel_id, args_idx++, &h, sizeof(int)); int work_group_unit = 16; int range_x = -1; int range_y = -1; switch(kernel_id){ case 0: range_x = w, range_y = h; break; case 1: range_x = w/2, range_y = h; break; case 2: range_x = w/4, range_y = h; break; case 3: range_x = w/8, range_y = h; break; case 4: range_x = w/16, range_y = h; break; default: throw(string("Unknown kernel id!!!")); break; } int group_x = work_group_unit * 4; int group_y = 1; int number_iterations = 1; unsigned long deltaT = 0.0f; unsigned long kernel_exe_time = 0.0f; std::cout<<"--testing..."<<std::endl; for(int i=-1; i<number_iterations; i++){ _clInvokeKernel2D(kernel_id, range_x, range_y, group_x, group_y, &kernel_exe_time); if(i==0) deltaT += kernel_exe_time; } deltaT = deltaT/number_iterations; std::cout<<"--done."<<std::endl; _clMemcpyD2H(h_o_vector, d_o_vector, number_elements_out*sizeof(datatype)); if(verify){ verify_array<datatype>(h_o_vector, h_o_vector_ref, number_elements_out); } //--4 release cl resources. _clFree(d_i_vector); _clFree(d_o_vector); return (double)(((double)w*(double)h*(double)(h+1))*sizeof(datatype))/(double)deltaT; } catch(std::string msg){ std::string e_str = "in run_gpu -> "; e_str += msg; throw(e_str); } return 0.0; }