//---------------------------------------------------------- //--breadth first search on GPUs //---------------------------------------------------------- void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ char *h_graph_visited, int *h_cost) throw(std::string){ //int number_elements = height*width; char h_over; cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ d_graph_visited, d_cost, d_over; try{ //--1 transfer data from host to device _clInit(); d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask); d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask); d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited); d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); d_over = _clMallocRW(sizeof(char), &h_over); _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask); _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask); _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited); _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); //--2 invoke kernel #ifdef PROFILING timer kernel_timer; double kernel_time = 0.0; kernel_timer.reset(); kernel_timer.start(); #endif do{ h_over = false; _clMemcpyH2D(d_over, sizeof(char), &h_over); //--kernel 0 int kernel_id = 0; int kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //int work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); //--kernel 1 kernel_id = 1; kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clMemcpyD2H(d_over,sizeof(char), &h_over); }while(h_over); _clFinish(); #ifdef PROFILING kernel_timer.stop(); kernel_time = kernel_timer.getTimeInSeconds(); #endif //--3 transfer data from device to host _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); //--statistics #ifdef PROFILING std::cout<<"kernel time(s):"<<kernel_time<<std::endl; #endif //--4 release cl resources. _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); } catch(std::string msg){ _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); std::string e_str = "in run_transpose_gpu -> "; e_str += msg; throw(e_str); } return ; }
//---------------------------------------------------------- //--breadth first search on GPUs //---------------------------------------------------------- void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ int *h_graph_edges, bool *h_graph_mask, bool *h_updating_graph_mask, \ bool *h_graph_visited, int *h_cost) throw(std::string){ //int number_elements = height*width; bool h_over; cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ d_graph_visited, d_cost, d_over; // try{ //--1 transfer data from host to device //printf("initializing\n"); _clInit(); //printf("allocating\n"); d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_mask = _clMalloc(no_of_nodes*sizeof(bool), h_graph_mask); d_updating_graph_mask = _clMalloc(no_of_nodes*sizeof(bool), h_updating_graph_mask); d_graph_visited = _clMalloc(no_of_nodes*sizeof(bool), h_graph_visited); d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); d_over = _clMallocRW(sizeof(bool), &h_over); //printf("copyin\n"); _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(bool), h_graph_mask); _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(bool), h_updating_graph_mask); _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(bool), h_graph_visited); _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); //--2 invoke kernel #ifdef PROFILING timer kernel_timer; double kernel_time = 0.0; kernel_timer.reset(); kernel_timer.start(); #endif int kerId=0; // printf("launching kernel\n"); do{ // printf("copy in\n"); h_over = false; _clMemcpyH2D(d_over, sizeof(bool), &h_over); //--kernel 0 int kernel_id = 0; int kernel_idx = 0; // printf("set arg 1\n"); _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //int work_items = no_of_nodes; // printf("invoke 1\n"); _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); //--kernel 1 kernel_id = 1; kernel_idx = 0; // printf("set arg 2\n"); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //work_items = no_of_nodes; // printf("invoke 2\n"); _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); // printf("copy back\n"); _clMemcpyD2H(d_over,sizeof(bool), &h_over); // printf("done\n"); // printf("K%d\n",kerId++); }while(h_over); // printf("done!"); _clFinish(); #ifdef PROFILING kernel_timer.stop(); kernel_time = kernel_timer.getTimeInSeconds(); #endif //--3 transfer data from device to host _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); //--statistics #ifdef PROFILING std::cout<<"kernel time(s):"<<kernel_time<<std::endl; #endif //--4 release cl resources. _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); // } // catch(std::string msg){ // _clFree(d_graph_nodes); // _clFree(d_graph_edges); // _clFree(d_graph_mask); // _clFree(d_updating_graph_mask); // _clFree(d_graph_visited); // _clFree(d_cost); // _clFree(d_over); // _clRelease(); // std::string e_str = "in run_transpose_gpu -> "; // e_str += msg; // throw(e_str); // } return ; }
//---------------------------------------------------------- //--breadth first search on GPUs //---------------------------------------------------------- void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ int *h_graph_edges, int *h_graph_mask, int *h_updating_graph_mask, \ int *h_graph_visited, int *h_cost) throw(std::string){ //int number_elements = height*width; int h_over; cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ d_graph_visited, d_cost, d_over; try{ //--1 transfer data from host to device _clInit(); d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_mask = _clMallocRW(no_of_nodes*sizeof(int), h_graph_mask); d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(int), h_updating_graph_mask); d_graph_visited = _clMallocRW(no_of_nodes*sizeof(int), h_graph_visited); d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); d_over = _clMallocRW(sizeof(int), &h_over); _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(int), h_graph_mask); _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(int), h_updating_graph_mask); _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(int), h_graph_visited); _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); //--2 invoke kernel #ifdef PROFILING timer kernel_timer; double kernel_time = 0.0; kernel_timer.reset(); kernel_timer.start(); #endif struct timespec startT, endT; clock_gettime(CLOCK_MONOTONIC, &startT); do{ h_over = false; _clMemcpyH2D(d_over, sizeof(int), &h_over); //--kernel 0 int kernel_id = 0; int kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //int work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); //--kernel 1 kernel_id = 1; kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clMemcpyD2H(d_over,sizeof(int), &h_over); }while(h_over); _clFinish(); clock_gettime(CLOCK_MONOTONIC, &endT); uint64_t diff = 1000000000 * (endT.tv_sec - startT.tv_sec); uint64_t nanodiff = endT.tv_nsec - startT.tv_nsec; //printf("elapsed accelerator time = %llu nanoseconds\n", (long long unsigned int) diff); //printf("start time seconds%u \n", startT.tv_sec); //printf("end time seconds %u \n", endT.tv_sec); //printf("difference %u \n", diff); //printf("start time nanoseconds %u \n", startT.tv_nsec); //printf("end time nanoseconds %u \n", endT.tv_nsec); printf(" accelerator time %u \n", nanodiff + diff); #ifdef PROFILING kernel_timer.stop(); kernel_time = kernel_timer.getTimeInSeconds(); #endif //--3 transfer data from device to host _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); //--statistics #ifdef PROFILING std::cout<<"kernel time(s):"<<kernel_time<<std::endl; #endif //--4 release cl resources. _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); } catch(std::string msg){ _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); std::string e_str = "in run_transpose_gpu -> "; e_str += msg; throw(e_str); } return ; }