//---------------------------------------------------------- //--breadth first search on GPUs //---------------------------------------------------------- void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ int *h_graph_edges, bool *h_graph_mask, bool *h_updating_graph_mask, \ bool *h_graph_visited, int *h_cost) throw(std::string){ //int number_elements = height*width; bool h_over; cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ d_graph_visited, d_cost, d_over; // try{ //--1 transfer data from host to device //printf("initializing\n"); _clInit(); //printf("allocating\n"); d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_mask = _clMalloc(no_of_nodes*sizeof(bool), h_graph_mask); d_updating_graph_mask = _clMalloc(no_of_nodes*sizeof(bool), h_updating_graph_mask); d_graph_visited = _clMalloc(no_of_nodes*sizeof(bool), h_graph_visited); d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); d_over = _clMallocRW(sizeof(bool), &h_over); //printf("copyin\n"); _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(bool), h_graph_mask); _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(bool), h_updating_graph_mask); _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(bool), h_graph_visited); _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); //--2 invoke kernel #ifdef PROFILING timer kernel_timer; double kernel_time = 0.0; kernel_timer.reset(); kernel_timer.start(); #endif int kerId=0; // printf("launching kernel\n"); do{ // printf("copy in\n"); h_over = false; _clMemcpyH2D(d_over, sizeof(bool), &h_over); //--kernel 0 int kernel_id = 0; int kernel_idx = 0; // printf("set arg 1\n"); _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); 
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //int work_items = no_of_nodes; // printf("invoke 1\n"); _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); //--kernel 1 kernel_id = 1; kernel_idx = 0; // printf("set arg 2\n"); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //work_items = no_of_nodes; // printf("invoke 2\n"); _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); // printf("copy back\n"); _clMemcpyD2H(d_over,sizeof(bool), &h_over); // printf("done\n"); // printf("K%d\n",kerId++); }while(h_over); // printf("done!"); _clFinish(); #ifdef PROFILING kernel_timer.stop(); kernel_time = kernel_timer.getTimeInSeconds(); #endif //--3 transfer data from device to host _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); //--statistics #ifdef PROFILING std::cout<<"kernel time(s):"<<kernel_time<<std::endl; #endif //--4 release cl resources. _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); // } // catch(std::string msg){ // _clFree(d_graph_nodes); // _clFree(d_graph_edges); // _clFree(d_graph_mask); // _clFree(d_updating_graph_mask); // _clFree(d_graph_visited); // _clFree(d_cost); // _clFree(d_over); // _clRelease(); // std::string e_str = "in run_transpose_gpu -> "; // e_str += msg; // throw(e_str); // } return ; }
int main(int argc, char ** argv) { uint * in = NULL, * out_cpu = NULL, * out_gpu = NULL; cl_mem d_in = NULL, d_out = NULL; try{ if(argc!=2){ printf("need 1 parameter here!!!"); exit(-1); } _clInit(1, "gpu", 0); uint iter = 100; #if defined TIME double start_time = 0.0; double end_time = 0.0; double deltaT = 0.0; string dat_name="data.dat"; FILE * fp = fopen(dat_name.c_str(), "a+"); if(fp==NULL) { printf("failed to open file!!!\n"); exit(-1); } #endif // parameters uint side = atoi(argv[1]); uint wData = side; uint hData = side; uint size = wData * hData; printf("wData=%d, hData=%d\n", wData, hData); // allocate memory space on the host and device side in = (uint * )malloc(size * sizeof(uint)); out_cpu = (uint * )malloc(size * sizeof(uint)); out_gpu = (uint * )malloc(size * sizeof(uint)); d_in = _clMalloc(size * sizeof(uint)); d_out = _clMalloc(size * sizeof(uint)); // initialization fill<uint>(in, size, 16); // copy data from host to device _clMemcpyH2D(d_in, in, size * sizeof(uint)); // warm-up mt_1(d_in, d_out, wData, hData); mt_2(d_in, d_out, wData, hData); mt_3(d_in, d_out, wData, hData); #ifdef VARIFY CPURun(in, out_cpu, wData, hData); #endif //VARIFY /**************************1****************************/ #ifdef TIME deltaT = 0.0; #endif for(int i=0; i<iter; i++) { #ifdef TIME start_time = gettime(); #endif mt_1(d_in, d_out, wData, hData); #ifdef TIME end_time = gettime(); deltaT += end_time - start_time; #endif } #ifdef TIME fprintf(fp, "%lf\t", deltaT/(double)iter); #endif #ifdef VARIFY _clMemcpyD2H(out_gpu, d_out, size * sizeof(uint)); verify_array_int<uint>(out_cpu, out_gpu, size); #endif //VARIFY /**************************2****************************/ #ifdef TIME deltaT = 0.0; #endif for(int i=0; i<iter; i++) { #ifdef TIME start_time = gettime(); #endif mt_2(d_in, d_out, wData, hData); #ifdef TIME end_time = gettime(); deltaT += end_time - start_time; #endif } #ifdef TIME fprintf(fp, "%lf\t", deltaT/(double)iter); #endif #ifdef VARIFY 
_clMemcpyD2H(out_gpu, d_out, size * sizeof(uint)); verify_array_int<uint>(out_cpu, out_gpu, size); #endif //VARIFY /**************************3****************************/ #ifdef TIME deltaT = 0.0; #endif for(int i=0; i<iter; i++) { #ifdef TIME start_time = gettime(); #endif mt_3(d_in, d_out, wData, hData); #ifdef TIME end_time = gettime(); deltaT += end_time - start_time; #endif } #ifdef TIME fprintf(fp, "%lf\t", deltaT/(double)iter); #endif #ifdef VARIFY _clMemcpyD2H(out_gpu, d_out, size * sizeof(uint)); verify_array_int<uint>(out_cpu, out_gpu, size); #endif //VARIFY #ifdef TIME fprintf(fp, "\n"); fclose(fp); #endif } catch(string msg){ printf("ERR:%s\n", msg.c_str()); printf("Error catched\n"); } _clFree(d_in); _clFree(d_out); _clRelease(); if(in!=NULL) free(in); if(out_cpu!=NULL) free(out_cpu); if(out_gpu!=NULL) free(out_gpu); return 1; }
//---------------------------------------------------------- //--breadth first search on GPUs //---------------------------------------------------------- void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ char *h_graph_visited, int *h_cost) throw(std::string){ //int number_elements = height*width; char h_over; cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ d_graph_visited, d_cost, d_over; try{ //--1 transfer data from host to device _clInit(); d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask); d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask); d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited); d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); d_over = _clMallocRW(sizeof(char), &h_over); _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask); _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask); _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited); _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); //--2 invoke kernel #ifdef PROFILING timer kernel_timer; double kernel_time = 0.0; kernel_timer.reset(); kernel_timer.start(); #endif do{ h_over = false; _clMemcpyH2D(d_over, sizeof(char), &h_over); //--kernel 0 int kernel_id = 0; int kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); 
_clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //int work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); //--kernel 1 kernel_id = 1; kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clMemcpyD2H(d_over,sizeof(char), &h_over); }while(h_over); _clFinish(); #ifdef PROFILING kernel_timer.stop(); kernel_time = kernel_timer.getTimeInSeconds(); #endif //--3 transfer data from device to host _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); //--statistics #ifdef PROFILING std::cout<<"kernel time(s):"<<kernel_time<<std::endl; #endif //--4 release cl resources. _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); } catch(std::string msg){ _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); std::string e_str = "in run_transpose_gpu -> "; e_str += msg; throw(e_str); } return ; }
double run_gpu(datatype *h_imatrix_a, datatype *h_imatrix_b, datatype *h_omatrix_c, datatype *h_omatrix_ref, int size, int kernel_id, bool verify) throw(std::string){ int number_elements = size * size; cl_mem d_imatrix_a, d_imatrix_b, d_omatrix_c; try{ //--1 transfer data from host to device d_imatrix_a = _clMalloc(number_elements*sizeof(datatype)); d_imatrix_b = _clMalloc(number_elements*sizeof(datatype)); d_omatrix_c = _clMalloc(number_elements*sizeof(datatype)); _clMemcpyH2D(d_imatrix_a, h_imatrix_a, number_elements*sizeof(datatype)); _clMemcpyH2D(d_imatrix_b, h_imatrix_b, number_elements*sizeof(datatype)); //--2 invoke kernel int iterations = 1; int kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_imatrix_a); _clSetArgs(kernel_id, kernel_idx++, d_imatrix_b); _clSetArgs(kernel_id, kernel_idx++, d_omatrix_c); _clSetArgs(kernel_id, kernel_idx++, &size, sizeof(int)); _clSetArgs(kernel_id, kernel_idx++, &size, sizeof(int)); int work_group_unit = 64; int range_x, range_y; int vf = 1; switch(kernel_id){ case 0: range_x = size, range_y = size; vf = 1; break; case 1: range_x = size/2, range_y = size; vf = 2; break; case 2: range_x = size/4, range_y = size; vf = 4; break; case 3: range_x = size/8, range_y = size; vf = 8; break; case 4: range_x = size/16, range_y = size; vf = 16; break; default: throw("->unknown kernle id->"); break; } int group_x = work_group_unit, group_y = 1; unsigned long deltaT = 0; unsigned long totalT = 0; for(int i=-1; i<iterations; i++){ _clInvokeKernel2D(kernel_id, range_x, range_y, group_x, group_y, &deltaT); if(i==0) totalT += deltaT; } //totalT = totalT/iterations; //--3 transfer data from device to host _clMemcpyD2H(h_omatrix_c,d_omatrix_c, number_elements*sizeof(datatype)); //--statistics //--4 release cl resources. 
if(verify){ verify_array<datatype>(h_omatrix_ref, h_omatrix_c, number_elements); } _clFree(d_imatrix_a); _clFree(d_imatrix_b); _clFree(d_omatrix_c); return totalT*(1e-9); } catch(std::string msg){ std::string e_str = "in run_gpu -> "; e_str += msg; throw(e_str); } return 0.0; }
int main(int argc, char ** argv) { cl_mem out = NULL; try{ if(argc!=3){ printf("need 2 parameter here!!!\n"); exit(-1); } #if defined TIME double start_time = 0; double end_time = 0; string dat_name="data.dat"; FILE * fp = fopen(dat_name.c_str(), "a+"); if(fp==NULL) { printf("failed to open file!!!\n"); exit(-1); } #endif uint bins = atoi(argv[1]); uint size = atoi(argv[2]); uint iter = 100; printf("bins=%d, size=%d\n", bins, size); _clInit(1, "gpu", 0); out = _clMalloc((size/BS)*bins); layout_cyclic(out, bins, size); /**************************1****************************/ #ifdef TIME start_time = gettime(); #endif for(int i=1; i<iter; i++) { layout_blocked(out, bins, size); } #ifdef TIME end_time = gettime(); fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter); #endif /**************************2****************************/ #ifdef TIME start_time = gettime(); #endif for(int i=1; i<iter; i++) { layout_cyclic(out, bins, size); } #ifdef TIME end_time = gettime(); fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter); #endif /**************************3****************************/ #ifdef TIME start_time = gettime(); #endif for(int i=1; i<iter; i++) { layout_cyclic_2(out, bins, size); } #ifdef TIME end_time = gettime(); fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter); #endif #ifdef TIME fprintf(fp, "\n"); fclose(fp); #endif } catch(string msg){ printf("ERR:%s\n", msg.c_str()); printf("Error catched\n"); } _clFree(out); _clRelease(); return 1; }
//---------------------------------------------------------- //--breadth first search on GPUs //---------------------------------------------------------- void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ int *h_graph_edges, int *h_graph_mask, int *h_updating_graph_mask, \ int *h_graph_visited, int *h_cost) throw(std::string){ //int number_elements = height*width; int h_over; cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ d_graph_visited, d_cost, d_over; try{ //--1 transfer data from host to device _clInit(); d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_mask = _clMallocRW(no_of_nodes*sizeof(int), h_graph_mask); d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(int), h_updating_graph_mask); d_graph_visited = _clMallocRW(no_of_nodes*sizeof(int), h_graph_visited); d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); d_over = _clMallocRW(sizeof(int), &h_over); _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(int), h_graph_mask); _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(int), h_updating_graph_mask); _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(int), h_graph_visited); _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); //--2 invoke kernel #ifdef PROFILING timer kernel_timer; double kernel_time = 0.0; kernel_timer.reset(); kernel_timer.start(); #endif struct timespec startT, endT; clock_gettime(CLOCK_MONOTONIC, &startT); do{ h_over = false; _clMemcpyH2D(d_over, sizeof(int), &h_over); //--kernel 0 int kernel_id = 0; int kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); 
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //int work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); //--kernel 1 kernel_id = 1; kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clMemcpyD2H(d_over,sizeof(int), &h_over); }while(h_over); _clFinish(); clock_gettime(CLOCK_MONOTONIC, &endT); uint64_t diff = 1000000000 * (endT.tv_sec - startT.tv_sec); uint64_t nanodiff = endT.tv_nsec - startT.tv_nsec; //printf("elapsed accelerator time = %llu nanoseconds\n", (long long unsigned int) diff); //printf("start time seconds%u \n", startT.tv_sec); //printf("end time seconds %u \n", endT.tv_sec); //printf("difference %u \n", diff); //printf("start time nanoseconds %u \n", startT.tv_nsec); //printf("end time nanoseconds %u \n", endT.tv_nsec); printf(" accelerator time %u \n", nanodiff + diff); #ifdef PROFILING kernel_timer.stop(); kernel_time = kernel_timer.getTimeInSeconds(); #endif //--3 transfer data from device to host _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); //--statistics #ifdef PROFILING std::cout<<"kernel time(s):"<<kernel_time<<std::endl; #endif //--4 release cl resources. 
_clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); } catch(std::string msg){ _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); std::string e_str = "in run_transpose_gpu -> "; e_str += msg; throw(e_str); } return ; }
int main(int argc, char ** argv) { float * h_raw, * h_out, * outCPU; cl_mem d_raw, d_out; try { if(argc!=2) { printf("need one parameter here!!!"); exit(-1); } _clInit(1, "gpu", 0); #if defined TIME double start_time = 0; double end_time = 0; string dat_name="data.dat"; FILE * fp = fopen(dat_name.c_str(), "a+"); if(fp==NULL) { printf("failed to open file!!!\n"); exit(-1); } #endif int cdim = atoi(argv[1]); //{384}; int rdim = atoi(argv[1]); //{288}; printf("cdim=%d, rdim=%d\n", cdim, rdim); h_raw = (float *)malloc(cdim * rdim * sizeof(float)); h_out = (float *)malloc(cdim * rdim * sizeof(float)); outCPU = (float *)malloc(cdim * rdim * sizeof(float)); fill<float>(h_raw, cdim * rdim, 5); d_raw = _clMalloc(cdim * rdim * sizeof(float)); d_out = _clMalloc(cdim * rdim * sizeof(float)); _clMemcpyH2D(d_raw, h_raw, cdim * rdim * sizeof(float)); printf("-0\n"); #ifdef VARIFY CPURun(h_raw, outCPU, cdim, rdim); #endif //VARIFY /**************************1****************************/ #ifdef TIME start_time = gettime(); #endif printf("-1.1\n"); broadcast(d_raw, d_out, cdim, rdim); printf("-1.2\n"); #ifdef TIME end_time = gettime(); fprintf(fp, "%lf\t", (end_time - start_time)); #endif #ifdef VARIFY _clMemcpyD2H(h_out, d_out, cdim * rdim * sizeof(float)); verify_array<float>(outCPU, h_out, cdim * rdim); #endif //VARIFY /**************************2****************************/ #ifdef TIME start_time = gettime(); #endif broadcast_lm(d_raw, d_out, cdim, rdim); printf("-2\n"); #ifdef TIME end_time = gettime(); fprintf(fp, "%lf\t", (end_time - start_time)); #endif #ifdef VARIFY _clMemcpyD2H(h_out, d_out, cdim * rdim * sizeof(float)); verify_array<float>(outCPU, h_out, cdim * rdim); #endif //VARIFY #ifdef TIME fprintf(fp, "\n"); fclose(fp); #endif } catch(string msg) { printf("ERR:%s\n", msg.c_str()); printf("Error catched\n"); } _clFree(d_raw); _clFree(d_out); _clRelease(); if(h_raw!=NULL) free(h_raw); if(h_out!=NULL) free(h_out); if(outCPU!=NULL) free(outCPU); return 1; }
double run_gpu(datatype *h_i_vector, datatype *h_o_vector, datatype *h_o_vector_ref,\ int w, int h, int kernel_id, bool verify)throw(std::string){ cl_mem d_i_vector, d_o_vector; int number_elements_out = w * h; int number_elements_in = w * h; try{ //--1 transfer data from host to device d_i_vector = _clMalloc(number_elements_in * sizeof(datatype)); d_o_vector = _clMalloc(number_elements_out * sizeof(datatype)); _clMemcpyH2D(d_i_vector, h_i_vector, number_elements_in*sizeof(datatype)); _clFinish(); //--2 invoke kernel int args_idx = 0; _clSetArgs(kernel_id, args_idx++, d_i_vector); _clSetArgs(kernel_id, args_idx++, d_o_vector); _clSetArgs(kernel_id, args_idx++, &w, sizeof(int)); _clSetArgs(kernel_id, args_idx++, &h, sizeof(int)); int work_group_unit = 16; int range_x = -1; int range_y = -1; switch(kernel_id){ case 0: range_x = w, range_y = h; break; case 1: range_x = w/2, range_y = h; break; case 2: range_x = w/4, range_y = h; break; case 3: range_x = w/8, range_y = h; break; case 4: range_x = w/16, range_y = h; break; default: throw(string("Unknown kernel id!!!")); break; } int group_x = work_group_unit * 4; int group_y = 1; int number_iterations = 1; unsigned long deltaT = 0.0f; unsigned long kernel_exe_time = 0.0f; std::cout<<"--testing..."<<std::endl; for(int i=-1; i<number_iterations; i++){ _clInvokeKernel2D(kernel_id, range_x, range_y, group_x, group_y, &kernel_exe_time); if(i==0) deltaT += kernel_exe_time; } deltaT = deltaT/number_iterations; std::cout<<"--done."<<std::endl; _clMemcpyD2H(h_o_vector, d_o_vector, number_elements_out*sizeof(datatype)); if(verify){ verify_array<datatype>(h_o_vector, h_o_vector_ref, number_elements_out); } //--4 release cl resources. _clFree(d_i_vector); _clFree(d_o_vector); return (double)(((double)w*(double)h*(double)(h+1))*sizeof(datatype))/(double)deltaT; } catch(std::string msg){ std::string e_str = "in run_gpu -> "; e_str += msg; throw(e_str); } return 0.0; }
// Benchmark driver template.
//
// NOTE(review): the tokens starting with '@' (@hIn, @dIn, @cdimIn, @rdimIn,
// @elems, @hAlc, @hFill, @dAlc, @h2dTrans, @oclArgs, @ompArgs, @clFree,
// @hFree) are not C++; presumably a generator substitutes them before this
// file is compiled - confirm against the build scripts. The commented-out
// hIn1/hIn2/dIn1/dIn2 lines next to each placeholder look like an example of
// what the expansion is expected to produce.
//
// Command line as read here: argv[1] is used for both cdim and rdim, argv[2]
// becomes the suffix of the data file name, argv[3] is the radius r.
// NOTE(review): argc is not checked in this function; presumably
// _clParseCommandLine() validates it - verify.
//
// With TIME defined, OCLRun() is timed and dataAmount * cnt / delta_time is
// appended to "data.<suffix>.dat". With VARIFY defined, the device output is
// compared with the OMPRun() result.
int main(int argc, char ** argv)
{
    //float *hIn1, *hIn2;
    //cl_mem dIn1, dIn2;
    @hIn;
    @dIn;
    float *hOut, *rOut;
    cl_mem dOut;

    try{
        _clParseCommandLine(argc, argv);
        string strSubfix = string(argv[2]);
        _clInit(platform_id, device_type, device_id);

        int cdim = atoi(argv[1]);
        int rdim = atoi(argv[1]);
        int r = atoi(argv[3]);
        @cdimIn
        @rdimIn // different between iMAP1 and iMAP2
        printf("cdim=%d, rdim=%d, radius=%d\n", cdim, rdim, r);
        int iIter = 10;
        int elems = @elems;
        // Value written to the data file is dataAmount * cnt / delta_time.
        double dataAmount = (double)cdim * (double)rdim * (double)(elems) * (double)sizeof(float) * 1e-6;

#if defined TIME
        double start_time = 0;
        double end_time = 0;
        double delta_time = 0;
        int cnt = 0;
        string dat_name= string("data.") + strSubfix + string(".dat");

        FILE * fp = fopen(dat_name.c_str(), "a+");
        if(fp==NULL)
        {
            printf("failed to open file!!!\n");
            exit(-1);
        }
#endif

        //hIn1 = (float *)malloc(cdim * rdim * sizeof(float));
        //hIn2 = (float *)malloc(cdim * rdim * sizeof(float));
        @hAlc
        hOut = (float *)malloc(cdim * rdim * sizeof(float));
        rOut = (float *)malloc(cdim * rdim * sizeof(float));
        //fill<float>(hIn1, cdim * rdim, 5);
        //fill<float>(hIn2, cdim * rdim, 5);
        @hFill

        //dIn1 = _clMalloc(cdim * rdim * sizeof(float));
        //dIn2 = _clMalloc(cdim * rdim * sizeof(float));
        @dAlc
        dOut = _clMalloc(cdim * rdim * sizeof(float));
        //_clMemcpyH2D(dIn1, hIn1, cdim * rdim * sizeof(float));
        //_clMemcpyH2D(dIn2, hIn2, cdim * rdim * sizeof(float));
        @h2dTrans
        _clFinish();

        // warmup
        //OCLRun(dIn1, dIn2, dOut, cdim, rdim);
        OCLRun(@oclArgs, dOut, cdim, rdim, cdimIn, rdimIn);

#ifdef VARIFY
        //OMPRun(hIn1, hIn2, rOut, cdim, rdim);
        OMPRun(@ompArgs, rOut, cdim, rdim, cdimIn, rdimIn);
#endif //VARIFY

#ifdef TIME
        delta_time = 0;
        cnt = 0;
#endif
        for(int i=0; i<iIter; i++)
        {
#ifdef TIME
            cnt++;
            start_time = gettime();
#endif
            OCLRun(@oclArgs, dOut, cdim, rdim, cdimIn, rdimIn);
#ifdef TIME
            end_time = gettime();
            delta_time += end_time - start_time;
            // NOTE(review): this leaves the loop whenever delta_time is NOT
            // within 0.1 of 600000.0, i.e. normally after the first
            // iteration. The original author marked it "????"; the intended
            // stop condition cannot be determined from this file.
            if(fabs(delta_time-600000.0)>0.1) break; // ????
#endif
        }
#ifdef TIME
        fprintf(fp, "%lf\t", dataAmount * (double)cnt/delta_time);
#endif

#ifdef VARIFY
        _clMemcpyD2H(hOut, dOut, cdim * rdim * sizeof(float));
        verify_array<float>(rOut, hOut, cdim * rdim);
#endif //VARIFY

#ifdef TIME
        fprintf(fp, "\n");
        fclose(fp);
#endif
    }
    catch(string msg){
        printf("ERR:%s\n", msg.c_str());
        printf("Error catched\n");
        // Terminates here, so the cleanup below only runs on the normal path.
        exit(-1);
    }

    //_clFree(dIn1);
    //_clFree(dIn2);
    @clFree
    _clFree(dOut);
    _clRelease();
    //if(hIn1!=NULL) free(hIn1);
    //if(hIn2!=NULL) free(hIn2);
    @hFree
    if(hOut!=NULL) free(hOut);
    if(rOut!=NULL) free(rOut);

    // NOTE(review): returns 1 even after a successful run.
    return 1;
}
int main(int argc, char ** argv) { uint * h_in = NULL, * h_out_1 = NULL, * h_out_2 = NULL, * h_out = NULL; cl_mem d_in = NULL, d_out_1 = NULL, d_out_2 = NULL; try{ if(argc!=3){ printf("need 2 parameter here!!!\n"); exit(-1); } #if defined TIME double start_time = 0; double end_time = 0; string dat_name="data.dat"; FILE * fp = fopen(dat_name.c_str(), "a+"); if(fp==NULL) { printf("failed to open file!!!\n"); exit(-1); } #endif uint w, h; uint side = atoi(argv[1]); w = side, h = side; uint size = w * h; uint radius = atoi(argv[2]); uint iter = 100; printf("w=%d, h=%d, radius=%d\n", w, h, radius); _clInit(1, "gpu", 0); h_in = (uint *)malloc(size * sizeof(uint)); h_out_1 = (uint *)malloc(size * sizeof(uint)); h_out_2 = (uint *)malloc(size * sizeof(uint)); h_out = (uint *)malloc(size * sizeof(uint)); d_in = _clMalloc(size * sizeof(uint)); d_out_1 = _clMalloc(size * sizeof(uint)); d_out_2 = _clMalloc(size * sizeof(uint)); fill<uint>(h_in, size, 10); _clMemcpyH2D(d_in, h_in, size * sizeof(uint)); //g2l_CPU(h_in, h_out, w, h, radius); /**************************1****************************/ #ifdef TIME start_time = gettime(); #endif for(int i=1; i<iter; i++) { g2l_TBT(d_in, d_out_1, radius, w, h); } #ifdef TIME end_time = gettime(); fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter); #endif #ifdef VARI _clMemcpyD2H(h_out_1, d_out_1, size * sizeof(uint)); #endif /**************************2****************************/ #ifdef TIME start_time = gettime(); #endif for(int i=1; i<iter; i++) { g2l_FCTH(d_in, d_out_2, radius, w, h); } #ifdef TIME end_time = gettime(); fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter); #endif #ifdef VARI _clMemcpyD2H(h_out_2, d_out_2, size * sizeof(uint)); verify_array_int<uint>(h_out_1, h_out_2, w, h); #endif #ifdef TIME fprintf(fp, "\n"); fclose(fp); #endif } catch(string msg){ printf("ERR:%s\n", msg.c_str()); printf("Error catched\n"); } _clFree(d_in); _clFree(d_out_1); _clFree(d_out_2); _clRelease(); if(h_in!=NULL) free(h_in); 
if(h_out_1!=NULL) free(h_out_1); if(h_out_2!=NULL) free(h_out_2); if(h_out!=NULL) free(h_out); return 1; }