//---------------------------------------------------------- //--breadth first search on GPUs //---------------------------------------------------------- void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ int *h_graph_edges, bool *h_graph_mask, bool *h_updating_graph_mask, \ bool *h_graph_visited, int *h_cost) throw(std::string){ //int number_elements = height*width; bool h_over; cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ d_graph_visited, d_cost, d_over; // try{ //--1 transfer data from host to device //printf("initializing\n"); _clInit(); //printf("allocating\n"); d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_mask = _clMalloc(no_of_nodes*sizeof(bool), h_graph_mask); d_updating_graph_mask = _clMalloc(no_of_nodes*sizeof(bool), h_updating_graph_mask); d_graph_visited = _clMalloc(no_of_nodes*sizeof(bool), h_graph_visited); d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); d_over = _clMallocRW(sizeof(bool), &h_over); //printf("copyin\n"); _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(bool), h_graph_mask); _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(bool), h_updating_graph_mask); _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(bool), h_graph_visited); _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); //--2 invoke kernel #ifdef PROFILING timer kernel_timer; double kernel_time = 0.0; kernel_timer.reset(); kernel_timer.start(); #endif int kerId=0; // printf("launching kernel\n"); do{ // printf("copy in\n"); h_over = false; _clMemcpyH2D(d_over, sizeof(bool), &h_over); //--kernel 0 int kernel_id = 0; int kernel_idx = 0; // printf("set arg 1\n"); _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); 
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //int work_items = no_of_nodes; // printf("invoke 1\n"); _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); //--kernel 1 kernel_id = 1; kernel_idx = 0; // printf("set arg 2\n"); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //work_items = no_of_nodes; // printf("invoke 2\n"); _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); // printf("copy back\n"); _clMemcpyD2H(d_over,sizeof(bool), &h_over); // printf("done\n"); // printf("K%d\n",kerId++); }while(h_over); // printf("done!"); _clFinish(); #ifdef PROFILING kernel_timer.stop(); kernel_time = kernel_timer.getTimeInSeconds(); #endif //--3 transfer data from device to host _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); //--statistics #ifdef PROFILING std::cout<<"kernel time(s):"<<kernel_time<<std::endl; #endif //--4 release cl resources. _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); // } // catch(std::string msg){ // _clFree(d_graph_nodes); // _clFree(d_graph_edges); // _clFree(d_graph_mask); // _clFree(d_updating_graph_mask); // _clFree(d_graph_visited); // _clFree(d_cost); // _clFree(d_over); // _clRelease(); // std::string e_str = "in run_transpose_gpu -> "; // e_str += msg; // throw(e_str); // } return ; }
int main(int argc, char ** argv) { uint * in = NULL, * out_cpu = NULL, * out_gpu = NULL; cl_mem d_in = NULL, d_out = NULL; try{ if(argc!=2){ printf("need 1 parameter here!!!"); exit(-1); } _clInit(1, "gpu", 0); uint iter = 100; #if defined TIME double start_time = 0.0; double end_time = 0.0; double deltaT = 0.0; string dat_name="data.dat"; FILE * fp = fopen(dat_name.c_str(), "a+"); if(fp==NULL) { printf("failed to open file!!!\n"); exit(-1); } #endif // parameters uint side = atoi(argv[1]); uint wData = side; uint hData = side; uint size = wData * hData; printf("wData=%d, hData=%d\n", wData, hData); // allocate memory space on the host and device side in = (uint * )malloc(size * sizeof(uint)); out_cpu = (uint * )malloc(size * sizeof(uint)); out_gpu = (uint * )malloc(size * sizeof(uint)); d_in = _clMalloc(size * sizeof(uint)); d_out = _clMalloc(size * sizeof(uint)); // initialization fill<uint>(in, size, 16); // copy data from host to device _clMemcpyH2D(d_in, in, size * sizeof(uint)); // warm-up mt_1(d_in, d_out, wData, hData); mt_2(d_in, d_out, wData, hData); mt_3(d_in, d_out, wData, hData); #ifdef VARIFY CPURun(in, out_cpu, wData, hData); #endif //VARIFY /**************************1****************************/ #ifdef TIME deltaT = 0.0; #endif for(int i=0; i<iter; i++) { #ifdef TIME start_time = gettime(); #endif mt_1(d_in, d_out, wData, hData); #ifdef TIME end_time = gettime(); deltaT += end_time - start_time; #endif } #ifdef TIME fprintf(fp, "%lf\t", deltaT/(double)iter); #endif #ifdef VARIFY _clMemcpyD2H(out_gpu, d_out, size * sizeof(uint)); verify_array_int<uint>(out_cpu, out_gpu, size); #endif //VARIFY /**************************2****************************/ #ifdef TIME deltaT = 0.0; #endif for(int i=0; i<iter; i++) { #ifdef TIME start_time = gettime(); #endif mt_2(d_in, d_out, wData, hData); #ifdef TIME end_time = gettime(); deltaT += end_time - start_time; #endif } #ifdef TIME fprintf(fp, "%lf\t", deltaT/(double)iter); #endif #ifdef VARIFY 
_clMemcpyD2H(out_gpu, d_out, size * sizeof(uint)); verify_array_int<uint>(out_cpu, out_gpu, size); #endif //VARIFY /**************************3****************************/ #ifdef TIME deltaT = 0.0; #endif for(int i=0; i<iter; i++) { #ifdef TIME start_time = gettime(); #endif mt_3(d_in, d_out, wData, hData); #ifdef TIME end_time = gettime(); deltaT += end_time - start_time; #endif } #ifdef TIME fprintf(fp, "%lf\t", deltaT/(double)iter); #endif #ifdef VARIFY _clMemcpyD2H(out_gpu, d_out, size * sizeof(uint)); verify_array_int<uint>(out_cpu, out_gpu, size); #endif //VARIFY #ifdef TIME fprintf(fp, "\n"); fclose(fp); #endif } catch(string msg){ printf("ERR:%s\n", msg.c_str()); printf("Error catched\n"); } _clFree(d_in); _clFree(d_out); _clRelease(); if(in!=NULL) free(in); if(out_cpu!=NULL) free(out_cpu); if(out_gpu!=NULL) free(out_gpu); return 1; }
//---------------------------------------------------------- //--breadth first search on GPUs //---------------------------------------------------------- void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ char *h_graph_visited, int *h_cost) throw(std::string){ //int number_elements = height*width; char h_over; cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ d_graph_visited, d_cost, d_over; try{ //--1 transfer data from host to device _clInit(); d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask); d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask); d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited); d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); d_over = _clMallocRW(sizeof(char), &h_over); _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask); _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask); _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited); _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); //--2 invoke kernel #ifdef PROFILING timer kernel_timer; double kernel_time = 0.0; kernel_timer.reset(); kernel_timer.start(); #endif do{ h_over = false; _clMemcpyH2D(d_over, sizeof(char), &h_over); //--kernel 0 int kernel_id = 0; int kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); 
_clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //int work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); //--kernel 1 kernel_id = 1; kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clMemcpyD2H(d_over,sizeof(char), &h_over); }while(h_over); _clFinish(); #ifdef PROFILING kernel_timer.stop(); kernel_time = kernel_timer.getTimeInSeconds(); #endif //--3 transfer data from device to host _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); //--statistics #ifdef PROFILING std::cout<<"kernel time(s):"<<kernel_time<<std::endl; #endif //--4 release cl resources. _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); } catch(std::string msg){ _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); std::string e_str = "in run_transpose_gpu -> "; e_str += msg; throw(e_str); } return ; }
int main(int argc, char * argv[]) { float *h_imatrix_a, *h_imatrix_b, *h_omatrix_c, *h_omatrix_ref, * h_imatrix_b_trans; int a_size, b_size, c_size; int size = 2048; _clParseCommandLine(argc, argv); _clInit(platform_id, device_type, device_id); bool verify = false; a_size = size; b_size = size; c_size = size; int number_elements_a = a_size * a_size; int number_elements_b = b_size * b_size; int number_elements_c = c_size * c_size; try{ h_imatrix_a = (datatype *)malloc(number_elements_a*sizeof(datatype)); h_imatrix_b = (datatype *)malloc(number_elements_b*sizeof(datatype)); h_omatrix_c = (datatype *)malloc(number_elements_c*sizeof(datatype)); if(verify){ h_omatrix_ref = (datatype *)malloc(number_elements_c*sizeof(datatype)); } fill<datatype>(h_imatrix_a, number_elements_a, 10); fill<datatype>(h_imatrix_b, number_elements_b, 10); if(verify){ run_cpu<datatype>(h_imatrix_a, a_size, a_size, h_imatrix_b, b_size, b_size, h_omatrix_ref, c_size, c_size); } std::cout<<"--------------------testing..."<<std::endl; int number_trials = 20; double *trials = (double*)malloc(sizeof(double)*number_trials); double avg_t = 0.0; double std_t = 0.0; printf("total number of kernels: %d\n", total_kernels); for(int k=0; k<total_kernels; k++){ //std::cout<<"---kernel::"<<kernel_names[k]<<std::endl; FILE *fp = fopen("results.dat", "a"); avg_t = 0.0; std_t = 0.0; for(int i=0; i<number_trials; i++){ trials[i] = run_gpu<datatype>(h_imatrix_a, h_imatrix_b, h_omatrix_c, h_omatrix_ref, size, k, verify); avg_t += trials[i]; } avg_t = avg_t/(double)number_trials; for(int i=0; i<number_trials; i++){ std_t += (avg_t-trials[i])*(avg_t-trials[i]); } std_t /= (double)number_trials; std_t = sqrt(std_t); fprintf(fp, "%d\t%lf\t%lf\t", k, avg_t, std_t); for(int i=0; i<number_trials; i++){ fprintf(fp, "%lf\t", trials[i]); } fprintf(fp, "\n"); fclose(fp); } std::cout<<"--------------------done..."<<std::endl; _clRelease(); free(h_imatrix_a); free(h_imatrix_b); free(h_omatrix_c); if(verify){ free(h_omatrix_ref); 
} free(trials); } catch(std::string msg){ std::cout<<"--cambine: exception in main ->"<<msg<<std::endl; _clRelease(); } return 0; }
int main(int argc, char ** argv) { cl_mem out = NULL; try{ if(argc!=3){ printf("need 2 parameter here!!!\n"); exit(-1); } #if defined TIME double start_time = 0; double end_time = 0; string dat_name="data.dat"; FILE * fp = fopen(dat_name.c_str(), "a+"); if(fp==NULL) { printf("failed to open file!!!\n"); exit(-1); } #endif uint bins = atoi(argv[1]); uint size = atoi(argv[2]); uint iter = 100; printf("bins=%d, size=%d\n", bins, size); _clInit(1, "gpu", 0); out = _clMalloc((size/BS)*bins); layout_cyclic(out, bins, size); /**************************1****************************/ #ifdef TIME start_time = gettime(); #endif for(int i=1; i<iter; i++) { layout_blocked(out, bins, size); } #ifdef TIME end_time = gettime(); fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter); #endif /**************************2****************************/ #ifdef TIME start_time = gettime(); #endif for(int i=1; i<iter; i++) { layout_cyclic(out, bins, size); } #ifdef TIME end_time = gettime(); fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter); #endif /**************************3****************************/ #ifdef TIME start_time = gettime(); #endif for(int i=1; i<iter; i++) { layout_cyclic_2(out, bins, size); } #ifdef TIME end_time = gettime(); fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter); #endif #ifdef TIME fprintf(fp, "\n"); fclose(fp); #endif } catch(string msg){ printf("ERR:%s\n", msg.c_str()); printf("Error catched\n"); } _clFree(out); _clRelease(); return 1; }
//---------------------------------------------------------- //--breadth first search on GPUs //---------------------------------------------------------- void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ int *h_graph_edges, int *h_graph_mask, int *h_updating_graph_mask, \ int *h_graph_visited, int *h_cost) throw(std::string){ //int number_elements = height*width; int h_over; cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ d_graph_visited, d_cost, d_over; try{ //--1 transfer data from host to device _clInit(); d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); d_graph_mask = _clMallocRW(no_of_nodes*sizeof(int), h_graph_mask); d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(int), h_updating_graph_mask); d_graph_visited = _clMallocRW(no_of_nodes*sizeof(int), h_graph_visited); d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); d_over = _clMallocRW(sizeof(int), &h_over); _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(int), h_graph_mask); _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(int), h_updating_graph_mask); _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(int), h_graph_visited); _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); //--2 invoke kernel #ifdef PROFILING timer kernel_timer; double kernel_time = 0.0; kernel_timer.reset(); kernel_timer.start(); #endif struct timespec startT, endT; clock_gettime(CLOCK_MONOTONIC, &startT); do{ h_over = false; _clMemcpyH2D(d_over, sizeof(int), &h_over); //--kernel 0 int kernel_id = 0; int kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); 
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_cost); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //int work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); //--kernel 1 kernel_id = 1; kernel_idx = 0; _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); _clSetArgs(kernel_id, kernel_idx++, d_over); _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); //work_items = no_of_nodes; _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); _clMemcpyD2H(d_over,sizeof(int), &h_over); }while(h_over); _clFinish(); clock_gettime(CLOCK_MONOTONIC, &endT); uint64_t diff = 1000000000 * (endT.tv_sec - startT.tv_sec); uint64_t nanodiff = endT.tv_nsec - startT.tv_nsec; //printf("elapsed accelerator time = %llu nanoseconds\n", (long long unsigned int) diff); //printf("start time seconds%u \n", startT.tv_sec); //printf("end time seconds %u \n", endT.tv_sec); //printf("difference %u \n", diff); //printf("start time nanoseconds %u \n", startT.tv_nsec); //printf("end time nanoseconds %u \n", endT.tv_nsec); printf(" accelerator time %u \n", nanodiff + diff); #ifdef PROFILING kernel_timer.stop(); kernel_time = kernel_timer.getTimeInSeconds(); #endif //--3 transfer data from device to host _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); //--statistics #ifdef PROFILING std::cout<<"kernel time(s):"<<kernel_time<<std::endl; #endif //--4 release cl resources. 
_clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); } catch(std::string msg){ _clFree(d_graph_nodes); _clFree(d_graph_edges); _clFree(d_graph_mask); _clFree(d_updating_graph_mask); _clFree(d_graph_visited); _clFree(d_cost); _clFree(d_over); _clRelease(); std::string e_str = "in run_transpose_gpu -> "; e_str += msg; throw(e_str); } return ; }
int main(int argc, char ** argv) { float * h_raw, * h_out, * outCPU; cl_mem d_raw, d_out; try { if(argc!=2) { printf("need one parameter here!!!"); exit(-1); } _clInit(1, "gpu", 0); #if defined TIME double start_time = 0; double end_time = 0; string dat_name="data.dat"; FILE * fp = fopen(dat_name.c_str(), "a+"); if(fp==NULL) { printf("failed to open file!!!\n"); exit(-1); } #endif int cdim = atoi(argv[1]); //{384}; int rdim = atoi(argv[1]); //{288}; printf("cdim=%d, rdim=%d\n", cdim, rdim); h_raw = (float *)malloc(cdim * rdim * sizeof(float)); h_out = (float *)malloc(cdim * rdim * sizeof(float)); outCPU = (float *)malloc(cdim * rdim * sizeof(float)); fill<float>(h_raw, cdim * rdim, 5); d_raw = _clMalloc(cdim * rdim * sizeof(float)); d_out = _clMalloc(cdim * rdim * sizeof(float)); _clMemcpyH2D(d_raw, h_raw, cdim * rdim * sizeof(float)); printf("-0\n"); #ifdef VARIFY CPURun(h_raw, outCPU, cdim, rdim); #endif //VARIFY /**************************1****************************/ #ifdef TIME start_time = gettime(); #endif printf("-1.1\n"); broadcast(d_raw, d_out, cdim, rdim); printf("-1.2\n"); #ifdef TIME end_time = gettime(); fprintf(fp, "%lf\t", (end_time - start_time)); #endif #ifdef VARIFY _clMemcpyD2H(h_out, d_out, cdim * rdim * sizeof(float)); verify_array<float>(outCPU, h_out, cdim * rdim); #endif //VARIFY /**************************2****************************/ #ifdef TIME start_time = gettime(); #endif broadcast_lm(d_raw, d_out, cdim, rdim); printf("-2\n"); #ifdef TIME end_time = gettime(); fprintf(fp, "%lf\t", (end_time - start_time)); #endif #ifdef VARIFY _clMemcpyD2H(h_out, d_out, cdim * rdim * sizeof(float)); verify_array<float>(outCPU, h_out, cdim * rdim); #endif //VARIFY #ifdef TIME fprintf(fp, "\n"); fclose(fp); #endif } catch(string msg) { printf("ERR:%s\n", msg.c_str()); printf("Error catched\n"); } _clFree(d_raw); _clFree(d_out); _clRelease(); if(h_raw!=NULL) free(h_raw); if(h_out!=NULL) free(h_out); if(outCPU!=NULL) free(outCPU); return 1; }
int main(int argc, char * argv[]){ _clParseCommandLine(argc, argv); int w = 2048; int h = 2048; bool verify = false; try{ _clInit(platform_id, device_type, device_id); int number_trials = 20; double *trials = (double*)malloc(sizeof(double)*number_trials); double avg_time = 0.0; double std_time = 0.0; FILE *fp = fopen("results.dat", "a"); if(fp==NULL){ throw(string("failed to open the output file!!!")); } datatype *h_i_vector; datatype *h_o_vector; datatype *h_o_vector_ref; int number_elements_out = w * h; int number_elements_in = w * h; h_i_vector = (datatype *)malloc(number_elements_in * sizeof(datatype)); h_o_vector = (datatype *)malloc(number_elements_out * sizeof(datatype)); fill<datatype>(h_i_vector, number_elements_in, 10); if(verify){ h_o_vector_ref = (datatype *)malloc(number_elements_out * sizeof(datatype)); run_cpu<datatype>(h_i_vector, h_o_vector_ref, w, h); } for(int k=0; k<total_kernels; k++){ avg_time = 0.0; std_time = 0.0; for(int r=0; r<number_trials; r++){ trials[r] = run_gpu<datatype>(h_i_vector, h_o_vector, h_o_vector_ref, w, h, k, verify); avg_time += trials[r]; } avg_time = avg_time/(double)number_trials; for(int i=0; i<number_trials; i++){ std_time += (avg_time-trials[i])*(avg_time-trials[i]); } std_time /= (double)number_trials; std_time = sqrt(std_time); fprintf(fp, "%d\t%lf\t%lf\t", k, avg_time, std_time); for(int i=0; i<number_trials; i++){ fprintf(fp, "%lf\t", trials[i]); } fprintf(fp, "\n"); } _clRelease(); free(h_i_vector); free(h_o_vector); if(verify){ free(h_o_vector_ref); } fclose(fp); } catch(std::string msg){ std::cout<<"Exception in main :: "<<msg<<std::endl; } return 0; }
// Template for a benchmark main(). The tokens that start with '@' (@hIn,
// @dIn, @cdimIn, @rdimIn, @elems, @hAlc, @hFill, @dAlc, @h2dTrans, @oclArgs,
// @ompArgs, @clFree, @hFree) are not C++; presumably a generator replaces
// them before this text is compiled -- TODO confirm against the generator.
//
// Command line as read by the code:
//   argv[1]  parsed with atoi and used for both cdim and rdim
//   argv[2]  suffix of the data file name ("data.<suffix>.dat", TIME builds)
//   argv[3]  parsed with atoi into r, which is printed as "radius"
// NOTE(review): argc is never checked before argv[2] and argv[3] are read.
//
// Flow: allocate hOut/rOut and dOut, call OCLRun once as warm-up, then call it
// inside a loop of at most iIter (10) iterations. With TIME defined,
// dataAmount * cnt / delta_time is appended to the data file; with VARIFY
// defined the device output is compared against the OMPRun result.
// NOTE(review): the TIME-only test `fabs(delta_time-600000.0)>0.1` is true
// unless delta_time is within 0.1 of 600000, so the loop normally stops after
// its first iteration; the original author marked that line with "????".
//
// A caught string exception prints the message and exits with -1; otherwise
// the function returns 1.
int main(int argc, char ** argv) { //float *hIn1, *hIn2; //cl_mem dIn1, dIn2; @hIn; @dIn; float *hOut, *rOut; cl_mem dOut; try{ _clParseCommandLine(argc, argv); string strSubfix = string(argv[2]); _clInit(platform_id, device_type, device_id); int cdim = atoi(argv[1]); int rdim = atoi(argv[1]); int r = atoi(argv[3]); @cdimIn @rdimIn // different between iMAP1 and iMAP2 printf("cdim=%d, rdim=%d, radius=%d\n", cdim, rdim, r); int iIter = 10; int elems = @elems; double dataAmount = (double)cdim * (double)rdim * (double)(elems) * (double)sizeof(float) * 1e-6; #if defined TIME double start_time = 0; double end_time = 0; double delta_time = 0; int cnt = 0; string dat_name= string("data.") + strSubfix + string(".dat"); FILE * fp = fopen(dat_name.c_str(), "a+"); if(fp==NULL) { printf("failed to open file!!!\n"); exit(-1); } #endif //hIn1 = (float *)malloc(cdim * rdim * sizeof(float)); //hIn2 = (float *)malloc(cdim * rdim * sizeof(float)); @hAlc hOut = (float *)malloc(cdim * rdim * sizeof(float)); rOut = (float *)malloc(cdim * rdim * sizeof(float)); //fill<float>(hIn1, cdim * rdim, 5); //fill<float>(hIn2, cdim * rdim, 5); @hFill //dIn1 = _clMalloc(cdim * rdim * sizeof(float)); //dIn2 = _clMalloc(cdim * rdim * sizeof(float)); @dAlc dOut = _clMalloc(cdim * rdim * sizeof(float)); //_clMemcpyH2D(dIn1, hIn1, cdim * rdim * sizeof(float)); //_clMemcpyH2D(dIn2, hIn2, cdim * rdim * sizeof(float)); @h2dTrans _clFinish(); // warmup //OCLRun(dIn1, dIn2, dOut, cdim, rdim); OCLRun(@oclArgs, dOut, cdim, rdim, cdimIn, rdimIn); #ifdef VARIFY //OMPRun(hIn1, hIn2, rOut, cdim, rdim); OMPRun(@ompArgs, rOut, cdim, rdim, cdimIn, rdimIn); #endif //VARIFY #ifdef TIME delta_time = 0; cnt = 0; #endif for(int i=0; i<iIter; i++) { #ifdef TIME cnt++; start_time = gettime(); #endif OCLRun(@oclArgs, dOut, cdim, rdim, cdimIn, rdimIn); #ifdef TIME end_time = gettime(); delta_time += end_time - start_time; if(fabs(delta_time-600000.0)>0.1) break; // ???? 
#endif } #ifdef TIME fprintf(fp, "%lf\t", dataAmount * (double)cnt/delta_time); #endif #ifdef VARIFY _clMemcpyD2H(hOut, dOut, cdim * rdim * sizeof(float)); verify_array<float>(rOut, hOut, cdim * rdim); #endif //VARIFY #ifdef TIME fprintf(fp, "\n"); fclose(fp); #endif } catch(string msg){ printf("ERR:%s\n", msg.c_str()); printf("Error catched\n"); exit(-1); } //_clFree(dIn1); //_clFree(dIn2); @clFree _clFree(dOut); _clRelease(); //if(hIn1!=NULL) free(hIn1); //if(hIn2!=NULL) free(hIn2); @hFree if(hOut!=NULL) free(hOut); if(rOut!=NULL) free(rOut); return 1; }
int main(int argc, char ** argv) { uint * h_in = NULL, * h_out_1 = NULL, * h_out_2 = NULL, * h_out = NULL; cl_mem d_in = NULL, d_out_1 = NULL, d_out_2 = NULL; try{ if(argc!=3){ printf("need 2 parameter here!!!\n"); exit(-1); } #if defined TIME double start_time = 0; double end_time = 0; string dat_name="data.dat"; FILE * fp = fopen(dat_name.c_str(), "a+"); if(fp==NULL) { printf("failed to open file!!!\n"); exit(-1); } #endif uint w, h; uint side = atoi(argv[1]); w = side, h = side; uint size = w * h; uint radius = atoi(argv[2]); uint iter = 100; printf("w=%d, h=%d, radius=%d\n", w, h, radius); _clInit(1, "gpu", 0); h_in = (uint *)malloc(size * sizeof(uint)); h_out_1 = (uint *)malloc(size * sizeof(uint)); h_out_2 = (uint *)malloc(size * sizeof(uint)); h_out = (uint *)malloc(size * sizeof(uint)); d_in = _clMalloc(size * sizeof(uint)); d_out_1 = _clMalloc(size * sizeof(uint)); d_out_2 = _clMalloc(size * sizeof(uint)); fill<uint>(h_in, size, 10); _clMemcpyH2D(d_in, h_in, size * sizeof(uint)); //g2l_CPU(h_in, h_out, w, h, radius); /**************************1****************************/ #ifdef TIME start_time = gettime(); #endif for(int i=1; i<iter; i++) { g2l_TBT(d_in, d_out_1, radius, w, h); } #ifdef TIME end_time = gettime(); fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter); #endif #ifdef VARI _clMemcpyD2H(h_out_1, d_out_1, size * sizeof(uint)); #endif /**************************2****************************/ #ifdef TIME start_time = gettime(); #endif for(int i=1; i<iter; i++) { g2l_FCTH(d_in, d_out_2, radius, w, h); } #ifdef TIME end_time = gettime(); fprintf(fp, "%lf\t", (end_time-start_time)/(double)iter); #endif #ifdef VARI _clMemcpyD2H(h_out_2, d_out_2, size * sizeof(uint)); verify_array_int<uint>(h_out_1, h_out_2, w, h); #endif #ifdef TIME fprintf(fp, "\n"); fclose(fp); #endif } catch(string msg){ printf("ERR:%s\n", msg.c_str()); printf("Error catched\n"); } _clFree(d_in); _clFree(d_out_1); _clFree(d_out_2); _clRelease(); if(h_in!=NULL) free(h_in); 
if(h_out_1!=NULL) free(h_out_1); if(h_out_2!=NULL) free(h_out_2); if(h_out!=NULL) free(h_out); return 1; }