int main(int argc, char *argv[]) { /* Retrieve problem size. */ int ni = NI; int nj = NJ; /* Variable declaration/allocation. */ DATA_TYPE alpha; DATA_TYPE beta; POLYBENCH_2D_ARRAY_DECL(A,DATA_TYPE,NI,NJ,ni,nj); POLYBENCH_2D_ARRAY_DECL(B,DATA_TYPE,NI,NJ,ni,nj); POLYBENCH_2D_ARRAY_DECL(C,DATA_TYPE,NI,NI,ni,ni); POLYBENCH_2D_ARRAY_DECL(C_outputFromGpu,DATA_TYPE,NI,NI,ni,ni); init_arrays(ni, nj, &alpha, &beta, POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(B), POLYBENCH_ARRAY(C)); read_cl_file(); cl_initialization(); cl_mem_init(POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(B), POLYBENCH_ARRAY(C)); cl_load_prog(); cl_launch_kernel(ni, nj, alpha, beta); errcode = clEnqueueReadBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0, NI*NJ*sizeof(DATA_TYPE), POLYBENCH_ARRAY(C_outputFromGpu), 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); #ifdef RUN_ON_CPU /* Start timer. */ polybench_start_instruments; syr2kCpu(ni, nj, alpha, beta, POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(B), POLYBENCH_ARRAY(C)); /* Stop and print timer. */ printf("CPU Time in seconds:\n"); polybench_stop_instruments; polybench_print_instruments; compareResults(ni, POLYBENCH_ARRAY(C), POLYBENCH_ARRAY(C_outputFromGpu)); #else //prevent dead code elimination polybench_prevent_dce(print_array(ni, POLYBENCH_ARRAY(C_outputFromGpu))); #endif //RUN_ON_CPU cl_clean_up(); POLYBENCH_FREE_ARRAY(A); POLYBENCH_FREE_ARRAY(B); POLYBENCH_FREE_ARRAY(C); POLYBENCH_FREE_ARRAY(C_outputFromGpu); return 0; }
int main(int argc, char *argv[]) { int tmax = TMAX; int nx = NX; int ny = NY; POLYBENCH_1D_ARRAY_DECL(_fict_,DATA_TYPE,TMAX,TMAX); POLYBENCH_2D_ARRAY_DECL(ex,DATA_TYPE,NX,NY,nx,ny); POLYBENCH_2D_ARRAY_DECL(ey,DATA_TYPE,NX,NY,nx,ny); POLYBENCH_2D_ARRAY_DECL(hz,DATA_TYPE,NX,NY,nx,ny); POLYBENCH_2D_ARRAY_DECL(hz_outputFromGpu,DATA_TYPE,NX,NY,nx,ny); init_arrays(tmax, nx, ny, POLYBENCH_ARRAY(_fict_), POLYBENCH_ARRAY(ex), POLYBENCH_ARRAY(ey), POLYBENCH_ARRAY(hz)); read_cl_file(); cl_initialization(); cl_mem_init(POLYBENCH_ARRAY(_fict_), POLYBENCH_ARRAY(ex), POLYBENCH_ARRAY(ey), POLYBENCH_ARRAY(hz)); cl_load_prog(); cl_launch_kernel(tmax, nx, ny); errcode = clEnqueueReadBuffer(clCommandQue, hz_mem_obj, CL_TRUE, 0, NX * NY * sizeof(DATA_TYPE), POLYBENCH_ARRAY(hz_outputFromGpu), 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); #ifdef RUN_ON_CPU /* Start timer. */ polybench_start_instruments; runFdtd(tmax, nx, ny, POLYBENCH_ARRAY(_fict_), POLYBENCH_ARRAY(ex), POLYBENCH_ARRAY(ey), POLYBENCH_ARRAY(hz)); /* Stop and print timer. */ printf("CPU Time in seconds:\n"); polybench_stop_instruments; polybench_print_instruments; compareResults(nx, ny, POLYBENCH_ARRAY(hz), POLYBENCH_ARRAY(hz_outputFromGpu)); #else //prevent dead code elimination polybench_prevent_dce(print_array(nx, ny, POLYBENCH_ARRAY(hz_outputFromGpu))); #endif //RUN_ON_CPU POLYBENCH_FREE_ARRAY(_fict_); POLYBENCH_FREE_ARRAY(ex); POLYBENCH_FREE_ARRAY(ey); POLYBENCH_FREE_ARRAY(hz); POLYBENCH_FREE_ARRAY(hz_outputFromGpu); cl_clean_up(); return 0; }
int main(void) { int nx = NX; int ny = NY; POLYBENCH_2D_ARRAY_DECL(A,DATA_TYPE,NX,NY,nx,ny); POLYBENCH_1D_ARRAY_DECL(x,DATA_TYPE,NY,ny); POLYBENCH_1D_ARRAY_DECL(y,DATA_TYPE,NY,ny); POLYBENCH_1D_ARRAY_DECL(y_outputFromGpu,DATA_TYPE,NY,ny); POLYBENCH_1D_ARRAY_DECL(tmp,DATA_TYPE,NX,nx); init_array(nx, ny, POLYBENCH_ARRAY(x), POLYBENCH_ARRAY(A)); read_cl_file(); cl_initialization(); cl_mem_init(POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(x), POLYBENCH_ARRAY(y), POLYBENCH_ARRAY(tmp)); cl_load_prog(); cl_launch_kernel(nx, ny); errcode = clEnqueueReadBuffer(clCommandQue, y_mem_obj, CL_TRUE, 0, NY*sizeof(DATA_TYPE), POLYBENCH_ARRAY(y_outputFromGpu), 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); #ifdef RUN_ON_CPU /* Start timer. */ polybench_start_instruments; atax_cpu(nx, ny, POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(x), POLYBENCH_ARRAY(y), POLYBENCH_ARRAY(tmp)); /* Stop and print timer. */ printf("CPU Time in seconds:\n"); polybench_stop_instruments; polybench_print_instruments; compareResults(ny, POLYBENCH_ARRAY(y), POLYBENCH_ARRAY(y_outputFromGpu)); #else print_array(ny, POLYBENCH_ARRAY(y_outputFromGpu)); #endif //RUN_ON_CPU cl_clean_up(); POLYBENCH_FREE_ARRAY(A); POLYBENCH_FREE_ARRAY(x); POLYBENCH_FREE_ARRAY(y); POLYBENCH_FREE_ARRAY(y_outputFromGpu); POLYBENCH_FREE_ARRAY(tmp); return 0; }
int main(void) { DATA_TYPE* A; DATA_TYPE* A_outputFromGpu; DATA_TYPE* R; DATA_TYPE* Q; ///////////////////////// // Kernel 1. size_t oldSizes[2] = { M, N }; size_t newSizes[2]; getNewSizes(oldSizes, NULL, newSizes, NULL, "gramschmidt_kernel1", 2); M = newSizes[0]; N = newSizes[1]; // Kernel 2. getNewSizes(newSizes, NULL, newSizes, NULL, "gramschmidt_kernel2", 2); M = newSizes[0]; N = newSizes[1]; // Kernel 3. getNewSizes(newSizes, NULL, newSizes, NULL, "gramschmidt_kernel3", 2); M = newSizes[0]; N = newSizes[1]; ///////////////////////// A = (DATA_TYPE*)malloc(M*N*sizeof(DATA_TYPE)); A_outputFromGpu = (DATA_TYPE*)malloc(M*N*sizeof(DATA_TYPE)); R = (DATA_TYPE*)malloc(M*N*sizeof(DATA_TYPE)); Q = (DATA_TYPE*)malloc(M*N*sizeof(DATA_TYPE)); init_array(A); read_cl_file(); cl_initialization(device_id, clGPUContext, clCommandQue); cl_mem_init(A); cl_load_prog(); cl_launch_kernel(); errcode = clEnqueueReadBuffer(clCommandQue, a_mem_obj, CL_TRUE, 0, M*N*sizeof(DATA_TYPE), A_outputFromGpu, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); // gramschmidt(A, R, Q); // compareResults(A, A_outputFromGpu); cl_clean_up(); free(A); free(A_outputFromGpu); free(R); free(Q); return 0; }
int main(int argc, char *argv[]) { int ni = NI; int nj = NJ; int nk = NK; POLYBENCH_3D_ARRAY_DECL(A,DATA_TYPE,NI,NJ,NK,ni,nj,nk); POLYBENCH_3D_ARRAY_DECL(B,DATA_TYPE,NI,NJ,NK,ni,nj,nk); POLYBENCH_3D_ARRAY_DECL(B_outputFromGpu,DATA_TYPE,NI,NJ,NK,ni,nj,nk); init(ni, nj, nk, POLYBENCH_ARRAY(A)); read_cl_file(); cl_initialization(); cl_mem_init(POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(B)); cl_load_prog(); cl_launch_kernel(ni, nj, nk); errcode = clEnqueueReadBuffer(clCommandQue, b_mem_obj, CL_TRUE, 0, NI * NJ * NK * sizeof(DATA_TYPE), POLYBENCH_ARRAY(B_outputFromGpu), 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); #ifdef RUN_ON_CPU /* Start timer. */ polybench_start_instruments; conv3D(ni, nj, nk, POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(B)); /* Stop and print timer. */ printf("CPU Time in seconds:\n"); polybench_stop_instruments; polybench_print_instruments; compareResults(ni, nj, nk, POLYBENCH_ARRAY(B), POLYBENCH_ARRAY(B_outputFromGpu)); #else //prevent dead code elimination polybench_prevent_dce(print_array(ni, nj, nk, POLYBENCH_ARRAY(B_outputFromGpu))); #endif //RUN_ON_CPU cl_clean_up(); POLYBENCH_FREE_ARRAY(A); POLYBENCH_FREE_ARRAY(B); POLYBENCH_FREE_ARRAY(B_outputFromGpu); return 0; }
int main(void) { double t_start, t_end; DATA_TYPE* A; DATA_TYPE* r; DATA_TYPE* s; DATA_TYPE* p; DATA_TYPE* q; DATA_TYPE* s_outputFromGpu; DATA_TYPE* q_outputFromGpu; A = (DATA_TYPE*)malloc(NX*NY*sizeof(DATA_TYPE)); r = (DATA_TYPE*)malloc(NX*sizeof(DATA_TYPE)); s = (DATA_TYPE*)malloc(NY*sizeof(DATA_TYPE)); p = (DATA_TYPE*)malloc(NY*sizeof(DATA_TYPE)); q = (DATA_TYPE*)malloc(NX*sizeof(DATA_TYPE)); s_outputFromGpu = (DATA_TYPE*)malloc(NY*sizeof(DATA_TYPE)); q_outputFromGpu = (DATA_TYPE*)malloc(NX*sizeof(DATA_TYPE)); init_array(A, p, r); read_cl_file(); cl_initialization(); cl_mem_init(A, r, s, p, q); cl_load_prog(); cl_launch_kernel(); errcode = clEnqueueReadBuffer(clCommandQue, s_mem_obj, CL_TRUE, 0, NY*sizeof(DATA_TYPE), s_outputFromGpu, 0, NULL, NULL); errcode = clEnqueueReadBuffer(clCommandQue, q_mem_obj, CL_TRUE, 0, NX*sizeof(DATA_TYPE), q_outputFromGpu, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); t_start = rtclock(); bicg_cpu(A, r, s, p, q); t_end = rtclock(); fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start); compareResults(s, s_outputFromGpu, q, q_outputFromGpu); cl_clean_up(); free(A); free(r); free(s); free(p); free(q); free(s_outputFromGpu); free(q_outputFromGpu); return 0; }
int main(void) { double t_start, t_end; int i; DATA_TYPE* A, *A_2; DATA_TYPE* C4, *C4_2; DATA_TYPE* sum, *sum_2; A = (DATA_TYPE*)malloc(NR * NQ * NP * sizeof(DATA_TYPE)); C4 = (DATA_TYPE*)malloc(NP * NP * sizeof(DATA_TYPE)); sum = (DATA_TYPE*)malloc(NR * NQ * NP * sizeof(DATA_TYPE)); A_2 = (DATA_TYPE*)malloc(NR * NQ * NP * sizeof(DATA_TYPE)); C4_2 = (DATA_TYPE*)malloc(NP * NP * sizeof(DATA_TYPE)); sum_2 = (DATA_TYPE*)malloc(NR * NQ * NP * sizeof(DATA_TYPE)); init_array(A, C4); init_array(A_2, C4_2); read_cl_file(); cl_initialization(); cl_mem_init(A, C4, sum); cl_load_prog(); t_start = rtclock(); int r; for (r = 0; r < NR; r++) { cl_launch_kernel1(r); cl_launch_kernel2(r); } t_end = rtclock(); errcode = clEnqueueReadBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0, NR * NQ * NP * sizeof(DATA_TYPE), sum, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start); t_start = rtclock(); doitgen(sum_2, A_2, C4_2); t_end = rtclock(); fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start); compareResults(sum, sum_2); cl_clean_up(); return 0; }
int main(void) { double t_start, t_end; DATA_TYPE* A; DATA_TYPE* B; DATA_TYPE* x; DATA_TYPE* y; DATA_TYPE* y_outputFromGpu; DATA_TYPE* tmp; A = (DATA_TYPE*)malloc(N*N*sizeof(DATA_TYPE)); B = (DATA_TYPE*)malloc(N*N*sizeof(DATA_TYPE)); x = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); y = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); y_outputFromGpu = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); tmp = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); init(A, x); read_cl_file(); cl_initialization(); cl_mem_init(A, B, x, y, tmp); cl_load_prog(); cl_launch_kernel(); errcode = clEnqueueReadBuffer(clCommandQue, y_mem_obj, CL_TRUE, 0, N*sizeof(DATA_TYPE), y_outputFromGpu, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); t_start = rtclock(); gesummv(A, B, x, y, tmp); t_end = rtclock(); fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start); compareResults(y, y_outputFromGpu); cl_clean_up(); free(A); free(B); free(x); free(y); free(y_outputFromGpu); free(tmp); return 0; }
int main(void) { double t_start, t_end; DATA_TYPE* _fict_; DATA_TYPE* ex; DATA_TYPE* ey; DATA_TYPE* hz; DATA_TYPE* hz_outputFromGpu; _fict_ = (DATA_TYPE*)malloc(TMAX*sizeof(DATA_TYPE)); ex = (DATA_TYPE*)malloc(NX*(NY+1)*sizeof(DATA_TYPE)); ey = (DATA_TYPE*)malloc((NX+1)*NY*sizeof(DATA_TYPE)); hz = (DATA_TYPE*)malloc(NX*NY*sizeof(DATA_TYPE)); hz_outputFromGpu = (DATA_TYPE*)malloc(NX*NY*sizeof(DATA_TYPE)); int i; init_arrays(_fict_, ex, ey, hz); read_cl_file(); cl_initialization(); cl_mem_init(_fict_, ex, ey, hz); cl_load_prog(); cl_launch_kernel(); errcode = clEnqueueReadBuffer(clCommandQue, hz_mem_obj, CL_TRUE, 0, NX * NY * sizeof(DATA_TYPE), hz_outputFromGpu, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); t_start = rtclock(); runFdtd(_fict_, ex, ey, hz); t_end = rtclock(); fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start); compareResults(hz, hz_outputFromGpu); cl_clean_up(); free(_fict_); free(ex); free(ey); free(hz); free(hz_outputFromGpu); return 0; }
int main(void) { DATA_TYPE* A; DATA_TYPE* B; DATA_TYPE* C; DATA_TYPE* C_outputFromGpu; ///////////////////////// size_t oldSizes[2] = { NJ, NI }; size_t newSizes[2]; getNewSizes(oldSizes, NULL, newSizes, NULL, "gemm", 2); NJ = newSizes[0]; NI = newSizes[1]; NK = NJ; ///////////////////////// A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE)); B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE)); C = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE)); C_outputFromGpu = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE)); init(A, B, C); read_cl_file(); cl_initialization(device_id, clGPUContext, clCommandQue); cl_mem_init(A, B, C); cl_load_prog(); cl_launch_kernel(); errcode = clEnqueueReadBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0, NI*NJ*sizeof(DATA_TYPE), C_outputFromGpu, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); gemm(A, B, C, C_outputFromGpu); cl_clean_up(); free(A); free(B); free(C); free(C_outputFromGpu); return 0; }
int main(void) { double t_start, t_end; DATA_TYPE* A; DATA_TYPE* B; DATA_TYPE* C; DATA_TYPE* C_outputFromGpu; A = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE)); B = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE)); C = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE)); C_outputFromGpu = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE)); init_arrays(A, B, C); read_cl_file(); cl_initialization(); cl_mem_init(A, B, C); cl_load_prog(); cl_launch_kernel(); errcode = clEnqueueReadBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0, N*M*sizeof(DATA_TYPE), C_outputFromGpu, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); t_start = rtclock(); syr2k(A, B, C); t_end = rtclock(); fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start); compareResults(C, C_outputFromGpu); cl_clean_up(); free(A); free(B); free(C); free(C_outputFromGpu); return 0; }
int main(int argc, char** argv) { /* Retrieve problem size. */ int n = N; int tsteps = TSTEPS; POLYBENCH_1D_ARRAY_DECL(a,DATA_TYPE,N,n); POLYBENCH_1D_ARRAY_DECL(b,DATA_TYPE,N,n); POLYBENCH_1D_ARRAY_DECL(a_outputFromGpu,DATA_TYPE,N,n); POLYBENCH_1D_ARRAY_DECL(b_outputFromGpu,DATA_TYPE,N,n); init_array(n, POLYBENCH_ARRAY(a), POLYBENCH_ARRAY(b)); #if OPENCL_DEVICE_SELECTION!=CL_DEVICE_TYPE_ACCELERATOR read_cl_file(); #else if (argc != 2){ printf("%s <inputfile>\n", argv[0]); return EXIT_FAILURE; } read_cl_file(argv); #endif cl_initialization(); cl_mem_init(POLYBENCH_ARRAY(a), POLYBENCH_ARRAY(b)); cl_load_prog(); /* Start timer. */ polybench_start_instruments; int t; //for (t = 0; t < _PB_TSTEPS ; t++) for (t = 0; t < _PB_TSTEPS / TT; t++) { cl_launch_kernels(0); } /* Stop and print timer. */ #if OPENCL_DEVICE_SELECTION==CL_DEVICE_TYPE_CPU printf("OpenCL-CPU Time in seconds: "); #elif OPENCL_DEVICE_SELECTION==CL_DEVICE_TYPE_GPU printf("OpenCL-GPU Time in seconds: "); #else printf("OpenCL-FPGA Time in seconds: "); #endif polybench_stop_instruments; polybench_print_instruments; errcode = clEnqueueReadBuffer(clCommandQue, a_mem_obj_k1, CL_TRUE, 0, N/6 * sizeof(DATA_TYPE), POLYBENCH_ARRAY(a_outputFromGpu), 0, NULL, NULL); errcode = clEnqueueReadBuffer(clCommandQue, a_mem_obj_k2, CL_TRUE, 0, N/6 * sizeof(DATA_TYPE), POLYBENCH_ARRAY(a_outputFromGpu)+N*1/6, 0, NULL, NULL); errcode = clEnqueueReadBuffer(clCommandQue, a_mem_obj_k3, CL_TRUE, 0, N/6 * sizeof(DATA_TYPE), POLYBENCH_ARRAY(a_outputFromGpu)+N*2/6, 0, NULL, NULL); errcode = clEnqueueReadBuffer(clCommandQue, a_mem_obj_k4, CL_TRUE, 0, N/6 * sizeof(DATA_TYPE), POLYBENCH_ARRAY(a_outputFromGpu)+N*3/6, 0, NULL, NULL); errcode = clEnqueueReadBuffer(clCommandQue, a_mem_obj_k5, CL_TRUE, 0, N/6 * sizeof(DATA_TYPE), POLYBENCH_ARRAY(a_outputFromGpu)+N*4/6, 0, NULL, NULL); errcode = clEnqueueReadBuffer(clCommandQue, a_mem_obj_k6, CL_TRUE, 0, N/6 * sizeof(DATA_TYPE), POLYBENCH_ARRAY(a_outputFromGpu)+N*5/6, 0, NULL, NULL); errcode = clEnqueueReadBuffer(clCommandQue, b_mem_obj_k1, CL_TRUE, 0, N/6 * sizeof(DATA_TYPE), POLYBENCH_ARRAY(b_outputFromGpu), 0, NULL, NULL); errcode = clEnqueueReadBuffer(clCommandQue, b_mem_obj_k2, CL_TRUE, 0, N/6 * sizeof(DATA_TYPE), POLYBENCH_ARRAY(b_outputFromGpu)+N*1/6, 0, NULL, NULL); errcode = clEnqueueReadBuffer(clCommandQue, b_mem_obj_k3, CL_TRUE, 0, N/6 * sizeof(DATA_TYPE), POLYBENCH_ARRAY(b_outputFromGpu)+N*2/6, 0, NULL, NULL); errcode = clEnqueueReadBuffer(clCommandQue, b_mem_obj_k4, CL_TRUE, 0, N/6 * sizeof(DATA_TYPE), POLYBENCH_ARRAY(b_outputFromGpu)+N*3/6, 0, NULL, NULL); errcode = clEnqueueReadBuffer(clCommandQue, b_mem_obj_k5, CL_TRUE, 0, N/6 * sizeof(DATA_TYPE), POLYBENCH_ARRAY(b_outputFromGpu)+N*4/6, 0, NULL, NULL); errcode = clEnqueueReadBuffer(clCommandQue, b_mem_obj_k6, CL_TRUE, 0, N/6 * sizeof(DATA_TYPE), POLYBENCH_ARRAY(b_outputFromGpu)+N*5/6, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); #ifdef RUN_ON_CPU /* Start timer. */ polybench_start_instruments; runJacobi1DCpu(tsteps, n, POLYBENCH_ARRAY(a), POLYBENCH_ARRAY(b)); /* Stop and print timer. */ printf("CPU Time in seconds: "); polybench_stop_instruments; polybench_print_instruments; compareResults(n, POLYBENCH_ARRAY(a), POLYBENCH_ARRAY(a_outputFromGpu), POLYBENCH_ARRAY(b), POLYBENCH_ARRAY(b_outputFromGpu)); #else //prevent dead code elimination polybench_prevent_dce(print_array(n, POLYBENCH_ARRAY(a_outputFromGpu))); #endif //RUN_ON_CPU cl_clean_up(); POLYBENCH_FREE_ARRAY(a); POLYBENCH_FREE_ARRAY(b); POLYBENCH_FREE_ARRAY(a_outputFromGpu); POLYBENCH_FREE_ARRAY(b_outputFromGpu); return 0; }
int main(void) { #ifdef ALOCACAO_NORMAL printf(">>>>>>>>>Versao Offload<<<<<<<<<<<\n"); #else printf(">>>>>>>>>Versao Memoria Compartilhada<<<<<<<<<<<\n"); #endif double t_start, t_end; double t_start_init, t_end_init; double t_start_init_off, t_end_init_off; double t_offload_start, t_offload_end; double total_kernel; int i; #ifdef ALOCACAO_NORMAL a = (DATA_TYPE*)malloc(N*N*sizeof(DATA_TYPE)); x1_outputFromGpu = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); x2_outputFromGpu = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); y_1 = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); y_2 = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); #endif x1 = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); x2 = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); t_start_init = rtclock(); read_cl_file(); t_end_init = rtclock(); tmp_read_cl_file = t_end_init - t_start_init; total_kernel = t_end_init - t_start_init; t_start_init = rtclock(); #ifndef MALI cl_initialization(); #else cl_initialization_Mali(); #endif t_end_init = rtclock(); tmp_cl_initialization = t_end_init - t_start_init; total_kernel = t_end_init - t_start_init; t_start_init = rtclock(); cl_mem_init(); t_end_init = rtclock(); tmp_cl_mem_init= t_end_init - t_start_init; total_kernel += t_end_init - t_start_init; //Está dentro da função a contagem init(); //------------GPU--------------- //Inicia tempo GPU #ifdef ALOCACAO_NORMAL t_start_init_off = rtclock(); errcode = clEnqueueWriteBuffer(clCommandQue, a_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N * N, a, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue, x1_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N, x1, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue, x2_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N, x2, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue, y1_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N, y_1, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue, y2_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N, y_2, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in writing buffers\n"); t_end_init_off = rtclock(); tmp_clEnqueueWriteBuffer += t_end_init_off - t_start_init_off; total_kernel += t_end_init_off - t_start_init_off; #endif t_start_init = rtclock(); cl_load_prog(); t_end_init = rtclock(); tmp_cl_load_prog= t_end_init - t_start_init; total_kernel += t_end_init - t_start_init; t_start_init = rtclock(); cl_launch_kernel(); t_end_init = rtclock(); tmp_cl_launch_kernel += t_end_init - t_start_init; total_kernel += t_end_init - t_start_init; #ifdef ALOCACAO_NORMAL t_start_init_off = rtclock(); errcode = clEnqueueReadBuffer(clCommandQue, x1_mem_obj, CL_TRUE, 0, N*sizeof(DATA_TYPE), x1_outputFromGpu, 0, NULL, NULL); errcode = clEnqueueReadBuffer(clCommandQue, x2_mem_obj, CL_TRUE, 0, N*sizeof(DATA_TYPE), x2_outputFromGpu, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); t_end_init_off = rtclock(); tmp_clEnqueueReadBuffer += t_end_init_off - t_start_init_off; total_kernel += t_end_init_off - t_start_init_off; #endif //--------------CPU------------------ t_start = rtclock(); runMvt(); t_end = rtclock(); tmp_serial = t_end - t_start; compareResults(x1, x1_outputFromGpu, x2, x2_outputFromGpu); t_start_init_off = rtclock(); cl_clean_up(); t_end_init_off = rtclock(); tmp_cl_clean_up+=t_end_init_off - t_start_init_off; total_kernel += t_end_init_off - t_start_init_off; free(x1); free(x2); #ifdef ALOCACAO_NORMAL free(a); free(x1_outputFromGpu); free(x2_outputFromGpu); free(y_1); free(y_2); #endif printf("\n-------RESULTS-------\n"); printf("Sizes N=%d\n\n", N); printf("read_cl_file -------------> %lf\n", tmp_read_cl_file); printf("cl_initialization --------> %lf\n", tmp_cl_initialization); printf("cl_mem_init --------------> %lf\n", tmp_cl_mem_init); printf("init ---------------------> %lf\n", tmp_init); printf("cl_load_prog -------------> %lf\n", tmp_cl_load_prog); printf("cl_launch_kernel ---------> %lf\n", tmp_cl_launch_kernel); printf("serialExecution ----------> %lf\n", tmp_serial); printf("cl_clean_up --------------> %lf\n", tmp_cl_clean_up); printf("clEnqueueWriteBuffer -----> %lf\n", tmp_clEnqueueWriteBuffer); printf("clEnqueueReadBuffer-------> %lf\n", tmp_clEnqueueReadBuffer); printf("clEnqueueMapBuffer -------> %lf\n", tmp_clEnqueueMapBuffer); printf("clEnqueueUnmapMemObject --> %lf\n", tmp_clEnqueueUnmapMemObject); return 0; }
int main(void) { /* Prepare ctuning vars */ long ct_repeat=0; long ct_repeat_max=1; DATA_TYPE* data; DATA_TYPE* mean; DATA_TYPE* stddev; DATA_TYPE* symmat; DATA_TYPE* symmat_outputFromGpu; #ifdef OPENME openme_init(NULL,NULL,NULL,0); openme_callback("PROGRAM_START", NULL); #endif /* Run kernel. */ if (getenv("CT_REPEAT_MAIN")!=NULL) ct_repeat_max=atol(getenv("CT_REPEAT_MAIN")); data = (DATA_TYPE*)malloc((M + 1)*(N + 1)*sizeof(DATA_TYPE)); mean = (DATA_TYPE*)malloc((M + 1)*sizeof(DATA_TYPE)); stddev = (DATA_TYPE*)malloc((M + 1)*sizeof(DATA_TYPE)); symmat = (DATA_TYPE*)malloc((M + 1)*(N + 1)*sizeof(DATA_TYPE)); symmat_outputFromGpu = (DATA_TYPE*)malloc((M + 1)*(N + 1)*sizeof(DATA_TYPE)); srand(1); init_arrays(data); read_cl_file(); cl_initialization(); cl_mem_init(data, mean, stddev, symmat); cl_load_prog(); #ifdef OPENME openme_callback("ACC_KERNEL_START", NULL); #endif for (ct_repeat=0; ct_repeat<ct_repeat_max; ct_repeat++) { cl_launch_kernel(); err_code = clEnqueueReadBuffer(clCommandQue, symmat_mem_obj, CL_TRUE, 0, (M+1) * (N+1) * sizeof(DATA_TYPE), symmat_outputFromGpu, 0, NULL, NULL); if(err_code != CL_SUCCESS) { printf("Error in reading GPU mem\n"); exit(1); } } #ifdef OPENME openme_callback("ACC_KERNEL_END", NULL); #endif srand(1); init_arrays(data); #ifdef OPENME openme_callback("KERNEL_START", NULL); #endif for (ct_repeat=0; ct_repeat<ct_repeat_max; ct_repeat++) { correlation(data, mean, stddev, symmat); } #ifdef OPENME openme_callback("KERNEL_END", NULL); #endif compareResults(symmat, symmat_outputFromGpu); cl_clean_up(); free(data); free(mean); free(stddev); free(symmat); free(symmat_outputFromGpu); #ifdef OPENME openme_callback("PROGRAM_END", NULL); #endif return 0; }
int main(void) { /* Prepare ctuning vars */ long ct_repeat=0; long ct_repeat_max=1; DATA_TYPE* A; DATA_TYPE* B; DATA_TYPE* C; DATA_TYPE* C_outputFromGpu; #ifdef OPENME openme_init(NULL,NULL,NULL,0); openme_callback("PROGRAM_START", NULL); #endif /* Run kernel. */ if (getenv("CT_REPEAT_MAIN")!=NULL) ct_repeat_max=atol(getenv("CT_REPEAT_MAIN")); A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE)); B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE)); C = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE)); C_outputFromGpu = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE)); srand(1); init(A, B, C); read_cl_file(); cl_initialization(); cl_mem_init(A, B, C); cl_load_prog(); #ifdef OPENME openme_callback("ACC_KERNEL_START", NULL); #endif for (ct_repeat=0; ct_repeat<ct_repeat_max; ct_repeat++) { cl_launch_kernel(); err_code = clEnqueueReadBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0, NI*NJ*sizeof(DATA_TYPE), C_outputFromGpu, 0, NULL, NULL); if(err_code != CL_SUCCESS) { printf("Error in reading GPU mem\n"); exit(1); } } #ifdef OPENME openme_callback("ACC_KERNEL_END", NULL); #endif srand(1); init(A, B, C); #ifdef OPENME openme_callback("KERNEL_START", NULL); #endif for (ct_repeat=0; ct_repeat<ct_repeat_max; ct_repeat++) { gemm(A, B, C); } #ifdef OPENME openme_callback("KERNEL_END", NULL); #endif compareResults(C, C_outputFromGpu); cl_clean_up(); free(A); free(B); free(C); free(C_outputFromGpu); #ifdef OPENME openme_callback("PROGRAM_END", NULL); #endif return 0; }
int main(void) { #ifdef ALOCACAO_NORMAL printf(">>>>>>>>>Versao Offload<<<<<<<<<<<\n"); #else printf(">>>>>>>>>Versao Memoria Compartilhada<<<<<<<<<<<\n"); #endif double t_start, t_end; double t_start_init, t_end_init; double t_offload_start, t_offload_end, t_start_init_off, t_end_init_off; double total_kernel; #ifdef ALOCACAO_NORMAL A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE)); B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE)); D = (DATA_TYPE*)malloc(NJ*NL*sizeof(DATA_TYPE)); E_outputFromGpu = (DATA_TYPE*)malloc(NI*NL*sizeof(DATA_TYPE)); #endif C = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE)); E = (DATA_TYPE*)malloc(NI*NL*sizeof(DATA_TYPE)); t_start_init = rtclock(); read_cl_file(); t_end_init = rtclock(); tmp_read_cl_file = t_end_init - t_start_init; total_kernel = t_end_init - t_start_init; t_start_init = rtclock(); #ifndef MALI cl_initialization(); #else cl_initialization_Mali(); #endif t_end_init = rtclock(); tmp_cl_initialization = t_end_init - t_start_init; total_kernel = t_end_init - t_start_init; t_start_init = rtclock(); cl_mem_init(); t_end_init = rtclock(); tmp_cl_mem_init= t_end_init - t_start_init; total_kernel += t_end_init - t_start_init; //Está dentro da função a contagem init(); //------------GPU--------------- //Inicia tempo GPU #ifdef ALOCACAO_NORMAL t_offload_start = rtclock(); errcode = clEnqueueWriteBuffer(clCommandQue, a_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NI * NK, A, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue, b_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NK * NJ, B, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NI * NJ, C, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue, d_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NJ * NL, D, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue, e_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NI * NL, E, 0, NULL, NULL); if(errcode != CL_SUCCESS)printf("Error in writing buffers\n"); t_offload_end = rtclock(); tmp_clEnqueueWriteBuffer += t_offload_end - t_offload_start; total_kernel+=t_offload_end - t_offload_start; #endif t_start_init = rtclock(); cl_load_prog(); t_end_init = rtclock(); tmp_cl_load_prog= t_end_init - t_start_init; total_kernel += t_end_init - t_start_init; t_start_init = rtclock(); cl_launch_kernel(); t_end_init = rtclock(); tmp_cl_launch_kernel += t_end_init - t_start_init; total_kernel += t_end_init - t_start_init; #ifdef ALOCACAO_NORMAL t_offload_start = rtclock(); errcode = clEnqueueReadBuffer(clCommandQue, e_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NI * NL, E_outputFromGpu, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); t_offload_end = rtclock(); tmp_clEnqueueReadBuffer += t_offload_end - t_offload_start; total_kernel+=t_offload_end - t_offload_start; #endif //-------------CPU--------------- t_start = rtclock(); mm2_cpu(A, B, C, D, E); t_end = rtclock(); tmp_serial = t_end - t_start; compareResults(); t_start_init_off = rtclock(); cl_clean_up(); t_end_init_off = rtclock(); tmp_cl_clean_up+=t_end_init_off - t_start_init_off; total_kernel += t_end_init_off - t_start_init_off; #ifdef ALOCACAO_NORMAL free(C); free(A); free(B); free(D); free(E_outputFromGpu); #endif free(E); printf("\n-------RESULTS-------\n"); printf("Sizes NI=%d, NJ=%d, NK=%d e NL=%d\n\n", NI, NJ, NK, NL); printf("read_cl_file -------------> %lf\n", tmp_read_cl_file); printf("cl_initialization --------> %lf\n", tmp_cl_initialization); printf("cl_mem_init --------------> %lf\n", tmp_cl_mem_init); printf("init ---------------------> %lf\n", tmp_init); printf("cl_load_prog -------------> %lf\n", tmp_cl_load_prog); printf("cl_launch_kernel ---------> %lf\n", tmp_cl_launch_kernel); printf("serialExecution ----------> %lf\n", tmp_serial); printf("cl_clean_up --------------> %lf\n", tmp_cl_clean_up); printf("clEnqueueWriteBuffer -----> %lf\n", tmp_clEnqueueWriteBuffer); printf("clEnqueueReadBuffer-------> %lf\n", tmp_clEnqueueReadBuffer); printf("clEnqueueMapBuffer -------> %lf\n", tmp_clEnqueueMapBuffer); printf("clEnqueueUnmapMemObject --> %lf\n", tmp_clEnqueueUnmapMemObject); return 0; }
int main(void) { #ifdef ALOCACAO_NORMAL printf(">>>>>>>>>Versao Offload<<<<<<<<<<<\n"); #else printf(">>>>>>>>>Versao Memoria Compartilhada<<<<<<<<<<<\n"); #endif double t_start, t_end; double t_start_init, t_end_init; double t_start_init_off, t_end_init_off; double t_offload_start, t_offload_end; double total_kernel; int i; #ifdef ALOCACAO_NORMAL mean = (DATA_TYPE*)malloc((M + 1)*sizeof(DATA_TYPE)); symmat_outputFromGpu = (DATA_TYPE*)malloc((M + 1)*(M + 1)*sizeof(DATA_TYPE)); #endif data = (DATA_TYPE*)malloc((M + 1)*(N + 1)*sizeof(DATA_TYPE)); symmat = (DATA_TYPE*)malloc((M + 1)*(M + 1)*sizeof(DATA_TYPE)); t_start_init = rtclock(); read_cl_file(); t_end_init = rtclock(); tmp_read_cl_file = t_end_init - t_start_init; total_kernel = t_end_init - t_start_init; t_start_init = rtclock(); #ifndef MALI cl_initialization(); #else cl_initialization_Mali(); #endif t_end_init = rtclock(); tmp_cl_initialization = t_end_init - t_start_init; total_kernel = t_end_init - t_start_init; t_start_init = rtclock(); cl_mem_init(); t_end_init = rtclock(); tmp_cl_mem_init= t_end_init - t_start_init; total_kernel += t_end_init - t_start_init; //Está dentro da função a contagem init(); //------------GPU--------------- //Inicia tempo GPU #ifdef ALOCACAO_NORMAL t_start_init_off = rtclock(); errcode = clEnqueueWriteBuffer(clCommandQue, data_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * (M+1) * (N+1), data, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue, symmat_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * (M+1) * (N+1), symmat, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue, mean_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * (M+1), mean, 0, NULL, NULL); if(errcode != CL_SUCCESS)printf("Error in writing buffers\n"); t_end_init_off = rtclock(); tmp_clEnqueueWriteBuffer += t_end_init_off - t_start_init_off; total_kernel += t_end_init_off - t_start_init_off; #endif t_start_init = rtclock(); cl_load_prog(); t_end_init = rtclock(); tmp_cl_load_prog= t_end_init - t_start_init; total_kernel += t_end_init - t_start_init; t_start_init = rtclock(); cl_launch_kernel(); t_end_init = rtclock(); tmp_cl_launch_kernel += t_end_init - t_start_init; total_kernel += t_end_init - t_start_init; #ifdef ALOCACAO_NORMAL t_start_init_off = rtclock(); errcode = clEnqueueReadBuffer(clCommandQue, symmat_mem_obj, CL_TRUE, 0, (M+1) * (N+1) * sizeof(DATA_TYPE), symmat_outputFromGpu, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem =%d\n", errcode); t_end_init_off = rtclock(); tmp_clEnqueueReadBuffer += t_end_init_off - t_start_init_off; total_kernel += t_end_init_off - t_start_init_off; #endif //--------------CPU------------------ t_start = rtclock(); covariance(); t_end = rtclock(); tmp_serial = t_end - t_start; compareResults(symmat, symmat_outputFromGpu); t_start_init_off = rtclock(); cl_clean_up(); t_end_init_off = rtclock(); tmp_cl_clean_up+=t_end_init_off - t_start_init_off; total_kernel += t_end_init_off - t_start_init_off; #ifdef ALOCACAO_NORMAL free(symmat_outputFromGpu); free(mean); #endif free(symmat); free(data); printf("\n-------RESULTS-------\n"); printf("Sizes N=%d e M=%d\n\n", N, M); printf("read_cl_file -------------> %lf\n", tmp_read_cl_file); printf("cl_initialization --------> %lf\n", tmp_cl_initialization); printf("cl_mem_init --------------> %lf\n", tmp_cl_mem_init); printf("init ---------------------> %lf\n", tmp_init); printf("cl_load_prog -------------> %lf\n", tmp_cl_load_prog); printf("cl_launch_kernel ---------> %lf\n", tmp_cl_launch_kernel); printf("serialExecution ----------> %lf\n", tmp_serial); printf("cl_clean_up --------------> %lf\n", tmp_cl_clean_up); printf("clEnqueueWriteBuffer -----> %lf\n", tmp_clEnqueueWriteBuffer); printf("clEnqueueReadBuffer-------> %lf\n", tmp_clEnqueueReadBuffer); printf("clEnqueueMapBuffer -------> %lf\n", tmp_clEnqueueMapBuffer); printf("clEnqueueUnmapMemObject --> %lf\n", tmp_clEnqueueUnmapMemObject); return 0; }