int main()
{
    int t, i, j, k, l;
    double t_start, t_end;

    init_array();

    IF_TIME(t_start = rtclock());

#pragma scop
    for (t = 1; t <= T; t++) {
        for (i = 1; i <= N - 1; i++)
            e[i] = e[i] - coeff1 * (h[i] - h[i - 1]);
        for (i = 0; i <= N - 1; i++)
            h[i] = h[i] - coeff2 * (e[i + 1] - e[i]);
    }
#pragma endscop

    IF_TIME(t_end = rtclock());
    IF_TIME(fprintf(stderr, "%0.6lfs\n", t_end - t_start));

    if (fopen(".test", "r")) {
        print_array();
    }

    return 0;
}
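/*
 * The timing helpers used throughout these drivers (rtclock, IF_TIME) are
 * defined in the benchmark harness, not in these files. A minimal sketch of
 * the usual definitions, assuming the common gettimeofday-based harness:
 */
#include <stdio.h>
#include <sys/time.h>

#ifdef TIME
#define IF_TIME(foo) foo;   /* timing statements run only when TIME is defined */
#else
#define IF_TIME(foo)
#endif

double rtclock()
{
    struct timeval Tp;
    int stat = gettimeofday(&Tp, NULL);   /* wall-clock time */
    if (stat != 0)
        printf("Error return from gettimeofday: %d\n", stat);
    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}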
int main()
{
    int i, j, k, t;
    double t_start, t_end;   /* not declared in the original; assumed global in the harness */

    init_array();

#ifdef PERFCTR
    PERF_INIT;
#endif

    IF_TIME(t_start = rtclock());

    /* pluto start (N) */
#pragma scop
    for (i = 1; i <= N - 2; i++) {
        for (j = 1; j <= N - 2; j++) {
            f[i][j] = f[j][i] + f[i][j - 1];
        }
    }
#pragma endscop
    /* pluto end */

    IF_TIME(t_end = rtclock());
    IF_TIME(fprintf(stderr, "%0.6lfs\n", t_end - t_start));

    if (fopen(".test", "r")) {
        print_array();
    }

    return 0;
}
int main()
{
    double t_start, t_end;
    DATA_TYPE* A;
    DATA_TYPE* C;
    DATA_TYPE* D;

    A = (DATA_TYPE*)malloc(N * M * sizeof(DATA_TYPE));
    C = (DATA_TYPE*)malloc(N * M * sizeof(DATA_TYPE));
    D = (DATA_TYPE*)malloc(N * M * sizeof(DATA_TYPE));

    fprintf(stdout, "<< Symmetric rank-k operations >>\n");

    init_arrays(A, C, D);

    syrkGPU(A, D);

    t_start = rtclock();
    syrk(A, C);
    t_end = rtclock();
    fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(C, D);

    free(A);
    free(C);
    free(D);

    return 0;
}
void syrkGPU(DATA_TYPE* A, DATA_TYPE* D)
{
    int i, j;
    double t_start, t_end;

    t_start = rtclock();

    /* One combined target construct; the original stacked two separate
       "omp target" pragmas, which would offload an empty region and then a
       second region without the device clause. */
#pragma omp target device(GPU_DEVICE) map(to: A[:N*M]) map(tofrom: D[:N*M])
    {
        /* Scale D by beta. */
#pragma omp parallel for
        for (i = 0; i < N; i++) {
            for (j = 0; j < M; j++) {
                D[i * M + j] *= beta;
            }
        }

        /* Rank-k update: D += alpha * A * A^T. */
#pragma omp parallel for collapse(2)
        for (i = 0; i < N; i++) {
            for (j = 0; j < M; j++) {
                int k;
                for (k = 0; k < M; k++) {
                    D[i * M + j] += alpha * A[i * M + k] * A[j * M + k];
                }
            }
        }
    }

    t_end = rtclock();
    fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);
}
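/*
 * A hedged alternative sketch: inside "target", a plain "parallel for" often
 * maps to a single team on GPU devices. The combined construct below is the
 * more idiomatic way to expose both levels of GPU parallelism; it fuses the
 * beta-scaling and the rank-k update into one loop nest with identical
 * results. All names (A, D, N, M, alpha, beta, DATA_TYPE) are taken from the
 * surrounding driver.
 */
#pragma omp target teams distribute parallel for collapse(2) \
        map(to: A[:N*M]) map(tofrom: D[:N*M])
for (int i = 0; i < N; i++) {
    for (int j = 0; j < M; j++) {
        DATA_TYPE acc = D[i * M + j] * beta;   /* scale existing entry once */
        for (int k = 0; k < M; k++)
            acc += alpha * A[i * M + k] * A[j * M + k];
        D[i * M + j] = acc;
    }
}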
void cl_launch_kernel()
{
    double t_start, t_end;
    int m = M;
    int n = N;

    size_t localWorkSize[2], globalWorkSize[2];
    localWorkSize[0] = DIM_LOCAL_WORK_GROUP_X;
    localWorkSize[1] = DIM_LOCAL_WORK_GROUP_Y;
    globalWorkSize[0] = (size_t)ceil(((float)N) / ((float)DIM_LOCAL_WORK_GROUP_X)) * DIM_LOCAL_WORK_GROUP_X;
    globalWorkSize[1] = (size_t)ceil(((float)M) / ((float)DIM_LOCAL_WORK_GROUP_Y)) * DIM_LOCAL_WORK_GROUP_Y;

    t_start = rtclock();

    // Set the arguments of the kernel
    errcode = clSetKernelArg(clKernel1, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    errcode |= clSetKernelArg(clKernel1, 1, sizeof(cl_mem), (void *)&c_mem_obj);
    errcode |= clSetKernelArg(clKernel1, 2, sizeof(DATA_TYPE), (void *)&alpha);
    errcode |= clSetKernelArg(clKernel1, 3, sizeof(DATA_TYPE), (void *)&beta);
    errcode |= clSetKernelArg(clKernel1, 4, sizeof(int), (void *)&m);
    errcode |= clSetKernelArg(clKernel1, 5, sizeof(int), (void *)&n);
    if (errcode != CL_SUCCESS) printf("Error in setting arguments1\n");

    // Execute the OpenCL kernel
    errcode = clEnqueueNDRangeKernel_fusion(clCommandQue, clKernel1, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
    //errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel1, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
    if (errcode != CL_SUCCESS) printf("Error in launching kernel1\n");
    // clFinish(clCommandQue);  // left disabled in the original: without it the timed
    //                          // interval covers only the enqueue, not kernel completion

    t_end = rtclock();
    fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);
    fprintf(stdout, "CAUTION: CPU offset %d %% GPU Runtime: %0.6lf s\n", cpu_offset, t_end - t_start);
}
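/*
 * Hedged sketch: if kernel execution time (rather than enqueue time) is
 * wanted without a blocking clFinish inside the timed region, OpenCL event
 * profiling can be used instead. Assumes the command queue was created with
 * CL_QUEUE_PROFILING_ENABLE; all other names come from the driver above.
 */
cl_event ev;
cl_ulong ns_start, ns_end;
errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel1, 2, NULL,
                                 globalWorkSize, localWorkSize, 0, NULL, &ev);
clWaitForEvents(1, &ev);    /* block until the kernel has finished */
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL);
fprintf(stdout, "Kernel time: %0.6lfs\n", (ns_end - ns_start) * 1.0e-9);
clReleaseEvent(ev);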
int main()
{
    int i, j, k, l, t;
    double t_start, t_end;

    init_array();

    IF_TIME(t_start = rtclock());

#pragma scop
    /* x1 = x1 + a * y_1 ; then x2 = x2 + a^T * y_2 */
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            x1[i] = x1[i] + a[i][j] * y_1[j];
        }
    }
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            x2[i] = x2[i] + a[j][i] * y_2[j];
        }
    }
#pragma endscop

    IF_TIME(t_end = rtclock());
    IF_TIME(printf("%0.6lfs\n", t_end - t_start));

#ifdef TEST
    print_array();
#endif

    return 0;
}
int main()
{
    init_arrays();

    double annot_t_start = 0, annot_t_end = 0, annot_t_total = 0;
    int annot_i;
    int v1, v2, o1, o2, ox;
    int tv1, tv2, to1, to2, tox;

    for (annot_i = 0; annot_i < REPS; annot_i++) {
        annot_t_start = rtclock();

        /* Tensor contraction over the shared index ox:
           R[v1][v2][o1][o2] += T[v1][ox][o1][o2] * A2[v2][ox] */
        for (v1 = 0; v1 <= V - 1; v1 = v1 + 1)
            for (v2 = 0; v2 <= V - 1; v2 = v2 + 1)
                for (o1 = 0; o1 <= O - 1; o1 = o1 + 1)
                    for (o2 = 0; o2 <= O - 1; o2 = o2 + 1)
                        for (ox = 0; ox <= O - 1; ox = ox + 1)
                            R[v1][v2][o1][o2] = R[v1][v2][o1][o2]
                                + T[v1][ox][o1][o2] * A2[v2][ox];

        annot_t_end = rtclock();
        annot_t_total += annot_t_end - annot_t_start;
    }

    annot_t_total = annot_t_total / REPS;
    printf("%f\n", annot_t_total);

    return 0;
}
int main(int argc, char** argv)
{
    double t_start, t_end;

    /* Array declaration */
    DATA_TYPE A[N][M];
    DATA_TYPE C[N][N];
    DATA_TYPE C_outputFromGpu[N][N];

    /* Initialize array. */
    init_arrays(A, C, C_outputFromGpu);

#pragma hmpp syrk allocate
#pragma hmpp syrk advancedload, args[a,c]

    t_start = rtclock();
#pragma hmpp syrk callsite, args[a,c].advancedload=true, asynchronous
    runSyrk(A, C_outputFromGpu);
#pragma hmpp syrk synchronize
    t_end = rtclock();
    fprintf(stderr, "GPU Runtime: %0.6lfs\n", t_end - t_start);

#pragma hmpp syrk delegatedstore, args[c]
#pragma hmpp syrk release

    t_start = rtclock();
    runSyrk(A, C);
    t_end = rtclock();
    fprintf(stderr, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(C, C_outputFromGpu);

    return 0;
}
void foo()
{
    int y, x, trial;

    IF_TIME(t_start = rtclock());
    for (trial = 0; trial < 10; ++trial) {
#pragma scop
        for (y = 0; y <= M - 1; ++y)
            for (x = 0; x <= N - 1; ++x) {
                blurx[y][x] = in[x][y] + in[x + 1][y] + in[x + 2][y];
                if (y >= 2)
                    out[x][y - 2] = blurx[y - 2][x] + blurx[y - 1][x] + blurx[y][x];
            }
#pragma endscop
    }
    IF_TIME(t_end = rtclock());
    IF_TIME(fprintf(stdout, "%s\t\t(M=%d,N=%d) \t %0.6lfs\n", __FILE__, M, N,
                    (t_end - t_start) / trial));   /* trial == 10 after the loop */

#ifdef VERIFY
    // Stage 1: horizontal blur
    for (x = 0; x <= N - 1; ++x)
        for (y = 0; y <= M - 1; ++y)
            A[x][y] = in[x][y] + in[x + 1][y] + in[x + 2][y];

    // Stage 2: vertical blur
    for (x = 0; x <= N - 1; ++x)
        for (y = 2; y <= M - 1; ++y) {
            if (out[x][y - 2] != A[x][y] + A[x][y - 1] + A[x][y - 2]) {
                printf("blur-smo.c: Difference at (%d, %d) : %f versus %f\n",
                       x, y, out[x][y - 2], A[x][y] + A[x][y - 1] + A[x][y - 2]);
            }
        }
#endif
}
//int main(void)
int main(int argc, char* argv[])
{
    double t_start, t_end;

    DATA_TYPE* A;
    DATA_TYPE* B;
    DATA_TYPE* C;
    DATA_TYPE* D;
    DATA_TYPE* E;
    DATA_TYPE* F;
    DATA_TYPE* G;
    DATA_TYPE* G_outputFromGpu;

    if (argc == 2) {
        printf("arg 1 = %s\narg 2 = %s\n", argv[0], argv[1]);
        cpu_offset = atoi(argv[1]);
    }

    A = (DATA_TYPE*)malloc(NI * NK * sizeof(DATA_TYPE));
    B = (DATA_TYPE*)malloc(NK * NJ * sizeof(DATA_TYPE));
    C = (DATA_TYPE*)malloc(NJ * NM * sizeof(DATA_TYPE));
    D = (DATA_TYPE*)malloc(NM * NL * sizeof(DATA_TYPE));
    E = (DATA_TYPE*)malloc(NI * NJ * sizeof(DATA_TYPE));
    F = (DATA_TYPE*)malloc(NJ * NL * sizeof(DATA_TYPE));
    G = (DATA_TYPE*)malloc(NI * NL * sizeof(DATA_TYPE));
    G_outputFromGpu = (DATA_TYPE*)malloc(NI * NL * sizeof(DATA_TYPE));

    int i;
    init_array(A, B, C, D);
    read_cl_file();
    cl_initialization_fusion();
    //cl_initialization();
    cl_mem_init(A, B, C, D, E, F, G);
    cl_load_prog();

    cl_launch_kernel();

    errcode = clEnqueueReadBuffer(clCommandQue[0], g_mem_obj, CL_TRUE, 0,
                                  sizeof(DATA_TYPE) * NI * NL, G_outputFromGpu, 0, NULL, NULL);
    if (errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

    t_start = rtclock();
    mm3_cpu(A, B, C, D, E, F, G);
    t_end = rtclock();
    fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(G, G_outputFromGpu);

    cl_clean_up();

    free(A); free(B); free(C); free(D);
    free(E); free(F); free(G); free(G_outputFromGpu);

    return 0;
}
int main(int argc, char** argv)
{
    double t_start, t_end;

    /* Array declaration */
    DATA_TYPE A[NI][NK];
    DATA_TYPE B[NK][NJ];
    DATA_TYPE C[NJ][NM];
    DATA_TYPE D[NM][NL];
    DATA_TYPE E[NI][NJ];
    DATA_TYPE E_gpu[NI][NJ];
    DATA_TYPE F[NJ][NL];
    DATA_TYPE F_gpu[NJ][NL];
    DATA_TYPE G[NI][NL];
    DATA_TYPE G_outputFromGpu[NI][NL];

    /* Initialize array. */
    init_array(A, B, C, D);

#pragma hmpp <group1> allocate
#pragma hmpp <group1> loopa advancedload, args[a;b;e]
#pragma hmpp <group1> loopb advancedload, args[f;c;d]
#pragma hmpp <group1> loopc advancedload, args[g]

    t_start = rtclock();

#pragma hmpp <group1> loopa callsite, args[a;b;e].advancedload=true, asynchronous
    threeMMloopa(A, B, E_gpu);
#pragma hmpp <group1> loopa synchronize
#pragma hmpp <group1> loopb callsite, args[f;c;d].advancedload=true, asynchronous
    threeMMloopb(C, D, F_gpu);
#pragma hmpp <group1> loopb synchronize
#pragma hmpp <group1> loopc callsite, args[g;e;f].advancedload=true, asynchronous
    threeMMloopc(E_gpu, F_gpu, G_outputFromGpu);
#pragma hmpp <group1> loopc synchronize

    t_end = rtclock();
    fprintf(stderr, "GPU Runtime: %0.6lfs\n", t_end - t_start);

#pragma hmpp <group1> loopa delegatedstore, args[a;b]
#pragma hmpp <group1> loopb delegatedstore, args[c;d]
#pragma hmpp <group1> loopc delegatedstore, args[g;e;f]
#pragma hmpp <group1> release

    t_start = rtclock();
    threeMMloopa(A, B, E);
    threeMMloopb(C, D, F);
    threeMMloopc(E, F, G);
    t_end = rtclock();
    fprintf(stderr, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(G, G_outputFromGpu);

    return 0;
}
int main(int argc, char** argv)
{
    int m = M;
    int n = N;
    double t_start, t_end;

    /* Array declaration */
    DATA_TYPE float_n = 321414134.01;
    DATA_TYPE data[M + 1][N + 1];
    DATA_TYPE data_Gpu[M + 1][N + 1];
    DATA_TYPE symmat[M + 1][M + 1];
    DATA_TYPE symmat_outputFromGpu[M + 1][M + 1];
    DATA_TYPE mean[M + 1];
    DATA_TYPE mean_Gpu[M + 1];

    /* Initialize array. */
    init_arrays(data, data_Gpu);

#pragma hmpp <group1> allocate
#pragma hmpp <group1> loopa advancedload, args[pmean;pdata;pfloat_n]
#pragma hmpp <group1> loopc advancedload, args[psymmat]

    t_start = rtclock();

#pragma hmpp <group1> loopa callsite, args[pmean;pdata;pfloat_n].advancedload=true, asynchronous
    covarLoopa(mean_Gpu, data_Gpu, float_n);
#pragma hmpp <group1> loopa synchronize
#pragma hmpp <group1> loopb callsite, args[pdata;pmean].advancedload=true, asynchronous
    covarLoopb(data_Gpu, mean_Gpu);
#pragma hmpp <group1> loopb synchronize
#pragma hmpp <group1> loopc callsite, args[psymmat;pdata].advancedload=true, asynchronous
    covarLoopc(symmat_outputFromGpu, data_Gpu);
#pragma hmpp <group1> loopc synchronize

    t_end = rtclock();
    fprintf(stderr, "GPU Runtime: %0.6lfs\n", t_end - t_start);

#pragma hmpp <group1> loopb delegatedstore, args[pmean]
#pragma hmpp <group1> loopc delegatedstore, args[psymmat;pdata]
#pragma hmpp <group1> release

    t_start = rtclock();
    covarLoopa(mean, data, float_n);
    covarLoopb(data, mean);
    covarLoopc(symmat, data);
    t_end = rtclock();
    fprintf(stderr, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(symmat, symmat_outputFromGpu);

    return 0;
}
//int main(void)
int main(int argc, char* argv[])
{
    double t_start, t_end;

    DATA_TYPE* data;
    DATA_TYPE* mean;
    DATA_TYPE* stddev;
    DATA_TYPE* symmat;
    DATA_TYPE* symmat_outputFromGpu;

    if (argc == 2) {
        printf("arg 1 = %s\narg 2 = %s\n", argv[0], argv[1]);
        cpu_offset = atoi(argv[1]);
    }

    data = (DATA_TYPE*)malloc((M + 1) * (N + 1) * sizeof(DATA_TYPE));
    mean = (DATA_TYPE*)malloc((M + 1) * sizeof(DATA_TYPE));
    stddev = (DATA_TYPE*)malloc((M + 1) * sizeof(DATA_TYPE));
    symmat = (DATA_TYPE*)malloc((M + 1) * (N + 1) * sizeof(DATA_TYPE));
    symmat_outputFromGpu = (DATA_TYPE*)malloc((M + 1) * (N + 1) * sizeof(DATA_TYPE));

    init_arrays(data);
    read_cl_file();
    cl_initialization_fusion();
    //cl_initialization();
    cl_mem_init(data, mean, stddev, symmat);
    cl_load_prog();

    double start = rtclock();
    cl_launch_kernel();
    double end = rtclock();
    fprintf(stdout, "CAUTION: CPU offset %d %% GPU Runtime: %0.6lf s\n", cpu_offset, (end - start));
    //fprintf(stdout, "CAUTION: CPU offset %d %% GPU Runtime: %0.6lf s\n", cpu_offset, 1000*(end - start));

    errcode = clEnqueueReadBuffer(clCommandQue[0], symmat_mem_obj, CL_TRUE, 0,
                                  (M + 1) * (N + 1) * sizeof(DATA_TYPE), symmat_outputFromGpu, 0, NULL, NULL);
    if (errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

    t_start = rtclock();
    correlation(data, mean, stddev, symmat);
    t_end = rtclock();
    fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(symmat, symmat_outputFromGpu);

    cl_clean_up();

    free(data);
    free(mean);
    free(stddev);
    free(symmat);
    free(symmat_outputFromGpu);

    return 0;
}
int main()
{
    double t_start, t_end;

    init_arrays();

    syrkGPU();

    t_start = rtclock();
    syrk();
    t_end = rtclock();
    fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults();

    return 0;
}
void SpMM(Csr<ValueType>* m1, Csr<ValueType>* m2, int num_buckets) {
    vector<FastHash<int, ValueType>*> result_map(m1->num_rows);
    for (auto& v : result_map) {
        v = new FastHash<int, ValueType>(num_buckets);
    }

    cout << "Starting SpMM..." << endl;
    float res = 0;
    double before = rtclock();

    for (int i = 0; i < m1->num_rows; i++) {
        for (int j = m1->rows[i]; j < m1->rows[i + 1]; j++) {
            int cola = m1->cols[j];
            // Broadcast the A entry into all 8 double lanes.
            __m512d a = _mm512_set1_pd(m1->vals[j]);
            // Walk row `cola` of B, 16 doubles (two vectors) at a time.
            for (int k = m2->rows[cola]; k < m2->rows[cola] + m2->row_lens[cola]; k += 16) {
                __m512d *pb1 = (__m512d *)(&(m2->vals[k]));
                __m512d *pb2 = (__m512d *)(&(m2->vals[k]) + 8);
                __m512i *pcols = (__m512i *)(&(m2->cols[k]));
                __m512d c1 = _mm512_mul_pd(a, *pb1);
                __m512d c2 = _mm512_mul_pd(a, *pb2);
                // Scatter the 16 products into the per-row hash accumulator;
                // a column id of -1 marks a padding lane.
                for (int x = 0; x < 8; x++) {
                    int col = ((int *)pcols)[x];
                    if (col == -1) { continue; }
                    ValueType val = ((ValueType *)(&c1))[x];
                    result_map[i]->Reduce(col, val);
                    res += val;
                }
                for (int x = 0; x < 8; ++x) {
                    int col = ((int *)pcols)[x + 8];
                    if (col == -1) { continue; }
                    ValueType val = ((ValueType *)(&c2))[x];
                    result_map[i]->Reduce(col, val);
                    res += val;
                }
            }
        }
    }

    double after = rtclock();
    cout << "res: " << res << endl;
    cout << RED << "[****Result****] ========> *SIMD Naive* time: "
         << after - before << " secs." << RESET << endl;

    for (auto& v : result_map) {
        delete v;
    }
}
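/*
 * Hedged note on the loads above: dereferencing raw __m512d/__m512i pointers
 * assumes m2->vals and m2->cols are 64-byte aligned and each row is padded to
 * a multiple of 16 entries (the -1 column sentinel suggests such padding). A
 * sketch of the same loads written with explicit unaligned intrinsics, which
 * drop the alignment requirement (requires <immintrin.h>; `a`, `k`, and `m2`
 * come from the loop above):
 */
__m512d b1 = _mm512_loadu_pd(&m2->vals[k]);        // lanes k .. k+7
__m512d b2 = _mm512_loadu_pd(&m2->vals[k + 8]);    // lanes k+8 .. k+15
__m512i cols = _mm512_loadu_si512(&m2->cols[k]);   // 16 int32 column ids
__m512d c1 = _mm512_mul_pd(a, b1);
__m512d c2 = _mm512_mul_pd(a, b2);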
int main(int argc, char** argv)
{
    double t_start, t_end;

    DATA_TYPE* A;
    DATA_TYPE* B;
    DATA_TYPE* C;
    DATA_TYPE* D;
    DATA_TYPE* E;
    DATA_TYPE* F;
    DATA_TYPE* G;
    DATA_TYPE* G_outputFromGpu;

    A = (DATA_TYPE*)malloc(NI * NK * sizeof(DATA_TYPE));
    B = (DATA_TYPE*)malloc(NK * NJ * sizeof(DATA_TYPE));
    C = (DATA_TYPE*)malloc(NJ * NM * sizeof(DATA_TYPE));
    D = (DATA_TYPE*)malloc(NM * NL * sizeof(DATA_TYPE));
    E = (DATA_TYPE*)malloc(NI * NJ * sizeof(DATA_TYPE));
    F = (DATA_TYPE*)malloc(NJ * NL * sizeof(DATA_TYPE));
    G = (DATA_TYPE*)malloc(NI * NL * sizeof(DATA_TYPE));
    G_outputFromGpu = (DATA_TYPE*)malloc(NI * NL * sizeof(DATA_TYPE));

    fprintf(stdout, "<< Linear Algebra: 3 Matrix Multiplications (E=A.B; F=C.D; G=E.F) >>\n");

    init_array(A, B, C, D);

    t_start = rtclock();
    mm3_OMP(A, B, C, D, E, F, G_outputFromGpu);
    t_end = rtclock();
    fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);

    t_start = rtclock();
    mm3_cpu(A, B, C, D, E, F, G);
    t_end = rtclock();
    fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(G, G_outputFromGpu);

    free(A); free(B); free(C); free(D);
    free(E); free(F); free(G); free(G_outputFromGpu);

    return 0;
}
int main()
{
    int i, j, k, x, y;
    unsigned int distanceYtoX, distanceYtoK, distanceKtoX;

    /*
     * pathDistanceMatrix is the adjacency matrix (square) with
     * dimension length equal to number of nodes in the graph.
     */
    unsigned int width = NUM_NODES;
    unsigned int yXwidth;

    init_array();

#ifdef PERFCTR
    PERF_INIT;
#endif

    IF_TIME(t_start = rtclock());

#pragma scop
    /* Floyd-Warshall: relax every (y,x) pair through intermediate node k */
    for (k = 0; k < NUM_NODES; k++) {
        for (y = 0; y < NUM_NODES; y++) {
            for (x = 0; x < NUM_NODES; x++) {
                pathDistanceMatrix[y][x] =
                    ((pathDistanceMatrix[y][k] + pathDistanceMatrix[k][x]) < pathDistanceMatrix[y][x])
                        ? (pathDistanceMatrix[y][k] + pathDistanceMatrix[k][x])
                        : pathDistanceMatrix[y][x];
            }
        }
    }
#pragma endscop

    IF_TIME(t_end = rtclock());
    IF_TIME(fprintf(stdout, "time = %0.6lfs\n", t_end - t_start));

#ifdef PERFCTR
    PERF_EXIT;
#endif

    if (fopen(".test", "r")) {
#ifdef MPI
        if (my_rank == 0) {
            print_array();
        }
#else
        print_array();
#endif
    }

    return 0;
}
int main(int argc, char** argv)
{
    double t_start, t_end;

    /* Array declaration */
    DATA_TYPE A[NI][NK];
    DATA_TYPE B[NK][NJ];
    DATA_TYPE C[NI][NJ];
    DATA_TYPE C_gpu[NI][NJ];
    DATA_TYPE D[NJ][NL];
    DATA_TYPE E[NI][NL];
    DATA_TYPE E_outputFromGpu[NI][NL];

    /* Initialize array. */
    init_array(A, B, C, C_gpu, D, E, E_outputFromGpu);

#pragma hmpp <group1> allocate
#pragma hmpp <group1> loopa advancedload, args[a;b;c]
#pragma hmpp <group1> loopb advancedload, args[d;e]

    t_start = rtclock();

#pragma hmpp <group1> loopa callsite, args[a;b;c].advancedload=true, asynchronous
    twoMMloopa(A, B, C_gpu);
#pragma hmpp <group1> loopa synchronize
#pragma hmpp <group1> loopb callsite, args[c;d;e].advancedload=true, asynchronous
    twoMMloopb(C_gpu, D, E_outputFromGpu);
#pragma hmpp <group1> loopb synchronize

    t_end = rtclock();
    fprintf(stderr, "GPU Runtime: %0.6lfs\n", t_end - t_start);

#pragma hmpp <group1> loopa delegatedstore, args[a;b]
#pragma hmpp <group1> loopb delegatedstore, args[c;d;e]
#pragma hmpp <group1> release

    t_start = rtclock();
    twoMMloopa(A, B, C);
    twoMMloopb(C, D, E);
    t_end = rtclock();
    fprintf(stderr, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(E, E_outputFromGpu);

    return 0;
}
int main(void)
{
    double t_start, t_end;

    DATA_TYPE* A;
    DATA_TYPE* r;
    DATA_TYPE* s;
    DATA_TYPE* p;
    DATA_TYPE* q;
    DATA_TYPE* s_outputFromGpu;
    DATA_TYPE* q_outputFromGpu;

    A = (DATA_TYPE*)malloc(NX * NY * sizeof(DATA_TYPE));
    r = (DATA_TYPE*)malloc(NX * sizeof(DATA_TYPE));
    s = (DATA_TYPE*)malloc(NY * sizeof(DATA_TYPE));
    p = (DATA_TYPE*)malloc(NY * sizeof(DATA_TYPE));
    q = (DATA_TYPE*)malloc(NX * sizeof(DATA_TYPE));
    s_outputFromGpu = (DATA_TYPE*)malloc(NY * sizeof(DATA_TYPE));
    q_outputFromGpu = (DATA_TYPE*)malloc(NX * sizeof(DATA_TYPE));

    init_array(A, p, r);
    read_cl_file();
    cl_initialization();
    cl_mem_init(A, r, s, p, q);
    cl_load_prog();

    cl_launch_kernel();

    /* Accumulate both error codes; the original overwrote the first before checking it. */
    errcode = clEnqueueReadBuffer(clCommandQue, s_mem_obj, CL_TRUE, 0, NY * sizeof(DATA_TYPE), s_outputFromGpu, 0, NULL, NULL);
    errcode |= clEnqueueReadBuffer(clCommandQue, q_mem_obj, CL_TRUE, 0, NX * sizeof(DATA_TYPE), q_outputFromGpu, 0, NULL, NULL);
    if (errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

    t_start = rtclock();
    bicg_cpu(A, r, s, p, q);
    t_end = rtclock();
    fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(s, s_outputFromGpu, q, q_outputFromGpu);

    cl_clean_up();

    free(A);
    free(r);
    free(s);
    free(p);
    free(q);
    free(s_outputFromGpu);
    free(q_outputFromGpu);

    return 0;
}
int main(int argc, char** argv)
{
    int m = M;
    int n = N;
    double t_start, t_end;

    /* Array declaration */
    DATA_TYPE float_n = 321414134.01;
    DATA_TYPE eps = 0.005;
    DATA_TYPE data[M + 1][N + 1];
    DATA_TYPE data_Gpu[M + 1][N + 1];
    DATA_TYPE mean[M + 1];
    DATA_TYPE mean_Gpu[M + 1];
    DATA_TYPE stddev[M + 1];
    DATA_TYPE stddev_Gpu[M + 1];
    DATA_TYPE symmat[M + 1][M + 1];
    DATA_TYPE symmat_outputFromGpu[M + 1][M + 1];

    /* Initialize array. */
    init_arrays(data, data_Gpu);

#pragma hmpp corr allocate
#pragma hmpp corr advancedload, args[pdata;psymmat;pstddev;pmean;pfloat_n;peps]

    t_start = rtclock();
#pragma hmpp corr callsite, args[pdata;psymmat;pstddev;pmean;pfloat_n;peps].advancedload=true, asynchronous
    runCorr(data_Gpu, symmat_outputFromGpu, stddev_Gpu, mean_Gpu, float_n, eps);
#pragma hmpp corr synchronize
    t_end = rtclock();
    fprintf(stderr, "GPU Runtime: %0.6lfs\n", t_end - t_start);

#pragma hmpp corr delegatedstore, args[pdata;psymmat;pstddev;pmean]
#pragma hmpp corr release

    t_start = rtclock();
    runCorr(data, symmat, stddev, mean, float_n, eps);
    t_end = rtclock();
    fprintf(stderr, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(symmat, symmat_outputFromGpu);

    return 0;
}
int main(int argc, char** argv)
{
    double t_start, t_end;

    DATA_TYPE* A;
    DATA_TYPE* r;
    DATA_TYPE* s;
    DATA_TYPE* p;
    DATA_TYPE* q;
    DATA_TYPE* s_GPU;
    DATA_TYPE* q_GPU;

    A = (DATA_TYPE*)malloc(NX * NY * sizeof(DATA_TYPE));
    r = (DATA_TYPE*)malloc(NX * sizeof(DATA_TYPE));
    s = (DATA_TYPE*)malloc(NY * sizeof(DATA_TYPE));
    p = (DATA_TYPE*)malloc(NY * sizeof(DATA_TYPE));
    q = (DATA_TYPE*)malloc(NX * sizeof(DATA_TYPE));
    s_GPU = (DATA_TYPE*)malloc(NY * sizeof(DATA_TYPE));
    q_GPU = (DATA_TYPE*)malloc(NX * sizeof(DATA_TYPE));

    fprintf(stdout, "<< BiCG Sub Kernel of BiCGStab Linear Solver >>\n");

    init_array(A, p, r);

    t_start = rtclock();
    bicg_OMP(A, r, s_GPU, p, q_GPU);
    t_end = rtclock();
    fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);

    t_start = rtclock();
    bicg_cpu(A, r, s, p, q);
    t_end = rtclock();
    fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(s, s_GPU, q, q_GPU);

    free(A);
    free(r);
    free(s);
    free(p);
    free(q);
    free(s_GPU);
    free(q_GPU);

    return 0;
}
void cl_launch_kernel()
{
    double t_start, t_end;
    int nx = NX;
    int ny = NY;

    size_t localWorkSize[2], globalWorkSize[2];
    localWorkSize[0] = DIM_LOCAL_WORK_GROUP_X;
    localWorkSize[1] = DIM_LOCAL_WORK_GROUP_Y;
    globalWorkSize[0] = (size_t)ceil(((float)NX) / ((float)DIM_LOCAL_WORK_GROUP_X)) * DIM_LOCAL_WORK_GROUP_X;
    globalWorkSize[1] = 1;

    t_start = rtclock();

    // Set the arguments of the 1st kernel
    errcode = clSetKernelArg(clKernel1, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    errcode |= clSetKernelArg(clKernel1, 1, sizeof(cl_mem), (void *)&p_mem_obj);
    errcode |= clSetKernelArg(clKernel1, 2, sizeof(cl_mem), (void *)&q_mem_obj);
    errcode |= clSetKernelArg(clKernel1, 3, sizeof(int), &nx);
    errcode |= clSetKernelArg(clKernel1, 4, sizeof(int), &ny);
    if (errcode != CL_SUCCESS) printf("Error in setting arguments\n");

    // Execute the 1st OpenCL kernel
    errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel1, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
    if (errcode != CL_SUCCESS) printf("Error in launching kernel\n");
    clFinish(clCommandQue);

    globalWorkSize[0] = (size_t)ceil(((float)NY) / ((float)DIM_LOCAL_WORK_GROUP_X)) * DIM_LOCAL_WORK_GROUP_X;
    globalWorkSize[1] = 1;

    // Set the arguments of the 2nd kernel
    errcode = clSetKernelArg(clKernel2, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    errcode |= clSetKernelArg(clKernel2, 1, sizeof(cl_mem), (void *)&r_mem_obj);
    errcode |= clSetKernelArg(clKernel2, 2, sizeof(cl_mem), (void *)&s_mem_obj);
    errcode |= clSetKernelArg(clKernel2, 3, sizeof(int), &nx);
    errcode |= clSetKernelArg(clKernel2, 4, sizeof(int), &ny);
    if (errcode != CL_SUCCESS) printf("Error in setting arguments\n");

    // Execute the 2nd OpenCL kernel
    errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel2, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
    if (errcode != CL_SUCCESS) printf("Error in launching kernel\n");
    clFinish(clCommandQue);

    t_end = rtclock();
    fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);
}
int main()
{
    int t, i, j, k, l, m, n;

    init_array();

#ifdef PERFCTR
    PERF_INIT;
#endif

    IF_TIME(t_start = rtclock());

#pragma scop
    for (t = 0; t < tmax; t++) {
        for (j = 0; j < ny; j++)
            ey[0][j] = t;
        for (i = 1; i < nx; i++)
            for (j = 0; j < ny; j++)
                ey[i][j] = ey[i][j] - 0.5 * (hz[i][j] - hz[i - 1][j]);
        for (i = 0; i < nx; i++)
            for (j = 1; j < ny; j++)
                ex[i][j] = ex[i][j] - 0.5 * (hz[i][j] - hz[i][j - 1]);
        for (i = 0; i < nx; i++)
            for (j = 0; j < ny; j++)
                hz[i][j] = hz[i][j] - 0.7 * (ex[i][j + 1] - ex[i][j] + ey[i + 1][j] - ey[i][j]);
    }
#pragma endscop

    IF_TIME(t_end = rtclock());
    IF_TIME(fprintf(stdout, "%0.6lfs\n", t_end - t_start));

#ifdef PERFCTR
    PERF_EXIT;
#endif

    if (fopen(".test", "r")) {
#ifdef __MPI
        if (my_rank == 0) {
            print_array();
        }
#else
        print_array();
#endif
    }

    return 0;
}
int main()
{
    init_arrays();

    double annot_t_start = 0, annot_t_end = 0, annot_t_total = 0;
    int annot_i;

    for (annot_i = 0; annot_i < REPS; annot_i++) {
        annot_t_start = rtclock();

        register int i, j, k;

        /* In-place LU decomposition without pivoting */
        for (k = 0; k <= N - 1; k++) {
            for (j = k + 1; j <= N - 1; j++)
                A[k][j] = A[k][j] / A[k][k];
            for (i = k + 1; i <= N - 1; i++)
                for (j = k + 1; j <= N - 1; j++)
                    A[i][j] = A[i][j] - A[i][k] * A[k][j];
        }

        annot_t_end = rtclock();
        annot_t_total += annot_t_end - annot_t_start;
    }

    annot_t_total = annot_t_total / REPS;

#ifndef TEST
    printf("%f\n", annot_t_total);
#else
    {
        int i, j;
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                if (j % 100 == 0) printf("\n");
                printf("%f ", A[i][j]);
            }
            printf("\n");
        }
    }
#endif

    /* Return a value derived from A so the computation stays live. */
    return ((int) A[0][0]);
}
int main()
{
    double t_start, t_end;

    DATA_TYPE a[N][N];
    DATA_TYPE x1[N];
    DATA_TYPE x1_outputFromGpu[N];
    DATA_TYPE x2[N];
    DATA_TYPE x2_outputFromGpu[N];
    DATA_TYPE y1[N];
    DATA_TYPE y2[N];

    // initialize the arrays for running on the CPU and GPU
    init_array(a, x1, x1_outputFromGpu, x2, x2_outputFromGpu, y1, y2);

#pragma hmpp mvt allocate
#pragma hmpp mvt advancedload, args[a,x1,x2,y1,y2]

    t_start = rtclock();
    // run the algorithm on the GPU; arrays are declared in decls.h and set up by init_array()
#pragma hmpp mvt callsite, args[x1,x2].advancedload=true, asynchronous
    runMvt(a, x1_outputFromGpu, x2_outputFromGpu, y1, y2);
#pragma hmpp mvt synchronize
    t_end = rtclock();
    fprintf(stderr, "GPU Runtime: %0.6lfs\n", t_end - t_start);

#pragma hmpp mvt delegatedstore, args[x1,x2]
#pragma hmpp mvt release

    t_start = rtclock();
    // run the algorithm on the CPU
    runMvt(a, x1, x2, y1, y2);
    t_end = rtclock();
    fprintf(stderr, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(x1, x1_outputFromGpu, x2, x2_outputFromGpu);

    return 0;
}
int main()
{
    int i, j, k;
    double t_start, t_end;

    for (i = 0; i < NMAX; i++) {
        for (j = 0; j < NMAX; j++) {
            c[i][j] = 0.0;
            a[i][j] = b[i][j] = i * j * 0.5 / NMAX;
        }
    }

    IF_TIME(t_start = rtclock());

#pragma scop
    for (i = 0; i < NMAX; i++) {
        for (j = 0; j < NMAX; j++) {
            for (k = 0; k < j - 1; k++) {
                c[i][k] += a[j][k] * b[i][j];
                c[i][j] += a[j][j] * b[i][j];
            }
            c[i][j] += a[j][j] * b[i][j];
        }
    }
#pragma endscop

    IF_TIME(t_end = rtclock());
    IF_TIME(fprintf(stdout, "%0.6lfs\n", t_end - t_start));

    if (fopen(".test", "r")) {
#ifdef MPI
        if (my_rank == 0) {
#endif
            for (i = 0; i < NMAX; i++) {
                for (j = 0; j < NMAX; j++) {
                    fprintf(stderr, "%lf ", c[i][j]);
                }
                fprintf(stderr, "\n");
            }
#ifdef MPI
        }
#endif
    }

    return 0;
}
int main(void)
{
    double t_start, t_end;
    int i;

    DATA_TYPE* A, *A_2;
    DATA_TYPE* C4, *C4_2;
    DATA_TYPE* sum, *sum_2;

    A = (DATA_TYPE*)malloc(NR * NQ * NP * sizeof(DATA_TYPE));
    C4 = (DATA_TYPE*)malloc(NP * NP * sizeof(DATA_TYPE));
    sum = (DATA_TYPE*)malloc(NR * NQ * NP * sizeof(DATA_TYPE));
    A_2 = (DATA_TYPE*)malloc(NR * NQ * NP * sizeof(DATA_TYPE));
    C4_2 = (DATA_TYPE*)malloc(NP * NP * sizeof(DATA_TYPE));
    sum_2 = (DATA_TYPE*)malloc(NR * NQ * NP * sizeof(DATA_TYPE));

    init_array(A, C4);
    init_array(A_2, C4_2);
    read_cl_file();
    cl_initialization();
    cl_mem_init(A, C4, sum);
    cl_load_prog();

    t_start = rtclock();
    int r;
    for (r = 0; r < NR; r++) {
        cl_launch_kernel1(r);
        cl_launch_kernel2(r);
    }
    t_end = rtclock();

    errcode = clEnqueueReadBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0,
                                  NR * NQ * NP * sizeof(DATA_TYPE), sum, 0, NULL, NULL);
    if (errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");
    fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);

    t_start = rtclock();
    doitgen(sum_2, A_2, C4_2);
    t_end = rtclock();
    fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(sum, sum_2);

    cl_clean_up();

    /* the original leaked these allocations */
    free(A); free(C4); free(sum);
    free(A_2); free(C4_2); free(sum_2);

    return 0;
}
int main(void)
{
    double t_start, t_end;

    DATA_TYPE* A;
    DATA_TYPE* B;
    DATA_TYPE* x;
    DATA_TYPE* y;
    DATA_TYPE* y_outputFromGpu;
    DATA_TYPE* tmp;

    A = (DATA_TYPE*)malloc(N * N * sizeof(DATA_TYPE));
    B = (DATA_TYPE*)malloc(N * N * sizeof(DATA_TYPE));
    x = (DATA_TYPE*)malloc(N * sizeof(DATA_TYPE));
    y = (DATA_TYPE*)malloc(N * sizeof(DATA_TYPE));
    y_outputFromGpu = (DATA_TYPE*)malloc(N * sizeof(DATA_TYPE));
    tmp = (DATA_TYPE*)malloc(N * sizeof(DATA_TYPE));

    init(A, x);
    read_cl_file();
    cl_initialization();
    cl_mem_init(A, B, x, y, tmp);
    cl_load_prog();

    cl_launch_kernel();

    errcode = clEnqueueReadBuffer(clCommandQue, y_mem_obj, CL_TRUE, 0, N * sizeof(DATA_TYPE), y_outputFromGpu, 0, NULL, NULL);
    if (errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

    t_start = rtclock();
    gesummv(A, B, x, y, tmp);
    t_end = rtclock();
    fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(y, y_outputFromGpu);

    cl_clean_up();

    free(A);
    free(B);
    free(x);
    free(y);
    free(y_outputFromGpu);
    free(tmp);

    return 0;
}
int main(int argc, char** argv)
{
    double t_start, t_end;

    /* Array declaration. */
    DATA_TYPE A[N][N];
    DATA_TYPE x[N];
    DATA_TYPE u1[N];
    DATA_TYPE u2[N];
    DATA_TYPE v2[N];
    DATA_TYPE v1[N];
    DATA_TYPE w[N];
    DATA_TYPE wi[N];
    DATA_TYPE y[N];
    DATA_TYPE z[N];

    /* Initialize array. */
    init(A, x, u1, u2, v2, v1, w, wi, y, z);

#pragma hmpp conv allocate
#pragma hmpp conv advancedload, args[A;x;u1;u2;v2;v1;w;y;z]

    t_start = rtclock();
#pragma hmpp conv callsite, args[A;x;u1;u2;v2;v1;w;y;z].advancedload=true, asynchronous
    loop(A, x, u1, u2, v2, v1, wi, y, z);
#pragma hmpp conv synchronize
    t_end = rtclock();
    fprintf(stderr, "GPU Runtime: %0.6lfs\n", t_end - t_start);

#pragma hmpp conv delegatedstore, args[w]
#pragma hmpp conv release

    t_start = rtclock();
    loop(A, x, u1, u2, v2, v1, w, y, z);
    t_end = rtclock();
    fprintf(stderr, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(w, wi);

    return 0;
}
void polybench_timer_start()
{
    polybench_prepare_instruments();
#ifndef POLYBENCH_CYCLE_ACCURATE_TIMER
    polybench_t_start = rtclock();
#else
    polybench_c_start = rdtsc();
#endif
}
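/*
 * Hedged sketch of the matching stop routine, mirroring the structure above;
 * it assumes end-timestamp globals (polybench_t_end / polybench_c_end)
 * symmetric to the start ones.
 */
void polybench_timer_stop()
{
#ifndef POLYBENCH_CYCLE_ACCURATE_TIMER
    polybench_t_end = rtclock();
#else
    polybench_c_end = rdtsc();
#endif
}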