int main(int argc,char *argv[]) { int i,j,k,nn; int mx,my,mz,it; float gosa; double cpu,cpu0,cpu1,flop,target; target= 60.0; omega= 0.8; mx= MX0-1; my= MY0-1; mz= MZ0-1; ndx= NDX0; ndy= NDY0; ndz= NDZ0; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &npe); MPI_Comm_rank(MPI_COMM_WORLD, &id); int namelen; char processor_name[MPI_MAX_PROCESSOR_NAME]; MPI_Get_processor_name(processor_name,&namelen); fprintf(stderr, "[%d] %s\n", id, processor_name); initcomm(ndx,ndy,ndz); it= initmax(mx,my,mz); /* * Initializing matrixes */ initmt(mx,it); float *sendp2_buf = (float*)malloc(MIMAX*MKMAX*sizeof(float)*4); sendp2_lo_sendbuf = &sendp2_buf[MIMAX*MKMAX*0]; sendp2_lo_recvbuf = &sendp2_buf[MIMAX*MKMAX*1]; sendp2_hi_sendbuf = &sendp2_buf[MIMAX*MKMAX*2]; sendp2_hi_recvbuf = &sendp2_buf[MIMAX*MKMAX*3]; #pragma acc enter data create(sendp2_buf[0:MIMAX*MKMAX*4]) if(id==0){ printf("Sequential version array size\n"); printf(" mimax = %d mjmax = %d mkmax = %d\n",MX0,MY0,MZ0); printf("Parallel version array size\n"); printf(" mimax = %d mjmax = %d mkmax = %d\n",MIMAX,MJMAX,MKMAX); printf("imax = %d jmax = %d kmax =%d\n",imax,jmax,kmax); printf("I-decomp = %d J-decomp = %d K-decomp =%d\n",ndx,ndy,ndz); } nn= 3; if(id==0){ printf(" Start rehearsal measurement process.\n"); printf(" Measure the performance in %d times.\n\n",nn); } #pragma acc data copyin(p, bnd, wrk1, wrk2, a, b, c) present(sendp2_buf[0:MIMAX*MKMAX*4]) { MPI_Barrier(MPI_COMM_WORLD); cpu0= gettime(); gosa= jacobi(nn); cpu1= gettime(); cpu = cpu1 - cpu0; MPI_Allreduce(MPI_IN_PLACE, &cpu, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); flop= fflop(mz,my,mx); if(id == 0){ printf(" MFLOPS: %f time(s): %f %e\n\n", mflops(nn,cpu,flop),cpu,gosa); } nn= (int)(target/(cpu/3.0)); nn= LOOP_TIMES; halo_time = 0.0; if(id == 0){ printf(" Now, start the actual measurement process.\n"); printf(" The loop will be excuted in %d times\n",nn); printf(" This will take about one minute.\n"); printf(" Wait for a while\n\n"); } /* * Start 
measuring */ MPI_Barrier(MPI_COMM_WORLD); cpu0= gettime(); gosa= jacobi(nn); cpu1= gettime(); cpu = cpu1 - cpu0; MPI_Allreduce(MPI_IN_PLACE, &cpu, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&halo_time, &max_halo_time, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&halo_time, &ave_halo_time, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); ave_halo_time /= npe; }//end of acc data if(id == 0){ printf("cpu : %f sec. halo(AVE.) %f sec. halo(MAX) %f sec.\n", cpu, ave_halo_time, max_halo_time); printf("Loop executed for %d times\n",nn); printf("Gosa : %e \n",gosa); printf("MFLOPS measured : %f\n",mflops(nn,cpu,flop)); printf("Score based on Pentium III 600MHz : %f\n", mflops(nn,cpu,flop)/82.84); } free(sendp2_buf); MPI_Finalize(); return (0); }
int main(int argc,char *argv[]) { int i,j,k,nn; int mx,my,mz,it; float gosa; double cpu,cpu0,cpu1,flop,target; target= 60.0; omega= 0.8; mx= MX0-1; my= MY0-1; mz= MZ0-1; ndx= NDX0; ndy= NDY0; ndz= NDZ0; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &npe); MPI_Comm_rank(MPI_COMM_WORLD, &id); hime_err_init(id); if (argc != 3) { if (id == 0) { printf("./bmt <Restart #> <Checkpoint interval (steps)>\n"); printf("\n"); printf(" Restart #:\n"); printf(" Checkpiont id at which bmt starts\n"); printf(" Checkpoint interval (steps):\n"); printf(" # of Steps to skip checkpointing\n"); printf(""); } MPI_Finalize(); exit(0); } restart_id = atoi(argv[1]); interval = atoi(argv[2]); hime_dbgi(0, "Checkpoint directory: %s", CHECKPOINT_DIR); hime_dbgi(0, "Checkpoint interval: %d", interval); if (restart_id > 0) { hime_dbgi(0, "Restart ID: %d", restart_id); restart(restart_id); } initcomm(ndx,ndy,ndz); it= initmax(mx,my,mz); /* * Initializing matrixes */ initmt(mx,it); if(id==0){ printf("Sequential version array size\n"); printf(" mimax = %d mjmax = %d mkmax = %d\n",MX0,MY0,MZ0); printf("Parallel version array size\n"); printf(" mimax = %d mjmax = %d mkmax = %d\n",MIMAX,MJMAX,MKMAX); printf("imax = %d jmax = %d kmax =%d\n",imax,jmax,kmax); printf("I-decomp = %d J-decomp = %d K-decomp =%d\n",ndx,ndy,ndz); } nn= 3; if(id==0){ printf(" Start rehearsal measurement process.\n"); printf(" Measure the performance in %d times.\n\n",nn); } MPI_Barrier(MPI_COMM_WORLD); cpu0= MPI_Wtime(); gosa= jacobi(nn); cpu1= MPI_Wtime() - cpu0; MPI_Allreduce(&cpu1, &cpu, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); flop= fflop(mz,my,mx); if(id == 0){ printf(" MFLOPS: %f time(s): %f %e\n\n", mflops(nn,cpu,flop),cpu,gosa); } nn= (int)(target/(cpu/3.0)); if(id == 0){ printf(" Now, start the actual measurement process.\n"); printf(" The loop will be excuted in %d times\n",nn); printf(" This will take about one minute.\n"); printf(" Wait for a while\n\n"); } /* * Start measuring */ 
MPI_Barrier(MPI_COMM_WORLD); cpu0 = MPI_Wtime(); gosa = jacobi(nn); cpu1 = MPI_Wtime() - cpu0; MPI_Allreduce(&cpu1, &cpu, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); if(id == 0){ printf("cpu : %f sec.\n", cpu); printf("Loop executed for %d times\n",nn); printf("Gosa : %e \n",gosa); printf("MFLOPS measured : %f\n",mflops(nn,cpu,flop)); printf("Score based on Pentium III 600MHz : %f\n", mflops(nn,cpu,flop)/82.84); } MPI_Finalize(); return (0); }
int main(int argc,char *argv[]) { int i,j,k,nn; int mx,my,mz,it; float gosa; double cpu,cpu0,cpu1,flop,target; target= 60.0; omega= 0.8; mx= MX0-1; my= MY0-1; mz= MZ0-1; ndx= NDX0; ndy= NDY0; ndz= NDZ0; MPI_Init(&argc, &argv); #ifdef SCR_ENABLE SCR_Init(); #endif MPI_Comm_size(MPI_COMM_WORLD, &npe); MPI_Comm_rank(MPI_COMM_WORLD, &id); initcomm(ndx,ndy,ndz); it= initmax(mx,my,mz); /* * Initializing matrixes */ initmt(mx,it); if(id==0){ printf("Sequential version array size\n"); printf(" mimax = %d mjmax = %d mkmax = %d\n",MX0,MY0,MZ0); printf("Parallel version array size\n"); printf(" mimax = %d mjmax = %d mkmax = %d\n",MIMAX,MJMAX,MKMAX); printf("imax = %d jmax = %d kmax =%d\n",imax,jmax,kmax); printf("I-decomp = %d J-decomp = %d K-decomp =%d\n",ndx,ndy,ndz); } nn= 3; if(id==0){ printf(" Start rehearsal measurement process.\n"); printf(" Measure the performance in %d times.\n\n",nn); } MPI_Barrier(MPI_COMM_WORLD); cpu0= MPI_Wtime(); gosa= jacobi(nn); cpu1= MPI_Wtime() - cpu0; MPI_Allreduce(&cpu1, &cpu, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); flop= fflop(mz,my,mx); if(id == 0){ printf(" MFLOPS: %f time(s): %f %e\n\n", mflops(nn,cpu,flop),cpu,gosa); } nn= (int)(target/(cpu/3.0)); if(id == 0){ printf(" Now, start the actual measurement process.\n"); printf(" The loop will be excuted in %d times\n",nn); printf(" This will take about one minute.\n"); printf(" Wait for a while\n\n"); } /* * Start measuring */ MPI_Barrier(MPI_COMM_WORLD); cpu0 = MPI_Wtime(); // nn = 10000000; gosa = jacobi(nn); cpu1 = MPI_Wtime() - cpu0; MPI_Allreduce(&cpu1, &cpu, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); if(id == 0){ fprintf(stderr, "cpu : %f sec.\n", cpu); fprintf(stderr, "Loop executed for %d times\n",nn); fprintf(stderr, "Gosa : %e \n",gosa); fprintf(stderr, "GFLOPS measured : %f\n",mflops(nn,cpu,flop)/1000.0); fprintf(stderr, "Score based on Pentium III 600MHz : %f\n", mflops(nn,cpu,flop)/82.84); } #ifdef SCR_ENABLE SCR_Finalize(); #endif MPI_Finalize(); return (0); }
/*
 * Benchmark driver for the single-GPU CUDA build.
 *
 * Selects device `gpu`, creates two streams, allocates and initializes the
 * data through allocate_memory() / initmt(), then launches
 * jacobi_GPU_btm_even() ITERS times while swapping the two pressure
 * buffers (p1_d / p2_d) each iteration.  The value behind gosa_d is copied
 * asynchronously to the host location gosa_btm after every launch and is
 * printed every PRINT_ITER iterations.  After the loop the device is
 * synchronized and compute/bandwidth figures are printed.
 *
 * All CUDA runtime calls are routed through CUDA_SAFE_CALL so a failing
 * copy or synchronization is reported instead of silently producing a
 * meaningless result.
 * NOTE(review): this assumes CUDA_SAFE_CALL only inspects the returned
 * status; confirm it adds no synchronization inside the timed loop.
 *
 * argc / argv are not used.  Returns EXIT_SUCCESS.
 */
int main( int argc, char*argv[] )
{
  float *p_old,*p_new,*p_tmp;
  int    n,nn;
  float  gosa,gflops,thruput,thruput2;
  double time_start,time_max,bytes;
  cudaStream_t stream_top,stream_btm;

  NP=1;
  gpu=0;
  ME=0;

  omega= 0.8f;
  imax = MIMAX-1;
  jmax = MJMAX-1;
  kmax = MKMAX-1;
  imax_global = NP*(imax-2)+2;
  nn = ITERS;

  if(ME==0) {
    printf("\n mimax = %d mjmax = %d mkmax = %d pitch = %d\n",MIMAX, MJMAX, MKMAX, PITCH);
    printf(" imax = %d jmax = %d kmax = %d\n",imax_global,jmax,kmax);
    printf(" gridX = %d gridY = %d blockX = %d blockY = %d\n", GRID_X, GRID_Y, BLOCK_X, BLOCK_Y);
  }

  CUDA_SAFE_CALL(cudaSetDevice(gpu));
  stream_top = 0;
  stream_btm = 0;

#if (CUDART_VERSION >= 3000)
  {
#if (CUDART_VERSION > 3000)
    struct cudaDeviceProp prop;

    // display ECC configuration, only queryable post r3.0
    CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, gpu));
    printf (" ECC on GPU %d is %s\n", gpu, prop.ECCEnabled ? "ON" : "OFF");
#endif /* CUDART_VERSION > 3000 */

    // configure kernels for large shared memory to get better occupancy
    printf (" Configuring GPU L1 cache size ...\n");
    set_kernel_cache_config (cudaFuncCachePreferShared);
  }
#endif /* CUDART_VERSION >= 3000 */

  CUDA_SAFE_CALL(cudaStreamCreate(&stream_top));
  CUDA_SAFE_CALL(cudaStreamCreate(&stream_btm));

  if(ME==0) printf(" Allocating Memory...\n");
  allocate_memory();
  if(ME==0) printf(" Initializing Data...\n\n");
  initmt();

  if(ME==0) {
    printf(" Now, start GPU measurement process.\n");
    printf(" The loop will be excuted %d times\n",nn);
    printf(" Wait for a while\n\n");
  }

  time_start = wallclock();
  gosa = 0.0f;
  p_new = p2_d;
  p_old = p1_d;

  for(n=0 ; n<nn; n++) {
    // swap pointers
    p_tmp = p_new;
    p_new = p_old;
    p_old = p_tmp;

    jacobi_GPU_btm_even (stream_btm,a0_d,a1_d,a2_d,a3_d,b0_d,b1_d,b2_d,c0_d,
                         c1_d,c2_d,wrk_d,bnd_d,p_old,p_new,gosa_d,omega,n);
    CUDA_SAFE_CALL(cudaMemcpyAsync (gosa_btm, gosa_d, sizeof(float),
                                    cudaMemcpyDeviceToHost, stream_btm));

    // Since we want to print intermediate values of gosa every PRINT_ITER
    // iterations, we need to synchronize before picking up the asynchronously
    // updated value.
    if (!(n % PRINT_ITER)) {
      CUDA_SAFE_CALL(cudaStreamSynchronize(stream_btm));
      gosa = *gosa_btm;
    }

    if(ME==0 && n%PRINT_ITER==0)
      printf(" iter: %d \tgosa: %e\n",n,gosa);
  }

  // Wait for all outstanding work before reading the final value and
  // stopping the clock.  cudaThreadSynchronize() is deprecated; use its
  // replacement where the runtime provides it.
#if (CUDART_VERSION >= 4000)
  CUDA_SAFE_CALL(cudaDeviceSynchronize());
#else
  CUDA_SAFE_CALL(cudaThreadSynchronize());
#endif
  gosa = *gosa_btm;

  time_max = wallclock() - time_start;

  gflops = (float)(34.0*( (double)nn*(double)(imax_global-2)*(double)(jmax-2)*(double)(kmax-2) ) / time_max * 1e-9);
  bytes = NP*((double)nn*(56.0*(imax-2)+8.0)*(double)(jmax)*(double)(kmax));
  thruput  = (float)(bytes / time_max / 1024.0 / 1024.0 / 1024.0);  // binary GB/s
  thruput2 = (float)(bytes / time_max / 1e9);                       // decimal GB/s

  if(ME==0) {
    printf(" \nLoop executed for %d times\n",nn);
    printf(" Gosa : %e \n",gosa);
    printf(" total Compute : %4.1f GFLOPS\ttime : %f seconds\n",gflops,time_max);
    printf(" total Bandwidth : %4.1f GB/s\n", thruput);
    printf(" total Bandwidth : %4.1f GB/s (STREAM equivalent)\n",thruput2);
    printf(" Score based on Pentium III 600MHz : %f\n\n",1000.0*gflops/82.0);
  }

  cleanup();
  CUDA_SAFE_CALL(cudaStreamDestroy(stream_top));
  CUDA_SAFE_CALL(cudaStreamDestroy(stream_btm));

  return (EXIT_SUCCESS);
}
int main() { int i, j, k, nn; float gosa; double cpu, cpu0, cpu1, flop, target; int myrank = xmp_node_num() - 1; target = 60.0; omega = 0.8; imax = MIMAX; jmax = MJMAX; kmax = MKMAX; /* * Initializing matrixes */ initmt(); if (myrank == 0) { printf("mimax = %d mjmax = %d mkmax = %d\n",MIMAX, MJMAX, MKMAX); printf("imax = %d jmax = %d kmax =%d\n",imax,jmax,kmax); } nn= 3; if (myrank == 0) { printf(" Start rehearsal measurement process.\n"); printf(" Measure the performance in %d times.\n\n",nn); } cpu0= xmp_wtime(); gosa= jacobi(nn); cpu1= xmp_wtime(); cpu= cpu1 - cpu0; flop= fflop(imax,jmax,kmax); if (myrank == 0) { printf(" MFLOPS: %f time(s): %f %e\n\n", mflops(nn,cpu,flop),cpu,gosa); } nn= (int)(target/(cpu/3.0)); #pragma xmp reduction (max:nn) if (myrank == 0) { printf(" Now, start the actual measurement process.\n"); printf(" The loop will be excuted in %d times\n",nn); printf(" This will take about one minute.\n"); printf(" Wait for a while\n\n"); } /* * Start measuring */ cpu0 = xmp_wtime(); gosa = jacobi(nn); cpu1 = xmp_wtime(); cpu= cpu1 - cpu0; if (myrank == 0) { printf(" Loop executed for %d times\n",nn); printf(" Gosa : %e \n",gosa); printf(" MFLOPS measured : %f\tcpu : %f\n",mflops(nn,cpu,flop),cpu); printf(" Score based on Pentium III 600MHz : %f\n", mflops(nn,cpu,flop)/82.84); } return (0); }