VImage VAniso3d(VImage src,VImage dest,VShort numiter, VShort type,VFloat kappa,VFloat alpha) { VImage tmp1=NULL,tmp2=NULL; int nbands,nrows,ncols; int b,r,c,iter; float delta; float dx,dy,dz,d,u,v; float ux1,ux2,uy1,uy2,uz1,uz2; float b1,b2,r1,r2,c1,c2; VDouble xmax,xmin; VBoolean ignore = TRUE; nbands = VImageNBands(src); nrows = VImageNRows(src); ncols = VImageNColumns(src); if (nbands < 3) VError(" min number of slices is 3"); tmp1 = VConvertImageCopy(src,NULL,VAllBands,VFloatRepn); tmp2 = VCreateImage(nbands,nrows,ncols,VFloatRepn); VFillImage(tmp2,VAllBands,0); xmax = VPixelMaxValue (tmp1); xmin = VPixelMinValue (tmp1); delta = 1.0 / 7.0; dx = dy = dz = 0; for (iter=0; iter < numiter; iter++) { for (b=1; b<nbands-1; b++) { for (r=1; r<nrows-1; r++) { for (c=1; c<ncols-1; c++) { u = VPixel(tmp1,b,r,c,VFloat); if (ignore && ABS(u) < 1.0e-10) continue; c1 = VPixel(tmp1,b,r,c+1,VFloat); c2 = VPixel(tmp1,b,r,c-1,VFloat); r1 = VPixel(tmp1,b,r+1,c,VFloat); r2 = VPixel(tmp1,b,r-1,c,VFloat); b1 = VPixel(tmp1,b+1,r,c,VFloat); b2 = VPixel(tmp1,b-1,r,c,VFloat); /* col-dir */ dx = c1-u; dy = r1-r2; dz = b1-b2; d = diffusion3d(dx,dy,dz,type,kappa,alpha); ux1 = d*(c1 - u); dx = u-c2; d = diffusion3d(dx,dy,dz,type,kappa,alpha); ux2 = d*(u - c2); /* row-dir */ dx = c1-c2; dy = r1-u; dz = b1-b2; d = diffusion3d(dx,dy,dz,type,kappa,alpha); uy1 = d*(r1 - u); dy = u-r2; d = diffusion3d(dx,dy,dz,type,kappa,alpha); uy2 = d*(u - r2); /* slice-dir */ dx = c1-c2; dy = r1-r2; dz = b1-u; d = diffusion3d(dx,dy,dz,type,kappa,alpha); uz1 = d*(b1 - u); dz = u-b2; d = diffusion3d(dx,dy,dz,type,kappa,alpha); uz2 = d*(u - b2); /* sum */ v = u + delta*(ux1 - ux2 + uy1 - uy2 + uz1 - uz2); if (v > xmax) v = xmax; if (v < xmin) v = xmin; VPixel(tmp2,b,r,c,VFloat) = v; } } } tmp1 = VCopyImagePixels(tmp2,tmp1,VAllBands); } /* ** output */ dest = VCopyImage(src,dest,VAllBands); xmax = VPixelMaxValue (dest); xmin = VPixelMinValue (dest); for (b=1; b<nbands-1; b++) { for (r=1; r<nrows-1; r++) { for (c=1; c<ncols-1; c++) { v = VPixel(tmp2,b,r,c,VFloat); if (v > xmax) v = xmax; if (v < xmin) v = xmin; VSetPixel(dest,b,r,c,(VDouble) v); } } } VDestroyImage(tmp1); VDestroyImage(tmp2); return dest; }
void mainloop // ==================================================================== // // purpos : 2-dimensional diffusion equation solved by FDM // // date : 2012-5-10 // programmer : Michel Müller // place : Tokyo Institute of Technology // ( FLOAT *f, /* dependent variable */ FLOAT *fn, /* updated dependent variable */ FLOAT kappa, /* diffusion coefficient */ FLOAT *time, /* time */ FLOAT dt, /* time step interval */ FLOAT dx, /* grid spacing in the x-direction */ FLOAT dy, /* grid spacing in the y-direction */ FLOAT dz /* grid spacing in the z-direction */ ) // -------------------------------------------------------------------- { int icnt = 1; double start_time, elapsed_time; double start_time_total, start_computation_time, elapsed_time_total, elapsed_computation_time; clock_t ctime_start_computation_time, ctime_start_total_time; double ctime_elapsed_computation_time, ctime_elapsed_total_time; long long int numOfStencilsComputed = 0; long long int idealCacheModelBytesTransferred = 0; long long int noCacheModelBytesTransferred = 0; start_time = omp_get_wtime(); ctime_start_total_time = clock() / CLOCKS_PER_SEC; printf("Starting Reference C Version of 3D Diffusion\n"); printf("kappa: %e, dt: %e, dx: %e\n", kappa, dt, dx); #pragma omp parallel #pragma omp master { printf("num threads: %d\n", omp_get_num_threads( )); } #pragma acc data copy(f[0:XYZ_SIZE]), create(fn[0:XYZ_SIZE]) { #pragma omp master { start_computation_time = omp_get_wtime(); ctime_start_computation_time = clock() / CLOCKS_PER_SEC; } do { if(icnt % 100 == 0) fprintf(stderr,"time after iteration %4d:%7.5f\n",icnt+1,*time + dt); diffusion3d(f,fn,kappa,dt,dx,dy,dz); numOfStencilsComputed += DIM_X_INNER * DIM_Y_INNER * DIM_Z_INNER; idealCacheModelBytesTransferred += DIM_X_INNER * DIM_Y_INNER * DIM_Z_INNER * FLOAT_BYTE_LENGTH * 2; noCacheModelBytesTransferred += DIM_X_INNER * DIM_Y_INNER * DIM_Z_INNER * FLOAT_BYTE_LENGTH * 8; swap(&f,&fn); *time = *time + dt; } while(icnt++ < 90000 && *time + 0.5*dt < 0.1); #pragma acc wait #pragma omp master { elapsed_computation_time = omp_get_wtime() - start_computation_time; ctime_elapsed_computation_time = (clock() - ctime_start_computation_time) / (double) CLOCKS_PER_SEC; } } elapsed_time_total = omp_get_wtime() - start_time; ctime_elapsed_total_time = (clock() - ctime_start_total_time) / (double) CLOCKS_PER_SEC; double elapsed_computation_time_combined = elapsed_computation_time; if (elapsed_computation_time_combined <= 0.0) { elapsed_computation_time_combined = ctime_elapsed_computation_time; } aprint("Calculated Time= %9.3e [sec]\n",*time); aprint("Elapsed Total Time (OMP timer)= %9.3e [sec]\n",elapsed_time_total); aprint("Elapsed Total Time (CTime)= %9.3e [sec]\n",ctime_elapsed_total_time); aprint("Elapsed Computation Time (OMP timer)= %9.3e [sec]\n",elapsed_computation_time); aprint("Elapsed Computation Time (CTime)= %9.3e [sec]\n",ctime_elapsed_computation_time); aprint("Performance= %7.2f [million stencils/sec]\n",((double)numOfStencilsComputed)/elapsed_computation_time_combined*1.0e-06); aprint("Bandwidth Ideal Cache Model= %7.2f [GB/s]\n",((double)idealCacheModelBytesTransferred)/elapsed_computation_time_combined*1.0e-09); aprint("Bandwidth No Cache Model= %7.2f [GB/s]\n",((double)noCacheModelBytesTransferred)/elapsed_computation_time_combined*1.0e-09); }