static void mg3P(double **u, double *v, double **r, double a[4], double c[4], int n1, int n2, int n3, int k) { /*-------------------------------------------------------------------- c-------------------------------------------------------------------*/ /*-------------------------------------------------------------------- c multigrid V-cycle routine c-------------------------------------------------------------------*/ int j; /*-------------------------------------------------------------------- c down cycle. c restrict the residual from the find grid to the coarse c-------------------------------------------------------------------*/ for (k = lt; k >= lb+1; k--) { j = k-1; rprj3(r[k], m1[k], m2[k], m3[k], r[j], m1[j], m2[j], m3[j], k); } k = lb; /*-------------------------------------------------------------------- c compute an approximate solution on the coarsest grid c-------------------------------------------------------------------*/ zero3(u[k], m1[k], m2[k], m3[k]); psinv(r[k], u[k], m1[k], m2[k], m3[k], c, k); for (k = lb+1; k <= lt-1; k++) { j = k-1; /*-------------------------------------------------------------------- c prolongate from level k-1 to k c-------------------------------------------------------------------*/ zero3(u[k], m1[k], m2[k], m3[k]); interp(u[j], m1[j], m2[j], m3[j], u[k], m1[k], m2[k], m3[k], k); /*-------------------------------------------------------------------- c compute residual for level k c-------------------------------------------------------------------*/ resid(u[k], r[k], r[k], m1[k], m2[k], m3[k], a, k); /*-------------------------------------------------------------------- c apply smoother c-------------------------------------------------------------------*/ psinv(r[k], u[k], m1[k], m2[k], m3[k], c, k); } j = lt - 1; k = lt; interp(u[j], m1[j], m2[j], m3[j], u[lt], n1, n2, n3, k); resid(u[lt], v, r[lt], n1, n2, n3, a, k); psinv(r[lt], u[lt], n1, n2, n3, c, k); }
void TestNonlinearEquationPde() { ChastePoint<1> zero1(0); ChastePoint<2> zero2(0,0); ChastePoint<3> zero3(0,0,0); double u = 2.0; NonlinearEquationPde<1> heat_equation1; NonlinearEquationPde<2> heat_equation2; NonlinearEquationPde<3> heat_equation3; TS_ASSERT_DELTA(heat_equation1.ComputeNonlinearSourceTerm(zero1,u),0.0,1e-12); TS_ASSERT_DELTA(heat_equation2.ComputeNonlinearSourceTerm(zero2,u),0.0,1e-12); TS_ASSERT_DELTA(heat_equation3.ComputeNonlinearSourceTerm(zero3,u),0.0,1e-12); // Diffusion matrices should be equal to identity * u; c_matrix<double, 1, 1> diff1 = heat_equation1.ComputeDiffusionTerm(zero1,u); c_matrix<double, 2, 2> diff2 = heat_equation2.ComputeDiffusionTerm(zero2,u); c_matrix<double, 3, 3> diff3 = heat_equation3.ComputeDiffusionTerm(zero3,u); TS_ASSERT_DELTA(diff1(0,0),u,1e-12); TS_ASSERT_DELTA(diff2(0,0),u,1e-12); TS_ASSERT_DELTA(diff2(1,1),u,1e-12); TS_ASSERT_DELTA(diff2(0,1),0,1e-12); TS_ASSERT_DELTA(diff3(0,0),u,1e-12); TS_ASSERT_DELTA(diff3(1,1),u,1e-12); TS_ASSERT_DELTA(diff3(2,2),u,1e-12); TS_ASSERT_DELTA(diff3(0,1),0,1e-12); TS_ASSERT_DELTA(diff3(0,2),0,1e-12); TS_ASSERT_DELTA(diff3(1,2),0,1e-12); }
void TestHeatEquation() { ChastePoint<1> zero1(0); ChastePoint<2> zero2(0,0); ChastePoint<3> zero3(0,0,0); double u = 2.0; HeatEquation<1> pde1; HeatEquation<2> pde2; HeatEquation<3> pde3; TS_ASSERT_DELTA(pde1.ComputeSourceTerm(zero1,u), 0.0, 1e-12); TS_ASSERT_DELTA(pde2.ComputeSourceTerm(zero2,u), 0.0, 1e-12); TS_ASSERT_DELTA(pde3.ComputeSourceTerm(zero3,u), 0.0, 1e-12); TS_ASSERT_DELTA(pde1.ComputeDuDtCoefficientFunction(zero1), 1.0, 1e-12); TS_ASSERT_DELTA(pde2.ComputeDuDtCoefficientFunction(zero2), 1.0, 1e-12); TS_ASSERT_DELTA(pde3.ComputeDuDtCoefficientFunction(zero3), 1.0, 1e-12); // Diffusion matrices should be equal to identity c_matrix<double, 1, 1> diff1 = pde1.ComputeDiffusionTerm(zero1); c_matrix<double, 2, 2> diff2 = pde2.ComputeDiffusionTerm(zero2); c_matrix<double, 3, 3> diff3 = pde3.ComputeDiffusionTerm(zero3); TS_ASSERT_DELTA(diff1(0,0), 1, 1e-12); TS_ASSERT_DELTA(diff2(0,0), 1, 1e-12); TS_ASSERT_DELTA(diff2(1,1), 1, 1e-12); TS_ASSERT_DELTA(diff2(0,1), 0, 1e-12); TS_ASSERT_DELTA(diff3(0,0), 1, 1e-12); TS_ASSERT_DELTA(diff3(1,1), 1, 1e-12); TS_ASSERT_DELTA(diff3(2,2), 1, 1e-12); TS_ASSERT_DELTA(diff3(0,1), 0, 1e-12); TS_ASSERT_DELTA(diff3(0,2), 0, 1e-12); TS_ASSERT_DELTA(diff3(1,2), 0, 1e-12); Node<1> node(0, zero1); TS_ASSERT_DELTA(pde1.ComputeSourceTermAtNode(node,u), 0.0, 1e-12); }
int main(int argc, char *argv[]) { /*------------------------------------------------------------------------- c k is the current level. It is passed down through subroutine args c and is NOT global. it is the current iteration c------------------------------------------------------------------------*/ int k, it; double t, tinit, mflops; int nthreads = 1; /*------------------------------------------------------------------------- c These arrays are in common because they are quite large c and probably shouldn''t be allocated on the stack. They c are always passed as subroutine args. c------------------------------------------------------------------------*/ double **u, *v, **r; double a[4], c[4]; double rnm2, rnmu; double epsilon = 1.0e-8; int n1, n2, n3, nit; double verify_value; boolean verified; int i, j, l; FILE *fp; timer_clear(T_BENCH); timer_clear(T_INIT); timer_start(T_INIT); /*---------------------------------------------------------------------- c Read in and broadcast input data c---------------------------------------------------------------------*/ printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version" " - MG Benchmark\n\n"); fp = fopen("mg.input", "r"); if (fp != NULL) { printf(" Reading from input file mg.input\n"); fscanf(fp, "%d", <); while(fgetc(fp) != '\n'); fscanf(fp, "%d%d%d", &nx[lt], &ny[lt], &nz[lt]); while(fgetc(fp) != '\n'); fscanf(fp, "%d", &nit); while(fgetc(fp) != '\n'); for (i = 0; i <= 7; i++) { fscanf(fp, "%d", &debug_vec[i]); } fclose(fp); } else { printf(" No input file. Using compiled defaults\n"); lt = LT_DEFAULT; nit = NIT_DEFAULT; nx[lt] = NX_DEFAULT; ny[lt] = NY_DEFAULT; nz[lt] = NZ_DEFAULT; for (i = 0; i <= 7; i++) { debug_vec[i] = DEBUG_DEFAULT; } } if ( (nx[lt] != ny[lt]) || (nx[lt] != nz[lt]) ) { Class = 'U'; } else if( nx[lt] == 32 && nit == 4 ) { Class = 'S'; } else if( nx[lt] == 64 && nit == 40 ) { Class = 'W'; } else if( nx[lt] == 256 && nit == 20 ) { Class = 'B'; } else if( nx[lt] == 512 && nit == 20 ) { Class = 'C'; } else if( nx[lt] == 256 && nit == 4 ) { Class = 'A'; } else { Class = 'U'; } /*-------------------------------------------------------------------- c Use these for debug info: c--------------------------------------------------------------------- c debug_vec(0) = 1 !=> report all norms c debug_vec(1) = 1 !=> some setup information c debug_vec(1) = 2 !=> more setup information c debug_vec(2) = k => at level k or below, show result of resid c debug_vec(3) = k => at level k or below, show result of psinv c debug_vec(4) = k => at level k or below, show result of rprj c debug_vec(5) = k => at level k or below, show result of interp c debug_vec(6) = 1 => (unused) c debug_vec(7) = 1 => (unused) c-------------------------------------------------------------------*/ a[0] = -8.0/3.0; a[1] = 0.0; a[2] = 1.0/6.0; a[3] = 1.0/12.0; if (Class == 'A' || Class == 'S' || Class =='W') { /*-------------------------------------------------------------------- c Coefficients for the S(a) smoother c-------------------------------------------------------------------*/ c[0] = -3.0/8.0; c[1] = 1.0/32.0; c[2] = -1.0/64.0; c[3] = 0.0; } else { /*-------------------------------------------------------------------- c Coefficients for the S(b) smoother c-------------------------------------------------------------------*/ c[0] = -3.0/17.0; c[1] = 1.0/33.0; c[2] = -1.0/61.0; c[3] = 0.0; } lb = 1; setup(&n1,&n2,&n3,lt); /* Allocate the data arrays * 3d arrays are flattened and allocated as a contiguous block * 4d arrays are allocated as separate 3d blocks */ u = (double **)malloc((lt+1)*sizeof(double *)); for (l=lt; l >=1; l--) u[l] = (double *)malloc(m3[l]*m2[l]*m1[l]*sizeof(double)); v = (double *)malloc(m3[lt]*m2[lt]*m1[lt]*sizeof(double)); r = (double **)malloc((lt+1)*sizeof(double *)); for (l=lt; l >=1; l--) r[l] = (double *)malloc(m3[l]*m2[l]*m1[l]*sizeof(double)); // Array v can be treated using a standard OpenACC data region #pragma acc data create(v[0:m3[lt]*m2[lt]*m1[lt]]) copyin(a[0:4],c[0:4]) { #ifdef _OPENACC //**************************************************************** /* Now manually deep-create arrays u,r on the GPU using the Cray extended * runtime API, instead of using a data region */ double **acc_u = (double **)cray_acc_create(u,(lt+1)*sizeof(double *)); for (l=lt; l >=1; l--) { double *acc_ul = (double *)cray_acc_create(u[l],m3[l]*m2[l]*m1[l]*sizeof(double)); SET_ACC_PTR(acc_u[l], acc_ul); } double **acc_r = (double **)cray_acc_create(r,(lt+1)*sizeof(double *)); for (l=lt; l >=1; l--) { double *acc_rl = (double *)cray_acc_create(r[l],m3[l]*m2[l]*m1[l]*sizeof(double)); SET_ACC_PTR(acc_r[l], acc_rl); } //**************************************************************** #endif /* _OPENACC */ #pragma omp parallel { zero3(u[lt],n1,n2,n3); } zran3(v,n1,n2,n3,nx[lt],ny[lt],lt); #pragma omp parallel { norm2u3(v,n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]); #pragma omp single { /* printf("\n norms of random v are\n"); printf(" %4d%19.12e%19.12e\n", 0, rnm2, rnmu); printf(" about to evaluate resid, k= %d\n", lt);*/ printf(" Size: %3dx%3dx%3d (class %1c)\n", nx[lt], ny[lt], nz[lt], Class); printf(" Iterations: %3d\n", nit); } resid(u[lt],v,r[lt],n1,n2,n3,a,lt); norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]); /*c--------------------------------------------------------------------- c One iteration for startup c---------------------------------------------------------------------*/ mg3P(u,v,r,a,c,n1,n2,n3,lt); resid(u[lt],v,r[lt],n1,n2,n3,a,lt); #pragma omp single setup(&n1,&n2,&n3,lt); zero3(u[lt],n1,n2,n3); } /* pragma omp parallel */ zran3(v,n1,n2,n3,nx[lt],ny[lt],lt); timer_stop(T_INIT); timer_start(T_BENCH); #pragma omp parallel firstprivate(nit) private(it) { resid(u[lt],v,r[lt],n1,n2,n3,a,lt); norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]); for ( it = 1; it <= nit; it++) { mg3P(u,v,r,a,c,n1,n2,n3,lt); resid(u[lt],v,r[lt],n1,n2,n3,a,lt); } norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]); #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif } /* pragma omp parallel */ timer_stop(T_BENCH); t = timer_read(T_BENCH); tinit = timer_read(T_INIT); verified = FALSE; verify_value = 0.0; printf(" Initialization time: %15.3f seconds\n", tinit); printf(" Benchmark completed\n"); if (Class != 'U') { if (Class == 'S') { verify_value = 0.530770700573e-04; } else if (Class == 'W') { verify_value = 0.250391406439e-17; /* 40 iterations*/ /* 0.183103168997d-044 iterations*/ } else if (Class == 'A') { verify_value = 0.2433365309e-5; } else if (Class == 'B') { verify_value = 0.180056440132e-5; } else if (Class == 'C') { verify_value = 0.570674826298e-06; } if ( fabs( rnm2 - verify_value ) <= epsilon ) { verified = TRUE; printf(" VERIFICATION SUCCESSFUL\n"); printf(" L2 Norm is %20.12e\n", rnm2); printf(" Error is %20.12e\n", rnm2 - verify_value); } else { verified = FALSE; printf(" VERIFICATION FAILED\n"); printf(" L2 Norm is %20.12e\n", rnm2); printf(" The correct L2 Norm is %20.12e\n", verify_value); } } else { verified = FALSE; printf(" Problem size unknown\n"); printf(" NO VERIFICATION PERFORMED\n"); } if ( t != 0.0 ) { int nn = nx[lt]*ny[lt]*nz[lt]; mflops = 58.*nit*nn*1.0e-6 / t; } else { mflops = 0.0; } c_print_results("MG", Class, nx[lt], ny[lt], nz[lt], nit, nthreads, t, mflops, " floating point", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); // I should probably deep-free the manually deep-created accelerator data here } //acc end data }
static void zran3(double *z, int n1, int n2, int n3, int nx, int ny, int k) { /*-------------------------------------------------------------------- c-------------------------------------------------------------------*/ /*-------------------------------------------------------------------- c zran3 loads +1 at ten randomly chosen points, c loads -1 at a different ten random points, c and zero elsewhere. c-------------------------------------------------------------------*/ #define MM 10 #define A pow(5.0,13) #define X 314159265.e0 int i0, m0, m1; int i1, i2, i3, d1, e1, e2, e3; double xx, x0, x1, a1, a2, ai; double ten[MM][2], best; int i, j1[MM][2], j2[MM][2], j3[MM][2]; int jg[4][MM][2]; double rdummy; a1 = power( A, nx ); a2 = power( A, nx*ny ); #if 0 #pragma omp parallel { zero3(z,n1,n2,n3); } #else #pragma omp parallel for private(i2, i1) for (i3 = 0;i3 < n3; i3++) { for (i2 = 0; i2 < n2; i2++) { for (i1 = 0; i1 < n1; i1++) { int i123 = i1 + n1*(i2 + n2*i3); z[i123] = 0.0; } } } #endif i = is1-1+nx*(is2-1+ny*(is3-1)); ai = power( A, i ); d1 = ie1 - is1 + 1; e1 = ie1 - is1 + 2; e2 = ie2 - is2 + 2; e3 = ie3 - is3 + 2; x0 = X; rdummy = randlc( &x0, ai ); for (i3 = 1; i3 < e3; i3++) { x1 = x0; for (i2 = 1; i2 < e2; i2++) { xx = x1; vranlc( d1, &xx, A, &(z[0+n1*(i2 + n2*i3)])); rdummy = randlc( &x1, a1 ); } rdummy = randlc( &x0, a2 ); } /*-------------------------------------------------------------------- c call comm3(z,n1,n2,n3) c call showall(z,n1,n2,n3) c-------------------------------------------------------------------*/ /*-------------------------------------------------------------------- c each processor looks for twenty candidates c-------------------------------------------------------------------*/ for (i = 0; i < MM; i++) { ten[i][1] = 0.0; j1[i][1] = 0; j2[i][1] = 0; j3[i][1] = 0; ten[i][0] = 1.0; j1[i][0] = 0; j2[i][0] = 0; j3[i][0] = 0; } for (i3 = 1; i3 < n3-1; i3++) { for (i2 = 1; i2 < n2-1; i2++) { for (i1 = 1; i1 < n1-1; i1++) { int i123 = i1 + n1*(i2 + n2*i3); if ( z[i123] > ten[0][1] ) { ten[0][1] = z[i123]; j1[0][1] = i1; j2[0][1] = i2; j3[0][1] = i3; bubble( ten, j1, j2, j3, MM, 1 ); } if ( z[i123] < ten[0][0] ) { ten[0][0] = z[i123]; j1[0][0] = i1; j2[0][0] = i2; j3[0][0] = i3; bubble( ten, j1, j2, j3, MM, 0 ); } } } } /*-------------------------------------------------------------------- c Now which of these are globally best? c-------------------------------------------------------------------*/ i1 = MM - 1; i0 = MM - 1; for (i = MM - 1 ; i >= 0; i--) { int j123 = j1[i1][1] + n1*(j2[i1][1] + n2*j3[i1][1]); best = z[j123]; if (best == z[j123]) { jg[0][i][1] = 0; jg[1][i][1] = is1 - 1 + j1[i1][1]; jg[2][i][1] = is2 - 1 + j2[i1][1]; jg[3][i][1] = is3 - 1 + j3[i1][1]; i1 = i1-1; } else { jg[0][i][1] = 0; jg[1][i][1] = 0; jg[2][i][1] = 0; jg[3][i][1] = 0; } ten[i][1] = best; j123 = j1[i0][0] + n1*(j2[i0][0] + n2*j3[i0][0]); best = z[j123]; if (best == z[j123]) { jg[0][i][0] = 0; jg[1][i][0] = is1 - 1 + j1[i0][0]; jg[2][i][0] = is2 - 1 + j2[i0][0]; jg[3][i][0] = is3 - 1 + j3[i0][0]; i0 = i0-1; } else { jg[0][i][0] = 0; jg[1][i][0] = 0; jg[2][i][0] = 0; jg[3][i][0] = 0; } ten[i][0] = best; } m1 = i1+1; m0 = i0+1; /* printf(" negative charges at"); for (i = 0; i < MM; i++) { if (i%5 == 0) printf("\n"); printf(" (%3d,%3d,%3d)", jg[1][i][0], jg[2][i][0], jg[3][i][0]); } printf("\n positive charges at"); for (i = 0; i < MM; i++) { if (i%5 == 0) printf("\n"); printf(" (%3d,%3d,%3d)", jg[1][i][1], jg[2][i][1], jg[3][i][1]); } printf("\n small random numbers were\n"); for (i = MM-1; i >= 0; i--) { printf(" %15.8e", ten[i][0]); } printf("\n and they were found on processor number\n"); for (i = MM-1; i >= 0; i--) { printf(" %4d", jg[0][i][0]); } printf("\n large random numbers were\n"); for (i = MM-1; i >= 0; i--) { printf(" %15.8e", ten[i][1]); } printf("\n and they were found on processor number\n"); for (i = MM-1; i >= 0; i--) { printf(" %4d", jg[0][i][1]); } printf("\n");*/ #if 0 #pragma omp parallel for private(i2, i1) for (i3 = 0; i3 < n3; i3++) { for (i2 = 0; i2 < n2; i2++) { for (i1 = 0; i1 < n1; i1++) { int i123 = i1 + n1*(i2+n2*i3); z[i123] = 0.0; } } } #else #pragma omp parallel { zero3(z,n1,n2,n3); } #endif #pragma acc parallel present(z[0:n3*n2*n1]) copyin(jg) { #pragma acc loop for (i = MM-1; i >= m0; i--) { int j123 = j1[i][0] + n1*(j2[i][0] + n2*j3[i][0]); z[j123] = -1.0; } #pragma acc loop for (i = MM-1; i >= m1; i--) { int j123 = j1[i][1] + n1*(j2[i][1] + n2*j3[i][1]); z[j123] = 1.0; } } // end acc parallel #pragma omp parallel comm3(z,n1,n2,n3,k); /*-------------------------------------------------------------------- c call showall(z,n1,n2,n3) c-------------------------------------------------------------------*/ }
int main(int argc, char *argv[]) { clock_t tstart, tend; double duration; int numprocs, rank; float *sendbuf, *recvbuf; MPI_Comm Comm=MPI_COMM_WORLD; bool verb, wantrecord, wantwf, onlyrecord; sf_file Ffvel, Ffden, Fbvel, Fbden; sf_file Fsrc, Frcd, Fimg1, Fimg2; sf_file FGx, FGz, Fsxx, Fsxz, Fszx, Fszz; sf_file Ftmpfwf, Ftmpbwf; sf_axis at, ax, az, atau; int shtbgn, shtinv, shtnmb, shtpad, shtnmb0; int snapturn, tmpint; float **fvel, **bvel; float ***fwf, ***record, **localrec; float ***img1, **img2, ***mig1, **mig2; float *tmpsxx, *tmpsxz, *tmpszx, *tmpszz; sf_init(argc, argv); MPI_Init(&argc, &argv); MPI_Comm_size(Comm, &numprocs); MPI_Comm_rank(Comm, &rank); tstart=clock(); if(rank==0) sf_warning("numprocs=%d", numprocs); if(!sf_getbool("verb", &verb)) verb=true; if(!sf_getbool("wantrecord", &wantrecord)) wantrecord=false; if(!sf_getbool("wantwf", &wantwf)) wantwf=false; if(!sf_getbool("onlyrecord", &onlyrecord)) onlyrecord=false; Fsrc=sf_input("-input"); Fimg1=sf_output("-output"); Fimg2=sf_output("img2"); Ffvel=sf_input("fvel"); Ffden=sf_input("fden"); Fbvel=sf_input("bvel"); Fbden=sf_input("bden"); if(wantrecord) Frcd=sf_input("record"); else Frcd=sf_output("record"); if(wantwf){ Ftmpfwf=sf_output("tmpfwf"); Ftmpbwf=sf_output("tmpbwf"); } FGx=sf_input("Gx"); FGz=sf_input("Gz"); Fsxx=sf_input("sxx"); Fsxz=sf_input("sxz"); Fszx=sf_input("szx"); Fszz=sf_input("szz"); at=sf_iaxa(Fsrc, 1); nt=sf_n(at); dt=sf_d(at); if(!sf_getbool("srcdecay", &srcdecay)) srcdecay=true; if(!sf_getint("srcrange", &srcrange)) srcrange=3; if(!sf_getfloat("srctrunc", &srctrunc)) srctrunc=0.2; if(!sf_getfloat("srcalpha", &srcalpha)) srcalpha=0.5; wavelet=sf_floatalloc(nt); sf_floatread(wavelet, nt, Fsrc); if(!sf_getint("pmlsize", &pmlsize)) pmlsize=30; if(!sf_getint("nfd", &nfd)) sf_error("Need half of the FD order!"); if(!sf_getfloat("pmld0", &pmld0)) pmld0=200; if(!sf_getint("shtnmb", &shtnmb)) sf_error("Need shot number!"); if(!sf_getint("shtinv", &shtinv)) sf_error("Need shot interval!"); if(!sf_getint("shtbgn", &shtbgn)) shtbgn=0; shtpad=numprocs-shtnmb%numprocs; shtnmb0=shtnmb+shtpad; az=sf_iaxa(Ffvel, 1); nzb=sf_n(az); ax=sf_iaxa(Ffvel, 2); nxb=sf_n(ax); nxzb=nxb*nzb; nz=nzb-2*nfd-2*pmlsize; nx=nxb-2*nfd-2*pmlsize; if(!sf_getint("snapturn", &snapturn)) snapturn=1; if(!sf_getint("ginv", &ginv)) ginv=1; if(!sf_getint("wfinv", &wfinv)) wfinv=1; if(!sf_getint("spz", &spz)) spz=6; if(!sf_getint("gp", &gp)) gp=0; ng=(nx-1)/ginv+1; wfnt=(nt-1)/wfinv+1; wfdt=dt*wfinv; if(!sf_getint("ntau", &ntau)) ntau=1; if(!sf_getfloat("dtau", &dtau)) dtau=wfdt; if(!sf_getfloat("tau0", &tau0)) tau0=0; atau=sf_iaxa(Fsrc, 1); sf_setn(atau, ntau); sf_setd(atau, dtau); sf_seto(atau, tau0); if(!sf_histint(FGx, "n1", &nxz)) sf_error("No n1= in FGx!"); if(nxz != nxzb) sf_error("Dimension error!"); if(!sf_histint(FGx, "n2", &lenx)) sf_error("No n2= in FGx!"); if(!sf_histint(FGz, "n2", &lenz)) sf_error("No n2= in FGz!"); Gx=sf_floatalloc3(nzb, nxb, lenx); Gz=sf_floatalloc3(nzb, nxb, lenz); sxx=sf_intalloc(lenx); sxz=sf_intalloc(lenx); szx=sf_intalloc(lenz); szz=sf_intalloc(lenz); tmpsxx=sf_floatalloc(lenx); tmpsxz=sf_floatalloc(lenx); tmpszx=sf_floatalloc(lenz); tmpszz=sf_floatalloc(lenz); sf_floatread(Gx[0][0], nxzb*lenx, FGx); sf_floatread(Gz[0][0], nxzb*lenz, FGz); sf_floatread(tmpsxx, lenx, Fsxx); sf_floatread(tmpsxz, lenx, Fsxz); sf_floatread(tmpszx, lenz, Fszx); sf_floatread(tmpszz, lenz, Fszz); for (ix=0; ix<lenx; ix++){ sxx[ix]=(int)tmpsxx[ix]; sxz[ix]=(int)tmpsxz[ix]; } for (iz=0; iz<lenz; iz++){ szx[iz]=(int)tmpszx[iz]; szz[iz]=(int)tmpszz[iz]; } fvel=sf_floatalloc2(nzb, nxb); fden=sf_floatalloc2(nzb, nxb); fc11=sf_floatalloc2(nzb, nxb); bvel=sf_floatalloc2(nzb, nxb); bden=sf_floatalloc2(nzb, nxb); bc11=sf_floatalloc2(nzb, nxb); sf_floatread(fvel[0], nxzb, Ffvel); sf_floatread(fden[0], nxzb, Ffden); sf_floatread(bvel[0], nxzb, Fbvel); sf_floatread(bden[0], nxzb, Fbden); for (ix=0; ix<nxb; ix++){ for (iz=0; iz<nzb; iz++){ fc11[ix][iz]=fden[ix][iz]*fvel[ix][iz]*fvel[ix][iz]; bc11[ix][iz]=bden[ix][iz]*bvel[ix][iz]*bvel[ix][iz]; } } if(wantrecord){ /* check record data */ sf_histint(Frcd, "n1", &tmpint); if(tmpint != nt) sf_error("Not matched dimensions!"); sf_histint(Frcd, "n2", &tmpint); if(tmpint != ng) sf_error("Not matched dimensions!"); sf_histint(Frcd, "n3", &tmpint); if(tmpint != shtnmb) sf_error("Not matched dimensions!"); } if(rank==0){ record=sf_floatalloc3(nt, ng, shtnmb0); if(wantrecord){ sf_floatread(record[0][0], nt*ng*shtnmb, Frcd); for(is=shtnmb; is<shtnmb0; is++) for(ix=0; ix<ng; ix++) for(it=0; it<nt; it++) record[is][ix][it]=0.0; } } img1=sf_floatalloc3(nz, nx, ntau); mig1=sf_floatalloc3(nz, nx, ntau); img2=sf_floatalloc2(nz, nx); mig2=sf_floatalloc2(nz, nx); zero3(img1, nz, nx, ntau); zero2(img2, nz, nx); sf_setn(az, nz); sf_setn(ax, ng); if(!wantrecord){ sf_oaxa(Frcd, at, 1); sf_oaxa(Frcd, ax, 2); sf_putint(Frcd, "n3", shtnmb); sf_putint(Frcd, "d3", shtinv); sf_putint(Frcd, "o3", shtbgn); } sf_setn(ax, nx); if(wantwf){ sf_setn(at, wfnt); sf_setd(at, wfdt); sf_oaxa(Ftmpfwf, az, 1); sf_oaxa(Ftmpfwf, ax, 2); sf_oaxa(Ftmpfwf, at, 3); sf_oaxa(Ftmpbwf, az, 1); sf_oaxa(Ftmpbwf, ax, 2); sf_oaxa(Ftmpbwf, at, 3); } sf_oaxa(Fimg1, az, 1); sf_oaxa(Fimg1, ax, 2); sf_oaxa(Fimg1, atau, 3); sf_oaxa(Fimg2, az, 1); sf_oaxa(Fimg2, ax, 2); fwf=sf_floatalloc3(nz, nx, wfnt); localrec=sf_floatalloc2(nt, ng); if(verb){ sf_warning("=================================="); sf_warning("nx=%d nz=%d nt=%d", nx, nz, nt); sf_warning("wfnt=%d wfdt=%f wfinv=%d dt=%f", wfnt, wfdt, wfinv, dt); sf_warning("nxb=%d nzb=%d pmlsize=%d nfd=%d", nxb, nzb, pmlsize, nfd); sf_warning("ntau=%d dtau=%f tau0=%f", ntau, dtau, tau0); sf_warning("shtnmb=%d shtbgn=%d shtinv=%d", shtnmb, shtbgn, shtinv); sf_warning("lenx=%d lenz=%d spz=%d gp=%d", lenx, lenz, spz, gp); sf_warning("=================================="); } init(); for(iturn=0; iturn*numprocs<shtnmb; iturn++){ is=iturn*numprocs+rank; if(is<shtnmb){ sf_warning("ishot/nshot: %d/%d", is+1, shtnmb); spx=is*shtinv+shtbgn; sglfdfor2(fwf, localrec, verb); } if(wantrecord){ recvbuf=localrec[0]; if(rank==0) sendbuf=record[iturn*numprocs][0]; else sendbuf=NULL; MPI_Scatter(sendbuf, ng*nt, MPI_FLOAT, recvbuf, ng*nt, MPI_FLOAT, 0, Comm); }else{ sendbuf=localrec[0]; if(rank==0) recvbuf=record[iturn*numprocs][0]; else recvbuf=NULL; MPI_Gather(sendbuf, ng*nt, MPI_FLOAT, recvbuf, ng*nt, MPI_FLOAT, 0, Comm); } if(wantwf && rank==0 && iturn==snapturn-1) wantwf=true; else wantwf=false; if(wantwf) sf_floatwrite(fwf[0][0], wfnt*nx*nz, Ftmpfwf); if(!onlyrecord && is<shtnmb){ sglfdback2(mig1, mig2, fwf, localrec, verb, wantwf, Ftmpbwf); for(itau=0; itau<ntau; itau++){ for(ix=0; ix<nx; ix++){ for(iz=0; iz<nz; iz++){ img1[itau][ix][iz]+=mig1[itau][ix][iz]; } } } for(ix=0; ix<nx; ix++){ for(iz=0; iz<nz; iz++){ img2[ix][iz]+=mig2[ix][iz]; } } } MPI_Barrier(Comm); } //end of iturn if(!onlyrecord){ if(rank==0){ sendbuf=(float *)MPI_IN_PLACE; recvbuf=img1[0][0]; }else{ sendbuf=img1[0][0]; recvbuf=NULL; } MPI_Reduce(sendbuf, recvbuf, ntau*nx*nz, MPI_FLOAT, MPI_SUM, 0, Comm); if(rank==0){ sendbuf=MPI_IN_PLACE; recvbuf=img2[0]; }else{ sendbuf=img2[0]; recvbuf=NULL; } MPI_Reduce(sendbuf, recvbuf, nx*nz, MPI_FLOAT, MPI_SUM, 0, Comm); } if(rank==0){ if(!wantrecord){ sf_floatwrite(record[0][0], shtnmb*ng*nt, Frcd); } sf_floatwrite(img1[0][0], ntau*nx*nz, Fimg1); sf_floatwrite(img2[0], nx*nz, Fimg2); } tend=clock(); duration=(double)(tend-tstart)/CLOCKS_PER_SEC; sf_warning(">>The CPU time of sfmpilfdrtm2 is: %f seconds<<", duration); MPI_Finalize(); exit(0); }
int sglfdback2(float ***mig1, float **mig2, float ***fwf, float **localrec, bool verb, bool wantwf, sf_file Ftmpbwf) { float **txxn1, **txxn0, **vxn1, **vxn0, **vzn1, **vzn0; float **sill, **ccr, ***bwf; int wfit, htau; float tau; sill=sf_floatalloc2(nz, nx); ccr=sf_floatalloc2(nz, nx); bwf=sf_floatalloc3(nz, nx, wfnt); zero2(sill, nz, nx); zero2(ccr, nz, nx); zero3(mig1, nz, nx, ntau); txxn1=sf_floatalloc2(nzb, nxb); txxn0=sf_floatalloc2(nzb, nxb); vxn1=sf_floatalloc2(nzb, nxb); vxn0=sf_floatalloc2(nzb, nxb); vzn1=sf_floatalloc2(nzb, nxb); vzn0=sf_floatalloc2(nzb, nxb); zero2(txxn1, nzb, nxb); zero2(txxn0, nzb, nxb); zero2(vxn1, nzb, nxb); zero2(vxn0, nzb, nxb); zero2(vzn1, nzb, nxb); zero2(vzn0, nzb, nxb); zero2(txxn1x, nzb, nxb); zero2(txxn1z, nzb, nxb); zero2(txxn0x, nzb, nxb); zero2(txxn0z, nzb, nxb); wfit=wfnt-1; for(it=nt-1; it>=0; it--){ if(verb) sf_warning("Backward it=%d/%d;", it+1, nt); #ifdef _OPENMP #pragma omp parallel for private(ix,iz) #endif for(ix=nfd+pmlsize; ix<nfd+pmlsize+nx; ix++){ for(iz=nfd+pmlsize; iz<nfd+pmlsize+nz; iz++){ txxn0[ix][iz]=txxn1[ix][iz]+dt*bc11[ix][iz]*(ldx(vxn1, ix-1, iz) +ldz(vzn1, ix, iz-1)); } } pml_txxb(txxn0, vxn1, vzn1); #ifdef _OPENMP #pragma omp parallel for private(ix) #endif for(ix=0; ix<ng; ix++){ txxn0[ix*ginv+pmlsize+nfd][pmlsize+nfd+gp]+=localrec[ix][it]; } #ifdef _OPENMP #pragma omp parallel for private(ix,iz) #endif for(ix=nfd+pmlsize; ix<nfd+pmlsize+nx; ix++){ for(iz=nfd+pmlsize; iz<nfd+pmlsize+nz; iz++){ vxn0[ix][iz]=vxn1[ix][iz]+dt/bdenx[ix][iz]*ldx(txxn0, ix, iz); vzn0[ix][iz]=vzn1[ix][iz]+dt/bdenz[ix][iz]*ldz(txxn0, ix, iz); } } pml_vxzb(vxn1, vzn1, vxn0, vzn0, txxn0); transp=txxn1; txxn1=txxn0; txxn0=transp; transp=vxn1; vxn1=vxn0; vxn0=transp; transp=vzn1; vzn1=vzn0; vzn0=transp; if(it%wfinv==0){ for(ix=0; ix<nx; ix++) for(iz=0; iz<nz; iz++){ bwf[wfit][ix][iz]=txxn0[ix+pmlsize+nfd][iz+pmlsize+nfd]; ccr[ix][iz]+=fwf[wfit][ix][iz]*bwf[wfit][ix][iz]; sill[ix][iz]+=fwf[wfit][ix][iz]*fwf[wfit][ix][iz]; } wfit--; } } //end of it if(verb) sf_warning("."); for(itau=0; itau<ntau; itau++){ tau=itau*dtau+tau0; htau=tau/wfdt; for(it=abs(htau); it<wfnt-abs(htau); it++){ for(ix=0; ix<nx; ix++){ for(iz=0; iz<nz; iz++){ mig1[itau][ix][iz]+=fwf[it+htau][ix][iz]*bwf[it-htau][ix][iz]; } } }//end of it } // end of itau for(ix=0; ix<nx; ix++){ for(iz=0; iz<nz; iz++){ mig2[ix][iz]=ccr[ix][iz]/(sill[ix][iz]+SF_EPS); } } if(wantwf) sf_floatwrite(bwf[0][0], wfnt*nx*nz, Ftmpbwf); return 0; }
void TestHeatEquationWithElementDependentSourceTerm() { // The PDE is set to give elements with index = 0 a source of zero // and a source of 1 otherwise. std::vector<Node<1>*> one_d_nodes; one_d_nodes.push_back(new Node<1>(0, false, 2.0)); one_d_nodes.push_back(new Node<1>(1, false, 2.5)); Element<1,1> one_d_element(0u, one_d_nodes); ChastePoint<1> zero1(0); std::vector<Node<2>*> two_d_nodes; two_d_nodes.push_back(new Node<2>(0, false, 0.0, 0.0)); two_d_nodes.push_back(new Node<2>(1, false, 1.0, 0.0)); two_d_nodes.push_back(new Node<2>(2, false, 0.0, 1.0)); Element<2,2> two_d_element(0u, two_d_nodes); ChastePoint<2> zero2(0,0); std::vector<Node<3>*> three_d_nodes; three_d_nodes.push_back(new Node<3>(0, false, 0.0, 0.0, 0.0)); three_d_nodes.push_back(new Node<3>(1, false, 1.0, 0.0, 0.0)); three_d_nodes.push_back(new Node<3>(2, false, 0.0, 1.0, 0.0)); three_d_nodes.push_back(new Node<3>(3, false, 0.0, 0.0, 1.0)); Element<3,3> three_d_element(0u, three_d_nodes); ChastePoint<3> zero3(0,0,0); double u = 2.0; HeatEquationWithElementDependentSourceTerm<1> pde1; HeatEquationWithElementDependentSourceTerm<2> pde2; HeatEquationWithElementDependentSourceTerm<3> pde3; TS_ASSERT_DELTA(pde1.ComputeSourceTerm(zero1, u, &one_d_element), 0.0, 1e-12); one_d_element.ResetIndex(1u); TS_ASSERT_DELTA(pde1.ComputeSourceTerm(zero1, u, &one_d_element), 1.0, 1e-12); TS_ASSERT_DELTA(pde2.ComputeSourceTerm(zero2, u, &two_d_element), 0.0, 1e-12); two_d_element.ResetIndex(1u); TS_ASSERT_DELTA(pde2.ComputeSourceTerm(zero2, u, &two_d_element), 1.0, 1e-12); TS_ASSERT_DELTA(pde3.ComputeSourceTerm(zero3, u, &three_d_element), 0.0, 1e-12); three_d_element.ResetIndex(1u); TS_ASSERT_DELTA(pde3.ComputeSourceTerm(zero3, u, &three_d_element), 1.0, 1e-12); TS_ASSERT_DELTA(pde1.ComputeDuDtCoefficientFunction(zero1), 1.0, 1e-12); TS_ASSERT_DELTA(pde2.ComputeDuDtCoefficientFunction(zero2), 1.0, 1e-12); TS_ASSERT_DELTA(pde3.ComputeDuDtCoefficientFunction(zero3), 1.0, 1e-12); // Diffusion matrices should be equal to identity c_matrix<double, 1, 1> diff1 = pde1.ComputeDiffusionTerm(zero1); c_matrix<double, 2, 2> diff2 = pde2.ComputeDiffusionTerm(zero2); c_matrix<double, 3, 3> diff3 = pde3.ComputeDiffusionTerm(zero3); TS_ASSERT_DELTA(diff1(0,0), 1, 1e-12); TS_ASSERT_DELTA(diff2(0,0), 1, 1e-12); TS_ASSERT_DELTA(diff2(1,1), 1, 1e-12); TS_ASSERT_DELTA(diff2(0,1), 0, 1e-12); TS_ASSERT_DELTA(diff3(0,0), 1, 1e-12); TS_ASSERT_DELTA(diff3(1,1), 1, 1e-12); TS_ASSERT_DELTA(diff3(2,2), 1, 1e-12); TS_ASSERT_DELTA(diff3(0,1), 0, 1e-12); TS_ASSERT_DELTA(diff3(0,2), 0, 1e-12); TS_ASSERT_DELTA(diff3(1,2), 0, 1e-12); delete one_d_nodes[0]; delete one_d_nodes[1]; delete two_d_nodes[0]; delete two_d_nodes[1]; delete two_d_nodes[2]; delete three_d_nodes[0]; delete three_d_nodes[1]; delete three_d_nodes[2]; delete three_d_nodes[3]; }