static void mg3P(double **u, double *v, double **r, double a[4],
		 double c[4], int n1, int n2, int n3, int k) {

/*--------------------------------------------------------------------
c-------------------------------------------------------------------*/

/*--------------------------------------------------------------------
c     multigrid V-cycle routine
c-------------------------------------------------------------------*/

    int j;

/*--------------------------------------------------------------------
c     down cycle.
c     restrict the residual from the find grid to the coarse
c-------------------------------------------------------------------*/

    for (k = lt; k >= lb+1; k--) {
	j = k-1;
	rprj3(r[k], m1[k], m2[k], m3[k],
	      r[j], m1[j], m2[j], m3[j], k);
    }

    k = lb;
/*--------------------------------------------------------------------
c     compute an approximate solution on the coarsest grid
c-------------------------------------------------------------------*/
    zero3(u[k], m1[k], m2[k], m3[k]);
    psinv(r[k], u[k], m1[k], m2[k], m3[k], c, k);

    for (k = lb+1; k <= lt-1; k++) {
	j = k-1;
/*--------------------------------------------------------------------
c        prolongate from level k-1  to k
c-------------------------------------------------------------------*/
	zero3(u[k], m1[k], m2[k], m3[k]);
	interp(u[j], m1[j], m2[j], m3[j],
	       u[k], m1[k], m2[k], m3[k], k);
/*--------------------------------------------------------------------
c        compute residual for level k
c-------------------------------------------------------------------*/
	resid(u[k], r[k], r[k], m1[k], m2[k], m3[k], a, k);
/*--------------------------------------------------------------------
c        apply smoother
c-------------------------------------------------------------------*/
	psinv(r[k], u[k], m1[k], m2[k], m3[k], c, k);
    }

    j = lt - 1;
    k = lt;
    interp(u[j], m1[j], m2[j], m3[j], u[lt], n1, n2, n3, k);
    resid(u[lt], v, r[lt], n1, n2, n3, a, k);
    psinv(r[lt], u[lt], n1, n2, n3, c, k);
}
    void TestNonlinearEquationPde()
    {
        ChastePoint<1> zero1(0);
        ChastePoint<2> zero2(0,0);
        ChastePoint<3> zero3(0,0,0);
        double u = 2.0;

        NonlinearEquationPde<1> heat_equation1;
        NonlinearEquationPde<2> heat_equation2;
        NonlinearEquationPde<3> heat_equation3;

        TS_ASSERT_DELTA(heat_equation1.ComputeNonlinearSourceTerm(zero1,u),0.0,1e-12);
        TS_ASSERT_DELTA(heat_equation2.ComputeNonlinearSourceTerm(zero2,u),0.0,1e-12);
        TS_ASSERT_DELTA(heat_equation3.ComputeNonlinearSourceTerm(zero3,u),0.0,1e-12);

        // Diffusion matrices should be equal to identity * u;
        c_matrix<double, 1, 1> diff1 = heat_equation1.ComputeDiffusionTerm(zero1,u);
        c_matrix<double, 2, 2> diff2 = heat_equation2.ComputeDiffusionTerm(zero2,u);
        c_matrix<double, 3, 3> diff3 = heat_equation3.ComputeDiffusionTerm(zero3,u);

        TS_ASSERT_DELTA(diff1(0,0),u,1e-12);

        TS_ASSERT_DELTA(diff2(0,0),u,1e-12);
        TS_ASSERT_DELTA(diff2(1,1),u,1e-12);
        TS_ASSERT_DELTA(diff2(0,1),0,1e-12);

        TS_ASSERT_DELTA(diff3(0,0),u,1e-12);
        TS_ASSERT_DELTA(diff3(1,1),u,1e-12);
        TS_ASSERT_DELTA(diff3(2,2),u,1e-12);
        TS_ASSERT_DELTA(diff3(0,1),0,1e-12);
        TS_ASSERT_DELTA(diff3(0,2),0,1e-12);
        TS_ASSERT_DELTA(diff3(1,2),0,1e-12);
    }
    void TestHeatEquation()
    {
        ChastePoint<1> zero1(0);
        ChastePoint<2> zero2(0,0);
        ChastePoint<3> zero3(0,0,0);
        double u = 2.0;

        HeatEquation<1> pde1;
        HeatEquation<2> pde2;
        HeatEquation<3> pde3;

        TS_ASSERT_DELTA(pde1.ComputeSourceTerm(zero1,u), 0.0, 1e-12);
        TS_ASSERT_DELTA(pde2.ComputeSourceTerm(zero2,u), 0.0, 1e-12);
        TS_ASSERT_DELTA(pde3.ComputeSourceTerm(zero3,u), 0.0, 1e-12);

        TS_ASSERT_DELTA(pde1.ComputeDuDtCoefficientFunction(zero1), 1.0, 1e-12);
        TS_ASSERT_DELTA(pde2.ComputeDuDtCoefficientFunction(zero2), 1.0, 1e-12);
        TS_ASSERT_DELTA(pde3.ComputeDuDtCoefficientFunction(zero3), 1.0, 1e-12);

        // Diffusion matrices should be equal to identity
        c_matrix<double, 1, 1> diff1 = pde1.ComputeDiffusionTerm(zero1);
        c_matrix<double, 2, 2> diff2 = pde2.ComputeDiffusionTerm(zero2);
        c_matrix<double, 3, 3> diff3 = pde3.ComputeDiffusionTerm(zero3);

        TS_ASSERT_DELTA(diff1(0,0), 1, 1e-12);

        TS_ASSERT_DELTA(diff2(0,0), 1, 1e-12);
        TS_ASSERT_DELTA(diff2(1,1), 1, 1e-12);
        TS_ASSERT_DELTA(diff2(0,1), 0, 1e-12);

        TS_ASSERT_DELTA(diff3(0,0), 1, 1e-12);
        TS_ASSERT_DELTA(diff3(1,1), 1, 1e-12);
        TS_ASSERT_DELTA(diff3(2,2), 1, 1e-12);
        TS_ASSERT_DELTA(diff3(0,1), 0, 1e-12);
        TS_ASSERT_DELTA(diff3(0,2), 0, 1e-12);
        TS_ASSERT_DELTA(diff3(1,2), 0, 1e-12);

        Node<1> node(0, zero1);
        TS_ASSERT_DELTA(pde1.ComputeSourceTermAtNode(node,u), 0.0, 1e-12);
    }
int main(int argc, char *argv[]) {

/*-------------------------------------------------------------------------
c k is the current level. It is passed down through subroutine args
c and is NOT global. it is the current iteration
c------------------------------------------------------------------------*/

    int k, it;
    double t, tinit, mflops;
    int nthreads = 1;

/*-------------------------------------------------------------------------
c These arrays are in common because they are quite large
c and probably shouldn''t be allocated on the stack. They
c are always passed as subroutine args. 
c------------------------------------------------------------------------*/
    
    double **u, *v, **r;
    double a[4], c[4];

    double rnm2, rnmu;
    double epsilon = 1.0e-8;
    int n1, n2, n3, nit;
    double verify_value;
    boolean verified;

    int i, j, l;
    FILE *fp;

    timer_clear(T_BENCH);
    timer_clear(T_INIT);

    timer_start(T_INIT);

/*----------------------------------------------------------------------
c Read in and broadcast input data
c---------------------------------------------------------------------*/

    printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version"
	   " - MG Benchmark\n\n");

    fp = fopen("mg.input", "r");
    if (fp != NULL) {
	printf(" Reading from input file mg.input\n");
	fscanf(fp, "%d", &lt);
	while(fgetc(fp) != '\n');
	fscanf(fp, "%d%d%d", &nx[lt], &ny[lt], &nz[lt]);
	while(fgetc(fp) != '\n');
	fscanf(fp, "%d", &nit);
	while(fgetc(fp) != '\n');
	for (i = 0; i <= 7; i++) {
	    fscanf(fp, "%d", &debug_vec[i]);
	}
	fclose(fp);
    } else {
	printf(" No input file. Using compiled defaults\n");
    
	lt = LT_DEFAULT;
	nit = NIT_DEFAULT;
	nx[lt] = NX_DEFAULT;
	ny[lt] = NY_DEFAULT;
	nz[lt] = NZ_DEFAULT;

	for (i = 0; i <= 7; i++) {
	    debug_vec[i] = DEBUG_DEFAULT;
	}
    }

    if ( (nx[lt] != ny[lt]) || (nx[lt] != nz[lt]) ) {
	Class = 'U';
    } else if( nx[lt] == 32 && nit == 4 ) {
	Class = 'S';
    } else if( nx[lt] == 64 && nit == 40 ) {
	Class = 'W';
    } else if( nx[lt] == 256 && nit == 20 ) {
	Class = 'B';
    } else if( nx[lt] == 512 && nit == 20 ) {
	Class = 'C';
    } else if( nx[lt] == 256 && nit == 4 ) {
	Class = 'A';
    } else {
	Class = 'U';
    }

/*--------------------------------------------------------------------
c  Use these for debug info:
c---------------------------------------------------------------------
c     debug_vec(0) = 1 !=> report all norms
c     debug_vec(1) = 1 !=> some setup information
c     debug_vec(1) = 2 !=> more setup information
c     debug_vec(2) = k => at level k or below, show result of resid
c     debug_vec(3) = k => at level k or below, show result of psinv
c     debug_vec(4) = k => at level k or below, show result of rprj
c     debug_vec(5) = k => at level k or below, show result of interp
c     debug_vec(6) = 1 => (unused)
c     debug_vec(7) = 1 => (unused)
c-------------------------------------------------------------------*/

    a[0] = -8.0/3.0;
    a[1] =  0.0;
    a[2] =  1.0/6.0;
    a[3] =  1.0/12.0;

    if (Class == 'A' || Class == 'S' || Class =='W') {
/*--------------------------------------------------------------------
c     Coefficients for the S(a) smoother
c-------------------------------------------------------------------*/
	c[0] =  -3.0/8.0;
	c[1] =  1.0/32.0;
	c[2] =  -1.0/64.0;
	c[3] =   0.0;
    } else {
/*--------------------------------------------------------------------
c     Coefficients for the S(b) smoother
c-------------------------------------------------------------------*/
	c[0] =  -3.0/17.0;
	c[1] =  1.0/33.0;
	c[2] =  -1.0/61.0;
	c[3] =   0.0;
    }
    
    lb = 1;

    setup(&n1,&n2,&n3,lt);
      
    /* Allocate the data arrays
     * 3d arrays are flattened and allocated as a contiguous block
     * 4d arrays are allocated as separate 3d blocks
     */
    u = (double **)malloc((lt+1)*sizeof(double *));
    for (l=lt; l >=1; l--)
      u[l] = (double *)malloc(m3[l]*m2[l]*m1[l]*sizeof(double));

    v = (double *)malloc(m3[lt]*m2[lt]*m1[lt]*sizeof(double));

    r = (double **)malloc((lt+1)*sizeof(double *));
    for (l=lt; l >=1; l--)
      r[l] = (double *)malloc(m3[l]*m2[l]*m1[l]*sizeof(double));

    // Array v can be treated using a standard OpenACC data region
#pragma acc data create(v[0:m3[lt]*m2[lt]*m1[lt]]) copyin(a[0:4],c[0:4])
    {

#ifdef _OPENACC
      //****************************************************************
      /* Now manually deep-create arrays u,r on the GPU using the Cray extended
       * runtime API, instead of using a data region
       */
      double **acc_u = (double **)cray_acc_create(u,(lt+1)*sizeof(double *));
      for (l=lt; l >=1; l--) {
	double *acc_ul = (double *)cray_acc_create(u[l],m3[l]*m2[l]*m1[l]*sizeof(double));
	SET_ACC_PTR(acc_u[l], acc_ul);
      }
      double **acc_r = (double **)cray_acc_create(r,(lt+1)*sizeof(double *));
      for (l=lt; l >=1; l--) {
	double *acc_rl = (double *)cray_acc_create(r[l],m3[l]*m2[l]*m1[l]*sizeof(double));
	SET_ACC_PTR(acc_r[l], acc_rl);
      }
    //****************************************************************
#endif /* _OPENACC */

#pragma omp parallel
{
    zero3(u[lt],n1,n2,n3);
}
    zran3(v,n1,n2,n3,nx[lt],ny[lt],lt);

#pragma omp parallel
{
    norm2u3(v,n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);

#pragma omp single
{
/*    printf("\n norms of random v are\n");
    printf(" %4d%19.12e%19.12e\n", 0, rnm2, rnmu);
    printf(" about to evaluate resid, k= %d\n", lt);*/

    printf(" Size: %3dx%3dx%3d (class %1c)\n",
	   nx[lt], ny[lt], nz[lt], Class);
    printf(" Iterations: %3d\n", nit);
}

    resid(u[lt],v,r[lt],n1,n2,n3,a,lt);
    norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);

/*c---------------------------------------------------------------------
c     One iteration for startup
c---------------------------------------------------------------------*/
    mg3P(u,v,r,a,c,n1,n2,n3,lt);
    resid(u[lt],v,r[lt],n1,n2,n3,a,lt);

#pragma omp single
    setup(&n1,&n2,&n3,lt);

    zero3(u[lt],n1,n2,n3);
  } /* pragma omp parallel */

    zran3(v,n1,n2,n3,nx[lt],ny[lt],lt);

    timer_stop(T_INIT);
    timer_start(T_BENCH);

#pragma omp parallel firstprivate(nit) private(it)
  {
    resid(u[lt],v,r[lt],n1,n2,n3,a,lt);
    norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);

    for ( it = 1; it <= nit; it++) {
	mg3P(u,v,r,a,c,n1,n2,n3,lt);
	resid(u[lt],v,r[lt],n1,n2,n3,a,lt);
    }
    norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);

#if defined(_OPENMP)    
#pragma omp master
    nthreads = omp_get_num_threads();
#endif    
  } /* pragma omp parallel */

    timer_stop(T_BENCH);
    t = timer_read(T_BENCH);
    tinit = timer_read(T_INIT);

    verified = FALSE;
    verify_value = 0.0;

    printf(" Initialization time: %15.3f seconds\n", tinit);
    printf(" Benchmark completed\n");

    if (Class != 'U') {
	if (Class == 'S') {
            verify_value = 0.530770700573e-04;
	} else if (Class == 'W') {
            verify_value = 0.250391406439e-17;  /* 40 iterations*/
/*				0.183103168997d-044 iterations*/
	} else if (Class == 'A') {
            verify_value = 0.2433365309e-5;
        } else if (Class == 'B') {
            verify_value = 0.180056440132e-5;
        } else if (Class == 'C') {
            verify_value = 0.570674826298e-06;
	}

	if ( fabs( rnm2 - verify_value ) <= epsilon ) {
            verified = TRUE;
	    printf(" VERIFICATION SUCCESSFUL\n");
	    printf(" L2 Norm is %20.12e\n", rnm2);
	    printf(" Error is   %20.12e\n", rnm2 - verify_value);
	} else {
            verified = FALSE;
	    printf(" VERIFICATION FAILED\n");
	    printf(" L2 Norm is             %20.12e\n", rnm2);
	    printf(" The correct L2 Norm is %20.12e\n", verify_value);
	}
    } else {
	verified = FALSE;
	printf(" Problem size unknown\n");
	printf(" NO VERIFICATION PERFORMED\n");
    }

    if ( t != 0.0 ) {
	int nn = nx[lt]*ny[lt]*nz[lt];
	mflops = 58.*nit*nn*1.0e-6 / t;
    } else {
	mflops = 0.0;
    }

    c_print_results("MG", Class, nx[lt], ny[lt], nz[lt], 
		    nit, nthreads, t, mflops, "          floating point", 
		    verified, NPBVERSION, COMPILETIME,
		    CS1, CS2, CS3, CS4, CS5, CS6, CS7);
// I should probably deep-free the manually deep-created accelerator data here
} //acc end data
}
static void zran3(double *z, int n1, int n2, int n3, int nx, int ny, int k) {

/*--------------------------------------------------------------------
c-------------------------------------------------------------------*/

/*--------------------------------------------------------------------
c     zran3  loads +1 at ten randomly chosen points,
c     loads -1 at a different ten random points,
c     and zero elsewhere.
c-------------------------------------------------------------------*/

#define MM	10
#define	A	pow(5.0,13)
#define	X	314159265.e0    
    
    int i0, m0, m1;
    int i1, i2, i3, d1, e1, e2, e3;
    double xx, x0, x1, a1, a2, ai;

    double ten[MM][2], best;
    int i, j1[MM][2], j2[MM][2], j3[MM][2];
    int jg[4][MM][2];

    double rdummy;

    a1 = power( A, nx );
    a2 = power( A, nx*ny );

#if 0
#pragma omp parallel
  {
    zero3(z,n1,n2,n3);
  }
#else
#pragma omp parallel for private(i2, i1)    
  for (i3 = 0;i3 < n3; i3++) {
    for (i2 = 0; i2 < n2; i2++) {
      for (i1 = 0; i1 < n1; i1++) {
	int i123 = i1 + n1*(i2 + n2*i3);
	z[i123] = 0.0;
      }
    }
  }
#endif

    i = is1-1+nx*(is2-1+ny*(is3-1));

    ai = power( A, i );
    d1 = ie1 - is1 + 1;
    e1 = ie1 - is1 + 2;
    e2 = ie2 - is2 + 2;
    e3 = ie3 - is3 + 2;
    x0 = X;
    rdummy = randlc( &x0, ai );
    
    for (i3 = 1; i3 < e3; i3++) {
	x1 = x0;
	for (i2 = 1; i2 < e2; i2++) {
            xx = x1;
            vranlc( d1, &xx, A, &(z[0+n1*(i2 + n2*i3)]));
            rdummy = randlc( &x1, a1 );
	}
	rdummy = randlc( &x0, a2 );
    }

/*--------------------------------------------------------------------
c       call comm3(z,n1,n2,n3)
c       call showall(z,n1,n2,n3)
c-------------------------------------------------------------------*/

/*--------------------------------------------------------------------
c     each processor looks for twenty candidates
c-------------------------------------------------------------------*/
    for (i = 0; i < MM; i++) {
	ten[i][1] = 0.0;
	j1[i][1] = 0;
	j2[i][1] = 0;
	j3[i][1] = 0;
	ten[i][0] = 1.0;
	j1[i][0] = 0;
	j2[i][0] = 0;
	j3[i][0] = 0;
    }
    for (i3 = 1; i3 < n3-1; i3++) {
	for (i2 = 1; i2 < n2-1; i2++) {
            for (i1 = 1; i1 < n1-1; i1++) {
	      int i123 = i1 + n1*(i2 + n2*i3);
		if ( z[i123] > ten[0][1] ) {
		    ten[0][1] = z[i123];
		    j1[0][1] = i1;
		    j2[0][1] = i2;
		    j3[0][1] = i3;
		    bubble( ten, j1, j2, j3, MM, 1 );
		}
		if ( z[i123] < ten[0][0] ) {
		    ten[0][0] = z[i123];
		    j1[0][0] = i1;
		    j2[0][0] = i2;
		    j3[0][0] = i3;
		    bubble( ten, j1, j2, j3, MM, 0 );
		}
	    }
	}
    }

/*--------------------------------------------------------------------
c     Now which of these are globally best?
c-------------------------------------------------------------------*/
    i1 = MM - 1;
    i0 = MM - 1;
    for (i = MM - 1 ; i >= 0; i--) {
      int j123 = j1[i1][1] + n1*(j2[i1][1] + n2*j3[i1][1]);
	best = z[j123];
	if (best == z[j123]) {
            jg[0][i][1] = 0;
            jg[1][i][1] = is1 - 1 + j1[i1][1];
            jg[2][i][1] = is2 - 1 + j2[i1][1];
            jg[3][i][1] = is3 - 1 + j3[i1][1];
            i1 = i1-1;
	} else {
            jg[0][i][1] = 0;
            jg[1][i][1] = 0;
            jg[2][i][1] = 0;
            jg[3][i][1] = 0;
	}
	ten[i][1] = best;
      j123 = j1[i0][0] + n1*(j2[i0][0] + n2*j3[i0][0]);
	best = z[j123];
	if (best == z[j123]) {
            jg[0][i][0] = 0;
            jg[1][i][0] = is1 - 1 + j1[i0][0];
            jg[2][i][0] = is2 - 1 + j2[i0][0];
            jg[3][i][0] = is3 - 1 + j3[i0][0];
            i0 = i0-1;
	} else {
            jg[0][i][0] = 0;
            jg[1][i][0] = 0;
            jg[2][i][0] = 0;
            jg[3][i][0] = 0;
	}
	ten[i][0] = best;
    }
    m1 = i1+1;
    m0 = i0+1;

/*    printf(" negative charges at");
    for (i = 0; i < MM; i++) {
	if (i%5 == 0) printf("\n");
	printf(" (%3d,%3d,%3d)", jg[1][i][0], jg[2][i][0], jg[3][i][0]);
    }
    printf("\n positive charges at");
    for (i = 0; i < MM; i++) {
	if (i%5 == 0) printf("\n");
	printf(" (%3d,%3d,%3d)", jg[1][i][1], jg[2][i][1], jg[3][i][1]);
    }
    printf("\n small random numbers were\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %15.8e", ten[i][0]);
    }
    printf("\n and they were found on processor number\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %4d", jg[0][i][0]);
    }
    printf("\n large random numbers were\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %15.8e", ten[i][1]);
    }
    printf("\n and they were found on processor number\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %4d", jg[0][i][1]);
    }
    printf("\n");*/

#if 0
#pragma omp parallel for private(i2, i1)    
for (i3 = 0; i3 < n3; i3++) {
  for (i2 = 0; i2 < n2; i2++) {
    for (i1 = 0; i1 < n1; i1++) {
      int i123 = i1 + n1*(i2+n2*i3);
      z[i123] = 0.0;
    }
  }
 }
#else
#pragma omp parallel
    {
      zero3(z,n1,n2,n3);
    }
#endif

#pragma acc parallel present(z[0:n3*n2*n1]) copyin(jg)
{
#pragma acc loop
    for (i = MM-1; i >= m0; i--) {
      int j123 = j1[i][0] + n1*(j2[i][0] + n2*j3[i][0]);
	z[j123] = -1.0;
    }
#pragma acc loop
    for (i = MM-1; i >= m1; i--) {
      int j123 = j1[i][1] + n1*(j2[i][1] + n2*j3[i][1]);
	z[j123] = 1.0;
    }
} // end acc parallel                                                         
#pragma omp parallel    
    comm3(z,n1,n2,n3,k);

/*--------------------------------------------------------------------
c          call showall(z,n1,n2,n3)
c-------------------------------------------------------------------*/
}
Beispiel #6
0
int main(int argc, char *argv[])
{
	clock_t tstart, tend;
	double duration;

	int numprocs, rank;
	float *sendbuf, *recvbuf;
	MPI_Comm Comm=MPI_COMM_WORLD;

	bool verb, wantrecord, wantwf, onlyrecord;
	sf_file Ffvel, Ffden, Fbvel, Fbden;
	sf_file Fsrc, Frcd, Fimg1, Fimg2;
	sf_file FGx, FGz, Fsxx, Fsxz, Fszx, Fszz;
	sf_file Ftmpfwf, Ftmpbwf;

	sf_axis at, ax, az, atau;

	int shtbgn, shtinv, shtnmb, shtpad, shtnmb0;
	int snapturn, tmpint;

	float **fvel, **bvel;
	float ***fwf, ***record, **localrec;
	float ***img1, **img2, ***mig1, **mig2;
	float *tmpsxx, *tmpsxz, *tmpszx, *tmpszz;

	sf_init(argc, argv);

	MPI_Init(&argc, &argv);
	MPI_Comm_size(Comm, &numprocs);
	MPI_Comm_rank(Comm, &rank);

	tstart=clock();
	if(rank==0) sf_warning("numprocs=%d", numprocs);

	if(!sf_getbool("verb", &verb)) verb=true;
	if(!sf_getbool("wantrecord", &wantrecord)) wantrecord=false;
	if(!sf_getbool("wantwf", &wantwf)) wantwf=false;
	if(!sf_getbool("onlyrecord", &onlyrecord)) onlyrecord=false;

	Fsrc=sf_input("-input");
	Fimg1=sf_output("-output");
	Fimg2=sf_output("img2");
	Ffvel=sf_input("fvel");
	Ffden=sf_input("fden");
	Fbvel=sf_input("bvel");
	Fbden=sf_input("bden");

	if(wantrecord)
		Frcd=sf_input("record");
	else
		Frcd=sf_output("record");

	if(wantwf){
		Ftmpfwf=sf_output("tmpfwf");
		Ftmpbwf=sf_output("tmpbwf");
	}

	FGx=sf_input("Gx");
	FGz=sf_input("Gz");
	Fsxx=sf_input("sxx");
	Fsxz=sf_input("sxz");
	Fszx=sf_input("szx");
	Fszz=sf_input("szz");
	
	at=sf_iaxa(Fsrc, 1); nt=sf_n(at); dt=sf_d(at);
	if(!sf_getbool("srcdecay", &srcdecay)) srcdecay=true;
	if(!sf_getint("srcrange", &srcrange)) srcrange=3;
	if(!sf_getfloat("srctrunc", &srctrunc)) srctrunc=0.2;
	if(!sf_getfloat("srcalpha", &srcalpha)) srcalpha=0.5;
	wavelet=sf_floatalloc(nt);
	sf_floatread(wavelet, nt, Fsrc);

	if(!sf_getint("pmlsize", &pmlsize)) pmlsize=30;
	if(!sf_getint("nfd", &nfd)) sf_error("Need half of the FD order!");
	if(!sf_getfloat("pmld0", &pmld0)) pmld0=200;

	if(!sf_getint("shtnmb", &shtnmb)) sf_error("Need shot number!");
	if(!sf_getint("shtinv", &shtinv)) sf_error("Need shot interval!");
	if(!sf_getint("shtbgn", &shtbgn)) shtbgn=0;
	shtpad=numprocs-shtnmb%numprocs;
	shtnmb0=shtnmb+shtpad;

	az=sf_iaxa(Ffvel, 1); nzb=sf_n(az);
	ax=sf_iaxa(Ffvel, 2); nxb=sf_n(ax);
	nxzb=nxb*nzb;
	nz=nzb-2*nfd-2*pmlsize;
	nx=nxb-2*nfd-2*pmlsize;

	if(!sf_getint("snapturn", &snapturn)) snapturn=1;
	if(!sf_getint("ginv", &ginv)) ginv=1;
	if(!sf_getint("wfinv", &wfinv)) wfinv=1;
	if(!sf_getint("spz", &spz)) spz=6;
	if(!sf_getint("gp", &gp)) gp=0;
	ng=(nx-1)/ginv+1;
	wfnt=(nt-1)/wfinv+1;
	wfdt=dt*wfinv;

	if(!sf_getint("ntau", &ntau)) ntau=1;
	if(!sf_getfloat("dtau", &dtau)) dtau=wfdt;
	if(!sf_getfloat("tau0", &tau0)) tau0=0;
	atau=sf_iaxa(Fsrc, 1);
	sf_setn(atau, ntau);
	sf_setd(atau, dtau);
	sf_seto(atau, tau0);

	if(!sf_histint(FGx, "n1", &nxz)) sf_error("No n1= in FGx!");
	if(nxz != nxzb) sf_error("Dimension error!");
	if(!sf_histint(FGx, "n2", &lenx)) sf_error("No n2= in FGx!");
	if(!sf_histint(FGz, "n2", &lenz)) sf_error("No n2= in FGz!");
	Gx=sf_floatalloc3(nzb, nxb, lenx);
	Gz=sf_floatalloc3(nzb, nxb, lenz);
	sxx=sf_intalloc(lenx);
	sxz=sf_intalloc(lenx);
	szx=sf_intalloc(lenz);
	szz=sf_intalloc(lenz);
	tmpsxx=sf_floatalloc(lenx);
	tmpsxz=sf_floatalloc(lenx);
	tmpszx=sf_floatalloc(lenz);
	tmpszz=sf_floatalloc(lenz);
	sf_floatread(Gx[0][0], nxzb*lenx, FGx);
	sf_floatread(Gz[0][0], nxzb*lenz, FGz);
	sf_floatread(tmpsxx, lenx, Fsxx);
	sf_floatread(tmpsxz, lenx, Fsxz);
	sf_floatread(tmpszx, lenz, Fszx);
	sf_floatread(tmpszz, lenz, Fszz);
	for (ix=0; ix<lenx; ix++){
		sxx[ix]=(int)tmpsxx[ix];
		sxz[ix]=(int)tmpsxz[ix];
	}
	for (iz=0; iz<lenz; iz++){
		szx[iz]=(int)tmpszx[iz];
		szz[iz]=(int)tmpszz[iz];
	}

	fvel=sf_floatalloc2(nzb, nxb);
	fden=sf_floatalloc2(nzb, nxb);
	fc11=sf_floatalloc2(nzb, nxb);
	bvel=sf_floatalloc2(nzb, nxb);
	bden=sf_floatalloc2(nzb, nxb);
	bc11=sf_floatalloc2(nzb, nxb);
	sf_floatread(fvel[0], nxzb, Ffvel);
	sf_floatread(fden[0], nxzb, Ffden);
	sf_floatread(bvel[0], nxzb, Fbvel);
	sf_floatread(bden[0], nxzb, Fbden);
	for (ix=0; ix<nxb; ix++){
		for (iz=0; iz<nzb; iz++){
			fc11[ix][iz]=fden[ix][iz]*fvel[ix][iz]*fvel[ix][iz];
			bc11[ix][iz]=bden[ix][iz]*bvel[ix][iz]*bvel[ix][iz];
		}
	}

	if(wantrecord){
		/* check record data */
		sf_histint(Frcd, "n1", &tmpint);
		if(tmpint != nt) sf_error("Not matched dimensions!");
		sf_histint(Frcd, "n2", &tmpint);
		if(tmpint != ng) sf_error("Not matched dimensions!");
		sf_histint(Frcd, "n3", &tmpint);
		if(tmpint != shtnmb) sf_error("Not matched dimensions!");
	}

	if(rank==0){
		record=sf_floatalloc3(nt, ng, shtnmb0);
		if(wantrecord){
			sf_floatread(record[0][0], nt*ng*shtnmb, Frcd);
			for(is=shtnmb; is<shtnmb0; is++)
				for(ix=0; ix<ng; ix++)
					for(it=0; it<nt; it++)
						record[is][ix][it]=0.0;
		}
	}

	img1=sf_floatalloc3(nz, nx, ntau);
	mig1=sf_floatalloc3(nz, nx, ntau);
	img2=sf_floatalloc2(nz, nx);
	mig2=sf_floatalloc2(nz, nx);
	zero3(img1, nz, nx, ntau);
	zero2(img2, nz, nx);

	sf_setn(az, nz);
	sf_setn(ax, ng);
	if(!wantrecord){
		sf_oaxa(Frcd, at, 1);
		sf_oaxa(Frcd, ax, 2);
		sf_putint(Frcd, "n3", shtnmb);
		sf_putint(Frcd, "d3", shtinv);
		sf_putint(Frcd, "o3", shtbgn);
	}

	sf_setn(ax, nx);
	if(wantwf){
		sf_setn(at, wfnt);
		sf_setd(at, wfdt);

		sf_oaxa(Ftmpfwf, az, 1);
		sf_oaxa(Ftmpfwf, ax, 2);
		sf_oaxa(Ftmpfwf, at, 3);

		sf_oaxa(Ftmpbwf, az, 1);
		sf_oaxa(Ftmpbwf, ax, 2);
		sf_oaxa(Ftmpbwf, at, 3);
	}

	sf_oaxa(Fimg1, az, 1);
	sf_oaxa(Fimg1, ax, 2);
	sf_oaxa(Fimg1, atau, 3);
	sf_oaxa(Fimg2, az, 1);
	sf_oaxa(Fimg2, ax, 2);

	fwf=sf_floatalloc3(nz, nx, wfnt);
	localrec=sf_floatalloc2(nt, ng);

	if(verb){
		sf_warning("==================================");
		sf_warning("nx=%d nz=%d nt=%d", nx, nz, nt);
		sf_warning("wfnt=%d wfdt=%f wfinv=%d dt=%f", wfnt, wfdt, wfinv, dt);
		sf_warning("nxb=%d nzb=%d pmlsize=%d nfd=%d", nxb, nzb, pmlsize, nfd);
		sf_warning("ntau=%d dtau=%f tau0=%f", ntau, dtau, tau0);
		sf_warning("shtnmb=%d shtbgn=%d shtinv=%d", shtnmb, shtbgn, shtinv);
		sf_warning("lenx=%d lenz=%d spz=%d gp=%d", lenx, lenz, spz, gp);
		sf_warning("==================================");
	}

	init();

	for(iturn=0; iturn*numprocs<shtnmb; iturn++){
		is=iturn*numprocs+rank;
		if(is<shtnmb){
			sf_warning("ishot/nshot: %d/%d", is+1, shtnmb);
			spx=is*shtinv+shtbgn;
			sglfdfor2(fwf, localrec, verb);
		}

		if(wantrecord){
			recvbuf=localrec[0];
			if(rank==0) sendbuf=record[iturn*numprocs][0];
			else sendbuf=NULL;
			MPI_Scatter(sendbuf, ng*nt, MPI_FLOAT, recvbuf, ng*nt, MPI_FLOAT, 0, Comm);
		}else{
			sendbuf=localrec[0];
			if(rank==0) recvbuf=record[iturn*numprocs][0];
			else recvbuf=NULL;
			MPI_Gather(sendbuf, ng*nt, MPI_FLOAT, recvbuf, ng*nt, MPI_FLOAT, 0, Comm);
		}

		if(wantwf && rank==0 && iturn==snapturn-1) wantwf=true;
		else wantwf=false;
		if(wantwf) sf_floatwrite(fwf[0][0], wfnt*nx*nz, Ftmpfwf);

		if(!onlyrecord && is<shtnmb){
			sglfdback2(mig1, mig2, fwf, localrec, verb, wantwf, Ftmpbwf);
			for(itau=0; itau<ntau; itau++){
				for(ix=0; ix<nx; ix++){
					for(iz=0; iz<nz; iz++){
						img1[itau][ix][iz]+=mig1[itau][ix][iz];
					}
				}
			}
			for(ix=0; ix<nx; ix++){
				for(iz=0; iz<nz; iz++){
					img2[ix][iz]+=mig2[ix][iz];
				}
			}
		}
		MPI_Barrier(Comm);
	} //end of iturn

	if(!onlyrecord){
	if(rank==0){
		sendbuf=(float *)MPI_IN_PLACE;
		recvbuf=img1[0][0];
	}else{
		sendbuf=img1[0][0];
		recvbuf=NULL;
	}
	MPI_Reduce(sendbuf, recvbuf, ntau*nx*nz, MPI_FLOAT, MPI_SUM, 0, Comm);

	if(rank==0){
		sendbuf=MPI_IN_PLACE;
		recvbuf=img2[0];
	}else{
		sendbuf=img2[0];
		recvbuf=NULL;
	}
	MPI_Reduce(sendbuf, recvbuf, nx*nz, MPI_FLOAT, MPI_SUM, 0, Comm);
	}

	if(rank==0){
		if(!wantrecord){
			sf_floatwrite(record[0][0], shtnmb*ng*nt, Frcd);
		}
		sf_floatwrite(img1[0][0], ntau*nx*nz, Fimg1);
		sf_floatwrite(img2[0], nx*nz, Fimg2);
	}

	tend=clock();
	duration=(double)(tend-tstart)/CLOCKS_PER_SEC;
	sf_warning(">>The CPU time of sfmpilfdrtm2 is: %f seconds<<", duration);
	MPI_Finalize();
	exit(0);
}
Beispiel #7
0
int sglfdback2(float ***mig1, float **mig2, float ***fwf, float **localrec, bool verb, bool wantwf, sf_file Ftmpbwf)
{
	float **txxn1, **txxn0, **vxn1, **vxn0, **vzn1, **vzn0;
	float **sill, **ccr, ***bwf;
	int wfit, htau;
	float tau;

	sill=sf_floatalloc2(nz, nx);
	ccr=sf_floatalloc2(nz, nx);
	bwf=sf_floatalloc3(nz, nx, wfnt);
	zero2(sill, nz, nx);
	zero2(ccr, nz, nx);
	zero3(mig1, nz, nx, ntau);

	txxn1=sf_floatalloc2(nzb, nxb);
	txxn0=sf_floatalloc2(nzb, nxb);
	vxn1=sf_floatalloc2(nzb, nxb);
	vxn0=sf_floatalloc2(nzb, nxb);
	vzn1=sf_floatalloc2(nzb, nxb);
	vzn0=sf_floatalloc2(nzb, nxb);

	zero2(txxn1, nzb, nxb);
	zero2(txxn0, nzb, nxb);
	zero2(vxn1, nzb, nxb);
	zero2(vxn0, nzb, nxb);
	zero2(vzn1, nzb, nxb);
	zero2(vzn0, nzb, nxb);

	zero2(txxn1x, nzb, nxb);
	zero2(txxn1z, nzb, nxb);
	zero2(txxn0x, nzb, nxb);
	zero2(txxn0z, nzb, nxb);
	
	wfit=wfnt-1;
	for(it=nt-1; it>=0; it--){
		if(verb) sf_warning("Backward it=%d/%d;", it+1, nt);
#ifdef _OPENMP
#pragma omp parallel for private(ix,iz)
#endif
		for(ix=nfd+pmlsize; ix<nfd+pmlsize+nx; ix++){
			for(iz=nfd+pmlsize; iz<nfd+pmlsize+nz; iz++){
				txxn0[ix][iz]=txxn1[ix][iz]+dt*bc11[ix][iz]*(ldx(vxn1, ix-1, iz) +ldz(vzn1, ix, iz-1));
			}
		}

		pml_txxb(txxn0, vxn1, vzn1);
#ifdef _OPENMP
#pragma omp parallel for private(ix)
#endif
		for(ix=0; ix<ng; ix++){
			txxn0[ix*ginv+pmlsize+nfd][pmlsize+nfd+gp]+=localrec[ix][it];
		}
#ifdef _OPENMP
#pragma omp parallel for private(ix,iz)
#endif
		for(ix=nfd+pmlsize; ix<nfd+pmlsize+nx; ix++){
			for(iz=nfd+pmlsize; iz<nfd+pmlsize+nz; iz++){
				vxn0[ix][iz]=vxn1[ix][iz]+dt/bdenx[ix][iz]*ldx(txxn0, ix, iz);
				vzn0[ix][iz]=vzn1[ix][iz]+dt/bdenz[ix][iz]*ldz(txxn0, ix, iz);
			}
		}

		pml_vxzb(vxn1, vzn1, vxn0, vzn0, txxn0);

		transp=txxn1; txxn1=txxn0; txxn0=transp;
		transp=vxn1; vxn1=vxn0; vxn0=transp;
		transp=vzn1; vzn1=vzn0; vzn0=transp;

		if(it%wfinv==0){
			for(ix=0; ix<nx; ix++)
				for(iz=0; iz<nz; iz++){
					bwf[wfit][ix][iz]=txxn0[ix+pmlsize+nfd][iz+pmlsize+nfd];
					ccr[ix][iz]+=fwf[wfit][ix][iz]*bwf[wfit][ix][iz];
					sill[ix][iz]+=fwf[wfit][ix][iz]*fwf[wfit][ix][iz];
				}
			wfit--;
		}
	} //end of it
	if(verb) sf_warning(".");

	for(itau=0; itau<ntau; itau++){
		tau=itau*dtau+tau0;
		htau=tau/wfdt;
		for(it=abs(htau); it<wfnt-abs(htau); it++){
			for(ix=0; ix<nx; ix++){
				for(iz=0; iz<nz; iz++){
					mig1[itau][ix][iz]+=fwf[it+htau][ix][iz]*bwf[it-htau][ix][iz];
				}
			}
		}//end of it
	} // end of itau

	for(ix=0; ix<nx; ix++){
		for(iz=0; iz<nz; iz++){
			mig2[ix][iz]=ccr[ix][iz]/(sill[ix][iz]+SF_EPS);
		}
	}

	if(wantwf) sf_floatwrite(bwf[0][0], wfnt*nx*nz, Ftmpbwf);
	return 0;
}
    void TestHeatEquationWithElementDependentSourceTerm()
    {
        // The PDE is set to give elements with index = 0 a source of zero
        // and a source of 1 otherwise.

        std::vector<Node<1>*> one_d_nodes;
        one_d_nodes.push_back(new Node<1>(0, false, 2.0));
        one_d_nodes.push_back(new Node<1>(1, false, 2.5));
        Element<1,1> one_d_element(0u, one_d_nodes);
        ChastePoint<1> zero1(0);

        std::vector<Node<2>*> two_d_nodes;
        two_d_nodes.push_back(new Node<2>(0, false, 0.0, 0.0));
        two_d_nodes.push_back(new Node<2>(1, false, 1.0, 0.0));
        two_d_nodes.push_back(new Node<2>(2, false, 0.0, 1.0));
        Element<2,2> two_d_element(0u, two_d_nodes);
        ChastePoint<2> zero2(0,0);

        std::vector<Node<3>*> three_d_nodes;
        three_d_nodes.push_back(new Node<3>(0, false, 0.0, 0.0, 0.0));
        three_d_nodes.push_back(new Node<3>(1, false, 1.0, 0.0, 0.0));
        three_d_nodes.push_back(new Node<3>(2, false, 0.0, 1.0, 0.0));
        three_d_nodes.push_back(new Node<3>(3, false, 0.0, 0.0, 1.0));
        Element<3,3> three_d_element(0u, three_d_nodes);
        ChastePoint<3> zero3(0,0,0);
        double u = 2.0;

        HeatEquationWithElementDependentSourceTerm<1> pde1;
        HeatEquationWithElementDependentSourceTerm<2> pde2;
        HeatEquationWithElementDependentSourceTerm<3> pde3;

        TS_ASSERT_DELTA(pde1.ComputeSourceTerm(zero1, u, &one_d_element), 0.0, 1e-12);
        one_d_element.ResetIndex(1u);
        TS_ASSERT_DELTA(pde1.ComputeSourceTerm(zero1, u, &one_d_element), 1.0, 1e-12);

        TS_ASSERT_DELTA(pde2.ComputeSourceTerm(zero2, u, &two_d_element), 0.0, 1e-12);
        two_d_element.ResetIndex(1u);
        TS_ASSERT_DELTA(pde2.ComputeSourceTerm(zero2, u, &two_d_element), 1.0, 1e-12);

        TS_ASSERT_DELTA(pde3.ComputeSourceTerm(zero3, u, &three_d_element), 0.0, 1e-12);
        three_d_element.ResetIndex(1u);
        TS_ASSERT_DELTA(pde3.ComputeSourceTerm(zero3, u, &three_d_element), 1.0, 1e-12);

        TS_ASSERT_DELTA(pde1.ComputeDuDtCoefficientFunction(zero1), 1.0, 1e-12);
        TS_ASSERT_DELTA(pde2.ComputeDuDtCoefficientFunction(zero2), 1.0, 1e-12);
        TS_ASSERT_DELTA(pde3.ComputeDuDtCoefficientFunction(zero3), 1.0, 1e-12);

        // Diffusion matrices should be equal to identity
        c_matrix<double, 1, 1> diff1 = pde1.ComputeDiffusionTerm(zero1);
        c_matrix<double, 2, 2> diff2 = pde2.ComputeDiffusionTerm(zero2);
        c_matrix<double, 3, 3> diff3 = pde3.ComputeDiffusionTerm(zero3);

        TS_ASSERT_DELTA(diff1(0,0), 1, 1e-12);

        TS_ASSERT_DELTA(diff2(0,0), 1, 1e-12);
        TS_ASSERT_DELTA(diff2(1,1), 1, 1e-12);
        TS_ASSERT_DELTA(diff2(0,1), 0, 1e-12);

        TS_ASSERT_DELTA(diff3(0,0), 1, 1e-12);
        TS_ASSERT_DELTA(diff3(1,1), 1, 1e-12);
        TS_ASSERT_DELTA(diff3(2,2), 1, 1e-12);
        TS_ASSERT_DELTA(diff3(0,1), 0, 1e-12);
        TS_ASSERT_DELTA(diff3(0,2), 0, 1e-12);
        TS_ASSERT_DELTA(diff3(1,2), 0, 1e-12);

        delete one_d_nodes[0];
        delete one_d_nodes[1];

        delete two_d_nodes[0];
        delete two_d_nodes[1];
        delete two_d_nodes[2];

        delete three_d_nodes[0];
        delete three_d_nodes[1];
        delete three_d_nodes[2];
        delete three_d_nodes[3];
    }