コード例 #1
0
ファイル: Mpassive2d.c プロジェクト: Seislet/src
int main(int argc, char* argv[])
{
    bool verb,pas,adj,abc;          /* execution flags */
    int ix, iz, it;                 /* index variables */
    int nt, nx, nz, depth, nzxpad, nb, n2, snap;
    float ox, oz, dx, dz, dt, dt2, idz2, idx2, cb;

    int nxpad, nzpad;
    float **vvpad;
    
    float **dd, **mm, **vv, ***ww;
    float **u0, **u1, **u2, **tmp;  /* temporary arrays */

    sf_file in, out, vel, wave;     /* I/O files */

    /* initialize Madagascar */
    sf_init(argc,argv);
    
    /* initialize OpenMP support */
#ifdef _OPENMP
    omp_init();
#endif

    if(!sf_getbool("verb", &verb)) verb=false;
    /* verbosity flag */
    if(!sf_getbool("adj", &adj)) adj=false;
    /* adjoint flag, 0: modeling, 1: migration */
    if(!sf_getbool("pas", &pas)) pas=false;
    /* passive flag, 0: exploding reflector rtm, 1: passive seismic imaging */
    if(!sf_getbool("abc",&abc)) abc = false;
    /* absorbing boundary condition */
    if(!sf_getint("snap", &snap)) snap=0;
    /* wavefield snapshot flag */
    if(!sf_getint("depth", &depth)) depth=0;
    /* surface */
	
    /* setup I/O files */
    in  = sf_input("in");
    out = sf_output("out");
    vel = sf_input("velocity");
    /* velocity model */
    
    /* Dimensions */
    if(!sf_histint  (vel, "n1", &nz)) sf_error("No n1= in velocity");
    if(!sf_histint  (vel, "n2", &nx)) sf_error("No n2= in velocity");
    if(!sf_histfloat(vel, "o1", &oz)) sf_error("No o1= in velocity");
    if(!sf_histfloat(vel, "o2", &ox)) sf_error("No o2= in velocity");
    if(!sf_histfloat(vel, "d1", &dz)) sf_error("No d1= in velocity");
    if(!sf_histfloat(vel, "d2", &dx)) sf_error("No d2= in velocity");

    if(adj){ /* migration */
        if(!sf_histint(in, "n1", &nt)) sf_error("No n1= in data");
        if(!sf_histfloat(in, "d1", &dt)) sf_error("No d1= in data");
        if(!sf_histint(in, "n2", &n2) || n2!=nx) sf_error("Need n2=%d in data", nx);

        sf_putint   (out, "n1", nz);
        sf_putfloat (out, "o1", oz);
        sf_putfloat (out, "d1", dz);
        sf_putstring(out, "label1", "Depth");
        sf_putstring(out, "unit1" , "km");
        sf_putint   (out, "n2", nx);
        sf_putfloat (out, "o2", ox);
        sf_putfloat (out, "d2", dx);
        sf_putstring(out, "label2", "Distance");
        sf_putstring(out, "unit2" , "km");
        if (pas) {
            sf_putint   (out, "n3", nt);
            sf_putfloat (out, "d3", dt);
            sf_putfloat (out, "o3", 0.0f);
            sf_putstring(out, "label3", "Time");
            sf_putstring(out, "unit3" , "s");
        }
    }else{ /* modeling */
        if(!sf_getint("nt", &nt)) sf_error("Need nt=");
        if(!sf_getfloat("dt", &dt)) sf_error("Need dt=");
        
        sf_putint   (out, "n1", nt);
        sf_putfloat (out, "d1", dt);
        sf_putfloat (out, "o1", 0.0);
        sf_putstring(out, "label1", "Time");
        sf_putstring(out, "unit1" , "s");
        sf_putint   (out, "n2", nx);
        sf_putfloat (out, "o2", ox);
        sf_putfloat (out, "d2", dx);
        sf_putstring(out, "label2", "Distance");
        sf_putstring(out, "unit2" , "km");
        if (pas) {
            sf_putint   (out, "n3", 1);
        }
    }
    
    /* dimension of padded boundary */
    if(!sf_getint("nb", &nb) || nb<NOP) nb = NOP;
    if(!sf_getfloat("cb", &cb)) cb = 0.0f;
    nxpad = nx+2*nb;
    nzpad = nz+2*nb;
    nzxpad = nzpad*nxpad;
    depth = depth+nb;

    /* set Laplacian coefficients */
    idz2 = 1.0f/(dz*dz);
    idx2 = 1.0f/(dx*dx);
    
    /* wavefield snapshot */
    if(snap){
        wave = sf_output("wave");
        
        sf_putint(wave, "n1", nzpad);
        sf_putfloat(wave, "d1", dz);
        sf_putfloat(wave, "o1", oz-nb*dz);
            
        sf_putint(wave, "n2", nxpad);
        sf_putfloat(wave, "d2", dx);
        sf_putfloat(wave, "o2", ox-nb*dx);
        
        sf_putint(wave, "n3", 1+(nt-1)/snap);
        if(adj){
            sf_putfloat(wave, "d3", -snap*dt);
            sf_putfloat(wave, "o3", (nt-1)*dt);
        }else{
            sf_putfloat(wave, "d3", snap*dt);
            sf_putfloat(wave, "o3", 0.0f);
        }
    }
        
    /* allocate arrays */
    vv    = sf_floatalloc2(nz, nx);
    dd    = sf_floatalloc2(nt, nx);
    vvpad = sf_floatalloc2(nzpad, nxpad);
    u0    = sf_floatalloc2(nzpad, nxpad);
    u1    = sf_floatalloc2(nzpad, nxpad);
    u2    = sf_floatalloc2(nzpad, nxpad);
    if (pas) {
        mm = NULL;
        ww = sf_floatalloc3(nz, nx, nt);
    } else {
        mm = sf_floatalloc2(nz, nx);
        ww = NULL;
    }
    
    /* read velocity */
    sf_floatread(vv[0], nz*nx, vel);
    
    /* pad boundary */
    dt2 = dt*dt;
    for     (ix=0; ix<nx; ix++)
        for (iz=0; iz<nz; iz++)
            vvpad[ix+nb][iz+nb] = vv[ix][iz]*vv[ix][iz]*dt2;
    for     (ix=0; ix<nxpad; ix++){
        for (iz=0; iz<nb;    iz++){
            vvpad[ix][      iz  ] = vvpad[ix][      nb  ];
            vvpad[ix][nzpad-iz-1] = vvpad[ix][nzpad-nb-1];
        }
    }
    for     (ix=0; ix<nb;    ix++){
        for (iz=0; iz<nzpad; iz++){
            vvpad[      ix  ][iz]=vvpad[      nb  ][iz];
            vvpad[nxpad-ix-1][iz]=vvpad[nxpad-nb-1][iz];
        }
    }

    memset(u0[0], 0.0f, nzxpad*sizeof(float));
    memset(u1[0], 0.0f, nzxpad*sizeof(float));
    memset(u2[0], 0.0f, nzxpad*sizeof(float));

    /* absorbing boundary condition */
    if (abc) {
        if (verb) sf_warning("absorbing boundary condition");
        abc_init(nzpad,nxpad,nzpad,nxpad,nb,nb,nb,nb,cb,cb,cb,cb);
    }

    if(adj){ /* migration */
        
        /* read data */
        sf_floatread(dd[0], nt*nx, in);
        
        for (it=nt-1; it>-1; it--){
            if (verb) sf_warning("Migration: %d/%d;", it, 0);
            
            /* time stepping */
#ifdef _OPENMP
#pragma omp parallel for default(shared) private(ix, iz)
#endif
            for     (ix=NOP; ix<nxpad-NOP; ix++){
                for (iz=NOP; iz<nzpad-NOP; iz++){
                    u2[ix][iz] = LapT(u1,ix,iz,idx2,idz2,vvpad) + 2.0f*u1[ix][iz] - u0[ix][iz];
                }
            }
            /* rotate pointers */
            tmp=u0; u0=u1; u1=u2; u2=tmp;
            if (abc) abc_apply(u1[0]);
            if (abc) abc_apply(u0[0]);

            /* inject data */
#ifdef _OPENMP
#pragma omp parallel for default(shared) private(ix)
#endif
            for (ix=nb; ix<nb+nx; ix++)
                u1[ix][depth] += dd[ix-nb][it];
            
            if (pas) {
                /* image source */
#ifdef _OPENMP
#pragma omp parallel for default(shared) private(ix, iz)
#endif
                for     (ix=0; ix<nx; ix++)
                    for (iz=0; iz<nz; iz++)
                        ww[it][ix][iz] = u1[ix+nb][iz+nb];
            }
            if (snap && it%snap==0) sf_floatwrite(u1[0], nzxpad, wave);
        }
        if (verb) sf_warning(".");

        if (!pas) {
            /* output image */
#ifdef _OPENMP
#pragma omp parallel for default(shared) private(ix, iz)
#endif
            for     (ix=0; ix<nx; ix++)
                for (iz=0; iz<nz; iz++)
                    mm[ix][iz] = u1[ix+nb][iz+nb];
            sf_floatwrite(mm[0], nz*nx, out);
        } else {
            /* output source */
            sf_floatwrite(ww[0][0], nz*nx*nt, out);
        }
    
    }else{/* modeling */
    	
        if (pas) { 
            /* read source */
            sf_floatread(ww[0][0], nz*nx*nt, in);
        } else { 
            /* read image */
            sf_floatread(mm[0], nz*nx, in);
#ifdef _OPENMP
#pragma omp parallel for default(shared) private(ix, iz)
#endif
            for     (ix=0; ix<nx; ix++)
                for (iz=0; iz<nz; iz++)
                    u1[ix+nb][iz+nb] = mm[ix][iz];
        }

    	for (it=0; it<nt; it++){
            if (verb) sf_warning("Modeling: %d/%d;", it, nt-1);
            
            if(snap && it%snap==0) sf_floatwrite(u1[0], nzxpad, wave);
            if (pas){
                /* inject source */
#ifdef _OPENMP
#pragma omp parallel for default(shared) private(ix, iz)
#endif
                for     (ix=0; ix<nx; ix++)
                    for (iz=0; iz<nz; iz++)
                        u1[ix+nb][iz+nb] += ww[it][ix][iz];
            }

            /* record data */
#ifdef _OPENMP
#pragma omp parallel for default(shared) private(ix)
#endif
            for (ix=nb; ix<nb+nx; ix++)
	        dd[ix-nb][it] = u1[ix][depth];
            
            if (abc) abc_apply(u0[0]);
            if (abc) abc_apply(u1[0]);
            /* time stepping */
#ifdef _OPENMP
#pragma omp parallel for default(shared) private(ix, iz)
#endif
            for     (ix=NOP; ix<nxpad-NOP; ix++){
                for (iz=NOP; iz<nzpad-NOP; iz++){
                    u2[ix][iz] = Lap (u1,ix,iz,idx2,idz2,vvpad) + 2.0f*u1[ix][iz] - u0[ix][iz];
                }
            }
            /* rotate pointers */
            tmp=u0; u0=u1; u1=u2; u2=tmp;
        }
    	if (verb) sf_warning(".");
        
    	/* output data */
    	sf_floatwrite(dd[0], nt*nx, out);
    }

    if(pas) {
        free(**ww); free(*ww); free(ww);
    } else {
        free(*mm); free(mm);
    }

    if (abc) abc_close();
    free(*vvpad); free(vvpad); free(*vv); free(vv);
    free(*dd); free(dd); 
    free(*u0); free(u0); free(*u1); free(u1); free(*u2); free(u2);
    exit (0);
}
コード例 #2
0
ファイル: mri.c プロジェクト: shangchying/root
int mri(
		float* img, 
		float complex* f, 
		float* mask, 
		float lambda,
		int N1,
		int N2)
{
	int i, j;

	float complex* f0	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dx	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy	    = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dx_new   = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy_new   = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dtildex	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dtildey	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u_fft2	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* fftmul	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* Lap		= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* diff		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float sum = 0;
	float scale		 = sqrtf(N1*N2);
	Lap(N1-1, N2-1)	= 0.f;
	Lap(N1-1, 0)	= 1.f; 
	Lap(N1-1, 1)	= 0.f;
	Lap(0, N2-1)	= 1.f;
	Lap(0, 0)		= -4.f; 
	Lap(0, 1)		= 1.f;
	Lap(1, N2-1)	= 0.f;
	Lap(1, 0)		= 1.f; 
	Lap(1, 1)		= 0.f;

	float complex *w1;
	float complex *w2;
	float complex *buff;
	double lambdaGamma1 = lambda/Gamma1;
	double lambdaGamma1Scale = lambda/Gamma1*scale;
	float Thresh=1.0/Gamma1;
	
	MPI_Datatype mpi_complexf;
	MPI_Type_contiguous(2, MPI_FLOAT, &mpi_complexf);
	MPI_Type_commit(&mpi_complexf);
	
	int np, rank;
    MPI_Comm_size(MPI_COMM_WORLD ,&np);
    MPI_Comm_rank(MPI_COMM_WORLD ,&rank);
    int chunksize = N1/np;
    int start = rank * chunksize;
    int cnt[np];
	int disp[np];
	for (i = 0 ; i < np - 1; i ++) {
		cnt[i] = chunksize * N2 ;
		disp[i] = chunksize * i * N2;
	}
	cnt[i] = (chunksize + N1%np) * N2;
	disp[i] = chunksize * i * N2;

    if (rank == np - 1)
    	chunksize += N1%np;
    int end = start + chunksize;


	for(i=start; i<chunksize; i++)
		for(j=0; j<N2; j++)
			sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1));
	MPI_Allreduce(&sum, &sum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);		
	float normFactor = 1.f/sqrtf(sum);
	

	for(i=0; i<N1; i++) {
		for(j=0; j<N2; j++) {
			f(i, j)  = f(i, j)*normFactor;
			f0(i, j) = f(i, j);
		}
	}
	

	dft_init(&w1, &w2, &buff, N1, N2);
	dft(Lap, Lap, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank);
	MPI_Status *status;

	for(i=start;i<end;i++)
		for(j=0;j<N2;j++)					
			fftmul(i,j) = 1.0/((lambda/Gamma1)*mask(i,j) - Lap(i,j) + Gamma2);
			
	int OuterIter,iter;
	for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) {
		for(iter = 0; iter<MaxIter; iter++) {
			
			for(i=start;i<end;i++)	
				for(j=0;j<N2;j++)
					diff(i,j)  = dtildex(i,j)-dtildex(i,(j-1)>=0?(j-1):0) + dtildey(i,j)- dtildey((i-1)>=0?(i-1):0,j) ;
			
			dft(diff, diff, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank);

			
			for(i=start;i<end;i++) {
				for(j=0;j<N2;j++) {
					u_fft2(i,j) = fftmul(i,j)*(f(i,j)*lambdaGamma1Scale-diff(i,j)+Gamma2*u_fft2(i,j)) ;
					if (iter == MaxIter - 1)
						f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale; 
				}
			}
			
			idft(u, u_fft2, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank);
			//MPI_Allgatherv(u + disp[rank], cnt[rank], mpi_complexf, u, cnt, disp, mpi_complexf, MPI_COMM_WORLD);
			if (rank == np - 1)
				MPI_Send(u + disp[rank], N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD);
			else if (rank == 0)
				MPI_Recv(u + disp[rank] + cnt[rank], N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD, status);
			else {
				MPI_Recv(u + disp[rank] + cnt[rank], N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD, status);
				MPI_Send(u + disp[rank], N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD);
			}
			
			for(i=start;i<end;i++) {
				for(j=0;j<N2;j++) {
					float tmp;
					dx(i,j)     = u(i,j<(N2-1)?(j+1):j)-u(i,j)+dx(i,j)-dtildex(i,j) ;
					dy(i,j)     = u(i<(N1-1)?(i+1):i,j)-u(i,j)+dy(i,j)-dtildey(i,j) ;

					tmp = sqrtf(SQR(crealf(dx(i,j)))+SQR(cimagf(dx(i,j))) + SQR(crealf(dy(i,j)))+SQR(cimagf(dy(i,j))));
					tmp = max(0,tmp-Thresh)/(tmp+(tmp<Thresh));
					dx_new(i,j) =dx(i,j)*tmp;
					dy_new(i,j) =dy(i,j)*tmp;
					dtildex(i,j) = 2*dx_new(i,j) - dx(i,j);
					dtildey(i,j) = 2*dy_new(i,j) - dy(i,j);
					dx(i,j)      = dx_new(i,j);
					dy(i,j)      = dy_new(i,j);
				}
			}
			//MPI_Allgatherv(dtildey + disp[rank], cnt[rank], mpi_complexf, dtildey, cnt, disp, mpi_complexf, MPI_COMM_WORLD);
			if (rank == np - 1)
				MPI_Recv(dtildey + disp[rank] - N2, N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD, status);
			else if (rank == 0)
				MPI_Send(dtildey + disp[rank] + cnt[rank] - N2, N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD);
			else {
				MPI_Recv(dtildey + disp[rank] - N2, N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD, status);
				MPI_Send(dtildey + disp[rank] + cnt[rank] - N2, N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD);
			}
            
		}
	}

	for(i=start; i<end; i++) {
		for(j=0; j<N2; j++) {
			img(i, j) = sqrt(SQR(crealf(u(i, j))) + SQR(cimagf(u(i, j))));
		}
	}
	MPI_Gatherv(img + disp[rank], cnt[rank], MPI_FLOAT, img, cnt, disp, MPI_FLOAT, 0, MPI_COMM_WORLD);
	free(w1);
	free(w2);
	free(buff);
	MPI_Finalize();
	if (rank > 0)
		exit(0);
	return 0;
}
コード例 #3
0
ファイル: mri_mpi.c プロジェクト: samwarring/CS133Project
int mri(
		float* img, 
		float complex* f, 
		float* mask, 
		float lambda,
		int N1,
		int N2)
{
	int i, j;

	float complex* f0	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dx	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy	    = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dx_new   = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy_new   = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dtildex	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dtildey	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u_fft2	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* fftmul	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* Lap		= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* diff		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float sum = 0;

	for(i=0; i<N1; i++)
		for(j=0; j<N2; j++)
			sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1));

	float normFactor = 1.f/sqrtf(sum);
	float scale		 = sqrtf(N1*N2);

	for(i=0; i<N1; i++) {
		for(j=0; j<N2; j++) {
			f(i, j)  = f(i, j)*normFactor;
			f0(i, j) = f(i, j);
		}
	}
	Lap(N1-1, N2-1)	= 0.f;
	Lap(N1-1, 0)	= 1.f; 
	Lap(N1-1, 1)	= 0.f;
	Lap(0, N2-1)	= 1.f;
	Lap(0, 0)		= -4.f; 
	Lap(0, 1)		= 1.f;
	Lap(1, N2-1)	= 0.f;
	Lap(1, 0)		= 1.f; 
	Lap(1, 1)		= 0.f;

	float complex *w1;
	float complex *w2;
	float complex *buff;

	dft_init(&w1, &w2, &buff, N1, N2);
	dft(Lap, Lap, w1, w2, buff, N1, N2);

	for(i=0;i<N1;i++)
		for(j=0;j<N2;j++)					
			fftmul(i,j) = 1.0/((lambda/Gamma1)*mask(i,j) - Lap(i,j) + Gamma2);

	int OuterIter,iter;
	for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) {
		for(iter = 0; iter<MaxIter; iter++) {

			for(i=0;i<N1;i++)	
				for(j=0;j<N2;j++)
					diff(i,j)  = dtildex(i,j)-dtildex(i,(j-1)>=0?(j-1):0) + dtildey(i,j)- dtildey((i-1)>=0?(i-1):0,j) ;

			dft(diff, diff, w1, w2, buff, N1, N2);

			for(i=0;i<N1;i++)
				for(j=0;j<N2;j++)
					u_fft2(i,j) = fftmul(i,j)*(f(i,j)*lambda/Gamma1*scale-diff(i,j)+Gamma2*u_fft2(i,j)) ;

			idft(u, u_fft2, w1, w2, buff, N1, N2);

			for(i=0;i<N1;i++) {
				for(j=0;j<N2;j++) {
					float tmp;
					float Thresh=1.0/Gamma1;

					dx(i,j)     = u(i,j<(N2-1)?(j+1):j)-u(i,j)+dx(i,j)-dtildex(i,j) ;
					dy(i,j)     = u(i<(N1-1)?(i+1):i,j)-u(i,j)+dy(i,j)-dtildey(i,j) ;

					tmp = sqrtf(SQR(crealf(dx(i,j)))+SQR(cimagf(dx(i,j))) + SQR(crealf(dy(i,j)))+SQR(cimagf(dy(i,j))));
					tmp = max(0,tmp-Thresh)/(tmp+(tmp<Thresh));
					dx_new(i,j) =dx(i,j)*tmp;
					dy_new(i,j) =dy(i,j)*tmp;
					dtildex(i,j) = 2*dx_new(i,j) - dx(i,j);
					dtildey(i,j) = 2*dy_new(i,j) - dy(i,j);
					dx(i,j)      = dx_new(i,j);
					dy(i,j)      = dy_new(i,j);
				}
			}
		}
		for(i=0;i<N1;i++) {
			for(j=0;j<N2;j++) {
				f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale;  
			}
		}
	}

	for(i=0; i<N1; i++) {
		for(j=0; j<N2; j++) {
			img(i, j) = sqrt(SQR(crealf(u(i, j))) + SQR(cimagf(u(i, j))));
		}
	}

	free(w1);
	free(w2);
	free(buff);
	return 0;
}
コード例 #4
0
ファイル: mri.c プロジェクト: shangchying/root
int mri(
		float* img, 
		float complex* f, 
		float* mask, 
		float lambda,
		int N1,
		int N2)
{
	int i, j;

    // Use this to check the output of each API call
    cl_int status;

    // Retrieve the number of platforms
    cl_uint numPlatforms = 0;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);

    // Allocate enough space for each platform
    cl_platform_id *platforms = NULL;
    platforms = (cl_platform_id*)malloc(
        numPlatforms*sizeof(cl_platform_id));

    // Fill in the platforms
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    // Retrieve the number of devices
    cl_uint numDevices = 0;
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0,
        NULL, &numDevices);

    // Allocate enough space for each device
    cl_device_id *devices;
    devices = (cl_device_id*)malloc(
        numDevices*sizeof(cl_device_id));

    // Fill in the devices
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL,
        numDevices, devices, NULL);

    // Create a context and associate it with the devices
    cl_context context;
    context = clCreateContext(NULL, numDevices, devices, NULL,
        NULL, &status);

    // Create a command queue and associate it with the device
    cl_command_queue cmdQueue;
    cmdQueue = clCreateCommandQueue(context, devices[0], 0,
        &status);

    // Create a buffer object that will contain the data
    // from the host array A
        
	float complex* f0	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dx	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy	    = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dx_new   = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy_new   = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dtildex	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dtildey	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u_fft2	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* fftmul	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* Lap		= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* diff		= (float complex*) calloc(N1*N2,sizeof(float complex));
    float complex *w1 = (float complex*)malloc(((N2-1)*(N2-1)+1)*sizeof(float complex));
	float complex *w2 = (float complex*)malloc(((N1-1)*(N1-1)+1)*sizeof(float complex));
	float complex *buff = (float complex*)malloc(N2*N1*sizeof(float complex));
       
    Lap(N1-1, N2-1)	= 0.f;
	Lap(N1-1, 0)	= 1.f; 
	Lap(N1-1, 1)	= 0.f;
	Lap(0, N2-1)	= 1.f;
	Lap(0, 0)		= -4.f; 
	Lap(0, 1)		= 1.f;
	Lap(1, N2-1)	= 0.f;
	Lap(1, 0)		= 1.f; 
	Lap(1, 1)		= 0.f;

    cl_mem cl_img = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status);
    cl_mem cl_mask = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status);
    cl_mem cl_f = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
    cl_mem cl_f0 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dx = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dy = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

	cl_mem cl_dx_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dy_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

	cl_mem cl_dtildex = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dtildey = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_u_fft2 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_u = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

	cl_mem cl_fftmul = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_Lap = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_diff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
    
    cl_mem cl_w1 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N2*N2)*sizeof(cl_float2), NULL, &status);
    cl_mem cl_w2 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N1*N1)*sizeof(cl_float2), NULL, &status);
    cl_mem cl_buff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

    status = clEnqueueWriteBuffer(cmdQueue, cl_mask, CL_FALSE, 0, N1*N2*sizeof(float), mask, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_f, CL_FALSE, 0, N1*N2*sizeof(cl_float2), f, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_Lap, CL_FALSE, 0, N1*N2*sizeof(cl_float2), Lap, 0, NULL, NULL);
        
	cl_program program = clCreateProgramWithSource(context, 1, 
        (const char**)&kernel, NULL, &status);
        
    status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
	cl_kernel ker;
	size_t globalWorkSize[2]={N1,N2};
	
	float sum = 0;

	for(i=0; i<N1; i++)
		for(j=0; j<N2; j++)
			sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1));
            
	float normFactor = 1.f/sqrtf(sum);
	float scale		 = sqrtf(N1*N2);

    ker = clCreateKernel(program, "loop1", &status);

    
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0);
    status = clSetKernelArg(ker, 2, sizeof(cl_float2), &normFactor);
    status = clSetKernelArg(ker, 3, sizeof(int), &N1);
    status = clSetKernelArg(ker, 4, sizeof(int), &N2);
    
    
    
    
    status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
    w1[0] = 1;
	w2[0] = 1;
	dft_init(&w1, &w2, &buff, N1, N2);
    status = clEnqueueWriteBuffer(cmdQueue, cl_w1, CL_FALSE, 0, ((N2-1)*(N2-1)+1)*sizeof(cl_float2), w1, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_w2, CL_FALSE, 0, ((N1-1)*(N1-1)+1)*sizeof(cl_float2), w2, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_buff, CL_FALSE, 0, N1*N2*sizeof(cl_float2), buff, 0, NULL, NULL);

    ker = clCreateKernel(program, "dft1", &status);
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
    status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
    status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
    status = clSetKernelArg(ker, 5, sizeof(int), &N1);
    status = clSetKernelArg(ker, 6, sizeof(int), &N2);
    status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
                   if (status != CL_SUCCESS)
            	printf("error: %d\n", status); 
    ker = clCreateKernel(program, "dft2", &status);
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
    status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
    status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
    status = clSetKernelArg(ker, 5, sizeof(int), &N1);
    status = clSetKernelArg(ker, 6, sizeof(int), &N2);
    status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 
               if (status != CL_SUCCESS)
            	printf("error: %d\n", status); 
    ker = clCreateKernel(program, "loop2", &status);
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask);
    status = clSetKernelArg(ker, 3, sizeof(float), &lambda);
    status = clSetKernelArg(ker, 4, sizeof(int), &N1);
    status = clSetKernelArg(ker, 5, sizeof(int), &N2);
    status = clEnqueueNDRangeKernel(cmdQueue, ker,2, NULL, globalWorkSize, NULL, 0, NULL, NULL);    
    
    float complex *tmp = (float complex*)malloc(N2*N1*sizeof(float complex));
    float complex *tmp2 = (float complex*)malloc(N2*N1*sizeof(float complex));
    
    
    
	int OuterIter,iter;
	for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) {
		for(iter = 0; iter<MaxIter; iter++) {
            ker = clCreateKernel(program, "loop3", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dtildex);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_dtildey);
            status = clSetKernelArg(ker, 3, sizeof(int), &N1);
            status = clSetKernelArg(ker, 4, sizeof(int), &N2);

            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 

            ker = clCreateKernel(program, "dft1", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
             if (status != CL_SUCCESS)
            	printf("error: %d\n", status);
            	
            ker = clCreateKernel(program, "dft2", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 
            if (status != CL_SUCCESS)
            	printf("error: %d\n", status);
			//dft(diff, diff, w1, w2, buff, N1, N2);

            
            ker = clCreateKernel(program, "loop4", &status);
            int more = (iter == MaxIter - 1);

            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 4, sizeof(int), &N1);
            status = clSetKernelArg(ker, 5, sizeof(int), &N2);
            status = clSetKernelArg(ker, 6, sizeof(float), &scale);
            status = clSetKernelArg(ker, 7, sizeof(float), &lambda);
            status= clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 
            
            ker = clCreateKernel(program, "idft1", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
            
            ker = clCreateKernel(program, "idft2", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
            
         
            ker = clCreateKernel(program, "loop5", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_dx);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dy);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_dtildex);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_dtildey);
            status = clSetKernelArg(ker, 5, sizeof(cl_mem), &cl_dx_new);
            status = clSetKernelArg(ker, 6, sizeof(cl_mem), &cl_dy_new);
            status = clSetKernelArg(ker, 7, sizeof(int), &N1);
            status = clSetKernelArg(ker, 8, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);

            
		}
        /*
          ker = clCreateKernel(program, "last_loop", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 4, sizeof(float), &scale);
            status = clSetKernelArg(ker, 5, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
          if (status != CL_SUCCESS)
            	printf("error: %d\n", status);
          
          */
        clEnqueueReadBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL);    
        clEnqueueReadBuffer(cmdQueue, cl_f0, CL_TRUE, 0, N1*N2*sizeof(float), f0, 0, NULL, NULL);
        clEnqueueReadBuffer(cmdQueue, cl_u_fft2, CL_TRUE, 0, N1*N2*sizeof(float), u_fft2, 0, NULL, NULL);
        
        for(i=0;i<N1;i++) {
			for(j=0;j<N2;j++) {
				f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale;  
			}
		}
        
        clEnqueueWriteBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL);
        
       
	}
    
    
            ker = clCreateKernel(program, "loop7", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_img);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 2, sizeof(int), &N1);
            status = clSetKernelArg(ker, 3, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 


    clEnqueueReadBuffer(cmdQueue, cl_img, CL_TRUE, 0, N1*N2*sizeof(float), img, 0, NULL, NULL);
    clReleaseKernel(ker);
    clReleaseProgram(program);
    clReleaseCommandQueue(cmdQueue);
    clReleaseMemObject(cl_img);
    clReleaseMemObject(cl_mask);
    clReleaseMemObject(cl_f);
    clReleaseMemObject(cl_f0);
    clReleaseMemObject(cl_dx);
    clReleaseMemObject(cl_dy);
    clReleaseMemObject(cl_dx_new);
    clReleaseMemObject(cl_dy_new);
    clReleaseMemObject(cl_dtildex);
    clReleaseMemObject(cl_dtildey);
    clReleaseMemObject(cl_u_fft2);
    clReleaseMemObject(cl_u);
    clReleaseMemObject(cl_fftmul);
    clReleaseMemObject(cl_Lap);
    clReleaseMemObject(cl_diff);
    clReleaseMemObject(cl_w1);
    clReleaseMemObject(cl_w2);
    clReleaseMemObject(cl_buff);
    
    clReleaseContext(context);
    free(platforms);
    free(devices);
	free(w1);
	free(w2);
	free(buff);
	return 0;
}