int main(int argc, char* argv[]) { bool verb,pas,adj,abc; /* execution flags */ int ix, iz, it; /* index variables */ int nt, nx, nz, depth, nzxpad, nb, n2, snap; float ox, oz, dx, dz, dt, dt2, idz2, idx2, cb; int nxpad, nzpad; float **vvpad; float **dd, **mm, **vv, ***ww; float **u0, **u1, **u2, **tmp; /* temporary arrays */ sf_file in, out, vel, wave; /* I/O files */ /* initialize Madagascar */ sf_init(argc,argv); /* initialize OpenMP support */ #ifdef _OPENMP omp_init(); #endif if(!sf_getbool("verb", &verb)) verb=false; /* verbosity flag */ if(!sf_getbool("adj", &adj)) adj=false; /* adjoint flag, 0: modeling, 1: migration */ if(!sf_getbool("pas", &pas)) pas=false; /* passive flag, 0: exploding reflector rtm, 1: passive seismic imaging */ if(!sf_getbool("abc",&abc)) abc = false; /* absorbing boundary condition */ if(!sf_getint("snap", &snap)) snap=0; /* wavefield snapshot flag */ if(!sf_getint("depth", &depth)) depth=0; /* surface */ /* setup I/O files */ in = sf_input("in"); out = sf_output("out"); vel = sf_input("velocity"); /* velocity model */ /* Dimensions */ if(!sf_histint (vel, "n1", &nz)) sf_error("No n1= in velocity"); if(!sf_histint (vel, "n2", &nx)) sf_error("No n2= in velocity"); if(!sf_histfloat(vel, "o1", &oz)) sf_error("No o1= in velocity"); if(!sf_histfloat(vel, "o2", &ox)) sf_error("No o2= in velocity"); if(!sf_histfloat(vel, "d1", &dz)) sf_error("No d1= in velocity"); if(!sf_histfloat(vel, "d2", &dx)) sf_error("No d2= in velocity"); if(adj){ /* migration */ if(!sf_histint(in, "n1", &nt)) sf_error("No n1= in data"); if(!sf_histfloat(in, "d1", &dt)) sf_error("No d1= in data"); if(!sf_histint(in, "n2", &n2) || n2!=nx) sf_error("Need n2=%d in data", nx); sf_putint (out, "n1", nz); sf_putfloat (out, "o1", oz); sf_putfloat (out, "d1", dz); sf_putstring(out, "label1", "Depth"); sf_putstring(out, "unit1" , "km"); sf_putint (out, "n2", nx); sf_putfloat (out, "o2", ox); sf_putfloat (out, "d2", dx); sf_putstring(out, "label2", "Distance"); sf_putstring(out, "unit2" , "km"); if (pas) { sf_putint (out, "n3", nt); sf_putfloat (out, "d3", dt); sf_putfloat (out, "o3", 0.0f); sf_putstring(out, "label3", "Time"); sf_putstring(out, "unit3" , "s"); } }else{ /* modeling */ if(!sf_getint("nt", &nt)) sf_error("Need nt="); if(!sf_getfloat("dt", &dt)) sf_error("Need dt="); sf_putint (out, "n1", nt); sf_putfloat (out, "d1", dt); sf_putfloat (out, "o1", 0.0); sf_putstring(out, "label1", "Time"); sf_putstring(out, "unit1" , "s"); sf_putint (out, "n2", nx); sf_putfloat (out, "o2", ox); sf_putfloat (out, "d2", dx); sf_putstring(out, "label2", "Distance"); sf_putstring(out, "unit2" , "km"); if (pas) { sf_putint (out, "n3", 1); } } /* dimension of padded boundary */ if(!sf_getint("nb", &nb) || nb<NOP) nb = NOP; if(!sf_getfloat("cb", &cb)) cb = 0.0f; nxpad = nx+2*nb; nzpad = nz+2*nb; nzxpad = nzpad*nxpad; depth = depth+nb; /* set Laplacian coefficients */ idz2 = 1.0f/(dz*dz); idx2 = 1.0f/(dx*dx); /* wavefield snapshot */ if(snap){ wave = sf_output("wave"); sf_putint(wave, "n1", nzpad); sf_putfloat(wave, "d1", dz); sf_putfloat(wave, "o1", oz-nb*dz); sf_putint(wave, "n2", nxpad); sf_putfloat(wave, "d2", dx); sf_putfloat(wave, "o2", ox-nb*dx); sf_putint(wave, "n3", 1+(nt-1)/snap); if(adj){ sf_putfloat(wave, "d3", -snap*dt); sf_putfloat(wave, "o3", (nt-1)*dt); }else{ sf_putfloat(wave, "d3", snap*dt); sf_putfloat(wave, "o3", 0.0f); } } /* allocate arrays */ vv = sf_floatalloc2(nz, nx); dd = sf_floatalloc2(nt, nx); vvpad = sf_floatalloc2(nzpad, nxpad); u0 = sf_floatalloc2(nzpad, nxpad); u1 = sf_floatalloc2(nzpad, nxpad); u2 = sf_floatalloc2(nzpad, nxpad); if (pas) { mm = NULL; ww = sf_floatalloc3(nz, nx, nt); } else { mm = sf_floatalloc2(nz, nx); ww = NULL; } /* read velocity */ sf_floatread(vv[0], nz*nx, vel); /* pad boundary */ dt2 = dt*dt; for (ix=0; ix<nx; ix++) for (iz=0; iz<nz; iz++) vvpad[ix+nb][iz+nb] = vv[ix][iz]*vv[ix][iz]*dt2; for (ix=0; ix<nxpad; ix++){ for (iz=0; iz<nb; iz++){ vvpad[ix][ iz ] = vvpad[ix][ nb ]; vvpad[ix][nzpad-iz-1] = vvpad[ix][nzpad-nb-1]; } } for (ix=0; ix<nb; ix++){ for (iz=0; iz<nzpad; iz++){ vvpad[ ix ][iz]=vvpad[ nb ][iz]; vvpad[nxpad-ix-1][iz]=vvpad[nxpad-nb-1][iz]; } } memset(u0[0], 0.0f, nzxpad*sizeof(float)); memset(u1[0], 0.0f, nzxpad*sizeof(float)); memset(u2[0], 0.0f, nzxpad*sizeof(float)); /* absorbing boundary condition */ if (abc) { if (verb) sf_warning("absorbing boundary condition"); abc_init(nzpad,nxpad,nzpad,nxpad,nb,nb,nb,nb,cb,cb,cb,cb); } if(adj){ /* migration */ /* read data */ sf_floatread(dd[0], nt*nx, in); for (it=nt-1; it>-1; it--){ if (verb) sf_warning("Migration: %d/%d;", it, 0); /* time stepping */ #ifdef _OPENMP #pragma omp parallel for default(shared) private(ix, iz) #endif for (ix=NOP; ix<nxpad-NOP; ix++){ for (iz=NOP; iz<nzpad-NOP; iz++){ u2[ix][iz] = LapT(u1,ix,iz,idx2,idz2,vvpad) + 2.0f*u1[ix][iz] - u0[ix][iz]; } } /* rotate pointers */ tmp=u0; u0=u1; u1=u2; u2=tmp; if (abc) abc_apply(u1[0]); if (abc) abc_apply(u0[0]); /* inject data */ #ifdef _OPENMP #pragma omp parallel for default(shared) private(ix) #endif for (ix=nb; ix<nb+nx; ix++) u1[ix][depth] += dd[ix-nb][it]; if (pas) { /* image source */ #ifdef _OPENMP #pragma omp parallel for default(shared) private(ix, iz) #endif for (ix=0; ix<nx; ix++) for (iz=0; iz<nz; iz++) ww[it][ix][iz] = u1[ix+nb][iz+nb]; } if (snap && it%snap==0) sf_floatwrite(u1[0], nzxpad, wave); } if (verb) sf_warning("."); if (!pas) { /* output image */ #ifdef _OPENMP #pragma omp parallel for default(shared) private(ix, iz) #endif for (ix=0; ix<nx; ix++) for (iz=0; iz<nz; iz++) mm[ix][iz] = u1[ix+nb][iz+nb]; sf_floatwrite(mm[0], nz*nx, out); } else { /* output source */ sf_floatwrite(ww[0][0], nz*nx*nt, out); } }else{/* modeling */ if (pas) { /* read source */ sf_floatread(ww[0][0], nz*nx*nt, in); } else { /* read image */ sf_floatread(mm[0], nz*nx, in); #ifdef _OPENMP #pragma omp parallel for default(shared) private(ix, iz) #endif for (ix=0; ix<nx; ix++) for (iz=0; iz<nz; iz++) u1[ix+nb][iz+nb] = mm[ix][iz]; } for (it=0; it<nt; it++){ if (verb) sf_warning("Modeling: %d/%d;", it, nt-1); if(snap && it%snap==0) sf_floatwrite(u1[0], nzxpad, wave); if (pas){ /* inject source */ #ifdef _OPENMP #pragma omp parallel for default(shared) private(ix, iz) #endif for (ix=0; ix<nx; ix++) for (iz=0; iz<nz; iz++) u1[ix+nb][iz+nb] += ww[it][ix][iz]; } /* record data */ #ifdef _OPENMP #pragma omp parallel for default(shared) private(ix) #endif for (ix=nb; ix<nb+nx; ix++) dd[ix-nb][it] = u1[ix][depth]; if (abc) abc_apply(u0[0]); if (abc) abc_apply(u1[0]); /* time stepping */ #ifdef _OPENMP #pragma omp parallel for default(shared) private(ix, iz) #endif for (ix=NOP; ix<nxpad-NOP; ix++){ for (iz=NOP; iz<nzpad-NOP; iz++){ u2[ix][iz] = Lap (u1,ix,iz,idx2,idz2,vvpad) + 2.0f*u1[ix][iz] - u0[ix][iz]; } } /* rotate pointers */ tmp=u0; u0=u1; u1=u2; u2=tmp; } if (verb) sf_warning("."); /* output data */ sf_floatwrite(dd[0], nt*nx, out); } if(pas) { free(**ww); free(*ww); free(ww); } else { free(*mm); free(mm); } if (abc) abc_close(); free(*vvpad); free(vvpad); free(*vv); free(vv); free(*dd); free(dd); free(*u0); free(u0); free(*u1); free(u1); free(*u2); free(u2); exit (0); }
int mri( float* img, float complex* f, float* mask, float lambda, int N1, int N2) { int i, j; float complex* f0 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildex = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildey = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u_fft2 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* fftmul = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* Lap = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* diff = (float complex*) calloc(N1*N2,sizeof(float complex)); float sum = 0; float scale = sqrtf(N1*N2); Lap(N1-1, N2-1) = 0.f; Lap(N1-1, 0) = 1.f; Lap(N1-1, 1) = 0.f; Lap(0, N2-1) = 1.f; Lap(0, 0) = -4.f; Lap(0, 1) = 1.f; Lap(1, N2-1) = 0.f; Lap(1, 0) = 1.f; Lap(1, 1) = 0.f; float complex *w1; float complex *w2; float complex *buff; double lambdaGamma1 = lambda/Gamma1; double lambdaGamma1Scale = lambda/Gamma1*scale; float Thresh=1.0/Gamma1; MPI_Datatype mpi_complexf; MPI_Type_contiguous(2, MPI_FLOAT, &mpi_complexf); MPI_Type_commit(&mpi_complexf); int np, rank; MPI_Comm_size(MPI_COMM_WORLD ,&np); MPI_Comm_rank(MPI_COMM_WORLD ,&rank); int chunksize = N1/np; int start = rank * chunksize; int cnt[np]; int disp[np]; for (i = 0 ; i < np - 1; i ++) { cnt[i] = chunksize * N2 ; disp[i] = chunksize * i * N2; } cnt[i] = (chunksize + N1%np) * N2; disp[i] = chunksize * i * N2; if (rank == np - 1) chunksize += N1%np; int end = start + chunksize; for(i=start; i<chunksize; i++) for(j=0; j<N2; j++) sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1)); MPI_Allreduce(&sum, &sum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); float normFactor = 1.f/sqrtf(sum); for(i=0; i<N1; i++) { for(j=0; j<N2; j++) { f(i, j) = f(i, j)*normFactor; f0(i, j) = f(i, j); } } dft_init(&w1, &w2, &buff, N1, N2); dft(Lap, Lap, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank); MPI_Status *status; for(i=start;i<end;i++) for(j=0;j<N2;j++) fftmul(i,j) = 1.0/((lambda/Gamma1)*mask(i,j) - Lap(i,j) + Gamma2); int OuterIter,iter; for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) { for(iter = 0; iter<MaxIter; iter++) { for(i=start;i<end;i++) for(j=0;j<N2;j++) diff(i,j) = dtildex(i,j)-dtildex(i,(j-1)>=0?(j-1):0) + dtildey(i,j)- dtildey((i-1)>=0?(i-1):0,j) ; dft(diff, diff, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank); for(i=start;i<end;i++) { for(j=0;j<N2;j++) { u_fft2(i,j) = fftmul(i,j)*(f(i,j)*lambdaGamma1Scale-diff(i,j)+Gamma2*u_fft2(i,j)) ; if (iter == MaxIter - 1) f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale; } } idft(u, u_fft2, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank); //MPI_Allgatherv(u + disp[rank], cnt[rank], mpi_complexf, u, cnt, disp, mpi_complexf, MPI_COMM_WORLD); if (rank == np - 1) MPI_Send(u + disp[rank], N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD); else if (rank == 0) MPI_Recv(u + disp[rank] + cnt[rank], N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD, status); else { MPI_Recv(u + disp[rank] + cnt[rank], N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD, status); MPI_Send(u + disp[rank], N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD); } for(i=start;i<end;i++) { for(j=0;j<N2;j++) { float tmp; dx(i,j) = u(i,j<(N2-1)?(j+1):j)-u(i,j)+dx(i,j)-dtildex(i,j) ; dy(i,j) = u(i<(N1-1)?(i+1):i,j)-u(i,j)+dy(i,j)-dtildey(i,j) ; tmp = sqrtf(SQR(crealf(dx(i,j)))+SQR(cimagf(dx(i,j))) + SQR(crealf(dy(i,j)))+SQR(cimagf(dy(i,j)))); tmp = max(0,tmp-Thresh)/(tmp+(tmp<Thresh)); dx_new(i,j) =dx(i,j)*tmp; dy_new(i,j) =dy(i,j)*tmp; dtildex(i,j) = 2*dx_new(i,j) - dx(i,j); dtildey(i,j) = 2*dy_new(i,j) - dy(i,j); dx(i,j) = dx_new(i,j); dy(i,j) = dy_new(i,j); } } //MPI_Allgatherv(dtildey + disp[rank], cnt[rank], mpi_complexf, dtildey, cnt, disp, mpi_complexf, MPI_COMM_WORLD); if (rank == np - 1) MPI_Recv(dtildey + disp[rank] - N2, N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD, status); else if (rank == 0) MPI_Send(dtildey + disp[rank] + cnt[rank] - N2, N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD); else { MPI_Recv(dtildey + disp[rank] - N2, N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD, status); MPI_Send(dtildey + disp[rank] + cnt[rank] - N2, N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD); } } } for(i=start; i<end; i++) { for(j=0; j<N2; j++) { img(i, j) = sqrt(SQR(crealf(u(i, j))) + SQR(cimagf(u(i, j)))); } } MPI_Gatherv(img + disp[rank], cnt[rank], MPI_FLOAT, img, cnt, disp, MPI_FLOAT, 0, MPI_COMM_WORLD); free(w1); free(w2); free(buff); MPI_Finalize(); if (rank > 0) exit(0); return 0; }
int mri( float* img, float complex* f, float* mask, float lambda, int N1, int N2) { int i, j; float complex* f0 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildex = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildey = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u_fft2 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* fftmul = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* Lap = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* diff = (float complex*) calloc(N1*N2,sizeof(float complex)); float sum = 0; for(i=0; i<N1; i++) for(j=0; j<N2; j++) sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1)); float normFactor = 1.f/sqrtf(sum); float scale = sqrtf(N1*N2); for(i=0; i<N1; i++) { for(j=0; j<N2; j++) { f(i, j) = f(i, j)*normFactor; f0(i, j) = f(i, j); } } Lap(N1-1, N2-1) = 0.f; Lap(N1-1, 0) = 1.f; Lap(N1-1, 1) = 0.f; Lap(0, N2-1) = 1.f; Lap(0, 0) = -4.f; Lap(0, 1) = 1.f; Lap(1, N2-1) = 0.f; Lap(1, 0) = 1.f; Lap(1, 1) = 0.f; float complex *w1; float complex *w2; float complex *buff; dft_init(&w1, &w2, &buff, N1, N2); dft(Lap, Lap, w1, w2, buff, N1, N2); for(i=0;i<N1;i++) for(j=0;j<N2;j++) fftmul(i,j) = 1.0/((lambda/Gamma1)*mask(i,j) - Lap(i,j) + Gamma2); int OuterIter,iter; for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) { for(iter = 0; iter<MaxIter; iter++) { for(i=0;i<N1;i++) for(j=0;j<N2;j++) diff(i,j) = dtildex(i,j)-dtildex(i,(j-1)>=0?(j-1):0) + dtildey(i,j)- dtildey((i-1)>=0?(i-1):0,j) ; dft(diff, diff, w1, w2, buff, N1, N2); for(i=0;i<N1;i++) for(j=0;j<N2;j++) u_fft2(i,j) = fftmul(i,j)*(f(i,j)*lambda/Gamma1*scale-diff(i,j)+Gamma2*u_fft2(i,j)) ; idft(u, u_fft2, w1, w2, buff, N1, N2); for(i=0;i<N1;i++) { for(j=0;j<N2;j++) { float tmp; float Thresh=1.0/Gamma1; dx(i,j) = u(i,j<(N2-1)?(j+1):j)-u(i,j)+dx(i,j)-dtildex(i,j) ; dy(i,j) = u(i<(N1-1)?(i+1):i,j)-u(i,j)+dy(i,j)-dtildey(i,j) ; tmp = sqrtf(SQR(crealf(dx(i,j)))+SQR(cimagf(dx(i,j))) + SQR(crealf(dy(i,j)))+SQR(cimagf(dy(i,j)))); tmp = max(0,tmp-Thresh)/(tmp+(tmp<Thresh)); dx_new(i,j) =dx(i,j)*tmp; dy_new(i,j) =dy(i,j)*tmp; dtildex(i,j) = 2*dx_new(i,j) - dx(i,j); dtildey(i,j) = 2*dy_new(i,j) - dy(i,j); dx(i,j) = dx_new(i,j); dy(i,j) = dy_new(i,j); } } } for(i=0;i<N1;i++) { for(j=0;j<N2;j++) { f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale; } } } for(i=0; i<N1; i++) { for(j=0; j<N2; j++) { img(i, j) = sqrt(SQR(crealf(u(i, j))) + SQR(cimagf(u(i, j)))); } } free(w1); free(w2); free(buff); return 0; }
int mri( float* img, float complex* f, float* mask, float lambda, int N1, int N2) { int i, j; // Use this to check the output of each API call cl_int status; // Retrieve the number of platforms cl_uint numPlatforms = 0; status = clGetPlatformIDs(0, NULL, &numPlatforms); // Allocate enough space for each platform cl_platform_id *platforms = NULL; platforms = (cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id)); // Fill in the platforms status = clGetPlatformIDs(numPlatforms, platforms, NULL); // Retrieve the number of devices cl_uint numDevices = 0; status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); // Allocate enough space for each device cl_device_id *devices; devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id)); // Fill in the devices status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); // Create a context and associate it with the devices cl_context context; context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); // Create a command queue and associate it with the device cl_command_queue cmdQueue; cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status); // Create a buffer object that will contain the data // from the host array A float complex* f0 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildex = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildey = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u_fft2 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* fftmul = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* Lap = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* diff = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex *w1 = (float complex*)malloc(((N2-1)*(N2-1)+1)*sizeof(float complex)); float complex *w2 = (float complex*)malloc(((N1-1)*(N1-1)+1)*sizeof(float complex)); float complex *buff = (float complex*)malloc(N2*N1*sizeof(float complex)); Lap(N1-1, N2-1) = 0.f; Lap(N1-1, 0) = 1.f; Lap(N1-1, 1) = 0.f; Lap(0, N2-1) = 1.f; Lap(0, 0) = -4.f; Lap(0, 1) = 1.f; Lap(1, N2-1) = 0.f; Lap(1, 0) = 1.f; Lap(1, 1) = 0.f; cl_mem cl_img = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status); cl_mem cl_mask = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status); cl_mem cl_f = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_f0 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dx = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dy = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dx_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dy_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dtildex = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dtildey = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_u_fft2 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_u = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_fftmul = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_Lap = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_diff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_w1 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N2*N2)*sizeof(cl_float2), NULL, &status); cl_mem cl_w2 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N1*N1)*sizeof(cl_float2), NULL, &status); cl_mem cl_buff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); status = clEnqueueWriteBuffer(cmdQueue, cl_mask, CL_FALSE, 0, N1*N2*sizeof(float), mask, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_f, CL_FALSE, 0, N1*N2*sizeof(cl_float2), f, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_Lap, CL_FALSE, 0, N1*N2*sizeof(cl_float2), Lap, 0, NULL, NULL); cl_program program = clCreateProgramWithSource(context, 1, (const char**)&kernel, NULL, &status); status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL); cl_kernel ker; size_t globalWorkSize[2]={N1,N2}; float sum = 0; for(i=0; i<N1; i++) for(j=0; j<N2; j++) sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1)); float normFactor = 1.f/sqrtf(sum); float scale = sqrtf(N1*N2); ker = clCreateKernel(program, "loop1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0); status = clSetKernelArg(ker, 2, sizeof(cl_float2), &normFactor); status = clSetKernelArg(ker, 3, sizeof(int), &N1); status = clSetKernelArg(ker, 4, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); w1[0] = 1; w2[0] = 1; dft_init(&w1, &w2, &buff, N1, N2); status = clEnqueueWriteBuffer(cmdQueue, cl_w1, CL_FALSE, 0, ((N2-1)*(N2-1)+1)*sizeof(cl_float2), w1, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_w2, CL_FALSE, 0, ((N1-1)*(N1-1)+1)*sizeof(cl_float2), w2, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_buff, CL_FALSE, 0, N1*N2*sizeof(cl_float2), buff, 0, NULL, NULL); ker = clCreateKernel(program, "dft1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); ker = clCreateKernel(program, "dft2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); ker = clCreateKernel(program, "loop2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask); status = clSetKernelArg(ker, 3, sizeof(float), &lambda); status = clSetKernelArg(ker, 4, sizeof(int), &N1); status = clSetKernelArg(ker, 5, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker,2, NULL, globalWorkSize, NULL, 0, NULL, NULL); float complex *tmp = (float complex*)malloc(N2*N1*sizeof(float complex)); float complex *tmp2 = (float complex*)malloc(N2*N1*sizeof(float complex)); int OuterIter,iter; for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) { for(iter = 0; iter<MaxIter; iter++) { ker = clCreateKernel(program, "loop3", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dtildex); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_dtildey); status = clSetKernelArg(ker, 3, sizeof(int), &N1); status = clSetKernelArg(ker, 4, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "dft1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); ker = clCreateKernel(program, "dft2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); //dft(diff, diff, w1, w2, buff, N1, N2); ker = clCreateKernel(program, "loop4", &status); int more = (iter == MaxIter - 1); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 4, sizeof(int), &N1); status = clSetKernelArg(ker, 5, sizeof(int), &N2); status = clSetKernelArg(ker, 6, sizeof(float), &scale); status = clSetKernelArg(ker, 7, sizeof(float), &lambda); status= clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "idft1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "idft2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "loop5", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_dx); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dy); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_dtildex); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_dtildey); status = clSetKernelArg(ker, 5, sizeof(cl_mem), &cl_dx_new); status = clSetKernelArg(ker, 6, sizeof(cl_mem), &cl_dy_new); status = clSetKernelArg(ker, 7, sizeof(int), &N1); status = clSetKernelArg(ker, 8, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); } /* ker = clCreateKernel(program, "last_loop", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 4, sizeof(float), &scale); status = clSetKernelArg(ker, 5, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); */ clEnqueueReadBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL); clEnqueueReadBuffer(cmdQueue, cl_f0, CL_TRUE, 0, N1*N2*sizeof(float), f0, 0, NULL, NULL); clEnqueueReadBuffer(cmdQueue, cl_u_fft2, CL_TRUE, 0, N1*N2*sizeof(float), u_fft2, 0, NULL, NULL); for(i=0;i<N1;i++) { for(j=0;j<N2;j++) { f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale; } } clEnqueueWriteBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL); } ker = clCreateKernel(program, "loop7", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_img); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 2, sizeof(int), &N1); status = clSetKernelArg(ker, 3, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); clEnqueueReadBuffer(cmdQueue, cl_img, CL_TRUE, 0, N1*N2*sizeof(float), img, 0, NULL, NULL); clReleaseKernel(ker); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(cl_img); clReleaseMemObject(cl_mask); clReleaseMemObject(cl_f); clReleaseMemObject(cl_f0); clReleaseMemObject(cl_dx); clReleaseMemObject(cl_dy); clReleaseMemObject(cl_dx_new); clReleaseMemObject(cl_dy_new); clReleaseMemObject(cl_dtildex); clReleaseMemObject(cl_dtildey); clReleaseMemObject(cl_u_fft2); clReleaseMemObject(cl_u); clReleaseMemObject(cl_fftmul); clReleaseMemObject(cl_Lap); clReleaseMemObject(cl_diff); clReleaseMemObject(cl_w1); clReleaseMemObject(cl_w2); clReleaseMemObject(cl_buff); clReleaseContext(context); free(platforms); free(devices); free(w1); free(w2); free(buff); return 0; }