コード例 #1
0
ファイル: mri.c プロジェクト: shangchying/root
int mri(
		float* img, 
		float complex* f, 
		float* mask, 
		float lambda,
		int N1,
		int N2)
{
	int i, j;

	float complex* f0	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dx	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy	    = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dx_new   = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy_new   = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dtildex	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dtildey	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u_fft2	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* fftmul	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* Lap		= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* diff		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float sum = 0;
	float scale		 = sqrtf(N1*N2);
	Lap(N1-1, N2-1)	= 0.f;
	Lap(N1-1, 0)	= 1.f; 
	Lap(N1-1, 1)	= 0.f;
	Lap(0, N2-1)	= 1.f;
	Lap(0, 0)		= -4.f; 
	Lap(0, 1)		= 1.f;
	Lap(1, N2-1)	= 0.f;
	Lap(1, 0)		= 1.f; 
	Lap(1, 1)		= 0.f;

	float complex *w1;
	float complex *w2;
	float complex *buff;
	double lambdaGamma1 = lambda/Gamma1;
	double lambdaGamma1Scale = lambda/Gamma1*scale;
	float Thresh=1.0/Gamma1;
	
	MPI_Datatype mpi_complexf;
	MPI_Type_contiguous(2, MPI_FLOAT, &mpi_complexf);
	MPI_Type_commit(&mpi_complexf);
	
	int np, rank;
    MPI_Comm_size(MPI_COMM_WORLD ,&np);
    MPI_Comm_rank(MPI_COMM_WORLD ,&rank);
    int chunksize = N1/np;
    int start = rank * chunksize;
    int cnt[np];
	int disp[np];
	for (i = 0 ; i < np - 1; i ++) {
		cnt[i] = chunksize * N2 ;
		disp[i] = chunksize * i * N2;
	}
	cnt[i] = (chunksize + N1%np) * N2;
	disp[i] = chunksize * i * N2;

    if (rank == np - 1)
    	chunksize += N1%np;
    int end = start + chunksize;


	for(i=start; i<chunksize; i++)
		for(j=0; j<N2; j++)
			sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1));
	MPI_Allreduce(&sum, &sum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);		
	float normFactor = 1.f/sqrtf(sum);
	

	for(i=0; i<N1; i++) {
		for(j=0; j<N2; j++) {
			f(i, j)  = f(i, j)*normFactor;
			f0(i, j) = f(i, j);
		}
	}
	

	dft_init(&w1, &w2, &buff, N1, N2);
	dft(Lap, Lap, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank);
	MPI_Status *status;

	for(i=start;i<end;i++)
		for(j=0;j<N2;j++)					
			fftmul(i,j) = 1.0/((lambda/Gamma1)*mask(i,j) - Lap(i,j) + Gamma2);
			
	int OuterIter,iter;
	for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) {
		for(iter = 0; iter<MaxIter; iter++) {
			
			for(i=start;i<end;i++)	
				for(j=0;j<N2;j++)
					diff(i,j)  = dtildex(i,j)-dtildex(i,(j-1)>=0?(j-1):0) + dtildey(i,j)- dtildey((i-1)>=0?(i-1):0,j) ;
			
			dft(diff, diff, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank);

			
			for(i=start;i<end;i++) {
				for(j=0;j<N2;j++) {
					u_fft2(i,j) = fftmul(i,j)*(f(i,j)*lambdaGamma1Scale-diff(i,j)+Gamma2*u_fft2(i,j)) ;
					if (iter == MaxIter - 1)
						f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale; 
				}
			}
			
			idft(u, u_fft2, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank);
			//MPI_Allgatherv(u + disp[rank], cnt[rank], mpi_complexf, u, cnt, disp, mpi_complexf, MPI_COMM_WORLD);
			if (rank == np - 1)
				MPI_Send(u + disp[rank], N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD);
			else if (rank == 0)
				MPI_Recv(u + disp[rank] + cnt[rank], N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD, status);
			else {
				MPI_Recv(u + disp[rank] + cnt[rank], N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD, status);
				MPI_Send(u + disp[rank], N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD);
			}
			
			for(i=start;i<end;i++) {
				for(j=0;j<N2;j++) {
					float tmp;
					dx(i,j)     = u(i,j<(N2-1)?(j+1):j)-u(i,j)+dx(i,j)-dtildex(i,j) ;
					dy(i,j)     = u(i<(N1-1)?(i+1):i,j)-u(i,j)+dy(i,j)-dtildey(i,j) ;

					tmp = sqrtf(SQR(crealf(dx(i,j)))+SQR(cimagf(dx(i,j))) + SQR(crealf(dy(i,j)))+SQR(cimagf(dy(i,j))));
					tmp = max(0,tmp-Thresh)/(tmp+(tmp<Thresh));
					dx_new(i,j) =dx(i,j)*tmp;
					dy_new(i,j) =dy(i,j)*tmp;
					dtildex(i,j) = 2*dx_new(i,j) - dx(i,j);
					dtildey(i,j) = 2*dy_new(i,j) - dy(i,j);
					dx(i,j)      = dx_new(i,j);
					dy(i,j)      = dy_new(i,j);
				}
			}
			//MPI_Allgatherv(dtildey + disp[rank], cnt[rank], mpi_complexf, dtildey, cnt, disp, mpi_complexf, MPI_COMM_WORLD);
			if (rank == np - 1)
				MPI_Recv(dtildey + disp[rank] - N2, N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD, status);
			else if (rank == 0)
				MPI_Send(dtildey + disp[rank] + cnt[rank] - N2, N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD);
			else {
				MPI_Recv(dtildey + disp[rank] - N2, N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD, status);
				MPI_Send(dtildey + disp[rank] + cnt[rank] - N2, N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD);
			}
            
		}
	}

	for(i=start; i<end; i++) {
		for(j=0; j<N2; j++) {
			img(i, j) = sqrt(SQR(crealf(u(i, j))) + SQR(cimagf(u(i, j))));
		}
	}
	MPI_Gatherv(img + disp[rank], cnt[rank], MPI_FLOAT, img, cnt, disp, MPI_FLOAT, 0, MPI_COMM_WORLD);
	free(w1);
	free(w2);
	free(buff);
	MPI_Finalize();
	if (rank > 0)
		exit(0);
	return 0;
}
コード例 #2
0
ファイル: mri_mpi.c プロジェクト: samwarring/CS133Project
int mri(
		float* img, 
		float complex* f, 
		float* mask, 
		float lambda,
		int N1,
		int N2)
{
	int i, j;

	float complex* f0	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dx	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy	    = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dx_new   = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy_new   = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dtildex	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dtildey	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u_fft2	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* fftmul	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* Lap		= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* diff		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float sum = 0;

	for(i=0; i<N1; i++)
		for(j=0; j<N2; j++)
			sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1));

	float normFactor = 1.f/sqrtf(sum);
	float scale		 = sqrtf(N1*N2);

	for(i=0; i<N1; i++) {
		for(j=0; j<N2; j++) {
			f(i, j)  = f(i, j)*normFactor;
			f0(i, j) = f(i, j);
		}
	}
	Lap(N1-1, N2-1)	= 0.f;
	Lap(N1-1, 0)	= 1.f; 
	Lap(N1-1, 1)	= 0.f;
	Lap(0, N2-1)	= 1.f;
	Lap(0, 0)		= -4.f; 
	Lap(0, 1)		= 1.f;
	Lap(1, N2-1)	= 0.f;
	Lap(1, 0)		= 1.f; 
	Lap(1, 1)		= 0.f;

	float complex *w1;
	float complex *w2;
	float complex *buff;

	dft_init(&w1, &w2, &buff, N1, N2);
	dft(Lap, Lap, w1, w2, buff, N1, N2);

	for(i=0;i<N1;i++)
		for(j=0;j<N2;j++)					
			fftmul(i,j) = 1.0/((lambda/Gamma1)*mask(i,j) - Lap(i,j) + Gamma2);

	int OuterIter,iter;
	for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) {
		for(iter = 0; iter<MaxIter; iter++) {

			for(i=0;i<N1;i++)	
				for(j=0;j<N2;j++)
					diff(i,j)  = dtildex(i,j)-dtildex(i,(j-1)>=0?(j-1):0) + dtildey(i,j)- dtildey((i-1)>=0?(i-1):0,j) ;

			dft(diff, diff, w1, w2, buff, N1, N2);

			for(i=0;i<N1;i++)
				for(j=0;j<N2;j++)
					u_fft2(i,j) = fftmul(i,j)*(f(i,j)*lambda/Gamma1*scale-diff(i,j)+Gamma2*u_fft2(i,j)) ;

			idft(u, u_fft2, w1, w2, buff, N1, N2);

			for(i=0;i<N1;i++) {
				for(j=0;j<N2;j++) {
					float tmp;
					float Thresh=1.0/Gamma1;

					dx(i,j)     = u(i,j<(N2-1)?(j+1):j)-u(i,j)+dx(i,j)-dtildex(i,j) ;
					dy(i,j)     = u(i<(N1-1)?(i+1):i,j)-u(i,j)+dy(i,j)-dtildey(i,j) ;

					tmp = sqrtf(SQR(crealf(dx(i,j)))+SQR(cimagf(dx(i,j))) + SQR(crealf(dy(i,j)))+SQR(cimagf(dy(i,j))));
					tmp = max(0,tmp-Thresh)/(tmp+(tmp<Thresh));
					dx_new(i,j) =dx(i,j)*tmp;
					dy_new(i,j) =dy(i,j)*tmp;
					dtildex(i,j) = 2*dx_new(i,j) - dx(i,j);
					dtildey(i,j) = 2*dy_new(i,j) - dy(i,j);
					dx(i,j)      = dx_new(i,j);
					dy(i,j)      = dy_new(i,j);
				}
			}
		}
		for(i=0;i<N1;i++) {
			for(j=0;j<N2;j++) {
				f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale;  
			}
		}
	}

	for(i=0; i<N1; i++) {
		for(j=0; j<N2; j++) {
			img(i, j) = sqrt(SQR(crealf(u(i, j))) + SQR(cimagf(u(i, j))));
		}
	}

	free(w1);
	free(w2);
	free(buff);
	return 0;
}
コード例 #3
0
ファイル: mri.c プロジェクト: shangchying/root
int mri(
		float* img, 
		float complex* f, 
		float* mask, 
		float lambda,
		int N1,
		int N2)
{
	int i, j;

    // Use this to check the output of each API call
    cl_int status;

    // Retrieve the number of platforms
    cl_uint numPlatforms = 0;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);

    // Allocate enough space for each platform
    cl_platform_id *platforms = NULL;
    platforms = (cl_platform_id*)malloc(
        numPlatforms*sizeof(cl_platform_id));

    // Fill in the platforms
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    // Retrieve the number of devices
    cl_uint numDevices = 0;
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0,
        NULL, &numDevices);

    // Allocate enough space for each device
    cl_device_id *devices;
    devices = (cl_device_id*)malloc(
        numDevices*sizeof(cl_device_id));

    // Fill in the devices
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL,
        numDevices, devices, NULL);

    // Create a context and associate it with the devices
    cl_context context;
    context = clCreateContext(NULL, numDevices, devices, NULL,
        NULL, &status);

    // Create a command queue and associate it with the device
    cl_command_queue cmdQueue;
    cmdQueue = clCreateCommandQueue(context, devices[0], 0,
        &status);

    // Create a buffer object that will contain the data
    // from the host array A
        
	float complex* f0	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dx	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy	    = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dx_new   = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy_new   = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dtildex	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dtildey	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u_fft2	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* fftmul	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* Lap		= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* diff		= (float complex*) calloc(N1*N2,sizeof(float complex));
    float complex *w1 = (float complex*)malloc(((N2-1)*(N2-1)+1)*sizeof(float complex));
	float complex *w2 = (float complex*)malloc(((N1-1)*(N1-1)+1)*sizeof(float complex));
	float complex *buff = (float complex*)malloc(N2*N1*sizeof(float complex));
       
    Lap(N1-1, N2-1)	= 0.f;
	Lap(N1-1, 0)	= 1.f; 
	Lap(N1-1, 1)	= 0.f;
	Lap(0, N2-1)	= 1.f;
	Lap(0, 0)		= -4.f; 
	Lap(0, 1)		= 1.f;
	Lap(1, N2-1)	= 0.f;
	Lap(1, 0)		= 1.f; 
	Lap(1, 1)		= 0.f;

    cl_mem cl_img = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status);
    cl_mem cl_mask = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status);
    cl_mem cl_f = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
    cl_mem cl_f0 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dx = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dy = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

	cl_mem cl_dx_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dy_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

	cl_mem cl_dtildex = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dtildey = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_u_fft2 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_u = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

	cl_mem cl_fftmul = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_Lap = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_diff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
    
    cl_mem cl_w1 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N2*N2)*sizeof(cl_float2), NULL, &status);
    cl_mem cl_w2 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N1*N1)*sizeof(cl_float2), NULL, &status);
    cl_mem cl_buff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

    status = clEnqueueWriteBuffer(cmdQueue, cl_mask, CL_FALSE, 0, N1*N2*sizeof(float), mask, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_f, CL_FALSE, 0, N1*N2*sizeof(cl_float2), f, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_Lap, CL_FALSE, 0, N1*N2*sizeof(cl_float2), Lap, 0, NULL, NULL);
        
	cl_program program = clCreateProgramWithSource(context, 1, 
        (const char**)&kernel, NULL, &status);
        
    status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
	cl_kernel ker;
	size_t globalWorkSize[2]={N1,N2};
	
	float sum = 0;

	for(i=0; i<N1; i++)
		for(j=0; j<N2; j++)
			sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1));
            
	float normFactor = 1.f/sqrtf(sum);
	float scale		 = sqrtf(N1*N2);

    ker = clCreateKernel(program, "loop1", &status);

    
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0);
    status = clSetKernelArg(ker, 2, sizeof(cl_float2), &normFactor);
    status = clSetKernelArg(ker, 3, sizeof(int), &N1);
    status = clSetKernelArg(ker, 4, sizeof(int), &N2);
    
    
    
    
    status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
    w1[0] = 1;
	w2[0] = 1;
	dft_init(&w1, &w2, &buff, N1, N2);
    status = clEnqueueWriteBuffer(cmdQueue, cl_w1, CL_FALSE, 0, ((N2-1)*(N2-1)+1)*sizeof(cl_float2), w1, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_w2, CL_FALSE, 0, ((N1-1)*(N1-1)+1)*sizeof(cl_float2), w2, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_buff, CL_FALSE, 0, N1*N2*sizeof(cl_float2), buff, 0, NULL, NULL);

    ker = clCreateKernel(program, "dft1", &status);
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
    status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
    status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
    status = clSetKernelArg(ker, 5, sizeof(int), &N1);
    status = clSetKernelArg(ker, 6, sizeof(int), &N2);
    status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
                   if (status != CL_SUCCESS)
            	printf("error: %d\n", status); 
    ker = clCreateKernel(program, "dft2", &status);
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
    status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
    status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
    status = clSetKernelArg(ker, 5, sizeof(int), &N1);
    status = clSetKernelArg(ker, 6, sizeof(int), &N2);
    status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 
               if (status != CL_SUCCESS)
            	printf("error: %d\n", status); 
    ker = clCreateKernel(program, "loop2", &status);
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask);
    status = clSetKernelArg(ker, 3, sizeof(float), &lambda);
    status = clSetKernelArg(ker, 4, sizeof(int), &N1);
    status = clSetKernelArg(ker, 5, sizeof(int), &N2);
    status = clEnqueueNDRangeKernel(cmdQueue, ker,2, NULL, globalWorkSize, NULL, 0, NULL, NULL);    
    
    float complex *tmp = (float complex*)malloc(N2*N1*sizeof(float complex));
    float complex *tmp2 = (float complex*)malloc(N2*N1*sizeof(float complex));
    
    
    
	int OuterIter,iter;
	for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) {
		for(iter = 0; iter<MaxIter; iter++) {
            ker = clCreateKernel(program, "loop3", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dtildex);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_dtildey);
            status = clSetKernelArg(ker, 3, sizeof(int), &N1);
            status = clSetKernelArg(ker, 4, sizeof(int), &N2);

            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 

            ker = clCreateKernel(program, "dft1", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
             if (status != CL_SUCCESS)
            	printf("error: %d\n", status);
            	
            ker = clCreateKernel(program, "dft2", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 
            if (status != CL_SUCCESS)
            	printf("error: %d\n", status);
			//dft(diff, diff, w1, w2, buff, N1, N2);

            
            ker = clCreateKernel(program, "loop4", &status);
            int more = (iter == MaxIter - 1);

            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 4, sizeof(int), &N1);
            status = clSetKernelArg(ker, 5, sizeof(int), &N2);
            status = clSetKernelArg(ker, 6, sizeof(float), &scale);
            status = clSetKernelArg(ker, 7, sizeof(float), &lambda);
            status= clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 
            
            ker = clCreateKernel(program, "idft1", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
            
            ker = clCreateKernel(program, "idft2", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
            
         
            ker = clCreateKernel(program, "loop5", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_dx);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dy);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_dtildex);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_dtildey);
            status = clSetKernelArg(ker, 5, sizeof(cl_mem), &cl_dx_new);
            status = clSetKernelArg(ker, 6, sizeof(cl_mem), &cl_dy_new);
            status = clSetKernelArg(ker, 7, sizeof(int), &N1);
            status = clSetKernelArg(ker, 8, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);

            
		}
        /*
          ker = clCreateKernel(program, "last_loop", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 4, sizeof(float), &scale);
            status = clSetKernelArg(ker, 5, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
          if (status != CL_SUCCESS)
            	printf("error: %d\n", status);
          
          */
        clEnqueueReadBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL);    
        clEnqueueReadBuffer(cmdQueue, cl_f0, CL_TRUE, 0, N1*N2*sizeof(float), f0, 0, NULL, NULL);
        clEnqueueReadBuffer(cmdQueue, cl_u_fft2, CL_TRUE, 0, N1*N2*sizeof(float), u_fft2, 0, NULL, NULL);
        
        for(i=0;i<N1;i++) {
			for(j=0;j<N2;j++) {
				f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale;  
			}
		}
        
        clEnqueueWriteBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL);
        
       
	}
    
    
            ker = clCreateKernel(program, "loop7", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_img);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 2, sizeof(int), &N1);
            status = clSetKernelArg(ker, 3, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 


    clEnqueueReadBuffer(cmdQueue, cl_img, CL_TRUE, 0, N1*N2*sizeof(float), img, 0, NULL, NULL);
    clReleaseKernel(ker);
    clReleaseProgram(program);
    clReleaseCommandQueue(cmdQueue);
    clReleaseMemObject(cl_img);
    clReleaseMemObject(cl_mask);
    clReleaseMemObject(cl_f);
    clReleaseMemObject(cl_f0);
    clReleaseMemObject(cl_dx);
    clReleaseMemObject(cl_dy);
    clReleaseMemObject(cl_dx_new);
    clReleaseMemObject(cl_dy_new);
    clReleaseMemObject(cl_dtildex);
    clReleaseMemObject(cl_dtildey);
    clReleaseMemObject(cl_u_fft2);
    clReleaseMemObject(cl_u);
    clReleaseMemObject(cl_fftmul);
    clReleaseMemObject(cl_Lap);
    clReleaseMemObject(cl_diff);
    clReleaseMemObject(cl_w1);
    clReleaseMemObject(cl_w2);
    clReleaseMemObject(cl_buff);
    
    clReleaseContext(context);
    free(platforms);
    free(devices);
	free(w1);
	free(w2);
	free(buff);
	return 0;
}