int mri( float* img, float complex* f, float* mask, float lambda, int N1, int N2) { int i, j; float complex* f0 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildex = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildey = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u_fft2 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* fftmul = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* Lap = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* diff = (float complex*) calloc(N1*N2,sizeof(float complex)); float sum = 0; float scale = sqrtf(N1*N2); Lap(N1-1, N2-1) = 0.f; Lap(N1-1, 0) = 1.f; Lap(N1-1, 1) = 0.f; Lap(0, N2-1) = 1.f; Lap(0, 0) = -4.f; Lap(0, 1) = 1.f; Lap(1, N2-1) = 0.f; Lap(1, 0) = 1.f; Lap(1, 1) = 0.f; float complex *w1; float complex *w2; float complex *buff; double lambdaGamma1 = lambda/Gamma1; double lambdaGamma1Scale = lambda/Gamma1*scale; float Thresh=1.0/Gamma1; MPI_Datatype mpi_complexf; MPI_Type_contiguous(2, MPI_FLOAT, &mpi_complexf); MPI_Type_commit(&mpi_complexf); int np, rank; MPI_Comm_size(MPI_COMM_WORLD ,&np); MPI_Comm_rank(MPI_COMM_WORLD ,&rank); int chunksize = N1/np; int start = rank * chunksize; int cnt[np]; int disp[np]; for (i = 0 ; i < np - 1; i ++) { cnt[i] = chunksize * N2 ; disp[i] = chunksize * i * N2; } cnt[i] = (chunksize + N1%np) * N2; disp[i] = chunksize * i * N2; if (rank == np - 1) chunksize += N1%np; int end = start + chunksize; for(i=start; i<chunksize; i++) for(j=0; j<N2; j++) sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1)); MPI_Allreduce(&sum, &sum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); float normFactor = 1.f/sqrtf(sum); for(i=0; i<N1; i++) { for(j=0; j<N2; j++) { f(i, j) = f(i, j)*normFactor; f0(i, j) = f(i, j); } } dft_init(&w1, &w2, &buff, N1, N2); dft(Lap, Lap, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank); MPI_Status *status; for(i=start;i<end;i++) for(j=0;j<N2;j++) fftmul(i,j) = 1.0/((lambda/Gamma1)*mask(i,j) - Lap(i,j) + Gamma2); int OuterIter,iter; for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) { for(iter = 0; iter<MaxIter; iter++) { for(i=start;i<end;i++) for(j=0;j<N2;j++) diff(i,j) = dtildex(i,j)-dtildex(i,(j-1)>=0?(j-1):0) + dtildey(i,j)- dtildey((i-1)>=0?(i-1):0,j) ; dft(diff, diff, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank); for(i=start;i<end;i++) { for(j=0;j<N2;j++) { u_fft2(i,j) = fftmul(i,j)*(f(i,j)*lambdaGamma1Scale-diff(i,j)+Gamma2*u_fft2(i,j)) ; if (iter == MaxIter - 1) f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale; } } idft(u, u_fft2, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank); //MPI_Allgatherv(u + disp[rank], cnt[rank], mpi_complexf, u, cnt, disp, mpi_complexf, MPI_COMM_WORLD); if (rank == np - 1) MPI_Send(u + disp[rank], N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD); else if (rank == 0) MPI_Recv(u + disp[rank] + cnt[rank], N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD, status); else { MPI_Recv(u + disp[rank] + cnt[rank], N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD, status); MPI_Send(u + disp[rank], N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD); } for(i=start;i<end;i++) { for(j=0;j<N2;j++) { float tmp; dx(i,j) = u(i,j<(N2-1)?(j+1):j)-u(i,j)+dx(i,j)-dtildex(i,j) ; dy(i,j) = u(i<(N1-1)?(i+1):i,j)-u(i,j)+dy(i,j)-dtildey(i,j) ; tmp = sqrtf(SQR(crealf(dx(i,j)))+SQR(cimagf(dx(i,j))) + SQR(crealf(dy(i,j)))+SQR(cimagf(dy(i,j)))); tmp = max(0,tmp-Thresh)/(tmp+(tmp<Thresh)); dx_new(i,j) =dx(i,j)*tmp; dy_new(i,j) =dy(i,j)*tmp; dtildex(i,j) = 2*dx_new(i,j) - dx(i,j); dtildey(i,j) = 2*dy_new(i,j) - dy(i,j); dx(i,j) = dx_new(i,j); dy(i,j) = dy_new(i,j); } } //MPI_Allgatherv(dtildey + disp[rank], cnt[rank], mpi_complexf, dtildey, cnt, disp, mpi_complexf, MPI_COMM_WORLD); if (rank == np - 1) MPI_Recv(dtildey + disp[rank] - N2, N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD, status); else if (rank == 0) MPI_Send(dtildey + disp[rank] + cnt[rank] - N2, N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD); else { MPI_Recv(dtildey + disp[rank] - N2, N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD, status); MPI_Send(dtildey + disp[rank] + cnt[rank] - N2, N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD); } } } for(i=start; i<end; i++) { for(j=0; j<N2; j++) { img(i, j) = sqrt(SQR(crealf(u(i, j))) + SQR(cimagf(u(i, j)))); } } MPI_Gatherv(img + disp[rank], cnt[rank], MPI_FLOAT, img, cnt, disp, MPI_FLOAT, 0, MPI_COMM_WORLD); free(w1); free(w2); free(buff); MPI_Finalize(); if (rank > 0) exit(0); return 0; }
int mri( float* img, float complex* f, float* mask, float lambda, int N1, int N2) { int i, j; float complex* f0 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildex = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildey = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u_fft2 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* fftmul = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* Lap = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* diff = (float complex*) calloc(N1*N2,sizeof(float complex)); float sum = 0; for(i=0; i<N1; i++) for(j=0; j<N2; j++) sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1)); float normFactor = 1.f/sqrtf(sum); float scale = sqrtf(N1*N2); for(i=0; i<N1; i++) { for(j=0; j<N2; j++) { f(i, j) = f(i, j)*normFactor; f0(i, j) = f(i, j); } } Lap(N1-1, N2-1) = 0.f; Lap(N1-1, 0) = 1.f; Lap(N1-1, 1) = 0.f; Lap(0, N2-1) = 1.f; Lap(0, 0) = -4.f; Lap(0, 1) = 1.f; Lap(1, N2-1) = 0.f; Lap(1, 0) = 1.f; Lap(1, 1) = 0.f; float complex *w1; float complex *w2; float complex *buff; dft_init(&w1, &w2, &buff, N1, N2); dft(Lap, Lap, w1, w2, buff, N1, N2); for(i=0;i<N1;i++) for(j=0;j<N2;j++) fftmul(i,j) = 1.0/((lambda/Gamma1)*mask(i,j) - Lap(i,j) + Gamma2); int OuterIter,iter; for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) { for(iter = 0; iter<MaxIter; iter++) { for(i=0;i<N1;i++) for(j=0;j<N2;j++) diff(i,j) = dtildex(i,j)-dtildex(i,(j-1)>=0?(j-1):0) + dtildey(i,j)- dtildey((i-1)>=0?(i-1):0,j) ; dft(diff, diff, w1, w2, buff, N1, N2); for(i=0;i<N1;i++) for(j=0;j<N2;j++) u_fft2(i,j) = fftmul(i,j)*(f(i,j)*lambda/Gamma1*scale-diff(i,j)+Gamma2*u_fft2(i,j)) ; idft(u, u_fft2, w1, w2, buff, N1, N2); for(i=0;i<N1;i++) { for(j=0;j<N2;j++) { float tmp; float Thresh=1.0/Gamma1; dx(i,j) = u(i,j<(N2-1)?(j+1):j)-u(i,j)+dx(i,j)-dtildex(i,j) ; dy(i,j) = u(i<(N1-1)?(i+1):i,j)-u(i,j)+dy(i,j)-dtildey(i,j) ; tmp = sqrtf(SQR(crealf(dx(i,j)))+SQR(cimagf(dx(i,j))) + SQR(crealf(dy(i,j)))+SQR(cimagf(dy(i,j)))); tmp = max(0,tmp-Thresh)/(tmp+(tmp<Thresh)); dx_new(i,j) =dx(i,j)*tmp; dy_new(i,j) =dy(i,j)*tmp; dtildex(i,j) = 2*dx_new(i,j) - dx(i,j); dtildey(i,j) = 2*dy_new(i,j) - dy(i,j); dx(i,j) = dx_new(i,j); dy(i,j) = dy_new(i,j); } } } for(i=0;i<N1;i++) { for(j=0;j<N2;j++) { f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale; } } } for(i=0; i<N1; i++) { for(j=0; j<N2; j++) { img(i, j) = sqrt(SQR(crealf(u(i, j))) + SQR(cimagf(u(i, j)))); } } free(w1); free(w2); free(buff); return 0; }
int mri( float* img, float complex* f, float* mask, float lambda, int N1, int N2) { int i, j; // Use this to check the output of each API call cl_int status; // Retrieve the number of platforms cl_uint numPlatforms = 0; status = clGetPlatformIDs(0, NULL, &numPlatforms); // Allocate enough space for each platform cl_platform_id *platforms = NULL; platforms = (cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id)); // Fill in the platforms status = clGetPlatformIDs(numPlatforms, platforms, NULL); // Retrieve the number of devices cl_uint numDevices = 0; status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); // Allocate enough space for each device cl_device_id *devices; devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id)); // Fill in the devices status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); // Create a context and associate it with the devices cl_context context; context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); // Create a command queue and associate it with the device cl_command_queue cmdQueue; cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status); // Create a buffer object that will contain the data // from the host array A float complex* f0 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildex = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildey = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u_fft2 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* fftmul = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* Lap = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* diff = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex *w1 = (float complex*)malloc(((N2-1)*(N2-1)+1)*sizeof(float complex)); float complex *w2 = (float complex*)malloc(((N1-1)*(N1-1)+1)*sizeof(float complex)); float complex *buff = (float complex*)malloc(N2*N1*sizeof(float complex)); Lap(N1-1, N2-1) = 0.f; Lap(N1-1, 0) = 1.f; Lap(N1-1, 1) = 0.f; Lap(0, N2-1) = 1.f; Lap(0, 0) = -4.f; Lap(0, 1) = 1.f; Lap(1, N2-1) = 0.f; Lap(1, 0) = 1.f; Lap(1, 1) = 0.f; cl_mem cl_img = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status); cl_mem cl_mask = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status); cl_mem cl_f = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_f0 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dx = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dy = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dx_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dy_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dtildex = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dtildey = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_u_fft2 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_u = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_fftmul = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_Lap = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_diff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_w1 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N2*N2)*sizeof(cl_float2), NULL, &status); cl_mem cl_w2 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N1*N1)*sizeof(cl_float2), NULL, &status); cl_mem cl_buff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); status = clEnqueueWriteBuffer(cmdQueue, cl_mask, CL_FALSE, 0, N1*N2*sizeof(float), mask, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_f, CL_FALSE, 0, N1*N2*sizeof(cl_float2), f, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_Lap, CL_FALSE, 0, N1*N2*sizeof(cl_float2), Lap, 0, NULL, NULL); cl_program program = clCreateProgramWithSource(context, 1, (const char**)&kernel, NULL, &status); status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL); cl_kernel ker; size_t globalWorkSize[2]={N1,N2}; float sum = 0; for(i=0; i<N1; i++) for(j=0; j<N2; j++) sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1)); float normFactor = 1.f/sqrtf(sum); float scale = sqrtf(N1*N2); ker = clCreateKernel(program, "loop1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0); status = clSetKernelArg(ker, 2, sizeof(cl_float2), &normFactor); status = clSetKernelArg(ker, 3, sizeof(int), &N1); status = clSetKernelArg(ker, 4, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); w1[0] = 1; w2[0] = 1; dft_init(&w1, &w2, &buff, N1, N2); status = clEnqueueWriteBuffer(cmdQueue, cl_w1, CL_FALSE, 0, ((N2-1)*(N2-1)+1)*sizeof(cl_float2), w1, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_w2, CL_FALSE, 0, ((N1-1)*(N1-1)+1)*sizeof(cl_float2), w2, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_buff, CL_FALSE, 0, N1*N2*sizeof(cl_float2), buff, 0, NULL, NULL); ker = clCreateKernel(program, "dft1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); ker = clCreateKernel(program, "dft2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); ker = clCreateKernel(program, "loop2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask); status = clSetKernelArg(ker, 3, sizeof(float), &lambda); status = clSetKernelArg(ker, 4, sizeof(int), &N1); status = clSetKernelArg(ker, 5, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker,2, NULL, globalWorkSize, NULL, 0, NULL, NULL); float complex *tmp = (float complex*)malloc(N2*N1*sizeof(float complex)); float complex *tmp2 = (float complex*)malloc(N2*N1*sizeof(float complex)); int OuterIter,iter; for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) { for(iter = 0; iter<MaxIter; iter++) { ker = clCreateKernel(program, "loop3", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dtildex); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_dtildey); status = clSetKernelArg(ker, 3, sizeof(int), &N1); status = clSetKernelArg(ker, 4, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "dft1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); ker = clCreateKernel(program, "dft2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); //dft(diff, diff, w1, w2, buff, N1, N2); ker = clCreateKernel(program, "loop4", &status); int more = (iter == MaxIter - 1); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 4, sizeof(int), &N1); status = clSetKernelArg(ker, 5, sizeof(int), &N2); status = clSetKernelArg(ker, 6, sizeof(float), &scale); status = clSetKernelArg(ker, 7, sizeof(float), &lambda); status= clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "idft1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "idft2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "loop5", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_dx); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dy); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_dtildex); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_dtildey); status = clSetKernelArg(ker, 5, sizeof(cl_mem), &cl_dx_new); status = clSetKernelArg(ker, 6, sizeof(cl_mem), &cl_dy_new); status = clSetKernelArg(ker, 7, sizeof(int), &N1); status = clSetKernelArg(ker, 8, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); } /* ker = clCreateKernel(program, "last_loop", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 4, sizeof(float), &scale); status = clSetKernelArg(ker, 5, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); */ clEnqueueReadBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL); clEnqueueReadBuffer(cmdQueue, cl_f0, CL_TRUE, 0, N1*N2*sizeof(float), f0, 0, NULL, NULL); clEnqueueReadBuffer(cmdQueue, cl_u_fft2, CL_TRUE, 0, N1*N2*sizeof(float), u_fft2, 0, NULL, NULL); for(i=0;i<N1;i++) { for(j=0;j<N2;j++) { f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale; } } clEnqueueWriteBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL); } ker = clCreateKernel(program, "loop7", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_img); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 2, sizeof(int), &N1); status = clSetKernelArg(ker, 3, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); clEnqueueReadBuffer(cmdQueue, cl_img, CL_TRUE, 0, N1*N2*sizeof(float), img, 0, NULL, NULL); clReleaseKernel(ker); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(cl_img); clReleaseMemObject(cl_mask); clReleaseMemObject(cl_f); clReleaseMemObject(cl_f0); clReleaseMemObject(cl_dx); clReleaseMemObject(cl_dy); clReleaseMemObject(cl_dx_new); clReleaseMemObject(cl_dy_new); clReleaseMemObject(cl_dtildex); clReleaseMemObject(cl_dtildey); clReleaseMemObject(cl_u_fft2); clReleaseMemObject(cl_u); clReleaseMemObject(cl_fftmul); clReleaseMemObject(cl_Lap); clReleaseMemObject(cl_diff); clReleaseMemObject(cl_w1); clReleaseMemObject(cl_w2); clReleaseMemObject(cl_buff); clReleaseContext(context); free(platforms); free(devices); free(w1); free(w2); free(buff); return 0; }