Beispiel #1
0
int main() {

    int i;
    unsigned int *order;

    dft_init(6);


    dft_thread_create(sink);
    dft_thread_create(muxer);
    dft_thread_create(intermediate);
    dft_thread_create(intermediate);
    dft_thread_create(doubler);
    dft_thread_create(source);
    dft_thread_link(5, 4);
    dft_thread_link(4, 3);
    dft_thread_link(4, 2);
    dft_thread_link(3, 1);
    dft_thread_link(2, 1);
    dft_thread_link(1, 0);

    dft_execute();

    return 0;
}
Beispiel #2
0
int main(int argc, char **argv)
{
  if(argc!=2)
    {
      printf("Usage:- %s <PREC>.\n",argv[0]);
      exit(0);
    }

  uint64_t prec; // how many bits do we want MPFI to use?
  prec=atoi(argv[1]);
  if((prec<MPFR_PREC_MIN)||(prec>MPFR_PREC_MAX))
   {
      printf("Usage:- %s <PREC>.\n",argv[0]);
      exit(0);
    }
  dft_init(prec);

#define Q (5*37*41)
#define PHI_Q (4*36*40)

  mpfi_c_t vec[PHI_Q];
  uint64_t dims[3]={4,36,40};

  uint64_t i;
  for(i=0;i<PHI_Q;i++)
    {
      mpfi_c_init(vec[i]);
      mpfi_c_set_ui(vec[i],rand(),0);
    }

  //
  // this will do 36*40 length 4 dft's of i, i+36*40, i+2*36*40 and i+3*36+40
  // then 4*40 length 36 dfts
  // then 4*36 length 40 dfts
  //

  ndft(vec,PHI_Q,3,dims);

  mpfi_c_print_str("res[100]: ",vec[100]);

  return(0);
}
Beispiel #3
0
int main(void)
{
	int i;
	const int numDFTs0 = 1;
	const int numDFTs1 = 10;
	const int numDFTs2 = 100;
    dft_init();  //  Initialize the sliding DFT
    
	printf("With %d channels, k=%d, N=%d:\n", M_SLIDING_DFT_nchannels, M_SLIDING_DFT_k, M_SLIDING_DFT_N);
	
	memset(sample, 0, sizeof(adcdata_t) * M_SLIDING_DFT_nchannels);
	
	resetTimer();
	T1CONbits.TON = 1;
	for (i = 0 ; i < numDFTs0 ; i ++)
		dft_update();
	T1CONbits.TON = 0;
	
	printf("  Ran %d DFTs in %d clock cycles.\n", numDFTs0, TMR1);
	printf("    %d cycles per channel\n", (int)((float)TMR1/(M_SLIDING_DFT_nchannels*numDFTs0)));
	
	resetTimer();
	T1CONbits.TON = 1;
	for (i = 0 ; i < numDFTs1 ; i ++)
		dft_update();
	T1CONbits.TON = 0;
	
	printf("  Ran %d DFTs in %d clock cycles.\n", numDFTs1, TMR1);
	printf("    %d cycles per channel\n", (int)((float)TMR1/(M_SLIDING_DFT_nchannels*numDFTs1)));
	
	resetTimer();
	T1CONbits.TON = 1;
	for (i = 0 ; i < numDFTs2 ; i ++)
		dft_update();
	T1CONbits.TON = 0;
	
	printf("  Ran %d DFTs in %d clock cycles.\n", numDFTs2, TMR1);
	printf("    %d cycles per channel\n", (int)((float)TMR1/(M_SLIDING_DFT_nchannels*numDFTs2)));
	
	return 0;
}
Beispiel #4
0
int mri(
		float* img, 
		float complex* f, 
		float* mask, 
		float lambda,
		int N1,
		int N2)
{
	int i, j;

	float complex* f0	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dx	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy	    = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dx_new   = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy_new   = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dtildex	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dtildey	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u_fft2	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* fftmul	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* Lap		= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* diff		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float sum = 0;

	for(i=0; i<N1; i++)
		for(j=0; j<N2; j++)
			sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1));

	float normFactor = 1.f/sqrtf(sum);
	float scale		 = sqrtf(N1*N2);

	for(i=0; i<N1; i++) {
		for(j=0; j<N2; j++) {
			f(i, j)  = f(i, j)*normFactor;
			f0(i, j) = f(i, j);
		}
	}
	Lap(N1-1, N2-1)	= 0.f;
	Lap(N1-1, 0)	= 1.f; 
	Lap(N1-1, 1)	= 0.f;
	Lap(0, N2-1)	= 1.f;
	Lap(0, 0)		= -4.f; 
	Lap(0, 1)		= 1.f;
	Lap(1, N2-1)	= 0.f;
	Lap(1, 0)		= 1.f; 
	Lap(1, 1)		= 0.f;

	float complex *w1;
	float complex *w2;
	float complex *buff;

	dft_init(&w1, &w2, &buff, N1, N2);
	dft(Lap, Lap, w1, w2, buff, N1, N2);

	for(i=0;i<N1;i++)
		for(j=0;j<N2;j++)					
			fftmul(i,j) = 1.0/((lambda/Gamma1)*mask(i,j) - Lap(i,j) + Gamma2);

	int OuterIter,iter;
	for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) {
		for(iter = 0; iter<MaxIter; iter++) {

			for(i=0;i<N1;i++)	
				for(j=0;j<N2;j++)
					diff(i,j)  = dtildex(i,j)-dtildex(i,(j-1)>=0?(j-1):0) + dtildey(i,j)- dtildey((i-1)>=0?(i-1):0,j) ;

			dft(diff, diff, w1, w2, buff, N1, N2);

			for(i=0;i<N1;i++)
				for(j=0;j<N2;j++)
					u_fft2(i,j) = fftmul(i,j)*(f(i,j)*lambda/Gamma1*scale-diff(i,j)+Gamma2*u_fft2(i,j)) ;

			idft(u, u_fft2, w1, w2, buff, N1, N2);

			for(i=0;i<N1;i++) {
				for(j=0;j<N2;j++) {
					float tmp;
					float Thresh=1.0/Gamma1;

					dx(i,j)     = u(i,j<(N2-1)?(j+1):j)-u(i,j)+dx(i,j)-dtildex(i,j) ;
					dy(i,j)     = u(i<(N1-1)?(i+1):i,j)-u(i,j)+dy(i,j)-dtildey(i,j) ;

					tmp = sqrtf(SQR(crealf(dx(i,j)))+SQR(cimagf(dx(i,j))) + SQR(crealf(dy(i,j)))+SQR(cimagf(dy(i,j))));
					tmp = max(0,tmp-Thresh)/(tmp+(tmp<Thresh));
					dx_new(i,j) =dx(i,j)*tmp;
					dy_new(i,j) =dy(i,j)*tmp;
					dtildex(i,j) = 2*dx_new(i,j) - dx(i,j);
					dtildey(i,j) = 2*dy_new(i,j) - dy(i,j);
					dx(i,j)      = dx_new(i,j);
					dy(i,j)      = dy_new(i,j);
				}
			}
		}
		for(i=0;i<N1;i++) {
			for(j=0;j<N2;j++) {
				f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale;  
			}
		}
	}

	for(i=0; i<N1; i++) {
		for(j=0; j<N2; j++) {
			img(i, j) = sqrt(SQR(crealf(u(i, j))) + SQR(cimagf(u(i, j))));
		}
	}

	free(w1);
	free(w2);
	free(buff);
	return 0;
}
Beispiel #5
0
int mri(
		float* img, 
		float complex* f, 
		float* mask, 
		float lambda,
		int N1,
		int N2)
{
	int i, j;

	float complex* f0	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dx	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy	    = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dx_new   = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy_new   = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dtildex	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dtildey	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u_fft2	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* fftmul	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* Lap		= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* diff		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float sum = 0;
	float scale		 = sqrtf(N1*N2);
	Lap(N1-1, N2-1)	= 0.f;
	Lap(N1-1, 0)	= 1.f; 
	Lap(N1-1, 1)	= 0.f;
	Lap(0, N2-1)	= 1.f;
	Lap(0, 0)		= -4.f; 
	Lap(0, 1)		= 1.f;
	Lap(1, N2-1)	= 0.f;
	Lap(1, 0)		= 1.f; 
	Lap(1, 1)		= 0.f;

	float complex *w1;
	float complex *w2;
	float complex *buff;
	double lambdaGamma1 = lambda/Gamma1;
	double lambdaGamma1Scale = lambda/Gamma1*scale;
	float Thresh=1.0/Gamma1;
	
	MPI_Datatype mpi_complexf;
	MPI_Type_contiguous(2, MPI_FLOAT, &mpi_complexf);
	MPI_Type_commit(&mpi_complexf);
	
	int np, rank;
    MPI_Comm_size(MPI_COMM_WORLD ,&np);
    MPI_Comm_rank(MPI_COMM_WORLD ,&rank);
    int chunksize = N1/np;
    int start = rank * chunksize;
    int cnt[np];
	int disp[np];
	for (i = 0 ; i < np - 1; i ++) {
		cnt[i] = chunksize * N2 ;
		disp[i] = chunksize * i * N2;
	}
	cnt[i] = (chunksize + N1%np) * N2;
	disp[i] = chunksize * i * N2;

    if (rank == np - 1)
    	chunksize += N1%np;
    int end = start + chunksize;


	for(i=start; i<chunksize; i++)
		for(j=0; j<N2; j++)
			sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1));
	MPI_Allreduce(&sum, &sum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);		
	float normFactor = 1.f/sqrtf(sum);
	

	for(i=0; i<N1; i++) {
		for(j=0; j<N2; j++) {
			f(i, j)  = f(i, j)*normFactor;
			f0(i, j) = f(i, j);
		}
	}
	

	dft_init(&w1, &w2, &buff, N1, N2);
	dft(Lap, Lap, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank);
	MPI_Status *status;

	for(i=start;i<end;i++)
		for(j=0;j<N2;j++)					
			fftmul(i,j) = 1.0/((lambda/Gamma1)*mask(i,j) - Lap(i,j) + Gamma2);
			
	int OuterIter,iter;
	for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) {
		for(iter = 0; iter<MaxIter; iter++) {
			
			for(i=start;i<end;i++)	
				for(j=0;j<N2;j++)
					diff(i,j)  = dtildex(i,j)-dtildex(i,(j-1)>=0?(j-1):0) + dtildey(i,j)- dtildey((i-1)>=0?(i-1):0,j) ;
			
			dft(diff, diff, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank);

			
			for(i=start;i<end;i++) {
				for(j=0;j<N2;j++) {
					u_fft2(i,j) = fftmul(i,j)*(f(i,j)*lambdaGamma1Scale-diff(i,j)+Gamma2*u_fft2(i,j)) ;
					if (iter == MaxIter - 1)
						f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale; 
				}
			}
			
			idft(u, u_fft2, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank);
			//MPI_Allgatherv(u + disp[rank], cnt[rank], mpi_complexf, u, cnt, disp, mpi_complexf, MPI_COMM_WORLD);
			if (rank == np - 1)
				MPI_Send(u + disp[rank], N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD);
			else if (rank == 0)
				MPI_Recv(u + disp[rank] + cnt[rank], N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD, status);
			else {
				MPI_Recv(u + disp[rank] + cnt[rank], N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD, status);
				MPI_Send(u + disp[rank], N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD);
			}
			
			for(i=start;i<end;i++) {
				for(j=0;j<N2;j++) {
					float tmp;
					dx(i,j)     = u(i,j<(N2-1)?(j+1):j)-u(i,j)+dx(i,j)-dtildex(i,j) ;
					dy(i,j)     = u(i<(N1-1)?(i+1):i,j)-u(i,j)+dy(i,j)-dtildey(i,j) ;

					tmp = sqrtf(SQR(crealf(dx(i,j)))+SQR(cimagf(dx(i,j))) + SQR(crealf(dy(i,j)))+SQR(cimagf(dy(i,j))));
					tmp = max(0,tmp-Thresh)/(tmp+(tmp<Thresh));
					dx_new(i,j) =dx(i,j)*tmp;
					dy_new(i,j) =dy(i,j)*tmp;
					dtildex(i,j) = 2*dx_new(i,j) - dx(i,j);
					dtildey(i,j) = 2*dy_new(i,j) - dy(i,j);
					dx(i,j)      = dx_new(i,j);
					dy(i,j)      = dy_new(i,j);
				}
			}
			//MPI_Allgatherv(dtildey + disp[rank], cnt[rank], mpi_complexf, dtildey, cnt, disp, mpi_complexf, MPI_COMM_WORLD);
			if (rank == np - 1)
				MPI_Recv(dtildey + disp[rank] - N2, N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD, status);
			else if (rank == 0)
				MPI_Send(dtildey + disp[rank] + cnt[rank] - N2, N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD);
			else {
				MPI_Recv(dtildey + disp[rank] - N2, N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD, status);
				MPI_Send(dtildey + disp[rank] + cnt[rank] - N2, N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD);
			}
            
		}
	}

	for(i=start; i<end; i++) {
		for(j=0; j<N2; j++) {
			img(i, j) = sqrt(SQR(crealf(u(i, j))) + SQR(cimagf(u(i, j))));
		}
	}
	MPI_Gatherv(img + disp[rank], cnt[rank], MPI_FLOAT, img, cnt, disp, MPI_FLOAT, 0, MPI_COMM_WORLD);
	free(w1);
	free(w2);
	free(buff);
	MPI_Finalize();
	if (rank > 0)
		exit(0);
	return 0;
}
Beispiel #6
0
int mri(
		float* img, 
		float complex* f, 
		float* mask, 
		float lambda,
		int N1,
		int N2)
{
	int i, j;

    // Use this to check the output of each API call
    cl_int status;

    // Retrieve the number of platforms
    cl_uint numPlatforms = 0;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);

    // Allocate enough space for each platform
    cl_platform_id *platforms = NULL;
    platforms = (cl_platform_id*)malloc(
        numPlatforms*sizeof(cl_platform_id));

    // Fill in the platforms
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    // Retrieve the number of devices
    cl_uint numDevices = 0;
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0,
        NULL, &numDevices);

    // Allocate enough space for each device
    cl_device_id *devices;
    devices = (cl_device_id*)malloc(
        numDevices*sizeof(cl_device_id));

    // Fill in the devices
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL,
        numDevices, devices, NULL);

    // Create a context and associate it with the devices
    cl_context context;
    context = clCreateContext(NULL, numDevices, devices, NULL,
        NULL, &status);

    // Create a command queue and associate it with the device
    cl_command_queue cmdQueue;
    cmdQueue = clCreateCommandQueue(context, devices[0], 0,
        &status);

    // Create a buffer object that will contain the data
    // from the host array A
        
	float complex* f0	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dx	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy	    = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dx_new   = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy_new   = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dtildex	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dtildey	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u_fft2	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* fftmul	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* Lap		= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* diff		= (float complex*) calloc(N1*N2,sizeof(float complex));
    float complex *w1 = (float complex*)malloc(((N2-1)*(N2-1)+1)*sizeof(float complex));
	float complex *w2 = (float complex*)malloc(((N1-1)*(N1-1)+1)*sizeof(float complex));
	float complex *buff = (float complex*)malloc(N2*N1*sizeof(float complex));
       
    Lap(N1-1, N2-1)	= 0.f;
	Lap(N1-1, 0)	= 1.f; 
	Lap(N1-1, 1)	= 0.f;
	Lap(0, N2-1)	= 1.f;
	Lap(0, 0)		= -4.f; 
	Lap(0, 1)		= 1.f;
	Lap(1, N2-1)	= 0.f;
	Lap(1, 0)		= 1.f; 
	Lap(1, 1)		= 0.f;

    cl_mem cl_img = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status);
    cl_mem cl_mask = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status);
    cl_mem cl_f = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
    cl_mem cl_f0 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dx = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dy = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

	cl_mem cl_dx_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dy_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

	cl_mem cl_dtildex = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dtildey = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_u_fft2 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_u = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

	cl_mem cl_fftmul = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_Lap = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_diff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
    
    cl_mem cl_w1 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N2*N2)*sizeof(cl_float2), NULL, &status);
    cl_mem cl_w2 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N1*N1)*sizeof(cl_float2), NULL, &status);
    cl_mem cl_buff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

    status = clEnqueueWriteBuffer(cmdQueue, cl_mask, CL_FALSE, 0, N1*N2*sizeof(float), mask, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_f, CL_FALSE, 0, N1*N2*sizeof(cl_float2), f, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_Lap, CL_FALSE, 0, N1*N2*sizeof(cl_float2), Lap, 0, NULL, NULL);
        
	cl_program program = clCreateProgramWithSource(context, 1, 
        (const char**)&kernel, NULL, &status);
        
    status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
	cl_kernel ker;
	size_t globalWorkSize[2]={N1,N2};
	
	float sum = 0;

	for(i=0; i<N1; i++)
		for(j=0; j<N2; j++)
			sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1));
            
	float normFactor = 1.f/sqrtf(sum);
	float scale		 = sqrtf(N1*N2);

    ker = clCreateKernel(program, "loop1", &status);

    
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0);
    status = clSetKernelArg(ker, 2, sizeof(cl_float2), &normFactor);
    status = clSetKernelArg(ker, 3, sizeof(int), &N1);
    status = clSetKernelArg(ker, 4, sizeof(int), &N2);
    
    
    
    
    status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
    w1[0] = 1;
	w2[0] = 1;
	dft_init(&w1, &w2, &buff, N1, N2);
    status = clEnqueueWriteBuffer(cmdQueue, cl_w1, CL_FALSE, 0, ((N2-1)*(N2-1)+1)*sizeof(cl_float2), w1, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_w2, CL_FALSE, 0, ((N1-1)*(N1-1)+1)*sizeof(cl_float2), w2, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_buff, CL_FALSE, 0, N1*N2*sizeof(cl_float2), buff, 0, NULL, NULL);

    ker = clCreateKernel(program, "dft1", &status);
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
    status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
    status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
    status = clSetKernelArg(ker, 5, sizeof(int), &N1);
    status = clSetKernelArg(ker, 6, sizeof(int), &N2);
    status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
                   if (status != CL_SUCCESS)
            	printf("error: %d\n", status); 
    ker = clCreateKernel(program, "dft2", &status);
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
    status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
    status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
    status = clSetKernelArg(ker, 5, sizeof(int), &N1);
    status = clSetKernelArg(ker, 6, sizeof(int), &N2);
    status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 
               if (status != CL_SUCCESS)
            	printf("error: %d\n", status); 
    ker = clCreateKernel(program, "loop2", &status);
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask);
    status = clSetKernelArg(ker, 3, sizeof(float), &lambda);
    status = clSetKernelArg(ker, 4, sizeof(int), &N1);
    status = clSetKernelArg(ker, 5, sizeof(int), &N2);
    status = clEnqueueNDRangeKernel(cmdQueue, ker,2, NULL, globalWorkSize, NULL, 0, NULL, NULL);    
    
    float complex *tmp = (float complex*)malloc(N2*N1*sizeof(float complex));
    float complex *tmp2 = (float complex*)malloc(N2*N1*sizeof(float complex));
    
    
    
	int OuterIter,iter;
	for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) {
		for(iter = 0; iter<MaxIter; iter++) {
            ker = clCreateKernel(program, "loop3", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dtildex);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_dtildey);
            status = clSetKernelArg(ker, 3, sizeof(int), &N1);
            status = clSetKernelArg(ker, 4, sizeof(int), &N2);

            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 

            ker = clCreateKernel(program, "dft1", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
             if (status != CL_SUCCESS)
            	printf("error: %d\n", status);
            	
            ker = clCreateKernel(program, "dft2", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 
            if (status != CL_SUCCESS)
            	printf("error: %d\n", status);
			//dft(diff, diff, w1, w2, buff, N1, N2);

            
            ker = clCreateKernel(program, "loop4", &status);
            int more = (iter == MaxIter - 1);

            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 4, sizeof(int), &N1);
            status = clSetKernelArg(ker, 5, sizeof(int), &N2);
            status = clSetKernelArg(ker, 6, sizeof(float), &scale);
            status = clSetKernelArg(ker, 7, sizeof(float), &lambda);
            status= clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 
            
            ker = clCreateKernel(program, "idft1", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
            
            ker = clCreateKernel(program, "idft2", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
            
         
            ker = clCreateKernel(program, "loop5", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_dx);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dy);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_dtildex);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_dtildey);
            status = clSetKernelArg(ker, 5, sizeof(cl_mem), &cl_dx_new);
            status = clSetKernelArg(ker, 6, sizeof(cl_mem), &cl_dy_new);
            status = clSetKernelArg(ker, 7, sizeof(int), &N1);
            status = clSetKernelArg(ker, 8, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);

            
		}
        /*
          ker = clCreateKernel(program, "last_loop", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 4, sizeof(float), &scale);
            status = clSetKernelArg(ker, 5, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
          if (status != CL_SUCCESS)
            	printf("error: %d\n", status);
          
          */
        clEnqueueReadBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL);    
        clEnqueueReadBuffer(cmdQueue, cl_f0, CL_TRUE, 0, N1*N2*sizeof(float), f0, 0, NULL, NULL);
        clEnqueueReadBuffer(cmdQueue, cl_u_fft2, CL_TRUE, 0, N1*N2*sizeof(float), u_fft2, 0, NULL, NULL);
        
        for(i=0;i<N1;i++) {
			for(j=0;j<N2;j++) {
				f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale;  
			}
		}
        
        clEnqueueWriteBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL);
        
       
	}
    
    
            ker = clCreateKernel(program, "loop7", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_img);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 2, sizeof(int), &N1);
            status = clSetKernelArg(ker, 3, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 


    clEnqueueReadBuffer(cmdQueue, cl_img, CL_TRUE, 0, N1*N2*sizeof(float), img, 0, NULL, NULL);
    clReleaseKernel(ker);
    clReleaseProgram(program);
    clReleaseCommandQueue(cmdQueue);
    clReleaseMemObject(cl_img);
    clReleaseMemObject(cl_mask);
    clReleaseMemObject(cl_f);
    clReleaseMemObject(cl_f0);
    clReleaseMemObject(cl_dx);
    clReleaseMemObject(cl_dy);
    clReleaseMemObject(cl_dx_new);
    clReleaseMemObject(cl_dy_new);
    clReleaseMemObject(cl_dtildex);
    clReleaseMemObject(cl_dtildey);
    clReleaseMemObject(cl_u_fft2);
    clReleaseMemObject(cl_u);
    clReleaseMemObject(cl_fftmul);
    clReleaseMemObject(cl_Lap);
    clReleaseMemObject(cl_diff);
    clReleaseMemObject(cl_w1);
    clReleaseMemObject(cl_w2);
    clReleaseMemObject(cl_buff);
    
    clReleaseContext(context);
    free(platforms);
    free(devices);
	free(w1);
	free(w2);
	free(buff);
	return 0;
}
int main() {
		//////////////////////////////////////////////////////
    // init
    int16_t* raw_fft_input = (int16_t*) RAW_INPUT_ADDR;    // size: 512 * 16 / 32 = 200 (0x100)
    int16_t* fft_input = (int16_t*) INPUT_ADDR;            // size: 512 * 16 * 2 / 32 * 1.5 = 768 (0x300)
    int16_t* fft_output = (int16_t*) OUTPUT_ADDR;          // size: 257 * 16 * 2 / 32 * 1.5 = 386 (0x182)
    int16_t* window_parameter = (int16_t*) WINDOW_PARA_ADDR;   // size: 400 * 16 / 32 = 200 (0xC8).
    int16_t* dft_parameter_r = (int16_t*) DFT_PARA_ADDR;       // size: 2 * 2 * 16 / 32 = 2 (0x2). 14 fractional bits 
    int16_t* dft_parameter_i = (int16_t*) DFT_PARA_ADDR + DFT_SIZE * DFT_SIZE * sizeof(int16_t);
    int16_t* mfcc_input = (int16_t*) MFCC_INPUT_ADDR;          // size: 257 * 16 * 2 / 32 = 257 (0x101)
//    int16_t* dct_parameter = DCT_PARA_ADDR;
    init_input(raw_fft_input, WINDOW_SIZE);
    window_init(window_parameter, WINDOW_SIZE);
    dft_init(dft_parameter_r, dft_parameter_i, DFT_SIZE);
//    dct_init(dct_parameter);

		//////////////////////////////////////////////////////
    // PE init
		uint16_t inst_no;
		set_dnn_insts();
		set_nli_parameters();
    *REG_WAIT_BEFORE_VDD = 0xff;
      *DNN_SRAM_RSTN_0 = 0x000007ff;
      *DNN_SRAM_RSTN_1 = 0x000007ff;
      *DNN_SRAM_RSTN_2 = 0x000007ff;
      *DNN_SRAM_RSTN_3 = 0x000007ff;
      delay(3);

      *DNN_SRAM_ISOL_0 = 0x000007ff;
      *DNN_SRAM_ISOL_1 = 0x000007ff;
      *DNN_SRAM_ISOL_2 = 0x000007ff;
      *DNN_SRAM_ISOL_3 = 0x000007ff;
      delay(3);

      *DNN_PG_0 = 0x000007ff;
      *DNN_PG_1 = 0x000007ff;
      *DNN_PG_2 = 0x000007ff;
      *DNN_PG_3 = 0x000007ff;
      delay(5);

      //*DNN_PG_0 = 0x003ff800;
      //*DNN_PG_1 = 0x003ff800;
      //*DNN_PG_2 = 0x003ff800;
      //*DNN_PG_3 = 0x003ff800;
      //delay(3);

      *DNN_SRAM_RSTN_0 = 0x000007ff;
      *DNN_SRAM_RSTN_0 = 0xffffffff;
      *DNN_SRAM_RSTN_1 = 0xffffffff;
      *DNN_SRAM_RSTN_2 = 0xffffffff;
      *DNN_SRAM_RSTN_3 = 0xffffffff;
      delay(3);
//      *DNN_RAND_0 = 1;
//      *DNN_RAND_1 = 1;
//      *DNN_RAND_2 = 1;
//      *DNN_RAND_3 = 1;
//      delay(3);
      
      //*DNN_SRAM_ISOL_0 = 0x00000000;
      //*DNN_SRAM_ISOL_1 = 0x00000000;
      //*DNN_SRAM_ISOL_2 = 0x00000000;
      //*DNN_SRAM_ISOL_3 = 0x00000000;
      //delay(3);

    signal_debug(1);    // init ends; start counting runtime
		//////////////////////////////////////////////////////
    // M0 working sequence
    
    // TODO: optimize these arrays
    int16_t input_r_dft[FFT_SIZE];
    int16_t input_i_dft[FFT_SIZE];

    // prepare fft
    fft_apply_window(WINDOW_SIZE, raw_fft_input, window_parameter); 
    prepare_dft_for_fft(WINDOW_SIZE, FFT_SIZE, raw_fft_input, input_r_dft, FFT_NUM, FFT_NUM_BITS, DFT_SIZE);

    // dft
    dft(DFT_SIZE, FFT_SIZE, input_r_dft, input_i_dft, dft_parameter_r, dft_parameter_i);
 
    prepare_PE_fft_input(fft_input, input_r_dft, input_i_dft, DFT_SIZE, FFT_SIZE, FFT_PREC);
    signal_debug(2);    // M0 ends; write to PE

    // write to PE
    uint16_t fft_input_space = FFT_SIZE / 2;   // DLC memory occupied. for 16 bits only
    write_dnn_sram_16(FFT_START, fft_input, fft_input_space);
    signal_debug(0);    // start PE runtime, check if memory correct
		//////////////////////////////////////////////////////
    // PE working sequence
    inst_no = 0;
    reset_PE(0);
    write_instruction(inst_no, 0, 0);   // write FFT 0
   
    while (inst_no < FFT_INST - 1) {
      if (inst_no % 2 == 0) {
        switch_inst_buffer(0, 0);
        reset_PE(0);
			  start_pe_inst(0b0001);                    // start FFT 0, 2, 4, 6
			  inst_no++;
        write_instruction(inst_no, 0, 1);         // write FFT 1, 3, 5, 7
//        clock_gate();
      } else {
        switch_inst_buffer(0, 1);
        reset_PE(0);
			  start_pe_inst(0b0001);              // start FFT 1, 3, 5
			  inst_no++;
        write_instruction(inst_no, 0, 0);   // write FFT 2, 4, 6
//        clock_gate(); 
      }
    }

    switch_inst_buffer(0, 1);
    reset_PE(0);
		start_pe_inst(0b0001);        // start FFT 7
		inst_no++;
//    clock_gate(); 

		// finish
    wait_until_pe_finish(0b1111);
		//////////////////////////////////////////////////////

    uint16_t fft_output_space = MFCC_SIZE / 2;   // DLC memory occupied. for 16 bits only 
    read_dnn_sram_16(FFT_START, fft_output, fft_output_space); 
    extract_fft_output(fft_output, mfcc_input, MFCC_SIZE, FFT_PREC);

    // MFCC
//  run_mfcc();   // TODO: on PE

//    mfcc_to_dnn(test, test1);

    signal_done();
    delay(7000);
    return 1;
 		// done
}