int main() { int i; unsigned int *order; dft_init(6); dft_thread_create(sink); dft_thread_create(muxer); dft_thread_create(intermediate); dft_thread_create(intermediate); dft_thread_create(doubler); dft_thread_create(source); dft_thread_link(5, 4); dft_thread_link(4, 3); dft_thread_link(4, 2); dft_thread_link(3, 1); dft_thread_link(2, 1); dft_thread_link(1, 0); dft_execute(); return 0; }
int main(int argc, char **argv) { if(argc!=2) { printf("Usage:- %s <PREC>.\n",argv[0]); exit(0); } uint64_t prec; // how many bits do we want MPFI to use? prec=atoi(argv[1]); if((prec<MPFR_PREC_MIN)||(prec>MPFR_PREC_MAX)) { printf("Usage:- %s <PREC>.\n",argv[0]); exit(0); } dft_init(prec); #define Q (5*37*41) #define PHI_Q (4*36*40) mpfi_c_t vec[PHI_Q]; uint64_t dims[3]={4,36,40}; uint64_t i; for(i=0;i<PHI_Q;i++) { mpfi_c_init(vec[i]); mpfi_c_set_ui(vec[i],rand(),0); } // // this will do 36*40 length 4 dft's of i, i+36*40, i+2*36*40 and i+3*36+40 // then 4*40 length 36 dfts // then 4*36 length 40 dfts // ndft(vec,PHI_Q,3,dims); mpfi_c_print_str("res[100]: ",vec[100]); return(0); }
int main(void) { int i; const int numDFTs0 = 1; const int numDFTs1 = 10; const int numDFTs2 = 100; dft_init(); // Initialize the sliding DFT printf("With %d channels, k=%d, N=%d:\n", M_SLIDING_DFT_nchannels, M_SLIDING_DFT_k, M_SLIDING_DFT_N); memset(sample, 0, sizeof(adcdata_t) * M_SLIDING_DFT_nchannels); resetTimer(); T1CONbits.TON = 1; for (i = 0 ; i < numDFTs0 ; i ++) dft_update(); T1CONbits.TON = 0; printf(" Ran %d DFTs in %d clock cycles.\n", numDFTs0, TMR1); printf(" %d cycles per channel\n", (int)((float)TMR1/(M_SLIDING_DFT_nchannels*numDFTs0))); resetTimer(); T1CONbits.TON = 1; for (i = 0 ; i < numDFTs1 ; i ++) dft_update(); T1CONbits.TON = 0; printf(" Ran %d DFTs in %d clock cycles.\n", numDFTs1, TMR1); printf(" %d cycles per channel\n", (int)((float)TMR1/(M_SLIDING_DFT_nchannels*numDFTs1))); resetTimer(); T1CONbits.TON = 1; for (i = 0 ; i < numDFTs2 ; i ++) dft_update(); T1CONbits.TON = 0; printf(" Ran %d DFTs in %d clock cycles.\n", numDFTs2, TMR1); printf(" %d cycles per channel\n", (int)((float)TMR1/(M_SLIDING_DFT_nchannels*numDFTs2))); return 0; }
int mri( float* img, float complex* f, float* mask, float lambda, int N1, int N2) { int i, j; float complex* f0 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildex = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildey = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u_fft2 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* fftmul = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* Lap = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* diff = (float complex*) calloc(N1*N2,sizeof(float complex)); float sum = 0; for(i=0; i<N1; i++) for(j=0; j<N2; j++) sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1)); float normFactor = 1.f/sqrtf(sum); float scale = sqrtf(N1*N2); for(i=0; i<N1; i++) { for(j=0; j<N2; j++) { f(i, j) = f(i, j)*normFactor; f0(i, j) = f(i, j); } } Lap(N1-1, N2-1) = 0.f; Lap(N1-1, 0) = 1.f; Lap(N1-1, 1) = 0.f; Lap(0, N2-1) = 1.f; Lap(0, 0) = -4.f; Lap(0, 1) = 1.f; Lap(1, N2-1) = 0.f; Lap(1, 0) = 1.f; Lap(1, 1) = 0.f; float complex *w1; float complex *w2; float complex *buff; dft_init(&w1, &w2, &buff, N1, N2); dft(Lap, Lap, w1, w2, buff, N1, N2); for(i=0;i<N1;i++) for(j=0;j<N2;j++) fftmul(i,j) = 1.0/((lambda/Gamma1)*mask(i,j) - Lap(i,j) + Gamma2); int OuterIter,iter; for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) { for(iter = 0; iter<MaxIter; iter++) { for(i=0;i<N1;i++) for(j=0;j<N2;j++) diff(i,j) = dtildex(i,j)-dtildex(i,(j-1)>=0?(j-1):0) + dtildey(i,j)- dtildey((i-1)>=0?(i-1):0,j) ; dft(diff, diff, w1, w2, buff, N1, N2); for(i=0;i<N1;i++) for(j=0;j<N2;j++) u_fft2(i,j) = fftmul(i,j)*(f(i,j)*lambda/Gamma1*scale-diff(i,j)+Gamma2*u_fft2(i,j)) ; idft(u, u_fft2, w1, w2, buff, N1, N2); for(i=0;i<N1;i++) { for(j=0;j<N2;j++) { float tmp; float Thresh=1.0/Gamma1; dx(i,j) = u(i,j<(N2-1)?(j+1):j)-u(i,j)+dx(i,j)-dtildex(i,j) ; dy(i,j) = u(i<(N1-1)?(i+1):i,j)-u(i,j)+dy(i,j)-dtildey(i,j) ; tmp = sqrtf(SQR(crealf(dx(i,j)))+SQR(cimagf(dx(i,j))) + SQR(crealf(dy(i,j)))+SQR(cimagf(dy(i,j)))); tmp = max(0,tmp-Thresh)/(tmp+(tmp<Thresh)); dx_new(i,j) =dx(i,j)*tmp; dy_new(i,j) =dy(i,j)*tmp; dtildex(i,j) = 2*dx_new(i,j) - dx(i,j); dtildey(i,j) = 2*dy_new(i,j) - dy(i,j); dx(i,j) = dx_new(i,j); dy(i,j) = dy_new(i,j); } } } for(i=0;i<N1;i++) { for(j=0;j<N2;j++) { f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale; } } } for(i=0; i<N1; i++) { for(j=0; j<N2; j++) { img(i, j) = sqrt(SQR(crealf(u(i, j))) + SQR(cimagf(u(i, j)))); } } free(w1); free(w2); free(buff); return 0; }
int mri( float* img, float complex* f, float* mask, float lambda, int N1, int N2) { int i, j; float complex* f0 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildex = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildey = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u_fft2 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* fftmul = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* Lap = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* diff = (float complex*) calloc(N1*N2,sizeof(float complex)); float sum = 0; float scale = sqrtf(N1*N2); Lap(N1-1, N2-1) = 0.f; Lap(N1-1, 0) = 1.f; Lap(N1-1, 1) = 0.f; Lap(0, N2-1) = 1.f; Lap(0, 0) = -4.f; Lap(0, 1) = 1.f; Lap(1, N2-1) = 0.f; Lap(1, 0) = 1.f; Lap(1, 1) = 0.f; float complex *w1; float complex *w2; float complex *buff; double lambdaGamma1 = lambda/Gamma1; double lambdaGamma1Scale = lambda/Gamma1*scale; float Thresh=1.0/Gamma1; MPI_Datatype mpi_complexf; MPI_Type_contiguous(2, MPI_FLOAT, &mpi_complexf); MPI_Type_commit(&mpi_complexf); int np, rank; MPI_Comm_size(MPI_COMM_WORLD ,&np); MPI_Comm_rank(MPI_COMM_WORLD ,&rank); int chunksize = N1/np; int start = rank * chunksize; int cnt[np]; int disp[np]; for (i = 0 ; i < np - 1; i ++) { cnt[i] = chunksize * N2 ; disp[i] = chunksize * i * N2; } cnt[i] = (chunksize + N1%np) * N2; disp[i] = chunksize * i * N2; if (rank == np - 1) chunksize += N1%np; int end = start + chunksize; for(i=start; i<chunksize; i++) for(j=0; j<N2; j++) sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1)); MPI_Allreduce(&sum, &sum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); float normFactor = 1.f/sqrtf(sum); for(i=0; i<N1; i++) { for(j=0; j<N2; j++) { f(i, j) = f(i, j)*normFactor; f0(i, j) = f(i, j); } } dft_init(&w1, &w2, &buff, N1, N2); dft(Lap, Lap, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank); MPI_Status *status; for(i=start;i<end;i++) for(j=0;j<N2;j++) fftmul(i,j) = 1.0/((lambda/Gamma1)*mask(i,j) - Lap(i,j) + Gamma2); int OuterIter,iter; for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) { for(iter = 0; iter<MaxIter; iter++) { for(i=start;i<end;i++) for(j=0;j<N2;j++) diff(i,j) = dtildex(i,j)-dtildex(i,(j-1)>=0?(j-1):0) + dtildey(i,j)- dtildey((i-1)>=0?(i-1):0,j) ; dft(diff, diff, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank); for(i=start;i<end;i++) { for(j=0;j<N2;j++) { u_fft2(i,j) = fftmul(i,j)*(f(i,j)*lambdaGamma1Scale-diff(i,j)+Gamma2*u_fft2(i,j)) ; if (iter == MaxIter - 1) f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale; } } idft(u, u_fft2, w1, w2, buff, N1, N2, start, end, cnt, disp, mpi_complexf, rank); //MPI_Allgatherv(u + disp[rank], cnt[rank], mpi_complexf, u, cnt, disp, mpi_complexf, MPI_COMM_WORLD); if (rank == np - 1) MPI_Send(u + disp[rank], N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD); else if (rank == 0) MPI_Recv(u + disp[rank] + cnt[rank], N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD, status); else { MPI_Recv(u + disp[rank] + cnt[rank], N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD, status); MPI_Send(u + disp[rank], N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD); } for(i=start;i<end;i++) { for(j=0;j<N2;j++) { float tmp; dx(i,j) = u(i,j<(N2-1)?(j+1):j)-u(i,j)+dx(i,j)-dtildex(i,j) ; dy(i,j) = u(i<(N1-1)?(i+1):i,j)-u(i,j)+dy(i,j)-dtildey(i,j) ; tmp = sqrtf(SQR(crealf(dx(i,j)))+SQR(cimagf(dx(i,j))) + SQR(crealf(dy(i,j)))+SQR(cimagf(dy(i,j)))); tmp = max(0,tmp-Thresh)/(tmp+(tmp<Thresh)); dx_new(i,j) =dx(i,j)*tmp; dy_new(i,j) =dy(i,j)*tmp; dtildex(i,j) = 2*dx_new(i,j) - dx(i,j); dtildey(i,j) = 2*dy_new(i,j) - dy(i,j); dx(i,j) = dx_new(i,j); dy(i,j) = dy_new(i,j); } } //MPI_Allgatherv(dtildey + disp[rank], cnt[rank], mpi_complexf, dtildey, cnt, disp, mpi_complexf, MPI_COMM_WORLD); if (rank == np - 1) MPI_Recv(dtildey + disp[rank] - N2, N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD, status); else if (rank == 0) MPI_Send(dtildey + disp[rank] + cnt[rank] - N2, N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD); else { MPI_Recv(dtildey + disp[rank] - N2, N2, mpi_complexf, rank - 1, 0, MPI_COMM_WORLD, status); MPI_Send(dtildey + disp[rank] + cnt[rank] - N2, N2, mpi_complexf, rank + 1, 0, MPI_COMM_WORLD); } } } for(i=start; i<end; i++) { for(j=0; j<N2; j++) { img(i, j) = sqrt(SQR(crealf(u(i, j))) + SQR(cimagf(u(i, j)))); } } MPI_Gatherv(img + disp[rank], cnt[rank], MPI_FLOAT, img, cnt, disp, MPI_FLOAT, 0, MPI_COMM_WORLD); free(w1); free(w2); free(buff); MPI_Finalize(); if (rank > 0) exit(0); return 0; }
int mri( float* img, float complex* f, float* mask, float lambda, int N1, int N2) { int i, j; // Use this to check the output of each API call cl_int status; // Retrieve the number of platforms cl_uint numPlatforms = 0; status = clGetPlatformIDs(0, NULL, &numPlatforms); // Allocate enough space for each platform cl_platform_id *platforms = NULL; platforms = (cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id)); // Fill in the platforms status = clGetPlatformIDs(numPlatforms, platforms, NULL); // Retrieve the number of devices cl_uint numDevices = 0; status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); // Allocate enough space for each device cl_device_id *devices; devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id)); // Fill in the devices status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); // Create a context and associate it with the devices cl_context context; context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); // Create a command queue and associate it with the device cl_command_queue cmdQueue; cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status); // Create a buffer object that will contain the data // from the host array A float complex* f0 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildex = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildey = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u_fft2 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* fftmul = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* Lap = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* diff = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex *w1 = (float complex*)malloc(((N2-1)*(N2-1)+1)*sizeof(float complex)); float complex *w2 = (float complex*)malloc(((N1-1)*(N1-1)+1)*sizeof(float complex)); float complex *buff = (float complex*)malloc(N2*N1*sizeof(float complex)); Lap(N1-1, N2-1) = 0.f; Lap(N1-1, 0) = 1.f; Lap(N1-1, 1) = 0.f; Lap(0, N2-1) = 1.f; Lap(0, 0) = -4.f; Lap(0, 1) = 1.f; Lap(1, N2-1) = 0.f; Lap(1, 0) = 1.f; Lap(1, 1) = 0.f; cl_mem cl_img = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status); cl_mem cl_mask = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status); cl_mem cl_f = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_f0 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dx = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dy = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dx_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dy_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dtildex = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dtildey = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_u_fft2 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_u = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_fftmul = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_Lap = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_diff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_w1 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N2*N2)*sizeof(cl_float2), NULL, &status); cl_mem cl_w2 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N1*N1)*sizeof(cl_float2), NULL, &status); cl_mem cl_buff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); status = clEnqueueWriteBuffer(cmdQueue, cl_mask, CL_FALSE, 0, N1*N2*sizeof(float), mask, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_f, CL_FALSE, 0, N1*N2*sizeof(cl_float2), f, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_Lap, CL_FALSE, 0, N1*N2*sizeof(cl_float2), Lap, 0, NULL, NULL); cl_program program = clCreateProgramWithSource(context, 1, (const char**)&kernel, NULL, &status); status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL); cl_kernel ker; size_t globalWorkSize[2]={N1,N2}; float sum = 0; for(i=0; i<N1; i++) for(j=0; j<N2; j++) sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1)); float normFactor = 1.f/sqrtf(sum); float scale = sqrtf(N1*N2); ker = clCreateKernel(program, "loop1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0); status = clSetKernelArg(ker, 2, sizeof(cl_float2), &normFactor); status = clSetKernelArg(ker, 3, sizeof(int), &N1); status = clSetKernelArg(ker, 4, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); w1[0] = 1; w2[0] = 1; dft_init(&w1, &w2, &buff, N1, N2); status = clEnqueueWriteBuffer(cmdQueue, cl_w1, CL_FALSE, 0, ((N2-1)*(N2-1)+1)*sizeof(cl_float2), w1, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_w2, CL_FALSE, 0, ((N1-1)*(N1-1)+1)*sizeof(cl_float2), w2, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_buff, CL_FALSE, 0, N1*N2*sizeof(cl_float2), buff, 0, NULL, NULL); ker = clCreateKernel(program, "dft1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); ker = clCreateKernel(program, "dft2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); ker = clCreateKernel(program, "loop2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask); status = clSetKernelArg(ker, 3, sizeof(float), &lambda); status = clSetKernelArg(ker, 4, sizeof(int), &N1); status = clSetKernelArg(ker, 5, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker,2, NULL, globalWorkSize, NULL, 0, NULL, NULL); float complex *tmp = (float complex*)malloc(N2*N1*sizeof(float complex)); float complex *tmp2 = (float complex*)malloc(N2*N1*sizeof(float complex)); int OuterIter,iter; for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) { for(iter = 0; iter<MaxIter; iter++) { ker = clCreateKernel(program, "loop3", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dtildex); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_dtildey); status = clSetKernelArg(ker, 3, sizeof(int), &N1); status = clSetKernelArg(ker, 4, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "dft1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); ker = clCreateKernel(program, "dft2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); //dft(diff, diff, w1, w2, buff, N1, N2); ker = clCreateKernel(program, "loop4", &status); int more = (iter == MaxIter - 1); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 4, sizeof(int), &N1); status = clSetKernelArg(ker, 5, sizeof(int), &N2); status = clSetKernelArg(ker, 6, sizeof(float), &scale); status = clSetKernelArg(ker, 7, sizeof(float), &lambda); status= clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "idft1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "idft2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "loop5", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_dx); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dy); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_dtildex); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_dtildey); status = clSetKernelArg(ker, 5, sizeof(cl_mem), &cl_dx_new); status = clSetKernelArg(ker, 6, sizeof(cl_mem), &cl_dy_new); status = clSetKernelArg(ker, 7, sizeof(int), &N1); status = clSetKernelArg(ker, 8, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); } /* ker = clCreateKernel(program, "last_loop", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 4, sizeof(float), &scale); status = clSetKernelArg(ker, 5, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); */ clEnqueueReadBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL); clEnqueueReadBuffer(cmdQueue, cl_f0, CL_TRUE, 0, N1*N2*sizeof(float), f0, 0, NULL, NULL); clEnqueueReadBuffer(cmdQueue, cl_u_fft2, CL_TRUE, 0, N1*N2*sizeof(float), u_fft2, 0, NULL, NULL); for(i=0;i<N1;i++) { for(j=0;j<N2;j++) { f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale; } } clEnqueueWriteBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL); } ker = clCreateKernel(program, "loop7", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_img); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 2, sizeof(int), &N1); status = clSetKernelArg(ker, 3, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); clEnqueueReadBuffer(cmdQueue, cl_img, CL_TRUE, 0, N1*N2*sizeof(float), img, 0, NULL, NULL); clReleaseKernel(ker); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(cl_img); clReleaseMemObject(cl_mask); clReleaseMemObject(cl_f); clReleaseMemObject(cl_f0); clReleaseMemObject(cl_dx); clReleaseMemObject(cl_dy); clReleaseMemObject(cl_dx_new); clReleaseMemObject(cl_dy_new); clReleaseMemObject(cl_dtildex); clReleaseMemObject(cl_dtildey); clReleaseMemObject(cl_u_fft2); clReleaseMemObject(cl_u); clReleaseMemObject(cl_fftmul); clReleaseMemObject(cl_Lap); clReleaseMemObject(cl_diff); clReleaseMemObject(cl_w1); clReleaseMemObject(cl_w2); clReleaseMemObject(cl_buff); clReleaseContext(context); free(platforms); free(devices); free(w1); free(w2); free(buff); return 0; }
int main() { ////////////////////////////////////////////////////// // init int16_t* raw_fft_input = (int16_t*) RAW_INPUT_ADDR; // size: 512 * 16 / 32 = 200 (0x100) int16_t* fft_input = (int16_t*) INPUT_ADDR; // size: 512 * 16 * 2 / 32 * 1.5 = 768 (0x300) int16_t* fft_output = (int16_t*) OUTPUT_ADDR; // size: 257 * 16 * 2 / 32 * 1.5 = 386 (0x182) int16_t* window_parameter = (int16_t*) WINDOW_PARA_ADDR; // size: 400 * 16 / 32 = 200 (0xC8). int16_t* dft_parameter_r = (int16_t*) DFT_PARA_ADDR; // size: 2 * 2 * 16 / 32 = 2 (0x2). 14 fractional bits int16_t* dft_parameter_i = (int16_t*) DFT_PARA_ADDR + DFT_SIZE * DFT_SIZE * sizeof(int16_t); int16_t* mfcc_input = (int16_t*) MFCC_INPUT_ADDR; // size: 257 * 16 * 2 / 32 = 257 (0x101) // int16_t* dct_parameter = DCT_PARA_ADDR; init_input(raw_fft_input, WINDOW_SIZE); window_init(window_parameter, WINDOW_SIZE); dft_init(dft_parameter_r, dft_parameter_i, DFT_SIZE); // dct_init(dct_parameter); ////////////////////////////////////////////////////// // PE init uint16_t inst_no; set_dnn_insts(); set_nli_parameters(); *REG_WAIT_BEFORE_VDD = 0xff; *DNN_SRAM_RSTN_0 = 0x000007ff; *DNN_SRAM_RSTN_1 = 0x000007ff; *DNN_SRAM_RSTN_2 = 0x000007ff; *DNN_SRAM_RSTN_3 = 0x000007ff; delay(3); *DNN_SRAM_ISOL_0 = 0x000007ff; *DNN_SRAM_ISOL_1 = 0x000007ff; *DNN_SRAM_ISOL_2 = 0x000007ff; *DNN_SRAM_ISOL_3 = 0x000007ff; delay(3); *DNN_PG_0 = 0x000007ff; *DNN_PG_1 = 0x000007ff; *DNN_PG_2 = 0x000007ff; *DNN_PG_3 = 0x000007ff; delay(5); //*DNN_PG_0 = 0x003ff800; //*DNN_PG_1 = 0x003ff800; //*DNN_PG_2 = 0x003ff800; //*DNN_PG_3 = 0x003ff800; //delay(3); *DNN_SRAM_RSTN_0 = 0x000007ff; *DNN_SRAM_RSTN_0 = 0xffffffff; *DNN_SRAM_RSTN_1 = 0xffffffff; *DNN_SRAM_RSTN_2 = 0xffffffff; *DNN_SRAM_RSTN_3 = 0xffffffff; delay(3); // *DNN_RAND_0 = 1; // *DNN_RAND_1 = 1; // *DNN_RAND_2 = 1; // *DNN_RAND_3 = 1; // delay(3); //*DNN_SRAM_ISOL_0 = 0x00000000; //*DNN_SRAM_ISOL_1 = 0x00000000; //*DNN_SRAM_ISOL_2 = 0x00000000; //*DNN_SRAM_ISOL_3 = 0x00000000; //delay(3); signal_debug(1); // init ends; start counting runtime ////////////////////////////////////////////////////// // M0 working sequence // TODO: optimize these arrays int16_t input_r_dft[FFT_SIZE]; int16_t input_i_dft[FFT_SIZE]; // prepare fft fft_apply_window(WINDOW_SIZE, raw_fft_input, window_parameter); prepare_dft_for_fft(WINDOW_SIZE, FFT_SIZE, raw_fft_input, input_r_dft, FFT_NUM, FFT_NUM_BITS, DFT_SIZE); // dft dft(DFT_SIZE, FFT_SIZE, input_r_dft, input_i_dft, dft_parameter_r, dft_parameter_i); prepare_PE_fft_input(fft_input, input_r_dft, input_i_dft, DFT_SIZE, FFT_SIZE, FFT_PREC); signal_debug(2); // M0 ends; write to PE // write to PE uint16_t fft_input_space = FFT_SIZE / 2; // DLC memory occupied. for 16 bits only write_dnn_sram_16(FFT_START, fft_input, fft_input_space); signal_debug(0); // start PE runtime, check if memory correct ////////////////////////////////////////////////////// // PE working sequence inst_no = 0; reset_PE(0); write_instruction(inst_no, 0, 0); // write FFT 0 while (inst_no < FFT_INST - 1) { if (inst_no % 2 == 0) { switch_inst_buffer(0, 0); reset_PE(0); start_pe_inst(0b0001); // start FFT 0, 2, 4, 6 inst_no++; write_instruction(inst_no, 0, 1); // write FFT 1, 3, 5, 7 // clock_gate(); } else { switch_inst_buffer(0, 1); reset_PE(0); start_pe_inst(0b0001); // start FFT 1, 3, 5 inst_no++; write_instruction(inst_no, 0, 0); // write FFT 2, 4, 6 // clock_gate(); } } switch_inst_buffer(0, 1); reset_PE(0); start_pe_inst(0b0001); // start FFT 7 inst_no++; // clock_gate(); // finish wait_until_pe_finish(0b1111); ////////////////////////////////////////////////////// uint16_t fft_output_space = MFCC_SIZE / 2; // DLC memory occupied. for 16 bits only read_dnn_sram_16(FFT_START, fft_output, fft_output_space); extract_fft_output(fft_output, mfcc_input, MFCC_SIZE, FFT_PREC); // MFCC // run_mfcc(); // TODO: on PE // mfcc_to_dnn(test, test1); signal_done(); delay(7000); return 1; // done }