fft_kernel(size_t ndim, const size_stride_t *src0_size_stride) { MKL_LONG status; switch (ndim) { case 1: { MKL_LONG src0_size = src0_size_stride[0].dim_size; status = DftiCreateDescriptor(&descriptor, DFTI_DOUBLE, DFTI_COMPLEX, 1, src0_size); break; } case 2: { MKL_LONG src0_size[2] = {src0_size_stride[0].dim_size, src0_size_stride[1].dim_size}; status = DftiCreateDescriptor(&descriptor, DFTI_DOUBLE, DFTI_COMPLEX, 2, src0_size); break; } default: break; } status = DftiSetValue(descriptor, DFTI_PLACEMENT, DFTI_NOT_INPLACE); status = DftiCommitDescriptor(descriptor); }
ifft_kernel(size_t ndim, const char *src0_metadata, real_type scale) { if (ndim == 1) { DftiCreateDescriptor(&descriptor, DFTI_DOUBLE, DFTI_COMPLEX, 1, reinterpret_cast<const size_stride_t *>(src0_metadata)->dim_size); } else { /* MKL_LONG src0_size[3]; for (size_t i = 0; i < ndim; ++i) { src0_size[i] = reinterpret_cast<const size_stride_t *>(src0_metadata)->dim_size; src0_metadata += sizeof(size_stride_t); } */ } DftiSetValue(descriptor, DFTI_BACKWARD_SCALE, scale); DftiSetValue(descriptor, DFTI_PLACEMENT, DFTI_NOT_INPLACE); DftiCommitDescriptor(descriptor); }
void FFT_MKL(double *inputSignal, double *outputSignal,int n) { DFTI_DESCRIPTOR_HANDLE handle; MKL_LONG status; status = DftiCreateDescriptor(&handle, DFTI_DOUBLE, DFTI_REAL, 1, n); status = DftiSetValue(handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE); status = DftiCommitDescriptor(handle); status = DftiComputeForward(handle, inputSignal, outputSignal); int k = n / 2; if (n % 2 != 0) { k++; } for (int i = n / 2 + 1; i < n; i++) { k--; outputSignal[2 * i] = outputSignal[2 * k]; outputSignal[2 * i + 1] = (-1) * outputSignal[2 * k + 1]; } status = DftiFreeDescriptor(&handle); }
// Basic constructor without passing in vector structures, which need to be registered later or // passed into the calculate[FFT/IFT] methods. N_UTL_IntelFFT_Interface( int length, int numSignals=1, int reqStride=0, bool overwrite=false ) : N_UTL_FFTInterfaceDecl<VectorType>(length, numSignals, reqStride, overwrite) { // create the fft descriptor structor int fftDimension = 1; // 1D fft's long status = DftiCreateDescriptor( &fftDescriptor, DFTI_DOUBLE, DFTI_REAL, fftDimension, this->signalLength_ ); checkAndTrapErrors( status ); // configure the fft library to do numberSignals of 1D FFT's at the same time status = DftiSetValue( fftDescriptor, DFTI_NUMBER_OF_TRANSFORMS, this->numberSignals_); checkAndTrapErrors( status ); // The real input is signalLength long and the output is twice as long, complex status = DftiSetValue( fftDescriptor, DFTI_INPUT_DISTANCE, this->signalLength_); checkAndTrapErrors( status ); status = DftiSetValue( fftDescriptor, DFTI_OUTPUT_DISTANCE, 2*this->signalLength_); checkAndTrapErrors( status ); if( !overwrite ) { // Don't overwrite the input with the results status = DftiSetValue( fftDescriptor, DFTI_PLACEMENT, DFTI_NOT_INPLACE); checkAndTrapErrors( status ); } // I had set this up to use a slightly tighter packing scheme, but FFTW // does not support such schemes. So, we'll generalize and stick to // one that is supported by both to make it easier for Xyce to use // this inteface regardless of the underlying FFT library. // // Packing scheme is CSS which for real input of: // in0, in1, ... , inN-1 // // produces // r0, 0.0, r1, c1, ... , rN, cN (two extra element if N is even) // r0, 0.0, r1, c1, ... , rN, cN (one extra element if N is odd) // // see // // http://www.intel.com/software/products/mkl/docs/WebHelp/mklrefman.htm // // for more details // The forward and backward transform must have a consistent scale factor // so that the inverse of a forward transform is the same signal. 
By default // we'll use 1.0 for the forward scale factor and then 1/n for the inverse transform. double scaleFactor = 1.0 / this->signalLength_; status = DftiSetValue( fftDescriptor, DFTI_BACKWARD_SCALE, scaleFactor); checkAndTrapErrors( status ); if( this->stride_ != 0 ) { // try to overwrite the default stride int strideArray[2]; strideArray[0] = 0; // this is the offset from the start. We'll fix at zero strideArray[1] = this->stride_; // this is the offset to the next value. status = DftiSetValue(fftDescriptor, DFTI_INPUT_STRIDES, strideArray); checkAndTrapErrors( status ); status = DftiSetValue(fftDescriptor, DFTI_OUTPUT_STRIDES, strideArray); checkAndTrapErrors( status ); } // commit it so that the library can do any needed allocations status = DftiCommitDescriptor( fftDescriptor ); checkAndTrapErrors( status ); }
/*
 * Offload demo: runs a forward and a backward in-place 1D single-precision
 * complex FFT of length N on the "mic" offload target and reports GFlops.
 *
 * `x` and `handle` are not declared in this function; they are used as
 * already-visible variables (presumably file-scope -- confirm against the
 * rest of the file).  `init`, `verify` and `dsecnd` are likewise defined
 * elsewhere.
 *
 * Sequence: allocate x on the host, preallocate it on the target, create and
 * commit the descriptor on the target, run one untimed warm-up transform,
 * then time one forward and one backward transform, verify each, and finally
 * release the descriptor and both copies of x.
 *
 * verify() is treated as returning 0 on success.  The process exits with
 * status 1 on allocation or MKL failure; otherwise main returns 0 even when
 * a verification failed (the outcome is only printed).
 */
int main(void)
{
    /* Size of 1D transform */
    int N = 1000000;

    /* Arbitrary harmonic */
    int H = -N/2;

    /* Execution status */
    MKL_LONG status = 0;
    int forward_ok = 1, backward_ok = 1;

    /* Timestamps around each timed offload, and the derived flop rate */
    double time_start = 0, time_end = 0;
    double flops = 0;

    printf("Forward and backward 1D complex inplace transforms\n");

    printf("Allocate space for data on the host\n");
    x = (COMPLEX*)malloc( N * sizeof(COMPLEX) );
    if (0 == x)
    {
        printf("Error: memory allocation on host failed\n");
        exit(1);
    }

    printf("Preallocate memory on the target\n");
    /*
     * SOLUTION: Use offload pragma to preallocate memory for x on the target.
     *   (1) The length of x is N
     *   (2) Make sure the memory of x is aligned on 64-byte boundary
     *   (3) Make sure the allocated memory is not freed
     */
#pragma offload target(mic) in(x:length(N) align(64) alloc_if(1) free_if(0))
    {
    }

    printf("Create handle for 1D single-precision forward and backward transforms\n");
    /*
     * SOLUTION: Offload the call to DftiCreateDescriptor to the target.
     *   (1) What would be the 'in' variables?
     *   (2) What would be the 'out' variables?
     */
#pragma offload target(mic) in(N) nocopy(handle) out(status)
    {
        status = DftiCreateDescriptor(&handle, DFTI_SINGLE, DFTI_COMPLEX, 1, (MKL_LONG)N );
        /* Commit only when creation succeeded; either failure is reported below. */
        if (0 == status)
        {
            status = DftiCommitDescriptor(handle);
        }
    }
    if (status)
    {
        printf("Error: cannot create handle\n");
        exit(1);
    }

    /*
     * SOLUTION: Offload the call to DftiComputeForward to the target.
     *   (1) Make sure x is an 'inout' variable, because this is in-place
     *       transform.
     *   (2) Do not allocate memory for x because it was preallocated.
     *   (3) Do not free memory of x because we will use it again for more
     *       transforms.
     *   (4) What would be the 'out' variables?
     */
    // We do not time the first offload.
    // Note: x has not been initialized yet at this point (init() is called
    // only after this warm-up), and the status of this warm-up run is
    // overwritten by the timed run below without being checked.
#pragma offload target(mic) inout(x:length(N) alloc_if(0) free_if(0)) \
    nocopy(handle) out(status)
    {
        status = DftiComputeForward(handle, x);
    }

    printf("Initialize input for forward transform\n");
    init(x, N, H);

    printf("Offload forward FFT computation to the target\n");
    time_start = dsecnd();
    /*
     * SOLUTION: Offload the call to DftiComputeForward to the target.
     *   This should be the same as the previous offload.
     */
#pragma offload target(mic) inout(x:length(N) alloc_if(0) free_if(0)) \
    nocopy(handle) out(status)
    {
        status = DftiComputeForward(handle, x);
    }
    time_end = dsecnd();
    if (status)
    {
        printf("Error: DftiComputeForward failed\n");
        exit(1);
    }

    printf("Verify result of forward FFT\n");
    forward_ok = verify(x, N, H);
    /* The rate is only reported for a verified result. */
    if (0 == forward_ok)
    {
        flops = 5 * N * log2((double)N) / (time_end - time_start);
        printf("\t Forward: size = %d, GFlops = %.3f \n", N, flops/1000000000);
    }

    printf("Initialize input for backward transform\n");
    init(x, N, -H);

    printf("Offload backward FFT computation to the target\n");
    time_start = dsecnd();
    /*
     * SOLUTION: Offload the call to DftiComputeBackward to the target.
     *   (1) Make sure x is an 'inout' variable, because this is in-place
     *       transform.
     *   (2) Do not allocate memory for x because it was preallocated.
     *   (3) Do not free memory of x at this time.
     *   (4) What would be the 'out' variables?
     */
#pragma offload target(mic) inout(x:length(N) alloc_if(0) free_if(0)) \
    nocopy(handle) out(status)
    {
        status = DftiComputeBackward(handle, x);
    }
    time_end = dsecnd();
    if (status)
    {
        printf("Error: DftiComputeBackward failed\n");
        exit(1);
    }

    printf("Verify result of backward FFT\n");
    backward_ok = verify(x, N, H);
    if (0 == backward_ok)
    {
        flops = 5 * N * log2((double)N) / (time_end - time_start);
        printf("\t Backward: size = %d, GFlops = %.3f \n", N, flops/1000000000 );
    }

    printf("Destroy DFTI handle and free space on the target\n");
    /*
     * SOLUTION: Use offload pragma to deallocate memory of x on the target.
     *   (1) What would be 'in' variables?
     *   (2) Do the 'in' variables need to be copied in?
     */
#pragma offload target(mic) nocopy(x:length(N) alloc_if(0) free_if(1)) \
    nocopy(handle)
    {
        DftiFreeDescriptor(&handle);
    }

    printf("Free space on host\n");
    free(x);

    printf("TEST %s\n",0==forward_ok ? "FORWARD FFT PASSED" : "FORWARD FFT FAILED");
    printf("TEST %s\n",0==backward_ok ? "BACKWARD FFT PASSED" : "BACKWARD FFT FAILED");

    return 0;
}
/*
 * Set up a 1D complex-to-complex FFT of length nx on top of Intel MKL.
 *
 * Two descriptors are created and committed: inplace[0] (DFTI_INPLACE) and
 * ooplace[0] (DFTI_NOT_INPLACE).  All other handles stay NULL.
 *
 * pfft  - receives the new FFT object on success; set to NULL otherwise.
 * nx    - transform length.
 * flags - not read by this function.
 *
 * Returns 0 on success, EINVAL if pfft is NULL, ENOMEM if the object cannot
 * be allocated, or the non-zero MKL status of the first call that failed.
 */
int gmx_fft_init_1d(gmx_fft_t * pfft, int nx, enum gmx_fft_flag flags)
{
    gmx_fft_t      fft;
    int            d;
    int            status;

    if(pfft==NULL)
    {
        gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
        return EINVAL;
    }
    *pfft = NULL;

    if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
    {
        return ENOMEM;
    }

    /* Mark all handles invalid */
    for(d=0;d<3;d++)
    {
        fft->inplace[d] = fft->ooplace[d] = NULL;
    }
    fft->ooplace[3] = NULL;

    /* Give the work pointer a defined value before any failure path can
     * hand this object to gmx_fft_destroy().
     */
    fft->work = NULL;

    status = DftiCreateDescriptor(&fft->inplace[0],GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)nx);

    if( status == 0 )
        status = DftiSetValue(fft->inplace[0],DFTI_PLACEMENT,DFTI_INPLACE);

    if( status == 0 )
        status = DftiCommitDescriptor(fft->inplace[0]);

    if( status == 0 )
        status = DftiCreateDescriptor(&fft->ooplace[0],GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)nx);

    /* The results of these two calls must be captured as well; otherwise a
     * failure to configure or commit the out-of-place descriptor goes
     * unnoticed and an unusable handle is returned as if it were valid.
     */
    if( status == 0 )
        status = DftiSetValue(fft->ooplace[0],DFTI_PLACEMENT,DFTI_NOT_INPLACE);

    if( status == 0 )
        status = DftiCommitDescriptor(fft->ooplace[0]);

    if( status != 0 )
    {
        gmx_fatal(FARGS,"Error initializing Intel MKL FFT; status=%d",status);
        gmx_fft_destroy(fft);
        return status;
    }

    fft->ndim     = 1;
    fft->nx       = nx;
    fft->real_fft = 0;

    *pfft = fft;
    return 0;
}
int gmx_fft_init_3d_real(gmx_fft_t * pfft, int nx, int ny, int nz, enum gmx_fft_flag flags) { gmx_fft_t fft; int d; int status; MKL_LONG stride[2]; int nzc; if(pfft==NULL) { gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer."); return EINVAL; } *pfft = NULL; nzc = (nz/2 + 1); if( (fft = malloc(sizeof(struct gmx_fft))) == NULL) { return ENOMEM; } /* Mark all handles invalid */ for(d=0;d<3;d++) { fft->inplace[d] = fft->ooplace[d] = NULL; } fft->ooplace[3] = NULL; /* Roll our own 3D real transform using multiple transforms in MKL, * since the current MKL versions does not support our storage format * or 3D real transforms. */ /* In-place X FFT. * ny*nzc complex-to-complex transforms, length nx * transform distance: 1 * element strides: ny*nzc */ status = DftiCreateDescriptor(&fft->inplace[0],GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)nx); if ( status == 0) { stride[0] = 0; stride[1] = ny*nzc; status = (DftiSetValue(fft->inplace[0],DFTI_PLACEMENT,DFTI_INPLACE) || DftiSetValue(fft->inplace[0],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)ny*nzc) || DftiSetValue(fft->inplace[0],DFTI_INPUT_DISTANCE,1) || DftiSetValue(fft->inplace[0],DFTI_INPUT_STRIDES,stride) || DftiSetValue(fft->inplace[0],DFTI_OUTPUT_DISTANCE,1) || DftiSetValue(fft->inplace[0],DFTI_OUTPUT_STRIDES,stride) || DftiCommitDescriptor(fft->inplace[0])); } /* Out-of-place X FFT: * ny*nzc complex-to-complex transforms, length nx * transform distance: 1 * element strides: ny*nzc */ if( status == 0 ) status = DftiCreateDescriptor(&fft->ooplace[0],GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)nx); if( status == 0 ) { stride[0] = 0; stride[1] = ny*nzc; status = (DftiSetValue(fft->ooplace[0],DFTI_PLACEMENT,DFTI_NOT_INPLACE) || DftiSetValue(fft->ooplace[0],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)ny*nzc) || DftiSetValue(fft->ooplace[0],DFTI_INPUT_DISTANCE,1) || DftiSetValue(fft->ooplace[0],DFTI_INPUT_STRIDES,stride) || DftiSetValue(fft->ooplace[0],DFTI_OUTPUT_DISTANCE,1) || DftiSetValue(fft->ooplace[0],DFTI_OUTPUT_STRIDES,stride) || 
DftiCommitDescriptor(fft->ooplace[0])); } /* In-place Y FFT. * We cannot do all NX*NZC transforms at once, so define a handle to do * NZC transforms, and then execute it NX times. * nzc complex-to-complex transforms, length ny * transform distance: 1 * element strides: nzc */ if( status == 0 ) status = DftiCreateDescriptor(&fft->inplace[1],GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)ny); if( status == 0 ) { stride[0] = 0; stride[1] = nzc; status = (DftiSetValue(fft->inplace[1],DFTI_PLACEMENT,DFTI_INPLACE) || DftiSetValue(fft->inplace[1],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nzc) || DftiSetValue(fft->inplace[1],DFTI_INPUT_DISTANCE,1) || DftiSetValue(fft->inplace[1],DFTI_INPUT_STRIDES,stride) || DftiSetValue(fft->inplace[1],DFTI_OUTPUT_DISTANCE,1) || DftiSetValue(fft->inplace[1],DFTI_OUTPUT_STRIDES,stride) || DftiCommitDescriptor(fft->inplace[1])); } /* Out-of-place Y FFT: * We cannot do all NX*NZC transforms at once, so define a handle to do * NZC transforms, and then execute it NX times. * nzc complex-to-complex transforms, length ny * transform distance: 1 * element strides: nzc */ if( status == 0 ) status = DftiCreateDescriptor(&fft->ooplace[1],GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)ny); if( status == 0 ) { stride[0] = 0; stride[1] = nzc; status = (DftiSetValue(fft->ooplace[1],DFTI_PLACEMENT,DFTI_NOT_INPLACE) || DftiSetValue(fft->ooplace[1],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nzc) || DftiSetValue(fft->ooplace[1],DFTI_INPUT_DISTANCE,1) || DftiSetValue(fft->ooplace[1],DFTI_INPUT_STRIDES,stride) || DftiSetValue(fft->ooplace[1],DFTI_OUTPUT_DISTANCE,1) || DftiSetValue(fft->ooplace[1],DFTI_OUTPUT_STRIDES,stride) || DftiCommitDescriptor(fft->ooplace[1])); } /* In-place Z FFT: * nx*ny real-to-complex transforms, length nz * transform distance: nzc*2 -> nzc*2 * element strides: 1 */ if( status == 0 ) status = DftiCreateDescriptor(&fft->inplace[2],GMX_DFTI_PREC,DFTI_REAL,1,(MKL_LONG)nz); if( status == 0 ) { stride[0] = 0; stride[1] = 1; status = 
(DftiSetValue(fft->inplace[2],DFTI_PLACEMENT,DFTI_INPLACE) || DftiSetValue(fft->inplace[2],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nx*ny) || DftiSetValue(fft->inplace[2],DFTI_INPUT_DISTANCE,(MKL_LONG)nzc*2) || DftiSetValue(fft->inplace[2],DFTI_INPUT_STRIDES,stride) || DftiSetValue(fft->inplace[2],DFTI_OUTPUT_DISTANCE,(MKL_LONG)nzc*2) || DftiSetValue(fft->inplace[2],DFTI_OUTPUT_STRIDES,stride) || DftiCommitDescriptor(fft->inplace[2])); } /* Out-of-place real-to-complex (affects distance) Z FFT: * nx*ny real-to-complex transforms, length nz * transform distance: nz -> nzc*2 * element STRIDES: 1 */ if( status == 0 ) status = DftiCreateDescriptor(&fft->ooplace[2],GMX_DFTI_PREC,DFTI_REAL,1,(MKL_LONG)nz); if( status == 0 ) { stride[0] = 0; stride[1] = 1; status = (DftiSetValue(fft->ooplace[2],DFTI_PLACEMENT,DFTI_NOT_INPLACE) || DftiSetValue(fft->ooplace[2],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nx*ny) || DftiSetValue(fft->ooplace[2],DFTI_INPUT_DISTANCE,(MKL_LONG)nz) || DftiSetValue(fft->ooplace[2],DFTI_INPUT_STRIDES,stride) || DftiSetValue(fft->ooplace[2],DFTI_OUTPUT_DISTANCE,(MKL_LONG)nzc*2) || DftiSetValue(fft->ooplace[2],DFTI_OUTPUT_STRIDES,stride) || DftiCommitDescriptor(fft->ooplace[2])); } /* Out-of-place complex-to-real (affects distance) Z FFT: * nx*ny real-to-complex transforms, length nz * transform distance: nzc*2 -> nz * element STRIDES: 1 */ if( status == 0 ) status = DftiCreateDescriptor(&fft->ooplace[3],GMX_DFTI_PREC,DFTI_REAL,1,(MKL_LONG)nz); if( status == 0 ) { stride[0] = 0; stride[1] = 1; status = (DftiSetValue(fft->ooplace[3],DFTI_PLACEMENT,DFTI_NOT_INPLACE) || DftiSetValue(fft->ooplace[3],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nx*ny) || DftiSetValue(fft->ooplace[3],DFTI_INPUT_DISTANCE,(MKL_LONG)nzc*2) || DftiSetValue(fft->ooplace[3],DFTI_INPUT_STRIDES,stride) || DftiSetValue(fft->ooplace[3],DFTI_OUTPUT_DISTANCE,(MKL_LONG)nz) || DftiSetValue(fft->ooplace[3],DFTI_OUTPUT_STRIDES,stride) || DftiCommitDescriptor(fft->ooplace[3])); } if ( status == 0 ) { if 
((fft->work = malloc(sizeof(t_complex)*(nx*ny*(nz/2+1)))) == NULL) { status = ENOMEM; } } if( status != 0 ) { gmx_fatal(FARGS,"Error initializing Intel MKL FFT; status=%d",status); gmx_fft_destroy(fft); return status; } fft->ndim = 3; fft->nx = nx; fft->ny = ny; fft->nz = nz; fft->real_fft = 1; *pfft = fft; return 0; }
int gmx_fft_init_2d_real(gmx_fft_t * pfft, int nx, int ny, enum gmx_fft_flag flags) { gmx_fft_t fft; int d; int status; MKL_LONG stride[2]; MKL_LONG nyc; if(pfft==NULL) { gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer."); return EINVAL; } *pfft = NULL; if( (fft = malloc(sizeof(struct gmx_fft))) == NULL) { return ENOMEM; } nyc = (ny/2 + 1); /* Mark all handles invalid */ for(d=0;d<3;d++) { fft->inplace[d] = fft->ooplace[d] = NULL; } fft->ooplace[3] = NULL; /* Roll our own 2D real transform using multiple transforms in MKL, * since the current MKL versions does not support our storage format, * and all but the most recent don't even have 2D real FFTs. */ /* In-place X FFT */ status = DftiCreateDescriptor(&fft->inplace[0],GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)nx); if ( status == 0 ) { stride[0] = 0; stride[1] = nyc; status = (DftiSetValue(fft->inplace[0],DFTI_PLACEMENT,DFTI_INPLACE) || DftiSetValue(fft->inplace[0],DFTI_NUMBER_OF_TRANSFORMS,nyc) || DftiSetValue(fft->inplace[0],DFTI_INPUT_DISTANCE,1) || DftiSetValue(fft->inplace[0],DFTI_INPUT_STRIDES,stride) || DftiSetValue(fft->inplace[0],DFTI_OUTPUT_DISTANCE,1) || DftiSetValue(fft->inplace[0],DFTI_OUTPUT_STRIDES,stride)); } if( status == 0 ) status = DftiCommitDescriptor(fft->inplace[0]); /* Out-of-place X FFT */ if( status == 0 ) status = DftiCreateDescriptor(&(fft->ooplace[0]),GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)nx); if( status == 0 ) { stride[0] = 0; stride[1] = nyc; status = (DftiSetValue(fft->ooplace[0],DFTI_PLACEMENT,DFTI_NOT_INPLACE) || DftiSetValue(fft->ooplace[0],DFTI_NUMBER_OF_TRANSFORMS,nyc) || DftiSetValue(fft->ooplace[0],DFTI_INPUT_DISTANCE,1) || DftiSetValue(fft->ooplace[0],DFTI_INPUT_STRIDES,stride) || DftiSetValue(fft->ooplace[0],DFTI_OUTPUT_DISTANCE,1) || DftiSetValue(fft->ooplace[0],DFTI_OUTPUT_STRIDES,stride)); } if( status == 0 ) status = DftiCommitDescriptor(fft->ooplace[0]); /* In-place Y FFT */ if( status == 0 ) status = 
DftiCreateDescriptor(&fft->inplace[1],GMX_DFTI_PREC,DFTI_REAL,1,(MKL_LONG)ny); if( status == 0 ) { stride[0] = 0; stride[1] = 1; status = (DftiSetValue(fft->inplace[1],DFTI_PLACEMENT,DFTI_INPLACE) || DftiSetValue(fft->inplace[1],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nx) || DftiSetValue(fft->inplace[1],DFTI_INPUT_DISTANCE,2*nyc) || DftiSetValue(fft->inplace[1],DFTI_INPUT_STRIDES,stride) || DftiSetValue(fft->inplace[1],DFTI_OUTPUT_DISTANCE,2*nyc) || DftiSetValue(fft->inplace[1],DFTI_OUTPUT_STRIDES,stride) || DftiCommitDescriptor(fft->inplace[1])); } /* Out-of-place real-to-complex (affects output distance) Y FFT */ if( status == 0 ) status = DftiCreateDescriptor(&fft->ooplace[1],GMX_DFTI_PREC,DFTI_REAL,1,(MKL_LONG)ny); if( status == 0 ) { stride[0] = 0; stride[1] = 1; status = (DftiSetValue(fft->ooplace[1],DFTI_PLACEMENT,DFTI_NOT_INPLACE) || DftiSetValue(fft->ooplace[1],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nx) || DftiSetValue(fft->ooplace[1],DFTI_INPUT_DISTANCE,(MKL_LONG)ny) || DftiSetValue(fft->ooplace[1],DFTI_INPUT_STRIDES,stride) || DftiSetValue(fft->ooplace[1],DFTI_OUTPUT_DISTANCE,2*nyc) || DftiSetValue(fft->ooplace[1],DFTI_OUTPUT_STRIDES,stride) || DftiCommitDescriptor(fft->ooplace[1])); } /* Out-of-place complex-to-real (affects output distance) Y FFT */ if( status == 0 ) status = DftiCreateDescriptor(&fft->ooplace[2],GMX_DFTI_PREC,DFTI_REAL,1,(MKL_LONG)ny); if( status == 0 ) { stride[0] = 0; stride[1] = 1; status = (DftiSetValue(fft->ooplace[2],DFTI_PLACEMENT,DFTI_NOT_INPLACE) || DftiSetValue(fft->ooplace[2],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nx) || DftiSetValue(fft->ooplace[2],DFTI_INPUT_DISTANCE,2*nyc) || DftiSetValue(fft->ooplace[2],DFTI_INPUT_STRIDES,stride) || DftiSetValue(fft->ooplace[2],DFTI_OUTPUT_DISTANCE,(MKL_LONG)ny) || DftiSetValue(fft->ooplace[2],DFTI_OUTPUT_STRIDES,stride) || DftiCommitDescriptor(fft->ooplace[2])); } if ( status == 0 ) { if ((fft->work = malloc(sizeof(t_complex)*(nx*(ny/2+1)))) == NULL) { status = ENOMEM; } } if( status != 0 ) { 
gmx_fatal(FARGS,"Error initializing Intel MKL FFT; status=%d",status); gmx_fft_destroy(fft); return status; } fft->ndim = 2; fft->nx = nx; fft->ny = ny; fft->real_fft = 1; *pfft = fft; return 0; }
int bi_entry(void * mdpv, int iproblemsize, double * dresults) { /* dstart, dend: the start and end time of the measurement */ /* dtime: the time for a single measurement in seconds */ double dstart = 0.0, dend = 0.0, dtime = 0.0, dinit = 0.0; /* flops stores the calculated FLOPS */ double flops = 0.0; /* ii is used for loop iterations */ myinttype ii, jj, imyproblemsize, numberOfRuns; /* cast void* pointer */ mydata_t* pmydata = (mydata_t*)mdpv; int invalid = 0; long status; /* calculate real problemsize */ imyproblemsize = (int)pow(2, (log2(pmydata->min) + (myinttype)iproblemsize - 1)); /* store the value for the x axis in results[0] */ dresults[0] = (double)imyproblemsize; /*** in place run ***/ /* malloc */ pmydata->inout = (float*)malloc(sizeof(float) * imyproblemsize * 2); /* create FFT plan */ status = DftiCreateDescriptor(&pmydata->my_desc_handle, DFTI_SINGLE, DFTI_COMPLEX, 1, imyproblemsize); status = DftiCommitDescriptor(pmydata->my_desc_handle); /* init stuff */ initData_ip(pmydata, imyproblemsize); numberOfRuns = 1; dstart = bi_gettime(); /* fft calculation */ status = DftiComputeForward(pmydata->my_desc_handle, pmydata->inout); dend = bi_gettime(); /* calculate the used time*/ dtime = dend - dstart; dtime -= dTimerOverhead; /* loop calculation if accuracy is insufficient */ while (dtime < 100 * dTimerGranularity) { numberOfRuns = numberOfRuns * 2; dstart = bi_gettime(); for (jj = 0; jj < numberOfRuns; jj++) { /* fft calculation */ status = DftiComputeForward(pmydata->my_desc_handle, pmydata->inout); } dend = bi_gettime(); dtime = dend - dstart; dtime -= dTimerOverhead; } /* check for overflows */ for (ii = 0; ii < imyproblemsize; ii++) { if (isnan(pmydata->inout[2 * ii]) || isnan(pmydata->inout[2 * ii + 1])) invalid = 1; if (isinf(pmydata->inout[2 * ii]) || isinf(pmydata->inout[2 * ii + 1])) invalid = 1; } /* if loop was necessary */ if (numberOfRuns > 1) dtime = dtime / numberOfRuns; /* calculate the used FLOPS */ flops = (double)(5.0 * imyproblemsize 
* (log2(1.0 * imyproblemsize)) / dtime); /* store the FLOPS in results[1] */ if (invalid == 1) dresults[1] = INVALID_MEASUREMENT; else dresults[1] = flops; status = DftiFreeDescriptor(&pmydata->my_desc_handle); /* free data */ free(pmydata->inout); /*** out of place run ***/ /* malloc */ pmydata->in = (float*)malloc(sizeof(float) * imyproblemsize * 2); pmydata->out = (float*)malloc(sizeof(float) * imyproblemsize * 2); /* create FFT plan */ status = DftiCreateDescriptor(&pmydata->my_desc_handle, DFTI_SINGLE, DFTI_COMPLEX, 1, imyproblemsize); status = DftiSetValue(pmydata->my_desc_handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE); status = DftiCommitDescriptor(pmydata->my_desc_handle); /* init stuff */ initData_oop(pmydata, imyproblemsize); numberOfRuns = 1; dstart = bi_gettime(); /* fft calculation */ status = DftiComputeForward(pmydata->my_desc_handle, pmydata->in, pmydata->out); dend = bi_gettime(); /* calculate the used time*/ dtime = dend - dstart; dtime -= dTimerOverhead; /* loop calculation if accuracy is insufficient */ while (dtime < 100 * dTimerGranularity) { numberOfRuns = numberOfRuns * 2; dstart = bi_gettime(); for (ii = 0; ii < numberOfRuns; ii++) { /* fft calculation */ status = DftiComputeForward(pmydata->my_desc_handle, pmydata->in, pmydata->out); } dend = bi_gettime(); /* calculate the used time*/ dtime = dend - dstart; dtime -= dTimerOverhead; } /* if loop was necessary */ if (numberOfRuns > 1) dtime = dtime / numberOfRuns; /* check for overflows */ for (ii = 0; ii < imyproblemsize; ii++) { if (isnan(pmydata->out[2 * ii]) || isnan(pmydata->out[2 * ii + 1])) invalid = 1; if (isinf(pmydata->out[2 * ii]) || isinf(pmydata->out[2 * ii + 1])) invalid = 1; } /* calculate the used FLOPS */ flops = (double)(5.0 * imyproblemsize * (log2(1.0 * imyproblemsize)) / dtime); /* store the FLOPS in results[2] */ if (invalid == 1) dresults[2] = INVALID_MEASUREMENT; else dresults[2] = flops; status = DftiFreeDescriptor(&pmydata->my_desc_handle); /* free data */ 
free(pmydata->in); free(pmydata->out); return 0; }
/*
 * In-place complex-to-complex FFT of n2 vectors of length n1.
 *
 * data - n2 vectors stored ld1 complex elements apart.
 * n1   - length of each transform.
 * n2   - number of transforms.
 * ld1  - leading dimension (distance between consecutive vectors).
 * sign - in the MKL branch a negative value selects DftiComputeBackward and
 *        any other value DftiComputeForward; the other branches forward it
 *        to their library routine unchanged.
 *
 * The backend is chosen at compile time (HAVE_LIBSCS, ACML440, MKL, or the
 * built-in ccm_fft).  The library backends cache their setup for the most
 * recently used n1; the MKL backend keeps one descriptor per OpenMP thread.
 *
 * MKL branch: if the thread id does not fit the per-thread tables, or a
 * descriptor cannot be created or committed, a message is printed and the
 * data is left untransformed.  A failed setup is not cached, so the next
 * call tries again.
 */
void ccmfft(complex *data, int n1, int n2, int ld1, int sign)
{
#if defined(HAVE_LIBSCS)
	int ntable, nwork, zero=0;
	static int isys, nprev=0;
	static float *work, *table, scale=1.0;
#elif defined(ACML440)
	static int nprev=0;
	int nwork, zero=0, one=1, i, j, inpl;
	static int isys;
	static complex *work;
	REAL scl;
	/* NOTE(review): y is passed to acmlccmfft without being set; this
	 * presumably relies on inpl=1 meaning the routine never uses it --
	 * confirm against the acmlccmfft wrapper. */
	complex *y;
#elif defined(MKL)
	static DFTI_DESCRIPTOR_HANDLE handle[MAX_NUMTHREADS];
	static int nprev[MAX_NUMTHREADS];
	MKL_LONG Status;
	int j;
#endif
	int id;

#ifdef _OPENMP
	id = omp_get_thread_num();
#else
	id = 0;
#endif

#if defined(HAVE_LIBSCS)
	if (n1 != nprev) {
		/* (Re)build the work and table arrays for the new length */
		isys   = 0;
		ntable = 2*n1 + 30;
		nwork  = 2*n1;
		if (work) free(work);
		work = (float *)malloc(nwork*sizeof(float));
		if (work == NULL) fprintf(stderr,"ccmfft: memory allocation error\n");
		if (table) free(table);
		table = (float *)malloc(ntable*sizeof(float));
		if (table == NULL) fprintf(stderr,"ccmfft: memory allocation error\n");
		ccfftm_(&zero, &n1, &n2, &scale, data, &ld1, data, &ld1, table, work, &isys);
		nprev = n1;
	}
	ccfftm_(&sign, &n1, &n2, &scale, data, &ld1, data, &ld1, table, work, &isys);
#elif defined(ACML440)
	scl = 1.0;
	inpl = 1;
	if (n1 != nprev) {
		/* (Re)build the work array for the new length */
		isys   = 0;
		nwork  = 5*n1 + 100;
		if (work) free(work);
		work = (complex *)malloc(nwork*sizeof(complex));
		if (work == NULL) fprintf(stderr,"rc1fft: memory allocation error\n");
		acmlccmfft(zero, scl, inpl, n2, n1, data, 1, ld1, y, 1, ld1, work, &isys);
		nprev = n1;
	}
	acmlccmfft(sign, scl, inpl, n2, n1, data, 1, ld1, y, 1, ld1, work, &isys);
#elif defined(MKL)
	/* The per-thread tables have MAX_NUMTHREADS entries; never index past them. */
	if (id < 0 || id >= MAX_NUMTHREADS) {
		fprintf(stderr,"ccmfft: thread id %d outside 0..MAX_NUMTHREADS-1\n", id);
		return;
	}

	if (n1 != nprev[id]) {
		/* Release the descriptor of the previous length, if there is one. */
		if (handle[id] != NULL) {
			DftiFreeDescriptor(&handle[id]);
			handle[id] = NULL;
		}
		/* Forget the cached length until the new descriptor is usable. */
		nprev[id] = 0;

		Status = DftiCreateDescriptor(&handle[id], DFTI_SINGLE, DFTI_COMPLEX, 1, (MKL_LONG)n1);
		if(! DftiErrorClass(Status, DFTI_NO_ERROR)){
			dfti_status_print(Status);
			printf(" DftiCreateDescriptor FAIL\n");
			handle[id] = NULL;
			return;
		}
		Status = DftiCommitDescriptor(handle[id]);
		if(! DftiErrorClass(Status, DFTI_NO_ERROR)){
			dfti_status_print(Status);
			printf(" DftiCommitDescriptor FAIL\n");
			DftiFreeDescriptor(&handle[id]);
			handle[id] = NULL;
			return;
		}
		nprev[id] = n1;
	}

	if (sign < 0) {
		for (j=0; j<n2; j++) {
			Status = DftiComputeBackward(handle[id], &data[j*ld1]);
			if(! DftiErrorClass(Status, DFTI_NO_ERROR)){
				dfti_status_print(Status);
				printf(" DftiComputeBackward FAIL\n");
				break;
			}
		}
	}
	else {
		for (j=0; j<n2; j++) {
			Status = DftiComputeForward(handle[id], &data[j*ld1]);
			if(! DftiErrorClass(Status, DFTI_NO_ERROR)){
				dfti_status_print(Status);
				printf(" DftiComputeForward FAIL\n");
				break;
			}
		}
	}
#else
	ccm_fft(data, n1, n2, ld1, sign);
#endif

	return;
}
/* ----------------------------------------------------------------------
   Create a plan for a 3d FFT distributed over the procs of an MPI
   communicator.

   Arguments:
   comm                 MPI communicator of the procs that own the data
   nfast,nmid,nslow     global grid size along the fast/mid/slow axes
   in_ilo,in_ihi        inclusive bounds of the input I own, fast axis
   in_jlo,in_jhi        inclusive bounds of the input I own, mid axis
   in_klo,in_khi        inclusive bounds of the input I own, slow axis
   out_ilo,out_ihi      inclusive bounds of the output I own, fast axis
   out_jlo,out_jhi      inclusive bounds of the output I own, mid axis
   out_klo,out_khi      inclusive bounds of the output I own, slow axis
   scaled               0 = result is not rescaled, else rescale it
                        (the ACML, INTEL and DEC branches set the plan's
                        scaled/norm fields differently, see below)
   permute              storage-order permutation of the output; the value
                        2 allows the last remap(s) to be skipped and
                        (permute+1)%3 is handed to the final remap
                        NOTE(review): exact meaning of 0/1/2 is defined by
                        remap_3d_create_plan - confirm there
   nbuf                 on return, copy_size + scratch_size, i.e. the
                        number of FFT_DATA elements allocated internally

   Returns the new plan, or NULL on failure.

   The normalization factor is formed in double precision so that the
   product nfast*nmid*nslow cannot overflow int for large global grids.

   NOTE(review): every "return NULL" after the initial malloc leaks the
   partially built plan (and any remap plans / buffers already created).
------------------------------------------------------------------------- */

struct fft_plan_3d *fft_3d_create_plan(
       MPI_Comm comm, int nfast, int nmid, int nslow,
       int in_ilo, int in_ihi, int in_jlo, int in_jhi,
       int in_klo, int in_khi,
       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
       int out_klo, int out_khi,
       int scaled, int permute, int *nbuf)
{
  struct fft_plan_3d *plan;
  int me,nprocs;
  int i,num,flag,remapflag,fftflag;
  int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
  int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
  int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
  int out_size,first_size,second_size,third_size,copy_size,scratch_size;
  int np1,np2,ip1,ip2;
  int list[50];

  // system specific variables

#ifdef FFT_SCSL
  FFT_DATA dummy_d[5];
  FFT_PREC dummy_p[5];
  int isign,isys;
  FFT_PREC scalef;
#endif
#ifdef FFT_INTEL
  FFT_DATA dummy;
#endif
#ifdef FFT_T3E
  FFT_DATA dummy[5];
  int isign,isys;
  double scalef;
#endif

  // query MPI info

  MPI_Comm_rank(comm,&me);
  MPI_Comm_size(comm,&nprocs);

  // compute division of procs in 2 dimensions not on-processor

  bifactor(nprocs,&np1,&np2);
  ip1 = me % np1;
  ip2 = me/np1;

  // allocate memory for plan data struct

  plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d));
  if (plan == NULL) return NULL;

  // remap from initial distribution to layout needed for 1st set of 1d FFTs
  // not needed if all procs own entire fast axis initially
  // first indices = distribution after 1st set of FFTs

  if (in_ilo == 0 && in_ihi == nfast-1)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    first_ilo = in_ilo;
    first_ihi = in_ihi;
    first_jlo = in_jlo;
    first_jhi = in_jhi;
    first_klo = in_klo;
    first_khi = in_khi;
    plan->pre_plan = NULL;
  }
  else {
    first_ilo = 0;
    first_ihi = nfast - 1;
    first_jlo = ip1*nmid/np1;
    first_jhi = (ip1+1)*nmid/np1 - 1;
    first_klo = ip2*nslow/np2;
    first_khi = (ip2+1)*nslow/np2 - 1;
    plan->pre_plan =
      remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,2,0,0,FFT_PRECISION);
    if (plan->pre_plan == NULL) return NULL;
  }

  // 1d FFTs along fast axis

  plan->length1 = nfast;
  plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1);

  // remap from 1st to 2nd FFT
  // choose which axis is split over np1 vs np2 to minimize communication
  // second indices = distribution after 2nd set of FFTs

  second_ilo = ip1*nfast/np1;
  second_ihi = (ip1+1)*nfast/np1 - 1;
  second_jlo = 0;
  second_jhi = nmid - 1;
  second_klo = ip2*nslow/np2;
  second_khi = (ip2+1)*nslow/np2 - 1;
  plan->mid1_plan =
      remap_3d_create_plan(comm,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,
                           second_ilo,second_ihi,second_jlo,second_jhi,
                           second_klo,second_khi,2,1,0,FFT_PRECISION);
  if (plan->mid1_plan == NULL) return NULL;

  // 1d FFTs along mid axis

  plan->length2 = nmid;
  plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1);

  // remap from 2nd to 3rd FFT
  // if final distribution is permute=2 with all procs owning entire slow axis
  //   then this remapping goes directly to final distribution
  // third indices = distribution after 3rd set of FFTs

  if (permute == 2 && out_klo == 0 && out_khi == nslow-1)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    third_ilo = out_ilo;
    third_ihi = out_ihi;
    third_jlo = out_jlo;
    third_jhi = out_jhi;
    third_klo = out_klo;
    third_khi = out_khi;
  }
  else {
    third_ilo = ip1*nfast/np1;
    third_ihi = (ip1+1)*nfast/np1 - 1;
    third_jlo = ip2*nmid/np2;
    third_jhi = (ip2+1)*nmid/np2 - 1;
    third_klo = 0;
    third_khi = nslow - 1;
  }

  plan->mid2_plan =
    remap_3d_create_plan(comm,
                         second_jlo,second_jhi,second_klo,second_khi,
                         second_ilo,second_ihi,
                         third_jlo,third_jhi,third_klo,third_khi,
                         third_ilo,third_ihi,2,1,0,FFT_PRECISION);
  if (plan->mid2_plan == NULL) return NULL;

  // 1d FFTs along slow axis

  plan->length3 = nslow;
  plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow;

  // remap from 3rd FFT to final distribution
  //  not needed if permute = 2 and third indices = out indices on all procs

  if (permute == 2 &&
      out_ilo == third_ilo && out_ihi == third_ihi &&
      out_jlo == third_jlo && out_jhi == third_jhi &&
      out_klo == third_klo && out_khi == third_khi)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0)
    plan->post_plan = NULL;
  else {
    plan->post_plan =
      remap_3d_create_plan(comm,
                           third_klo,third_khi,third_ilo,third_ihi,
                           third_jlo,third_jhi,
                           out_klo,out_khi,out_ilo,out_ihi,
                           out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION);
    if (plan->post_plan == NULL) return NULL;
  }

  // configure plan memory pointers and allocate work space
  // out_size = amount of memory given to FFT by user
  // first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps
  // copy_size = amount needed internally for extra copy of data
  // scratch_size = amount needed internally for remap scratch space
  // for each remap:
  //   out space used for result if big enough, else require copy buffer
  //   accumulate largest required remap scratch space

  out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1);
  first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
    (first_khi-first_klo+1);
  second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
    (second_khi-second_klo+1);
  third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
    (third_khi-third_klo+1);

  copy_size = 0;
  scratch_size = 0;

  if (plan->pre_plan) {
    if (first_size <= out_size)
      plan->pre_target = 0;
    else {
      plan->pre_target = 1;
      copy_size = MAX(copy_size,first_size);
    }
    scratch_size = MAX(scratch_size,first_size);
  }

  if (plan->mid1_plan) {
    if (second_size <= out_size)
      plan->mid1_target = 0;
    else {
      plan->mid1_target = 1;
      copy_size = MAX(copy_size,second_size);
    }
    scratch_size = MAX(scratch_size,second_size);
  }

  if (plan->mid2_plan) {
    if (third_size <= out_size)
      plan->mid2_target = 0;
    else {
      plan->mid2_target = 1;
      copy_size = MAX(copy_size,third_size);
    }
    scratch_size = MAX(scratch_size,third_size);
  }

  if (plan->post_plan)
    scratch_size = MAX(scratch_size,out_size);

  *nbuf = copy_size + scratch_size;

  if (copy_size) {
    plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA));
    if (plan->copy == NULL) return NULL;
  }
  else plan->copy = NULL;

  if (scratch_size) {
    plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA));
    if (plan->scratch == NULL) return NULL;
  }
  else plan->scratch = NULL;

  // system specific pre-computation of 1d FFT coeffs
  // and scaling normalization
  // the grid-size product is always formed in double: nfast*nmid*nslow
  //   evaluated in int overflows for large global grids

#if defined(FFT_SGI)

  plan->coeff1 = (FFT_DATA *) malloc((nfast+15)*sizeof(FFT_DATA));
  plan->coeff2 = (FFT_DATA *) malloc((nmid+15)*sizeof(FFT_DATA));
  plan->coeff3 = (FFT_DATA *) malloc((nslow+15)*sizeof(FFT_DATA));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) return NULL;

  FFT_1D_INIT(nfast,plan->coeff1);
  FFT_1D_INIT(nmid,plan->coeff2);
  FFT_1D_INIT(nslow,plan->coeff3);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_SCSL)

  plan->coeff1 = (FFT_PREC *) malloc((2*nfast+30)*sizeof(FFT_PREC));
  plan->coeff2 = (FFT_PREC *) malloc((2*nmid+30)*sizeof(FFT_PREC));
  plan->coeff3 = (FFT_PREC *) malloc((2*nslow+30)*sizeof(FFT_PREC));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) return NULL;

  plan->work1 = (FFT_PREC *) malloc((2*nfast)*sizeof(FFT_PREC));
  plan->work2 = (FFT_PREC *) malloc((2*nmid)*sizeof(FFT_PREC));
  plan->work3 = (FFT_PREC *) malloc((2*nslow)*sizeof(FFT_PREC));

  if (plan->work1 == NULL || plan->work2 == NULL ||
      plan->work3 == NULL) return NULL;

  isign = 0;
  scalef = 1.0;
  isys = 0;

  FFT_1D_INIT(isign,nfast,scalef,dummy_d,dummy_d,plan->coeff1,dummy_p,&isys);
  FFT_1D_INIT(isign,nmid,scalef,dummy_d,dummy_d,plan->coeff2,dummy_p,&isys);
  FFT_1D_INIT(isign,nslow,scalef,dummy_d,dummy_d,plan->coeff3,dummy_p,&isys);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_ACML)

  plan->coeff1 = (FFT_DATA *) malloc((3*nfast+100)*sizeof(FFT_DATA));
  plan->coeff2 = (FFT_DATA *) malloc((3*nmid+100)*sizeof(FFT_DATA));
  plan->coeff3 = (FFT_DATA *) malloc((3*nslow+100)*sizeof(FFT_DATA));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) return NULL;

  int isign = 100;
  int isys = 1;
  int info = 0;
  FFT_DATA *dummy = NULL;

  FFT_1D(&isign,&isys,&nfast,dummy,plan->coeff1,&info);
  FFT_1D(&isign,&isys,&nmid,dummy,plan->coeff2,&info);
  FFT_1D(&isign,&isys,&nslow,dummy,plan->coeff3,&info);

  // norm and normnum are set for both settings of scaled in this branch

  if (scaled == 0) {
    plan->scaled = 0;
    plan->norm = sqrt((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  } else {
    plan->scaled = 1;
    plan->norm = sqrt((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_INTEL)

  // only grid sizes whose factors are all 2, 3 or 5 are accepted

  flag = 0;

  num = 0;
  factor(nfast,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;
  num = 0;
  factor(nmid,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;
  num = 0;
  factor(nslow,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;

  MPI_Allreduce(&flag,&fftflag,1,MPI_INT,MPI_MAX,comm);
  if (fftflag) {
    if (me == 0) printf("ERROR: FFTs are not power of 2,3,5\n");
    return NULL;
  }

  plan->coeff1 = (FFT_DATA *) malloc((3*nfast/2+1)*sizeof(FFT_DATA));
  plan->coeff2 = (FFT_DATA *) malloc((3*nmid/2+1)*sizeof(FFT_DATA));
  plan->coeff3 = (FFT_DATA *) malloc((3*nslow/2+1)*sizeof(FFT_DATA));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) return NULL;

  flag = 0;
  FFT_1D_INIT(&dummy,&nfast,&flag,plan->coeff1);
  FFT_1D_INIT(&dummy,&nmid,&flag,plan->coeff2);
  FFT_1D_INIT(&dummy,&nslow,&flag,plan->coeff3);

  // note the inverted sense: rescaling is enabled when scaled == 0

  if (scaled == 0) {
    plan->scaled = 1;
    plan->norm = (double) nfast*nmid*nslow;
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }
  else
    plan->scaled = 0;

#elif defined(FFT_MKL)

  // one batched, in-place descriptor per axis

  DftiCreateDescriptor( &(plan->handle_fast), FFT_MKL_PREC, DFTI_COMPLEX, 1,
                        (MKL_LONG)nfast);
  DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_TRANSFORMS,
               (MKL_LONG)plan->total1/nfast);
  DftiSetValue(plan->handle_fast, DFTI_PLACEMENT,DFTI_INPLACE);
  DftiSetValue(plan->handle_fast, DFTI_INPUT_DISTANCE, (MKL_LONG)nfast);
  DftiSetValue(plan->handle_fast, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nfast);
  DftiCommitDescriptor(plan->handle_fast);

  DftiCreateDescriptor( &(plan->handle_mid), FFT_MKL_PREC, DFTI_COMPLEX, 1,
                        (MKL_LONG)nmid);
  DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_TRANSFORMS,
               (MKL_LONG)plan->total2/nmid);
  DftiSetValue(plan->handle_mid, DFTI_PLACEMENT,DFTI_INPLACE);
  DftiSetValue(plan->handle_mid, DFTI_INPUT_DISTANCE, (MKL_LONG)nmid);
  DftiSetValue(plan->handle_mid, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nmid);
  DftiCommitDescriptor(plan->handle_mid);

  DftiCreateDescriptor( &(plan->handle_slow), FFT_MKL_PREC, DFTI_COMPLEX, 1,
                        (MKL_LONG)nslow);
  DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_TRANSFORMS,
               (MKL_LONG)plan->total3/nslow);
  DftiSetValue(plan->handle_slow, DFTI_PLACEMENT,DFTI_INPLACE);
  DftiSetValue(plan->handle_slow, DFTI_INPUT_DISTANCE, (MKL_LONG)nslow);
  DftiSetValue(plan->handle_slow, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nslow);
  DftiCommitDescriptor(plan->handle_slow);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_DEC)

  // note the inverted sense: rescaling is enabled when scaled == 0

  if (scaled == 0) {
    plan->scaled = 1;
    plan->norm = (double) nfast*nmid*nslow;
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }
  else
    plan->scaled = 0;

#elif defined(FFT_T3E)

  plan->coeff1 = (double *) malloc((12*nfast)*sizeof(double));
  plan->coeff2 = (double *) malloc((12*nmid)*sizeof(double));
  plan->coeff3 = (double *) malloc((12*nslow)*sizeof(double));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) return NULL;

  plan->work1 = (double *) malloc((8*nfast)*sizeof(double));
  plan->work2 = (double *) malloc((8*nmid)*sizeof(double));
  plan->work3 = (double *) malloc((8*nslow)*sizeof(double));

  if (plan->work1 == NULL || plan->work2 == NULL ||
      plan->work3 == NULL) return NULL;

  isign = 0;
  scalef = 1.0;
  isys = 0;

  FFT_1D_INIT(&isign,&nfast,&scalef,dummy,dummy,plan->coeff1,dummy,&isys);
  FFT_1D_INIT(&isign,&nmid,&scalef,dummy,dummy,plan->coeff2,dummy,&isys);
  FFT_1D_INIT(&isign,&nslow,&scalef,dummy,dummy,plan->coeff3,dummy,&isys);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_FFTW2)

  // plans are shared between axes of equal length

  plan->plan_fast_forward =
    fftw_create_plan(nfast,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  plan->plan_fast_backward =
    fftw_create_plan(nfast,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);

  if (nmid == nfast) {
    plan->plan_mid_forward = plan->plan_fast_forward;
    plan->plan_mid_backward = plan->plan_fast_backward;
  }
  else {
    plan->plan_mid_forward =
      fftw_create_plan(nmid,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
    plan->plan_mid_backward =
      fftw_create_plan(nmid,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  }

  if (nslow == nfast) {
    plan->plan_slow_forward = plan->plan_fast_forward;
    plan->plan_slow_backward = plan->plan_fast_backward;
  }
  else if (nslow == nmid) {
    plan->plan_slow_forward = plan->plan_mid_forward;
    plan->plan_slow_backward = plan->plan_mid_backward;
  }
  else {
    plan->plan_slow_forward =
      fftw_create_plan(nslow,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
    plan->plan_slow_backward =
      fftw_create_plan(nslow,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  }

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_FFTW3)

  plan->plan_fast_forward =
    FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
                            NULL,&nfast,1,plan->length1,
                            NULL,&nfast,1,plan->length1,
                            FFTW_FORWARD,FFTW_ESTIMATE);
  plan->plan_fast_backward =
    FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
                            NULL,&nfast,1,plan->length1,
                            NULL,&nfast,1,plan->length1,
                            FFTW_BACKWARD,FFTW_ESTIMATE);
  plan->plan_mid_forward =
    FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2,
                            NULL,&nmid,1,plan->length2,
                            NULL,&nmid,1,plan->length2,
                            FFTW_FORWARD,FFTW_ESTIMATE);
  plan->plan_mid_backward =
    FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2,
                            NULL,&nmid,1,plan->length2,
                            NULL,&nmid,1,plan->length2,
                            FFTW_BACKWARD,FFTW_ESTIMATE);
  plan->plan_slow_forward =
    FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3,
                            NULL,&nslow,1,plan->length3,
                            NULL,&nslow,1,plan->length3,
                            FFTW_FORWARD,FFTW_ESTIMATE);
  plan->plan_slow_backward =
    FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3,
                            NULL,&nslow,1,plan->length3,
                            NULL,&nslow,1,plan->length3,
                            FFTW_BACKWARD,FFTW_ESTIMATE);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#else

  // default backend: KISS FFT, configs shared between axes of equal length

  plan->cfg_fast_forward = kiss_fft_alloc(nfast,0,NULL,NULL);
  plan->cfg_fast_backward = kiss_fft_alloc(nfast,1,NULL,NULL);

  if (nmid == nfast) {
    plan->cfg_mid_forward = plan->cfg_fast_forward;
    plan->cfg_mid_backward = plan->cfg_fast_backward;
  }
  else {
    plan->cfg_mid_forward = kiss_fft_alloc(nmid,0,NULL,NULL);
    plan->cfg_mid_backward = kiss_fft_alloc(nmid,1,NULL,NULL);
  }

  if (nslow == nfast) {
    plan->cfg_slow_forward = plan->cfg_fast_forward;
    plan->cfg_slow_backward = plan->cfg_fast_backward;
  }
  else if (nslow == nmid) {
    plan->cfg_slow_forward = plan->cfg_mid_forward;
    plan->cfg_slow_backward = plan->cfg_mid_backward;
  }
  else {
    plan->cfg_slow_forward = kiss_fft_alloc(nslow,0,NULL,NULL);
    plan->cfg_slow_backward = kiss_fft_alloc(nslow,1,NULL,NULL);
  }

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/((double) nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#endif

  return plan;
}
/*
 * Set up a 1D real-data FFT of length nx on top of Intel MKL DFTI.
 *
 * pfft  : out; receives the newly allocated FFT object on success and is
 *         set to NULL on every failure after the argument check
 * nx    : transform length
 * flags : unused by this implementation
 *
 * Two descriptors are created and committed: inplace[0] (DFTI_INPLACE)
 * and ooplace[0] (DFTI_NOT_INPLACE).
 *
 * Returns 0 on success, EINVAL when pfft is NULL, ENOMEM when the object
 * cannot be allocated, or the non-zero DFTI status of the first failing
 * MKL call (reported through gmx_fatal).
 */
int gmx_fft_init_1d_real(gmx_fft_t * pfft, int nx, gmx_fft_flag gmx_unused flags)
{
    gmx_fft_t fft;
    int       d;
    int       status;

    if (pfft == NULL)
    {
        gmx_fatal(FARGS, "Invalid opaque FFT datatype pointer.");
        return EINVAL;
    }
    *pfft = NULL;

    if ( (fft = (gmx_fft_t)malloc(sizeof(struct gmx_fft))) == NULL)
    {
        return ENOMEM;
    }

    /* Put every field into a defined state before anything can fail:
     * on error the object is handed to gmx_fft_destroy(), which must not
     * see uninitialized malloc'd contents. */
    for (d = 0; d < 3; d++)
    {
        fft->inplace[d] = fft->ooplace[d] = NULL;
    }
    fft->ooplace[3] = NULL;
    fft->ndim       = 1;
    fft->nx         = nx;
    fft->real_fft   = 1;
    fft->work       = NULL;

    /* Each step runs only if all previous ones succeeded, so status keeps
     * the code of the first failure. */
    status = DftiCreateDescriptor(&fft->inplace[0], GMX_DFTI_PREC, DFTI_REAL, 1, (MKL_LONG)nx);

    if (status == 0)
    {
        status = DftiSetValue(fft->inplace[0], DFTI_PLACEMENT, DFTI_INPLACE);
    }

    if (status == 0)
    {
        status = DftiCommitDescriptor(fft->inplace[0]);
    }

    if (status == 0)
    {
        status = DftiCreateDescriptor(&fft->ooplace[0], GMX_DFTI_PREC, DFTI_REAL, 1, (MKL_LONG)nx);
    }

    if (status == 0)
    {
        status = DftiSetValue(fft->ooplace[0], DFTI_PLACEMENT, DFTI_NOT_INPLACE);
    }

    if (status == 0)
    {
        status = DftiCommitDescriptor(fft->ooplace[0]);
    }

    if (status == DFTI_UNIMPLEMENTED)
    {
        gmx_fatal(FARGS,
                  "The linked Intel MKL version (<6.0?) cannot do real FFTs.");
        gmx_fft_destroy(fft);
        return status;
    }

    if (status != 0)
    {
        gmx_fatal(FARGS, "Error initializing Intel MKL FFT; status=%d", status);
        gmx_fft_destroy(fft);
        return status;
    }

    *pfft = fft;
    return 0;
}