예제 #1
0
      fft_kernel(size_t ndim, const size_stride_t *src0_size_stride) {
        MKL_LONG status;
        switch (ndim) {
        case 1: {
          MKL_LONG src0_size = src0_size_stride[0].dim_size;
          status = DftiCreateDescriptor(&descriptor, DFTI_DOUBLE, DFTI_COMPLEX, 1, src0_size);
          break;
        }
        case 2: {
          MKL_LONG src0_size[2] = {src0_size_stride[0].dim_size, src0_size_stride[1].dim_size};
          status = DftiCreateDescriptor(&descriptor, DFTI_DOUBLE, DFTI_COMPLEX, 2, src0_size);
          break;
        }
        default:
          break;
        }

        status = DftiSetValue(descriptor, DFTI_PLACEMENT, DFTI_NOT_INPLACE);

        status = DftiCommitDescriptor(descriptor);
      }
      ifft_kernel(size_t ndim, const char *src0_metadata, real_type scale) {

        if (ndim == 1) {
          DftiCreateDescriptor(&descriptor, DFTI_DOUBLE, DFTI_COMPLEX, 1,
                                        reinterpret_cast<const size_stride_t *>(src0_metadata)->dim_size);
        } else {
/*
          MKL_LONG src0_size[3];
          for (size_t i = 0; i < ndim; ++i) {
            src0_size[i] = reinterpret_cast<const size_stride_t *>(src0_metadata)->dim_size;
            src0_metadata += sizeof(size_stride_t);
          }
*/
        }

        DftiSetValue(descriptor, DFTI_BACKWARD_SCALE, scale);
        DftiSetValue(descriptor, DFTI_PLACEMENT, DFTI_NOT_INPLACE);

        DftiCommitDescriptor(descriptor);
      }
예제 #3
0
void FFT_MKL(double *inputSignal, double *outputSignal,int n) 
{ 
	DFTI_DESCRIPTOR_HANDLE handle;
	MKL_LONG status; 
	status = DftiCreateDescriptor(&handle, DFTI_DOUBLE, DFTI_REAL, 1, n);
	status = DftiSetValue(handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE); 
	status = DftiCommitDescriptor(handle);
	status = DftiComputeForward(handle, inputSignal, outputSignal); 
	int k = n / 2; 
	if (n % 2 != 0) 
	{ 
		k++;
	}
	for (int i = n / 2 + 1; i < n; i++)
	{
		k--; 
		outputSignal[2 * i] = outputSignal[2 * k];
		outputSignal[2 * i + 1] = (-1) * outputSignal[2 * k + 1];
	}
	status = DftiFreeDescriptor(&handle); 
}
예제 #4
0
    // Basic constructor without passing in vector structures, which need to be registered later or
    // passed into the calculate[FFT/IFT] methods.
    N_UTL_IntelFFT_Interface( int length, int numSignals=1, int reqStride=0, bool overwrite=false )
      : N_UTL_FFTInterfaceDecl<VectorType>(length, numSignals, reqStride, overwrite)
    {
      // create the fft descriptor structor
      int fftDimension = 1;  // 1D fft's
      long status = DftiCreateDescriptor( &fftDescriptor, DFTI_DOUBLE, DFTI_REAL, fftDimension, this->signalLength_ );
      checkAndTrapErrors( status );

      // configure the fft library to do numberSignals of 1D FFT's at the same time  
      status = DftiSetValue( fftDescriptor, DFTI_NUMBER_OF_TRANSFORMS, this->numberSignals_);
      checkAndTrapErrors( status );

      // The real input is signalLength long and the output is twice as long, complex
      status = DftiSetValue( fftDescriptor, DFTI_INPUT_DISTANCE, this->signalLength_);
      checkAndTrapErrors( status );
      status = DftiSetValue( fftDescriptor, DFTI_OUTPUT_DISTANCE, 2*this->signalLength_);
      checkAndTrapErrors( status );

      if( !overwrite  )
      {
        // Don't overwrite the input with the results
        status = DftiSetValue( fftDescriptor, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
        checkAndTrapErrors( status );
      }

      // I had set this up to use a slightly tighter packing scheme, but FFTW
      // does not support such schemes.  So, we'll generalize and stick to 
      // one that is supported by both to make it easier for Xyce to use 
      // this inteface regardless of the underlying FFT library.
      //
      // Packing scheme is CSS which for real input of:
      // in0, in1, ... , inN-1
      //
      // produces
      // r0, 0.0, r1, c1, ... , rN, cN  (two extra element if N is even)
      // r0, 0.0, r1, c1, ... , rN, cN  (one extra element if N is odd)
      //
      // see
      //
      // http://www.intel.com/software/products/mkl/docs/WebHelp/mklrefman.htm
      //
      // for more details
    
      // The forward and backward transform must have a consistent scale factor
      // so that the inverse of a forward transform is the same signal.  By default
      // we'll use 1.0 for the forward scale factor and then 1/n for the inverse transform.
      double scaleFactor = 1.0 / this->signalLength_;
      status = DftiSetValue( fftDescriptor, DFTI_BACKWARD_SCALE, scaleFactor);
      checkAndTrapErrors( status );
   
      if( this->stride_ != 0 )
      {
        // try to overwrite the default stride
        int strideArray[2];
        strideArray[0] = 0;        // this is the offset from the start.  We'll fix at zero
        strideArray[1] = this->stride_;   // this is the offset to the next value.
        status = DftiSetValue(fftDescriptor, DFTI_INPUT_STRIDES, strideArray);
        checkAndTrapErrors( status );
        status = DftiSetValue(fftDescriptor, DFTI_OUTPUT_STRIDES, strideArray);
        checkAndTrapErrors( status );
      }
   
      // commit it so that the library can do any needed allocations
      status = DftiCommitDescriptor( fftDescriptor );
      checkAndTrapErrors( status );
    }
예제 #5
0
int main(void)
{
    /* Size of 1D transform */
    int N = 1000000;

    /* Arbitrary harmonic */
    int H = -N/2;

    /* Execution status */
    MKL_LONG status = 0;

    int forward_ok = 1, backward_ok = 1;

    double time_start = 0, time_end = 0;
    double flops = 0;

    printf("Forward and backward 1D complex inplace transforms\n");

    printf("Allocate space for data on the host\n");
    x = (COMPLEX*)malloc( N * sizeof(COMPLEX) );
    if (0 == x) {
        printf("Error: memory allocation on host failed\n");
        exit(1);
    }

    printf("Preallocate memory on the target\n");
    /*
     * SOLUTION: Use offload pragma to preallocate memory for x on the target.
     *      (1) The lenght of x is N
     *      (2) Make sure the memory of x is aligned on 64-byte boundary
     *      (3) Make sure the allocated memory is not freed
     */
#pragma offload target(mic) in(x:length(N) align(64) alloc_if(1) free_if(0))
    {
    }

    printf("Create handle for 1D single-precision forward and backward transforms\n");

    /* 
     * SOLUTION: Offload the call to DftiCreateDescriptor to the target.
     *      (1) What would be the 'in' variables?
     *      (2) What would be the 'out' variables?
     */
#pragma offload target(mic) in(N) nocopy(handle) out(status)
    {
        status = DftiCreateDescriptor(&handle, DFTI_SINGLE,
            DFTI_COMPLEX, 1, (MKL_LONG)N );
        if (0 == status)
		{
			status = DftiCommitDescriptor(handle);
		}
    }

    if (status) {
        printf("Error: cannot create handle\n");
        exit(1);
    }

    /* 
     * SOLUTION: Offload the call to DftiComputeForward to the target.
     *      (1) Make sure x is an 'inout' variable, because this is in-place
     *      transform.
     *      (2) Do not allocate memory for x because it was preallocated.
     *      (3) Do not free momory of x because we will use it again for more
     *      transforms.
     *      (4) What would be the 'out' variables?
     */
    // We do not time the first offload.
#pragma offload target(mic) inout(x:length(N) alloc_if(0) free_if(0)) \
    nocopy(handle) out(status)
    {
        status = DftiComputeForward(handle, x);
    }

    printf("Initialize input for forward transform\n");
    init(x, N, H);

    printf("Offload forward FFT computation to the target\n");
	time_start = dsecnd();
    /*
     * SOLUTION: Offload the call to DftiComputeForward to the target.
     * This should be the same as the previous offload.
     */
#pragma offload target(mic) inout(x:length(N) alloc_if(0) free_if(0)) \
    nocopy(handle) out(status)
    {
        status = DftiComputeForward(handle, x);
    }
	time_end = dsecnd();

    if (status) {
        printf("Error: DftiComputeForward failed\n");
        exit(1);
    }

    printf("Verify result of forward FFT\n");
    forward_ok = verify(x, N, H);
    if (0 == forward_ok) {
	    flops    = 5 * N * log2((double)N) / (time_end - time_start);
        printf("\t Forward: size = %d, GFlops  = %.3f  \n", N, flops/1000000000);
    }

    printf("Initialize input for backward transform\n");
    init(x, N, -H);

    printf("Offload backward FFT computation to the target\n");

	time_start = dsecnd();
    /* 
     * SOLUTION: Offload the call to DftiComputeBackward to the target.
     *      (1) Make sure x is an 'inout' variable, because this is in-place
     *      transform.
     *      (2) Do not allocate memory for x because it was preallocated.
     *      (3) Do not free momory of x at this time.
     *      (4) What would be the 'out' variables?
     */
#pragma offload target(mic) inout(x:length(N) alloc_if(0) free_if(0)) \
    nocopy(handle) out(status)
    {
        status = DftiComputeBackward(handle, x);
    }
	time_end = dsecnd();

    if (status) {
        printf("Error: DftiComputeBackward failed\n");
        exit(1);
    }

    printf("Verify result of backward FFT\n");
    backward_ok = verify(x, N, H);
    if (0 == backward_ok) {
	    flops    = 5 * N * log2((double)N) / (time_end - time_start);
        printf("\t Backward: size = %d, GFlops  = %.3f  \n", N, flops/1000000000 );
    }

    printf("Destroy DFTI handle and free space on the target\n");
    /*
     * SOLUTION: Use offload pragma to deallocate memory of x on the target.
     *      (1) What would be 'in' variables?
     *      (2) Do the 'in' variables need to be copied in?
     */
#pragma offload target(mic) nocopy(x:length(N) alloc_if(0) free_if(1)) \
    nocopy(handle)
    {
        DftiFreeDescriptor(&handle);
    }

    printf("Free space on host\n");
    free(x);

    printf("TEST %s\n",0==forward_ok ? 
            "FORWARD FFT PASSED" : "FORWARD FFT FAILED");
    printf("TEST %s\n",0==backward_ok ? 
            "BACKWARD FFT PASSED" : "BACKWARD FFT FAILED");

    return 0;
}
예제 #6
0
int
gmx_fft_init_1d(gmx_fft_t *        pfft,
                int                nx,
                enum gmx_fft_flag  flags) 
{
    gmx_fft_t      fft;
    int            d;
    int            status;
    
    if(pfft==NULL)
    {
        gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
        return EINVAL;
    }
    *pfft = NULL;
    
    if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
    {
        return ENOMEM;
    }    

    /* Mark all handles invalid */
    for(d=0;d<3;d++)
    {
        fft->inplace[d] = fft->ooplace[d] = NULL;
    }
    fft->ooplace[3] = NULL;

    
    status = DftiCreateDescriptor(&fft->inplace[0],GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)nx);

    if( status == 0 )
        status = DftiSetValue(fft->inplace[0],DFTI_PLACEMENT,DFTI_INPLACE);
    
    if( status == 0 )
        status = DftiCommitDescriptor(fft->inplace[0]);
    
    
    if( status == 0 )    
        status = DftiCreateDescriptor(&fft->ooplace[0],GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)nx);
    
    if( status == 0)
        DftiSetValue(fft->ooplace[0],DFTI_PLACEMENT,DFTI_NOT_INPLACE);
    
    if( status == 0)
        DftiCommitDescriptor(fft->ooplace[0]);
    
    
    if( status != 0 )
    {
        gmx_fatal(FARGS,"Error initializing Intel MKL FFT; status=%d",status);
        gmx_fft_destroy(fft);
        return status;
    }
    
    fft->ndim     = 1;
    fft->nx       = nx;
    fft->real_fft = 0;
    fft->work     = NULL;
        
    *pfft = fft;
    return 0;
}
예제 #7
0
int
gmx_fft_init_3d_real(gmx_fft_t *        pfft,
                     int                nx, 
                     int                ny,
                     int                nz,
                     enum gmx_fft_flag  flags) 
{
    gmx_fft_t      fft;
    int            d;
    int            status;
    MKL_LONG       stride[2];
    int            nzc;
    
    if(pfft==NULL)
    {
        gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
        return EINVAL;
    }
    *pfft = NULL;

    nzc = (nz/2 + 1);
    
    if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
    {
        return ENOMEM;
    }    
    
    /* Mark all handles invalid */
    for(d=0;d<3;d++)
    {
        fft->inplace[d] = fft->ooplace[d] = NULL;
    }
    fft->ooplace[3] = NULL;
    
    /* Roll our own 3D real transform using multiple transforms in MKL,
     * since the current MKL versions does not support our storage format
     * or 3D real transforms.
     */
    
    /* In-place X FFT.
     * ny*nzc complex-to-complex transforms, length nx 
     * transform distance: 1
     * element strides: ny*nzc
     */
    status = DftiCreateDescriptor(&fft->inplace[0],GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)nx);
    
    if ( status == 0)
    {
        stride[0] = 0;
        stride[1] = ny*nzc;
        
        status = 
        (DftiSetValue(fft->inplace[0],DFTI_PLACEMENT,DFTI_INPLACE)                ||
         DftiSetValue(fft->inplace[0],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)ny*nzc) ||
         DftiSetValue(fft->inplace[0],DFTI_INPUT_DISTANCE,1)                      ||
         DftiSetValue(fft->inplace[0],DFTI_INPUT_STRIDES,stride)                  ||
         DftiSetValue(fft->inplace[0],DFTI_OUTPUT_DISTANCE,1)                     ||
         DftiSetValue(fft->inplace[0],DFTI_OUTPUT_STRIDES,stride)                 ||
         DftiCommitDescriptor(fft->inplace[0]));
    }
    
    /* Out-of-place X FFT: 
     * ny*nzc complex-to-complex transforms, length nx 
     * transform distance: 1
     * element strides: ny*nzc
     */
    if( status == 0 )
        status = DftiCreateDescriptor(&fft->ooplace[0],GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)nx);
    
    if( status == 0 )
    {
        stride[0] = 0;
        stride[1] = ny*nzc;
        
        status =
        (DftiSetValue(fft->ooplace[0],DFTI_PLACEMENT,DFTI_NOT_INPLACE)              ||
         DftiSetValue(fft->ooplace[0],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)ny*nzc)   ||
         DftiSetValue(fft->ooplace[0],DFTI_INPUT_DISTANCE,1)                        ||
         DftiSetValue(fft->ooplace[0],DFTI_INPUT_STRIDES,stride)                    ||
         DftiSetValue(fft->ooplace[0],DFTI_OUTPUT_DISTANCE,1)                       ||
         DftiSetValue(fft->ooplace[0],DFTI_OUTPUT_STRIDES,stride)                   ||
         DftiCommitDescriptor(fft->ooplace[0]));
    }
    
    
    /* In-place Y FFT.
     * We cannot do all NX*NZC transforms at once, so define a handle to do
     * NZC transforms, and then execute it NX times.
     * nzc complex-to-complex transforms, length ny 
     * transform distance: 1
     * element strides: nzc
     */
    if( status == 0 )
        status = DftiCreateDescriptor(&fft->inplace[1],GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)ny);
    
    if( status == 0 )
    {
        stride[0] = 0;
        stride[1] = nzc;
        
        status = 
        (DftiSetValue(fft->inplace[1],DFTI_PLACEMENT,DFTI_INPLACE)                ||
         DftiSetValue(fft->inplace[1],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nzc)    ||
         DftiSetValue(fft->inplace[1],DFTI_INPUT_DISTANCE,1)                      ||
         DftiSetValue(fft->inplace[1],DFTI_INPUT_STRIDES,stride)                  ||
         DftiSetValue(fft->inplace[1],DFTI_OUTPUT_DISTANCE,1)                     ||
         DftiSetValue(fft->inplace[1],DFTI_OUTPUT_STRIDES,stride)                 ||
         DftiCommitDescriptor(fft->inplace[1]));
    }
    
    
    /* Out-of-place Y FFT: 
     * We cannot do all NX*NZC transforms at once, so define a handle to do
     * NZC transforms, and then execute it NX times.
     * nzc complex-to-complex transforms, length ny 
     * transform distance: 1
     * element strides: nzc
     */
    if( status == 0 )
        status = DftiCreateDescriptor(&fft->ooplace[1],GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)ny);
    
    if( status == 0 )
    {
        stride[0] = 0;
        stride[1] = nzc;
        
        status =
        (DftiSetValue(fft->ooplace[1],DFTI_PLACEMENT,DFTI_NOT_INPLACE)            ||
         DftiSetValue(fft->ooplace[1],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nzc)    ||
         DftiSetValue(fft->ooplace[1],DFTI_INPUT_DISTANCE,1)                      ||
         DftiSetValue(fft->ooplace[1],DFTI_INPUT_STRIDES,stride)                  ||
         DftiSetValue(fft->ooplace[1],DFTI_OUTPUT_DISTANCE,1)                     ||
         DftiSetValue(fft->ooplace[1],DFTI_OUTPUT_STRIDES,stride)                 ||
         DftiCommitDescriptor(fft->ooplace[1]));
    }
    
    /* In-place Z FFT: 
     * nx*ny real-to-complex transforms, length nz
     * transform distance: nzc*2 -> nzc*2
     * element strides: 1
     */
    if( status == 0 )
        status = DftiCreateDescriptor(&fft->inplace[2],GMX_DFTI_PREC,DFTI_REAL,1,(MKL_LONG)nz);
    
    if( status == 0 )
    {
        stride[0] = 0;
        stride[1] = 1;
        
        status = 
        (DftiSetValue(fft->inplace[2],DFTI_PLACEMENT,DFTI_INPLACE)               ||
         DftiSetValue(fft->inplace[2],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nx*ny) ||
         DftiSetValue(fft->inplace[2],DFTI_INPUT_DISTANCE,(MKL_LONG)nzc*2)       ||
         DftiSetValue(fft->inplace[2],DFTI_INPUT_STRIDES,stride)                 ||
         DftiSetValue(fft->inplace[2],DFTI_OUTPUT_DISTANCE,(MKL_LONG)nzc*2)      ||
         DftiSetValue(fft->inplace[2],DFTI_OUTPUT_STRIDES,stride)                ||
         DftiCommitDescriptor(fft->inplace[2]));
    }
    
    
    /* Out-of-place real-to-complex (affects distance) Z FFT: 
     * nx*ny real-to-complex transforms, length nz
     * transform distance: nz -> nzc*2
     * element STRIDES: 1
     */
    if( status == 0 )
        status = DftiCreateDescriptor(&fft->ooplace[2],GMX_DFTI_PREC,DFTI_REAL,1,(MKL_LONG)nz);
    
    if( status == 0 )
    {
        stride[0] = 0;
        stride[1] = 1;
        
        status =
        (DftiSetValue(fft->ooplace[2],DFTI_PLACEMENT,DFTI_NOT_INPLACE)           ||
         DftiSetValue(fft->ooplace[2],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nx*ny) ||
         DftiSetValue(fft->ooplace[2],DFTI_INPUT_DISTANCE,(MKL_LONG)nz)          ||
         DftiSetValue(fft->ooplace[2],DFTI_INPUT_STRIDES,stride)                 ||
         DftiSetValue(fft->ooplace[2],DFTI_OUTPUT_DISTANCE,(MKL_LONG)nzc*2)      ||
         DftiSetValue(fft->ooplace[2],DFTI_OUTPUT_STRIDES,stride)                ||
         DftiCommitDescriptor(fft->ooplace[2]));
    }

    
    /* Out-of-place complex-to-real (affects distance) Z FFT: 
     * nx*ny real-to-complex transforms, length nz
     * transform distance: nzc*2 -> nz
     * element STRIDES: 1
     */
    if( status == 0 )
        status = DftiCreateDescriptor(&fft->ooplace[3],GMX_DFTI_PREC,DFTI_REAL,1,(MKL_LONG)nz);
    
    if( status == 0 )
    {
        stride[0] = 0;
        stride[1] = 1;
        
        status =
            (DftiSetValue(fft->ooplace[3],DFTI_PLACEMENT,DFTI_NOT_INPLACE)           ||
             DftiSetValue(fft->ooplace[3],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nx*ny) ||
             DftiSetValue(fft->ooplace[3],DFTI_INPUT_DISTANCE,(MKL_LONG)nzc*2)       ||
             DftiSetValue(fft->ooplace[3],DFTI_INPUT_STRIDES,stride)                 ||
             DftiSetValue(fft->ooplace[3],DFTI_OUTPUT_DISTANCE,(MKL_LONG)nz)         ||
             DftiSetValue(fft->ooplace[3],DFTI_OUTPUT_STRIDES,stride)                ||
             DftiCommitDescriptor(fft->ooplace[3]));
    }
    
    
    if ( status == 0 )
    {
        if ((fft->work = malloc(sizeof(t_complex)*(nx*ny*(nz/2+1)))) == NULL)
        {
            status = ENOMEM;
        }
    }
    
    
    if( status != 0 ) 
    {
        gmx_fatal(FARGS,"Error initializing Intel MKL FFT; status=%d",status);
        gmx_fft_destroy(fft);
        return status;
    }
    
    
    fft->ndim     = 3;
    fft->nx       = nx;
    fft->ny       = ny;
    fft->nz       = nz;
    fft->real_fft = 1;

    *pfft = fft;
    return 0;
} 
예제 #8
0
int 
gmx_fft_init_2d_real(gmx_fft_t *        pfft,
                     int                nx, 
                     int                ny,
                     enum gmx_fft_flag  flags) 
{
    gmx_fft_t      fft;
    int            d;
    int            status;
    MKL_LONG       stride[2];
    MKL_LONG       nyc;
    
    if(pfft==NULL)
    {
        gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
        return EINVAL;
    }
    *pfft = NULL;

    if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
    {
        return ENOMEM;
    }    
    
    nyc = (ny/2 + 1);
    
    /* Mark all handles invalid */
    for(d=0;d<3;d++)
    {
        fft->inplace[d] = fft->ooplace[d] = NULL;
    }
    fft->ooplace[3] = NULL;
    
    /* Roll our own 2D real transform using multiple transforms in MKL,
     * since the current MKL versions does not support our storage format,
     * and all but the most recent don't even have 2D real FFTs.
     */
    
    /* In-place X FFT */
    status = DftiCreateDescriptor(&fft->inplace[0],GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)nx);
    
    if ( status == 0 )
    {
        stride[0]  = 0;
        stride[1]  = nyc;
     
        status = 
            (DftiSetValue(fft->inplace[0],DFTI_PLACEMENT,DFTI_INPLACE)    ||
             DftiSetValue(fft->inplace[0],DFTI_NUMBER_OF_TRANSFORMS,nyc)  ||
             DftiSetValue(fft->inplace[0],DFTI_INPUT_DISTANCE,1)          ||
             DftiSetValue(fft->inplace[0],DFTI_INPUT_STRIDES,stride)      ||
             DftiSetValue(fft->inplace[0],DFTI_OUTPUT_DISTANCE,1)         ||
             DftiSetValue(fft->inplace[0],DFTI_OUTPUT_STRIDES,stride));
    }
    
    if( status == 0 )
        status = DftiCommitDescriptor(fft->inplace[0]);

    /* Out-of-place X FFT */
    if( status == 0 )
        status = DftiCreateDescriptor(&(fft->ooplace[0]),GMX_DFTI_PREC,DFTI_COMPLEX,1,(MKL_LONG)nx);

    if( status == 0 )
    {
        stride[0] = 0;
        stride[1] = nyc;

        status =
            (DftiSetValue(fft->ooplace[0],DFTI_PLACEMENT,DFTI_NOT_INPLACE) ||
             DftiSetValue(fft->ooplace[0],DFTI_NUMBER_OF_TRANSFORMS,nyc)   ||
             DftiSetValue(fft->ooplace[0],DFTI_INPUT_DISTANCE,1)           ||
             DftiSetValue(fft->ooplace[0],DFTI_INPUT_STRIDES,stride)       ||
             DftiSetValue(fft->ooplace[0],DFTI_OUTPUT_DISTANCE,1)          ||
             DftiSetValue(fft->ooplace[0],DFTI_OUTPUT_STRIDES,stride));
    }

    if( status == 0 )
        status = DftiCommitDescriptor(fft->ooplace[0]);

   
    /* In-place Y FFT  */
    if( status == 0 )
        status = DftiCreateDescriptor(&fft->inplace[1],GMX_DFTI_PREC,DFTI_REAL,1,(MKL_LONG)ny);
    
    if( status == 0 )
    {
        stride[0] = 0;
        stride[1] = 1;
               
        status = 
            (DftiSetValue(fft->inplace[1],DFTI_PLACEMENT,DFTI_INPLACE)             ||
             DftiSetValue(fft->inplace[1],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nx)  ||
             DftiSetValue(fft->inplace[1],DFTI_INPUT_DISTANCE,2*nyc)               ||
             DftiSetValue(fft->inplace[1],DFTI_INPUT_STRIDES,stride)               ||
             DftiSetValue(fft->inplace[1],DFTI_OUTPUT_DISTANCE,2*nyc)              ||
             DftiSetValue(fft->inplace[1],DFTI_OUTPUT_STRIDES,stride)              ||
             DftiCommitDescriptor(fft->inplace[1]));
    }


    /* Out-of-place real-to-complex (affects output distance) Y FFT */
    if( status == 0 )
        status = DftiCreateDescriptor(&fft->ooplace[1],GMX_DFTI_PREC,DFTI_REAL,1,(MKL_LONG)ny);
    
    if( status == 0 )
    {
        stride[0] = 0;
        stride[1] = 1;
         
        status =
            (DftiSetValue(fft->ooplace[1],DFTI_PLACEMENT,DFTI_NOT_INPLACE)           ||
             DftiSetValue(fft->ooplace[1],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nx)    ||
             DftiSetValue(fft->ooplace[1],DFTI_INPUT_DISTANCE,(MKL_LONG)ny)          ||
             DftiSetValue(fft->ooplace[1],DFTI_INPUT_STRIDES,stride)                 ||
             DftiSetValue(fft->ooplace[1],DFTI_OUTPUT_DISTANCE,2*nyc)                ||
             DftiSetValue(fft->ooplace[1],DFTI_OUTPUT_STRIDES,stride)                ||
             DftiCommitDescriptor(fft->ooplace[1]));
    }


    /* Out-of-place complex-to-real (affects output distance) Y FFT */
    if( status == 0 )
        status = DftiCreateDescriptor(&fft->ooplace[2],GMX_DFTI_PREC,DFTI_REAL,1,(MKL_LONG)ny);
    
    if( status == 0 )
    {
        stride[0] = 0;
        stride[1] = 1;
               
        status =
            (DftiSetValue(fft->ooplace[2],DFTI_PLACEMENT,DFTI_NOT_INPLACE)           ||
             DftiSetValue(fft->ooplace[2],DFTI_NUMBER_OF_TRANSFORMS,(MKL_LONG)nx)    ||
             DftiSetValue(fft->ooplace[2],DFTI_INPUT_DISTANCE,2*nyc)                 ||
             DftiSetValue(fft->ooplace[2],DFTI_INPUT_STRIDES,stride)                 ||
             DftiSetValue(fft->ooplace[2],DFTI_OUTPUT_DISTANCE,(MKL_LONG)ny)         ||
             DftiSetValue(fft->ooplace[2],DFTI_OUTPUT_STRIDES,stride)                ||
             DftiCommitDescriptor(fft->ooplace[2]));
    }
    
    
    if ( status == 0 )
    {
        if ((fft->work = malloc(sizeof(t_complex)*(nx*(ny/2+1)))) == NULL)
        {
            status = ENOMEM;
        }
    }
    
    if( status != 0 )
    {
        gmx_fatal(FARGS,"Error initializing Intel MKL FFT; status=%d",status);
        gmx_fft_destroy(fft);
        return status;
    }
    
    fft->ndim     = 2;
    fft->nx       = nx;
    fft->ny       = ny;
    fft->real_fft = 1;
    
    *pfft = fft;
    return 0;
}
예제 #9
0
파일: kernel_main.c 프로젝트: nschloe/sippp
int bi_entry(void * mdpv, int iproblemsize, double * dresults)
{
   /* dstart, dend: the start and end time of the measurement */
   /* dtime: the time for a single measurement in seconds */
   double dstart = 0.0, dend = 0.0, dtime = 0.0, dinit = 0.0;
   /* flops stores the calculated FLOPS */
   double flops = 0.0;
   /* ii is used for loop iterations */
   myinttype ii, jj, imyproblemsize, numberOfRuns;
   /* cast void* pointer */
   mydata_t* pmydata = (mydata_t*)mdpv;
   int invalid = 0;
   long status;

   /* calculate real problemsize */
   imyproblemsize = (int)pow(2, (log2(pmydata->min) + (myinttype)iproblemsize - 1));

   /* store the value for the x axis in results[0] */
   dresults[0] = (double)imyproblemsize;


   /*** in place run ***/

   /* malloc */
   pmydata->inout = (float*)malloc(sizeof(float) * imyproblemsize * 2);

   /* create FFT plan */
   status = DftiCreateDescriptor(&pmydata->my_desc_handle, DFTI_SINGLE, DFTI_COMPLEX, 1, imyproblemsize);
   status = DftiCommitDescriptor(pmydata->my_desc_handle);


   /* init stuff */
   initData_ip(pmydata, imyproblemsize);

   numberOfRuns = 1;

   dstart = bi_gettime();
   /* fft calculation */
   status = DftiComputeForward(pmydata->my_desc_handle, pmydata->inout);
   dend = bi_gettime();

   /* calculate the used time*/
   dtime = dend - dstart;
   dtime -= dTimerOverhead;

   /* loop calculation if accuracy is insufficient */
   while (dtime < 100 * dTimerGranularity) {

     numberOfRuns = numberOfRuns * 2;

     dstart = bi_gettime();
     for (jj = 0; jj < numberOfRuns; jj++) {
       /* fft calculation */
       status = DftiComputeForward(pmydata->my_desc_handle, pmydata->inout);
     }
     dend = bi_gettime();

     dtime = dend - dstart;
     dtime -= dTimerOverhead;
   }

   /* check for overflows */
   for (ii = 0; ii < imyproblemsize; ii++) {
     if (isnan(pmydata->inout[2 * ii]) || isnan(pmydata->inout[2 * ii + 1])) invalid = 1;
     if (isinf(pmydata->inout[2 * ii]) || isinf(pmydata->inout[2 * ii + 1])) invalid = 1;
   }

   /* if loop was necessary */
   if (numberOfRuns > 1) dtime = dtime / numberOfRuns;

   /* calculate the used FLOPS */
   flops = (double)(5.0 * imyproblemsize * (log2(1.0 * imyproblemsize)) / dtime);

   /* store the FLOPS in results[1] */
   if (invalid == 1) dresults[1] = INVALID_MEASUREMENT;
     else dresults[1] = flops;

   status = DftiFreeDescriptor(&pmydata->my_desc_handle);

   /* free data */
   free(pmydata->inout);


   /*** out of place run ***/

   /* malloc */
   pmydata->in = (float*)malloc(sizeof(float) * imyproblemsize * 2);
   pmydata->out = (float*)malloc(sizeof(float) * imyproblemsize * 2);

   /* create FFT plan */
   status = DftiCreateDescriptor(&pmydata->my_desc_handle, DFTI_SINGLE, DFTI_COMPLEX, 1, imyproblemsize);
   status = DftiSetValue(pmydata->my_desc_handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
   status = DftiCommitDescriptor(pmydata->my_desc_handle);

   /* init stuff */
   initData_oop(pmydata, imyproblemsize);

   numberOfRuns = 1;

   dstart = bi_gettime();
   /* fft calculation */
   status = DftiComputeForward(pmydata->my_desc_handle, pmydata->in, pmydata->out);
   dend = bi_gettime();

   /* calculate the used time*/
   dtime = dend - dstart;
   dtime -= dTimerOverhead;

   /* loop calculation if accuracy is insufficient */
   while (dtime < 100 * dTimerGranularity) {

     numberOfRuns = numberOfRuns * 2;

     dstart = bi_gettime();
     for (ii = 0; ii < numberOfRuns; ii++) {
        /* fft calculation */
        status = DftiComputeForward(pmydata->my_desc_handle, pmydata->in, pmydata->out);
     }
     dend = bi_gettime();

     /* calculate the used time*/
     dtime = dend - dstart;
     dtime -= dTimerOverhead;
   }

   /* if loop was necessary */
   if (numberOfRuns > 1) dtime = dtime / numberOfRuns;

   /* check for overflows */
   for (ii = 0; ii < imyproblemsize; ii++) {
     if (isnan(pmydata->out[2 * ii]) || isnan(pmydata->out[2 * ii + 1])) invalid = 1;
     if (isinf(pmydata->out[2 * ii]) || isinf(pmydata->out[2 * ii + 1])) invalid = 1;
   }

   /* calculate the used FLOPS */
   flops = (double)(5.0 * imyproblemsize * (log2(1.0 * imyproblemsize)) / dtime);

   /* store the FLOPS in results[2] */
   if (invalid == 1) dresults[2] = INVALID_MEASUREMENT;
     else dresults[2] = flops;

   status = DftiFreeDescriptor(&pmydata->my_desc_handle);

   /* free data */
   free(pmydata->in);
   free(pmydata->out);

   return 0;
}
예제 #10
0
void ccmfft(complex *data, int n1, int n2, int ld1, int sign)
{
#if defined(HAVE_LIBSCS)
	int   ntable, nwork, zero=0;
	static int isys, nprev=0;
	static float *work, *table, scale=1.0;
#elif defined(ACML440)
	static int nprev=0;
	int   nwork, zero=0, one=1, i, j, inpl;
	static int isys;
	static complex *work;
	REAL scl;
	complex *y;
#elif defined(MKL)
	static DFTI_DESCRIPTOR_HANDLE handle[MAX_NUMTHREADS];
	static int nprev[MAX_NUMTHREADS];
    MKL_LONG Status;
	int j;
#endif
	int id;

#ifdef _OPENMP
	id = omp_get_thread_num();
#else
	id = 0;
#endif

#if defined(HAVE_LIBSCS)
	if (n1 != nprev) {
		isys   = 0;
		ntable = 2*n1 + 30;
		nwork  = 2*n1;
		if (work) free(work);
		work = (float *)malloc(nwork*sizeof(float));
		if (work == NULL) fprintf(stderr,"ccmfft: memory allocation error\n");
		if (table) free(table);
		table = (float *)malloc(ntable*sizeof(float));
		if (table == NULL) fprintf(stderr,"ccmfft: memory allocation error\n");
		ccfftm_(&zero, &n1, &n2, &scale, data, &ld1, data, &ld1, table, work, &isys);
		nprev = n1;
	}
	ccfftm_(&sign, &n1, &n2, &scale, data, &ld1, data, &ld1, table, work, &isys);
#elif defined(ACML440)
	scl = 1.0;
	inpl = 1;
	if (n1 != nprev) {
		isys   = 0;
		nwork  = 5*n1 + 100;
		if (work) free(work);
		work = (complex *)malloc(nwork*sizeof(complex));
		if (work == NULL) fprintf(stderr,"rc1fft: memory allocation error\n");
		acmlccmfft(zero, scl, inpl, n2, n1, data, 1, ld1, y, 1, ld1, work, &isys);
		nprev = n1;
	}
	acmlccmfft(sign, scl, inpl, n2, n1, data, 1, ld1, y, 1, ld1, work, &isys);
#elif defined(MKL)
    if (n1 != nprev[id]) {
        DftiFreeDescriptor(&handle[id]);
        
        Status = DftiCreateDescriptor(&handle[id], DFTI_SINGLE, DFTI_COMPLEX, 1, (MKL_LONG)n1);
        if(! DftiErrorClass(Status, DFTI_NO_ERROR)){
            dfti_status_print(Status);
            printf(" DftiCreateDescriptor FAIL\n");
        }
        Status = DftiCommitDescriptor(handle[id]);
        if(! DftiErrorClass(Status, DFTI_NO_ERROR)){
            dfti_status_print(Status);
            printf(" DftiCommitDescriptor FAIL\n");
        }
        nprev[id] = n1;
    }
    if (sign < 0) {
    	for (j=0; j<n2; j++) {
        	Status = DftiComputeBackward(handle[id], &data[j*ld1]);
		}
    }
    else {
    	for (j=0; j<n2; j++) {
        	Status = DftiComputeForward(handle[id], &data[j*ld1]);
		}
    }
#else
	ccm_fft(data, n1, n2, ld1, sign);
#endif

	return;
}
예제 #11
0
struct fft_plan_3d *fft_3d_create_plan(
       MPI_Comm comm, int nfast, int nmid, int nslow,
       int in_ilo, int in_ihi, int in_jlo, int in_jhi,
       int in_klo, int in_khi,
       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
       int out_klo, int out_khi,
       int scaled, int permute, int *nbuf)
{
  struct fft_plan_3d *plan;
  int me,nprocs;
  int i,num,flag,remapflag,fftflag;
  int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
  int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
  int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
  int out_size,first_size,second_size,third_size,copy_size,scratch_size;
  int np1,np2,ip1,ip2;
  int list[50];

  // system specific variables

#ifdef FFT_SCSL
  FFT_DATA dummy_d[5];
  FFT_PREC dummy_p[5];
  int isign,isys;
  FFT_PREC scalef;
#endif
#ifdef FFT_INTEL
  FFT_DATA dummy;
#endif
#ifdef FFT_T3E
  FFT_DATA dummy[5];
  int isign,isys;
  double scalef;
#endif

  // query MPI info

  MPI_Comm_rank(comm,&me);
  MPI_Comm_size(comm,&nprocs);

  // compute division of procs in 2 dimensions not on-processor

  bifactor(nprocs,&np1,&np2);
  ip1 = me % np1;
  ip2 = me/np1;

  // allocate memory for plan data struct

  plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d));
  if (plan == NULL) return NULL;

  // remap from initial distribution to layout needed for 1st set of 1d FFTs
  // not needed if all procs own entire fast axis initially
  // first indices = distribution after 1st set of FFTs

  if (in_ilo == 0 && in_ihi == nfast-1)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    first_ilo = in_ilo;
    first_ihi = in_ihi;
    first_jlo = in_jlo;
    first_jhi = in_jhi;
    first_klo = in_klo;
    first_khi = in_khi;
    plan->pre_plan = NULL;
  }
  else {
    first_ilo = 0;
    first_ihi = nfast - 1;
    first_jlo = ip1*nmid/np1;
    first_jhi = (ip1+1)*nmid/np1 - 1;
    first_klo = ip2*nslow/np2;
    first_khi = (ip2+1)*nslow/np2 - 1;
    plan->pre_plan =
      remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,2,0,0,FFT_PRECISION);
    if (plan->pre_plan == NULL) return NULL;
  }

  // 1d FFTs along fast axis

  plan->length1 = nfast;
  plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1);

  // remap from 1st to 2nd FFT
  // choose which axis is split over np1 vs np2 to minimize communication
  // second indices = distribution after 2nd set of FFTs

  second_ilo = ip1*nfast/np1;
  second_ihi = (ip1+1)*nfast/np1 - 1;
  second_jlo = 0;
  second_jhi = nmid - 1;
  second_klo = ip2*nslow/np2;
  second_khi = (ip2+1)*nslow/np2 - 1;
  plan->mid1_plan =
      remap_3d_create_plan(comm,
                           first_ilo,first_ihi,first_jlo,first_jhi,
                           first_klo,first_khi,
                           second_ilo,second_ihi,second_jlo,second_jhi,
                           second_klo,second_khi,2,1,0,FFT_PRECISION);
  if (plan->mid1_plan == NULL) return NULL;

  // 1d FFTs along mid axis

  plan->length2 = nmid;
  plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1);

  // remap from 2nd to 3rd FFT
  // if final distribution is permute=2 with all procs owning entire slow axis
  //   then this remapping goes directly to final distribution
  //  third indices = distribution after 3rd set of FFTs

  if (permute == 2 && out_klo == 0 && out_khi == nslow-1)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0) {
    third_ilo = out_ilo;
    third_ihi = out_ihi;
    third_jlo = out_jlo;
    third_jhi = out_jhi;
    third_klo = out_klo;
    third_khi = out_khi;
  }
  else {
    third_ilo = ip1*nfast/np1;
    third_ihi = (ip1+1)*nfast/np1 - 1;
    third_jlo = ip2*nmid/np2;
    third_jhi = (ip2+1)*nmid/np2 - 1;
    third_klo = 0;
    third_khi = nslow - 1;
  }

  plan->mid2_plan =
    remap_3d_create_plan(comm,
                         second_jlo,second_jhi,second_klo,second_khi,
                         second_ilo,second_ihi,
                         third_jlo,third_jhi,third_klo,third_khi,
                         third_ilo,third_ihi,2,1,0,FFT_PRECISION);
  if (plan->mid2_plan == NULL) return NULL;

  // 1d FFTs along slow axis

  plan->length3 = nslow;
  plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow;

  // remap from 3rd FFT to final distribution
  //  not needed if permute = 2 and third indices = out indices on all procs

  if (permute == 2 &&
      out_ilo == third_ilo && out_ihi == third_ihi &&
      out_jlo == third_jlo && out_jhi == third_jhi &&
      out_klo == third_klo && out_khi == third_khi)
    flag = 0;
  else
    flag = 1;

  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);

  if (remapflag == 0)
    plan->post_plan = NULL;
  else {
    plan->post_plan =
      remap_3d_create_plan(comm,
                           third_klo,third_khi,third_ilo,third_ihi,
                           third_jlo,third_jhi,
                           out_klo,out_khi,out_ilo,out_ihi,
                           out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION);
    if (plan->post_plan == NULL) return NULL;
  }

  // configure plan memory pointers and allocate work space
  // out_size = amount of memory given to FFT by user
  // first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps
  // copy_size = amount needed internally for extra copy of data
  // scratch_size = amount needed internally for remap scratch space
  // for each remap:
  //   out space used for result if big enough, else require copy buffer
  //   accumulate largest required remap scratch space

  out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1);
  first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
    (first_khi-first_klo+1);
  second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
    (second_khi-second_klo+1);
  third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
    (third_khi-third_klo+1);

  copy_size = 0;
  scratch_size = 0;

  if (plan->pre_plan) {
    if (first_size <= out_size)
      plan->pre_target = 0;
    else {
      plan->pre_target = 1;
      copy_size = MAX(copy_size,first_size);
    }
    scratch_size = MAX(scratch_size,first_size);
  }

  if (plan->mid1_plan) {
    if (second_size <= out_size)
      plan->mid1_target = 0;
    else {
      plan->mid1_target = 1;
      copy_size = MAX(copy_size,second_size);
    }
    scratch_size = MAX(scratch_size,second_size);
  }

  if (plan->mid2_plan) {
    if (third_size <= out_size)
      plan->mid2_target = 0;
    else {
      plan->mid2_target = 1;
      copy_size = MAX(copy_size,third_size);
    }
    scratch_size = MAX(scratch_size,third_size);
  }

  if (plan->post_plan)
    scratch_size = MAX(scratch_size,out_size);

  *nbuf = copy_size + scratch_size;

  if (copy_size) {
    plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA));
    if (plan->copy == NULL) return NULL;
  }
  else plan->copy = NULL;

  if (scratch_size) {
    plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA));
    if (plan->scratch == NULL) return NULL;
  }
  else plan->scratch = NULL;

  // system specific pre-computation of 1d FFT coeffs
  // and scaling normalization

#if defined(FFT_SGI)

  plan->coeff1 = (FFT_DATA *) malloc((nfast+15)*sizeof(FFT_DATA));
  plan->coeff2 = (FFT_DATA *) malloc((nmid+15)*sizeof(FFT_DATA));
  plan->coeff3 = (FFT_DATA *) malloc((nslow+15)*sizeof(FFT_DATA));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) return NULL;

  FFT_1D_INIT(nfast,plan->coeff1);
  FFT_1D_INIT(nmid,plan->coeff2);
  FFT_1D_INIT(nslow,plan->coeff3);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/(nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_SCSL)

  plan->coeff1 = (FFT_PREC *) malloc((2*nfast+30)*sizeof(FFT_PREC));
  plan->coeff2 = (FFT_PREC *) malloc((2*nmid+30)*sizeof(FFT_PREC));
  plan->coeff3 = (FFT_PREC *) malloc((2*nslow+30)*sizeof(FFT_PREC));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) return NULL;

  plan->work1 = (FFT_PREC *) malloc((2*nfast)*sizeof(FFT_PREC));
  plan->work2 = (FFT_PREC *) malloc((2*nmid)*sizeof(FFT_PREC));
  plan->work3 = (FFT_PREC *) malloc((2*nslow)*sizeof(FFT_PREC));

  if (plan->work1 == NULL || plan->work2 == NULL ||
      plan->work3 == NULL) return NULL;

  isign = 0;
  scalef = 1.0;
  isys = 0;

  FFT_1D_INIT(isign,nfast,scalef,dummy_d,dummy_d,plan->coeff1,dummy_p,&isys);
  FFT_1D_INIT(isign,nmid,scalef,dummy_d,dummy_d,plan->coeff2,dummy_p,&isys);
  FFT_1D_INIT(isign,nslow,scalef,dummy_d,dummy_d,plan->coeff3,dummy_p,&isys);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/(nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_ACML)

  plan->coeff1 = (FFT_DATA *) malloc((3*nfast+100)*sizeof(FFT_DATA));
  plan->coeff2 = (FFT_DATA *) malloc((3*nmid+100)*sizeof(FFT_DATA));
  plan->coeff3 = (FFT_DATA *) malloc((3*nslow+100)*sizeof(FFT_DATA));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) return NULL;

  int isign = 100;
  int isys = 1;
  int info = 0;
  FFT_DATA *dummy = NULL;

  FFT_1D(&isign,&isys,&nfast,dummy,plan->coeff1,&info);
  FFT_1D(&isign,&isys,&nmid,dummy,plan->coeff2,&info);
  FFT_1D(&isign,&isys,&nslow,dummy,plan->coeff3,&info);

  if (scaled == 0) {
    plan->scaled = 0;
    plan->norm = sqrt(nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  } else {
    plan->scaled = 1;
    plan->norm = sqrt(nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_INTEL)

  flag = 0;

  num = 0;
  factor(nfast,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;
  num = 0;
  factor(nmid,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;
  num = 0;
  factor(nslow,&num,list);
  for (i = 0; i < num; i++)
    if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1;

  MPI_Allreduce(&flag,&fftflag,1,MPI_INT,MPI_MAX,comm);
  if (fftflag) {
    if (me == 0) printf("ERROR: FFTs are not power of 2,3,5\n");
    return NULL;
  }

  plan->coeff1 = (FFT_DATA *) malloc((3*nfast/2+1)*sizeof(FFT_DATA));
  plan->coeff2 = (FFT_DATA *) malloc((3*nmid/2+1)*sizeof(FFT_DATA));
  plan->coeff3 = (FFT_DATA *) malloc((3*nslow/2+1)*sizeof(FFT_DATA));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) return NULL;

  flag = 0;
  FFT_1D_INIT(&dummy,&nfast,&flag,plan->coeff1);
  FFT_1D_INIT(&dummy,&nmid,&flag,plan->coeff2);
  FFT_1D_INIT(&dummy,&nslow,&flag,plan->coeff3);

  if (scaled == 0) {
    plan->scaled = 1;
    plan->norm = nfast*nmid*nslow;
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }
  else
    plan->scaled = 0;

#elif defined(FFT_MKL)
  DftiCreateDescriptor( &(plan->handle_fast), FFT_MKL_PREC, DFTI_COMPLEX, 1, (MKL_LONG)nfast);
  DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_TRANSFORMS, (MKL_LONG)plan->total1/nfast);
  DftiSetValue(plan->handle_fast, DFTI_PLACEMENT,DFTI_INPLACE);
  DftiSetValue(plan->handle_fast, DFTI_INPUT_DISTANCE, (MKL_LONG)nfast);
  DftiSetValue(plan->handle_fast, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nfast);
  DftiCommitDescriptor(plan->handle_fast);

  DftiCreateDescriptor( &(plan->handle_mid), FFT_MKL_PREC, DFTI_COMPLEX, 1, (MKL_LONG)nmid);
  DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_TRANSFORMS, (MKL_LONG)plan->total2/nmid);
  DftiSetValue(plan->handle_mid, DFTI_PLACEMENT,DFTI_INPLACE);
  DftiSetValue(plan->handle_mid, DFTI_INPUT_DISTANCE, (MKL_LONG)nmid);
  DftiSetValue(plan->handle_mid, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nmid);
  DftiCommitDescriptor(plan->handle_mid);

  DftiCreateDescriptor( &(plan->handle_slow), FFT_MKL_PREC, DFTI_COMPLEX, 1, (MKL_LONG)nslow);
  DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_TRANSFORMS, (MKL_LONG)plan->total3/nslow);
  DftiSetValue(plan->handle_slow, DFTI_PLACEMENT,DFTI_INPLACE);
  DftiSetValue(plan->handle_slow, DFTI_INPUT_DISTANCE, (MKL_LONG)nslow);
  DftiSetValue(plan->handle_slow, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nslow);
  DftiCommitDescriptor(plan->handle_slow);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/(nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_DEC)

  if (scaled == 0) {
    plan->scaled = 1;
    plan->norm = nfast*nmid*nslow;
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }
  else
    plan->scaled = 0;

#elif defined(FFT_T3E)

  plan->coeff1 = (double *) malloc((12*nfast)*sizeof(double));
  plan->coeff2 = (double *) malloc((12*nmid)*sizeof(double));
  plan->coeff3 = (double *) malloc((12*nslow)*sizeof(double));

  if (plan->coeff1 == NULL || plan->coeff2 == NULL ||
      plan->coeff3 == NULL) return NULL;

  plan->work1 = (double *) malloc((8*nfast)*sizeof(double));
  plan->work2 = (double *) malloc((8*nmid)*sizeof(double));
  plan->work3 = (double *) malloc((8*nslow)*sizeof(double));

  if (plan->work1 == NULL || plan->work2 == NULL ||
      plan->work3 == NULL) return NULL;

  isign = 0;
  scalef = 1.0;
  isys = 0;

  FFT_1D_INIT(&isign,&nfast,&scalef,dummy,dummy,plan->coeff1,dummy,&isys);
  FFT_1D_INIT(&isign,&nmid,&scalef,dummy,dummy,plan->coeff2,dummy,&isys);
  FFT_1D_INIT(&isign,&nslow,&scalef,dummy,dummy,plan->coeff3,dummy,&isys);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/(nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_FFTW2)

  plan->plan_fast_forward =
    fftw_create_plan(nfast,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  plan->plan_fast_backward =
    fftw_create_plan(nfast,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);

  if (nmid == nfast) {
    plan->plan_mid_forward = plan->plan_fast_forward;
    plan->plan_mid_backward = plan->plan_fast_backward;
  }
  else {
    plan->plan_mid_forward =
      fftw_create_plan(nmid,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
    plan->plan_mid_backward =
      fftw_create_plan(nmid,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  }

  if (nslow == nfast) {
    plan->plan_slow_forward = plan->plan_fast_forward;
    plan->plan_slow_backward = plan->plan_fast_backward;
  }
  else if (nslow == nmid) {
    plan->plan_slow_forward = plan->plan_mid_forward;
    plan->plan_slow_backward = plan->plan_mid_backward;
  }
  else {
    plan->plan_slow_forward =
      fftw_create_plan(nslow,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
    plan->plan_slow_backward =
      fftw_create_plan(nslow,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
  }

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/(nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#elif defined(FFT_FFTW3)
  plan->plan_fast_forward =
    FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
                            NULL,&nfast,1,plan->length1,
                            NULL,&nfast,1,plan->length1,
                            FFTW_FORWARD,FFTW_ESTIMATE);
  plan->plan_fast_backward =
    FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
                            NULL,&nfast,1,plan->length1,
                            NULL,&nfast,1,plan->length1,
                            FFTW_BACKWARD,FFTW_ESTIMATE);
  plan->plan_mid_forward =
    FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2,
                            NULL,&nmid,1,plan->length2,
                            NULL,&nmid,1,plan->length2,
                            FFTW_FORWARD,FFTW_ESTIMATE);
  plan->plan_mid_backward =
    FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2,
                            NULL,&nmid,1,plan->length2,
                            NULL,&nmid,1,plan->length2,
                            FFTW_BACKWARD,FFTW_ESTIMATE);
  plan->plan_slow_forward =
    FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3,
                            NULL,&nslow,1,plan->length3,
                            NULL,&nslow,1,plan->length3,
                            FFTW_FORWARD,FFTW_ESTIMATE);
  plan->plan_slow_backward =
    FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3,
                            NULL,&nslow,1,plan->length3,
                            NULL,&nslow,1,plan->length3,
                            FFTW_BACKWARD,FFTW_ESTIMATE);

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/(nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }
#else
  plan->cfg_fast_forward = kiss_fft_alloc(nfast,0,NULL,NULL);
  plan->cfg_fast_backward = kiss_fft_alloc(nfast,1,NULL,NULL);

  if (nmid == nfast) {
    plan->cfg_mid_forward = plan->cfg_fast_forward;
    plan->cfg_mid_backward = plan->cfg_fast_backward;
  }
  else {
    plan->cfg_mid_forward = kiss_fft_alloc(nmid,0,NULL,NULL);
    plan->cfg_mid_backward = kiss_fft_alloc(nmid,1,NULL,NULL);
  }

  if (nslow == nfast) {
    plan->cfg_slow_forward = plan->cfg_fast_forward;
    plan->cfg_slow_backward = plan->cfg_fast_backward;
  }
  else if (nslow == nmid) {
    plan->cfg_slow_forward = plan->cfg_mid_forward;
    plan->cfg_slow_backward = plan->cfg_mid_backward;
  }
  else {
    plan->cfg_slow_forward = kiss_fft_alloc(nslow,0,NULL,NULL);
    plan->cfg_slow_backward = kiss_fft_alloc(nslow,1,NULL,NULL);
  }

  if (scaled == 0)
    plan->scaled = 0;
  else {
    plan->scaled = 1;
    plan->norm = 1.0/(nfast*nmid*nslow);
    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
      (out_khi-out_klo+1);
  }

#endif

  return plan;
}
예제 #12
0
int
gmx_fft_init_1d_real(gmx_fft_t *             pfft,
                     int                     nx,
                     gmx_fft_flag gmx_unused flags)
{
    gmx_fft_t      fft;
    int            d;
    int            status;

    if (pfft == NULL)
    {
        gmx_fatal(FARGS, "Invalid opaque FFT datatype pointer.");
        return EINVAL;
    }
    *pfft = NULL;

    if ( (fft = (gmx_fft_t)malloc(sizeof(struct gmx_fft))) == NULL)
    {
        return ENOMEM;
    }

    /* Mark all handles invalid */
    for (d = 0; d < 3; d++)
    {
        fft->inplace[d] = fft->ooplace[d] = NULL;
    }
    fft->ooplace[3] = NULL;

    status = DftiCreateDescriptor(&fft->inplace[0], GMX_DFTI_PREC, DFTI_REAL, 1, (MKL_LONG)nx);

    if (status == 0)
    {
        status = DftiSetValue(fft->inplace[0], DFTI_PLACEMENT, DFTI_INPLACE);
    }

    if (status == 0)
    {
        status = DftiCommitDescriptor(fft->inplace[0]);
    }


    if (status == 0)
    {
        status = DftiCreateDescriptor(&fft->ooplace[0], GMX_DFTI_PREC, DFTI_REAL, 1, (MKL_LONG)nx);
    }

    if (status == 0)
    {
        status = DftiSetValue(fft->ooplace[0], DFTI_PLACEMENT, DFTI_NOT_INPLACE);
    }

    if (status == 0)
    {
        status = DftiCommitDescriptor(fft->ooplace[0]);
    }


    if (status == DFTI_UNIMPLEMENTED)
    {
        gmx_fatal(FARGS,
                  "The linked Intel MKL version (<6.0?) cannot do real FFTs.");
        gmx_fft_destroy(fft);
        return status;
    }


    if (status != 0)
    {
        gmx_fatal(FARGS, "Error initializing Intel MKL FFT; status=%d", status);
        gmx_fft_destroy(fft);
        return status;
    }

    fft->ndim     = 1;
    fft->nx       = nx;
    fft->real_fft = 1;
    fft->work     = NULL;

    *pfft = fft;
    return 0;
}