C++ (Cpp) vloadu Examples

Programming Language: C++ (Cpp)

Method/Function: vloadu

Examples at hotexamples.com: 3

C++ (Cpp) vloadu - 3 examples found. These are the top rated real world C++ (Cpp) examples of vloadu extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: avxmath.c Project: jsandber1138/avxmath

static void double_sincos(char **args, npy_intp *dimensions,
                            npy_intp* steps, void* data)
{
    npy_intp i;
    npy_intp n = dimensions[0];
    char *in = args[0], *out1 = args[1], *out2 = args[2];
    npy_intp in_step = steps[0], out1_step = steps[1], out2_step = steps[2];
    double tmp1[VECTLENDP], tmp2[VECTLENDP];
    vdouble a;
    vdouble2 b;
    int slow_n = n % VECTLENDP;
    if(in_step != sizeof(double) || out1_step != sizeof(double) || 
       out2_step != sizeof(double))
    {
        slow_n = n;
    }
    for(i = 0; i < slow_n; i += VECTLENDP)
    {
        int j;
        for(j = 0; j < VECTLENDP && i + j < slow_n; j++)
        {
            tmp1[j] = *(double *)in;
            in += in_step;            
        }
        a = vloadu(tmp1);
        b = xsincos(a);
        vstoreu(tmp1, b.x);    
        vstoreu(tmp2, b.y);    
        for(j = 0; j < VECTLENDP && i + j < slow_n; j++)
        {
            *(double *)out1 = tmp1[j];
            *(double *)out2 = tmp2[j];
            out1 += out1_step;
            out2 += out2_step;
        }        
    }
    if(n > slow_n)
    {
        double *in_array = (double *)in;
        double *out_array1 = (double *)out1;
        double *out_array2 = (double *)out2;
        for(i = 0; i < n - slow_n; i += VECTLENDP)
        {  
            a = vloadu(in_array + i);
    	    b = xsincos(a);
	    	vstoreu(out_array1 + i, b.x);    
	    	vstoreu(out_array2 + i, b.y); 
        }
    }
}

Example #2

Show file

File: avxmath.c Project: jsandber1138/avxmath

static void double_xloop(char **args, npy_intp *dimensions,
                            npy_intp* steps, void* data)
{
    npy_intp i;
    npy_intp n = dimensions[0];
    char *in = args[0], *out = args[1];
    npy_intp in_step = steps[0], out_step = steps[1];
    avx_func *func = (avx_func *)data;
    vdouble (*f)(vdouble) = func->f;
    double tmp[VECTLENDP];
    vdouble a;
    int slow_n = n % VECTLENDP;
    if(in_step != sizeof(double) || out_step != sizeof(double))
        slow_n = n;
    for(i = 0; i < slow_n; i += VECTLENDP)
    {
        int j;
        for(j = 0; j < VECTLENDP && i + j < slow_n; j++)
        {
            tmp[j] = *(double *)in;
            in += in_step;            
        }
        a = vloadu(tmp);
        a = (*f)(a);
        vstoreu(tmp, a);
        for(j = 0; j < VECTLENDP && i + j < slow_n; j++)
        {
            *(double *)out = tmp[j];
            out += out_step;
        }        
    }
    if(n > slow_n)
    {
        double *in_array = (double *)in;
        double *out_array = (double *)out;
        for(i = 0; i < n - slow_n; i += VECTLENDP)
        {
            a = vloadu(in_array + i);
        	a = (*f)(a);
	        vstoreu(out_array + i, a);    
        }
    }
}

Example #3

Show file

int job_core(int pm,                   // Hemisphere
	     int mm,                   // Grid 'sky position'
	     int nn,                   // Second grid 'sky position'
	     Search_settings *sett,    // Search settings
	     Command_line_opts *opts,  // Search options 
	     Search_range *s_range,    // Range for searching
	     FFTW_plans *plans,        // Plans for fftw
	     FFTW_arrays *fftw_arr,    // Arrays for fftw
	     Aux_arrays *aux,          // Auxiliary arrays
	     int *sgnlc,               // Candidate trigger parameters 
	     FLOAT_TYPE *sgnlv,        // Candidate array 
	     int *FNum) {              // Candidate signal number

  int i, j, n;
  int smin = s_range->sst, smax = s_range->spndr[1];
  double al1, al2, sinalt, cosalt, sindelt, cosdelt, sgnlt[NPAR], 
    nSource[3], het0, sgnl0, ft;
  double _tmp1[sett->nifo][sett->N];

#undef NORMTOMAX
#ifdef NORMTOMAX
  double blkavg, threshold = 6.;
  int imax, imax0, iblk, blkstart, ihi;
  int blksize = 1024;
  int nfft = sett->nmax - sett->nmin;
  static int *Fmax;
  if (!Fmax) Fmax = (int *) malloc(nfft*sizeof(int));
#endif

  struct timespec tstart, tend;
  double spindown_timer = 0;
  int spindown_counter  = 0;
  
  //tstart = get_current_time(CLOCK_REALTIME);

  /* Matrix	M(.,.) (defined on page 22 of PolGrawCWAllSkyReview1.pdf file)
     defines the transformation form integers (bin, ss, nn, mm) determining
     a grid point to linear coordinates omega, omegadot, alpha_1, alpha_2),
     where bin is the frequency bin number and alpha_1 and alpha_2 are
     defined on p. 22 of PolGrawCWAllSkyReview1.pdf file.

     [omega]                          [bin]
     [omegadot]       = M(.,.) \times [ss]
     [alpha_1/omega]                  [nn]
     [alpha_2/omega]                  [mm]

     Array M[.] is related to matrix M(.,.) in the following way;

                 [ M[0] M[4] M[8]  M[12] ]
      M(.,.) =   [ M[1] M[5] M[9]  M[13] ]
                 [ M[2] M[6] M[10] M[14] ]
                 [ M[3] M[7] M[11] M[15] ]

     and

     M[1] = M[2] = M[3] = M[6] = M[7] = 0
  */

  // Grid positions
  al1 = nn*sett->M[10] + mm*sett->M[14];
  al2 = nn*sett->M[11] + mm*sett->M[15];

  // check if the search is in an appropriate region of the grid
  // if not, returns NULL
  if ((sqr(al1)+sqr(al2))/sqr(sett->oms) > 1.) return 0;

  int ss;
  double shft1, phase, cp, sp;
  complex double exph;

  // Change linear (grid) coordinates to real coordinates
  lin2ast(al1/sett->oms, al2/sett->oms, 
	  pm, sett->sepsm, sett->cepsm,
	  &sinalt, &cosalt, &sindelt, &cosdelt);

  // calculate declination and right ascention
  // written in file as candidate signal sky positions
  sgnlt[2] = asin(sindelt);
  sgnlt[3] = fmod(atan2(sinalt, cosalt) + 2.*M_PI, 2.*M_PI);

  het0 = fmod(nn*sett->M[8] + mm*sett->M[12], sett->M[0]);

  // Nyquist frequency 
  int nyqst = (sett->nfft)/2 + 1;

  // Loop for each detector 
  /* Amplitude modulation functions aa and bb 
   * for each detector (in signal sub-struct 
   * of _detector, ifo[n].sig.aa, ifo[n].sig.bb) 
   */
  for(n=0; n<sett->nifo; ++n) {
    modvir(sinalt, cosalt, sindelt, cosdelt,
           sett->N, &ifo[n], aux);

    // Calculate detector positions with respect to baricenter
    nSource[0] = cosalt*cosdelt;
    nSource[1] = sinalt*cosdelt;
    nSource[2] = sindelt;
    
    shft1 = nSource[0]*ifo[n].sig.DetSSB[0]
          + nSource[1]*ifo[n].sig.DetSSB[1]
          + nSource[2]*ifo[n].sig.DetSSB[2];

#define CHUNK 4
#pragma omp parallel default(shared) private(phase,cp,sp,exph)
    {
#pragma omp for schedule(static,CHUNK)
      for(i=0; i<sett->N; ++i) {
	ifo[n].sig.shft[i] = nSource[0]*ifo[n].sig.DetSSB[i*3]
	                   + nSource[1]*ifo[n].sig.DetSSB[i*3+1]
	                   + nSource[2]*ifo[n].sig.DetSSB[i*3+2];
	ifo[n].sig.shftf[i] = ifo[n].sig.shft[i] - shft1;
	_tmp1[n][i] = aux->t2[i] + (double)(2*i)*ifo[n].sig.shft[i];
      }

#pragma omp for schedule(static,CHUNK) 
      for(i=0; i<sett->N; ++i) {
	// Phase modulation 
	phase = het0*i + sett->oms*ifo[n].sig.shft[i];
#ifdef NOSINCOS
	cp = cos(phase);
	sp = sin(phase);
#else
	sincos(phase, &sp, &cp);
#endif

	exph = cp - I*sp;

	// Matched filter 
	ifo[n].sig.xDatma[i] = ifo[n].sig.xDat[i]*ifo[n].sig.aa[i]*exph;
	ifo[n].sig.xDatmb[i] = ifo[n].sig.xDat[i]*ifo[n].sig.bb[i]*exph;
      }

      /* Resampling using spline interpolation:
       * This will double the sampling rate 
       */ 
#pragma omp for schedule(static,CHUNK)  
      for(i=0; i < sett->N; ++i) {
	fftw_arr->xa[i] = ifo[n].sig.xDatma[i];
	fftw_arr->xb[i] = ifo[n].sig.xDatmb[i];
      }
 
      // Zero-padding (filling with 0s up to sett->nfft, 
      // the nearest power of 2)
#pragma omp for schedule(static,CHUNK)
      for (i=sett->N; i<sett->nfft; ++i) {
	fftw_arr->xa[i] = 0.;
	fftw_arr->xb[i] = 0.;
      }
      
    } //omp parallel

    fftw_execute_dft(plans->pl_int,fftw_arr->xa,fftw_arr->xa);  //forward fft (len nfft)
    fftw_execute_dft(plans->pl_int,fftw_arr->xb,fftw_arr->xb);  //forward fft (len nfft)

    // move frequencies from second half of spectrum; 
    // and zero frequencies higher than nyquist
    // loop length: nfft - nyqst = nfft - nfft/2 - 1 = nfft/2 - 1

    for(i=nyqst + sett->Ninterp - sett->nfft, j=nyqst; i<sett->Ninterp; ++i, ++j) {
      fftw_arr->xa[i] = fftw_arr->xa[j];
      fftw_arr->xa[j] = 0.;
    }

    for(i=nyqst + sett->Ninterp - sett->nfft, j=nyqst; i<sett->Ninterp; ++i, ++j) {
      fftw_arr->xb[i] = fftw_arr->xb[j];
      fftw_arr->xb[j] = 0.;
    }

    // Backward fft (len Ninterp = nfft*interpftpad)
    fftw_execute_dft(plans->pl_inv,fftw_arr->xa,fftw_arr->xa);
    fftw_execute_dft(plans->pl_inv,fftw_arr->xb,fftw_arr->xb);

    ft = (double)sett->interpftpad / sett->Ninterp; //scale FFT
    for (i=0; i < sett->Ninterp; ++i) {
      fftw_arr->xa[i] *= ft;
      fftw_arr->xb[i] *= ft;
    }

    //  struct timeval tstart = get_current_time(), tend;

    // Spline interpolation to xDatma, xDatmb arrays
    splintpad(fftw_arr->xa, ifo[n].sig.shftf, sett->N, 
	      sett->interpftpad, ifo[n].sig.xDatma);   
    splintpad(fftw_arr->xb, ifo[n].sig.shftf, sett->N, 
	      sett->interpftpad, ifo[n].sig.xDatmb);

  } // end of detector loop 

  // square sums of modulation factors 
  double aa = 0., bb = 0.; 

  for(n=0; n<sett->nifo; ++n) {

    double aatemp = 0., bbtemp = 0.;
 
    for(i=0; i<sett->N; ++i) {
      aatemp += sqr(ifo[n].sig.aa[i]);
      bbtemp += sqr(ifo[n].sig.bb[i]);
    }

    for(i=0; i<sett->N; ++i) {
      ifo[n].sig.xDatma[i] /= ifo[n].sig.sig2;
      ifo[n].sig.xDatmb[i] /= ifo[n].sig.sig2;
    }

    aa += aatemp/ifo[n].sig.sig2; 
    bb += bbtemp/ifo[n].sig.sig2;   
  }

#ifdef YEPPP
#define VLEN 1024
  int bnd = (sett->N/VLEN)*VLEN;
#endif

  // Check if the signal is added to the data or the range file is given:  
  // if not, proceed with the wide range of spindowns 
  // if yes, use smin = s_range->sst, smax = s_range->spndr[1]  
  if(!strcmp(opts->addsig, "") && !strcmp(opts->range, "")) {
    // Spindown range defined using Smin and Smax (settings.c)  
    smin = trunc((sett->Smin - nn*sett->M[9] - mm*sett->M[13])/sett->M[5]);
    smax = trunc(-(nn*sett->M[9] + mm*sett->M[13] + sett->Smax)/sett->M[5]);
  } 
  
  if(opts->s0_flag) smin = smax;
  // if spindown parameter is taken into account, smin != smax
  
  printf ("\n>>%d\t%d\t%d\t[%d..%d]\n", *FNum, mm, nn, smin, smax);

  static fftw_complex *fxa, *fxb;
  static double *F;
#pragma omp threadprivate(fxa,fxb, F)
#pragma omp threadprivate(F)

  //private loop counter: ss
  //private (declared inside): ii,Fc,het1,k,veto_status,a,v,_p,_c,_s,status
  //shared default: nn,mm,sett,_tmp1,ifo,het0,bnd,plans,opts,aa,bb,
  //                fftw_arr (zostawiamy i robimy nowe), FNum (atomic!)
  //we use shared plans and  fftw_execute with 'new-array' interface
#pragma omp parallel default(shared)				\
  private(i, j, n, sgnl0, exph, phase, cp, sp, tstart, tend)	\
  firstprivate(sgnlt)						\
  reduction(+ : spindown_timer, spindown_counter)

  {
#ifdef YEPPP
    Yep64f _p[VLEN], _s[VLEN], _c[VLEN];
    enum YepStatus status;
#endif
#ifdef SLEEF
    double _p[VECTLENDP], _c[VECTLENDP];
    vdouble2 v;
    vdouble a;
#endif
    
    if (!fxa) fxa = (fftw_complex *)fftw_malloc(fftw_arr->arr_len*sizeof(fftw_complex));
    if (!fxb) fxb = (fftw_complex *)fftw_malloc(fftw_arr->arr_len*sizeof(fftw_complex));
    if (!F) F = (double *)calloc(2*sett->nfft, sizeof(double));
  
    /* Spindown loop  */

#pragma omp for schedule(static,4)
    for(ss=smin; ss<=smax; ++ss) {

#if TIMERS>2
      tstart = get_current_time(CLOCK_PROCESS_CPUTIME_ID);
#endif 

      // Spindown parameter
      sgnlt[1] = ss*sett->M[5] + nn*sett->M[9] + mm*sett->M[13];

      int ii;
      double Fc, het1;
      
#ifdef VERBOSE
      //print a 'dot' every new spindown
      printf ("."); fflush (stdout);
#endif 
      
      het1 = fmod(ss*sett->M[4], sett->M[0]);
      if(het1<0) het1 += sett->M[0];

      sgnl0 = het0 + het1;

      // phase modulation before fft

#if defined(SLEEF)
      // use simd sincos from the SLEEF library;
      // VECTLENDP is a simd vector length defined in the SLEEF library
      // and it depends on selected instruction set e.g. -DENABLE_AVX
      for(i=0; i<sett->N; i+=VECTLENDP) {
	for(j=0; j<VECTLENDP; j++)
	  _p[j] =  het1*(i+j) + sgnlt[1]*_tmp1[0][i+j];
	
	a = vloadu(_p);
	v = xsincos(a);
	vstoreu(_p, v.x); // reuse _p for sin
	vstoreu(_c, v.y);
	
	for(j=0; j<VECTLENDP; ++j){
	  exph = _c[j] - I*_p[j];
	  fxa[i+j] = ifo[0].sig.xDatma[i+j]*exph; //ifo[0].sig.sig2;
	  fxb[i+j] = ifo[0].sig.xDatmb[i+j]*exph; //ifo[0].sig.sig2;
	}
      } 
#elif defined(YEPPP)
      // use yeppp! library;
      // VLEN is length of vector to be processed
      // for caches L1/L2 64/256kb optimal value is ~2048
      for (j=0; j<bnd; j+=VLEN) {
	//double *_tmp2 = &_tmp1[0][j];
	for (i=0; i<VLEN; ++i)
	  //_p[i] =  het1*(i+j) + sgnlt[1]*_tmp2[i];
       	  _p[i] =  het1*(i+j) + sgnlt[1]*_tmp1[0][i+j];
	
	status = yepMath_Sin_V64f_V64f(_p, _s, VLEN);
	assert(status == YepStatusOk);
	status = yepMath_Cos_V64f_V64f(_p, _c, VLEN);
	assert(status == YepStatusOk);

	for (i=0; i<VLEN; ++i) {
	  //	  exph = _c[i] - I*_s[i];
	  fxa[i+j] = ifo[0].sig.xDatma[i+j]*_c[i]-I*ifo[0].sig.xDatma[i+j]*_s[i];
	  fxb[i+j] = ifo[0].sig.xDatmb[i+j]*_c[i]-I*ifo[0].sig.xDatmb[i+j]*_s[i];
	}
      }
      // remaining part is shorter than VLEN - no need to vectorize
      for (i=0; i<sett->N-bnd; ++i){
	j = bnd + i;
	_p[i] =  het1*j + sgnlt[1]*_tmp1[0][j];
      }

      status = yepMath_Sin_V64f_V64f(_p, _s, sett->N-bnd);
      assert(status == YepStatusOk);
      status = yepMath_Cos_V64f_V64f(_p, _c, sett->N-bnd);
      assert(status == YepStatusOk);

      for (i=0; i<sett->N-bnd; ++i) {
	j = bnd + i;
	//exph = _c[i] - I*_s[i];
	//fxa[j] = ifo[0].sig.xDatma[j]*exph;
	//fxb[j] = ifo[0].sig.xDatmb[j]*exph;
	fxa[j] = ifo[0].sig.xDatma[j]*_c[i]-I*ifo[0].sig.xDatma[j]*_s[i];
	fxb[j] = ifo[0].sig.xDatmb[j]*_c[i]-I*ifo[0].sig.xDatmb[j]*_s[i];
      }
#elif defined(GNUSINCOS)
      for(i=sett->N-1; i!=-1; --i) {
        phase = het1*i + sgnlt[1]*_tmp1[0][i];
	sincos(phase, &sp, &cp);
	exph = cp - I*sp;
        fxa[i] = ifo[0].sig.xDatma[i]*exph; //ifo[0].sig.sig2;
        fxb[i] = ifo[0].sig.xDatmb[i]*exph; //ifo[0].sig.sig2;
      }
#else
      for(i=sett->N-1; i!=-1; --i) {
        phase = het1*i + sgnlt[1]*_tmp1[0][i];
	cp = cos(phase);
      	sp = sin(phase);
	exph = cp - I*sp;
        fxa[i] = ifo[0].sig.xDatma[i]*exph; //ifo[0].sig.sig2;
        fxb[i] = ifo[0].sig.xDatmb[i]*exph; //ifo[0].sig.sig2;
      }
#endif

      for(n=1; n<sett->nifo; ++n) {
#if defined(SLEEF)
	// use simd sincos from the SLEEF library;
	// VECTLENDP is a simd vector length defined in the SLEEF library
	// and it depends on selected instruction set e.g. -DENABLE_AVX
	for (i=0; i<sett->N; i+=VECTLENDP) {
	  for(j=0; j<VECTLENDP; j++)
	    _p[j] =  het1*(i+j) + sgnlt[1]*_tmp1[n][i+j];
	  
	  a = vloadu(_p);
	  v = xsincos(a);
	  vstoreu(_p, v.x); // reuse _p for sin
	  vstoreu(_c, v.y);
	
	  for(j=0; j<VECTLENDP; ++j){
	    exph = _c[j] - I*_p[j];
	    fxa[i+j] = ifo[n].sig.xDatma[i+j]*exph;
	    fxb[i+j] = ifo[n].sig.xDatmb[i+j]*exph;
	  }
	} 
#elif defined(YEPPP)
	// use yeppp! library;
	// VLEN is length of vector to be processed
	// for caches L1/L2 64/256kb optimal value is ~2048
	for (j=0; j<bnd; j+=VLEN) {
	  //double *_tmp2 = &_tmp1[n][j];
	  for (i=0; i<VLEN; ++i)
	    //  _p[i] =  het1*(i+j) + sgnlt[1]*_tmp2[i];
	    _p[i] =  het1*(j+i) + sgnlt[1]*_tmp1[n][j+i];
	
	  status = yepMath_Sin_V64f_V64f(_p, _s, VLEN);
	  assert(status == YepStatusOk);
	  status = yepMath_Cos_V64f_V64f(_p, _c, VLEN);
	  assert(status == YepStatusOk);
	
	  for (i=0; i<VLEN; ++i) {
	    //exph = _c[i] - I*_s[i];
	    //fxa[j+i] += ifo[n].sig.xDatma[j+i]*exph;
	    //fxb[j+i] += ifo[n].sig.xDatmb[j+i]*exph;
	    fxa[i+j] += ifo[n].sig.xDatma[i+j]*_c[i]-I*ifo[n].sig.xDatma[i+j]*_s[i];
	    fxb[i+j] += ifo[n].sig.xDatmb[i+j]*_c[i]-I*ifo[n].sig.xDatmb[i+j]*_s[i];
	  }
	}
	// remaining part is shorter than VLEN - no need to vectorize
	for (i=0; i<sett->N-bnd; ++i){
	  j = bnd + i;
	  _p[i] =  het1*j + sgnlt[1]*_tmp1[n][j];
	}

	status = yepMath_Sin_V64f_V64f(_p, _s, sett->N-bnd);
	assert(status == YepStatusOk);
	status = yepMath_Cos_V64f_V64f(_p, _c, sett->N-bnd);
	assert(status == YepStatusOk);

	for (i=0; i<sett->N-bnd; ++i) {
	  j = bnd + i;
	  //exph = _c[i] - I*_s[i];
	  //fxa[j] += ifo[n].sig.xDatma[j]*exph;
	  //fxb[j] += ifo[n].sig.xDatmb[j]*exph;
	  fxa[j] += ifo[n].sig.xDatma[j]*_c[i]-I*ifo[n].sig.xDatma[j]*_s[i];
	  fxb[j] += ifo[n].sig.xDatmb[j]*_c[i]-I*ifo[n].sig.xDatmb[j]*_s[i];
	}

#elif defined(GNUSINCOS)
	for(i=sett->N-1; i!=-1; --i) {
	  phase = het1*i + sgnlt[1]*_tmp1[n][i];
	  sincos(phase, &sp, &cp);
	  exph = cp - I*sp;
	  fxa[i] += ifo[n].sig.xDatma[i]*exph;
	  fxb[i] += ifo[n].sig.xDatmb[i]*exph;
	}
#else
	for(i=sett->N-1; i!=-1; --i) {
	  phase = het1*i + sgnlt[1]*_tmp1[n][i];
	  cp = cos(phase);
	  sp = sin(phase);
	  exph = cp - I*sp;
	  fxa[i] += ifo[n].sig.xDatma[i]*exph;
	  fxb[i] += ifo[n].sig.xDatmb[i]*exph;
	}
#endif

      } 

      // Zero-padding 
      for(i = sett->fftpad*sett->nfft-1; i != sett->N-1; --i)
	fxa[i] = fxb[i] = 0.; 

      fftw_execute_dft(plans->plan, fxa, fxa);
      fftw_execute_dft(plans->plan, fxb, fxb);
      
      // Computing F-statistic 
      for (i=sett->nmin; i<sett->nmax; i++) {
	F[i] = (sqr(creal(fxa[i])) + sqr(cimag(fxa[i])))/aa +
	       (sqr(creal(fxb[i])) + sqr(cimag(fxb[i])))/bb;
      }

      //      for (i=sett->nmin; i<sett->nmax; i++) 
      //	F[i] += (sqr(creal(fxb[i])) + sqr(cimag(fxb[i])))/bb;

#pragma omp atomic
      (*FNum)++;

      
#if 0
      FILE *f1 = fopen("fraw-1.dat", "w");
      for(i=sett->nmin; i<sett->nmax; i++)
	fprintf(f1, "%d   %lf   %lf\n", i, F[i], 2.*M_PI*i/((double) sett->fftpad*sett->nfft) + sgnl0);
      fclose(f1);
#endif 

#ifndef NORMTOMAX
      //#define NAVFSTAT 4096
      // Normalize F-statistics 
      if(!(opts->white_flag))  // if the noise is not white noise
        FStat(F + sett->nmin, sett->nmax - sett->nmin, NAVFSTAT, 0);

      // f1 = fopen("fnorm-4096-1.dat", "w");
      //for(i=sett->nmin; i<sett->nmax; i++)
      //fprintf(f1, "%d   %lf   %lf\n", i, F[i], 2.*M_PI*i/((double) sett->fftpad*sett->nfft) + sgnl0);
      //fclose(f1);
      //      exit(EXIT_SUCCESS);

      for(i=sett->nmin; i<sett->nmax; i++) {
        if ((Fc = F[i]) > opts->trl) { // if F-stat exceeds trl (critical value)
          // Find local maximum for neighboring signals 
          ii = i;

	  while (++i < sett->nmax && F[i] > opts->trl) {
	    if(F[i] >= Fc) {
	      ii = i;
	      Fc = F[i];
	    } // if F[i] 
	  } // while i 
	  // Candidate signal frequency
	  sgnlt[0] = 2.*M_PI*ii/((FLOAT_TYPE)sett->fftpad*sett->nfft) + sgnl0;
	  // Signal-to-noise ratio
	  sgnlt[4] = sqrt(2.*(Fc-sett->nd));
	  
	  // Checking if signal is within a known instrumental line 
	  int k, veto_status = 0; 
	  for(k=0; k<sett->numlines_band; k++)
	    if(sgnlt[0]>=sett->lines[k][0] && sgnlt[0]<=sett->lines[k][1]) { 
	      veto_status=1; 
	      break; 
	    }   
	  
	  int _sgnlc;
	  if(!veto_status) {

	    /* 
#pragma omp critical
	    {
	      (*sgnlc)++; // increase found number
	      // Add new parameters to output array 
	      for (j=0; j<NPAR; ++j)    // save new parameters
		sgnlv[NPAR*(*sgnlc-1)+j] = (FLOAT_TYPE)sgnlt[j];
	    }
	    */

#pragma omp atomic capture
	    {
	      (*sgnlc)++; // increase found number
	      _sgnlc = *sgnlc;
	    }
	    // Add new parameters to output array 
	    for (j=0; j<NPAR; ++j)    // save new parameters
	      sgnlv[NPAR*(_sgnlc-1)+j] = (FLOAT_TYPE)sgnlt[j];
	    
#ifdef VERBOSE
	    printf ("\nSignal %d: %d %d %d %d %d snr=%.2f\n", 
		    *sgnlc, pm, mm, nn, ss, ii, sgnlt[4]);
#endif
	  }
	} // if Fc > trl 
      } // for i
      
#else // new version
      imax = -1;
      // find local maxima first
      //printf("nmin=%d   nmax=%d    nfft=%d   nblocks=%d\n", sett->nmin, sett->nmax, nfft, nfft/blksize);
      for(iblk=0; iblk < nfft/blksize; ++iblk) {
	blkavg = 0.;
	blkstart = sett->nmin + iblk*blksize; // block start index in F 
	// in case the last block is shorter than blksize, include its elements in the previous block
	if(iblk==(nfft/blksize-1)) {blksize = sett->nmax - blkstart;}
	imax0 = imax+1; // index of first maximum in current block
	//printf("\niblk=%d   blkstart=%d   blksize=%d    imax0=%d\n", iblk, blkstart, blksize, imax0);
	for(i=1; i <= blksize; ++i) { // include first element of the next block
	  ii = blkstart + i;
	  if(ii < sett->nmax) 
	    {ihi=ii+1;} 
	  else 
	    {ihi = sett->nmax; /*printf("ihi=%d  ii=%d\n", ihi, ii);*/};
	  if(F[ii] > F[ii-1] && F[ii] > F[ihi]) {
	    blkavg += F[ii];
	    Fmax[++imax] = ii;
	    ++i; // next element can't be maximum - skip it
	  }
	} // i
	// now imax points to the last element of Fmax
	// normalize in blocks 
	blkavg /= (double)(imax - imax0 + 1);
	for(i=imax0; i <= imax; ++i)
	  F[Fmax[i]] /= blkavg;

      } // iblk

      //f1 = fopen("fmax.dat", "w");
      //for(i=1; i < imax; i++)
      //fprintf(f1, "%d   %lf \n", Fmax[i], F[Fmax[i]]);
      //fclose(f1);
      //exit(EXIT_SUCCESS);

      // apply threshold limit
      for(i=0; i <= imax; ++i){
	//if(F[Fmax[i]] > opts->trl) {
	if(F[Fmax[i]] > threshold) {
	  sgnlt[0] = 2.*M_PI*i/((FLOAT_TYPE)sett->fftpad*sett->nfft) + sgnl0;
	  // Signal-to-noise ratio
	  sgnlt[4] = sqrt(2.*(F[Fmax[i]] - sett->nd));

	  // Checking if signal is within a known instrumental line 
	  int k, veto_status=0; 
	  for(k=0; k<sett->numlines_band; k++)
	    if(sgnlt[0]>=sett->lines[k][0] && sgnlt[0]<=sett->lines[k][1]) { 
	      veto_status=1; 
	      break; 
	    }   
	  
	  if(!veto_status) { 
	    
	    (*sgnlc)++; // increase number of found candidates
	    // Add new parameters to buffer array 
	    for (j=0; j<NPAR; ++j)
	      sgnlv[NPAR*(*sgnlc-1)+j] = (FLOAT_TYPE)sgnlt[j];
#ifdef VERBOSE
	    printf ("\nSignal %d: %d %d %d %d %d snr=%.2f\n", 
		    *sgnlc, pm, mm, nn, ss, Fmax[i], sgnlt[4]);
#endif 
	  }
	}
      } // i
#endif // old/new version
      
#if TIMERS>2
      tend = get_current_time(CLOCK_PROCESS_CPUTIME_ID);
      spindown_timer += get_time_difference(tstart, tend);
      spindown_counter++;
#endif
      
    } // for ss 
  } // omp parallel
  
#ifndef VERBOSE 
  printf("Number of signals found: %d\n", *sgnlc); 
#endif 

  //  tend = get_current_time(CLOCK_REALTIME);
  //time_elapsed = get_time_difference(tstart, tend);
  //printf("Parallel part: %e  ( per thread %e ) s\n", time_elapsed, time_elapsed/omp_get_max_threads());


#if TIMERS>2
  printf("\nTotal spindown loop time: %e s, mean spindown cpu-time: %e s (%d runs)\n",
	 spindown_timer, spindown_timer/spindown_counter, spindown_counter);
#endif

  return 0;
  
} // jobcore