示例#1
0
/*
 * Self-test driver for the SSE vector helpers and, when compiled with
 * __AVX__, their AVX counterparts.
 *
 * Aligned REAL4/REAL8 input vectors are filled with linear ramps, each vector
 * routine is run once, and every output element is compared against the same
 * operation carried out with scalar arithmetic (or libm's exp()). For each
 * operation the largest absolute and relative deviation is printed to stderr;
 * the deviations are only reported, not checked against a tolerance.
 *
 * Returns 0 when all calls succeed. Allocation or routine failures are routed
 * through XLAL_CHECK with XLAL_EFUNC (NOTE(review): XLAL_CHECK is expected to
 * return that code from main immediately, without freeing what was already
 * allocated -- confirm against the XLAL headers).
 */
int main(void)
{

   INT4 ii, length = 100000;
   REAL4VectorAligned *floatvalues1 = NULL, *floatvalues2 = NULL, *floatvalues3 = NULL;
   alignedREAL8Vector *doublevalues1 = NULL, *doublevalues2 = NULL, *doublevalues3 = NULL;
   alignedREAL4VectorArray *floatvalues = NULL;
   XLAL_CHECK( (floatvalues1 = XLALCreateREAL4VectorAligned(length, 32)) != NULL, XLAL_EFUNC );
   XLAL_CHECK( (floatvalues2 = XLALCreateREAL4VectorAligned(length, 32)) != NULL, XLAL_EFUNC );
   XLAL_CHECK( (floatvalues3 = XLALCreateREAL4VectorAligned(length, 32)) != NULL, XLAL_EFUNC );
   XLAL_CHECK( (doublevalues1 = createAlignedREAL8Vector(length, 32)) != NULL, XLAL_EFUNC );
   XLAL_CHECK( (doublevalues2 = createAlignedREAL8Vector(length, 32)) != NULL, XLAL_EFUNC );
   XLAL_CHECK( (doublevalues3 = createAlignedREAL8Vector(length, 32)) != NULL, XLAL_EFUNC );
   XLAL_CHECK( (floatvalues = createAlignedREAL4VectorArray(2, length, 32)) != NULL, XLAL_EFUNC );

   /* Inputs: set 1 is a ramp centred on zero, set 2 is a non-negative ramp,
    * set 3 is a narrower centred ramp (its REAL8 copy is the exp() argument). */
   for (ii=0; ii<length; ii++) {
      floatvalues1->data[ii] = (REAL4)(ii-length/2)*2.0e-3;
      doublevalues1->data[ii] = (REAL8)(ii-length/2)*2.0e-3;
      floatvalues2->data[ii] = (REAL4)(ii)*1.0e-3;
      doublevalues2->data[ii] = (REAL8)(ii)*1.0e-3;
      floatvalues3->data[ii] = (REAL4)(ii-length/2)*2.0e-4;
      doublevalues3->data[ii] = (REAL8)(ii-length/2)*2.0e-4;
   }
   /* The vector array carries copies of input sets 1 and 2 in slots 0 and 1. */
   memcpy(floatvalues->data[0]->data, floatvalues1->data, sizeof(REAL4)*length);
   memcpy(floatvalues->data[1]->data, floatvalues2->data, sizeof(REAL4)*length);

   /* Output buffers, one per tested operation. */
   REAL4VectorAligned *floatresult_vecsum = NULL, *floatresult_vecmult = NULL, *floatresult_addscalar = NULL, *floatresult_scale = NULL;
   alignedREAL8Vector *doubleresult_exp = NULL, *doubleresult_addscalar = NULL, *doubleresult_scale = NULL;
   alignedREAL4VectorArray *arraysumresult = NULL;
   XLAL_CHECK( (doubleresult_exp = createAlignedREAL8Vector(doublevalues1->length, 32)) != NULL, XLAL_EFUNC );
   XLAL_CHECK( (floatresult_vecsum = XLALCreateREAL4VectorAligned(floatvalues1->length, 32)) != NULL, XLAL_EFUNC );
   XLAL_CHECK( (floatresult_vecmult = XLALCreateREAL4VectorAligned(floatvalues1->length, 32)) != NULL, XLAL_EFUNC );
   XLAL_CHECK( (floatresult_addscalar = XLALCreateREAL4VectorAligned(floatvalues1->length, 32)) != NULL, XLAL_EFUNC );
   XLAL_CHECK( (floatresult_scale = XLALCreateREAL4VectorAligned(floatvalues1->length, 32)) != NULL, XLAL_EFUNC );
   XLAL_CHECK( (doubleresult_addscalar = createAlignedREAL8Vector(doublevalues1->length, 32)) != NULL, XLAL_EFUNC );
   XLAL_CHECK( (doubleresult_scale = createAlignedREAL8Vector(doublevalues1->length, 32)) != NULL, XLAL_EFUNC );
   XLAL_CHECK( (arraysumresult = createAlignedREAL4VectorArray(2, length, 32)) != NULL, XLAL_EFUNC );
   memset(arraysumresult->data[0]->data, 0, sizeof(REAL4)*length);
   memset(arraysumresult->data[1]->data, 0, sizeof(REAL4)*length);

   /* ---------- SSE pass ---------- */
   XLAL_CHECK( sse_exp_REAL8Vector(doubleresult_exp, doublevalues3) == XLAL_SUCCESS, XLAL_EFUNC );
   XLAL_CHECK( sseSSVectorSum(floatresult_vecsum, floatvalues1, floatvalues2) == XLAL_SUCCESS, XLAL_EFUNC );
   XLAL_CHECK( sseSSVectorMultiply(floatresult_vecmult, floatvalues1, floatvalues2) == XLAL_SUCCESS, XLAL_EFUNC );
   XLAL_CHECK( sseAddScalarToREAL4Vector(floatresult_addscalar, floatvalues1, (REAL4)100.0) == XLAL_SUCCESS, XLAL_EFUNC );
   XLAL_CHECK( sseScaleREAL4Vector(floatresult_scale, floatvalues1, (REAL4)100.0) == XLAL_SUCCESS, XLAL_EFUNC );
   XLAL_CHECK( sseAddScalarToREAL8Vector(doubleresult_addscalar, doublevalues1, 100.0) == XLAL_SUCCESS, XLAL_EFUNC );
   XLAL_CHECK( sseScaleREAL8Vector(doubleresult_scale, doublevalues1, 100.0) == XLAL_SUCCESS, XLAL_EFUNC );
   XLAL_CHECK( sseSSVectorArraySum(arraysumresult, floatvalues, floatvalues, 0, 1, 0, 1) == XLAL_SUCCESS, XLAL_EFUNC );

   /* Running maxima of the absolute / relative deviation for each operation. */
   REAL4 maxfloaterr_vecsum = 0.0, maxfloatrelerr_vecsum = 0.0, maxfloaterr_vecmult = 0.0, maxfloatrelerr_vecmult = 0.0, maxfloaterr_addscalar = 0.0, maxfloatrelerr_addscalar = 0.0, maxfloaterr_scale = 0.0, maxfloatrelerr_scale = 0.0, maxfloaterr_seqsum = 0.0, maxfloatrelerr_seqsum = 0.0;
   REAL8 maxdoubleerr_exp = 0.0, maxdoublerelerr_exp = 0.0, maxdoubleerr_addscalar = 0.0, maxdoublerelerr_addscalar = 0.0, maxdoubleerr_scale = 0.0, maxdoublerelerr_scale = 0.0;
   for (ii=0; ii<length; ii++) {
      /* exp(): reference is libm */
      REAL8 exp_libm = exp(doublevalues3->data[ii]);
      REAL8 doubleerr = fabs(doubleresult_exp->data[ii] - exp_libm);
      REAL8 doublerelerr = Relerr( doubleerr, exp_libm );
      maxdoubleerr_exp = fmax(doubleerr, maxdoubleerr_exp);
      maxdoublerelerr_exp = fmax(doublerelerr, maxdoublerelerr_exp);

      /* element-wise sum of two REAL4 vectors */
      REAL4 sumval = (REAL4)(floatvalues1->data[ii] + floatvalues2->data[ii]);
      REAL4 floaterr = fabsf(floatresult_vecsum->data[ii] - sumval);
      REAL4 floatrelerr = Relfloaterr( floaterr, sumval );
      maxfloaterr_vecsum = fmaxf(floaterr, maxfloaterr_vecsum);
      maxfloatrelerr_vecsum = fmaxf(floatrelerr, maxfloatrelerr_vecsum);

      /* element-wise product of two REAL4 vectors */
      REAL4 multval = (REAL4)(floatvalues1->data[ii]*floatvalues2->data[ii]);
      floaterr = fabsf(floatresult_vecmult->data[ii] - multval);
      floatrelerr = Relfloaterr(floaterr, multval);
      maxfloaterr_vecmult = fmaxf(floaterr, maxfloaterr_vecmult);
      maxfloatrelerr_vecmult = fmaxf(floatrelerr, maxfloatrelerr_vecmult);

      /* vector + scalar, REAL4 and REAL8 */
      sumval = (REAL4)(floatvalues1->data[ii]+(REAL4)100.0);
      REAL8 sumvald = (doublevalues1->data[ii]+100.0);
      floaterr = fabsf(floatresult_addscalar->data[ii] - sumval);
      floatrelerr = Relfloaterr(floaterr, sumval);
      doubleerr = fabs(doubleresult_addscalar->data[ii] - sumvald);
      doublerelerr = Relerr( doubleerr, sumvald );
      maxfloaterr_addscalar = fmaxf(floaterr, maxfloaterr_addscalar);
      maxfloatrelerr_addscalar = fmaxf(floatrelerr, maxfloatrelerr_addscalar);
      maxdoubleerr_addscalar = fmax(doubleerr, maxdoubleerr_addscalar);
      maxdoublerelerr_addscalar = fmax(doublerelerr, maxdoublerelerr_addscalar);

      /* vector * scalar, REAL4 and REAL8 */
      multval = (REAL4)(floatvalues1->data[ii]*(REAL4)100.0);
      REAL8 multvald = (doublevalues1->data[ii]*100.0);
      floaterr = fabsf(floatresult_scale->data[ii] - multval);
      floatrelerr = Relfloaterr(floaterr, multval);
      doubleerr = fabs(doubleresult_scale->data[ii] - multvald);
      doublerelerr = Relerr( doubleerr, multvald );
      maxfloaterr_scale = fmaxf(floaterr, maxfloaterr_scale);
      maxfloatrelerr_scale = fmaxf(floatrelerr, maxfloatrelerr_scale);
      maxdoubleerr_scale = fmax(doubleerr, maxdoubleerr_scale);
      maxdoublerelerr_scale = fmax(doublerelerr, maxdoublerelerr_scale);

      /* array sum: output slot 0 is compared with input set 1 + input set 2 */
      floaterr = fabsf(arraysumresult->data[0]->data[ii] - (REAL4)(floatvalues1->data[ii]+floatvalues2->data[ii]));
      floatrelerr = Relfloaterr(floaterr, (REAL4)(floatvalues1->data[ii]+floatvalues2->data[ii]));
      maxfloaterr_seqsum = fmaxf(floaterr, maxfloaterr_seqsum);
      maxfloatrelerr_seqsum = fmaxf(floatrelerr, maxfloatrelerr_seqsum);
   }

   fprintf(stderr, "Test results SSE:\n");
   fprintf(stderr, "-----------------\n");
   fprintf(stderr, "Add REAL4Vectors: max error = %g, max relative error = %g\n", maxfloaterr_vecsum, maxfloatrelerr_vecsum);
   fprintf(stderr, "Multiply REAL4Vectors: max error = %g, max relative error = %g\n", maxfloaterr_vecmult, maxfloatrelerr_vecmult);
   fprintf(stderr, "Add scalar to REAL4Vector: max error = %g, max relative error = %g\n", maxfloaterr_addscalar, maxfloatrelerr_addscalar);
   fprintf(stderr, "Add scalar to REAL8Vector: max error = %g, max relative error = %g\n", maxdoubleerr_addscalar, maxdoublerelerr_addscalar);
   fprintf(stderr, "Scale REAL4Vector: max error = %g, max relative error = %g\n", maxfloaterr_scale, maxfloatrelerr_scale);
   fprintf(stderr, "Scale REAL8Vector: max error = %g, max relative error = %g\n", maxdoubleerr_scale, maxdoublerelerr_scale);
   fprintf(stderr, "exp(REAL8Vector): max error = %g, max relative error = %g\n", maxdoubleerr_exp, maxdoublerelerr_exp);
   fprintf(stderr, "Sum vectors of vector array into vector array: max error = %g, max relative error = %g\n", maxfloaterr_seqsum, maxfloatrelerr_seqsum);

#ifdef __AVX__
   /* ---------- AVX pass: overwrites the same output buffers ---------- */
   XLAL_CHECK( avxSSVectorSum(floatresult_vecsum, floatvalues1, floatvalues2) == XLAL_SUCCESS, XLAL_EFUNC );
   XLAL_CHECK( avxSSVectorMultiply(floatresult_vecmult, floatvalues1, floatvalues2) == XLAL_SUCCESS, XLAL_EFUNC );
   XLAL_CHECK( avxAddScalarToREAL4Vector(floatresult_addscalar, floatvalues1, (REAL4)100.0) == XLAL_SUCCESS, XLAL_EFUNC );
   XLAL_CHECK( avxScaleREAL4Vector(floatresult_scale, floatvalues1, (REAL4)100.0) == XLAL_SUCCESS, XLAL_EFUNC );
   XLAL_CHECK( avxAddScalarToREAL8Vector(doubleresult_addscalar, doublevalues1, 100.0) == XLAL_SUCCESS, XLAL_EFUNC );
   XLAL_CHECK( avxScaleREAL8Vector(doubleresult_scale, doublevalues1, 100.0) == XLAL_SUCCESS, XLAL_EFUNC );
   XLAL_CHECK( avxSSVectorArraySum(arraysumresult, floatvalues, floatvalues, 0, 1, 0, 1) == XLAL_SUCCESS, XLAL_EFUNC );

   /* Restart the statistics so the AVX report reflects the AVX pass only,
    * instead of the maximum over both passes. */
   maxfloaterr_vecsum = maxfloatrelerr_vecsum = 0.0;
   maxfloaterr_vecmult = maxfloatrelerr_vecmult = 0.0;
   maxfloaterr_addscalar = maxfloatrelerr_addscalar = 0.0;
   maxfloaterr_scale = maxfloatrelerr_scale = 0.0;
   maxfloaterr_seqsum = maxfloatrelerr_seqsum = 0.0;
   maxdoubleerr_addscalar = maxdoublerelerr_addscalar = 0.0;
   maxdoubleerr_scale = maxdoublerelerr_scale = 0.0;

   /* Same comparisons as the SSE pass (exp() has no AVX variant here), using
    * the same Relerr/Relfloaterr helpers so both reports are computed alike. */
   for (ii=0; ii<length; ii++) {
      /* element-wise sum of two REAL4 vectors */
      REAL4 sumval = (REAL4)(floatvalues1->data[ii] + floatvalues2->data[ii]);
      REAL4 floaterr = fabsf(floatresult_vecsum->data[ii] - sumval);
      REAL4 floatrelerr = Relfloaterr(floaterr, sumval);
      maxfloaterr_vecsum = fmaxf(floaterr, maxfloaterr_vecsum);
      maxfloatrelerr_vecsum = fmaxf(floatrelerr, maxfloatrelerr_vecsum);

      /* element-wise product of two REAL4 vectors */
      REAL4 multval = (REAL4)(floatvalues1->data[ii]*floatvalues2->data[ii]);
      floaterr = fabsf(floatresult_vecmult->data[ii] - multval);
      floatrelerr = Relfloaterr(floaterr, multval);
      maxfloaterr_vecmult = fmaxf(floaterr, maxfloaterr_vecmult);
      maxfloatrelerr_vecmult = fmaxf(floatrelerr, maxfloatrelerr_vecmult);

      /* vector + scalar, REAL4 and REAL8 */
      sumval = (REAL4)(floatvalues1->data[ii]+(REAL4)100.0);
      REAL8 sumvald = (doublevalues1->data[ii]+100.0);
      floaterr = fabsf(floatresult_addscalar->data[ii] - sumval);
      floatrelerr = Relfloaterr(floaterr, sumval);
      REAL8 doubleerr = fabs(doubleresult_addscalar->data[ii] - sumvald);
      REAL8 doublerelerr = Relerr( doubleerr, sumvald );
      maxfloaterr_addscalar = fmaxf(floaterr, maxfloaterr_addscalar);
      maxfloatrelerr_addscalar = fmaxf(floatrelerr, maxfloatrelerr_addscalar);
      maxdoubleerr_addscalar = fmax(doubleerr, maxdoubleerr_addscalar);
      maxdoublerelerr_addscalar = fmax(doublerelerr, maxdoublerelerr_addscalar);

      /* vector * scalar, REAL4 and REAL8 */
      multval = (REAL4)(floatvalues1->data[ii]*(REAL4)100.0);
      REAL8 multvald = (doublevalues1->data[ii]*100.0);
      floaterr = fabsf(floatresult_scale->data[ii] - multval);
      floatrelerr = Relfloaterr(floaterr, multval);
      doubleerr = fabs(doubleresult_scale->data[ii] - multvald);
      doublerelerr = Relerr( doubleerr, multvald );
      maxfloaterr_scale = fmaxf(floaterr, maxfloaterr_scale);
      maxfloatrelerr_scale = fmaxf(floatrelerr, maxfloatrelerr_scale);
      maxdoubleerr_scale = fmax(doubleerr, maxdoubleerr_scale);
      maxdoublerelerr_scale = fmax(doublerelerr, maxdoublerelerr_scale);

      /* array sum: read the REAL4 samples of output slot 0; data[ii] on the
       * array itself is a vector pointer, and only slots 0 and 1 exist. */
      sumval = (REAL4)(floatvalues1->data[ii]+floatvalues2->data[ii]);
      floaterr = fabsf(arraysumresult->data[0]->data[ii] - sumval);
      floatrelerr = Relfloaterr(floaterr, sumval);
      maxfloaterr_seqsum = fmaxf(floaterr, maxfloaterr_seqsum);
      maxfloatrelerr_seqsum = fmaxf(floatrelerr, maxfloatrelerr_seqsum);
   }

   fprintf(stderr, "Test results AVX:\n");
   fprintf(stderr, "-----------------\n");
   fprintf(stderr, "Add REAL4Vectors: max error = %g, max relative error = %g\n", maxfloaterr_vecsum, maxfloatrelerr_vecsum);
   fprintf(stderr, "Multiply REAL4Vectors: max error = %g, max relative error = %g\n", maxfloaterr_vecmult, maxfloatrelerr_vecmult);
   fprintf(stderr, "Add scalar to REAL4Vector: max error = %g, max relative error = %g\n", maxfloaterr_addscalar, maxfloatrelerr_addscalar);
   fprintf(stderr, "Add scalar to REAL8Vector: max error = %g, max relative error = %g\n", maxdoubleerr_addscalar, maxdoublerelerr_addscalar);
   fprintf(stderr, "Scale REAL4Vector: max error = %g, max relative error = %g\n", maxfloaterr_scale, maxfloatrelerr_scale);
   fprintf(stderr, "Scale REAL8Vector: max error = %g, max relative error = %g\n", maxdoubleerr_scale, maxdoublerelerr_scale);
   fprintf(stderr, "Sum vectors of vector array into vector array: max error = %g, max relative error = %g\n", maxfloaterr_seqsum, maxfloatrelerr_seqsum);
#endif

   /* Release every buffer allocated above, including the third input set. */
   XLALDestroyREAL4VectorAligned(floatvalues1);
   destroyAlignedREAL8Vector(doublevalues1);
   XLALDestroyREAL4VectorAligned(floatvalues2);
   destroyAlignedREAL8Vector(doublevalues2);
   XLALDestroyREAL4VectorAligned(floatvalues3);
   destroyAlignedREAL8Vector(doublevalues3);
   destroyAlignedREAL8Vector(doubleresult_exp);
   XLALDestroyREAL4VectorAligned(floatresult_vecsum);
   XLALDestroyREAL4VectorAligned(floatresult_vecmult);
   XLALDestroyREAL4VectorAligned(floatresult_addscalar);
   XLALDestroyREAL4VectorAligned(floatresult_scale);
   destroyAlignedREAL8Vector(doubleresult_addscalar);
   destroyAlignedREAL8Vector(doubleresult_scale);
   destroyAlignedREAL4VectorArray(floatvalues);
   destroyAlignedREAL4VectorArray(arraysumresult);

   return 0;

}
示例#2
0
// ---------- main ----------
// Test/benchmark driver for the vector-math routines.
//
// Reads four optional user variables (randSeed, Nruns, inAlign, outAlign), allocates
// input, output and reference vectors of Ntrials elements, then for each function group
// fills the inputs with random values and invokes the matching TESTBENCH_VECTORMATH_* macro.
// Finally frees all vectors and user variables and runs the LAL memory-leak check.
//
// Returns XLAL_SUCCESS at the end; failed checks go through XLAL_CHECK, and the process
// exits with status 1 when the user-input parser sets 'should_exit'.
//
// NOTE(review): the TESTBENCH_VECTORMATH_* macros and frand() are defined outside this view.
// Several locals below (Nruns, tic, toc, maxErr, maxRelerr, abstol, reltol, and the xOut*/xOutRef*
// pointers) are not referenced directly anywhere else in this function, so presumably the macros
// pick them up by name -- confirm before renaming or removing any of them.
int
main ( int argc, char *argv[] )
{
  UserInput_t XLAL_INIT_DECL(uvar_s);
  UserInput_t *uvar = &uvar_s;

  // defaults, used when the corresponding option is not given
  uvar->randSeed = 1;
  uvar->Nruns = 1;
  uvar->inAlign = uvar->outAlign = sizeof(void*);
  // ---------- register user-variable ----------
  XLALRegisterUvarMember(  randSeed,            INT4, 's', OPTIONAL, "Random-number seed");
  XLALRegisterUvarMember(  Nruns,               INT4, 'r', OPTIONAL, "Number of repeated timing 'runs' to average over (=improves variance)" );
  XLALRegisterUvarMember(  inAlign,             INT4, 'a', OPTIONAL, "Alignment of input vectors; default is sizeof(void*), i.e. no particular alignment" );
  XLALRegisterUvarMember(  outAlign,            INT4, 'b', OPTIONAL, "Alignment of output vectors; default is sizeof(void*), i.e. no particular alignment" );

  // parse command line / config input; 'should_exit' is set by the parser
  // (NOTE(review): presumably for help/version requests -- confirm in the UserInput API docs)
  BOOLEAN should_exit = 0;
  XLAL_CHECK( XLALUserVarReadAllInput( &should_exit, argc, argv, lalVCSInfoList ) == XLAL_SUCCESS, XLAL_EFUNC );
  if ( should_exit ) {
    exit (1);
  }

  // seed the C library generator so runs with the same randSeed are repeatable
  srand ( uvar->randSeed );
  // Nruns is validated as >= 1 before being converted to unsigned
  XLAL_CHECK ( uvar->Nruns >= 1, XLAL_EDOM );
  UINT4 Nruns = (UINT4)uvar->Nruns;

  // NOTE(review): the '+ 7' presumably makes the length a non-multiple of the SIMD width,
  // so that any remainder handling in the vector routines gets exercised -- confirm.
  UINT4 Ntrials = 1000000 + 7;
  // single-precision inputs use inAlign, outputs and reference outputs use outAlign
  REAL4VectorAligned *xIn_a, *xIn2_a, *xOut_a, *xOut2_a;
  XLAL_CHECK ( ( xIn_a   = XLALCreateREAL4VectorAligned ( Ntrials, uvar->inAlign )) != NULL, XLAL_EFUNC );
  XLAL_CHECK ( ( xIn2_a  = XLALCreateREAL4VectorAligned ( Ntrials, uvar->inAlign )) != NULL, XLAL_EFUNC );
  XLAL_CHECK ( ( xOut_a  = XLALCreateREAL4VectorAligned ( Ntrials, uvar->outAlign )) != NULL, XLAL_EFUNC );
  XLAL_CHECK ( ( xOut2_a = XLALCreateREAL4VectorAligned ( Ntrials, uvar->outAlign )) != NULL, XLAL_EFUNC );
  REAL4VectorAligned *xOutRef_a, *xOutRef2_a;
  XLAL_CHECK ( (xOutRef_a  = XLALCreateREAL4VectorAligned ( Ntrials, uvar->outAlign )) != NULL, XLAL_EFUNC );
  XLAL_CHECK ( (xOutRef2_a = XLALCreateREAL4VectorAligned ( Ntrials, uvar->outAlign )) != NULL, XLAL_EFUNC );

  // extract aligned REAL4 vectors from these
  REAL4 *xIn      = xIn_a->data;
  REAL4 *xIn2     = xIn2_a->data;
  REAL4 *xOut     = xOut_a->data;
  REAL4 *xOut2    = xOut2_a->data;
  REAL4 *xOutRef  = xOutRef_a->data;
  REAL4 *xOutRef2 = xOutRef2_a->data;

  // unsigned-integer output and reference vectors (plain, not alignment-controlled);
  // NOTE(review): presumably filled by the SS2uU 'Find' testbench macros below -- confirm
  UINT4Vector *xOutU4;
  UINT4Vector *xOutRefU4;
  XLAL_CHECK ( ( xOutU4 = XLALCreateUINT4Vector ( Ntrials )) != NULL, XLAL_EFUNC );
  XLAL_CHECK ( ( xOutRefU4 = XLALCreateUINT4Vector ( Ntrials )) != NULL, XLAL_EFUNC );

  // double-precision counterparts of the input/output/reference vectors
  REAL8VectorAligned *xInD_a, *xIn2D_a, *xOutD_a, *xOutRefD_a;
  XLAL_CHECK ( ( xInD_a   = XLALCreateREAL8VectorAligned ( Ntrials, uvar->inAlign )) != NULL, XLAL_EFUNC );
  XLAL_CHECK ( ( xIn2D_a  = XLALCreateREAL8VectorAligned ( Ntrials, uvar->inAlign )) != NULL, XLAL_EFUNC );
  XLAL_CHECK ( ( xOutD_a  = XLALCreateREAL8VectorAligned ( Ntrials, uvar->outAlign )) != NULL, XLAL_EFUNC );
  XLAL_CHECK ( (xOutRefD_a= XLALCreateREAL8VectorAligned ( Ntrials, uvar->outAlign )) != NULL, XLAL_EFUNC );

  // extract aligned REAL8 vectors from these
  REAL8 *xInD      = xInD_a->data;
  REAL8 *xIn2D     = xIn2D_a->data;
  REAL8 *xOutD     = xOutD_a->data;
  REAL8 *xOutRefD  = xOutRefD_a->data;


  // scratch state for the testbench macros: abstol/reltol are re-assigned before each group;
  // NOTE(review): tic/toc look like timing stamps and maxErr/maxRelerr like error maxima
  // maintained inside the macros -- confirm against the macro definitions
  REAL8 tic, toc;
  REAL4 maxErr = 0, maxRelerr = 0;
  REAL4 abstol, reltol;

  // input range below matches the message assuming frand() returns values in [0,1] (TODO confirm)
  XLALPrintInfo ("Testing sin(x), cos(x) for x in [-1000, 1000]\n");
  for ( UINT4 i = 0; i < Ntrials; i ++ ) {
    xIn[i] = 2000 * ( frand() - 0.5 );
  }
  abstol = 2e-7, reltol = 1e-5;
  // ==================== SIN() ====================
  TESTBENCH_VECTORMATH_S2S(Sin,xIn);

  // ==================== COS() ====================
  TESTBENCH_VECTORMATH_S2S(Cos,xIn);

  // ==================== SINCOS() ====================
  TESTBENCH_VECTORMATH_S2SS(SinCos,xIn);

  // ==================== SINCOS(2PI*x) ====================
  TESTBENCH_VECTORMATH_S2SS(SinCos2Pi,xIn);

  // ==================== EXP() ====================
  XLALPrintInfo ("\nTesting exp(x) for x in [-10, 10]\n");
  for ( UINT4 i = 0; i < Ntrials; i ++ ) {
    xIn[i] = 20 * ( frand() - 0.5 );
  }

  abstol = 4e-3, reltol = 3e-7;
  TESTBENCH_VECTORMATH_S2S(Exp,xIn);

  // ==================== LOG() ====================
  XLALPrintInfo ("\nTesting log(x) for x in (0, 10000]\n");
  // the 1e-6 offset keeps the log() argument strictly positive when frand() is non-negative
  for ( UINT4 i = 0; i < Ntrials; i ++ ) {
    xIn[i] = 10000.0f * frand() + 1e-6;
  } // for i < Ntrials
  abstol = 2e-6, reltol = 2e-7;

  TESTBENCH_VECTORMATH_S2S(Log,xIn);

  // ==================== ADD,MUL ====================
  // refill both single- and double-precision input pairs; note the double-precision
  // inputs span a range ten times wider than the single-precision ones
  for ( UINT4 i = 0; i < Ntrials; i ++ ) {
    xIn[i]  = -10000.0f + 20000.0f * frand() + 1e-6;
    xIn2[i] = -10000.0f + 20000.0f * frand() + 1e-6;
    xInD[i] = -100000.0 + 200000.0 * frand() + 1e-6;
    xIn2D[i]= -100000.0 + 200000.0 * frand() + 1e-6;
  } // for i < Ntrials
  abstol = 2e-7, reltol = 2e-7;

  XLALPrintInfo ("\nTesting add,multiply,shift,scale(x,y) for x,y in (-10000, 10000]\n");
  TESTBENCH_VECTORMATH_SS2S(Add,xIn,xIn2);
  TESTBENCH_VECTORMATH_SS2S(Multiply,xIn,xIn2);
  TESTBENCH_VECTORMATH_SS2S(Max,xIn,xIn2);

  // for Shift/Scale the first argument is the single element xIn[0] (a scalar), not a vector
  TESTBENCH_VECTORMATH_SS2S(Shift,xIn[0],xIn2);
  TESTBENCH_VECTORMATH_SS2S(Scale,xIn[0],xIn2);

  // double-precision Scale, again with a single element as the first argument
  TESTBENCH_VECTORMATH_DD2D(Scale,xInD[0],xIn2D);

  // ==================== FIND ====================
  // fresh random inputs for the 'find' tests
  for ( UINT4 i = 0; i < Ntrials; i ++ ) {
    xIn[i]  = -10000.0f + 20000.0f * frand() + 1e-6;
    xIn2[i] = -10000.0f + 20000.0f * frand() + 1e-6;
  } // for i < Ntrials

  XLALPrintInfo ("\nTesting find for x,y in (-10000, 10000]\n");
  TESTBENCH_VECTORMATH_SS2uU(FindVectorLessEqual,xIn,xIn2);

  // scalar variant: first argument is the single element xIn[0]
  TESTBENCH_VECTORMATH_SS2uU(FindScalarLessEqual,xIn[0],xIn2);

  XLALPrintInfo ("\n");

  // ---------- clean up memory ----------
  XLALDestroyREAL4VectorAligned ( xIn_a );
  XLALDestroyREAL4VectorAligned ( xIn2_a );
  XLALDestroyREAL4VectorAligned ( xOut_a );
  XLALDestroyREAL4VectorAligned ( xOut2_a );

  XLALDestroyREAL4VectorAligned ( xOutRef_a );
  XLALDestroyREAL4VectorAligned ( xOutRef2_a );

  XLALDestroyUINT4Vector ( xOutU4 );
  XLALDestroyUINT4Vector ( xOutRefU4 );

  XLALDestroyREAL8VectorAligned ( xInD_a );
  XLALDestroyREAL8VectorAligned ( xIn2D_a );
  XLALDestroyREAL8VectorAligned ( xOutD_a );
  XLALDestroyREAL8VectorAligned ( xOutRefD_a );

  XLALDestroyUserVars();

  // run after everything has been released, so remaining allocations are reported as leaks
  LALCheckMemoryLeaks();

  return XLAL_SUCCESS;

} // main()