예제 #1
0
/*
 * Push thread-placement and thread-count defaults to the OpenMP runtime on
 * this rank's coprocessor.
 *
 * Returns immediately when util_getenv_nwc_ranks_per_device_() yields 0; a
 * negative value selects RANKS_PER_DEVICE instead.  The core count reported
 * by the device is split evenly between the ranks sharing it, and the result
 * is sent to the device as
 *   KMP_PLACE_THREADS=<cores>c,<threads per core>t,<core offset>o
 *   OMP_NUM_THREADS=<cores * threads per core>
 */
void FATR util_mic_set_affinity_() {
	char place_spec[BUFSZ];
	char omp_spec[BUFSZ];
	int len;

	int micdev;
	int nprocs;
	int cores_per_rank;
	int slot;
	int nthreads;
	int ppn;
	int ranks_per_device=util_getenv_nwc_ranks_per_device_();

	/* 0 means "leave the runtime alone"; negative means "use the default" */
	if (ranks_per_device == 0) {
		return;
	}
	if (ranks_per_device < 0) {
		ranks_per_device = RANKS_PER_DEVICE;
	}

	len = snprintf(place_spec, BUFSZ, "KMP_PLACE_THREADS=");
	micdev = util_mic_get_device_();
	ppn = util_cgetppn();
#pragma offload target(mic:micdev) out(nprocs)
	{
		/* one offload to ask the coprocessor how many cores it has;
		 * presumably 4 hardware threads per core with one core held
		 * back -- confirm */
		nprocs = ((int) sysconf(_SC_NPROCESSORS_ONLN) / 4) - 1;
	}

	/* position of this rank among the ranks sharing the device */
	slot = util_my_smp_index() % util_nwc_ranks_per_device_();

	cores_per_rank = nprocs / ranks_per_device;
	nthreads = cores_per_rank * DEFAULT_OFFLOAD_THREAD_MULTIPLIER;

	len += snprintf(place_spec + len, BUFSZ - len, "%dc,%dt,%do",
	                cores_per_rank,
	                DEFAULT_OFFLOAD_THREAD_MULTIPLIER,
	                slot * cores_per_rank);
	snprintf(omp_spec, BUFSZ, "OMP_NUM_THREADS=%d", nthreads);

	printf("%02d: micdev=%d nprocs=%d rank_on_dev=%d ranks_per_device=%d affinity='%s' pos=%d\n",
	       GA_Nodeid(), micdev, nprocs, slot, ranks_per_device, place_spec, len);
	fflush(stdout);
#pragma offload target(mic:micdev) in(place_spec) in(omp_spec)
	{
		/* install the affinity policy, the placement and the thread count
		 * for offloaded OpenMP regions */
		kmp_set_defaults("KMP_AFFINITY=compact");
		kmp_set_defaults(place_spec);
		kmp_set_defaults(omp_spec);
	}
}
/**
 * Compute the chemical source term of every species.
 *
 * For each species s the net rate (Rf[k] - Rb[k]) of every reaction k is
 * weighted by (Product_coeff[s] - Reactant_coeff[s]), summed over k, and the
 * sum is multiplied by species_data[s].basic_data.M.
 *
 * @param reaction_data  supplies NS, NR, the per-reaction coefficients and M
 * @param Rf             forward rate per reaction (read)
 * @param Rb             backward rate per reaction (read)
 * @param S_chem         output, resized to NS entries when its size differs
 */
void calculate_production_rate(REACTION_DATA_ver2 &reaction_data, vector<double> &Rf, vector<double> &Rb, vector<double> &S_chem)
{
	// NOTE(review): the string has spaces around '='; confirm the runtime accepts this form.
	kmp_set_defaults("KMP_AFFINITY = scatter");

	// One output slot per species.
	if (S_chem.size() != reaction_data.NS)
	{
		S_chem.resize(reaction_data.NS);
	}

	// Clear the accumulators.
#pragma omp parallel for
	for (int sp = 0; sp <= reaction_data.NS-1; sp++)
	{
		S_chem[sp] = 0.0;
	}

	// Accumulate the stoichiometry-weighted net rate of every reaction.
#pragma omp parallel for
	for (int sp = 0; sp <= reaction_data.NS-1; sp++)
	{
		for (int rxn = 0; rxn <= reaction_data.NR-1; rxn++)
		{
			const double net_rate = Rf[rxn] - Rb[rxn];
			S_chem[sp] += (reaction_data.reaction_k[rxn].Product_coeff[sp] - reaction_data.reaction_k[rxn].Reactant_coeff[sp]) * net_rate;
		}
	}

	// Scale each sum by the species' M.
#pragma omp parallel for
	for (int sp = 0; sp <= reaction_data.NS-1; sp++)
	{
		S_chem[sp] *= reaction_data.species_data[sp].basic_data.M;
	}
}
예제 #3
0
파일: main.cpp 프로젝트: ujhpc/mic-test
int main(int argc, char* argv[]) {

  try {
    cmdline::parser cl;
    cl.add<int>("threads", 't', "number of threads", false);
    cl.add<int>("iter", 'i', "millions of iterations", false, 10);
    cl.parse_check(argc, argv);

    std::cout << "Initializing" << std::endl;

    if (cl.exist("threads")) {
      omp_set_num_threads(cl.get<int>("threads"));
    }
#if 0
    kmp_set_defaults("KMP_AFFINITY=compact");
#endif

#pragma omp parallel for
    for (auto i = 0; i < FLOPS_ARRAY_SIZE; i++) {
      fa[i] = (float)i + 0.1f;
      fb[i] = (float)i + 0.2f;
    }

    int max_threads = omp_get_max_threads();
    std::cout << "Starting compute on " << max_threads << " threads"
              << std::endl;

    Timer timer;

    float a = 1.1f;
    int iters = 1000000 * cl.get<int>("iter");

#pragma omp parallel for
    for (int i = 0; i < max_threads; ++i) {
      // each thread will work its own array section
      int offset = i * LOOP_COUNT;
      // loop many times to get lots of calculations
      for (int j = 0; j < iters; ++j) {
        // scale 1st array and add in the 2nd array
        for (int k = 0; k < LOOP_COUNT; ++k) {
          fa[k + offset] = a * fa[k + offset] + fb[k + offset];
        }
      }
    }

    double gflops =
        (double)(1e-9 * max_threads * LOOP_COUNT * iters * FLOPSPERCALC);
    // elasped time
    double sec = timer();
    std::cout << "Gflops = " << gflops << std::endl;
    std::cout << "secs = " << sec << std::endl;
    std::cout << "Gflops/s = " << (gflops / sec) << std::endl;
  }
  catch (std::string& ex) {
    std::cerr << "error: " << ex << std::endl;
  }
  catch (const char* ex) {
    std::cerr << "error: " << ex << std::endl;
  }
}
/*
 * Runs two parallel regions with a kmp_set_defaults("OMP_NUM_THREADS") call
 * in between. Every thread of both regions increments a counter under the
 * file-level lock; the first region also records omp_get_num_threads().
 *
 * Returns nonzero when the counter equals twice the recorded team size.
 */
int test_kmp_set_defaults_lock_bug()
{
  int counted = 0;    /* threads seen in both regions together */
  int team_size = -1; /* team size reported inside the first region */

  #pragma omp parallel
  {
    omp_set_lock(&lock);
    counted += 1;
    omp_unset_lock(&lock);
    #pragma omp single
    {
      team_size = omp_get_num_threads ();
    }  /* end of single */
  } /* end of first parallel region */

  /* deliberately passed without a value */
  kmp_set_defaults("OMP_NUM_THREADS");

  #pragma omp parallel
  {
    omp_set_lock(&lock);
    counted += 1;
    omp_unset_lock(&lock);
  } /* end of second parallel region */

  return (counted == 2*team_size);
}
/*
 * Set the OpenMP affinity default and the thread count.
 *
 * nthreads  value handed to omp_set_num_threads()
 * verbose   when exactly 1, the ",verbose" modifier is appended to the
 *           KMP_AFFINITY setting; any other value leaves it off
 */
void initomp (int nthreads, int verbose)
{
    const char *affinity =
        (verbose == 1) ? "KMP_AFFINITY=granularity=fine,compact,verbose"
                       : "KMP_AFFINITY=granularity=fine,compact";

    kmp_set_defaults (affinity);
    omp_set_num_threads (nthreads);
}
예제 #6
0
/*
 * FN-01: Preparation of OP2A
 *
 * Starts the run timer, initializes MPI (only when built with MPI defined),
 * rejects runs whose task count NP exceeds OP2A_MAX_N_TASK, passes an
 * affinity setting to kmp_set_defaults and prints the version banner on
 * task 0.
 *
 * @param argc, argv	command-line arguments, forwarded to MPI_Init
 * @param modulename	module name handed to the Version object
 * @throws Common::ExceptionNPExceed when NP > OP2A_MAX_N_TASK
 * @author	Minkwan Kim
 * @version 1.0	25/5/2015
 */
void ApplicationOP2A::preparation(int argc, char *argv[], string modulename)
{
	/*
	 * STEP 1: Initialize parallel communication
	 */
	time_running.initStartTime();

#ifdef	MPI
	MPI_Init(&argc, &argv);					// start MPI
	MPI_Comm_size(MPI_COMM_WORLD, &NP); 	// total number of tasks
	MPI_Comm_rank(MPI_COMM_WORLD, &P);  	// index of this task
	t0 = MPI_Wtime();
#endif

	if (NP > OP2A_MAX_N_TASK)
	{
		throw Common::ExceptionNPExceed (FromHere(), "Number of processors exceeds MAX_PROCESSOR. Need to adjust value of MAX_N_TASKS");
	}

	// NOTE(review): the string has spaces around '='; confirm the runtime accepts this form.
	kmp_set_defaults("KMP_AFFINITY = scatter");


	/*
	 * STEP 2: Show version information
	 */
	Version versionOP2A(OP2A_VERSION_MAIN,
						OP2A_VERSION_SUB,
						m_now->tm_year + 1900,
						m_now->tm_mon + 1,
						m_now->tm_mday,
						modulename.c_str());

	// Only task 0 prints the banner.
	if (P != 0)
	{
		return;
	}

	versionOP2A.info();
	cout << " --> t = " << time_running.getDeltaT() << "[sec]" << endl << endl;
}
/**
 * Compute forward/backward rate coefficients and reaction rates.
 *
 * For every reaction k:
 *   kf[k] = cal_kf(reaction, Tf),  kb[k] = cal_kb(reaction, n_mix, Tb)
 *   Rf[k] = 1000 * kf[k] * prod_j (0.001 * rhos[j] / M_j) ^ Reactant_coeff[j]
 *   Rb[k] = 1000 * kb[k] * prod_j (0.001 * rhos[j] / M_j) ^ Product_coeff[j]
 * where Tf/Tb come from calculate_reaction_temperature().
 *
 * @param reaction_data  supplies NS, NR, species data and reaction data
 * @param rhos           per-species value divided by M and m below (read)
 * @param T              temperatures handed to calculate_reaction_temperature
 * @param kf, kb         outputs, one entry per reaction
 * @param Rf, Rb         outputs, one entry per reaction
 *
 * The four output vectors are grown to NR entries when they are shorter;
 * longer vectors are left at their current size.
 */
void calculate_reaction_rate(REACTION_DATA_ver2 &reaction_data, vector<double> &rhos, vector<double> &T, vector<double> &kf, vector<double> &kb, vector<double> &Rf, vector<double> &Rb)
{
	// NOTE(review): the string has spaces around '='; confirm the runtime accepts this form.
	kmp_set_defaults("KMP_AFFINITY = scatter");


	// 1. Make sure every output can be indexed by reaction.
	//    reserve() only changes capacity, so writing Rf[k]/Rb[k] after it
	//    would be out of bounds on an empty vector; resize() is required.
	const vector<double>::size_type n_reactions = reaction_data.NR;
	if (kf.size() < n_reactions)	kf.resize(n_reactions);
	if (kb.size() < n_reactions)	kb.resize(n_reactions);
	if (Rf.size() < n_reactions)	Rf.resize(n_reactions);
	if (Rb.size() < n_reactions)	Rb.resize(n_reactions);


	// 2. Calculate 10^-3 * rho_s / Ms
	vector<double>	rhos_Ms(reaction_data.NS, 0.0);

#pragma omp parallel for
	for (int s = 0; s <= reaction_data.NS-1; s++)	rhos_Ms[s]	= 0.001 * rhos[s]/reaction_data.species_data[s].basic_data.M;

	// Sum of rhos[s] / m over all species, scaled by 1e-6.
	double n_mix	= 0.0;
	for (int s = 0; s <= reaction_data.NS-1; s++)	n_mix	= n_mix + rhos[s]/reaction_data.species_data[s].basic_data.m;
	n_mix	= n_mix * 1.0e-6;



	// 3. Calculate forward/backward reaction rates
	for (int k = 0; k <= reaction_data.NR-1; k++)
	{
		double Tf, Tb;
		calculate_reaction_temperature(reaction_data.reaction_k[k], T, Tf, Tb);
		kf[k]	= cal_kf(reaction_data.reaction_k[k], Tf);
		kb[k]	= cal_kb(reaction_data.reaction_k[k], n_mix, Tb);


		// Forward rate: product over the reactant side.
		double Rf_temp	= 1.0;
		for (int j = 0; j <= reaction_data.NS-1; j++)	Rf_temp	*= pow(rhos_Ms[j], reaction_data.reaction_k[k].Reactant_coeff[j]);
		Rf[k]	= 1000.0*kf[k] * Rf_temp;

		// Backward rate: product over the product side. This must accumulate
		// into Rb_temp (the original accumulated into Rf_temp, leaving
		// Rb_temp at 1.0).
		double Rb_temp	= 1.0;
		for (int j = 0; j <= reaction_data.NS-1; j++)	Rb_temp	*= pow(rhos_Ms[j], reaction_data.reaction_k[k].Product_coeff[j]);
		Rb[k]	= 1000.0*kb[k] * Rb_temp;
	}
}
예제 #8
0
/***************************************************************************//**
 *
 * @ingroup Auxiliary
 *
 *  PLASMA_Init_Affinity - Initialize PLASMA.
 *
 *  Creates the PLASMA context for the calling thread, decides how many
 *  threads to use, records the thread-to-core binding, launches the worker
 *  threads, sets up the dynamic scheduler and synchronizes with the workers.
 *
 *******************************************************************************
 *
 * @param[in] cores
 *          Number of cores to use (threads to launch).
 *          If cores < 1, the value returned by plasma_get_numthreads() is
 *          used instead (falling back to 1, with a warning, when that
 *          returns -1).
 *
 * @param[in] coresbind
 *          Array to specify where to bind each thread.
 *          Each thread i is bound to coresbind[hwloc(i)] if hwloc is
 *          provided, or to coresbind[i] otherwise.
 *          If coresbind = NULL, the binding is obtained from
 *          plasma_get_affthreads().
 *          When non-NULL, the first world_size entries are copied.
 *
 *******************************************************************************
 *
 * @return
 *          \retval PLASMA_SUCCESS successful exit
 *          \retval PLASMA_ERR_OUT_OF_RESOURCES context creation or insertion failed
 *          \retval PLASMA_ERR_NOT_FOUND the thread count ended up <= 0
 *          \retval PLASMA_ERR_INTERNAL_LIMIT more threads than CONTEXT_THREADS_MAX
 *          \retval other the nonzero status of a failing pthread_attr_init(),
 *                  pthread_attr_setscope() or pthread_setconcurrency() call
 *
 ******************************************************************************/
int PLASMA_Init_Affinity(int cores, int *coresbind)
{
    plasma_context_t *plasma;
    int status;
    int core;

    /* Create context and insert in the context map */
    plasma = plasma_context_create();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_Init", "plasma_context_create() failed");
        return PLASMA_ERR_OUT_OF_RESOURCES;
    }
    status = plasma_context_insert(plasma, pthread_self());
    if (status != PLASMA_SUCCESS) {
        plasma_fatal_error("PLASMA_Init", "plasma_context_insert() failed");
        return PLASMA_ERR_OUT_OF_RESOURCES;
    }
    /* Init number of cores and topology */
    plasma_topology_init();

    /* Set number of cores: an explicit request wins, otherwise ask the
       library and fall back to a single thread if it cannot tell */
    if ( cores < 1 ) {
        plasma->world_size = plasma_get_numthreads();
        if ( plasma->world_size == -1 ) {
            plasma->world_size = 1;
            plasma_warning("PLASMA_Init", "Could not find the number of cores: the thread number is set to 1");
        }
    }
    else
      plasma->world_size = cores;

    if (plasma->world_size <= 0) {
        plasma_fatal_error("PLASMA_Init", "failed to get system size");
        return PLASMA_ERR_NOT_FOUND;
    }
    /* Check if not more cores than the hard limit */
    if (plasma->world_size > CONTEXT_THREADS_MAX) {
        plasma_fatal_error("PLASMA_Init", "not supporting so many cores");
        return PLASMA_ERR_INTERNAL_LIMIT;
    }

    /* Get the size of each NUMA node, then shrink it until it divides
       world_size evenly */
    /* NOTE(review): assumes plasma_get_numthreads_numa() returns >= 1; a 0
       would make the modulo below divide by zero -- confirm */
    plasma->group_size = plasma_get_numthreads_numa();
    while ( ((plasma->world_size)%(plasma->group_size)) != 0 )
        (plasma->group_size)--;

    /* Initialize barriers */
    plasma_barrier_init(plasma);
    plasma_barrier_bw_init(plasma);

    /* Initialize default thread attributes */
    status = pthread_attr_init(&plasma->thread_attr);
    if (status != 0) {
        plasma_fatal_error("PLASMA_Init", "pthread_attr_init() failed");
        return status;
    }
    /* Set scope to system */
    status = pthread_attr_setscope(&plasma->thread_attr, PTHREAD_SCOPE_SYSTEM);
    if (status != 0) {
        plasma_fatal_error("PLASMA_Init", "pthread_attr_setscope() failed");
        return status;
    }
    /* Set concurrency */
    status = pthread_setconcurrency(plasma->world_size);
    if (status != 0) {
        plasma_fatal_error("PLASMA_Init", "pthread_setconcurrency() failed");
        return status;
    }
    /*  Launch threads */
    /* Clear the thread-id table, then record the binding: either the
       caller's array or the one provided by plasma_get_affthreads() */
    memset(plasma->thread_id,   0, CONTEXT_THREADS_MAX*sizeof(pthread_t));
    if (coresbind != NULL) {
        memcpy(plasma->thread_bind, coresbind, plasma->world_size*sizeof(int));
    }
    else {
        plasma_get_affthreads(plasma->thread_bind);
    }
    /* Assign rank and thread ID for the master */
    plasma->thread_rank[0] = 0;
    plasma->thread_id[0] = pthread_self();

    /* Ranks 1..world_size-1 are worker threads; each receives the shared
       context as its argument */
    /* NOTE(review): the return value of pthread_create() is not checked */
    for (core = 1; core < plasma->world_size; core++) {
        plasma->thread_rank[core] = core;
        pthread_create(
            &plasma->thread_id[core],
            &plasma->thread_attr,
             plasma_parallel_section,
             (void*)plasma);
    }

    /* Ensure BLAS are sequential and set thread affinity for the master */
    /* Only compiled in for MKL builds with the Intel compiler */
#if defined(PLASMA_WITH_MKL)
#if defined(__ICC) || defined(__INTEL_COMPILER)
    kmp_set_defaults("KMP_AFFINITY=disabled");
#endif
#endif

    /* Initialize the dynamic scheduler */
    plasma->quark =  QUARK_Setup(plasma->world_size);
    plasma_barrier(plasma);

    plasma_setlapack_sequential(plasma);

    return PLASMA_SUCCESS;
}
예제 #9
0
//
// Main program - pedal to the metal...calculate using tons o'flops!
// 
int main(int argc, char *argv[] ) 
{
    int i,j,k;
    int numthreads;
    double tstart, tstop, ttime;
    double gflops = 0.0;
    float a=1.1;

    //
    // initialize the compute arrays 
    //
    //

    omp_set_num_threads(2);
    kmp_set_defaults("KMP_AFFINITY=compact");

#pragma omp parallel
#pragma omp master
    numthreads = omp_get_num_threads();

    printf("Initializing\r\n");
#pragma omp parallel for
    for(i=0; i<FLOPS_ARRAY_SIZE; i++)
    {
        fa[i] = (float)i + 0.1;
        fb[i] = (float)i + 0.2;
    }	
    printf("Starting Compute on %d threads\r\n",numthreads);

    tstart = dtime();
	
    // scale the calculation across threads requested 
    // need to set environment variables OMP_NUM_THREADS and KMP_AFFINITY

#pragma omp parallel for private(j,k)
    for (i=0; i<numthreads; i++)
    {
        // each thread will work it's own array section
        // calc offset into the right section
        int offset = i*LOOP_COUNT;

        // loop many times to get lots of calculations
        for(j=0; j<MAXFLOPS_ITERS; j++)  
        {
            // scale 1st array and add in the 2nd array 
            for(k=0; k<LOOP_COUNT; k++)  
   	    {
                fa[k+offset] = a * fa[k+offset] + fb[k+offset];
            }
        }
    }
    tstop = dtime();
    // # of gigaflops we just calculated  
    gflops = (double)( 1.0e-9*numthreads*LOOP_COUNT*
                        MAXFLOPS_ITERS*FLOPSPERCALC);    

    //elasped time
    ttime = tstop - tstart;
    //
    // Print the results
    //
    if ((ttime) > 0.0)
    {
        printf("GFlops = %10.3lf, Secs = %10.3lf, GFlops per sec = %10.3lf\r\n",                   gflops, ttime, gflops/ttime);
    }
    return( 0 );
}
/*
 * 7-point stencil (heat kernel) benchmark driver.
 *
 * Allocates two ping-pong fields and a reference field of
 * problem_dim^3 floats, runs compute_7pt_stencile_mm512_ps() `count` times
 * while swapping the two fields, then reports elapsed time, GFlops,
 * Mupdates/s, GB/s and the error against the field produced by
 * create_field() at the final time.
 *
 * Returns 0 on success and 1 when an allocation fails.
 */
int main(int argc, char *argv[]) 
{
#ifdef _OPENMP

  // Note: this tests for a KMP_AFFINITY preprocessor macro, not for the
  // environment variable of the same name.
#ifndef KMP_AFFINITY
  kmp_set_defaults("KMP_AFFINITY=compact, granularity=fine");
//  kmp_set_defaults("KMP_AFFINITY=scatter, granularity=fine");//this gives much slower performance...
#endif

  // The request has to be made from serial code: inside a parallel/master
  // construct it only changes the setting of that region's own task and has
  // no effect on the parallel regions that follow.
//#ifndef OMP_NUM_THREADS
  omp_set_num_threads(240);
//#endif

  // omp_get_num_threads() is always 1 outside a parallel region; report the
  // team size the next parallel region will use instead.
  printf("\nComputing 7-point stencil on Intel Xeon Phi in %d threads.\n", omp_get_max_threads());

#endif
  const int    nx    = problem_dim;
  const int    ny    = problem_dim;
  const int    nz    = problem_dim;

  // Byte count kept in size_t so large grids do not overflow int.
  const size_t problem_size = sizeof(float)*nx*ny*nz;
  // Cell count as double for the throughput figures, for the same reason.
  const double cells = (double)nx*ny*nz;

  float *f1 = (float *)_mm_malloc(problem_size, 64);
  float *f2 = (float *)_mm_malloc(problem_size, 64);
  float *answer = (float *)_mm_malloc(problem_size, 64);

  // _mm_malloc reports failure with NULL (MAP_FAILED belongs to mmap).
  if (f1 == NULL || f2 == NULL || answer == NULL) {
    fprintf(stderr, "Failed to allocate %lu bytes per field\n",
            (unsigned long)problem_size);
    if (f1 != NULL)     _mm_free(f1);
    if (f2 != NULL)     _mm_free(f2);
    if (answer != NULL) _mm_free(answer);
    return 1;
  }

  float *f_final = NULL;

  int   count = 0;  

  float c0, c1;

  float l = 1.0;
  float kappa = 0.1;
  float dx = l / nx;
  float dy = l / ny;
  float dz = l / nz;

  float dt    = 0.1*dx*dx / kappa;
  float scale = 0.1;
  count = scale / dt;
  // After an odd number of swaps the newest data sits in f2, otherwise in f1.
  f_final = (count % 2)? f2 : f1;

  create_field<float>(f1, nx, ny, nz, dx, dy, dz, kappa, 0.0);

  c1 = kappa*dt/(dx*dx);
  c0 = 1.0 - 6*c1;

  printf("Running heat kernel %d times\n", count); 
  fflush(stdout);

  float *f1_t = f1;
  float *f2_t = f2;

  struct timeval time_begin, time_end;

  gettimeofday(&time_begin, NULL);

  // Each step writes f2_t from f1_t, then the two roles are exchanged.
  for (int i = 0; i < count; ++i) {
    compute_7pt_stencile_mm512_ps(f2_t, f1_t, nx, ny, nz, c0, c1); 
    float *t = f1_t;
    f1_t    = f2_t;
    f2_t    = t;
  }
  gettimeofday(&time_end, NULL);

  float time = count * dt;
 
  create_field<float>(answer, nx, ny, nz, dx, dy, dz, kappa, time);
  float err = accuracy<float>(answer,f_final, nx*ny*nz);

  double elapsed_time = (time_end.tv_sec - time_begin.tv_sec)
      + (time_end.tv_usec - time_begin.tv_usec)*1.0e-6;
  float Gflops = cells*8.0*count/elapsed_time * 1.0e-09;
  float Gstens = cells*1.0*count/elapsed_time * 1.0e-06;
  double thput = cells * sizeof(float) * 3.0 * count
      / elapsed_time * 1.0e-09;

  fprintf(stderr, "Elapsed time : %.3f (s)\n", elapsed_time);
  fprintf(stderr, "FLOPS        : %.3f (GFlops)\n", Gflops);
  fprintf(stderr, "Updates      : %.3f (Mupdates/sec)\n", Gstens);
  fprintf(stderr, "Throughput   : %.3f (GB/s)\n", thput);  
  fprintf(stderr, "Accuracy     : %e\n", err);
  
  _mm_free(f1);
  _mm_free(f2);
  _mm_free(answer);
  return 0;
}
예제 #11
0
/*
 * Black-Scholes double-precision benchmark driver.
 *
 * Usage: Black-Scholes [verbose]   (verbose != 0 validates the results)
 *
 * Every OpenMP thread allocates and fills its own OPT_N / ThreadNum options,
 * prices them NUM_ITERATIONS times in CHUNKSIZE-wide chunks, and (in verbose
 * mode) compares the call prices against BlackScholesReference(). Thread 0
 * records the cycle counter before and after the timed loop; the totals are
 * printed after the parallel region.
 */
int main(int argc, char* argv[])
{
	double sum_delta = 0.0, sum_ref = 0.0, L1norm = 0.0;
	unsigned int seed = 123;
	int verbose = 0;
	if (argc > 2)
	{
		printf("usage: Black-Scholes <verbose> verbose = 1 for validtating result, the default is 0. \n");
		exit(1);
	}
	if (argc == 2)
		verbose = atoi(argv[1]);
	kmp_set_defaults("KMP_AFFINITY=compact,granularity=fine");
#ifdef _OPENMP
	int ThreadNum = omp_get_max_threads();
	omp_set_num_threads(ThreadNum);
#else
	int ThreadNum = 1;
#endif
	int OptPerThread = OPT_N / ThreadNum;
	/* size_t so that a large per-thread share cannot overflow int */
	const size_t mem_size = sizeof(double) * OptPerThread;
	setlocale(LC_ALL,"");
	printf("Black-Scholes Formula Double Precision.\n");
	printf("Compiler Version  = %d\n", __INTEL_COMPILER/100);
	printf("Release Update    = %d\n", __INTEL_COMPILER_UPDATE);
	printf("Build Time        = %s %s\n", __DATE__, __TIME__);
	printf("Input Dataset     = %d\n", OPT_N);
	printf("Repetitions       = %d\n", NUM_ITERATIONS);
	printf("Chunk Size        = %d\n", CHUNKSIZE);
	printf("Worker Threads    = %d\n\n", ThreadNum);

	if (verbose)
		printf("Allocate and initialize memory on %d boundary,\n", SIMDALIGN);
#pragma omp parallel reduction(+ : sum_delta) reduction(+ : sum_ref)
{
#ifdef _OPENMP
	int threadID = omp_get_thread_num();
#else
	int threadID = 0;
#endif
	/* NOTE(review): the allocation results are not checked for NULL */
	double *CallResult = (double *)_mm_malloc(mem_size, SIMDALIGN);
	double *PutResult  = (double *)_mm_malloc(mem_size, SIMDALIGN);
	double *StockPrice    = (double *)_mm_malloc(mem_size, SIMDALIGN);
	double *OptionStrike  = (double *)_mm_malloc(mem_size, SIMDALIGN);
	double *OptionYears   = (double *)_mm_malloc(mem_size, SIMDALIGN);

	/* Each thread needs its own generator state: `seed` is shared, so
	 * updating it from every thread (directly or through RandDouble) was a
	 * data race. */
	unsigned int thread_seed = seed + threadID;
	for(int i = OptPerThread-1; i > -1 ; i--)
	{
		CallResult[i] = 0.0;
		PutResult[i]  = -1.0;
		StockPrice[i]    = RandDouble(5.0, 30.0, &thread_seed);
		OptionStrike[i]  = RandDouble(1.0, 100.0, &thread_seed);
		OptionYears[i]   = RandDouble(0.25, 10.0, &thread_seed);
	}
	/* start timing only once every thread has finished initializing */
#pragma omp barrier
	if (threadID == 0) {
		start_cyc = _rdtsc();
	}

	/* NOTE(review): the inner loop always covers a full CHUNKSIZE, so when
	 * OptPerThread is not a multiple of CHUNKSIZE the last chunk indexes
	 * past the end of the per-thread arrays -- confirm the build constants
	 * rule this out. */
	for(int i = 0; i < NUM_ITERATIONS; i++)
	    for (int chunkBase = 0; chunkBase < OptPerThread; chunkBase += CHUNKSIZE)
	    {
#pragma simd vectorlength(CHUNKSIZE)
#pragma simd
#pragma vector aligned
		for(int opt = chunkBase; opt < (chunkBase+CHUNKSIZE); opt++)
		{
			double CNDD1, CNDD2;
			double T = OptionYears[opt];
			double X = OptionStrike[opt];
			double XexpRT = X*exp2(RLOG2E * T);
			double S = StockPrice[opt];
			double sqrtT = sqrt(T);
			double d1 = log2(S / X) / (VLOG2E * sqrtT) +  RVV * sqrtT;
			CNDF_C (&CNDD1, &d1 );
			double d2 = d1 - VOLATILITY * sqrtT;
			CNDF_C (&CNDD2, &d2 );
			double CallVal  = S * CNDD1 - XexpRT * CNDD2;
			double PutVal  = CallVal  +  XexpRT - S;
			CallResult[opt] = CallVal ;
			PutResult[opt] = PutVal ;
		}
	    }
	/* stop timing only once every thread has finished pricing */
#pragma omp barrier
	if (threadID == 0) {
		end_cyc = _rdtsc();
	}

	if (verbose)
	{
		/* accumulate |reference - computed| and |reference| for the L1 norm */
		double delta = 0.0, ref = 0.0;

		for(int i = 0; i < OptPerThread; i++)
		{
			double callReference, putReference;
			BlackScholesReference(
				callReference,
				putReference,
				StockPrice[i], //Stock price
				OptionStrike[i], //Option strike
				OptionYears[i], //Option years
				RISKFREE, //Riskless rate
				VOLATILITY  //Volatility rate
				);
			ref   = callReference;
			delta = fabs(callReference - CallResult[i]);
			sum_delta += delta;
			sum_ref   += fabs(ref);
		}
	}
	_mm_free(CallResult);
	_mm_free(PutResult);
	_mm_free(StockPrice);
	_mm_free(OptionStrike);
	_mm_free(OptionYears);
} //parallel section
	const unsigned long long cyc   = end_cyc - start_cyc;
	double sec = cyc/(FREQ*1e9);
	printf("=============================================\n");
	/* cyc is unsigned, so it is printed with %llu */
	printf("Total Cycles                   = %llu\n", cyc);
	printf("Cycles/OptionPair at thread 0  = %5.2f\n", cyc/(1.0f*NUM_ITERATIONS*OptPerThread));
	printf("Time Elapsed                   = %5.2f\n", sec);
	printf("Options/sec                    = %5.2f\n", (2.0f*NUM_ITERATIONS*OPT_N)/(1e9*sec));
	printf("=============================================\n");
	if (verbose)
	{
		L1norm = sum_delta / sum_ref;
		printf("L1 norm: %E\n", L1norm);
		printf((L1norm < 9e-5) ? "TEST PASSED\n" : "TEST FAILED\n");
	}
	return 0;
}