Example 1
const cudaDeviceProp& getCurrentDeviceProperties() {
  int device = 0;
  auto err = cudaGetDevice(&device);
  checkCuda(err, std::string("CUDA ERROR: cudaGetDevice "));

  return getDeviceProperties(device);
}
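
These examples all assume a checkCuda error-checking helper. A minimal sketch, assuming an exit-on-error policy and covering both call forms used here (with and without a message prefix); the actual projects may define it differently:

#include <cstdio>
#include <cstdlib>
#include <string>
#include <cuda_runtime.h>

// Hypothetical helper: abort with a readable message when a CUDA call fails.
inline void checkCuda(cudaError_t err,
                      const std::string &msg = std::string("CUDA ERROR: ")) {
  if (err != cudaSuccess) {
    std::fprintf(stderr, "%s%s\n", msg.c_str(), cudaGetErrorString(err));
    std::exit(EXIT_FAILURE);
  }
}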
Example 2
/*! \brief Destroy distributed L & U matrices. */
void
Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct)
{
    int_t i, nb, nsupers;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;

#if ( DEBUGlevel>=1 )
    int iam;
    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
    CHECK_MALLOC(iam, "Enter Destroy_LU()");
#endif

    nsupers = Glu_persist->supno[n-1] + 1;

    nb = CEILING(nsupers, grid->npcol);
    for (i = 0; i < nb; ++i) 
	if ( Llu->Lrowind_bc_ptr[i] ) {
	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
#ifdef GPU_ACC
	    checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i]));
#else
	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
#endif
	}
    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
    SUPERLU_FREE (Llu->Lnzval_bc_ptr);

    nb = CEILING(nsupers, grid->nprow);
    for (i = 0; i < nb; ++i)
	if ( Llu->Ufstnz_br_ptr[i] ) {
	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
	}
    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
    SUPERLU_FREE (Llu->Unzval_br_ptr);

    /* The following can be freed after factorization. */
    SUPERLU_FREE(Llu->ToRecv);
    SUPERLU_FREE(Llu->ToSendD);
    SUPERLU_FREE(Llu->ToSendR[0]);
    SUPERLU_FREE(Llu->ToSendR);

    /* The following can be freed only after iterative refinement. */
    SUPERLU_FREE(Llu->ilsum);
    SUPERLU_FREE(Llu->fmod);
    SUPERLU_FREE(Llu->fsendx_plist[0]);
    SUPERLU_FREE(Llu->fsendx_plist);
    SUPERLU_FREE(Llu->bmod);
    SUPERLU_FREE(Llu->bsendx_plist[0]);
    SUPERLU_FREE(Llu->bsendx_plist);
    SUPERLU_FREE(Llu->mod_bit);

    SUPERLU_FREE(Glu_persist->xsup);
    SUPERLU_FREE(Glu_persist->supno);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit Destroy_LU()");
#endif
}
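
Destroy_LU computes nb with a CEILING(a, b) helper for the integer ceiling of a/b. A plausible one-line sketch (the real macro lives in the SuperLU headers):

/* Hypothetical ceil-division macro matching how nb is computed above. */
#define CEILING(a, b)  ( ((a) % (b)) ? ((a) / (b) + 1) : ((a) / (b)) )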
Example 3
int main() {
  // Initialize variables
  double *h_u; h_u = (double*)malloc(sizeof(double)*(NX*NY*NZ));

  // Set Domain Initial Condition and BCs
  Call_IC(h_u);

  // GPU Memory Arrays (sized to match the full 3D host array)
  double *d_u;  checkCuda(cudaMalloc((void**)&d_u, sizeof(double)*(NX*NY*NZ)));
  double *d_un; checkCuda(cudaMalloc((void**)&d_un,sizeof(double)*(NX*NY*NZ)));

  // Copy Initial Condition from host to device
  checkCuda(cudaMemcpy(d_u, h_u,sizeof(double)*(NX*NY*NZ),cudaMemcpyHostToDevice));
  checkCuda(cudaMemcpy(d_un,h_u,sizeof(double)*(NX*NY*NZ),cudaMemcpyHostToDevice));

  // GPU kernel launch parameters
  dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z);
  dim3 dimGrid (DIVIDE_INTO(NX, BLOCK_SIZE_X), DIVIDE_INTO(NY, BLOCK_SIZE_Y), DIVIDE_INTO(NZ, BLOCK_SIZE_Z)); 

  // Record the current CPU time
  clock_t t = clock();

  // Solver Loop: two sweeps per iteration, ping-ponging between d_u and d_un
  for (int step=0; step < NO_STEPS; step+=2) {
    if (step%10000==0) printf("Step %d of %d\n",step,(int)NO_STEPS);
    // Compute Laplace
    Call_Laplace(dimGrid,dimBlock,d_u,d_un);
    Call_Laplace(dimGrid,dimBlock,d_un,d_u);
    // Call_Laplace_Texture(dimGrid,dimBlock,d_u,d_un);
  }
  if (DEBUG) printf("CUDA error (Jacobi_Method) %s\n",cudaGetErrorString(cudaPeekAtLastError()));

  // Measure and Report computation time
  t = clock()-t; printf("Computing time (%f seconds).\n",((float)t)/CLOCKS_PER_SEC);

  // Copy data from device to host
  checkCuda(cudaMemcpy(h_u,d_u,sizeof(double)*(NX*NY*NZ),cudaMemcpyDeviceToHost));

  // Print solution to terminal (DEBUG builds only)
  if (DEBUG) Print2D(h_u);

  // Write solution to file
  Save_Results(h_u); 

  // Free device memory
  checkCuda(cudaFree(d_u));
  checkCuda(cudaFree(d_un));

  // Reset device
  checkCuda(cudaDeviceReset());

  // Free host memory
  free(h_u);

  return 0;
}
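
The grid dimensions in this example come from a DIVIDE_INTO(x, y) helper that rounds the division up so every grid point gets a thread. A plausible sketch (the project's own macro may differ):

// Hypothetical ceil-division macro used to size the kernel grid.
#define DIVIDE_INTO(x, y)  (((x) + (y) - 1) / (y))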
Example 4
void ChannelInfo::free() {
    if (use_gpu) {
#ifdef __CUDACC__
        checkCuda(cudaFree(channels));
#else
        assert(false);
#endif
    } else {
        delete[] channels;
    }
}
Example 5
ChannelInfo::ChannelInfo(const std::vector<Channels> &channels, bool use_gpu) : use_gpu(use_gpu) {
    num_channels = (int)channels.size();
    radiance_dimension = -1;
    num_total_dimensions = compute_num_channels(channels);
    if (use_gpu) {
#ifdef __CUDACC__
        checkCuda(cudaMallocManaged(&this->channels, channels.size() * sizeof(Channels)));
#else
        assert(false);
#endif
    } else {
        this->channels = new Channels[channels.size()];
    }
    for (int i = 0; i < (int)channels.size(); i++) {
        if (channels[i] == Channels::radiance) {
            if (radiance_dimension != -1) {
                throw std::runtime_error("Duplicated radiance channel");
            }
            radiance_dimension = i;
        }
        this->channels[i] = channels[i];
    }
}
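
A minimal usage sketch for the two members above (hypothetical driver code; Channels::radiance is the only enumerator the source shows):

// Build channel metadata in CUDA managed memory, then release it.
std::vector<Channels> desired = {Channels::radiance};
ChannelInfo info(desired, /*use_gpu=*/true);
// ... kernels can read info.channels and info.radiance_dimension ...
info.free();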
Example 6
////////////////////////////////////////////////////////////////////////////////
// Program Main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
	int Nx, Ny, Nz, max_iters;
	int blockX, blockY, blockZ;

	if (argc == 8) {
		Nx = atoi(argv[1]);
		Ny = atoi(argv[2]);
		Nz = atoi(argv[3]);
		max_iters = atoi(argv[4]);
		blockX = atoi(argv[5]);
		blockY = atoi(argv[6]);
		blockZ = atoi(argv[7]);
	}
	else
	{
		printf("Usage: %s nx ny nz i block_x block_y block_z number_of_threads\n", 
			argv[0]);
		exit(1);
	}

	// Get the number of GPUs
	int number_of_devices;
	checkCuda(cudaGetDeviceCount(&number_of_devices));

	if (number_of_devices < 2) {
		printf("Less than two devices were found.\n");
		printf("Exiting...\n");

		return -1;
	}

	// Decompose along the Z-axis
	int _Nz = Nz/number_of_devices;

	// Define constants
	const _DOUBLE_ L = 1.0;
	const _DOUBLE_ h = L/(Nx+1);
	const _DOUBLE_ dt = h*h/6.0;
	const _DOUBLE_ beta = dt/(h*h);
	const _DOUBLE_ c0 = beta;
	const _DOUBLE_ c1 = (1-6*beta);

	// Check if ECC is turned on
	ECCCheck(number_of_devices);

	// Set the number of OpenMP threads
	omp_set_num_threads(number_of_devices);

	#pragma omp parallel
	{
		unsigned int nthreads = omp_get_num_threads();

		#pragma omp single
		{
			printf("Number of OpenMP threads: %d\n", nthreads);
		}
	}

	// CPU memory operations
	int dt_size = sizeof(_DOUBLE_);

	_DOUBLE_ *u_new, *u_old;

	u_new = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
	u_old = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

	init(u_old, u_new, h, Nx, Ny, Nz);

	// Allocate and generate arrays on the host
	size_t pitch_bytes;
	size_t pitch_gc_bytes;

	_DOUBLE_ *h_Unew, *h_Uold;
	_DOUBLE_ *h_s_Uolds[number_of_devices], *h_s_Unews[number_of_devices];
	_DOUBLE_ *left_send_buffer[number_of_devices], *left_receive_buffer[number_of_devices];
	_DOUBLE_ *right_send_buffer[number_of_devices], *right_receive_buffer[number_of_devices];

	h_Unew = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
	h_Uold = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

	init(h_Uold, h_Unew, h, Nx, Ny, Nz);

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		// Allocate pinned (page-locked) host buffers for this device's
		// subdomain and its ghost-cell exchange buffers
		checkCuda(cudaHostAlloc((void**)&h_s_Unews[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&h_s_Uolds[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&right_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&left_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&right_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&left_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

		init_subdomain(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
	}

	// GPU memory operations
	_DOUBLE_ *d_s_Unews[number_of_devices], *d_s_Uolds[number_of_devices];
	_DOUBLE_ *d_right_send_buffer[number_of_devices], *d_left_send_buffer[number_of_devices];
	_DOUBLE_ *d_right_receive_buffer[number_of_devices], *d_left_receive_buffer[number_of_devices];

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		checkCuda(cudaSetDevice(tid));

		CopyToConstantMemory(c0, c1);

		checkCuda(cudaMallocPitch((void**)&d_s_Uolds[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
		checkCuda(cudaMallocPitch((void**)&d_s_Unews[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
		checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_left_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_right_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
	}

	// Copy data from host to the device
	double HtD_timer = 0.;
	HtD_timer -= omp_get_wtime();
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();
		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaMemcpy2D(d_s_Uolds[tid], pitch_bytes, h_s_Uolds[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
		checkCuda(cudaMemcpy2D(d_s_Unews[tid], pitch_bytes, h_s_Unews[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
	}
	HtD_timer += omp_get_wtime();

	int pitch = pitch_bytes/dt_size;
	int gc_pitch = pitch_gc_bytes/dt_size;

	// GPU kernel launch parameters
	dim3 threads_per_block(blockX, blockY, blockZ);
	unsigned int blocksInX = getBlock(Nx, blockX);
	unsigned int blocksInY = getBlock(Ny, blockY);
	unsigned int blocksInZ = getBlock(_Nz-2, k_loop);
	dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
	dim3 thread_blocks_halo(blocksInX, blocksInY);

	double compute_timer = 0.;
	compute_timer -= omp_get_wtime();

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		for(int iterations = 0; iterations < max_iters; iterations++)
		{
			// Compute inner nodes
			checkCuda(cudaSetDevice(tid));
			ComputeInnerPoints(thread_blocks, threads_per_block, d_s_Unews[tid], d_s_Uolds[tid], pitch, Nx, Ny, _Nz);

			// Copy right boundary data to host
			if (tid == 0)
			{
				checkCuda(cudaSetDevice(tid));
				CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
				checkCuda(cudaMemcpy2D(right_send_buffer[tid], dt_size*(Nx+2), d_right_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
			}
			// Copy left boundary data to host
			if (tid == 1)
			{
				checkCuda(cudaSetDevice(tid));
				CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
				checkCuda(cudaMemcpy2D(left_send_buffer[tid], dt_size*(Nx+2), d_left_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
			}

			#pragma omp barrier

			// Copy right boundary data to device 1
			if (tid == 1)
			{
				checkCuda(cudaSetDevice(tid));
				
				checkCuda(cudaMemcpy2D(d_left_receive_buffer[tid], pitch_gc_bytes, right_send_buffer[tid-1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
				CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
			}

			// Copy left boundary data to device 0
			if (tid == 0)
			{
				checkCuda(cudaSetDevice(tid));

				checkCuda(cudaMemcpy2D(d_right_receive_buffer[tid], pitch_gc_bytes, left_send_buffer[tid+1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
				CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
			}

			// Swap pointers on the host
			#pragma omp barrier
			checkCuda(cudaSetDevice(tid));
			checkCuda(cudaDeviceSynchronize());
			swap(_DOUBLE_*, d_s_Unews[tid], d_s_Uolds[tid]);
		}
	}

	compute_timer += omp_get_wtime();

	// Copy data from device to host
	double DtH_timer = 0.;
	DtH_timer -= omp_get_wtime();
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();
		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaMemcpy2D(h_s_Uolds[tid], dt_size*(Nx+2), d_s_Uolds[tid], pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDeviceToHost));
	}
	DtH_timer += omp_get_wtime();

	// Merge sub-domains into a one big domain
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		merge_domains(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
	}

	// Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
	cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
#endif

	float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
	PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

	_DOUBLE_ t = max_iters * dt;
	CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);

#if defined(DEBUG) || defined(_DEBUG)
    //exportToVTK(h_Uold, h, "heat3D.vtk", Nx, Ny, Nz);
#endif

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaFree(d_s_Unews[tid]));
		checkCuda(cudaFree(d_s_Uolds[tid]));
		checkCuda(cudaFree(d_right_send_buffer[tid]));
		checkCuda(cudaFree(d_left_send_buffer[tid]));
		checkCuda(cudaFree(d_right_receive_buffer[tid]));
		checkCuda(cudaFree(d_left_receive_buffer[tid]));
		checkCuda(cudaFreeHost(h_s_Unews[tid]));
		checkCuda(cudaFreeHost(h_s_Uolds[tid]));
		checkCuda(cudaFreeHost(left_send_buffer[tid]));
		checkCuda(cudaFreeHost(right_send_buffer[tid]));
		checkCuda(cudaFreeHost(left_receive_buffer[tid]));
		checkCuda(cudaFreeHost(right_receive_buffer[tid]));
		checkCuda(cudaDeviceReset());
	}

	free(u_old);
	free(u_new);
	free(h_Uold);
	free(h_Unew);

	return 0;
}
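
The solver loop above swaps the old and new device pointers with a swap(type, a, b) macro. A minimal sketch of what such a macro could look like (hypothetical; the project's utility header defines the real one):

/* Hypothetical pointer-swap macro matching the swap(_DOUBLE_*, a, b) calls above. */
#define swap(type, a, b)  do { type tmp_ = (a); (a) = (b); (b) = tmp_; } while (0)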
Example 7
int main(int argc, char **argv)
{
    int OPT_N  = 4000000;
    int OPT_SZ = OPT_N * sizeof(float);

    printf("Initializing data...\n");

    float *callResult, *putResult, *stockPrice, *optionStrike, *optionYears;

    checkCuda( cudaMallocHost((void**)&callResult,     OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&putResult,      OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&stockPrice,     OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionStrike,   OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionYears,    OPT_SZ) );

    initOptions(OPT_N, stockPrice, optionStrike, optionYears);

    printf("Running Host Version...\n");

    StartTimer();

    BlackScholesLaunch_host(callResult, putResult, stockPrice, optionStrike,
                            optionYears, RISKFREE, VOLATILITY, OPT_N);

    printf("Option 0 call: %f\n", callResult[0]);
    printf("Option 0 put:  %f\n", putResult[0]);

    double ms = GetTimer();

    //Both call and put are calculated
    printf("Options count             : %i     \n", 2 * OPT_N);
    printf("\tBlackScholes() time    : %f msec\n", ms);
    printf("\t%f GB/s, %f GOptions/s\n",
           ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3),
           ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3));

    float *d_callResult, *d_putResult;
    float *d_stockPrice, *d_optionStrike, *d_optionYears;

    checkCuda( cudaMalloc    ((void**)&d_callResult,   OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_putResult,    OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_stockPrice,   OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_optionStrike, OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_optionYears,  OPT_SZ) );

    printf("Running Device Version...\n");

    StartTimer();
    checkCuda( cudaMemcpy(d_stockPrice, stockPrice, OPT_SZ, cudaMemcpyHostToDevice) );
    checkCuda( cudaMemcpy(d_optionStrike, optionStrike, OPT_SZ, cudaMemcpyHostToDevice) );
    checkCuda( cudaMemcpy(d_optionYears, optionYears, OPT_SZ, cudaMemcpyHostToDevice) );

    BlackScholesLaunch_device(d_callResult, d_putResult, d_stockPrice, d_optionStrike,
                              d_optionYears, RISKFREE, VOLATILITY, OPT_N);

    checkCuda( cudaMemcpy(callResult, d_callResult, OPT_SZ, cudaMemcpyDeviceToHost) );
    checkCuda( cudaMemcpy(putResult, d_putResult, OPT_SZ, cudaMemcpyDeviceToHost) );

    printf("Option 0 call: %f\n", callResult[0]);
    printf("Option 0 put:  %f\n", putResult[0]);

    ms = GetTimer();

    //Both call and put are calculated
    printf("Options count             : %i     \n", 2 * OPT_N);
    printf("\tBlackScholes() time    : %f msec\n", ms);
    printf("\t%f GB/s, %f GOptions/s\n",
           ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3),
           ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3));

    checkCuda( cudaFree(d_callResult) );
    checkCuda( cudaFree(d_putResult) );
    checkCuda( cudaFree(d_stockPrice) );
    checkCuda( cudaFree(d_optionStrike) );
    checkCuda( cudaFree(d_optionYears) );
    checkCuda( cudaFreeHost(callResult) );
    checkCuda( cudaFreeHost(putResult) );
    checkCuda( cudaFreeHost(stockPrice) );
    checkCuda( cudaFreeHost(optionStrike) );
    checkCuda( cudaFreeHost(optionYears) );

    return 0;
}
Example 8
int main(int argc, char **argv)
{
    int OPT_N  = 4000000;
    int OPT_SZ = OPT_N * sizeof(float);

    BlackScholes bs;
               
    printf("Initializing data...\n");

    float *callResult, *putResult, *stockPrice, *optionStrike, *optionYears;
    
    checkCuda( cudaMallocHost((void**)&callResult,     OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&putResult,      OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&stockPrice,     OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionStrike,   OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionYears,    OPT_SZ) );
    
    initOptions(OPT_N, stockPrice, optionStrike, optionYears);

    printf("Running Host Version...\n");

    StartTimer();
    
    // run BlackScholes operator on host
    bs(callResult, putResult, stockPrice, optionStrike, 
       optionYears, RISKFREE, VOLATILITY, OPT_N);

    printf("Option 0 call: %f\n", callResult[0]); 
    printf("Option 0 put:  %f\n", putResult[0]);

    double ms = GetTimer();

    //Both call and put are calculated
    printf("Options count             : %i     \n", 2 * OPT_N);
    printf("\tBlackScholes() time    : %f msec\n", ms);
    printf("\t%f GB/s, %f GOptions/s\n", 
           ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3),
           ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3));

    float *d_callResult, *d_putResult;
    float *d_stockPrice, *d_optionStrike, *d_optionYears;

    checkCuda( cudaMalloc    ((void**)&d_callResult,   OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_putResult,    OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_stockPrice,   OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_optionStrike, OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_optionYears,  OPT_SZ) );

    printf("Running Device Version...\n");

    StartTimer();
    
    // Launch Black-Scholes operator on device
#ifdef HEMI_CUDA_COMPILER
    checkCuda( cudaMemcpy(d_stockPrice, stockPrice, OPT_SZ, cudaMemcpyHostToDevice) );
    checkCuda( cudaMemcpy(d_optionStrike, optionStrike, OPT_SZ, cudaMemcpyHostToDevice) );
    checkCuda( cudaMemcpy(d_optionYears, optionYears, OPT_SZ, cudaMemcpyHostToDevice) );

    hemi::launch(bs, 
                 d_callResult, d_putResult, d_stockPrice, d_optionStrike, 
                 d_optionYears, RISKFREE, VOLATILITY, OPT_N);

    checkCuda( cudaMemcpy(callResult, d_callResult, OPT_SZ, cudaMemcpyDeviceToHost) );
    checkCuda( cudaMemcpy(putResult, d_putResult, OPT_SZ, cudaMemcpyDeviceToHost) );
#else // demonstrates that "launch" goes to host when not compiled with NVCC
    hemi::launch(bs, 
                 callResult, putResult, stockPrice, optionStrike, 
                 optionYears, RISKFREE, VOLATILITY, OPT_N);
#endif

    printf("Option 0 call: %f\n", callResult[0]); 
    printf("Option 0 put:  %f\n", putResult[0]);

    ms = GetTimer();

    //Both call and put are calculated
    printf("Options count             : %i     \n", 2 * OPT_N);
    printf("\tBlackScholes() time    : %f msec\n", ms);
    printf("\t%f GB/s, %f GOptions/s\n", 
           ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3),
           ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3));

    checkCuda( cudaFree(d_callResult) );
    checkCuda( cudaFree(d_putResult) );
    checkCuda( cudaFree(d_stockPrice) );
    checkCuda( cudaFree(d_optionStrike) );
    checkCuda( cudaFree(d_optionYears) );
    checkCuda( cudaFreeHost(callResult) );
    checkCuda( cudaFreeHost(putResult) );
    checkCuda( cudaFreeHost(stockPrice) );
    checkCuda( cudaFreeHost(optionStrike) );
    checkCuda( cudaFreeHost(optionYears) );

    return 0;
}
Example 9
///////////////////////
// Main program entry
///////////////////////
int main(int argc, char** argv)
{
	unsigned int max_iters, Nx, Ny, Nz, blockX, blockY, blockZ;
	int rank, numberOfProcesses;

	if (argc == 8)
	{
		Nx = atoi(argv[1]);
		Ny = atoi(argv[2]);
		Nz = atoi(argv[3]);
		max_iters = atoi(argv[4]);
		blockX = atoi(argv[5]);
		blockY = atoi(argv[6]);
		blockZ = atoi(argv[7]);
	}
	else
	{
		printf("Usage: %s nx ny nz i block_x block_y block_z\n", argv[0]);
		exit(1);
	}

	InitializeMPI(&argc, &argv, &rank, &numberOfProcesses);
	AssignDevices(rank);
	ECCCheck(rank);

	// Define constants
	const _DOUBLE_ L = 1.0;
	const _DOUBLE_ h = L/(Nx+1);
	const _DOUBLE_ dt = h*h/6.0;
	const _DOUBLE_ beta = dt/(h*h);
	const _DOUBLE_ c0 = beta;
	const _DOUBLE_ c1 = (1-6*beta);

	// Copy constants to Constant Memory on the GPUs
	CopyToConstantMemory(c0, c1);

	// Decompose along the z-axis
	const int _Nz = Nz/numberOfProcesses;
	const int dt_size = sizeof(_DOUBLE_);

	// Host memory allocations
	_DOUBLE_ *u_new, *u_old;
	_DOUBLE_ *h_Uold;

	u_new = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
	u_old = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

	if (rank == 0)
	{
		h_Uold = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
	}

	init(u_old, u_new, h, Nx, Ny, Nz);

	// Allocate host subdomains in pinned (page-locked) memory
	_DOUBLE_ *h_s_Uolds, *h_s_Unews, *h_s_rbuf[numberOfProcesses];
	_DOUBLE_ *left_send_buffer, *left_receive_buffer;
	_DOUBLE_ *right_send_buffer, *right_receive_buffer;

	checkCuda(cudaHostAlloc((void**)&h_s_Unews, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
	checkCuda(cudaHostAlloc((void**)&h_s_Uolds, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));

	// Rank 0 also needs one receive buffer per rank for the final gather
	if (rank == 0)
	{
		for (int i = 0; i < numberOfProcesses; i++)
		{
			checkCuda(cudaHostAlloc((void**)&h_s_rbuf[i], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
		}
	}

	checkCuda(cudaHostAlloc((void**)&right_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
	checkCuda(cudaHostAlloc((void**)&left_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
	checkCuda(cudaHostAlloc((void**)&right_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
	checkCuda(cudaHostAlloc((void**)&left_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

	init_subdomain(h_s_Uolds, u_old, Nx, Ny, _Nz, rank);

	// GPU stream operations
	cudaStream_t compute_stream;
	cudaStream_t data_stream;

	checkCuda(cudaStreamCreate(&compute_stream));
	checkCuda(cudaStreamCreate(&data_stream));

	// GPU memory operations
	size_t pitch_bytes, pitch_gc_bytes;

	_DOUBLE_ *d_s_Unews, *d_s_Uolds;
	_DOUBLE_ *d_right_send_buffer, *d_left_send_buffer;
	_DOUBLE_ *d_right_receive_buffer, *d_left_receive_buffer;

	checkCuda(cudaMallocPitch((void**)&d_s_Uolds, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
	checkCuda(cudaMallocPitch((void**)&d_s_Unews, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));

	checkCuda(cudaMallocPitch((void**)&d_left_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
	checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
	checkCuda(cudaMallocPitch((void**)&d_right_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
	checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));

	// Copy subdomains from host to device and get walltime
	double HtD_timer = 0.;

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	HtD_timer -= MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	checkCuda(cudaMemcpy2D(d_s_Uolds, pitch_bytes, h_s_Uolds, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
	checkCuda(cudaMemcpy2D(d_s_Unews, pitch_bytes, h_s_Unews, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	HtD_timer += MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	unsigned int ghost_width = 1;

	int pitch = pitch_bytes/dt_size;
	int gc_pitch = pitch_gc_bytes/dt_size;

	// GPU kernel launch parameters
	dim3 threads_per_block(blockX, blockY, blockZ);
	unsigned int blocksInX = getBlock(Nx, blockX);
	unsigned int blocksInY = getBlock(Ny, blockY);
	unsigned int blocksInZ = getBlock(_Nz-2, k_loop);

	dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
	dim3 thread_blocks_halo(blocksInX, blocksInY);

	MPI_Status status[numberOfProcesses];
	MPI_Request gather_send_request[numberOfProcesses];
	MPI_Request right_send_request[numberOfProcesses], left_send_request[numberOfProcesses];
	MPI_Request right_receive_request[numberOfProcesses], left_receive_request[numberOfProcesses];

	double compute_timer = 0.;

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	compute_timer -= MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	for(unsigned int iterations = 0; iterations < max_iters; iterations++)
	{
		// Compute right boundary data on device 0
		if (rank == 0) {
			int kstart = (_Nz+1)-ghost_width;
			int kstop = _Nz+1;

			ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
			CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0);
			
			checkCuda(cudaMemcpy2DAsync(right_send_buffer, dt_size*(Nx+2), d_right_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream));
			checkCuda(cudaStreamSynchronize(data_stream));

			MPI_CHECK(MPI_Isend(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &right_send_request[rank]));
		}
		else
		{
			int kstart = 1;
			int kstop = 1+ghost_width;

			ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
			CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1);
			
			checkCuda(cudaMemcpy2DAsync(left_send_buffer, dt_size*(Nx+2), d_left_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream));
			checkCuda(cudaStreamSynchronize(data_stream));

			MPI_CHECK(MPI_Isend(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &left_send_request[rank]));
		}

		// Compute inner nodes for device 0
		if (rank == 0) {
			int kstart = 1;
			int kstop = (_Nz+1)-ghost_width;

			ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
		}
		// Compute inner nodes for device 1
		else
		{
			int kstart = 1+ghost_width;
			int kstop = _Nz+1;

			ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
		}

		// Post a non-blocking receive for the neighbour rank's boundary data
		if (rank == 0) {
			MPI_CHECK(MPI_Irecv(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 1, MPI_COMM_WORLD, &right_receive_request[rank]));
		}
		else
		{
			MPI_CHECK(MPI_Irecv(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &left_receive_request[rank]));
		}

		if (rank == 0) {
			MPI_CHECK(MPI_Wait(&right_receive_request[rank], &status[rank]));

			checkCuda(cudaMemcpy2DAsync(d_right_receive_buffer, pitch_gc_bytes, left_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream));
			CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0);
		}
		else
		{
			MPI_CHECK(MPI_Wait(&left_receive_request[rank], &status[rank]));

			checkCuda(cudaMemcpy2DAsync(d_left_receive_buffer, pitch_gc_bytes, right_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream));
			CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1);
		}

		if (rank == 0)
		{
			MPI_CHECK(MPI_Wait(&right_send_request[rank], MPI_STATUS_IGNORE));
		}
		else
		{
			MPI_CHECK(MPI_Wait(&left_send_request[rank], MPI_STATUS_IGNORE));
		}

		// Swap pointers on the host
		checkCuda(cudaDeviceSynchronize());
		swap(_DOUBLE_*, d_s_Unews, d_s_Uolds);
	}

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	compute_timer += MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	// Copy data from device to host
	double DtH_timer = 0.;

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	DtH_timer -= MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	checkCuda(cudaMemcpy2D(h_s_Uolds, dt_size*(Nx+2), d_s_Uolds, pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDefault));

	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
	DtH_timer += MPI_Wtime();
	MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

	// Gather results from the subdomains on rank 0
	MPI_CHECK(MPI_Isend(h_s_Uolds, (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &gather_send_request[rank]));

	if (rank == 0)
	{
		for (int i = 0; i < numberOfProcesses; i++)
		{
			MPI_CHECK(MPI_Recv(h_s_rbuf[i], (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &status[rank]));
			merge_domains(h_s_rbuf[i], h_Uold, Nx, Ny, _Nz, i);
		}
	}

	// Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
	if (rank == 0)
	{
		cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
	}
#endif

	if (rank == 0)
	{
		float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
		PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

		_DOUBLE_ t = max_iters * dt;
		CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);
	}

	Finalize();

	// Free device memory
	checkCuda(cudaFree(d_s_Unews));
	checkCuda(cudaFree(d_s_Uolds));
	checkCuda(cudaFree(d_right_send_buffer));
	checkCuda(cudaFree(d_left_send_buffer));
	checkCuda(cudaFree(d_right_receive_buffer));
	checkCuda(cudaFree(d_left_receive_buffer));

	// Free host memory
	checkCuda(cudaFreeHost(h_s_Unews));
	checkCuda(cudaFreeHost(h_s_Uolds));

	if (rank == 0)
	{
		for (int i = 0; i < numberOfProcesses; i++)
		{
			checkCuda(cudaFreeHost(h_s_rbuf[i]));
		}

		free(h_Uold);
	}

	checkCuda(cudaFreeHost(left_send_buffer));
	checkCuda(cudaFreeHost(left_receive_buffer));
	checkCuda(cudaFreeHost(right_send_buffer));
	checkCuda(cudaFreeHost(right_receive_buffer));

	checkCuda(cudaDeviceReset());

	free(u_old);
	free(u_new);

	return 0;
}
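
Every MPI call above goes through an MPI_CHECK wrapper. A minimal sketch, assuming an abort-on-error policy (the project's own macro may differ):

/* Hypothetical error-checking wrapper for MPI calls. */
#define MPI_CHECK(call)                                          \
    do {                                                         \
        int mpi_status_ = (call);                                \
        if (mpi_status_ != MPI_SUCCESS) {                        \
            fprintf(stderr, "MPI error %d at %s:%d\n",           \
                    mpi_status_, __FILE__, __LINE__);            \
            MPI_Abort(MPI_COMM_WORLD, mpi_status_);              \
        }                                                        \
    } while (0)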
Example 10
int getDevice() {
  int dev;
  checkCuda(cudaGetDevice(&dev), std::string("CUDA ERROR: cudaGetDevice "));
  return dev;
}
Example 11
int main(int argc, char **argv)
{
    int OPT_N  = 4000000;
    int OPT_SZ = OPT_N * sizeof(float);

    printf("Initializing data...\n");
    
    float *callResult, *putResult, *stockPrice, *optionStrike, *optionYears;
    float *d_callResult, *d_putResult;
    float *d_stockPrice, *d_optionStrike, *d_optionYears;

#ifdef HEMI_CUDA_COMPILER
    checkCuda( cudaMallocHost((void**)&callResult,     OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&putResult,      OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&stockPrice,     OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionStrike,   OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionYears,    OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_callResult,   OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_putResult,    OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_stockPrice,   OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_optionStrike, OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_optionYears,  OPT_SZ) );
#else
    callResult   = (float*)malloc(OPT_SZ);
    putResult    = (float*)malloc(OPT_SZ);
    stockPrice   = (float*)malloc(OPT_SZ);
    optionStrike = (float*)malloc(OPT_SZ);
    optionYears  = (float*)malloc(OPT_SZ);
#endif

    initOptions(OPT_N, stockPrice, optionStrike, optionYears);
        
    int blockDim = 128; // blockDim, gridDim ignored by host code
    int gridDim  = std::min<int>(1024, (OPT_N + blockDim - 1) / blockDim);

    printf("Running %s Version...\n", HEMI_LOC_STRING);

    StartTimer();

#ifdef HEMI_CUDA_COMPILER 
    checkCuda( cudaMemcpy(d_stockPrice,   stockPrice,   OPT_SZ, cudaMemcpyHostToDevice) );
    checkCuda( cudaMemcpy(d_optionStrike, optionStrike, OPT_SZ, cudaMemcpyHostToDevice) );
    checkCuda( cudaMemcpy(d_optionYears,  optionYears,  OPT_SZ, cudaMemcpyHostToDevice) );
#else
    d_callResult   = callResult; 
    d_putResult    = putResult;
    d_stockPrice   = stockPrice; 
    d_optionStrike = optionStrike;
    d_optionYears  = optionYears;
#endif

    HEMI_KERNEL_LAUNCH(BlackScholes, gridDim, blockDim, 0, 0,
                       d_callResult, d_putResult, d_stockPrice, d_optionStrike, 
                       d_optionYears, RISKFREE, VOLATILITY, OPT_N);
       
#ifdef HEMI_CUDA_COMPILER 
    checkCuda( cudaMemcpy(callResult, d_callResult, OPT_SZ, cudaMemcpyDeviceToHost) );
    checkCuda( cudaMemcpy(putResult,  d_putResult,  OPT_SZ, cudaMemcpyDeviceToHost) );
#endif

    printf("Option 0 call: %f\n", callResult[0]); 
    printf("Option 0 put:  %f\n", putResult[0]);

    double ms = GetTimer();

    //Both call and put are calculated
    printf("Options count             : %i     \n", 2 * OPT_N);
    printf("\tBlackScholes() time    : %f msec\n", ms);
    printf("\t%f GB/s, %f GOptions/s\n", 
           ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3),
           ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3));

#ifdef HEMI_CUDA_COMPILER
    checkCuda( cudaFree(d_callResult) );
    checkCuda( cudaFree(d_putResult) );
    checkCuda( cudaFree(d_stockPrice) );
    checkCuda( cudaFree(d_optionStrike) );
    checkCuda( cudaFree(d_optionYears) );
    checkCuda( cudaFreeHost(callResult) );
    checkCuda( cudaFreeHost(putResult) );
    checkCuda( cudaFreeHost(stockPrice) );
    checkCuda( cudaFreeHost(optionStrike) );
    checkCuda( cudaFreeHost(optionYears) );
#else
    free(callResult);
    free(putResult);
    free(stockPrice);
    free(optionStrike);
    free(optionYears);
#endif // HEMI_CUDA_COMPILER

    return 0;
}