extern "C" void binomialOptionsGPU(
    real *callValue,
    TOptionData  *optionData,
    int optN,
    int argc,
    char **argv
)
{
    if (!moduleLoaded) {
      kernel_file = sdkFindFilePath("binomialOptions_kernel.cu", argv[0]);
      compileFileToPTX(kernel_file, 0, NULL, &ptx, &ptxSize);
      module = loadPTX(ptx, argc, argv);
      moduleLoaded = true;
    }

    __TOptionData h_OptionData[MAX_OPTIONS];

    for (int i = 0; i < optN; i++)
    {
        const real      T = optionData[i].T;
        const real      R = optionData[i].R;
        const real      V = optionData[i].V;

        const real     dt = T / (real)NUM_STEPS;
        const real    vDt = V * sqrt(dt);
        const real    rDt = R * dt;
        //Per-step interest and discount factors
        const real     If = exp(rDt);
        const real     Df = exp(-rDt);
        //Values and pseudoprobabilities of upward and downward moves
        const real      u = exp(vDt);
        const real      d = exp(-vDt);
        const real     pu = (If - d) / (u - d);
        const real     pd = (real)1.0 - pu;
        const real puByDf = pu * Df;
        const real pdByDf = pd * Df;

        h_OptionData[i].S      = (real)optionData[i].S;
        h_OptionData[i].X      = (real)optionData[i].X;
        h_OptionData[i].vDt    = (real)vDt;
        h_OptionData[i].puByDf = (real)puByDf;
        h_OptionData[i].pdByDf = (real)pdByDf;
    }

    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "binomialOptionsKernel"));

    CUdeviceptr d_OptionData;
    checkCudaErrors(cuModuleGetGlobal(&d_OptionData, NULL, module, "d_OptionData"));
    checkCudaErrors(cuMemcpyHtoD(d_OptionData, h_OptionData, optN * sizeof(__TOptionData)));

    dim3 cudaBlockSize(128,1,1);
    dim3 cudaGridSize(optN, 1, 1);

    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                            cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, /* grid dim */
                                            cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
                                            0,0, /* shared mem, stream */
                                            NULL, /* arguments */
                                            0));

    checkCudaErrors(cuCtxSynchronize());

    CUdeviceptr d_CallValue;
    checkCudaErrors(cuModuleGetGlobal(&d_CallValue, NULL, module, "d_CallValue"));
    checkCudaErrors(cuMemcpyDtoH(callValue, d_CallValue, optN *sizeof(real)));
}
int main(int argc, char **argv)
{
    // Start logs
    printf("[%s] - Starting...\n", argv[0]);

    //'h_' prefix - CPU (host) memory space
    float
    //Results calculated by CPU for reference
    *h_CallResultCPU,
    *h_PutResultCPU,
    //CPU copy of GPU results
    *h_CallResultGPU,
    *h_PutResultGPU,
    //CPU instance of input data
    *h_StockPrice,
    *h_OptionStrike,
    *h_OptionYears;

    //'d_' prefix - GPU (device) memory space
    CUdeviceptr
    //Results calculated by GPU
    d_CallResult,
    d_PutResult,

    //GPU instance of input data
    d_StockPrice,
    d_OptionStrike,
    d_OptionYears;

    double
    delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime;

    StopWatchInterface *hTimer = NULL;
    int i;

    sdkCreateTimer(&hTimer);

    printf("Initializing data...\n");
    printf("...allocating CPU memory for options.\n");

    h_CallResultCPU = (float *)malloc(OPT_SZ);
    h_PutResultCPU  = (float *)malloc(OPT_SZ);
    h_CallResultGPU = (float *)malloc(OPT_SZ);
    h_PutResultGPU  = (float *)malloc(OPT_SZ);
    h_StockPrice    = (float *)malloc(OPT_SZ);
    h_OptionStrike  = (float *)malloc(OPT_SZ);
    h_OptionYears   = (float *)malloc(OPT_SZ);


    char *ptx, *kernel_file;
    size_t ptxSize;
    kernel_file = sdkFindFilePath("BlackScholes_kernel.cuh", argv[0]);

    // Set a Compiler Option to have maximum register to be used by each thread.
    char *compile_options[1];
    compile_options[0] = (char *) malloc(sizeof(char)*(strlen("--maxrregcount=16")));
    strcpy((char *)compile_options[0],"--maxrregcount=16");

    // Compile the kernel BlackScholes_kernel.
    compileFileToPTX(kernel_file, 1, (const char **)compile_options, &ptx, &ptxSize);
    CUmodule module = loadPTX(ptx, argc, argv);

    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "BlackScholesGPU"));

    printf("...allocating GPU memory for options.\n");
    checkCudaErrors(cuMemAlloc(&d_CallResult, OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_PutResult, OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_StockPrice, OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_OptionStrike,OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_OptionYears, OPT_SZ));

    printf("...generating input data in CPU mem.\n");
    srand(5347);

    //Generate options set
    for (i = 0; i < OPT_N; i++)
    {
        h_CallResultCPU[i] = 0.0f;
        h_PutResultCPU[i]  = -1.0f;
        h_StockPrice[i]    = RandFloat(5.0f, 30.0f);
        h_OptionStrike[i]  = RandFloat(1.0f, 100.0f);
        h_OptionYears[i]   = RandFloat(0.25f, 10.0f);
    }

    printf("...copying input data to GPU mem.\n");
    //Copy options data to GPU memory for further processing
    checkCudaErrors(cuMemcpyHtoD(d_StockPrice, h_StockPrice, OPT_SZ));
    checkCudaErrors(cuMemcpyHtoD(d_OptionStrike, h_OptionStrike, OPT_SZ));
    checkCudaErrors(cuMemcpyHtoD(d_OptionYears, h_OptionYears, OPT_SZ));

    printf("Data init done.\n\n");
    printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", NUM_ITERATIONS);

    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    dim3 cudaBlockSize( 128, 1, 1);
    dim3 cudaGridSize(DIV_UP(OPT_N/2, 128),1,1);

    float risk = RISKFREE;
    float volatility = VOLATILITY;
    int optval = OPT_N;

    void *arr[] = { (void *)&d_CallResult, (void *)&d_PutResult, (void *)&d_StockPrice,
        (void *)&d_OptionStrike, (void *)&d_OptionYears, (void *)&risk, (void *)&volatility, (void *)&optval };

    for (i = 0; i < NUM_ITERATIONS; i++)
    {

        checkCudaErrors(cuLaunchKernel(kernel_addr,
                                            cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, /* grid dim */
                                            cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
                                            0,0, /* shared mem, stream */
                                            &arr[0], /* arguments */
                                            0));

    }

    checkCudaErrors(cuCtxSynchronize());

    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;

    //Both call and put is calculated
    printf("Options count             : %i     \n", 2 * OPT_N);
    printf("BlackScholesGPU() time    : %f msec\n", gpuTime);
    printf("Effective memory bandwidth: %f GB/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3));
    printf("Gigaoptions per second    : %f     \n\n", ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3));
    printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %u, Workgroup = %u\n",
           (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime*1e-3, (2 * OPT_N), 1, 128);

    printf("\nReading back GPU results...\n");

    //Read back GPU results to compare them to CPU results
    checkCudaErrors(cuMemcpyDtoH(h_CallResultGPU, d_CallResult, OPT_SZ));
    checkCudaErrors(cuMemcpyDtoH(h_PutResultGPU, d_PutResult, OPT_SZ));

    printf("Checking the results...\n");
    printf("...running CPU calculations.\n\n");

    //Calculate options values on CPU
    BlackScholesCPU(
        h_CallResultCPU,
        h_PutResultCPU,
        h_StockPrice,
        h_OptionStrike,
        h_OptionYears,
        RISKFREE,
        VOLATILITY,
        OPT_N
    );

    printf("Comparing the results...\n");
    //Calculate max absolute difference and L1 distance
    //between CPU and GPU results
    sum_delta = 0;
    sum_ref   = 0;
    max_delta = 0;

    for (i = 0; i < OPT_N; i++)
    {
        ref   = h_CallResultCPU[i];
        delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]);

        if (delta > max_delta)
        {
            max_delta = delta;
        }

        sum_delta += delta;
        sum_ref   += fabs(ref);
    }

    L1norm = sum_delta / sum_ref;
    printf("L1 norm: %E\n", L1norm);
    printf("Max absolute error: %E\n\n", max_delta);

    printf("Shutting down...\n");
    printf("...releasing GPU memory.\n");

    checkCudaErrors(cuMemFree(d_OptionYears));
    checkCudaErrors(cuMemFree(d_OptionStrike));
    checkCudaErrors(cuMemFree(d_StockPrice));
    checkCudaErrors(cuMemFree(d_PutResult));
    checkCudaErrors(cuMemFree(d_CallResult));

    printf("...releasing CPU memory.\n");

    free(h_OptionYears);
    free(h_OptionStrike);
    free(h_StockPrice);
    free(h_PutResultGPU);
    free(h_CallResultGPU);
    free(h_PutResultCPU);
    free(h_CallResultCPU);

    sdkDeleteTimer(&hTimer);
    printf("Shutdown done.\n");

    printf("\n[%s] - Test Summary\n", argv[0]);

    cuProfilerStop();

    if (L1norm > 1e-6)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed\n");
    exit(EXIT_SUCCESS);
}
/////////////////////////////////////////////////////
// Main program
/////////////////////////////////////////////////////
int main(int argc, char **argv)
{
  typedef long clock_t;

  unsigned int num_warps = NUM_BLOCKS * NUM_THREADS / 32;

  // we allocate two timer for each warp
  clock_t *timer = (clock_t*)malloc(num_warps * sizeof(clock_t) * 2);


  // Initialize CUDA driver
  checkCudaErrors(cuInit(0));

  // Get number of devices supporting CUDA
  int deviceCount = 0;
  checkCudaErrors(cuDeviceGetCount(&deviceCount));
  if (deviceCount == 0) {
    printf("There is no device supporting CUDA.\n");
    exit (0);
  }

  // Get handle for device 0
  CUdevice cuDevice;
  checkCudaErrors(cuDeviceGet(&cuDevice, 0));

  // Create context
  CUcontext cuContext;
  checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

  // JIT compile the kernel from PTX and get the handle
  CUfunction kernel_addr;
  CUmodule cuModule;
  ptxJIT(&cuModule, &kernel_addr, "clock.ptx", "timeDummy");

  // Allocate timer on device
  CUdeviceptr dtimer;
  checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * num_warps * 2));
  dim3 cudaBlockSize(NUM_THREADS, 1, 1);
  dim3 cudaGridSize(NUM_BLOCKS, 1, 1);
  void *kernel_param[] = {(void*)&dtimer};
  checkCudaErrors(cuLaunchKernel(kernel_addr,
                                 cudaGridSize.x, cudaGridSize.y, cudaGridSize.z,
                                 cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z,
                                 0, 0, &kernel_param[0], 0));

  // Sync the context
  checkCudaErrors(cuCtxSynchronize());
  // copy result back to host
  checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * num_warps * 2));

  // Compute the execution time of the kernel
  clock_t minStart = timer[0];
  clock_t maxEnd = timer[num_warps];
  for (int i = 1 ; i < num_warps ; i ++){
    minStart = timer[i] < minStart ? timer[i] : minStart;
    maxEnd = timer[num_warps + i] > maxEnd ? timer[num_warps + i] : maxEnd;
  }

  printf("Total clocks = %Lf\n", (long double)(maxEnd - minStart));
  printf("Number of warps = %u\n", num_warps);

  // Clean up
  free(timer);
  checkCudaErrors(cuMemFree(dtimer));
  checkCudaErrors(cuModuleUnload(cuModule));
  checkCudaErrors(cuCtxDestroy(cuContext));

  return EXIT_SUCCESS;
}