Example #1
0
/**
 * Prices OPT_N randomly generated call options by Monte Carlo simulation,
 * splitting them across the available GPUs with one host thread per GPU, and
 * checks the results against the Black-Scholes formula.
 *
 * Command line: the "type" argument selects the precision; the value "double"
 * (case-insensitive) sets useDoublePrecision to 1, anything else (or no
 * argument) sets it to 0.
 *
 * Prints "PASSED" when the average confidence reserve exceeds 1, "FAILED."
 * otherwise, then leaves through cutilExit().
 */
int main(int argc, char **argv) {
    // Initialised so the NULL test below is well defined even if the parser
    // leaves the output untouched when the argument is absent.
    char *precisionChoice = NULL;
    cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice);
    useDoublePrecision =
        (precisionChoice != NULL && !strcasecmp(precisionChoice, "double")) ? 1 : 0;

    const int MAX_GPU_COUNT = 8;
    const int         OPT_N = 256;
    const int        PATH_N = 1 << 18;
    const unsigned int SEED = 777;

    //Input data array
    TOptionData optionData[OPT_N];
    //Final GPU MC results
    TOptionValue callValueGPU[OPT_N];
    //"Theoretical" call values by Black-Scholes formula
    float callValueBS[OPT_N];
    //Solver config
    TOptionPlan optionSolver[MAX_GPU_COUNT];
    //OS thread ID
    CUTThread threadID[MAX_GPU_COUNT];

    //GPU number present in the system
    int GPU_N;
    int gpuBase, gpuIndex;
    int i;

    //Timer
    unsigned int hTimer;
    float time;

    double
    delta, ref, sumDelta, sumRef, sumReserve;

    cutilSafeCall( cudaGetDeviceCount(&GPU_N) );

#ifdef _EMU
    GPU_N = 1;
#endif

    // The per-GPU arrays above hold MAX_GPU_COUNT entries, so never drive
    // more devices than that.
    if(GPU_N > MAX_GPU_COUNT) {
        printf("main(): %i GPUs found, using only the first %i\n", GPU_N, MAX_GPU_COUNT);
        GPU_N = MAX_GPU_COUNT;
    }
    // GPU_N is used as a divisor when distributing the options.
    if(GPU_N < 1) {
        printf("main(): no usable GPU found, exiting.\n");
        return EXIT_FAILURE;
    }

    cutilCheckError( cutCreateTimer(&hTimer) );

    printf("main(): generating input data...\n");
    srand(123);
    for(i = 0; i < OPT_N; i++) {
        optionData[i].S = randFloat(5.0f, 50.0f);
        optionData[i].X = randFloat(10.0f, 25.0f);
        optionData[i].T = randFloat(1.0f, 5.0f);
        optionData[i].R = 0.06f;
        optionData[i].V = 0.10f;
        // Sentinel values, overwritten by the solver threads
        callValueGPU[i].Expected   = -1.0f;
        callValueGPU[i].Confidence = -1.0f;
    }

    printf("main(): starting %i host threads...\n", GPU_N);
    //Get option count for each GPU
    for(i = 0; i < GPU_N; i++)
        optionSolver[i].optionCount = OPT_N / GPU_N;
    //Take into account cases with "odd" option counts:
    //the first (OPT_N % GPU_N) GPUs take one extra option each
    for(i = 0; i < (OPT_N % GPU_N); i++)
        optionSolver[i].optionCount++;

    //Assign GPU option ranges (consecutive, non-overlapping slices)
    gpuBase = 0;
    for(i = 0; i < GPU_N; i++) {
        optionSolver[i].device     = i;
        optionSolver[i].optionData = optionData   + gpuBase;
        optionSolver[i].callValue  = callValueGPU + gpuBase;
        optionSolver[i].seed       = SEED;
        optionSolver[i].pathN      = PATH_N;
        gpuBase += optionSolver[i].optionCount;
    }

    //Start the timer
    cutilCheckError( cutResetTimer(hTimer) );
    cutilCheckError( cutStartTimer(hTimer) );

    //Start CPU thread for each GPU
    for(gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++)
        threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]);

    printf("main(): waiting for GPU results...\n");
    cutWaitForThreads(threadID, GPU_N);

    //Stop the timer only after every solver thread has finished, so the
    //measured interval covers the actual work and not just thread launch
    cutilCheckError( cutStopTimer(hTimer) );
    time = cutGetTimerValue(hTimer);

    printf("main(): GPU statistics\n");
    for(i = 0; i < GPU_N; i++) {
        printf("GPU #%i\n", optionSolver[i].device);
        printf("Options         : %i\n", optionSolver[i].optionCount);
        printf("Simulation paths: %i\n", optionSolver[i].pathN);
    }
    printf("\nTotal time (ms.): %f\n", time);
    printf("Options per sec.: %f\n", OPT_N / (time * 0.001));

#ifdef DO_CPU
    printf("main(): running CPU MonteCarlo...\n");
    TOptionValue callValueCPU;
    sumDelta = 0;
    sumRef   = 0;
    for(i = 0; i < OPT_N; i++) {
        MonteCarloCPU(
            callValueCPU,
            optionData[i],
            NULL,
            PATH_N
        );
        delta     = fabs(callValueCPU.Expected - callValueGPU[i].Expected);
        ref       = callValueCPU.Expected;
        sumDelta += delta;
        sumRef   += fabs(ref);
        printf("Exp : %f | %f\t", callValueCPU.Expected,   callValueGPU[i].Expected);
        printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence);
    }
    printf("L1 norm: %E\n", sumDelta / sumRef);
#endif

    printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
    sumDelta   = 0;
    sumRef     = 0;
    sumReserve = 0;
    for(i = 0; i < OPT_N; i++) {
        BlackScholesCall(
            callValueBS[i],
            optionData[i]
        );
        delta     = fabs(callValueBS[i] - callValueGPU[i].Expected);
        ref       = callValueBS[i];
        sumDelta += delta;
        sumRef   += fabs(ref);
        // Near-exact matches are skipped so the ratio cannot blow up
        if(delta > 1e-6) sumReserve += callValueGPU[i].Confidence / delta;
#ifdef PRINT_RESULTS
        printf("BS: %f; delta: %E\n", callValueBS[i], delta);
#endif
    }
    sumReserve /= OPT_N;
    printf("L1 norm        : %E\n", sumDelta / sumRef);
    printf("Average reserve: %f\n", sumReserve);
    // Constant format string; the verdict is passed as data
    printf("%s", (sumReserve > 1.0f) ? "PASSED\n" : "FAILED.\n");

    printf("Shutting down...\n");

    cutilCheckError( cutDeleteTimer(hTimer) );
    cutilExit(argc, argv);
}
Example #2
0
int main(int argc, char **argv)
{
    char *multiMethodChoice = NULL;
    char *scalingChoice = NULL;
    bool use_threads = true;
    bool bqatest = false;
    bool strongScaling = false;

    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);

    if (checkCmdLineFlag(argc, (const char **)argv, "qatest"))
    {
        bqatest = true;
    }

    getCmdLineArgumentString(argc, (const char **)argv, "method", &multiMethodChoice);
    getCmdLineArgumentString(argc, (const char **)argv, "scaling", &scalingChoice);

    if (checkCmdLineFlag(argc, (const char **)argv, "h") ||
        checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        usage();
        exit(EXIT_SUCCESS);
    }

    if (multiMethodChoice == NULL)
    {
        use_threads = true;
    }
    else
    {
        if (!strcasecmp(multiMethodChoice, "threaded"))
        {
            use_threads = true;
        }
        else
        {
            use_threads = false;
        }
    }

    if (use_threads == false)
    {
        printf("Using single CPU thread for multiple GPUs\n");
    }

    if (scalingChoice == NULL)
    {
        strongScaling = false;
    }
    else
    {
        if (!strcasecmp(scalingChoice, "strong"))
        {
            strongScaling = true;
        }
        else
        {
            strongScaling = false;
        }
    }


    //GPU number present in the system
    int GPU_N;
    checkCudaErrors(cudaGetDeviceCount(&GPU_N));
    int nOptions = 256;

    nOptions = adjustProblemSize(GPU_N, nOptions);

    // select problem size
    int scale = (strongScaling) ? 1 : GPU_N;
    int OPT_N = nOptions * scale;
    int PATH_N = 262144;
    const unsigned long long SEED = 777;

    // initialize the timers
    hTimer = new StopWatchInterface*[GPU_N];

    for (int i=0; i<GPU_N; i++)
    {
        sdkCreateTimer(&hTimer[i]);
        sdkResetTimer(&hTimer[i]);
    }

    //Input data array
    TOptionData  *optionData   = new TOptionData[OPT_N];
    //Final GPU MC results
    TOptionValue *callValueGPU = new TOptionValue[OPT_N];
    //"Theoretical" call values by Black-Scholes formula
    float *callValueBS = new float[OPT_N];
    //Solver config
    TOptionPlan *optionSolver = new TOptionPlan[GPU_N];
    //OS thread ID
    CUTThread *threadID = new CUTThread[GPU_N];

    int gpuBase, gpuIndex;
    int i;

    float time;

    double delta, ref, sumDelta, sumRef, sumReserve;

    printf("MonteCarloMultiGPU\n");
    printf("==================\n");
    printf("Parallelization method  = %s\n", use_threads ? "threaded" : "streamed");
    printf("Problem scaling         = %s\n", strongScaling? "strong" : "weak");
    printf("Number of GPUs          = %d\n", GPU_N);
    printf("Total number of options = %d\n", OPT_N);
    printf("Number of paths         = %d\n", PATH_N);


    printf("main(): generating input data...\n");
    srand(123);

    for (i=0; i < OPT_N; i++)
    {
        optionData[i].S = randFloat(5.0f, 50.0f);
        optionData[i].X = randFloat(10.0f, 25.0f);
        optionData[i].T = randFloat(1.0f, 5.0f);
        optionData[i].R = 0.06f;
        optionData[i].V = 0.10f;
        callValueGPU[i].Expected   = -1.0f;
        callValueGPU[i].Confidence = -1.0f;
    }

    printf("main(): starting %i host threads...\n", GPU_N);


    //Get option count for each GPU
    for (i = 0; i < GPU_N; i++)
    {
        optionSolver[i].optionCount = OPT_N / GPU_N;
    }

    //Take into account cases with "odd" option counts
    for (i = 0; i < (OPT_N % GPU_N); i++)
    {
        optionSolver[i].optionCount++;
    }

    //Assign GPU option ranges
    gpuBase = 0;

    for (i = 0; i < GPU_N; i++)
    {
        optionSolver[i].device     = i;
        optionSolver[i].optionData = optionData   + gpuBase;
        optionSolver[i].callValue  = callValueGPU + gpuBase;
        // all devices use the same global seed, but start
        // the sequence at a different offset
        optionSolver[i].seed       = SEED;
        optionSolver[i].pathN      = PATH_N;
        gpuBase += optionSolver[i].optionCount;
    }


    if (use_threads || bqatest)
    {
        //Start CPU thread for each GPU
        for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++)
        {
            threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]);
        }

        printf("main(): waiting for GPU results...\n");
        cutWaitForThreads(threadID, GPU_N);

        printf("main(): GPU statistics, threaded\n");

        for (i = 0; i < GPU_N; i++)
        {
            cudaDeviceProp deviceProp;
            checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));
            printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);
            printf("Options         : %i\n", optionSolver[i].optionCount);
            printf("Simulation paths: %i\n", optionSolver[i].pathN);
            time = sdkGetTimerValue(&hTimer[i]);
            printf("Total time (ms.): %f\n", time);
            printf("Options per sec.: %f\n", OPT_N / (time * 0.001));
        }

        printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
        sumDelta   = 0;
        sumRef     = 0;
        sumReserve = 0;

        for (i = 0; i < OPT_N; i++)
        {
            BlackScholesCall(callValueBS[i], optionData[i]);
            delta     = fabs(callValueBS[i] - callValueGPU[i].Expected);
            ref       = callValueBS[i];
            sumDelta += delta;
            sumRef   += fabs(ref);

            if (delta > 1e-6)
            {
                sumReserve += callValueGPU[i].Confidence / delta;
            }

#ifdef PRINT_RESULTS
            printf("BS: %f; delta: %E\n", callValueBS[i], delta);
#endif

        }

        sumReserve /= OPT_N;
    }

    if (!use_threads || bqatest)
    {
        multiSolver(optionSolver, GPU_N);

        printf("main(): GPU statistics, streamed\n");

        for (i = 0; i < GPU_N; i++)
        {
            cudaDeviceProp deviceProp;
            checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device));
            printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name);
            printf("Options         : %i\n", optionSolver[i].optionCount);
            printf("Simulation paths: %i\n", optionSolver[i].pathN);
        }

        time = sdkGetTimerValue(&hTimer[0]);
        printf("\nTotal time (ms.): %f\n", time);
        printf("\tNote: This is elapsed time for all to compute.\n");
        printf("Options per sec.: %f\n", OPT_N / (time * 0.001));

        printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
        sumDelta   = 0;
        sumRef     = 0;
        sumReserve = 0;

        for (i = 0; i < OPT_N; i++)
        {
            BlackScholesCall(callValueBS[i], optionData[i]);
            delta     = fabs(callValueBS[i] - callValueGPU[i].Expected);
            ref       = callValueBS[i];
            sumDelta += delta;
            sumRef   += fabs(ref);

            if (delta > 1e-6)
            {
                sumReserve += callValueGPU[i].Confidence / delta;
            }

#ifdef PRINT_RESULTS
            printf("BS: %f; delta: %E\n", callValueBS[i], delta);
#endif
        }

        sumReserve /= OPT_N;
    }

#ifdef DO_CPU
    printf("main(): running CPU MonteCarlo...\n");
    TOptionValue callValueCPU;
    sumDelta = 0;
    sumRef   = 0;

    for (i = 0; i < OPT_N; i++)
    {
        MonteCarloCPU(
            callValueCPU,
            optionData[i],
            NULL,
            PATH_N
        );
        delta     = fabs(callValueCPU.Expected - callValueGPU[i].Expected);
        ref       = callValueCPU.Expected;
        sumDelta += delta;
        sumRef   += fabs(ref);
        printf("Exp : %f | %f\t", callValueCPU.Expected,   callValueGPU[i].Expected);
        printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence);
    }

    printf("L1 norm: %E\n", sumDelta / sumRef);
#endif

    printf("Shutting down...\n");

    for (int i=0; i<GPU_N; i++)
    {
        sdkStartTimer(&hTimer[i]);
        checkCudaErrors(cudaSetDevice(i));

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
    }

    delete[] optionSolver;
    delete[] callValueBS;
    delete[] callValueGPU;
    delete[] optionData;
    delete[] threadID;
    delete[] hTimer;

    printf("Test Summary...\n");
    printf("L1 norm        : %E\n", sumDelta / sumRef);
    printf("Average reserve: %f\n", sumReserve);
    printf(sumReserve > 1.0f ? "Test passed\n" : "Test failed!\n");
    exit(sumReserve > 1.0f ? EXIT_SUCCESS : EXIT_FAILURE);
}