Exemplo n.º 1
0
int main(int argc, char **argv)
{
    printf("%s Starting...\n\n", argv[0]);

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    findCudaDevice(argc, (const char **)argv);

    uint *d_Input, *d_Output;
    uint *h_Input, *h_OutputCPU, *h_OutputGPU;
    StopWatchInterface  *hTimer = NULL;
    const uint N = 13 * 1048576 / 2;

    printf("Allocating and initializing host arrays...\n");
    sdkCreateTimer(&hTimer);
    h_Input     = (uint *)malloc(N * sizeof(uint));
    h_OutputCPU = (uint *)malloc(N * sizeof(uint));
    h_OutputGPU = (uint *)malloc(N * sizeof(uint));
    srand(2009);

    for (uint i = 0; i < N; i++)
    {
        h_Input[i] = rand();
    }

    printf("Allocating and initializing CUDA arrays...\n");
    checkCudaErrors(cudaMalloc((void **)&d_Input, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));
    checkCudaErrors(cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice));

    printf("Initializing CUDA-C scan...\n\n");
    initScan();

    int globalFlag = 1;
    size_t szWorkgroup;
    const int iCycles = 100;
    printf("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles);

    for (uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1)
    {
        printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
        checkCudaErrors(cudaDeviceSynchronize());
        sdkResetTimer(&hTimer);
        sdkStartTimer(&hTimer);

        for (int i = 0; i < iCycles; i++)
        {
            szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength);
        }

        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&hTimer);
        double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles;

        printf("Validating the results...\n");
        printf("...reading back GPU results\n");
        checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost));

        printf(" ...scanExclusiveHost()\n");
        scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength);

        // Compare GPU results with CPU results and accumulate error for this test
        printf(" ...comparing the results\n");
        int localFlag = 1;

        for (uint i = 0; i < N; i++)
        {
            if (h_OutputCPU[i] != h_OutputGPU[i])
            {
                localFlag = 0;
                break;
            }
        }

        // Log message on individual test result, then accumulate to global flag
        printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
        globalFlag = globalFlag && localFlag;

        // Data log
        if (arrayLength == MAX_SHORT_ARRAY_SIZE)
        {
            printf("\n");
            printf("scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
                   (1.0e-6 * (double)arrayLength/timerValue), timerValue, (unsigned int)arrayLength, 1, (unsigned int)szWorkgroup);
            printf("\n");
        }
    }

    printf("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles);

    for (uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1)
    {
        printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
        checkCudaErrors(cudaDeviceSynchronize());
        sdkResetTimer(&hTimer);
        sdkStartTimer(&hTimer);

        for (int i = 0; i < iCycles; i++)
        {
            szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength);
        }

        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&hTimer);
        double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles;

        printf("Validating the results...\n");
        printf("...reading back GPU results\n");
        checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost));

        printf("...scanExclusiveHost()\n");
        scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength);

        // Compare GPU results with CPU results and accumulate error for this test
        printf(" ...comparing the results\n");
        int localFlag = 1;

        for (uint i = 0; i < N; i++)
        {
            if (h_OutputCPU[i] != h_OutputGPU[i])
            {
                localFlag = 0;
                break;
            }
        }

        // Log message on individual test result, then accumulate to global flag
        printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
        globalFlag = globalFlag && localFlag;

        // Data log
        if (arrayLength == MAX_LARGE_ARRAY_SIZE)
        {
            printf("\n");
            printf("scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
                   (1.0e-6 * (double)arrayLength/timerValue), timerValue, (unsigned int)arrayLength, 1, (unsigned int)szWorkgroup);
            printf("\n");
        }
    }


    printf("Shutting down...\n");
    closeScan();
    checkCudaErrors(cudaFree(d_Output));
    checkCudaErrors(cudaFree(d_Input));

    sdkDeleteTimer(&hTimer);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    // pass or fail (cumulative... all tests in the loop)
    exit(globalFlag ? EXIT_SUCCESS : EXIT_FAILURE);
}
Exemplo n.º 2
0
int main(int argc, char **argv)
{
  // Start logs
  //shrSetLogFileName ("scan.txt");
  printf("%s Starting...\n\n", argv[0]); 

  //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
  /*
  if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
     cutilDeviceInit(argc, argv);
  else
     cudaSetDevice( cutGetMaxGflopsDeviceId() );
  */
  uint *d_Input, *d_Output;
  uint *h_Input, *h_OutputCPU, *h_OutputGPU;
  uint hTimer;

  //const uint N = 13 * 1048576 / 2;
  const uint N = 2048;

  printf("Allocating and initializing host arrays...\n");
        //cutCreateTimer(&hTimer);
  h_Input     = (uint *)malloc(N * sizeof(uint));
  h_OutputCPU = (uint *)malloc(N * sizeof(uint));
  h_OutputGPU = (uint *)malloc(N * sizeof(uint));
        //srand(2009);
  //for(uint i = 0; i < N; i++)
  //  h_Input[i] = rand();
  klee_make_symbolic(h_Input, sizeof(uint) * N, "input"); 
  printf("Allocating and initializing CUDA arrays...\n");
  cudaMalloc((void **)&d_Input, N * sizeof(uint));
  cudaMalloc((void **)&d_Output, N * sizeof(uint));

  cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice);

  printf("Initializing CUDA-C scan...\n\n");
  initScan();

  int globalFlag = 1;
  size_t szWorkgroup;
  //const int iCycles = 100;
  const int iCycles = 5;
  printf("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles);
  for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1){
     printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
               //cutilSafeCall( cudaDeviceSynchronize() );
                //cutResetTimer(hTimer);
                //cutStartTimer(hTimer);
                //for(int i = 0; i < iCycles; i++)
                //{
                    //printf("The arrayLength in scanExclusiveShort: %d, the i: %d\n", arrayLength, i);
     szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength);
                //}
                //cutilSafeCall( cudaDeviceSynchronize());
                //cutStopTimer(hTimer);
                //double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles;

     printf("Validating the results...\n");
     printf("...reading back GPU results\n");

     cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost);

     printf(" ...scanExclusiveHost()\n");
     scanExclusiveHost(h_OutputCPU, d_Input, N / arrayLength, arrayLength);

     // Compare GPU results with CPU results and accumulate error for this test
     printf(" ...comparing the results\n");
     int localFlag = 1;
#ifndef _SYM
     for(uint i = 0; i < N; i++)
     {
       if(h_OutputCPU[i] != h_OutputGPU[i])
       {
         localFlag = 0;
         break;
       }
     }
#endif

     // Log message on individual test result, then accumulate to global flag
     printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
     globalFlag = globalFlag && localFlag;

     // Data log
     if (arrayLength == MAX_SHORT_ARRAY_SIZE)
     {
        printf("\n");
        //printfEx(LOGBOTH | MASTER, 0, "scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", 
        //      (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup);
        printf("\n");
     }
  }
  printf("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles);
  for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1){
     printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
     //for(int i = 0; i < iCycles; i++)
     //{
     printf("The arrayLength in scanExclusiveLarge: %d\n", arrayLength);
     szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength);
     //}

     printf("Validating the results...\n");
     printf("...reading back GPU results\n");
     cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost);

     printf("...scanExclusiveHost()\n");
     scanExclusiveHost(h_OutputCPU, d_Input, N / arrayLength, arrayLength);

     // Compare GPU results with CPU results and accumulate error for this test
     printf(" ...comparing the results\n");
     int localFlag = 1;
#ifndef _SYM
     for(uint i = 0; i < N; i++)
     {
         if(h_OutputCPU[i] != h_OutputGPU[i])
         {
           localFlag = 0;
           break;
         }
     }
#endif
     // Log message on individual test result, then accumulate to global flag
     printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
     globalFlag = globalFlag && localFlag;

     // Data log
     if (arrayLength == MAX_LARGE_ARRAY_SIZE)
     {
        printf("\n");
        //printfEx(LOGBOTH | MASTER, 0, "scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", 
        //      (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup);
        printf("\n");
     }
  }
  // pass or fail (cumulative... all tests in the loop)
  printf(globalFlag ? "PASSED\n\n" : "FAILED\n\n");

  printf("Shutting down...\n");
  closeScan();

  cudaFree(d_Output);
  cudaFree(d_Input);

  free(h_Input);
  free(h_OutputCPU);
  free(h_OutputGPU);
}
Exemplo n.º 3
0
CUdeviceptr presum(CUdeviceptr *d_Input, uint arrayLength)
{

    uint N = 0;
    CUdeviceptr d_Output;
    struct timeval start,stop;
    gettimeofday(&start, NULL);
    initScan();
    gettimeofday(&stop, NULL);

    if(arrayLength <= MAX_SHORT_ARRAY_SIZE && arrayLength > MIN_SHORT_ARRAY_SIZE)
      {    
        for(uint i = 4; i<=MAX_SHORT_ARRAY_SIZE ; i<<=1){
          if(arrayLength <= i){
            N = i;
            break;
          }
        }
        checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));

        checkCudaErrors(cudaDeviceSynchronize());

        scanExclusiveShort((uint *)d_Output, (uint *)(*d_Input), N);
        //szWorkgroup = scanExclusiveShort((uint *)d_Output, (uint *)d_Input, 1, N);

        checkCudaErrors(cudaDeviceSynchronize());

    }else if(arrayLength <= MAX_LARGE_ARRAY_SIZE)
    {

      N = MAX_SHORT_ARRAY_SIZE * iDivUp(arrayLength,MAX_SHORT_ARRAY_SIZE);

      checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));      
      
      checkCudaErrors(cudaDeviceSynchronize());

      scanExclusiveLarge((uint *)d_Output, (uint *)(*d_Input), N);
      
      checkCudaErrors(cudaDeviceSynchronize());

    }else if(arrayLength <= MAX_LL_SIZE)
      {


        N = MAX_LARGE_ARRAY_SIZE * iDivUp(arrayLength,MAX_LARGE_ARRAY_SIZE);

        printf("N = %d\n",N);

        checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));      

        checkCudaErrors(cudaDeviceSynchronize());

        scanExclusiveLL((uint *)d_Output, (uint *)(*d_Input), N);
        
        checkCudaErrors(cudaDeviceSynchronize());

      }else{
      cuMemFree(d_Output);
      closeScan();

      return NULL;      
    }

    closeScan();

    cuMemFree(*d_Input);
    *d_Input = d_Output;

    
    printf("inside scan time:\n");
    printDiff(start,stop);

    return d_Output;
}