/*
 * Group the records of d_Rin and report where each group starts.
 *
 * Steps, in the order performed below:
 *   1. d_Rin is copied into d_Rout and d_Rout is radix-sorted in place
 *      (the literal 32 is presumably the key width in bits -- confirm).
 *   2. groupByImpl_int fills d_groupLabel (one int per record).
 *   3. scanImpl turns d_groupLabel into d_writePos.
 *   4. groupByImpl_outSize_int stores the number of groups in d_numGroup,
 *      which is then read back to the host.
 *   5. *d_startPos is allocated with one int per group and filled by
 *      groupByImpl_write_int.
 *
 * Parameters:
 *   d_Rin, rLen          input records and their count
 *   d_Rout               receives the sorted copy of d_Rin
 *   d_startPos           out: buffer allocated here (numGroup ints); it is
 *                        not released by this function
 *   numThread, numBlock  launch configuration forwarded to the helpers
 *   index, eventList, kernel, Flag_CPU_GPU, burden, _CPU_GPU
 *                        scheduling/bookkeeping state forwarded unchanged
 *
 * Returns the number of groups read back from the device.
 */
int groupByImpl(cl_mem d_Rin, int rLen, cl_mem d_Rout, cl_mem* d_startPos, int numThread, int numBlock,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,int _CPU_GPU)
{
	cl_mem d_groupLabel=NULL;
	cl_mem d_writePos=NULL;
	cl_mem d_numGroup=NULL;
	int numGroup = 0;
	int memSize=sizeof(Record)*rLen;

	/* sort a copy of the input so that equal keys become adjacent */
	cl_copyBuffer(d_Rout,d_Rin,memSize,index,eventList,Flag_CPU_GPU,burden,_CPU_GPU);
	radixSortImpl(d_Rout, rLen,32, numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU );

	/* label the records, then scan the labels into write positions */
	CL_MALLOC(&d_groupLabel, sizeof(int)*rLen );
	groupByImpl_int(d_Rout, rLen, d_groupLabel,numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
	CL_MALLOC( &d_writePos, sizeof(int)*rLen );
	ScanPara *SP;
	SP=(ScanPara*)malloc(sizeof(ScanPara));
	initScan(rLen,SP);
	scanImpl( d_groupLabel, rLen, d_writePos,index,eventList,kernel,Flag_CPU_GPU,burden,SP,_CPU_GPU );

	/* compute the group count on the device and wait for it */
	CL_MALLOC( &d_numGroup, sizeof(int));
	groupByImpl_outSize_int( d_numGroup, d_groupLabel, d_writePos, rLen,1, 1,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(*index-1)%2]);

	/*
	 * Read the count directly into numGroup.  It used to be read into a
	 * scratch variable that was never used, so the start-position buffer
	 * below was sized for zero groups and the function always returned 0.
	 */
	cl_readbuffer(&numGroup,d_numGroup,sizeof(int),0);

	CL_MALLOC(d_startPos, sizeof(int)*numGroup );
	groupByImpl_write_int((*d_startPos), d_groupLabel, d_writePos, rLen,numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(*index-1)%2]);

	/* NOTE(review): closeScan presumably releases SP itself -- confirm */
	closeScan(SP);
	CL_FREE(d_groupLabel);
	CL_FREE( d_writePos);
	CL_FREE(d_numGroup );
	return numGroup;
}
/*
 * Serialize a table to a string: a comma-separated line of attribute names
 * followed by one serialized record per line, in scan order.
 *
 * rel: table to serialize; rel->schema supplies the attribute names.
 *
 * Returns the string produced by RETURN_STRING(result).
 * NOTE(review): ownership of the returned buffer is defined by the
 * VarString macros, which are not visible here -- confirm who frees it.
 */
char *
serializeTableContent(RM_TableData *rel) {
	int i;
	VarString *result;
	RM_ScanHandle *sc = (RM_ScanHandle *) malloc(sizeof(RM_ScanHandle));
	Record *r = (Record *) malloc(sizeof(Record));
	MAKE_VARSTRING(result);

	/* header: attribute names separated by ", " */
	for (i = 0; i < rel->schema->numAttr; i++)
		APPEND(result, "%s%s", (i != 0) ? ", " : "", rel->schema->attrNames[i]);

	/* NULL condition: unconditional scan over the table */
	startScan(rel, sc, NULL);

	while (next(sc, r) != RC_RM_NO_MORE_TUPLES) {
		APPEND_STRING(result, serializeRecord(r, rel->schema));
		APPEND_STRING(result, "\n");
	}
	closeScan(sc);

	/* release the scan handle and scratch record allocated above;
	 * previously both were leaked on every call */
	free(r);
	free(sc);

	RETURN_STRING(result);
}
void testScanImpl(int rLen)
{
	int _CPU_GPU=0;
	cl_event eventList[2];
	int index=0;
	cl_kernel Kernel; 
	int CPU_GPU;
	double burden;	
	int result=0;
	int memSize=sizeof(int)*rLen;
	int outSize=sizeof(int)*rLen;
	void *Rin;
	HOST_MALLOC(Rin, memSize);
	generateRandInt((int*)Rin, rLen,rLen,0);
	void *Rout;
	HOST_MALLOC(Rout, outSize);
	cl_mem d_Rin;
	CL_MALLOC(&d_Rin, memSize);
	cl_mem d_Rout;
	CL_MALLOC(&d_Rout, outSize);
	cl_writebuffer(d_Rin, Rin, memSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
	ScanPara *SP;
	SP=(ScanPara*)malloc(sizeof(ScanPara));
	initScan(rLen,SP);
	scanImpl(d_Rin,rLen,d_Rout,&index,eventList,&Kernel,&CPU_GPU,&burden,SP,_CPU_GPU);	
	cl_readbuffer(Rout, d_Rout, outSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(index-1)%2]);
	closeScan(SP);
	deschedule(CPU_GPU,burden);
	//validateScan( (int*)Rin, rLen, (int*)Rout );
	HOST_FREE(Rin);
	HOST_FREE(Rout);
	CL_FREE(d_Rin);
	CL_FREE(d_Rout);
	clReleaseKernel(Kernel);  
	clReleaseEvent(eventList[0]);
	clReleaseEvent(eventList[1]);
}
/*
 * Record-manager test: insert ten rows, then run three conditional scans
 * (integer equality on attribute 0, string equality on attribute 1, and a
 * negated "smaller than" on attribute 2) and check the returned records.
 */
void testScansTwo (void)
{
  RM_TableData *table = (RM_TableData *) malloc(sizeof(RM_TableData));
  TestRecord inserts[] = { 
    {1, "aaaa", 3}, 
    {2, "bbbb", 2},
    {3, "cccc", 1},
    {4, "dddd", 3},
    {5, "eeee", 5},
    {6, "ffff", 1},
    {7, "gggg", 3},
    {8, "hhhh", 3},
    {9, "iiii", 2},
    {10, "jjjj", 5},
  };
  /* foundScan[i] is set when inserts[i] is returned by the third scan */
  bool foundScan[] = {
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    FALSE,
    FALSE
  };
  int numInserts = 10, i;
  Record *r;
  RID *rids;
  Schema *schema;
  RM_ScanHandle *sc = (RM_ScanHandle *) malloc(sizeof(RM_ScanHandle));
  Expr *sel, *left, *right, *first, *se;
  int rc;

  testName = "test creating a new table and inserting tuples";
  schema = testSchema();
  rids = (RID *) malloc(sizeof(RID) * numInserts);
  
  TEST_CHECK(initRecordManager(NULL));
  TEST_CHECK(createTable("test_table_r",schema));
  TEST_CHECK(openTable(table, "test_table_r"));
  
  // insert rows into table
  for(i = 0; i < numInserts; i++)
  {
    r = fromTestRecord(schema, inserts[i]);
    TEST_CHECK(insertRecord(table,r)); 
    rids[i] = r->id;
  }

  // close and reopen the table before scanning
  TEST_CHECK(closeTable(table));
  TEST_CHECK(openTable(table, "test_table_r"));

  // Select 1 record with INT in condition a=2.
  MAKE_CONS(left, stringToValue("i2"));
  MAKE_ATTRREF(right, 0);
  MAKE_BINOP_EXPR(sel, left, right, OP_COMP_EQUAL);
  createRecord(&r, schema);
  TEST_CHECK(startScan(table, sc, sel));
  while((rc = next(sc, r)) == RC_OK)
  {
     ASSERT_EQUALS_RECORDS(fromTestRecord(schema, inserts[1]), r, schema, "compare records");
  }
  if (rc != RC_NO_TUPLES)
    TEST_CHECK(rc);
  TEST_CHECK(closeScan(sc));
  
  // Select 1 record with STRING in condition b='ffff'.
  MAKE_CONS(left, stringToValue("sffff"));
  MAKE_ATTRREF(right, 1);
  MAKE_BINOP_EXPR(sel, left, right, OP_COMP_EQUAL);
  createRecord(&r, schema);
  TEST_CHECK(startScan(table, sc, sel));
  while((rc = next(sc, r)) == RC_OK)
  {
     ASSERT_EQUALS_RECORDS(fromTestRecord(schema, inserts[5]), r, schema, "compare records");
     serializeRecord(r, schema);
  }
  if (rc != RC_NO_TUPLES)
    TEST_CHECK(rc);
  TEST_CHECK(closeScan(sc));
  
  // Select the records for which the condition NOT (c < 4) holds
  MAKE_CONS(left, stringToValue("i4"));
  MAKE_ATTRREF(right, 2);
  MAKE_BINOP_EXPR(first, right, left, OP_COMP_SMALLER);
  MAKE_UNOP_EXPR(se, first, OP_BOOL_NOT);
  TEST_CHECK(startScan(table, sc, se));
  while((rc = next(sc, r)) == RC_OK)
  {
    serializeRecord(r, schema);
    // mark every inserted row whose bytes match the returned record
    for(i = 0; i < numInserts; i++)
    {
      if (memcmp(fromTestRecord(schema, inserts[i])->data,r->data,getRecordSize(schema)) == 0)
        foundScan[i] = TRUE;
    }
  }
  if (rc != RC_NO_TUPLES)
    TEST_CHECK(rc);
  TEST_CHECK(closeScan(sc));
  
  ASSERT_TRUE(!foundScan[0], "not greater than four");
  ASSERT_TRUE(foundScan[4], "greater than four");
  ASSERT_TRUE(foundScan[9], "greater than four");

  // clean up
  TEST_CHECK(closeTable(table));
  TEST_CHECK(deleteTable("test_table_r"));
  TEST_CHECK(shutdownRecordManager());

  freeRecord(r);
  free(rids);   // was allocated above and previously never released
  free(table);
  free(sc);
  freeExpr(sel);
  TEST_DONE();
}
/*
 * Record-manager test: insert ten rows, scan with the condition that
 * attribute 2 equals 1, and check that both rows listed in scanOneResult
 * are returned.
 */
void testScans (void)
{
  RM_TableData *table = (RM_TableData *) malloc(sizeof(RM_TableData));
  TestRecord inserts[] = { 
    {1, "aaaa", 3}, 
    {2, "bbbb", 2},
    {3, "cccc", 1},
    {4, "dddd", 3},
    {5, "eeee", 5},
    {6, "ffff", 1},
    {7, "gggg", 3},
    {8, "hhhh", 3},
    {9, "iiii", 2},
    {10, "jjjj", 5},
  };
  /* rows expected from the scan */
  TestRecord scanOneResult[] = { 
    {3, "cccc", 1},
    {6, "ffff", 1},
  };
  /* foundScan[i] is set once scanOneResult[i] has been returned */
  bool foundScan[] = {
    FALSE,
    FALSE
  };
  int numInserts = 10, scanSizeOne = 2, i;
  Record *r;
  RID *rids;
  Schema *schema;
  RM_ScanHandle *sc = (RM_ScanHandle *) malloc(sizeof(RM_ScanHandle));
  Expr *sel, *left, *right;
  int rc;

  testName = "test creating a new table and inserting tuples";
  schema = testSchema();
  rids = (RID *) malloc(sizeof(RID) * numInserts);
  
  TEST_CHECK(initRecordManager(NULL));
  TEST_CHECK(createTable("test_table_r",schema));
  TEST_CHECK(openTable(table, "test_table_r"));
  
  // insert rows into table
  for(i = 0; i < numInserts; i++)
  {
      r = fromTestRecord(schema, inserts[i]);
      TEST_CHECK(insertRecord(table,r)); 
      rids[i] = r->id;
  }

  // close and reopen the table before scanning
  TEST_CHECK(closeTable(table));
  TEST_CHECK(openTable(table, "test_table_r"));

  // run some scans
  MAKE_CONS(left, stringToValue("i1"));
  MAKE_ATTRREF(right, 2);
  MAKE_BINOP_EXPR(sel, left, right, OP_COMP_EQUAL);

  // r still refers to the last inserted record and is reused as scan output
  TEST_CHECK(startScan(table, sc, sel));
  while((rc = next(sc, r)) == RC_OK)
  {
      for(i = 0; i < scanSizeOne; i++)
      {
          if (memcmp(fromTestRecord(schema, scanOneResult[i])->data,r->data,getRecordSize(schema)) == 0)
              foundScan[i] = TRUE;
      }
  }
  if (rc != RC_NO_TUPLES)
    TEST_CHECK(rc);
  TEST_CHECK(closeScan(sc));
  for(i = 0; i < scanSizeOne; i++)
    ASSERT_TRUE(foundScan[i], "check for scan result");
  
  // clean up
  TEST_CHECK(closeTable(table));
  TEST_CHECK(deleteTable("test_table_r"));
  TEST_CHECK(shutdownRecordManager());

  free(rids);   // was allocated above and previously never released
  free(table);
  free(sc);
  freeExpr(sel);
  TEST_DONE();
}
/*
 * Record-manager test: run two scans with the same condition (attribute 2
 * equals 3) over one table, interleaving their next() calls, and check that
 * both return the same number of tuples.
 */
void
testMultipleScans(void)
{
  RM_TableData *table = (RM_TableData *) malloc(sizeof(RM_TableData));
  TestRecord inserts[] = { 
    {1, "aaaa", 3}, 
    {2, "bbbb", 2},
    {3, "cccc", 1},
    {4, "dddd", 3},
    {5, "eeee", 5},
    {6, "ffff", 1},
    {7, "gggg", 3},
    {8, "hhhh", 3},
    {9, "iiii", 2},
    {10, "jjjj", 5},
  };
  int numInserts = 10, i, scanOne=0, scanTwo=0;
  Record *r;
  RID *rids;
  Schema *schema;
  testName = "test running muliple scans ";
  schema = testSchema();
  rids = (RID *) malloc(sizeof(RID) * numInserts);
  RM_ScanHandle *sc1 = (RM_ScanHandle *) malloc(sizeof(RM_ScanHandle));
  RM_ScanHandle *sc2 = (RM_ScanHandle *) malloc(sizeof(RM_ScanHandle));
  Expr *se1, *left, *right;
  int rc,rc2;
  
  TEST_CHECK(initRecordManager(NULL));
  TEST_CHECK(createTable("test_table_r",schema));
  TEST_CHECK(openTable(table, "test_table_r"));
  
  // insert rows into table
  for(i = 0; i < numInserts; i++)
  {
      r = fromTestRecord(schema, inserts[i]);
      TEST_CHECK(insertRecord(table,r)); 
      rids[i] = r->id;
  }

  // Mix 2 scans with c=3 as condition
  MAKE_CONS(left, stringToValue("i3"));
  MAKE_ATTRREF(right, 2);
  MAKE_BINOP_EXPR(se1, left, right, OP_COMP_EQUAL);
  createRecord(&r, schema);
  TEST_CHECK(startScan(table, sc1, se1));
  TEST_CHECK(startScan(table, sc2, se1));

  // advance the second scan once before the first scan starts
  if ((rc2 = next(sc2, r)) == RC_OK)
    scanTwo++;
  i = 0;
  while((rc = next(sc1, r)) == RC_OK)
  {
      scanOne++;
      i++;
      // every third tuple of scan one, advance scan two by one tuple
      if (i % 3 == 0)
          if ((rc2 = next(sc2, r)) == RC_OK)
              scanTwo++;
  }
  // drain whatever is left in the second scan
  while((rc2 = next(sc2, r)) == RC_OK)
    scanTwo++;

  ASSERT_TRUE(scanOne == scanTwo, "scans returned same number of tuples");
  if (rc != RC_NO_TUPLES)
    TEST_CHECK(rc);
  TEST_CHECK(closeScan(sc1));
  TEST_CHECK(closeScan(sc2));
 
  TEST_CHECK(closeTable(table));
  TEST_CHECK(deleteTable("test_table_r"));
  TEST_CHECK(shutdownRecordManager());

  // release everything allocated by this test; the scan handles, the
  // scratch record and the expression were previously leaked
  freeRecord(r);
  free(rids);
  free(table);
  free(sc1);
  free(sc2);
  freeExpr(se1);
  TEST_DONE();
}
Beispiel #7
0
/*
 * Scan test driver (symbolic-input variant).
 *
 * Makes the host input symbolic with klee_make_symbolic, copies it to the
 * device, runs scanExclusiveShort and scanExclusiveLarge for every array
 * length in their respective ranges and, unless _SYM is defined, compares
 * each GPU result with the host reference scanExclusiveHost.
 *
 * Returns 0 when every comparison matched, 1 otherwise.
 */
int main(int argc, char **argv)
{
  // Start logs
  //shrSetLogFileName ("scan.txt");
  printf("%s Starting...\n\n", argv[0]); 

  // Device selection (cutilDeviceInit / cudaSetDevice) is disabled in this
  // variant; whatever device is current is used.
  uint *d_Input, *d_Output;
  uint *h_Input, *h_OutputCPU, *h_OutputGPU;

  // The full-size run used N = 13 * 1048576 / 2; kept small here.
  const uint N = 2048;

  printf("Allocating and initializing host arrays...\n");
  h_Input     = (uint *)malloc(N * sizeof(uint));
  h_OutputCPU = (uint *)malloc(N * sizeof(uint));
  h_OutputGPU = (uint *)malloc(N * sizeof(uint));
  // Input is symbolic rather than filled with rand().
  klee_make_symbolic(h_Input, sizeof(uint) * N, "input"); 
  printf("Allocating and initializing CUDA arrays...\n");
  cudaMalloc((void **)&d_Input, N * sizeof(uint));
  cudaMalloc((void **)&d_Output, N * sizeof(uint));

  cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice);

  printf("Initializing CUDA-C scan...\n\n");
  initScan();

  int globalFlag = 1;
  size_t szWorkgroup;
  // The timed version used 100 cycles; here the value is only printed.
  const int iCycles = 5;
  printf("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles);
  for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1){
     printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
     // Timing and the iCycles repetition loop are disabled: one run per length.
     szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength);

     printf("Validating the results...\n");
     printf("...reading back GPU results\n");

     cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost);

     printf(" ...scanExclusiveHost()\n");
     // The host reference must read the host copy of the input; it used to
     // be handed the device pointer d_Input.
     scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength);

     // Compare GPU results with CPU results and accumulate error for this test
     printf(" ...comparing the results\n");
     int localFlag = 1;
#ifndef _SYM
     for(uint i = 0; i < N; i++)
     {
       if(h_OutputCPU[i] != h_OutputGPU[i])
       {
         localFlag = 0;
         break;
       }
     }
#endif

     // Log message on individual test result, then accumulate to global flag
     printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
     globalFlag = globalFlag && localFlag;

     // Data log (the throughput line is disabled; only the blank lines remain)
     if (arrayLength == MAX_SHORT_ARRAY_SIZE)
     {
        printf("\n");
        printf("\n");
     }
  }
  printf("***Running GPU scan for large arrays (%d identical iterations)...\n\n", iCycles);
  for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1){
     printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
     printf("The arrayLength in scanExclusiveLarge: %u\n", arrayLength);
     szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength);

     printf("Validating the results...\n");
     printf("...reading back GPU results\n");
     cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost);

     printf("...scanExclusiveHost()\n");
     // Host reference reads the host input (see the short-array loop).
     scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength);

     // Compare GPU results with CPU results and accumulate error for this test
     printf(" ...comparing the results\n");
     int localFlag = 1;
#ifndef _SYM
     for(uint i = 0; i < N; i++)
     {
         if(h_OutputCPU[i] != h_OutputGPU[i])
         {
           localFlag = 0;
           break;
         }
     }
#endif
     // Log message on individual test result, then accumulate to global flag
     printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
     globalFlag = globalFlag && localFlag;

     // Data log (the throughput line is disabled; only the blank lines remain)
     if (arrayLength == MAX_LARGE_ARRAY_SIZE)
     {
        printf("\n");
        printf("\n");
     }
  }
  (void)szWorkgroup;  // only consumed by the disabled throughput log

  // pass or fail (cumulative... all tests in the loop)
  printf("%s", globalFlag ? "PASSED\n\n" : "FAILED\n\n");

  printf("Shutting down...\n");
  closeScan();

  cudaFree(d_Output);
  cudaFree(d_Input);

  free(h_Input);
  free(h_OutputCPU);
  free(h_OutputGPU);

  // Report the cumulative result through the exit status as well.
  return globalFlag ? 0 : 1;
}
Beispiel #8
0
/*
 * Scan sample driver.
 *
 * Fills a host array with pseudo-random values (fixed seed), copies it to
 * the device, then times and validates scanExclusiveShort and
 * scanExclusiveLarge for every array length in their respective ranges
 * against the host reference scanExclusiveHost.
 *
 * Exits with EXIT_SUCCESS when every comparison matched, EXIT_FAILURE
 * otherwise (including host allocation failure).
 */
int main(int argc, char **argv)
{
    printf("%s Starting...\n\n", argv[0]);

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    findCudaDevice(argc, (const char **)argv);

    uint *d_Input, *d_Output;
    uint *h_Input, *h_OutputCPU, *h_OutputGPU;
    StopWatchInterface  *hTimer = NULL;
    const uint N = 13 * 1048576 / 2;

    printf("Allocating and initializing host arrays...\n");
    sdkCreateTimer(&hTimer);
    h_Input     = (uint *)malloc(N * sizeof(uint));
    h_OutputCPU = (uint *)malloc(N * sizeof(uint));
    h_OutputGPU = (uint *)malloc(N * sizeof(uint));

    // The buffers are written unconditionally below, so stop on failure.
    if (h_Input == NULL || h_OutputCPU == NULL || h_OutputGPU == NULL)
    {
        fprintf(stderr, "Failed to allocate host arrays\n");
        free(h_Input);
        free(h_OutputCPU);
        free(h_OutputGPU);
        exit(EXIT_FAILURE);
    }

    // Fixed seed keeps the input reproducible between runs.
    srand(2009);

    for (uint i = 0; i < N; i++)
    {
        h_Input[i] = rand();
    }

    printf("Allocating and initializing CUDA arrays...\n");
    checkCudaErrors(cudaMalloc((void **)&d_Input, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));
    checkCudaErrors(cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice));

    printf("Initializing CUDA-C scan...\n\n");
    initScan();

    int globalFlag = 1;
    size_t szWorkgroup;
    const int iCycles = 100;
    printf("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles);

    for (uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1)
    {
        printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
        checkCudaErrors(cudaDeviceSynchronize());
        sdkResetTimer(&hTimer);
        sdkStartTimer(&hTimer);

        for (int i = 0; i < iCycles; i++)
        {
            szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength);
        }

        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&hTimer);
        // average seconds per iteration (timer value scaled by 1e-3)
        double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles;

        printf("Validating the results...\n");
        printf("...reading back GPU results\n");
        checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost));

        printf(" ...scanExclusiveHost()\n");
        scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength);

        // Compare GPU results with CPU results and accumulate error for this test
        printf(" ...comparing the results\n");
        int localFlag = 1;

        for (uint i = 0; i < N; i++)
        {
            if (h_OutputCPU[i] != h_OutputGPU[i])
            {
                localFlag = 0;
                break;
            }
        }

        // Log message on individual test result, then accumulate to global flag
        printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
        globalFlag = globalFlag && localFlag;

        // Data log
        if (arrayLength == MAX_SHORT_ARRAY_SIZE)
        {
            printf("\n");
            printf("scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
                   (1.0e-6 * (double)arrayLength/timerValue), timerValue, (unsigned int)arrayLength, 1u, (unsigned int)szWorkgroup);
            printf("\n");
        }
    }

    printf("***Running GPU scan for large arrays (%d identical iterations)...\n\n", iCycles);

    for (uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1)
    {
        printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
        checkCudaErrors(cudaDeviceSynchronize());
        sdkResetTimer(&hTimer);
        sdkStartTimer(&hTimer);

        for (int i = 0; i < iCycles; i++)
        {
            szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength);
        }

        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&hTimer);
        // average seconds per iteration (timer value scaled by 1e-3)
        double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles;

        printf("Validating the results...\n");
        printf("...reading back GPU results\n");
        checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost));

        printf("...scanExclusiveHost()\n");
        scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength);

        // Compare GPU results with CPU results and accumulate error for this test
        printf(" ...comparing the results\n");
        int localFlag = 1;

        for (uint i = 0; i < N; i++)
        {
            if (h_OutputCPU[i] != h_OutputGPU[i])
            {
                localFlag = 0;
                break;
            }
        }

        // Log message on individual test result, then accumulate to global flag
        printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
        globalFlag = globalFlag && localFlag;

        // Data log
        if (arrayLength == MAX_LARGE_ARRAY_SIZE)
        {
            printf("\n");
            printf("scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
                   (1.0e-6 * (double)arrayLength/timerValue), timerValue, (unsigned int)arrayLength, 1u, (unsigned int)szWorkgroup);
            printf("\n");
        }
    }


    printf("Shutting down...\n");
    closeScan();
    checkCudaErrors(cudaFree(d_Output));
    checkCudaErrors(cudaFree(d_Input));

    sdkDeleteTimer(&hTimer);

    // Host buffers were previously left allocated until process exit.
    free(h_Input);
    free(h_OutputCPU);
    free(h_OutputGPU);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    // pass or fail (cumulative... all tests in the loop)
    exit(globalFlag ? EXIT_SUCCESS : EXIT_FAILURE);
}
Beispiel #9
0
/*
 * Exclusive prefix sum over a device array, replacing the input buffer.
 *
 * The element count is rounded up to a size N accepted by the scan routine
 * chosen for arrayLength (short, large or LL), an output buffer of N uints
 * is allocated, and the scan is run over N elements.
 * NOTE(review): the scan reads N elements from *d_Input, so the caller's
 * buffer presumably has to be at least N elements long -- confirm.
 *
 * d_Input      in/out: on success the old buffer is freed and *d_Input is
 *              set to the newly allocated output buffer.
 * arrayLength  number of valid elements in *d_Input.
 *
 * Returns the output buffer, or 0 when arrayLength exceeds MAX_LL_SIZE; in
 * that case *d_Input is left untouched.
 */
CUdeviceptr presum(CUdeviceptr *d_Input, uint arrayLength)
{

    uint N = 0;
    CUdeviceptr d_Output = 0;
    struct timeval start,stop;

    /* only initScan() is inside the timed interval reported below */
    gettimeofday(&start, NULL);
    initScan();
    gettimeofday(&stop, NULL);

    if(arrayLength <= MAX_SHORT_ARRAY_SIZE && arrayLength > MIN_SHORT_ARRAY_SIZE)
      {    
        /* smallest size in the doubling sequence 4, 8, ... that holds the input */
        for(uint i = 4; i<=MAX_SHORT_ARRAY_SIZE ; i<<=1){
          if(arrayLength <= i){
            N = i;
            break;
          }
        }
        checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));

        checkCudaErrors(cudaDeviceSynchronize());

        scanExclusiveShort((uint *)d_Output, (uint *)(*d_Input), N);
        //szWorkgroup = scanExclusiveShort((uint *)d_Output, (uint *)d_Input, 1, N);

        checkCudaErrors(cudaDeviceSynchronize());

    }else if(arrayLength <= MAX_LARGE_ARRAY_SIZE)
    {

      /* round up to a multiple of MAX_SHORT_ARRAY_SIZE */
      N = MAX_SHORT_ARRAY_SIZE * iDivUp(arrayLength,MAX_SHORT_ARRAY_SIZE);

      checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));      
      
      checkCudaErrors(cudaDeviceSynchronize());

      scanExclusiveLarge((uint *)d_Output, (uint *)(*d_Input), N);
      
      checkCudaErrors(cudaDeviceSynchronize());

    }else if(arrayLength <= MAX_LL_SIZE)
      {

        /* round up to a multiple of MAX_LARGE_ARRAY_SIZE */
        N = MAX_LARGE_ARRAY_SIZE * iDivUp(arrayLength,MAX_LARGE_ARRAY_SIZE);

        printf("N = %u\n",N);

        checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));      

        checkCudaErrors(cudaDeviceSynchronize());

        scanExclusiveLL((uint *)d_Output, (uint *)(*d_Input), N);
        
        checkCudaErrors(cudaDeviceSynchronize());

      }else{
      /*
       * Input too large for any scan variant.  Nothing has been allocated
       * on this path, so there is nothing to free; the previous code passed
       * the uninitialized d_Output to cuMemFree.
       */
      closeScan();

      return 0;
    }

    closeScan();

    /* hand the result back through the caller's pointer */
    cuMemFree(*d_Input);
    *d_Input = d_Output;

    
    printf("inside scan time:\n");
    printDiff(start,stop);

    return d_Output;
}