int groupByImpl(cl_mem d_Rin, int rLen, cl_mem d_Rout, cl_mem* d_startPos, int numThread, int numBlock,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,int _CPU_GPU) { cl_mem d_groupLabel=NULL; cl_mem d_writePos=NULL; cl_mem d_numGroup=NULL; int numGroup = 0; int memSize=sizeof(Record)*rLen; //sort cl_copyBuffer(d_Rout,d_Rin,memSize,index,eventList,Flag_CPU_GPU,burden,_CPU_GPU); radixSortImpl(d_Rout, rLen,32, numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU ); CL_MALLOC(&d_groupLabel, sizeof(int)*rLen ); groupByImpl_int(d_Rout, rLen, d_groupLabel,numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); CL_MALLOC( &d_writePos, sizeof(int)*rLen ); ScanPara *SP; SP=(ScanPara*)malloc(sizeof(ScanPara)); initScan(rLen,SP); scanImpl( d_groupLabel, rLen, d_writePos,index,eventList,kernel,Flag_CPU_GPU,burden,SP,_CPU_GPU ); CL_MALLOC( &d_numGroup, sizeof(int)); groupByImpl_outSize_int( d_numGroup, d_groupLabel, d_writePos, rLen,1, 1,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); int test; cl_readbuffer(&test,d_numGroup,sizeof(int),0); CL_MALLOC(d_startPos, sizeof(int)*numGroup ); groupByImpl_write_int((*d_startPos), d_groupLabel, d_writePos, rLen,numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); closeScan(SP); CL_FREE(d_groupLabel); CL_FREE( d_writePos); CL_FREE(d_numGroup ); return numGroup; }
char * serializeTableContent(RM_TableData *rel) { int i; VarString *result; RM_ScanHandle *sc = (RM_ScanHandle *) malloc(sizeof(RM_ScanHandle)); Record *r = (Record *) malloc(sizeof(Record)); MAKE_VARSTRING(result); for (i = 0; i < rel->schema->numAttr; i++) APPEND(result, "%s%s", (i != 0) ? ", " : "", rel->schema->attrNames[i]); startScan(rel, sc, NULL); while (next(sc, r) != RC_RM_NO_MORE_TUPLES) { APPEND_STRING(result, serializeRecord(r, rel->schema)); APPEND_STRING(result, "\n"); } closeScan(sc); RETURN_STRING(result); }
void testScanImpl(int rLen) { int _CPU_GPU=0; cl_event eventList[2]; int index=0; cl_kernel Kernel; int CPU_GPU; double burden; int result=0; int memSize=sizeof(int)*rLen; int outSize=sizeof(int)*rLen; void *Rin; HOST_MALLOC(Rin, memSize); generateRandInt((int*)Rin, rLen,rLen,0); void *Rout; HOST_MALLOC(Rout, outSize); cl_mem d_Rin; CL_MALLOC(&d_Rin, memSize); cl_mem d_Rout; CL_MALLOC(&d_Rout, outSize); cl_writebuffer(d_Rin, Rin, memSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU); ScanPara *SP; SP=(ScanPara*)malloc(sizeof(ScanPara)); initScan(rLen,SP); scanImpl(d_Rin,rLen,d_Rout,&index,eventList,&Kernel,&CPU_GPU,&burden,SP,_CPU_GPU); cl_readbuffer(Rout, d_Rout, outSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU); clWaitForEvents(1,&eventList[(index-1)%2]); closeScan(SP); deschedule(CPU_GPU,burden); //validateScan( (int*)Rin, rLen, (int*)Rout ); HOST_FREE(Rin); HOST_FREE(Rout); CL_FREE(d_Rin); CL_FREE(d_Rout); clReleaseKernel(Kernel); clReleaseEvent(eventList[0]); clReleaseEvent(eventList[1]); }
void testScansTwo (void) { RM_TableData *table = (RM_TableData *) malloc(sizeof(RM_TableData)); TestRecord inserts[] = { {1, "aaaa", 3}, {2, "bbbb", 2}, {3, "cccc", 1}, {4, "dddd", 3}, {5, "eeee", 5}, {6, "ffff", 1}, {7, "gggg", 3}, {8, "hhhh", 3}, {9, "iiii", 2}, {10, "jjjj", 5}, }; bool foundScan[] = { FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE }; int numInserts = 10, i; Record *r; RID *rids; Schema *schema; RM_ScanHandle *sc = (RM_ScanHandle *) malloc(sizeof(RM_ScanHandle)); Expr *sel, *left, *right, *first, *se; int rc; testName = "test creating a new table and inserting tuples"; schema = testSchema(); rids = (RID *) malloc(sizeof(RID) * numInserts); TEST_CHECK(initRecordManager(NULL)); TEST_CHECK(createTable("test_table_r",schema)); TEST_CHECK(openTable(table, "test_table_r")); // insert rows into table for(i = 0; i < numInserts; i++) { r = fromTestRecord(schema, inserts[i]); TEST_CHECK(insertRecord(table,r)); rids[i] = r->id; } TEST_CHECK(closeTable(table)); TEST_CHECK(openTable(table, "test_table_r")); // Select 1 record with INT in condition a=2. MAKE_CONS(left, stringToValue("i2")); MAKE_ATTRREF(right, 0); MAKE_BINOP_EXPR(sel, left, right, OP_COMP_EQUAL); createRecord(&r, schema); TEST_CHECK(startScan(table, sc, sel)); while((rc = next(sc, r)) == RC_OK) { ASSERT_EQUALS_RECORDS(fromTestRecord(schema, inserts[1]), r, schema, "compare records"); } if (rc != RC_NO_TUPLES) TEST_CHECK(rc); TEST_CHECK(closeScan(sc)); // Select 1 record with STRING in condition b='ffff'. MAKE_CONS(left, stringToValue("sffff")); MAKE_ATTRREF(right, 1); MAKE_BINOP_EXPR(sel, left, right, OP_COMP_EQUAL); createRecord(&r, schema); TEST_CHECK(startScan(table, sc, sel)); while((rc = next(sc, r)) == RC_OK) { ASSERT_EQUALS_RECORDS(fromTestRecord(schema, inserts[5]), r, schema, "compare records"); serializeRecord(r, schema); } if (rc != RC_NO_TUPLES) TEST_CHECK(rc); TEST_CHECK(closeScan(sc)); // Select all records, with condition being false MAKE_CONS(left, stringToValue("i4")); MAKE_ATTRREF(right, 2); MAKE_BINOP_EXPR(first, right, left, OP_COMP_SMALLER); MAKE_UNOP_EXPR(se, first, OP_BOOL_NOT); TEST_CHECK(startScan(table, sc, se)); while((rc = next(sc, r)) == RC_OK) { serializeRecord(r, schema); for(i = 0; i < numInserts; i++) { if (memcmp(fromTestRecord(schema, inserts[i])->data,r->data,getRecordSize(schema)) == 0) foundScan[i] = TRUE; } } if (rc != RC_NO_TUPLES) TEST_CHECK(rc); TEST_CHECK(closeScan(sc)); ASSERT_TRUE(!foundScan[0], "not greater than four"); ASSERT_TRUE(foundScan[4], "greater than four"); ASSERT_TRUE(foundScan[9], "greater than four"); // clean up TEST_CHECK(closeTable(table)); TEST_CHECK(deleteTable("test_table_r")); TEST_CHECK(shutdownRecordManager()); freeRecord(r); free(table); free(sc); freeExpr(sel); TEST_DONE(); }
void testScans (void) { RM_TableData *table = (RM_TableData *) malloc(sizeof(RM_TableData)); TestRecord inserts[] = { {1, "aaaa", 3}, {2, "bbbb", 2}, {3, "cccc", 1}, {4, "dddd", 3}, {5, "eeee", 5}, {6, "ffff", 1}, {7, "gggg", 3}, {8, "hhhh", 3}, {9, "iiii", 2}, {10, "jjjj", 5}, }; TestRecord scanOneResult[] = { {3, "cccc", 1}, {6, "ffff", 1}, }; bool foundScan[] = { FALSE, FALSE }; int numInserts = 10, scanSizeOne = 2, i; Record *r; RID *rids; Schema *schema; RM_ScanHandle *sc = (RM_ScanHandle *) malloc(sizeof(RM_ScanHandle)); Expr *sel, *left, *right; int rc; testName = "test creating a new table and inserting tuples"; schema = testSchema(); rids = (RID *) malloc(sizeof(RID) * numInserts); TEST_CHECK(initRecordManager(NULL)); TEST_CHECK(createTable("test_table_r",schema)); TEST_CHECK(openTable(table, "test_table_r")); // insert rows into table for(i = 0; i < numInserts; i++) { r = fromTestRecord(schema, inserts[i]); TEST_CHECK(insertRecord(table,r)); rids[i] = r->id; } TEST_CHECK(closeTable(table)); TEST_CHECK(openTable(table, "test_table_r")); // run some scans MAKE_CONS(left, stringToValue("i1")); MAKE_ATTRREF(right, 2); MAKE_BINOP_EXPR(sel, left, right, OP_COMP_EQUAL); TEST_CHECK(startScan(table, sc, sel)); while((rc = next(sc, r)) == RC_OK) { for(i = 0; i < scanSizeOne; i++) { if (memcmp(fromTestRecord(schema, scanOneResult[i])->data,r->data,getRecordSize(schema)) == 0) foundScan[i] = TRUE; } } if (rc != RC_NO_TUPLES) TEST_CHECK(rc); TEST_CHECK(closeScan(sc)); for(i = 0; i < scanSizeOne; i++) ASSERT_TRUE(foundScan[i], "check for scan result"); // clean up TEST_CHECK(closeTable(table)); TEST_CHECK(deleteTable("test_table_r")); TEST_CHECK(shutdownRecordManager()); free(table); free(sc); freeExpr(sel); TEST_DONE(); }
void testMultipleScans(void) { RM_TableData *table = (RM_TableData *) malloc(sizeof(RM_TableData)); TestRecord inserts[] = { {1, "aaaa", 3}, {2, "bbbb", 2}, {3, "cccc", 1}, {4, "dddd", 3}, {5, "eeee", 5}, {6, "ffff", 1}, {7, "gggg", 3}, {8, "hhhh", 3}, {9, "iiii", 2}, {10, "jjjj", 5}, }; int numInserts = 10, i, scanOne=0, scanTwo=0; Record *r; RID *rids; Schema *schema; testName = "test running muliple scans "; schema = testSchema(); rids = (RID *) malloc(sizeof(RID) * numInserts); RM_ScanHandle *sc1 = (RM_ScanHandle *) malloc(sizeof(RM_ScanHandle)); RM_ScanHandle *sc2 = (RM_ScanHandle *) malloc(sizeof(RM_ScanHandle)); Expr *se1, *left, *right; int rc,rc2; TEST_CHECK(initRecordManager(NULL)); TEST_CHECK(createTable("test_table_r",schema)); TEST_CHECK(openTable(table, "test_table_r")); // insert rows into table for(i = 0; i < numInserts; i++) { r = fromTestRecord(schema, inserts[i]); TEST_CHECK(insertRecord(table,r)); rids[i] = r->id; } // Mix 2 scans with c=3 as condition MAKE_CONS(left, stringToValue("i3")); MAKE_ATTRREF(right, 2); MAKE_BINOP_EXPR(se1, left, right, OP_COMP_EQUAL); createRecord(&r, schema); TEST_CHECK(startScan(table, sc1, se1)); TEST_CHECK(startScan(table, sc2, se1)); if ((rc2 = next(sc2, r)) == RC_OK) scanTwo++; i = 0; while((rc = next(sc1, r)) == RC_OK) { scanOne++; i++; if (i % 3 == 0) if ((rc2 = next(sc2, r)) == RC_OK) scanTwo++; } while((rc2 = next(sc2, r)) == RC_OK) scanTwo++; ASSERT_TRUE(scanOne == scanTwo, "scans returned same number of tuples"); if (rc != RC_NO_TUPLES) TEST_CHECK(rc); TEST_CHECK(closeScan(sc1)); TEST_CHECK(closeScan(sc2)); TEST_CHECK(closeTable(table)); TEST_CHECK(deleteTable("test_table_r")); TEST_CHECK(shutdownRecordManager()); free(rids); free(table); TEST_DONE(); }
int main(int argc, char **argv) { // Start logs //shrSetLogFileName ("scan.txt"); printf("%s Starting...\n\n", argv[0]); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s /* if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); */ uint *d_Input, *d_Output; uint *h_Input, *h_OutputCPU, *h_OutputGPU; uint hTimer; //const uint N = 13 * 1048576 / 2; const uint N = 2048; printf("Allocating and initializing host arrays...\n"); //cutCreateTimer(&hTimer); h_Input = (uint *)malloc(N * sizeof(uint)); h_OutputCPU = (uint *)malloc(N * sizeof(uint)); h_OutputGPU = (uint *)malloc(N * sizeof(uint)); //srand(2009); //for(uint i = 0; i < N; i++) // h_Input[i] = rand(); klee_make_symbolic(h_Input, sizeof(uint) * N, "input"); printf("Allocating and initializing CUDA arrays...\n"); cudaMalloc((void **)&d_Input, N * sizeof(uint)); cudaMalloc((void **)&d_Output, N * sizeof(uint)); cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice); printf("Initializing CUDA-C scan...\n\n"); initScan(); int globalFlag = 1; size_t szWorkgroup; //const int iCycles = 100; const int iCycles = 5; printf("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1){ printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); //cutilSafeCall( cudaDeviceSynchronize() ); //cutResetTimer(hTimer); //cutStartTimer(hTimer); //for(int i = 0; i < iCycles; i++) //{ //printf("The arrayLength in scanExclusiveShort: %d, the i: %d\n", arrayLength, i); szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength); //} //cutilSafeCall( cudaDeviceSynchronize()); //cutStopTimer(hTimer); //double timerValue = 1.0e-3 * cutGetTimerValue(hTimer) / iCycles; printf("Validating the results...\n"); printf("...reading back GPU results\n"); cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost); printf(" ...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, d_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test printf(" ...comparing the results\n"); int localFlag = 1; #ifndef _SYM for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } #endif // Log message on individual test result, then accumulate to global flag printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_SHORT_ARRAY_SIZE) { printf("\n"); //printfEx(LOGBOTH | MASTER, 0, "scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", // (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); printf("\n"); } } printf("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1){ printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); //for(int i = 0; i < iCycles; i++) //{ printf("The arrayLength in scanExclusiveLarge: %d\n", arrayLength); szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength); //} printf("Validating the results...\n"); printf("...reading back GPU results\n"); cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost); printf("...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, d_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test printf(" ...comparing the results\n"); int localFlag = 1; #ifndef _SYM for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } #endif // Log message on individual test result, then accumulate to global flag printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_LARGE_ARRAY_SIZE) { printf("\n"); //printfEx(LOGBOTH | MASTER, 0, "scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", // (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); printf("\n"); } } // pass or fail (cumulative... all tests in the loop) printf(globalFlag ? "PASSED\n\n" : "FAILED\n\n"); printf("Shutting down...\n"); closeScan(); cudaFree(d_Output); cudaFree(d_Input); free(h_Input); free(h_OutputCPU); free(h_OutputGPU); }
int main(int argc, char **argv) { printf("%s Starting...\n\n", argv[0]); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s findCudaDevice(argc, (const char **)argv); uint *d_Input, *d_Output; uint *h_Input, *h_OutputCPU, *h_OutputGPU; StopWatchInterface *hTimer = NULL; const uint N = 13 * 1048576 / 2; printf("Allocating and initializing host arrays...\n"); sdkCreateTimer(&hTimer); h_Input = (uint *)malloc(N * sizeof(uint)); h_OutputCPU = (uint *)malloc(N * sizeof(uint)); h_OutputGPU = (uint *)malloc(N * sizeof(uint)); srand(2009); for (uint i = 0; i < N; i++) { h_Input[i] = rand(); } printf("Allocating and initializing CUDA arrays...\n"); checkCudaErrors(cudaMalloc((void **)&d_Input, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); checkCudaErrors(cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice)); printf("Initializing CUDA-C scan...\n\n"); initScan(); int globalFlag = 1; size_t szWorkgroup; const int iCycles = 100; printf("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles); for (uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1) { printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles; printf("Validating the results...\n"); printf("...reading back GPU results\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost)); printf(" ...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test printf(" ...comparing the results\n"); int localFlag = 1; for (uint i = 0; i < N; i++) { if (h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_SHORT_ARRAY_SIZE) { printf("\n"); printf("scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, (unsigned int)arrayLength, 1, (unsigned int)szWorkgroup); printf("\n"); } } printf("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles); for (uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1) { printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles; printf("Validating the results...\n"); printf("...reading back GPU results\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost)); printf("...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test printf(" ...comparing the results\n"); int localFlag = 1; for (uint i = 0; i < N; i++) { if (h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_LARGE_ARRAY_SIZE) { printf("\n"); printf("scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, (unsigned int)arrayLength, 1, (unsigned int)szWorkgroup); printf("\n"); } } printf("Shutting down...\n"); closeScan(); checkCudaErrors(cudaFree(d_Output)); checkCudaErrors(cudaFree(d_Input)); sdkDeleteTimer(&hTimer); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); // pass or fail (cumulative... all tests in the loop) exit(globalFlag ? EXIT_SUCCESS : EXIT_FAILURE); }
CUdeviceptr presum(CUdeviceptr *d_Input, uint arrayLength) { uint N = 0; CUdeviceptr d_Output; struct timeval start,stop; gettimeofday(&start, NULL); initScan(); gettimeofday(&stop, NULL); if(arrayLength <= MAX_SHORT_ARRAY_SIZE && arrayLength > MIN_SHORT_ARRAY_SIZE) { for(uint i = 4; i<=MAX_SHORT_ARRAY_SIZE ; i<<=1){ if(arrayLength <= i){ N = i; break; } } checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); checkCudaErrors(cudaDeviceSynchronize()); scanExclusiveShort((uint *)d_Output, (uint *)(*d_Input), N); //szWorkgroup = scanExclusiveShort((uint *)d_Output, (uint *)d_Input, 1, N); checkCudaErrors(cudaDeviceSynchronize()); }else if(arrayLength <= MAX_LARGE_ARRAY_SIZE) { N = MAX_SHORT_ARRAY_SIZE * iDivUp(arrayLength,MAX_SHORT_ARRAY_SIZE); checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); checkCudaErrors(cudaDeviceSynchronize()); scanExclusiveLarge((uint *)d_Output, (uint *)(*d_Input), N); checkCudaErrors(cudaDeviceSynchronize()); }else if(arrayLength <= MAX_LL_SIZE) { N = MAX_LARGE_ARRAY_SIZE * iDivUp(arrayLength,MAX_LARGE_ARRAY_SIZE); printf("N = %d\n",N); checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); checkCudaErrors(cudaDeviceSynchronize()); scanExclusiveLL((uint *)d_Output, (uint *)(*d_Input), N); checkCudaErrors(cudaDeviceSynchronize()); }else{ cuMemFree(d_Output); closeScan(); return NULL; } closeScan(); cuMemFree(*d_Input); *d_Input = d_Output; printf("inside scan time:\n"); printDiff(start,stop); return d_Output; }