int groupByImpl(cl_mem d_Rin, int rLen, cl_mem d_Rout, cl_mem* d_startPos, int numThread, int numBlock,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,int _CPU_GPU) { cl_mem d_groupLabel=NULL; cl_mem d_writePos=NULL; cl_mem d_numGroup=NULL; int numGroup = 0; int memSize=sizeof(Record)*rLen; //sort cl_copyBuffer(d_Rout,d_Rin,memSize,index,eventList,Flag_CPU_GPU,burden,_CPU_GPU); radixSortImpl(d_Rout, rLen,32, numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU ); CL_MALLOC(&d_groupLabel, sizeof(int)*rLen ); groupByImpl_int(d_Rout, rLen, d_groupLabel,numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); CL_MALLOC( &d_writePos, sizeof(int)*rLen ); ScanPara *SP; SP=(ScanPara*)malloc(sizeof(ScanPara)); initScan(rLen,SP); scanImpl( d_groupLabel, rLen, d_writePos,index,eventList,kernel,Flag_CPU_GPU,burden,SP,_CPU_GPU ); CL_MALLOC( &d_numGroup, sizeof(int)); groupByImpl_outSize_int( d_numGroup, d_groupLabel, d_writePos, rLen,1, 1,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); int test; cl_readbuffer(&test,d_numGroup,sizeof(int),0); CL_MALLOC(d_startPos, sizeof(int)*numGroup ); groupByImpl_write_int((*d_startPos), d_groupLabel, d_writePos, rLen,numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); closeScan(SP); CL_FREE(d_groupLabel); CL_FREE( d_writePos); CL_FREE(d_numGroup ); return numGroup; }
//return bool: if is multiple of maxNumThread //if yes, info[0]: number of blocks, info[1] = maxNumThread //if no, info[0]: number of blocks except of the last block, info[1]: number of thread in the last block void testReduceImpl( int rLen, int OPERATOR, int numThreadPB , int numMaxBlock) { int _CPU_GPU=0; int result=0; int memSize = sizeof(Record)*rLen; void * h_Rin; HOST_MALLOC(h_Rin, memSize ); generateRand((Record *)h_Rin, TEST_MAX - 11111, rLen, 0 ); void* h_Rout; unsigned int numResult = 0; cl_mem d_Rin=NULL; cl_mem d_Rout; CL_MALLOC( &d_Rin, memSize ); cl_writebuffer( d_Rin, h_Rin, memSize,0); numResult= CL_AggMaxOnly( d_Rin, rLen, &d_Rout, numThreadPB, numMaxBlock,_CPU_GPU); HOST_MALLOC(h_Rout, sizeof(Record)*numResult ); cl_readbuffer( h_Rout, d_Rout, sizeof(Record)*numResult,_CPU_GPU); //validateReduce((Record *)h_Rin, rLen,((Record *)h_Rout)[0].y, OPERATOR ); HOST_FREE( h_Rin ); HOST_FREE( h_Rout ); CL_FREE( d_Rin ); CL_FREE( d_Rout ); printf("testReduceFinish\n"); }
void gatherBeforeAgg(cl_mem d_Rin, int rLen, cl_mem d_Ragg, cl_mem d_S, int numThread, int numBlock ,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,int _CPU_GPU) { cl_mem d_loc; CL_MALLOC(&d_loc, sizeof(int)*rLen ) ; clWaitForEvents(1,&eventList[(*index-1)%2]); cl_mem d_temp; CL_MALLOC(&d_temp, sizeof(int)*rLen ) ; mapBeforeGather_int( d_Rin, rLen, d_loc, d_temp,numBlock, numThread,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); gatherImpl( d_Ragg, rLen, d_loc, d_S,rLen, numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); CL_FREE(d_temp); clWaitForEvents(1,&eventList[(*index-1)%2]); CL_FREE(d_loc); }
// Extract the value list of h_Rin into a freshly allocated device buffer
// *h_ValueList (sizeof(int)*rLen, allocated here; caller frees).
extern "C" void CL_getValueList(cl_mem h_Rin, int rLen, cl_mem *h_ValueList, int numThreadPB, int numBlock, int _CPU_GPU)
{
    cl_event eventList[2];
    int index = 0;
    cl_kernel Kernel;
    int CPU_GPU;
    double burden;

    int outputSize = sizeof(int) * rLen;
    CL_MALLOC(h_ValueList, outputSize);

    cl_mem d_tempOutput;
    CL_MALLOC(&d_tempOutput, outputSize);

    int numThread = numThreadPB * numBlock;
    // FIX: removed dead `dim3 thread/grid` locals and the commented-out CUDA
    // kernel launch left over from the port; getValueListImpl does the work.
    getValueListImpl(h_Rin, numThread, rLen, h_ValueList, d_tempOutput, numThreadPB, numBlock,
                     &index, eventList, &Kernel, &CPU_GPU, &burden, _CPU_GPU);
    clWaitForEvents(1, &eventList[(index - 1) % 2]);
    deschedule(CPU_GPU, burden);
    clReleaseKernel(Kernel);
    // NOTE(review): only eventList[0] is released here; if getValueListImpl
    // ever enqueues a second event, eventList[1] would leak — confirm.
    clReleaseEvent(eventList[0]);
    CL_FREE(d_tempOutput);
}
// Write the RID list h_RIDList back into the destination relation h_destRin.
extern "C" void CL_setRIDList(cl_mem h_RIDList, int rLen, cl_mem h_destRin, int numThreadPB, int numBlock, int _CPU_GPU)
{
    cl_event eventList[2];
    int index = 0;
    cl_kernel Kernel;
    int CPU_GPU;
    double burden;

    int outputSize = sizeof(int) * rLen;
    cl_mem d_tempOutput;
    CL_MALLOC(&d_tempOutput, outputSize);

    int numThread = numThreadPB * numBlock;
    // FIX: removed dead `dim3 thread/grid` locals left over from the CUDA port.
    setRIDListImpl(h_RIDList, d_tempOutput, numThread, rLen, h_destRin, numThreadPB, numBlock,
                   &index, eventList, &Kernel, &CPU_GPU, &burden, _CPU_GPU);
    clWaitForEvents(1, &eventList[(index - 1) % 2]);
    deschedule(CPU_GPU, burden);
    clReleaseKernel(Kernel);
    // NOTE(review): only eventList[0] is released; confirm the impl enqueues a
    // single event, otherwise eventList[1] leaks.
    clReleaseEvent(eventList[0]);
    CL_FREE(d_tempOutput);
}
void aggAfterGroupByImpl(cl_mem d_Rin, int rLen, cl_mem d_startPos, int numGroups, cl_mem d_Ragg, cl_mem d_aggResults, int OPERATOR, int numThread,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,int _CPU_GPU) { //gather============================================================= cl_mem d_S; CL_MALLOC( &d_S, sizeof(Record)*rLen ) ; gatherBeforeAgg( d_Rin, rLen, d_Ragg, d_S, 512, 256,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); //parallel aggregation after gather====================================== //numThread = 1; int numChunk = ceil(((float)numGroups)/MAX_NUM_BLOCK); int numBlock; int blockOffset; int sharedMemSize = sizeof(int)*numThread; for( int chunkIdx = 0; chunkIdx < numChunk; chunkIdx++ ) { blockOffset = chunkIdx*MAX_NUM_BLOCK; if( chunkIdx == ( numChunk - 1 ) ) { numBlock = numGroups - chunkIdx*MAX_NUM_BLOCK; } else { numBlock = MAX_NUM_BLOCK; } parallelAggregate_init(d_S, d_startPos, d_aggResults, OPERATOR, blockOffset, numGroups,numBlock, numThread, sharedMemSize, rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); } clWaitForEvents(1,&eventList[(*index-1)%2]); CL_FREE( d_S ); }
// Release every device buffer owned by a ScanPara set up by initScan().
// OpenCL buffers are reference counted, so each buffer is released once per
// outstanding reference (CL_MEM_REFERENCE_COUNT query).
// NOTE: SP itself is NOT freed here — callers that heap-allocate it must free it.
void closeScan(ScanPara *SP)
{
    cl_int status;
    cl_uint refCount = 0; // FIX: was read uninitialized (UB) if the query failed

    status = clGetMemObjectInfo(SP->tempBuffer, CL_MEM_REFERENCE_COUNT,
                                sizeof(cl_uint), &refCount, NULL);
    if (status != CL_SUCCESS)
        refCount = 0; // nothing safe to release
    while (refCount != 0)
    {
        CL_FREE(SP->tempBuffer); // drops the reference count by 1
        refCount--;
    }

    for (int i = 0; i < (int)SP->pass; i++)
    {
        refCount = 0;
        status = clGetMemObjectInfo(SP->outputBuffer[i], CL_MEM_REFERENCE_COUNT,
                                    sizeof(cl_uint), &refCount, NULL);
        if (status != CL_SUCCESS)
            refCount = 0;
        while (refCount != 0)
        {
            CL_FREE(SP->outputBuffer[i]);
            refCount--;
        }

        refCount = 0;
        status = clGetMemObjectInfo(SP->blockSumBuffer[i], CL_MEM_REFERENCE_COUNT,
                                    sizeof(cl_uint), &refCount, NULL);
        if (status != CL_SUCCESS)
            refCount = 0;
        while (refCount != 0)
        {
            CL_FREE(SP->blockSumBuffer[i]);
            refCount--;
        }
    }
}
// Release the per-level block-sum device buffers built for the multi-level
// reduce, then free the host-side pointer array that held them.
void reduce_deallocBlockSums(tempResult *tR)
{
    for (int i = 0; i < tR->d_numLevelsAllocated; i++)
    {
        CL_FREE(tR->d_scanBlockSums[i]);
    }
    free(tR->d_scanBlockSums); // FIX: dropped the bogus (void**) cast; free() takes void*
    tR->d_scanBlockSums = NULL; // guard against use-after-free / double free
    tR->d_numLevelsAllocated = 0;
}
void testScanImpl(int rLen) { int _CPU_GPU=0; cl_event eventList[2]; int index=0; cl_kernel Kernel; int CPU_GPU; double burden; int result=0; int memSize=sizeof(int)*rLen; int outSize=sizeof(int)*rLen; void *Rin; HOST_MALLOC(Rin, memSize); generateRandInt((int*)Rin, rLen,rLen,0); void *Rout; HOST_MALLOC(Rout, outSize); cl_mem d_Rin; CL_MALLOC(&d_Rin, memSize); cl_mem d_Rout; CL_MALLOC(&d_Rout, outSize); cl_writebuffer(d_Rin, Rin, memSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU); ScanPara *SP; SP=(ScanPara*)malloc(sizeof(ScanPara)); initScan(rLen,SP); scanImpl(d_Rin,rLen,d_Rout,&index,eventList,&Kernel,&CPU_GPU,&burden,SP,_CPU_GPU); cl_readbuffer(Rout, d_Rout, outSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU); clWaitForEvents(1,&eventList[(index-1)%2]); closeScan(SP); deschedule(CPU_GPU,burden); //validateScan( (int*)Rin, rLen, (int*)Rout ); HOST_FREE(Rin); HOST_FREE(Rout); CL_FREE(d_Rin); CL_FREE(d_Rout); clReleaseKernel(Kernel); clReleaseEvent(eventList[0]); clReleaseEvent(eventList[1]); }
int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv) { cl_int status; if (os->xscale != xscale || os->width < width) { cl_float *xweights = hb_bicubic_weights(xscale, width); CL_FREE(os->bicubic_x_weights); CREATEBUF(os->bicubic_x_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * width * 4); OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_x_weights, CL_TRUE, 0, sizeof(cl_float) * width * 4, xweights, 0, NULL, NULL ); os->width = width; os->xscale = xscale; free(xweights); } if ((os->yscale != yscale) || (os->height < height)) { cl_float *yweights = hb_bicubic_weights(yscale, height); CL_FREE(os->bicubic_y_weights); CREATEBUF(os->bicubic_y_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * height * 4); OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_y_weights, CL_TRUE, 0, sizeof(cl_float) * height * 4, yweights, 0, NULL, NULL ); os->height = height; os->yscale = yscale; free(yweights); } return 0; }
extern "C" int CL_GroupBy(Record * h_Rin, int rLen, Record* h_Rout, int** h_startPos, int numThread, int numBlock , int _CPU_GPU) { cl_mem d_Rin; cl_mem d_Rout; cl_mem d_startPos; ///////////////////////////////////////////////////////////////////////////////////////////////////////////// cl_event eventList[2]; int index=0; cl_kernel Kernel; int CPU_GPU; double burden; int memSize = sizeof(Record)*rLen; CL_MALLOC( &d_Rin, memSize ); CL_MALLOC(&d_Rout, memSize ); cl_writebuffer( d_Rin, h_Rin, memSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU); int numGroup = 0; numGroup= groupByImpl(d_Rin, rLen, d_Rout, &d_startPos, numThread, numBlock,&index,eventList,&Kernel,&CPU_GPU,&burden,_CPU_GPU); (*h_startPos) = (int*)malloc( sizeof(int)*numGroup ); cl_readbuffer( *h_startPos, d_startPos, sizeof(int)*numGroup,&index,eventList,&CPU_GPU,&burden,_CPU_GPU); cl_readbuffer( h_Rout, d_Rout, sizeof(Record)*rLen,&index,eventList,&CPU_GPU,&burden,_CPU_GPU); clWaitForEvents(1,&eventList[(index-1)%2]); deschedule(CPU_GPU,burden); CL_FREE( d_Rin ); CL_FREE( d_Rout ); CL_FREE( d_startPos ); clReleaseKernel(Kernel); clReleaseEvent(eventList[0]); clReleaseEvent(eventList[1]); printf("CL_GroupBy\n"); return numGroup; }
void reduceFirstPass( cl_mem d_Rin, int rLen, int numThread, int numMaxBlock, int OPERATOR,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,tempResult *tR, int _CPU_GPU) { int* info = (int*)malloc( sizeof(int)*2 ); //get the information of partition //return bool: if is multiple of maxNumThread //if yes, info[0]: number of blocks, info[1] = maxNumThread //if no, info[0]: number of blocks except of the last block, info[1]: number of thread in the last block bool isMul = howPartition( rLen, numThread, info ); //scan the isP2 blocks unsigned int numBlock = info[0]; unsigned int numElementsPerBlock = 0; unsigned int extraSpace = 0; unsigned int sharedMemSize = 0; cl_mem d_temp; //for coalsed CL_MALLOC( &d_temp, sizeof(int)*rLen ); cl_mem t_temp=NULL;//!!!!!!!!!! CL_MALLOC( &t_temp, sizeof(int)*rLen ); unsigned int base = 0; unsigned int offset = 0; cl_mem d_data; if( numBlock > 0 ) { int numChunk = ceil( (float)numBlock/numMaxBlock ); for( int chunkIdx = 0; chunkIdx < numChunk; chunkIdx++ ) { base = chunkIdx*numElementsPerBlock*numMaxBlock; offset = chunkIdx*numMaxBlock; int subNumBlock = (chunkIdx == (numChunk - 1))?( numBlock - chunkIdx*numMaxBlock ):(numMaxBlock); numElementsPerBlock = numThread*2; extraSpace = numElementsPerBlock/NUM_BANKS; sharedMemSize = sizeof(int)*( numElementsPerBlock + extraSpace ); perscanFirstPass_kernel_int(t_temp, d_temp, tR->d_scanBlockSums[0], d_Rin, numElementsPerBlock, true, base, offset, OPERATOR,subNumBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU ); clWaitForEvents(1,&eventList[(*index-1)%2]); } } clWaitForEvents(1,&eventList[(*index-1)%2]); //scan the single not isP2 block if( (!isMul) || (numBlock == 0) ) { base = numElementsPerBlock*info[0]; offset = info[0]; unsigned int remainer = rLen - numElementsPerBlock*info[0]; numThread = info[1];//update the numThread //if only one elements if( numThread == 0 ) { 
copyLastElement_kernel_int(tR->d_scanBlockSums[0], d_Rin, base, offset,1, 1,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); } else { numBlock = 1; numElementsPerBlock = numThread*2; extraSpace = numElementsPerBlock/NUM_BANKS; sharedMemSize = sizeof(int)*( numElementsPerBlock + extraSpace ); if( isPowerOfTwo( remainer ) ) { perscanFirstPass_kernel_int(t_temp, d_temp, tR->d_scanBlockSums[0], d_Rin, remainer, true, base, offset, OPERATOR ,numBlock, numThread, sharedMemSize,rLen, index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU ); clWaitForEvents(1,&eventList[(*index-1)%2]); } else { perscanFirstPass_kernel_int(t_temp,d_temp, tR->d_scanBlockSums[0], d_Rin, remainer, false, base, offset, OPERATOR ,numBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU ); clWaitForEvents(1,&eventList[(*index-1)%2]); } } } clWaitForEvents(1,&eventList[(*index-1)%2]); CL_FREE( d_temp ); CL_FREE( t_temp ); }
int reduceBlockSums( cl_mem d_Rout, int maxNumThread, int OPERATOR, int rLen,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,tempResult *tR, int _CPU_GPU) { int* info = (int*)malloc( sizeof(int)*2 ); cl_mem temp=NULL; CL_MALLOC(&temp, sizeof(int)*rLen ); //get the information of partition //return bool: if is multiple of maxNumThread //if yes, info[0]: number of blocks, info[1] = maxNumThread //if no, info[0]: number of blocks except of the last block, info[1]: number of thread in the last block for( int level = 0; level < ( tR->d_numLevelsAllocated - 1 ); level++ ) { bool isMul = howPartition( tR->levelSize[level], maxNumThread, info ); unsigned int numBlock = info[0]; unsigned int numElementsPerBlock = 0; unsigned int extraSpace = 0; unsigned int sharedMemSize = 0; //scan the isP2 blocks if( numBlock > 0 ) { numElementsPerBlock = maxNumThread*2; extraSpace = numElementsPerBlock/NUM_BANKS; sharedMemSize = sizeof(int)*( numElementsPerBlock + extraSpace ); perscan_kernel_int(temp, tR->d_scanBlockSums[level + 1], tR->d_scanBlockSums[level], numElementsPerBlock, true, 0, 0, OPERATOR, numBlock, maxNumThread,sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); } clWaitForEvents(1,&eventList[(*index-1)%2]); //scan the single not isP2 block if( (!isMul) || (numBlock == 0) ) { unsigned int base = numElementsPerBlock*info[0]; unsigned int offset = info[0]; unsigned int remainer = tR->levelSize[level] - numElementsPerBlock*info[0]; int numThread = info[1];//update the numThread clWaitForEvents(1,&eventList[(*index-1)%2]); //only one number in the last block if( numThread == 0 ) { cl_copyBuffer((tR->d_scanBlockSums[level+1]), offset, tR->d_scanBlockSums[level], base, sizeof(int), index,eventList,Flag_CPU_GPU,burden,_CPU_GPU); } else { numBlock = 1; numElementsPerBlock = numThread*2; extraSpace = numElementsPerBlock/NUM_BANKS; sharedMemSize = sizeof(int)*( numElementsPerBlock + 
extraSpace ); if( isPowerOfTwo( remainer ) ) { perscan_kernel_int(temp, tR->d_scanBlockSums[level + 1],tR->d_scanBlockSums[level], remainer, true, base, offset, OPERATOR,numBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); } else { perscan_kernel_int(temp,tR->d_scanBlockSums[level + 1], tR->d_scanBlockSums[level], remainer, false, base, offset, OPERATOR,numBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); } } } } clWaitForEvents(1,&eventList[(*index-1)%2]); getResult_kernel_init(tR->d_scanBlockSums[tR->d_numLevelsAllocated - 1], d_Rout, rLen, OPERATOR,1,1,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); clWaitForEvents(1,&eventList[(*index-1)%2]); CL_FREE(temp); return 1; }