int groupByImpl(cl_mem d_Rin, int rLen, cl_mem d_Rout, cl_mem* d_startPos, int numThread, int numBlock,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,int _CPU_GPU)
{
	cl_mem d_groupLabel=NULL;
	cl_mem d_writePos=NULL;
	cl_mem d_numGroup=NULL;
	int numGroup = 0;
	int memSize=sizeof(Record)*rLen;	
	//sort
	cl_copyBuffer(d_Rout,d_Rin,memSize,index,eventList,Flag_CPU_GPU,burden,_CPU_GPU);
	radixSortImpl(d_Rout, rLen,32, numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU );
	CL_MALLOC(&d_groupLabel, sizeof(int)*rLen );
	groupByImpl_int(d_Rout, rLen, d_groupLabel,numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
	CL_MALLOC( &d_writePos, sizeof(int)*rLen );
	ScanPara *SP;
	SP=(ScanPara*)malloc(sizeof(ScanPara));
	initScan(rLen,SP);
	scanImpl( d_groupLabel, rLen, d_writePos,index,eventList,kernel,Flag_CPU_GPU,burden,SP,_CPU_GPU );
	CL_MALLOC( &d_numGroup, sizeof(int));
	groupByImpl_outSize_int( d_numGroup, d_groupLabel, d_writePos, rLen,1, 1,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(*index-1)%2]); 

	int test;
	cl_readbuffer(&test,d_numGroup,sizeof(int),0);



	CL_MALLOC(d_startPos, sizeof(int)*numGroup );
	groupByImpl_write_int((*d_startPos), d_groupLabel, d_writePos, rLen,numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(*index-1)%2]); 
	closeScan(SP);
	CL_FREE(d_groupLabel);
	CL_FREE( d_writePos);
	CL_FREE(d_numGroup );
	return numGroup;
}
//return bool: if is multiple of maxNumThread
//if yes, info[0]: number of blocks, info[1] = maxNumThread
//if no, info[0]: number of blocks except of the last block, info[1]: number of thread in the last block
void testReduceImpl( int rLen, int OPERATOR, int numThreadPB , int numMaxBlock)
{
	int _CPU_GPU=0;
	int result=0;
	int memSize = sizeof(Record)*rLen;

	void * h_Rin;
	HOST_MALLOC(h_Rin, memSize );
	generateRand((Record *)h_Rin, TEST_MAX - 11111, rLen, 0 );

	void* h_Rout;

	unsigned int numResult = 0;
	cl_mem d_Rin=NULL;
	cl_mem d_Rout;
	CL_MALLOC( &d_Rin, memSize );
	cl_writebuffer( d_Rin, h_Rin, memSize,0);
	numResult= CL_AggMaxOnly( d_Rin, rLen, &d_Rout, numThreadPB, numMaxBlock,_CPU_GPU);
	HOST_MALLOC(h_Rout, sizeof(Record)*numResult );
	cl_readbuffer( h_Rout, d_Rout, sizeof(Record)*numResult,_CPU_GPU);
	//validateReduce((Record *)h_Rin, rLen,((Record *)h_Rout)[0].y, OPERATOR );	
	HOST_FREE( h_Rin );
	HOST_FREE( h_Rout );
	CL_FREE( d_Rin );
	CL_FREE( d_Rout );
	printf("testReduceFinish\n");
}
void gatherBeforeAgg(cl_mem d_Rin, int rLen, cl_mem d_Ragg, cl_mem d_S, int numThread, int numBlock ,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,int _CPU_GPU)
{
	cl_mem d_loc;
	CL_MALLOC(&d_loc, sizeof(int)*rLen ) ;
	clWaitForEvents(1,&eventList[(*index-1)%2]);
	cl_mem d_temp;
	CL_MALLOC(&d_temp, sizeof(int)*rLen ) ;

	mapBeforeGather_int( d_Rin, rLen, d_loc, d_temp,numBlock, numThread,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
	gatherImpl( d_Ragg, rLen, d_loc, d_S,rLen, numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
	CL_FREE(d_temp);
	clWaitForEvents(1,&eventList[(*index-1)%2]);
	CL_FREE(d_loc);
}
extern "C" void CL_getValueList( cl_mem h_Rin, int rLen, cl_mem* h_ValueList,int numThreadPB, int numBlock,int _CPU_GPU)
{
	cl_event eventList[2];
	int index=0;
	cl_kernel Kernel; 
	int CPU_GPU;
	double burden;
	int outputSize=sizeof(int)*rLen;
	CL_MALLOC(h_ValueList, outputSize);
	cl_mem d_tempOutput;
	CL_MALLOC(&d_tempOutput, outputSize);

	int numThread=numThreadPB*numBlock;
	dim3  thread( numThreadPB, 1, 1);
	dim3  grid( numBlock, 1 , 1);
//	getValueList_kernel(Record *d_R, int delta, int rLen,int *d_ValueList, int *d_output) 
//	getValueList_kernel<<<grid,thread>>>(d_Rin, numThread, rLen, *d_ValueList, d_tempOutput);
	getValueListImpl(h_Rin,numThread,rLen,h_ValueList,d_tempOutput,numThreadPB,numBlock,&index,eventList,&Kernel,&CPU_GPU,&burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(index-1)%2]); 
	deschedule(CPU_GPU,burden);
	clReleaseKernel(Kernel);  
	clReleaseEvent(eventList[0]);
	CL_FREE(d_tempOutput);
	//bufferchecking(*h_ValueList,sizeof(Record)*1);
}
extern "C" void CL_setRIDList(cl_mem h_RIDList, int rLen, cl_mem h_destRin, int numThreadPB, int numBlock,int _CPU_GPU)
{
	cl_event eventList[2];
	int index=0;
	cl_kernel Kernel; 
	int CPU_GPU;
	double burden;
	////
	int outputSize=sizeof(int)*rLen;
	cl_mem d_tempOutput;
	CL_MALLOC(&d_tempOutput, outputSize);

	int numThread=numThreadPB*numBlock;
	dim3  thread( numThreadPB, 1, 1);//????
	dim3  grid( numBlock, 1 , 1);//??????????

	setRIDListImpl (h_RIDList,d_tempOutput,numThread, rLen, h_destRin, numThreadPB, numBlock,&index,eventList,&Kernel,&CPU_GPU,&burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(index-1)%2]); 
	deschedule(CPU_GPU,burden);
	clReleaseKernel(Kernel);  
	clReleaseEvent(eventList[0]);
	CL_FREE(d_tempOutput);
	//bufferchecking(h_destRin,sizeof(Record)*1);

}
void aggAfterGroupByImpl(cl_mem d_Rin, int rLen, cl_mem d_startPos, int numGroups, cl_mem d_Ragg, cl_mem d_aggResults, int OPERATOR,  int numThread,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,int _CPU_GPU)
{
	//gather=============================================================
	cl_mem d_S;
	CL_MALLOC( &d_S, sizeof(Record)*rLen ) ;
	gatherBeforeAgg( d_Rin, rLen, d_Ragg, d_S, 512, 256,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(*index-1)%2]);
	//parallel aggregation after gather======================================
	//numThread = 1;
	int numChunk = ceil(((float)numGroups)/MAX_NUM_BLOCK);
	int numBlock;
	int blockOffset;

	int sharedMemSize = sizeof(int)*numThread;

	for( int chunkIdx = 0; chunkIdx < numChunk; chunkIdx++ )
	{
		blockOffset = chunkIdx*MAX_NUM_BLOCK;
		if( chunkIdx == ( numChunk - 1 ) )
		{
			numBlock = numGroups - chunkIdx*MAX_NUM_BLOCK;
		}
		else
		{
			numBlock = MAX_NUM_BLOCK;
		}
		parallelAggregate_init(d_S, d_startPos, d_aggResults, OPERATOR, blockOffset, numGroups,numBlock, numThread, sharedMemSize, rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
		clWaitForEvents(1,&eventList[(*index-1)%2]);
	}
	clWaitForEvents(1,&eventList[(*index-1)%2]); 
	CL_FREE( d_S );
}
void closeScan(ScanPara* SP){
    cl_int status;
	cl_uint refCount;
	status = clGetMemObjectInfo(SP->tempBuffer,
                		        CL_MEM_REFERENCE_COUNT,
							    sizeof(cl_uint),
						        &refCount,
								NULL);
	//printf("refCount: %d\n", refCount);
	while(refCount != 0)
	{
		//printf("release tempBuffer\n\n");
		CL_FREE(SP->tempBuffer); //reduce count by 1	
		//printf("success release tempBuffer\n");
		refCount--;
	}
    for(int i = 0; i < (int)SP->pass; i++)
    {
		status = clGetMemObjectInfo(SP->outputBuffer[i],
                		            CL_MEM_REFERENCE_COUNT,
									sizeof(cl_uint),
									&refCount,
									NULL);
		while(refCount != 0)
		{
			//printf("release outputBuffer\n\n");
			CL_FREE(SP->outputBuffer[i]); //reduce count by 1
			refCount--;
			//printf("success release outputBuffer\n");
		}

		status = clGetMemObjectInfo(SP->blockSumBuffer[i],
                		            CL_MEM_REFERENCE_COUNT,
									sizeof(cl_uint),
									&refCount,
									NULL);
		while(refCount != 0)
		{
			//printf("release blockSumBuffer\n\n");
			CL_FREE(SP->blockSumBuffer[i]); //reduce count by 1
			refCount--;
			//printf("success release blockSumBuffer\n");
		}
    }

	//HOST_FREE(SP);
}
void reduce_deallocBlockSums(tempResult *tR)
{
    for (int i = 0; i < tR->d_numLevelsAllocated; i++)
    {
		CL_FREE(tR->d_scanBlockSums[i]);
    }

    free((void**)tR->d_scanBlockSums);
    tR->d_numLevelsAllocated = 0;
}
void testScanImpl(int rLen)
{
	int _CPU_GPU=0;
	cl_event eventList[2];
	int index=0;
	cl_kernel Kernel; 
	int CPU_GPU;
	double burden;	
	int result=0;
	int memSize=sizeof(int)*rLen;
	int outSize=sizeof(int)*rLen;
	void *Rin;
	HOST_MALLOC(Rin, memSize);
	generateRandInt((int*)Rin, rLen,rLen,0);
	void *Rout;
	HOST_MALLOC(Rout, outSize);
	cl_mem d_Rin;
	CL_MALLOC(&d_Rin, memSize);
	cl_mem d_Rout;
	CL_MALLOC(&d_Rout, outSize);
	cl_writebuffer(d_Rin, Rin, memSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
	ScanPara *SP;
	SP=(ScanPara*)malloc(sizeof(ScanPara));
	initScan(rLen,SP);
	scanImpl(d_Rin,rLen,d_Rout,&index,eventList,&Kernel,&CPU_GPU,&burden,SP,_CPU_GPU);	
	cl_readbuffer(Rout, d_Rout, outSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(index-1)%2]);
	closeScan(SP);
	deschedule(CPU_GPU,burden);
	//validateScan( (int*)Rin, rLen, (int*)Rout );
	HOST_FREE(Rin);
	HOST_FREE(Rout);
	CL_FREE(d_Rin);
	CL_FREE(d_Rout);
	clReleaseKernel(Kernel);  
	clReleaseEvent(eventList[0]);
	clReleaseEvent(eventList[1]);
}
示例#10
0
int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv) {
    cl_int status;
    if (os->xscale != xscale || os->width < width) {
        cl_float *xweights = hb_bicubic_weights(xscale, width);
        CL_FREE(os->bicubic_x_weights);
        CREATEBUF(os->bicubic_x_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * width * 4);
        OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_x_weights, CL_TRUE, 0, sizeof(cl_float) * width * 4, xweights, 0, NULL, NULL );
        os->width = width;
        os->xscale = xscale;
        free(xweights);
    }

    if ((os->yscale != yscale) || (os->height < height)) {
        cl_float *yweights = hb_bicubic_weights(yscale, height);
        CL_FREE(os->bicubic_y_weights);
        CREATEBUF(os->bicubic_y_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * height * 4);
        OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_y_weights, CL_TRUE, 0, sizeof(cl_float) * height * 4, yweights, 0, NULL, NULL );
        os->height = height;
        os->yscale = yscale;
        free(yweights);
    }
    return 0;
}
extern "C" int CL_GroupBy(Record * h_Rin, int rLen, Record* h_Rout, int** h_startPos,
                          int numThread, int numBlock , int _CPU_GPU)
{
    cl_mem d_Rin;
    cl_mem d_Rout;
    cl_mem d_startPos;
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    cl_event eventList[2];
    int index=0;
    cl_kernel Kernel;
    int CPU_GPU;
    double burden;
    int memSize = sizeof(Record)*rLen;


    CL_MALLOC( &d_Rin, memSize );
    CL_MALLOC(&d_Rout, memSize );
    cl_writebuffer( d_Rin, h_Rin, memSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
    int numGroup = 0;


    numGroup= groupByImpl(d_Rin, rLen, d_Rout, &d_startPos, numThread, numBlock,&index,eventList,&Kernel,&CPU_GPU,&burden,_CPU_GPU);
    (*h_startPos) = (int*)malloc( sizeof(int)*numGroup );

    cl_readbuffer( *h_startPos, d_startPos, sizeof(int)*numGroup,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
    cl_readbuffer( h_Rout, d_Rout, sizeof(Record)*rLen,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
    clWaitForEvents(1,&eventList[(index-1)%2]);
    deschedule(CPU_GPU,burden);
    CL_FREE( d_Rin );
    CL_FREE( d_Rout );
    CL_FREE( d_startPos );
    clReleaseKernel(Kernel);
    clReleaseEvent(eventList[0]);
    clReleaseEvent(eventList[1]);
    printf("CL_GroupBy\n");
    return numGroup;
}
void reduceFirstPass( cl_mem d_Rin, int rLen, int numThread, int numMaxBlock, int OPERATOR,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,tempResult *tR, int _CPU_GPU)
{
	int* info = (int*)malloc( sizeof(int)*2 );
	//get the information of partition	
	//return bool: if is multiple of maxNumThread
	//if yes, info[0]: number of blocks, info[1] = maxNumThread
	//if no, info[0]: number of blocks except of the last block, info[1]: number of thread in the last block
	bool isMul = howPartition( rLen, numThread, info );

	//scan the isP2 blocks	
	unsigned int numBlock = info[0];
	unsigned int numElementsPerBlock = 0;
	unsigned int extraSpace = 0;
	unsigned int sharedMemSize = 0;

	cl_mem d_temp; //for coalsed 
	CL_MALLOC( &d_temp, sizeof(int)*rLen );
	cl_mem t_temp=NULL;//!!!!!!!!!!
	CL_MALLOC( &t_temp, sizeof(int)*rLen );
	
	
	unsigned int base = 0;
	unsigned int offset = 0;
	cl_mem d_data;

	if( numBlock > 0 )
	{
		int numChunk = ceil( (float)numBlock/numMaxBlock );

		for( int chunkIdx = 0; chunkIdx < numChunk; chunkIdx++ )
		{
			base = chunkIdx*numElementsPerBlock*numMaxBlock;
			offset = chunkIdx*numMaxBlock;
			int subNumBlock = (chunkIdx == (numChunk - 1))?( numBlock - chunkIdx*numMaxBlock ):(numMaxBlock);

			numElementsPerBlock = numThread*2;
			extraSpace = numElementsPerBlock/NUM_BANKS;
			sharedMemSize = sizeof(int)*( numElementsPerBlock + extraSpace );
  			perscanFirstPass_kernel_int(t_temp, d_temp, tR->d_scanBlockSums[0], d_Rin, numElementsPerBlock, true, base, offset, OPERATOR,subNumBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU );
			clWaitForEvents(1,&eventList[(*index-1)%2]); 
		}
	
	}
	clWaitForEvents(1,&eventList[(*index-1)%2]); 
	//scan the single not isP2 block
	if( (!isMul) || (numBlock == 0) )
	{
		base = numElementsPerBlock*info[0];
		offset = info[0];
		unsigned int remainer = rLen - numElementsPerBlock*info[0];

		numThread = info[1];//update the numThread
		
		//if only one elements
		if( numThread == 0 )
		{
			copyLastElement_kernel_int(tR->d_scanBlockSums[0], d_Rin, base, offset,1, 1,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
			clWaitForEvents(1,&eventList[(*index-1)%2]); 
		}
		else
		{
			numBlock = 1;
			numElementsPerBlock = numThread*2;
			extraSpace = numElementsPerBlock/NUM_BANKS;
			sharedMemSize = sizeof(int)*( numElementsPerBlock + extraSpace );
			
			if( isPowerOfTwo( remainer ) )
			{

				perscanFirstPass_kernel_int(t_temp, d_temp, tR->d_scanBlockSums[0], d_Rin, remainer, true, base, offset, OPERATOR ,numBlock, numThread, sharedMemSize,rLen, index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU );
				clWaitForEvents(1,&eventList[(*index-1)%2]); 
			}
			else
			{
				perscanFirstPass_kernel_int(t_temp,d_temp, tR->d_scanBlockSums[0], d_Rin, remainer, false, base, offset, OPERATOR ,numBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU );
				clWaitForEvents(1,&eventList[(*index-1)%2]); 
			}	
		}
	
	}
	clWaitForEvents(1,&eventList[(*index-1)%2]);
	CL_FREE( d_temp );
	CL_FREE( t_temp );
}
int reduceBlockSums( cl_mem d_Rout, int maxNumThread, int OPERATOR, int rLen,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,tempResult *tR, int _CPU_GPU)
{
	int* info = (int*)malloc( sizeof(int)*2 );
	cl_mem temp=NULL;
	CL_MALLOC(&temp, sizeof(int)*rLen );
	//get the information of partition	
	//return bool: if is multiple of maxNumThread
	//if yes, info[0]: number of blocks, info[1] = maxNumThread
	//if no, info[0]: number of blocks except of the last block, info[1]: number of thread in the last block
	for( int level = 0; level < ( tR->d_numLevelsAllocated - 1 ); level++ )
	{
		bool isMul = howPartition( tR->levelSize[level], maxNumThread, info );

		unsigned int numBlock = info[0];
		unsigned int numElementsPerBlock = 0;
		unsigned int extraSpace = 0;
		unsigned int sharedMemSize = 0;
		
		//scan the isP2 blocks
		if( numBlock > 0 )
		{
			numElementsPerBlock = maxNumThread*2;
			extraSpace = numElementsPerBlock/NUM_BANKS;
			sharedMemSize = sizeof(int)*( numElementsPerBlock + extraSpace );			
			perscan_kernel_int(temp, tR->d_scanBlockSums[level + 1], tR->d_scanBlockSums[level], numElementsPerBlock, true, 0, 0, OPERATOR, numBlock, maxNumThread,sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
			clWaitForEvents(1,&eventList[(*index-1)%2]); 
		}
		clWaitForEvents(1,&eventList[(*index-1)%2]); 
		//scan the single not isP2 block
		if( (!isMul) || (numBlock == 0) )
		{
			unsigned int base = numElementsPerBlock*info[0];
			unsigned int offset = info[0];
			unsigned int remainer = tR->levelSize[level] - numElementsPerBlock*info[0];

			int numThread = info[1];//update the numThread
			clWaitForEvents(1,&eventList[(*index-1)%2]); 
			//only one number in the last block
			if( numThread == 0 )
			{				
				cl_copyBuffer((tR->d_scanBlockSums[level+1]), offset,
					tR->d_scanBlockSums[level], base, sizeof(int), index,eventList,Flag_CPU_GPU,burden,_CPU_GPU); 
			
			}
			else
			{
				numBlock = 1;
				numElementsPerBlock = numThread*2;
				extraSpace = numElementsPerBlock/NUM_BANKS;
				sharedMemSize = sizeof(int)*( numElementsPerBlock + extraSpace );
				
				if( isPowerOfTwo( remainer ) )
				{
					perscan_kernel_int(temp, tR->d_scanBlockSums[level + 1],tR->d_scanBlockSums[level], remainer, true, base, offset, OPERATOR,numBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU); 
					clWaitForEvents(1,&eventList[(*index-1)%2]); 
				}
				else
				{
					
					perscan_kernel_int(temp,tR->d_scanBlockSums[level + 1], tR->d_scanBlockSums[level], remainer, false, base, offset, OPERATOR,numBlock, numThread, sharedMemSize,rLen,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
					clWaitForEvents(1,&eventList[(*index-1)%2]); 
				}
			}
			
		}
	}
	clWaitForEvents(1,&eventList[(*index-1)%2]); 
	getResult_kernel_init(tR->d_scanBlockSums[tR->d_numLevelsAllocated - 1], d_Rout, rLen, OPERATOR,1,1,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(*index-1)%2]); 
	CL_FREE(temp);
	return 1;
}