int groupByImpl(cl_mem d_Rin, int rLen, cl_mem d_Rout, cl_mem* d_startPos, int numThread, int numBlock,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,int _CPU_GPU)
{
	cl_mem d_groupLabel=NULL;
	cl_mem d_writePos=NULL;
	cl_mem d_numGroup=NULL;
	int numGroup = 0;
	int memSize=sizeof(Record)*rLen;	
	//sort
	cl_copyBuffer(d_Rout,d_Rin,memSize,index,eventList,Flag_CPU_GPU,burden,_CPU_GPU);
	radixSortImpl(d_Rout, rLen,32, numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU );
	CL_MALLOC(&d_groupLabel, sizeof(int)*rLen );
	groupByImpl_int(d_Rout, rLen, d_groupLabel,numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
	CL_MALLOC( &d_writePos, sizeof(int)*rLen );
	ScanPara *SP;
	SP=(ScanPara*)malloc(sizeof(ScanPara));
	initScan(rLen,SP);
	scanImpl( d_groupLabel, rLen, d_writePos,index,eventList,kernel,Flag_CPU_GPU,burden,SP,_CPU_GPU );
	CL_MALLOC( &d_numGroup, sizeof(int));
	groupByImpl_outSize_int( d_numGroup, d_groupLabel, d_writePos, rLen,1, 1,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(*index-1)%2]); 

	int test;
	cl_readbuffer(&test,d_numGroup,sizeof(int),0);



	CL_MALLOC(d_startPos, sizeof(int)*numGroup );
	groupByImpl_write_int((*d_startPos), d_groupLabel, d_writePos, rLen,numThread, numBlock,index,eventList,kernel,Flag_CPU_GPU,burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(*index-1)%2]); 
	closeScan(SP);
	CL_FREE(d_groupLabel);
	CL_FREE( d_writePos);
	CL_FREE(d_numGroup );
	return numGroup;
}
void testScanImpl(int rLen)
{
	int _CPU_GPU=0;
	cl_event eventList[2];
	int index=0;
	cl_kernel Kernel; 
	int CPU_GPU;
	double burden;	
	int result=0;
	int memSize=sizeof(int)*rLen;
	int outSize=sizeof(int)*rLen;
	void *Rin;
	HOST_MALLOC(Rin, memSize);
	generateRandInt((int*)Rin, rLen,rLen,0);
	void *Rout;
	HOST_MALLOC(Rout, outSize);
	cl_mem d_Rin;
	CL_MALLOC(&d_Rin, memSize);
	cl_mem d_Rout;
	CL_MALLOC(&d_Rout, outSize);
	cl_writebuffer(d_Rin, Rin, memSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
	ScanPara *SP;
	SP=(ScanPara*)malloc(sizeof(ScanPara));
	initScan(rLen,SP);
	scanImpl(d_Rin,rLen,d_Rout,&index,eventList,&Kernel,&CPU_GPU,&burden,SP,_CPU_GPU);	
	cl_readbuffer(Rout, d_Rout, outSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
	clWaitForEvents(1,&eventList[(index-1)%2]);
	closeScan(SP);
	deschedule(CPU_GPU,burden);
	//validateScan( (int*)Rin, rLen, (int*)Rout );
	HOST_FREE(Rin);
	HOST_FREE(Rout);
	CL_FREE(d_Rin);
	CL_FREE(d_Rout);
	clReleaseKernel(Kernel);  
	clReleaseEvent(eventList[0]);
	clReleaseEvent(eventList[1]);
}
Exemple #3
0
struct tableNode * groupBy(struct groupByNode * gb, struct clContext * context, struct statistic * pp){

    struct timespec start,end;
    clock_gettime(CLOCK_REALTIME,&start);

    cl_event ndrEvt;
    cl_ulong startTime,endTime;

    struct tableNode * res = NULL;
    long gpuTupleNum;
    int gpuGbColNum;
    cl_mem gpuGbIndex;
    cl_mem gpuGbType, gpuGbSize;

    cl_mem gpuGbKey;
    cl_mem gpuContent;

    int gbCount;                // the number of groups
    int gbConstant = 0;         // whether group by constant

    cl_int error = 0;

    res = (struct tableNode *) malloc(sizeof(struct tableNode));
    CHECK_POINTER(res);
    res->tupleSize = gb->tupleSize;
    res->totalAttr = gb->outputAttrNum;
    res->attrType = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrType);
    res->attrSize = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrSize);
    res->attrTotalSize = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->attrTotalSize);
    res->dataPos = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->dataPos);
    res->dataFormat = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(res->dataFormat);
    res->content = (char **) malloc(sizeof(char **) * res->totalAttr);
    CHECK_POINTER(res->content);

    for(int i=0;i<res->totalAttr;i++){
        res->attrType[i] = gb->attrType[i];
        res->attrSize[i] = gb->attrSize[i];
        res->dataFormat[i] = UNCOMPRESSED;
    }
    
    gpuTupleNum = gb->table->tupleNum;
    gpuGbColNum = gb->groupByColNum;

    if(gpuGbColNum == 1 && gb->groupByIndex[0] == -1){

        gbConstant = 1;
    }

    size_t localSize = 128;
    size_t globalSize = 1024*128;

    int blockNum = gb->table->tupleNum / localSize + 1; 

    if(blockNum < 1024)
        globalSize = blockNum * 128;


    cl_mem gpu_hashNum;
    cl_mem gpu_psum;
    cl_mem gpuGbCount;

    long * cpuOffset = (long *)malloc(sizeof(long) * gb->table->totalAttr);
    CHECK_POINTER(cpuOffset);
    long offset = 0;
    long totalSize = 0;

    for(int i=0;i<gb->table->totalAttr;i++){

        int attrSize = gb->table->attrSize[i];
        int size = attrSize * gb->table->tupleNum;

        cpuOffset[i] = offset;

        /*align each column*/

        if(size % 4 !=0){
            size += 4 - (size%4);
        }

        offset += size;
        totalSize += size;
    }

    gpuContent = clCreateBuffer(context->context,CL_MEM_READ_ONLY, totalSize,NULL,&error);

    for(int i=0;i<gb->table->totalAttr;i++){

        int attrSize = gb->table->attrSize[i];
        int size = attrSize * gb->table->tupleNum;

        if(gb->table->dataPos[i]==MEM){
            error = clEnqueueWriteBuffer(context->queue, gpuContent, CL_TRUE, cpuOffset[i], size, gb->table->content[i],0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
            clWaitForEvents(1, &ndrEvt);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
            pp->pcie += 1e-6 * (endTime - startTime);
#endif
        }else
            error = clEnqueueCopyBuffer(context->queue,(cl_mem)gb->table->content[i],gpuContent,0, cpuOffset[i],size,0,0,0);

    }

    cl_mem gpuOffset = clCreateBuffer(context->context,CL_MEM_READ_ONLY, sizeof(long)*gb->table->totalAttr,NULL,&error);
    clEnqueueWriteBuffer(context->queue,gpuOffset,CL_TRUE,0,sizeof(long)*gb->table->totalAttr,cpuOffset,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    if(gbConstant != 1){

        gpuGbType = clCreateBuffer(context->context,CL_MEM_READ_ONLY,sizeof(int)*gb->groupByColNum,NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbType,CL_TRUE,0,sizeof(int)*gb->groupByColNum,gb->groupByType,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpuGbSize = clCreateBuffer(context->context,CL_MEM_READ_ONLY,sizeof(int)*gb->groupByColNum,NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbSize,CL_TRUE,0,sizeof(int)*gb->groupByColNum,gb->groupBySize,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpuGbKey = clCreateBuffer(context->context,CL_MEM_READ_WRITE,sizeof(int)*gb->table->tupleNum,NULL,&error);

        gpuGbIndex = clCreateBuffer(context->context,CL_MEM_READ_ONLY, sizeof(int)*gb->groupByColNum,NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbIndex,CL_TRUE,0,sizeof(int)*gb->groupByColNum,gb->groupByIndex,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpu_hashNum = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int)*HSIZE,NULL,&error);

        context->kernel = clCreateKernel(context->program,"cl_memset_int",0);

        int tmp = HSIZE;
        clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpu_hashNum);
        clSetKernelArg(context->kernel,1,sizeof(int), (void*)&tmp);

        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        context->kernel = clCreateKernel(context->program, "build_groupby_key",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem),(void *)&gpuContent);
        clSetKernelArg(context->kernel,1,sizeof(cl_mem),(void *)&gpuOffset);
        clSetKernelArg(context->kernel,2,sizeof(int),(void *)&gpuGbColNum);
        clSetKernelArg(context->kernel,3,sizeof(cl_mem),(void *)&gpuGbIndex);
        clSetKernelArg(context->kernel,4,sizeof(cl_mem),(void *)&gpuGbType);
        clSetKernelArg(context->kernel,5,sizeof(cl_mem),(void *)&gpuGbSize);
        clSetKernelArg(context->kernel,6,sizeof(long),(void *)&gpuTupleNum);
        clSetKernelArg(context->kernel,7,sizeof(cl_mem),(void *)&gpuGbKey);
        clSetKernelArg(context->kernel,8,sizeof(cl_mem),(void *)&gpu_hashNum);

        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        clReleaseMemObject(gpuGbType);
        clReleaseMemObject(gpuGbSize);
        clReleaseMemObject(gpuGbIndex);

        gbCount = 1;

        tmp = 0;
        gpuGbCount = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int),NULL,&error);
        clEnqueueWriteBuffer(context->queue,gpuGbCount,CL_TRUE,0,sizeof(int),&tmp,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        int hsize = HSIZE;
        context->kernel = clCreateKernel(context->program, "count_group_num",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem),(void *)&gpu_hashNum);
        clSetKernelArg(context->kernel,1,sizeof(int),(void *)&hsize);
        clSetKernelArg(context->kernel,2,sizeof(cl_mem),(void *)&gpuGbCount);
        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif

        clEnqueueReadBuffer(context->queue, gpuGbCount, CL_TRUE, 0, sizeof(int), &gbCount,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        gpu_psum = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int)*HSIZE,NULL,&error);

        scanImpl(gpu_hashNum,HSIZE,gpu_psum,context,pp);

        clReleaseMemObject(gpuGbCount);
        clReleaseMemObject(gpu_hashNum);
    }

    if(gbConstant == 1)
        res->tupleNum = 1;
    else
        res->tupleNum = gbCount;

    printf("groupBy num %ld\n",res->tupleNum);

    gpuGbType = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue,gpuGbType,CL_TRUE,0,sizeof(int)*res->totalAttr,res->attrType,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    gpuGbSize = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue,gpuGbSize,CL_TRUE,0,sizeof(int)*res->totalAttr,res->attrSize,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    /*
     * @gpuGbExp is the mathExp in each groupBy expression
     * @mathexp stores the math exp for for the group expression that has two operands
     * The reason that we need two variables instead of one is that OpenCL doesn't support pointer to pointer
     * */

    cl_mem gpuGbExp = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(struct mathExp)*res->totalAttr, NULL, &error);
    cl_mem mathexp = clCreateBuffer(context->context, CL_MEM_READ_ONLY, 2*sizeof(struct mathExp)*res->totalAttr, NULL, &error);

    struct mathExp tmpExp[2];
    int * cpuFunc = (int *) malloc(sizeof(int) * res->totalAttr);
    CHECK_POINTER(cpuFunc);

    offset = 0;
    for(int i=0;i<res->totalAttr;i++){

        error = clEnqueueWriteBuffer(context->queue, gpuGbExp, CL_TRUE, offset, sizeof(struct mathExp), &(gb->gbExp[i].exp),0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->pcie += 1e-6 * (endTime - startTime);
#endif

        offset += sizeof(struct mathExp);

        cpuFunc[i] = gb->gbExp[i].func;

        if(gb->gbExp[i].exp.opNum == 2){
            struct mathExp * tmpMath = (struct mathExp *) (gb->gbExp[i].exp.exp);
            tmpExp[0].op = tmpMath[0].op;
            tmpExp[0].opNum = tmpMath[0].opNum;
            tmpExp[0].opType = tmpMath[0].opType;
            tmpExp[0].opValue = tmpMath[0].opValue;

            tmpExp[1].op = tmpMath[1].op;
            tmpExp[1].opNum = tmpMath[1].opNum;
            tmpExp[1].opType = tmpMath[1].opType;
            tmpExp[1].opValue = tmpMath[1].opValue;
            clEnqueueWriteBuffer(context->queue, mathexp, CL_TRUE, 2*i*sizeof(struct mathExp),2*sizeof(struct mathExp),tmpExp,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
            clWaitForEvents(1, &ndrEvt);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
            clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
            pp->pcie += 1e-6 * (endTime - startTime);
#endif
        }
    }

    cl_mem gpuFunc = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*res->totalAttr, NULL, &error);
    clEnqueueWriteBuffer(context->queue,gpuFunc,CL_TRUE,0,sizeof(int)*res->totalAttr,cpuFunc,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    long *resOffset = (long *)malloc(sizeof(long)*res->totalAttr);
    CHECK_POINTER(resOffset);
    
    offset = 0;
    totalSize = 0;
    for(int i=0;i<res->totalAttr;i++){
        
        /*
         * align the output of each column on the boundary of 4
         */

        int size = res->attrSize[i] * res->tupleNum;
        if(size %4 != 0){
            size += 4- (size %4);
        }

        resOffset[i] = offset;
        offset += size; 
        totalSize += size;
    }

    cl_mem gpuResult = clCreateBuffer(context->context,CL_MEM_READ_WRITE, totalSize, NULL, &error);
    cl_mem gpuResOffset = clCreateBuffer(context->context, CL_MEM_READ_ONLY,sizeof(long)*res->totalAttr, NULL,&error);
    clEnqueueWriteBuffer(context->queue,gpuResOffset,CL_TRUE,0,sizeof(long)*res->totalAttr,resOffset,0,0,&ndrEvt);

#ifdef OPENCL_PROFILE
    clWaitForEvents(1, &ndrEvt);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
    clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
    pp->pcie += 1e-6 * (endTime - startTime);
#endif

    gpuGbColNum = res->totalAttr;

    if(gbConstant !=1){
        context->kernel = clCreateKernel(context->program,"agg_cal",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpuContent);
        clSetKernelArg(context->kernel,1,sizeof(cl_mem), (void*)&gpuOffset);
        clSetKernelArg(context->kernel,2,sizeof(int), (void*)&gpuGbColNum);
        clSetKernelArg(context->kernel,3,sizeof(cl_mem), (void*)&gpuGbExp);
        clSetKernelArg(context->kernel,4,sizeof(cl_mem), (void*)&mathexp);
        clSetKernelArg(context->kernel,5,sizeof(cl_mem), (void*)&gpuGbType);
        clSetKernelArg(context->kernel,6,sizeof(cl_mem), (void*)&gpuGbSize);
        clSetKernelArg(context->kernel,7,sizeof(long), (void*)&gpuTupleNum);
        clSetKernelArg(context->kernel,8,sizeof(cl_mem), (void*)&gpuGbKey);
        clSetKernelArg(context->kernel,9,sizeof(cl_mem), (void*)&gpu_psum);
        clSetKernelArg(context->kernel,10,sizeof(cl_mem), (void*)&gpuResult);
        clSetKernelArg(context->kernel,11,sizeof(cl_mem), (void*)&gpuResOffset);
        clSetKernelArg(context->kernel,12,sizeof(cl_mem), (void*)&gpuFunc);

        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif
        
        clReleaseMemObject(gpuGbKey);
        clReleaseMemObject(gpu_psum);
    }else{
        context->kernel = clCreateKernel(context->program,"agg_cal_cons",0);
        clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpuContent);
        clSetKernelArg(context->kernel,1,sizeof(cl_mem), (void*)&gpuOffset);
        clSetKernelArg(context->kernel,2,sizeof(int), (void*)&gpuGbColNum);
        clSetKernelArg(context->kernel,3,sizeof(cl_mem), (void*)&gpuGbExp);
        clSetKernelArg(context->kernel,4,sizeof(cl_mem), (void*)&mathexp);
        clSetKernelArg(context->kernel,5,sizeof(cl_mem), (void*)&gpuGbType);
        clSetKernelArg(context->kernel,6,sizeof(cl_mem), (void*)&gpuGbSize);
        clSetKernelArg(context->kernel,7,sizeof(long), (void*)&gpuTupleNum);
        clSetKernelArg(context->kernel,8,sizeof(cl_mem), (void*)&gpuResult);
        clSetKernelArg(context->kernel,9,sizeof(cl_mem), (void*)&gpuResOffset);
        clSetKernelArg(context->kernel,10,sizeof(cl_mem), (void*)&gpuFunc);

        globalSize = localSize * 4;
        error = clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
        clWaitForEvents(1, &ndrEvt);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
        clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
        pp->kernel += 1e-6 * (endTime - startTime);
#endif
    }

    for(int i=0; i<res->totalAttr;i++){
        res->content[i] = (char *)clCreateBuffer(context->context,CL_MEM_READ_WRITE, res->attrSize[i]*res->tupleNum, NULL, &error); 
        res->dataPos[i] = GPU;
        res->attrTotalSize[i] = res->tupleNum * res->attrSize[i];
        clEnqueueCopyBuffer(context->queue, gpuResult, (cl_mem)res->content[i], resOffset[i],0, res->attrSize[i] * res->tupleNum, 0,0,0);
    }

    free(resOffset);
    free(cpuOffset);

    clFinish(context->queue);
    clReleaseMemObject(gpuContent);
    clReleaseMemObject(gpuResult);
    clReleaseMemObject(gpuOffset);
    clReleaseMemObject(gpuResOffset);
    clReleaseMemObject(gpuGbExp);
    clReleaseMemObject(gpuFunc);

    clock_gettime(CLOCK_REALTIME,&end);
        double timeE = (end.tv_sec -  start.tv_sec)* BILLION + end.tv_nsec - start.tv_nsec;
        printf("GroupBy Time: %lf\n", timeE/(1000*1000));

    return res;
}