/**
 * Validates a batched exclusive-scan request for "short" arrays and forwards
 * it to scanExclusiveLocal1.
 *
 * Checks performed (each through oclCheckError against shrTRUE):
 *   - factorRadix2(log2L, arrayLength) must report a remainder of 1
 *     (the power-of-two check);
 *   - arrayLength must lie in [MIN_SHORT_ARRAY_SIZE, MAX_SHORT_ARRAY_SIZE];
 *   - batchSize * arrayLength must not exceed MAX_BATCH_ELEMENTS;
 *   - batchSize * arrayLength must be a multiple of 4 * WORKGROUP_SIZE.
 *
 * @param cqCommandQueue  command queue handed through to scanExclusiveLocal1
 * @param d_Dst           destination buffer handed through to scanExclusiveLocal1
 * @param d_Src           source buffer handed through to scanExclusiveLocal1
 * @param batchSize       number of arrays in the batch
 * @param arrayLength     number of elements per array
 * @return whatever scanExclusiveLocal1 returns for the same arguments
 */
extern "C" size_t scanExclusiveShort(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Src,
    uint batchSize,
    uint arrayLength
) {
    //Check power-of-two factorization
    uint log2L;
    uint factorizationRemainder = factorRadix2(log2L, arrayLength);
    oclCheckError( factorizationRemainder == 1, shrTRUE);

    //Check supported size range
    oclCheckError( (arrayLength >= MIN_SHORT_ARRAY_SIZE) && (arrayLength <= MAX_SHORT_ARRAY_SIZE), shrTRUE );

    // The element count is formed in 64 bits: a 32-bit product can wrap
    // around and make an oversized batch look acceptable to the checks below.
    const unsigned long long totalElements =
        static_cast<unsigned long long>(batchSize) * arrayLength;

    //Check total batch size limit
    oclCheckError( totalElements <= MAX_BATCH_ELEMENTS, shrTRUE );

    //Check all work-groups to be fully packed with data
    oclCheckError( totalElements % (4 * WORKGROUP_SIZE) == 0, shrTRUE);

    return scanExclusiveLocal1(
               cqCommandQueue,
               d_Dst,
               d_Src,
               batchSize,
               arrayLength
           );
}
/**
 * Validates a batched exclusive-scan request for "large" arrays and runs it
 * as three consecutive steps: scanExclusiveLocal1, scanExclusiveLocal2 and
 * uniformUpdate.
 *
 * Checks performed (each through oclCheckError against shrTRUE):
 *   - factorRadix2(log2L, arrayLength) must report a remainder of 1
 *     (the power-of-two check);
 *   - arrayLength must lie in [MIN_LARGE_ARRAY_SIZE, MAX_LARGE_ARRAY_SIZE];
 *   - batchSize * arrayLength must not exceed MAX_BATCH_ELEMENTS.
 *
 * The intermediate buffer d_Buffer is not a parameter; it is defined outside
 * this function.
 *
 * @param cqCommandQueue  command queue used for all three steps
 * @param d_Dst           destination buffer
 * @param d_Src           source buffer
 * @param batchSize       number of arrays in the batch
 * @param arrayLength     number of elements per array
 * @return whatever uniformUpdate returns; the return values of the first two
 *         steps are not inspected
 */
extern "C" size_t scanExclusiveLarge(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Src,
    uint batchSize,
    uint arrayLength
) {
    //Check power-of-two factorization
    uint log2L;
    uint factorizationRemainder = factorRadix2(log2L, arrayLength);
    oclCheckError( factorizationRemainder == 1, shrTRUE);

    //Check supported size range
    oclCheckError( (arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE), shrTRUE );

    // The element count is formed in 64 bits: a 32-bit product can wrap
    // around and make an oversized batch look acceptable to the limit check.
    const unsigned long long totalElements =
        static_cast<unsigned long long>(batchSize) * arrayLength;

    //Check total batch size limit
    oclCheckError( totalElements <= MAX_BATCH_ELEMENTS, shrTRUE );

    // Number of 4 * WORKGROUP_SIZE element chunks covering the whole batch.
    // NOTE(review): the narrowing is lossless once the limit check above has
    // passed, provided MAX_BATCH_ELEMENTS fits in uint and oclCheckError does
    // not return on failure - confirm against their definitions.
    const uint chunkCount = static_cast<uint>(totalElements / (4 * WORKGROUP_SIZE));

    // Step 1: the batch is handed over as chunkCount arrays of
    // 4 * WORKGROUP_SIZE elements each.
    scanExclusiveLocal1(
        cqCommandQueue,
        d_Dst,
        d_Src,
        chunkCount,
        4 * WORKGROUP_SIZE
    );

    // Step 2: one entry per chunk of every array, with d_Buffer as the
    // first buffer argument.
    scanExclusiveLocal2(
        cqCommandQueue,
        d_Buffer,
        d_Dst,
        d_Src,
        batchSize,
        arrayLength / (4 * WORKGROUP_SIZE)
    );

    // Step 3: uniformUpdate receives d_Dst and d_Buffer for all chunks.
    return uniformUpdate(
               cqCommandQueue,
               d_Dst,
               d_Buffer,
               chunkCount
           );
}
Esempio n. 3
0
/**
 * Launches the bitonic sort kernels for a batch of key/value arrays and
 * stores the elapsed GetTickCount64() ticks in times[1].
 *
 * Flow:
 *   - arrayLength < 2: returns immediately.
 *   - arrayLength not a power of two (factorRadix2 remainder != 1): logs
 *     "Array not a power of two" and returns.
 *   - arrayLength <= LOCAL_SIZE_LIMIT: a single kernel_bitonicSortLocal launch.
 *   - otherwise: kernel_bitonicSortLocal1, followed by merge launches for
 *     size = 2 * LOCAL_SIZE_LIMIT, doubling up to arrayLength.
 *
 * Neither early return updates times[1].
 *
 * Failures thrown by setArg are logged and the kernel is still enqueued
 * afterwards. NOTE(review): a launch after a failed setArg runs with whatever
 * arguments the kernel object held before - confirm this is intended.
 *
 * @param d_DstKey     destination key buffer (args 0 and 2 of the merge kernels)
 * @param d_DstVal     destination value buffer (args 1 and 3 of the merge kernels)
 * @param d_SrcKey     source key buffer (arg 2 of the initial sort kernels)
 * @param d_SrcVal     source value buffer (arg 3 of the initial sort kernels)
 * @param batch        number of arrays; scales the global work size
 * @param arrayLength  elements per array; must be a power of two
 * @param dir          sort direction flag; any non-zero value is normalized to 1
 */
void BoidModelSHWay1::bitonicSort(
	cl::Buffer d_DstKey,
	cl::Buffer d_DstVal,
	cl::Buffer d_SrcKey,
	cl::Buffer d_SrcVal,
	unsigned int batch,
	unsigned int arrayLength,
	unsigned int dir
	){

	if (arrayLength < 2)
		return;

	//Only power-of-two array lengths are supported so far
	cl_uint log2L;
	cl_uint factorizationRemainder = factorRadix2(log2L, arrayLength);

	if (factorizationRemainder != 1){
		log("Array not a power of two");
		return;
	}

	dir = (dir != 0);

	// Every launch below uses the same global size. The product is formed in
	// size_t so it is not truncated to 32 bits before being widened.
	const size_t globalWorkSize = static_cast<size_t>(batch) * arrayLength / 2;
	size_t localWorkSize;

	unsigned long long timeNow = GetTickCount64();

	if (arrayLength <= LOCAL_SIZE_LIMIT)
	{
		//Launch bitonicSortLocal
		try
		{
			err = kernel_bitonicSortLocal.setArg(0, d_DstKey);
			err = kernel_bitonicSortLocal.setArg(1, d_DstVal);
			err = kernel_bitonicSortLocal.setArg(2, d_SrcKey);
			err = kernel_bitonicSortLocal.setArg(3, d_SrcVal);
			err = kernel_bitonicSortLocal.setArg(4, arrayLength);
			err = kernel_bitonicSortLocal.setArg(5, dir);
		}
		catch (const cl::Error& er) {
			log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err()));
		}

		localWorkSize = LOCAL_SIZE_LIMIT / 2;

		err = queue.enqueueNDRangeKernel(kernel_bitonicSortLocal, cl::NullRange, cl::NDRange(globalWorkSize), cl::NDRange(localWorkSize), NULL, NULL);
		queue.finish();
	}
	else
	{
		//Launch bitonicSortLocal1
		try
		{
			err = kernel_bitonicSortLocal1.setArg(0, d_DstKey);
			err = kernel_bitonicSortLocal1.setArg(1, d_DstVal);
			err = kernel_bitonicSortLocal1.setArg(2, d_SrcKey);
			err = kernel_bitonicSortLocal1.setArg(3, d_SrcVal);
		}
		catch (const cl::Error& er) {
			log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err()));
		}

		localWorkSize = LOCAL_SIZE_LIMIT / 2;
		err = queue.enqueueNDRangeKernel(kernel_bitonicSortLocal1, cl::NullRange, cl::NDRange(globalWorkSize), cl::NDRange(localWorkSize), NULL, NULL);

		queue.finish();

		for (unsigned int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1)
		{
			for (unsigned stride = size / 2; stride > 0; stride >>= 1)
			{
				if (stride >= LOCAL_SIZE_LIMIT)
				{
					//Launch bitonicMergeGlobal
					localWorkSize = LOCAL_SIZE_LIMIT / 4;

					try
					{
						// The destination buffers are passed in both buffer pairs.
						err = kernel_bitonicMergeGlobal.setArg(0, d_DstKey);
						err = kernel_bitonicMergeGlobal.setArg(1, d_DstVal);
						err = kernel_bitonicMergeGlobal.setArg(2, d_DstKey);
						err = kernel_bitonicMergeGlobal.setArg(3, d_DstVal);
						err = kernel_bitonicMergeGlobal.setArg(4, arrayLength);
						err = kernel_bitonicMergeGlobal.setArg(5, size);
						err = kernel_bitonicMergeGlobal.setArg(6, stride);
						err = kernel_bitonicMergeGlobal.setArg(7, dir);
					}
					catch (const cl::Error& er) {
						log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err()));
					}

					err = queue.enqueueNDRangeKernel(kernel_bitonicMergeGlobal, cl::NullRange, cl::NDRange(globalWorkSize), cl::NDRange(localWorkSize), NULL, NULL);
					queue.finish();
				}
				else
				{
					//Launch bitonicMergeLocal
					localWorkSize = LOCAL_SIZE_LIMIT / 2;

					try
					{
						// Note the argument order: stride is arg 5 and size is
						// arg 6 here, the reverse of bitonicMergeGlobal.
						err = kernel_bitonicMergeLocal.setArg(0, d_DstKey);
						err = kernel_bitonicMergeLocal.setArg(1, d_DstVal);
						err = kernel_bitonicMergeLocal.setArg(2, d_DstKey);
						err = kernel_bitonicMergeLocal.setArg(3, d_DstVal);
						err = kernel_bitonicMergeLocal.setArg(4, arrayLength);
						err = kernel_bitonicMergeLocal.setArg(5, stride);
						err = kernel_bitonicMergeLocal.setArg(6, size);
						err = kernel_bitonicMergeLocal.setArg(7, dir);
					}
					catch (const cl::Error& er) {
						log("ERROR: " + std::string(er.what()) + clHelper->oclErrorString(er.err()));
					}

					err = queue.enqueueNDRangeKernel(kernel_bitonicMergeLocal, cl::NullRange, cl::NDRange(globalWorkSize), cl::NDRange(localWorkSize), NULL, NULL);
					queue.finish();
					// No launches are issued for the smaller strides of this
					// size; presumably bitonicMergeLocal covers them itself -
					// TODO confirm against the kernel source.
					break;
				}
			}
		}
	}
	times[1] = GetTickCount64() - timeNow;
}
Esempio n. 4
0
/*
 * Runs the bitonic sort kernels over `batch` arrays of `arrayLength` elements.
 *
 * Buffer arguments 0..3 of all four kernels are bound on every call, whichever
 * kernels end up being launched: the two sort kernels get the dst and src
 * buffers, the two merge kernels get the dst buffers in both pairs.
 *
 * batch       - number of arrays; scales the global work size
 * arrayLength - elements per array (power of two expected; not enforced here)
 * dir         - direction flag; any non-zero value is normalized to 1
 * cl_dstkey / cl_dstval - destination key / value buffers
 * cl_srckey / cl_srcval - source key / value buffers
 *
 * Returns 0 when arrayLength < 2, otherwise LOCAL_SIZE_LIMIT / 2, the local
 * work size passed to the local kernels.
 */
int Bitonic<T>::Sort(int batch, int arrayLength, int dir,
                    Buffer<T> *cl_dstkey, Buffer<T> *cl_dstval, 
                    Buffer<T> *cl_srckey, Buffer<T> *cl_srcval)
{
    // Nothing to sort.
    if(arrayLength < 2)
    {
        return 0;
    }

    // Sort kernels: destination pair first, then source pair.
    k_bitonicSortLocal.setArg(0, cl_dstkey->getDevicePtr());
    k_bitonicSortLocal.setArg(1, cl_dstval->getDevicePtr());
    k_bitonicSortLocal.setArg(2, cl_srckey->getDevicePtr());
    k_bitonicSortLocal.setArg(3, cl_srcval->getDevicePtr());

    k_bitonicSortLocal1.setArg(0, cl_dstkey->getDevicePtr());
    k_bitonicSortLocal1.setArg(1, cl_dstval->getDevicePtr());
    k_bitonicSortLocal1.setArg(2, cl_srckey->getDevicePtr());
    k_bitonicSortLocal1.setArg(3, cl_srcval->getDevicePtr());

    // Merge kernels: the destination pair is bound twice.
    k_bitonicMergeGlobal.setArg(0, cl_dstkey->getDevicePtr());
    k_bitonicMergeGlobal.setArg(1, cl_dstval->getDevicePtr());
    k_bitonicMergeGlobal.setArg(2, cl_dstkey->getDevicePtr());
    k_bitonicMergeGlobal.setArg(3, cl_dstval->getDevicePtr());

    k_bitonicMergeLocal.setArg(0, cl_dstkey->getDevicePtr());
    k_bitonicMergeLocal.setArg(1, cl_dstval->getDevicePtr());
    k_bitonicMergeLocal.setArg(2, cl_dstkey->getDevicePtr());
    k_bitonicMergeLocal.setArg(3, cl_dstval->getDevicePtr());

    // Only power-of-two array lengths are supported so far.
    // The remainder is computed but never checked in this function.
    cl_uint log2Length;
    cl_uint radix2Remainder = factorRadix2(log2Length, arrayLength);

    // Collapse any non-zero direction to 1.
    dir = (dir != 0) ? 1 : 0;

    // Both sizes are the same for every two-argument launch below.
    int groupSize  = LOCAL_SIZE_LIMIT / 2;
    int launchSize = batch * arrayLength / 2;

    if(arrayLength <= LOCAL_SIZE_LIMIT)
    {
        // Small arrays: a single bitonicSortLocal launch.
        k_bitonicSortLocal.setArg(4, arrayLength);
        k_bitonicSortLocal.setArg(5, dir);
        k_bitonicSortLocal.execute(launchSize, groupSize);

        return groupSize;
    }

    // Large arrays: bitonicSortLocal1 first, then the merge passes.
    k_bitonicSortLocal1.execute(launchSize, groupSize);

    for(uint size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size *= 2)
    {
        unsigned stride = size / 2;

        // One bitonicMergeGlobal launch per stride at or above the limit.
        while(stride > 0 && stride >= LOCAL_SIZE_LIMIT)
        {
            k_bitonicMergeGlobal.setArg(4, arrayLength);
            k_bitonicMergeGlobal.setArg(5, size);
            k_bitonicMergeGlobal.setArg(6, stride);
            k_bitonicMergeGlobal.setArg(7, dir);
            k_bitonicMergeGlobal.execute(launchSize);

            stride /= 2;
        }

        // A single bitonicMergeLocal launch for the first stride below the
        // limit; no launches are issued for the strides after it.
        if(stride > 0)
        {
            // stride is arg 5 and size is arg 6 here, the reverse of
            // bitonicMergeGlobal.
            k_bitonicMergeLocal.setArg(4, arrayLength);
            k_bitonicMergeLocal.setArg(5, stride);
            k_bitonicMergeLocal.setArg(6, size);
            k_bitonicMergeLocal.setArg(7, dir);
            k_bitonicMergeLocal.execute(launchSize, groupSize);
        }
    }

    return groupSize;

    // Disabled copy-back kept from the original:
    //scopy(num, cl_sort_output_hashes.getDevicePtr(), 
	//             cl_sort_hashes.getDevicePtr());
	//scopy(num, cl_sort_output_indices.getDevicePtr(), 
	//             cl_sort_indices.getDevicePtr());
}