// If srcValues and buffer are different, then the original values // are preserved, if they are the same srcValues will be overwritten void octree::gpuSort(my_dev::context &devContext, my_dev::dev_mem<uint4> &srcValues, my_dev::dev_mem<uint4> &output, my_dev::dev_mem<uint4> &buffer, int N, int numberOfBits, int subItems, tree_structure &tree) { #if defined (USE_B40C) sorter->sort(srcValues, output, N); #elif defined(USE_THRUST) && defined(USE_THRUST_96) //Extra buffer values my_dev::dev_mem<uint> permutation(devContext); // Permutation values, for sorting the int4 data my_dev::dev_mem<uint> temp_buffer(devContext); // temporary uint buffer //Permutation has to be allocated after the two previous //allocated buffers, get the right offset int memOffset = permutation.getGlobalMemAllignmentPadding(8*N); memOffset += 8*N; memOffset = permutation.cmalloc_copy(tree.generalBuffer1, N, memOffset); memOffset = temp_buffer.cmalloc_copy(tree.generalBuffer1, N, memOffset); thrust_sort_96b(srcValues, output, temp_buffer, permutation, N); #else //Extra buffer values my_dev::dev_mem<uint> simpleKeys(devContext); //Int keys, my_dev::dev_mem<uint> permutation(devContext); //Permutation values, for sorting the int4 data my_dev::dev_mem<int> output32b(devContext); //Permutation values, for sorting the int4 data my_dev::dev_mem<uint> valuesOutput(devContext); //Buffers for the values which are the indexes //Permutation has to be allocated after the two previous //allocated buffers, get the right offset int memOffset = simpleKeys.getGlobalMemAllignmentPadding(8*N); memOffset += 8*N; memOffset = simpleKeys.cmalloc_copy(tree.generalBuffer1, N, memOffset); memOffset = permutation.cmalloc_copy(tree.generalBuffer1, N, memOffset); memOffset = output32b.cmalloc_copy(tree.generalBuffer1, N, memOffset); memOffset = valuesOutput.cmalloc_copy(tree.generalBuffer1, N, memOffset); //Dimensions for the kernels that shuffle and extract data const int blockSize = 256; extractInt.setWork(N, blockSize); reOrderKeysValues.setWork(N, blockSize); //Idx depends on subitems, z goes first, x last if subitems = 3 //subitems = 3, than idx=2 //subitems = 2, than idx=1 //subitems = 1, than idx=0 //intIdx = subItems-1 int intIdx = subItems-1; //Extracts a 32bit key and fills a sequence extractInt.set_arg<cl_mem>(0, srcValues.p()); extractInt.set_arg<cl_mem>(1, simpleKeys.p()); extractInt.set_arg<cl_mem>(2, permutation.p()); extractInt.set_arg<uint>(3, &N); extractInt.set_arg<int>(4, &intIdx);//bit idx reOrderKeysValues.set_arg<cl_mem>(0, srcValues.p()); reOrderKeysValues.set_arg<cl_mem>(1, output.p()); reOrderKeysValues.set_arg<cl_mem>(2, valuesOutput.p()); reOrderKeysValues.set_arg<uint>(3, &N); extractInt.execute(execStream->s()); #ifdef USE_THRUST thrust_sort_32b(devContext, simpleKeys, permutation, output32b, simpleKeys, valuesOutput,permutation, N, 32); #else //Now sort the first 32bit keys //Using 32bit sort with key and value seperated gpuSort_32b(devContext, simpleKeys, permutation, output32b, simpleKeys, valuesOutput,permutation, N, 32); #endif //Now reorder the main keys //Use output as the new output/src value thing buffer reOrderKeysValues.execute(execStream->s()); if(subItems == 1) { //Only doing one 32bit sort. Data is already in output so done return; } //2nd set of 32bit keys //Idx depends on subitems, z goes first, x last if subitems = 3 //subitems = 3, than idx=1 //subitems = 2, than idx=0 //subitems = 1, completed previous round //intIdx = subItems-2 intIdx = subItems-2; extractInt.set_arg<cl_mem>(0, output.p()); extractInt.set_arg<int>(4, &intIdx);//smem size extractInt.execute(execStream->s()); #ifdef USE_THRUST thrust_sort_32b(devContext, simpleKeys, permutation, output32b, simpleKeys, valuesOutput,permutation, N, 32); #else //Now sort the 2nd 32bit keys //Using 32bit sort with key and value seperated gpuSort_32b(devContext, simpleKeys, permutation, output32b, simpleKeys, valuesOutput,permutation, N, 32); #endif reOrderKeysValues.set_arg<cl_mem>(0, output.p()); reOrderKeysValues.set_arg<cl_mem>(1, buffer.p()); reOrderKeysValues.execute(execStream->s()); if(subItems == 2) { //Doing two 32bit sorts. Data is in buffer //so move the data from buffer to output output.copy(buffer, buffer.get_size()); return; } //3th set of 32bit keys //Idx depends on subitems, z goes first, x last if subitems = 3 //subitems = 3, than idx=0 //subitems = 2, completed previous round //subitems = 1, completed previous round //intIdx = subItems-2 intIdx = 0; extractInt.set_arg<cl_mem>(0, buffer.p()); extractInt.set_arg<int>(4, &intIdx);//integer idx extractInt.execute(execStream->s()); //Now sort the final set of 32bit keys #ifdef USE_THRUST thrust_sort_32b(devContext, simpleKeys, permutation, output32b, simpleKeys, valuesOutput,permutation, N, 32); #else gpuSort_32b(devContext, simpleKeys, permutation, output32b, simpleKeys, valuesOutput,permutation, N, 32); #endif reOrderKeysValues.set_arg<cl_mem>(0, buffer.p()); reOrderKeysValues.set_arg<cl_mem>(1, output.p()); reOrderKeysValues.execute(execStream->s()); #endif // USE_THRUST_96 }
// If srcValues and buffer are different, then the original values // are preserved, if they are the same srcValues will be overwritten void octree::gpuSort(my_dev::context &devContext, my_dev::dev_mem<uint4> &srcValues, my_dev::dev_mem<uint4> &output, my_dev::dev_mem<uint4> &buffer, int N, int numberOfBits, int subItems, tree_structure &tree) { //Extra buffer values // my_dev::dev_mem<uint> simpleKeys(devContext, N); //Int keys, // my_dev::dev_mem<uint> permutation(devContext, N); //Permutation values, for sorting the int4 data // my_dev::dev_mem<int> output32b(devContext, N); //Permutation values, for sorting the int4 data // my_dev::dev_mem<uint> valuesOutput(devContext, N); //Buffers for the values which are the indexes my_dev::dev_mem<uint> simpleKeys(devContext); //Int keys, my_dev::dev_mem<uint> permutation(devContext); //Permutation values, for sorting the int4 data my_dev::dev_mem<int> output32b(devContext); //Permutation values, for sorting the int4 data my_dev::dev_mem<uint> valuesOutput(devContext); //Buffers for the values which are the indexes int prevOffsetSum = getAllignmentOffset(4*N); //The offset of output simpleKeys.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[8*N], 8*N, N, prevOffsetSum + getAllignmentOffset(8*N + prevOffsetSum)); //Ofset 8 since we have 2 uint4 before prevOffsetSum += getAllignmentOffset(8*N + prevOffsetSum); permutation.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[9*N], 9*N, N, prevOffsetSum + getAllignmentOffset(9*N + prevOffsetSum)); //N elements after simpleKeys prevOffsetSum += getAllignmentOffset(9*N + prevOffsetSum); output32b.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[10*N], 10*N, N, prevOffsetSum + getAllignmentOffset(10*N + prevOffsetSum)); //N elements after permutation prevOffsetSum += getAllignmentOffset(10*N + prevOffsetSum); valuesOutput.cmalloc_copy(tree.generalBuffer1.get_pinned(), tree.generalBuffer1.get_flags(), tree.generalBuffer1.get_devMem(), &tree.generalBuffer1[11*N], 11*N, N, prevOffsetSum + getAllignmentOffset(11*N + prevOffsetSum)); //N elements after output32b //Dimensions for the kernels that shuffle and extract data const int blockSize = 256; int ng = (N)/blockSize + 1; int nx = (int)sqrt(ng); int ny = (ng-1)/nx + 1; vector<size_t> localWork(2), globalWork(2); globalWork[0] = nx*blockSize; globalWork[1] = ny; localWork [0] = blockSize; localWork[1] = 1; extractInt.setWork(globalWork, localWork); fillSequence.setWork(globalWork, localWork); reOrderKeysValues.setWork(globalWork, localWork); //Idx depends on subitems, z goes first, x last if subitems = 3 //subitems = 3, than idx=2 //subitems = 2, than idx=1 //subitems = 1, than idx=0 //intIdx = subItems-1 int intIdx = subItems-1; extractInt.set_arg<cl_mem>(0, srcValues.p()); extractInt.set_arg<cl_mem>(1, simpleKeys.p()); extractInt.set_arg<uint>(2, &N); extractInt.set_arg<int>(3, &intIdx);//bit idx fillSequence.set_arg<cl_mem>(0, permutation.p()); fillSequence.set_arg<uint>(1, &N); reOrderKeysValues.set_arg<cl_mem>(0, srcValues.p()); reOrderKeysValues.set_arg<cl_mem>(1, output.p()); reOrderKeysValues.set_arg<cl_mem>(2, valuesOutput.p()); reOrderKeysValues.set_arg<uint>(3, &N); extractInt.execute(); fillSequence.execute(); //Now sort the first 32bit keys //Using 32bit sort with key and value seperated gpuSort_32b(devContext, simpleKeys, permutation, // output32b, aPing32b, output32b, simpleKeys, // valuesOutput,valuesAPing, valuesOutput,permutation, // count, N, 32); //Now reorder the main keys //Use output as the new output/src value thing buffer reOrderKeysValues.execute(); if(subItems == 1) { //Only doing one 32bit sort. Data is already in output so done return; } //2nd set of 32bit keys //Idx depends on subitems, z goes first, x last if subitems = 3 //subitems = 3, than idx=1 //subitems = 2, than idx=0 //subitems = 1, completed previous round //intIdx = subItems-2 intIdx = subItems-2; extractInt.set_arg<cl_mem>(0, output.p()); extractInt.set_arg<int>(3, &intIdx);//smem size reOrderKeysValues.set_arg<cl_mem>(0, output.p()); reOrderKeysValues.set_arg<cl_mem>(1, buffer.p()); extractInt.execute(); fillSequence.execute(); //Now sort the 2nd 32bit keys //Using 32bit sort with key and value seperated gpuSort_32b(devContext, simpleKeys, permutation, output32b, simpleKeys, // output32b, aPing32b, // valuesOutput,valuesAPing, valuesOutput,permutation, //count, N, 32); reOrderKeysValues.execute(); if(subItems == 2) { //Doing two 32bit sorts. Data is in buffer //so move the data from buffer to output output.copy(buffer, buffer.get_size()); return; } //3th set of 32bit keys //Idx depends on subitems, z goes first, x last if subitems = 3 //subitems = 3, than idx=0 //subitems = 2, completed previous round //subitems = 1, completed previous round //intIdx = subItems-2 intIdx = 0; extractInt.set_arg<cl_mem>(0, buffer.p()); extractInt.set_arg<int>(3, &intIdx);//integer idx reOrderKeysValues.set_arg<cl_mem>(0, buffer.p()); reOrderKeysValues.set_arg<cl_mem>(1, output.p()); extractInt.execute(); fillSequence.execute(); //Now sort the 32bit keys //Using int2 with key and value combined //See sortArray4 //Using key and value in a seperate array //Now sort the 2nd 32bit keys //Using 32bit sort with key and value seperated gpuSort_32b(devContext, simpleKeys, permutation, output32b, simpleKeys, // output32b, aPing32b, // valuesOutput,valuesAPing, valuesOutput,permutation, //count, N, 32); reOrderKeysValues.execute(); clFinish(devContext.get_command_queue()); // fprintf(stderr, "sortArray2 done in %g sec (Without memory alloc & compilation) \n", get_time() - t0); }