示例#1
0
// Sorts N uint4 keys from srcValues and leaves the sorted keys in output.
//
// If srcValues and buffer are different, then the original values
// are preserved, if they are the same srcValues will be overwritten
//
// The implementation is selected at compile time:
//   USE_B40C                      : delegates to sorter->sort().
//   USE_THRUST && USE_THRUST_96   : delegates to thrust_sort_96b() with two
//                                   scratch arrays taken from tree.generalBuffer1.
//   otherwise                     : up to three passes (one per 32bit component,
//                                   see subItems). Each pass extracts a 32bit key,
//                                   sorts it together with a permutation
//                                   (thrust_sort_32b when USE_THRUST is defined,
//                                   gpuSort_32b otherwise) and then reorders the
//                                   uint4 data with that permutation.
//
// Parameters:
//   devContext   : used to construct the scratch dev_mem objects and forwarded
//                  to the 32bit sort routines.
//   srcValues    : the keys to sort.
//   output       : receives the sorted keys.
//   buffer       : destination of the second pass in the multi-pass path; only
//                  touched there when subItems >= 2.
//   N            : number of elements.
//   numberOfBits : not referenced anywhere in this function body.
//   subItems     : number of 32bit components to sort on in the multi-pass
//                  path. The code handles 1, 2 and 3; other values are not
//                  checked.
//   tree         : supplies generalBuffer1, which is reused as scratch memory.
void  octree::gpuSort(my_dev::context &devContext,
                      my_dev::dev_mem<uint4> &srcValues,
                      my_dev::dev_mem<uint4> &output,
                      my_dev::dev_mem<uint4> &buffer,
                      int N, int numberOfBits, int subItems,
                      tree_structure &tree) {

#if defined (USE_B40C)
  //The B40C sorter handles the complete key in a single call
  sorter->sort(srcValues, output, N);

#elif defined(USE_THRUST) && defined(USE_THRUST_96)
  //Extra buffer values
  my_dev::dev_mem<uint> permutation(devContext);   // Permutation values, for sorting the int4 data
  my_dev::dev_mem<uint> temp_buffer(devContext);  // temporary uint buffer
  
  //Permutation has to be allocated after the two previous
  //allocated buffers, get the right offset
  //NOTE(review): the 8*N presumably skips two uint4 arrays of N elements
  //(8*N uints) at the start of generalBuffer1 - confirm against the callers
  int memOffset  = permutation.getGlobalMemAllignmentPadding(8*N);
      memOffset += 8*N; 

      //Each cmalloc_copy call returns the offset that is passed to the next one,
      //the offset returned by the last call is not used
      memOffset = permutation.cmalloc_copy(tree.generalBuffer1, N, memOffset);
      memOffset = temp_buffer.cmalloc_copy(tree.generalBuffer1, N, memOffset);      
      
  thrust_sort_96b(srcValues, output, temp_buffer, permutation, N);
  
#else
  //Extra buffer values
  my_dev::dev_mem<uint> simpleKeys(devContext);    //Int keys,
  my_dev::dev_mem<uint> permutation(devContext);   //Permutation values, for sorting the int4 data
  my_dev::dev_mem<int>  output32b(devContext);       //Third argument of the 32bit sort, presumably its key output - TODO confirm
  my_dev::dev_mem<uint> valuesOutput(devContext);  //Buffers for the values which are the indexes
  
  //Permutation has to be allocated after the two previous
  //allocated buffers, get the right offset
  //NOTE(review): the 8*N presumably skips two uint4 arrays of N elements
  //(8*N uints) at the start of generalBuffer1 - confirm against the callers
  int memOffset = simpleKeys.getGlobalMemAllignmentPadding(8*N);
      memOffset += 8*N; 
      //Chain the four scratch arrays behind each other in generalBuffer1,
      //the offset returned by the last call is not used
      memOffset = simpleKeys.cmalloc_copy(tree.generalBuffer1, N, memOffset);
      memOffset = permutation.cmalloc_copy(tree.generalBuffer1, N, memOffset);   
      memOffset = output32b.cmalloc_copy(tree.generalBuffer1, N, memOffset); 
      memOffset = valuesOutput.cmalloc_copy(tree.generalBuffer1, N, memOffset); 
  
    
  //Dimensions for the kernels that shuffle and extract data
  const int blockSize = 256;
  
  extractInt.setWork(N, blockSize); 
  reOrderKeysValues.setWork(N, blockSize); 

  //Idx depends on subitems, z goes first, x last if subitems = 3
  //subitems = 3, than idx=2
  //subitems = 2, than idx=1
  //subitems = 1, than idx=0
  //intIdx = subItems-1   
  int intIdx = subItems-1;

  //Extracts a 32bit key and fills a sequence
  extractInt.set_arg<cl_mem>(0, srcValues.p());
  extractInt.set_arg<cl_mem>(1, simpleKeys.p());
  extractInt.set_arg<cl_mem>(2, permutation.p());
  extractInt.set_arg<uint>(3, &N);
  extractInt.set_arg<int>(4, &intIdx);//integer idx


  //First pass reorders srcValues into output, using valuesOutput as the index
  //array. Arguments 2 and 3 are set once and reused by the later passes, only
  //the source (0) and destination (1) are rebound.
  reOrderKeysValues.set_arg<cl_mem>(0, srcValues.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());
  reOrderKeysValues.set_arg<cl_mem>(2, valuesOutput.p());
  reOrderKeysValues.set_arg<uint>(3, &N);

  extractInt.execute(execStream->s());
  
  //NOTE(review): for both sort routines the argument roles are presumably
  //(srcKeys, srcValues, keysOut, keysScratch, valuesOut, valuesScratch, N, bits)
  //with simpleKeys/permutation doubling as scratch - confirm against their
  //declarations
  #ifdef USE_THRUST
  
  thrust_sort_32b(devContext, 
                   simpleKeys, permutation,
                   output32b, simpleKeys,
                   valuesOutput,permutation,
                   N, 32);
  
  #else
    //Now sort the first 32bit keys
    //Using 32bit sort with key and value seperated    
    gpuSort_32b(devContext, 
                    simpleKeys, permutation,
                    output32b, simpleKeys,
                    valuesOutput,permutation,
                    N, 32);
  #endif  

    
  //Now reorder the main keys
  //Use output as the new output/src value thing buffer
  reOrderKeysValues.execute(execStream->s());
  
  if(subItems == 1)
  {
    //Only doing one 32bit sort. Data is already in output so done
    return;
  }


  //2nd set of 32bit keys
  //Idx depends on subitems, z goes first, x last if subitems = 3  
  //subitems = 3, than idx=1
  //subitems = 2, than idx=0
  //subitems = 1, completed previous round
  //intIdx = subItems-2   
  intIdx = subItems-2;
  
  //The second pass reads the result of the first pass (output)
  extractInt.set_arg<cl_mem>(0, output.p());
  extractInt.set_arg<int>(4, &intIdx);//integer idx
  extractInt.execute(execStream->s());

  #ifdef USE_THRUST
  
    thrust_sort_32b(devContext, 
                    simpleKeys, permutation,
                    output32b, simpleKeys,
                    valuesOutput,permutation,
                    N, 32);
  
  #else
    //Now sort the 2nd 32bit keys
    //Using 32bit sort with key and value seperated    
    gpuSort_32b(devContext, 
                    simpleKeys, permutation,
                    output32b, simpleKeys,
                    valuesOutput,permutation,
                    N, 32);
  #endif   

  //Reorder from output into buffer, the source and destination have to differ
  reOrderKeysValues.set_arg<cl_mem>(0, output.p());
  reOrderKeysValues.set_arg<cl_mem>(1, buffer.p());
  reOrderKeysValues.execute(execStream->s());

  if(subItems == 2)
  {
    //Doing two 32bit sorts. Data is in buffer
    //so move the data from buffer to output    
    output.copy(buffer, buffer.get_size());    
    return;
  }

  //3th set of 32bit keys
  //Idx depends on subitems, z goes first, x last if subitems = 3  
  //subitems = 3, than idx=0
  //subitems = 2, completed previous round
  //subitems = 1, completed previous round
  //intIdx = subItems-3 (the value is hard-coded to 0 below)
  intIdx = 0;
 
  //The third pass reads the result of the second pass (buffer)
  extractInt.set_arg<cl_mem>(0, buffer.p());
  extractInt.set_arg<int>(4, &intIdx);//integer idx
  extractInt.execute(execStream->s());


  //Now sort the final set of 32bit keys
  #ifdef USE_THRUST  
    thrust_sort_32b(devContext, 
                    simpleKeys, permutation,
                    output32b, simpleKeys,
                    valuesOutput,permutation,
                    N, 32);
  
  #else
    gpuSort_32b(devContext, 
                    simpleKeys, permutation,
                    output32b, simpleKeys,
                    valuesOutput,permutation,
                    N, 32);
  #endif   
  
  //Final reorder from buffer back into output, so the fully sorted keys
  //end up in output
  reOrderKeysValues.set_arg<cl_mem>(0, buffer.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());
  reOrderKeysValues.execute(execStream->s());  
#endif // USE_THRUST_96
}
示例#2
0
// Sorts N uint4 keys from srcValues and leaves the sorted keys in output,
// using up to three passes of a 32bit key/value sort (one pass per 32bit
// component, see subItems). Each pass:
//   1. extractInt copies one 32bit component into simpleKeys,
//   2. fillSequence initialises permutation,
//   3. gpuSort_32b sorts the keys together with the permutation,
//   4. reOrderKeysValues reorders the uint4 data using valuesOutput.
//
// If srcValues and buffer are different, then the original values
// are preserved, if they are the same srcValues will be overwritten
//
// Parameters:
//   devContext   : used to construct the scratch dev_mem objects, forwarded to
//                  gpuSort_32b, and its command queue is passed to clFinish.
//   srcValues    : the keys to sort.
//   output       : receives the sorted keys.
//   buffer       : destination of the second pass; only touched when
//                  subItems >= 2.
//   N            : number of elements.
//   numberOfBits : not referenced anywhere in this function body.
//   subItems     : number of 32bit components to sort on. The code handles
//                  1, 2 and 3; other values are not checked.
//   tree         : supplies generalBuffer1, which is reused as scratch memory.
//
// Note that clFinish() is only reached on the three-pass path, the early
// returns for subItems == 1 and subItems == 2 do not call it.
void  octree::gpuSort(my_dev::context &devContext,
                      my_dev::dev_mem<uint4> &srcValues,
                      my_dev::dev_mem<uint4> &output,
                      my_dev::dev_mem<uint4> &buffer,
                      int N, int numberOfBits, int subItems,
                      tree_structure &tree) {

  //Extra buffer values

  //Previous version that allocated separate device buffers (disabled):
//   my_dev::dev_mem<uint> simpleKeys(devContext, N);    //Int keys,
//   my_dev::dev_mem<uint> permutation(devContext, N);   //Permutation values, for sorting the int4 data
//   my_dev::dev_mem<int> output32b(devContext, N); //Permutation values, for sorting the int4 data
//   my_dev::dev_mem<uint> valuesOutput(devContext, N);  //Buffers for the values which are the indexes
  
  
  //The scratch arrays are views into tree.generalBuffer1 instead
  my_dev::dev_mem<uint> simpleKeys(devContext);    //Int keys,
  my_dev::dev_mem<uint> permutation(devContext);   //Permutation values, for sorting the int4 data
  my_dev::dev_mem<int>  output32b(devContext);       //Third argument of gpuSort_32b, presumably its key output - TODO confirm
  my_dev::dev_mem<uint> valuesOutput(devContext);  //Buffers for the values which are the indexes
  
  //prevOffsetSum accumulates the alignment padding of the arrays that
  //precede the one being set up
  int prevOffsetSum = getAllignmentOffset(4*N); //The offset of output

  
  //The four arrays start at element 8*N, 9*N, 10*N and 11*N of generalBuffer1,
  //each is N elements long and is shifted by the padding computed in the
  //last argument.
  //NOTE(review): the exact meaning of each cmalloc_copy argument is not
  //visible here - confirm against the dev_mem declaration
  simpleKeys.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*N], 8*N,
                          N, prevOffsetSum + getAllignmentOffset(8*N + prevOffsetSum));    //Offset 8 since we have 2 uint4 before
  
  prevOffsetSum += getAllignmentOffset(8*N + prevOffsetSum);
  
  permutation.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[9*N], 9*N,
                          N, prevOffsetSum + getAllignmentOffset(9*N + prevOffsetSum));  //N elements after simpleKeys    

  prevOffsetSum += getAllignmentOffset(9*N + prevOffsetSum);
  

  output32b.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[10*N], 10*N,
                          N, prevOffsetSum + getAllignmentOffset(10*N + prevOffsetSum));  //N elements after permutation      
  
  prevOffsetSum += getAllignmentOffset(10*N + prevOffsetSum);
  
  valuesOutput.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[11*N], 11*N,
                          N, prevOffsetSum + getAllignmentOffset(11*N + prevOffsetSum));  //N elements after output32b        

    
  //Dimensions for the kernels that shuffle and extract data
  //ng work-groups of blockSize items are laid out on a 2D grid of nx by ny
  //groups: nx = floor(sqrt(ng)) and ny = ceil(ng/nx), so nx*ny >= ng
  const int blockSize = 256;
  int ng = (N)/blockSize + 1;
  int nx = (int)sqrt(ng);
  int ny = (ng-1)/nx + 1;

  vector<size_t> localWork(2), globalWork(2);
  globalWork[0] = nx*blockSize;   globalWork[1] = ny;
  localWork [0] = blockSize;       localWork[1] = 1;

  extractInt.setWork(globalWork, localWork);
  fillSequence.setWork(globalWork, localWork);
  reOrderKeysValues.setWork(globalWork, localWork);
  
  //Idx depends on subitems, z goes first, x last if subitems = 3
  //subitems = 3, than idx=2
  //subitems = 2, than idx=1
  //subitems = 1, than idx=0
  //intIdx = subItems-1   
  int intIdx = subItems-1;

  //Extracts the 32bit key with index intIdx from the uint4 data
  extractInt.set_arg<cl_mem>(0, srcValues.p());
  extractInt.set_arg<cl_mem>(1, simpleKeys.p());
  extractInt.set_arg<uint>(2, &N);
  extractInt.set_arg<int>(3, &intIdx);//integer idx

  //Arguments are set once, the kernel is launched again before every sort
  fillSequence.set_arg<cl_mem>(0, permutation.p());
  fillSequence.set_arg<uint>(1, &N);

  //First pass reorders srcValues into output, using valuesOutput as the index
  //array. Arguments 2 and 3 are set once and reused by the later passes, only
  //the source (0) and destination (1) are rebound.
  reOrderKeysValues.set_arg<cl_mem>(0, srcValues.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());
  reOrderKeysValues.set_arg<cl_mem>(2, valuesOutput.p());
  reOrderKeysValues.set_arg<uint>(3, &N);

  extractInt.execute();
  fillSequence.execute();

  //Now sort the first 32bit keys
  //Using 32bit sort with key and value seperated    
  //NOTE(review): judging by the disabled aPing arguments, simpleKeys and
  //permutation are passed a second time as ping-pong/scratch buffers -
  //confirm against the gpuSort_32b declaration
  gpuSort_32b(devContext, 
                   simpleKeys, permutation,
//                     output32b, aPing32b,
                   output32b, simpleKeys,
//                    valuesOutput,valuesAPing,
                   valuesOutput,permutation,
//                   count,
                   N, 32);


  //Now reorder the main keys
  //Use output as the new output/src value thing buffer
  reOrderKeysValues.execute();
  
  if(subItems == 1)
  {
    //Only doing one 32bit sort. Data is already in output so done
    return;
  }


  //2nd set of 32bit keys
  //Idx depends on subitems, z goes first, x last if subitems = 3  
  //subitems = 3, than idx=1
  //subitems = 2, than idx=0
  //subitems = 1, completed previous round
  //intIdx = subItems-2   
  intIdx = subItems-2;
  
  //The second pass reads the result of the first pass (output)
  extractInt.set_arg<cl_mem>(0, output.p());
  extractInt.set_arg<int>(3, &intIdx);//integer idx

  //and reorders from output into buffer
  reOrderKeysValues.set_arg<cl_mem>(0, output.p());
  reOrderKeysValues.set_arg<cl_mem>(1, buffer.p());
 
  extractInt.execute();
  
  fillSequence.execute();

  //Now sort the 2nd 32bit keys
  //Using 32bit sort with key and value seperated    
  gpuSort_32b(devContext, 
                   simpleKeys, permutation,
                   output32b, simpleKeys,
//                    output32b, aPing32b,
//                   valuesOutput,valuesAPing,
                   valuesOutput,permutation,
                   //count,
                   N, 32);
                   
  reOrderKeysValues.execute();
  

  if(subItems == 2)
  {
    //Doing two 32bit sorts. Data is in buffer
    //so move the data from buffer to output    
    output.copy(buffer, buffer.get_size());    
    return;
  }

  //3th set of 32bit keys
  //Idx depends on subitems, z goes first, x last if subitems = 3  
  //subitems = 3, than idx=0
  //subitems = 2, completed previous round
  //subitems = 1, completed previous round
  //intIdx = subItems-3 (the value is hard-coded to 0 below)
  intIdx = 0;
  
  //The third pass reads the result of the second pass (buffer)
  extractInt.set_arg<cl_mem>(0, buffer.p());
  extractInt.set_arg<int>(3, &intIdx);//integer idx

  //and reorders from buffer back into output
  reOrderKeysValues.set_arg<cl_mem>(0, buffer.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());

  extractInt.execute();
  fillSequence.execute();
  //Now sort the 3rd 32bit keys
  //Using 32bit sort with key and value seperated, the keys and the
  //values are kept in separate arrays
  gpuSort_32b(devContext, 
              simpleKeys, permutation,
              output32b, simpleKeys,
//               output32b, aPing32b,
//               valuesOutput,valuesAPing,
              valuesOutput,permutation,
              //count,
              N, 32);  

  reOrderKeysValues.execute();

  //Wait until all commands in the context's command queue have completed
  clFinish(devContext.get_command_queue());

//   fprintf(stderr, "sortArray2 done in %g sec (Without memory alloc & compilation) \n", get_time() - t0);
}