Exemple #1
0
// Launches the directGrav kernel on the gravity stream for the dust
// particles stored in `tree`. Returns without doing anything when the
// tree contains no dust (tree.n_dust == 0).
void octree::direct_dust(tree_structure &tree)
{
  if(tree.n_dust != 0)
  {
    //Launch geometry: groups of 256 work-items, grid rounded up so that
    //every dust particle is covered
    std::vector<size_t> groupDim(2);
    std::vector<size_t> gridDim(2);
    groupDim[0] = 256;
    groupDim[1] = 1;
    gridDim[0]  = 256 * ((tree.n_dust + 255) / 256);
    gridDim[1]  = 1;

    //Kernel arguments
    directGrav.set_arg<cl_mem>(0, tree.dust_acc1.p());
    directGrav.set_arg<cl_mem>(1, tree.dust_pos.p());
    directGrav.set_arg<cl_mem>(2, tree.bodies_Ppos.p());
    directGrav.set_arg<int>(3,    &tree.n_dust);
    directGrav.set_arg<int>(4,    &tree.n);
    directGrav.set_arg<float>(5,  &(this->eps2));
    //NULL + size: presumably a 256-element float4 scratch buffer for the
    //kernel (TODO confirm against set_arg)
    directGrav.set_arg<float4>(6, NULL, 256);

    directGrav.setWork(gridDim, groupDim);
    directGrav.execute(gravStream->s());
  }
}
Exemple #2
0
// Sorts N key/value pairs, processing one bit index per pass.
//
// For every bit index in [0, numberOfBits) the kernels sortCount,
// exScanBlock and sortMove are launched, in that order, on execStream.
// Pass 0 reads srcKeys/srcValues and writes keysOutput/valuesOutput; every
// following pass exchanges the roles of the *Output and *APing buffers, so
// the destination alternates between them.
//
// devContext is not referenced by this function.
void octree::gpuSort_32b(my_dev::context &devContext, 
                    my_dev::dev_mem<uint> &srcKeys,     my_dev::dev_mem<uint> &srcValues,
                    my_dev::dev_mem<int>  &keysOutput,  my_dev::dev_mem<uint> &keysAPing,
                    my_dev::dev_mem<uint> &valuesOutput,my_dev::dev_mem<uint> &valuesAPing,
                    int N, int numberOfBits)
{
  //Bit index of the current pass; handed to the kernels by address
  int bitIdx = 0;

  //Work distribution: 64 items per job (32*2, two items per loop),
  //spread over 480 (=120*4) blocks.
  //devMemCounts / devMemCountsx must have been allocated by the caller.
  setupParams jobSetup;
  jobSetup.jobs                = (N / 64) / 480;
  jobSetup.blocksWithExtraJobs = (N / 64) % 480;
  jobSetup.extraElements       = N % 64;
  jobSetup.extraOffset         = N - jobSetup.extraElements;

  vector<size_t> grid(2);
  vector<size_t> group(2);

  //---- Stage 1: counting kernel ----
  sortCount.set_arg<cl_mem>(0, srcKeys.p());
  sortCount.set_arg<cl_mem>(1, this->devMemCounts.p());
  sortCount.set_arg<uint>(2, &N);
  sortCount.set_arg<int>(3, NULL, 128);           //shared memory size
  sortCount.set_arg<setupParams>(4, &jobSetup);
  sortCount.set_arg<int>(5, &bitIdx);

  grid [0] = 32*120;  grid [1] = 4;
  group[0] = 32;      group[1] = 4;
  sortCount.setWork(grid, group);

  //---- Stage 2: exclusive scan over the per-block counts ----
  int scanBlocks = 120*4;
  exScanBlock.set_arg<cl_mem>(0, this->devMemCounts.p());
  exScanBlock.set_arg<int>(1, &scanBlocks);
  exScanBlock.set_arg<cl_mem>(2, this->devMemCountsx.p());
  exScanBlock.set_arg<int>(3, NULL, 512);         //shared memory allocation

  grid [0] = 512;  grid [1] = 1;
  group[0] = 512;  group[1] = 1;
  exScanBlock.setWork(grid, group);

  //---- Stage 3: move keys and values to their new positions ----
  sortMove.set_arg<cl_mem>(0, srcKeys.p());
  sortMove.set_arg<cl_mem>(1, keysOutput.p());
  sortMove.set_arg<cl_mem>(2, srcValues.p());
  sortMove.set_arg<cl_mem>(3, valuesOutput.p());
  sortMove.set_arg<cl_mem>(4, this->devMemCounts.p());
  sortMove.set_arg<uint>(5, &N);
  sortMove.set_arg<uint>(6, NULL, 192);           //dynamic shared memory 128+64, prefix sum buffer
  sortMove.set_arg<uint>(7, NULL, 64*4);          //dynamic shared memory, stage buffer
  sortMove.set_arg<uint>(8, NULL, 64*4);          //dynamic shared memory, stage_values buffer
  sortMove.set_arg<setupParams>(9, &jobSetup);
  sortMove.set_arg<int>(10, &bitIdx);

  grid [0] = 120*32;  grid [1] = 4;
  group[0] = 32;      group[1] = 4;
  sortMove.setWork(grid, group);

  //---- Pass for bit 0: src -> Output ----
  sortCount.execute(execStream->s());
  exScanBlock.execute(execStream->s());
  sortMove.execute(execStream->s());

  //From now on the Output buffers are the source and the APing buffers
  //the destination
  sortCount.set_arg<cl_mem>(0, keysOutput.p());
  sortMove.set_arg<cl_mem>(0, keysOutput.p());
  sortMove.set_arg<cl_mem>(1, keysAPing.p());
  sortMove.set_arg<cl_mem>(2, valuesOutput.p());
  sortMove.set_arg<cl_mem>(3, valuesAPing.p());

  //True while the APing buffers are bound as the source of the next pass
  bool sourceIsPing = false;

  //---- Passes for the remaining bits, ping-ponging the buffers ----
  for(bitIdx = 1; bitIdx < numberOfBits; ++bitIdx)
  {
    sortCount.set_arg<int>(5, &bitIdx);
    sortMove.set_arg<int>(10, &bitIdx);

    sortCount.execute(execStream->s());
    exScanBlock.execute(execStream->s());
    sortMove.execute(execStream->s());

    //Exchange source and destination for the next pass
    sourceIsPing = !sourceIsPing;
    if(sourceIsPing)
    {
      sortCount.set_arg<cl_mem>(0, keysAPing.p());

      sortMove.set_arg<cl_mem>(0, keysAPing.p());
      sortMove.set_arg<cl_mem>(1, keysOutput.p());

      sortMove.set_arg<cl_mem>(2, valuesAPing.p());
      sortMove.set_arg<cl_mem>(3, valuesOutput.p());
    }
    else
    {
      sortCount.set_arg<cl_mem>(0, keysOutput.p());

      sortMove.set_arg<cl_mem>(0, keysOutput.p());
      sortMove.set_arg<cl_mem>(1, keysAPing.p());

      sortMove.set_arg<cl_mem>(2, valuesOutput.p());
      sortMove.set_arg<cl_mem>(3, valuesAPing.p());
    }
  }
}
Exemple #3
0
//Partitions an array of integers on the device. Each entry of srcValues
//carries a validity flag (1 == valid, anything else counts as invalid);
//'output' receives all valid entries first, followed by the invalid ones.
//The kernels compactCount, exScanBlock and splitMove are launched on
//execStream. When validCount is non-NULL the number of valid items is
//copied back to the host and stored in *validCount; when it is NULL the
//count stays on the device in devMemCountsx.
//devContext is not referenced by this function.
void octree::gpuSplit(my_dev::context &devContext, 
                      my_dev::dev_mem<uint> &srcValues,
                      my_dev::dev_mem<uint> &output,                        
                      int N, 
                      int *validCount)  // if validCount NULL leave count on device
{
  //devMemCounts and devMemCountsx are expected to be allocated outside
  //of this function. Make sure a previous reset of devMemCountsx finished
  //before it is used again.
  this->devMemCountsx.waitForCopyEvent();

  //Work distribution: 64 items per job (32*2, two items per loop),
  //spread over 480 (=120*4) blocks
  setupParams jobSetup;
  jobSetup.jobs                = (N / 64) / 480;
  jobSetup.blocksWithExtraJobs = (N / 64) % 480;
  jobSetup.extraElements       = N % 64;
  jobSetup.extraOffset         = N - jobSetup.extraElements;

  vector<size_t> grid(2);
  vector<size_t> group(2);

  //---- Stage 1: count the valid items per block ----
  compactCount.set_arg<cl_mem>(0, srcValues.p());
  compactCount.set_arg<cl_mem>(1, this->devMemCounts.p());
  compactCount.set_arg<uint>(2, &N);
  compactCount.set_arg<int>(3, NULL, 128);
  compactCount.set_arg<setupParams>(4, &jobSetup);
  compactCount.set_arg<cl_mem>(5, this->devMemCountsx.p());

  grid [0] = 32*120;  grid [1] = 4;
  group[0] = 32;      group[1] = 4;
  compactCount.setWork(grid, group);

  //---- Stage 2: exclusive scan over the per-block counts ----
  int scanBlocks = 120*4;
  exScanBlock.set_arg<cl_mem>(0, this->devMemCounts.p());
  exScanBlock.set_arg<int>(1, &scanBlocks);
  exScanBlock.set_arg<cl_mem>(2, this->devMemCountsx.p());
  exScanBlock.set_arg<int>(3, NULL, 512);       //shared memory allocation

  grid [0] = 512;  grid [1] = 1;
  group[0] = 512;  group[1] = 1;
  exScanBlock.setWork(grid, group);

  //---- Stage 3: move the items to their final position ----
  splitMove.set_arg<cl_mem>(0, srcValues.p());
  splitMove.set_arg<cl_mem>(1, output.p());
  splitMove.set_arg<cl_mem>(2, this->devMemCounts.p());
  splitMove.set_arg<uint>(3, &N);
  splitMove.set_arg<uint>(4, NULL, 192);        //dynamic shared memory
  splitMove.set_arg<setupParams>(5, &jobSetup);

  grid [0] = 120*32;  grid [1] = 4;
  group[0] = 32;      group[1] = 4;
  splitMove.setWork(grid, group);

  //---- Launch the three stages ----
  compactCount.execute(execStream->s());
  exScanBlock.execute(execStream->s());
  splitMove.execute(execStream->s());

  //Caller does not want the count on the host
  if (validCount == NULL) return;

  this->devMemCountsx.d2h();
  *validCount = this->devMemCountsx[0];
}
Exemple #4
0
// Computes the properties of `tree` by forwarding to
// compute_properties_double().
//
// The former single-precision kernel sequence (propsLeaf / propsNonLeaf /
// propsScaling) that used to live in this function was compiled out with
// `#if 0` and began with an "not up to date anymore" message followed by
// exit(0), so it could never run; that dead code has been dropped.
void octree::compute_properties (tree_structure &tree) {
  compute_properties_double(tree);
}
Exemple #5
0
// Runs the double-precision tree-property kernels (propsLeafD,
// propsNonLeafD per level, propsScalingD), then copyNodeDataToGroupData to
// fill the group centre/size arrays, and finally calls
// getBoundariesGroups() to obtain the local domain boundary.
//
// Scratch storage is carved out of tree.generalBuffer1. Offsets are counted
// in uint and every start is additionally shifted by getAllignmentOffset():
//   - multipoleD      : 3*n_nodes double4, starts at 0
//   - nodeLowerBounds : n_nodes entries,   starts at 8*3*n_nodes
//   - nodeUpperBounds : n_nodes entries,   starts at 8*3*n_nodes + 4*n_nodes
//
// With INDSOFT defined, the first three multipole entries (the root node)
// are copied back asynchronously and maxLocalEps is taken from
// tree.multipole[1].w.
void octree::compute_properties_double(tree_structure &tree) {

  /*****************************************************
    Assign the memory buffers, note that we check the size first
    and if needed we increase the size of the generalBuffer1
    Size required:
      - multipoleD -> double4*3_n_nodes -> 6*n_nodes*uint4
      - lower/upperbounds ->               2*n_nodes*uint4
      - node lower/upper  ->               2*n_nodes*uint4
      - SUM: 10*n_nodes*uint4
      - generalBuffer1 has default size: 3*N*uint4

    If 10*n_nodes > 3*N the buffer is resized to the full 10*n_nodes*uint4
    budget (expressed in uint, hence the factor 4), so that the size
    requested matches the size tested for and the alignment padding of the
    sub-buffers has room.
   *****************************************************/

  if(10*tree.n_nodes > 3*tree.n)
  {
    fprintf(stderr, "Resizeing the generalBuffer1 \n");
    tree.generalBuffer1.cresize(10*tree.n_nodes*4, false);
  }

  my_dev::dev_mem<double4> multipoleD(devContext);

  multipoleD.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[0], 0,
                          3*tree.n_nodes, getAllignmentOffset(0));

  //Offset is in uint, so: double4 = 8uint*3*n_nodes
  tree.nodeLowerBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*3*tree.n_nodes],  8*3*tree.n_nodes,
                          tree.n_nodes, getAllignmentOffset(8*3*tree.n_nodes));

  //Alignment shift already applied to nodeLowerBounds; the next
  //sub-buffer has to account for it as well
  int prevOffsetSum = getAllignmentOffset(8*3*tree.n_nodes);

  tree.nodeUpperBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*3*tree.n_nodes + 4*tree.n_nodes],
                          8*3*tree.n_nodes + 4*tree.n_nodes,
                          tree.n_nodes,
                          prevOffsetSum + getAllignmentOffset(8*3*tree.n_nodes + 4*tree.n_nodes + prevOffsetSum));


  //Computes the tree-properties (size, cm, monopole, quadropole, etc)
  //start the kernel for the leaf-type nodes
  propsLeafD.set_arg<int>(0,    &tree.n_leafs);
  propsLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsLeafD.set_arg<cl_mem>(2, tree.node_bodies.p());
  propsLeafD.set_arg<cl_mem>(3, tree.bodies_Ppos.p());
  propsLeafD.set_arg<cl_mem>(4, multipoleD.p());
  propsLeafD.set_arg<cl_mem>(5, tree.nodeLowerBounds.p());
  propsLeafD.set_arg<cl_mem>(6, tree.nodeUpperBounds.p());
  propsLeafD.set_arg<cl_mem>(7, tree.bodies_Pvel.p());  //Velocity to get max eps

  propsLeafD.setWork(tree.n_leafs, 128);
  printf("PropsLeaf: "); propsLeafD.printWorkSize();
  propsLeafD.execute();


  //Non-leaf nodes; argument 0 (the level) is set inside the loop below
  propsNonLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsNonLeafD.set_arg<cl_mem>(2, tree.node_level_list.p());
  propsNonLeafD.set_arg<cl_mem>(3, tree.n_children.p());
  propsNonLeafD.set_arg<cl_mem>(4, multipoleD.p());
  propsNonLeafD.set_arg<cl_mem>(5, tree.nodeLowerBounds.p());
  propsNonLeafD.set_arg<cl_mem>(6, tree.nodeUpperBounds.p());

  //Work from the bottom up, one kernel launch per level
  for(int i=tree.n_levels; i >= 1; i--)
  {
      int totalOnThisLevel = tree.node_level_list[i]-tree.node_level_list[i-1];

      propsNonLeafD.setWork(totalOnThisLevel, 128);

      printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t", i, totalOnThisLevel,
             tree.node_level_list[i-1], tree.node_level_list[i]);
      propsNonLeafD.printWorkSize();

      propsNonLeafD.set_arg<int>(0,    &i); //set the level
      propsNonLeafD.execute();
  }


  float theta2 = theta;

  propsScalingD.set_arg<int>(0,    &tree.n_nodes);
  propsScalingD.set_arg<real4>(1,  &tree.corner);
  propsScalingD.set_arg<cl_mem>(2, multipoleD.p());
  propsScalingD.set_arg<cl_mem>(3, tree.nodeLowerBounds.p());
  propsScalingD.set_arg<cl_mem>(4, tree.nodeUpperBounds.p());
  propsScalingD.set_arg<cl_mem>(5, tree.n_children.p());
  propsScalingD.set_arg<cl_mem>(6, tree.multipole.p());
  propsScalingD.set_arg<float >(7, &theta2);
  propsScalingD.set_arg<cl_mem>(8, tree.boxSizeInfo.p());
  propsScalingD.set_arg<cl_mem>(9, tree.boxCenterInfo.p());
  propsScalingD.set_arg<cl_mem>(10, tree.node_bodies.p());

  propsScalingD.setWork(tree.n_nodes, 128);
  printf("propsScaling: \t "); propsScalingD.printWorkSize();
  propsScalingD.execute();


  #ifdef INDSOFT
    //If we use individual softening we need to get the max softening value
    //to be broadcasted during the exchange of the LET boundaries.
    //Only copy the root node that contains the max value
    my_dev::dev_stream memCpyStream;
    tree.multipole.d2h(3, false, memCpyStream.s());
  #endif


  //Set the group properties, note that it is not based on the nodes anymore
  //but on self created groups based on particle order setPHGroupData
  copyNodeDataToGroupData.set_arg<int>(0,    &tree.n_groups);
  copyNodeDataToGroupData.set_arg<int>(1,    &tree.n);
  copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.bodies_Ppos.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_list_test.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.groupCenterInfo.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.groupSizeInfo.p());
  copyNodeDataToGroupData.setWork(-1, NCRIT, tree.n_groups);
  copyNodeDataToGroupData.printWorkSize();
  copyNodeDataToGroupData.execute();

  #ifdef INDSOFT
    //Wait for the root-node copy started above before reading it
    memCpyStream.sync();
    this->maxLocalEps = tree.multipole[0*3 + 1].w; //Softening value
  #endif

  //Get the local domain boundary based on group positions and sizes
  real4 r_min, r_max;
  getBoundariesGroups(tree, r_min, r_max);

  #if 0
    //Debug aid: write the tree structure and the particles to file

    string nodeFileName = "fullTreeStructure.txt";
    char fileName[256];
    sprintf(fileName, "fullTreeStructure-%d.txt", mpiGetRank());
    ofstream nodeFile;
    //nodeFile.open(nodeFileName.c_str());
    nodeFile.open(fileName);
    tree.multipole.d2h();
    tree.boxSizeInfo.d2h();
    tree.boxCenterInfo.d2h();

    for(int i=0; i < tree.n_nodes; i++)
    {
      //nodeFile << i << "\t" << tree.boxCenterInfo[i].x << "\t" << tree.boxCenterInfo[i].y;
      //nodeFile << "\t" << 2*tree.boxSizeInfo[i].x << "\t" << 2*tree.boxSizeInfo[i].y << "\t";

      nodeFile << i << "\t" << tree.boxCenterInfo[i].x-tree.boxSizeInfo[i].x << "\t" << tree.boxCenterInfo[i].y-tree.boxSizeInfo[i].y;
      nodeFile << "\t"      << tree.boxCenterInfo[i].x+tree.boxSizeInfo[i].x << "\t" << tree.boxCenterInfo[i].y+tree.boxSizeInfo[i].y << "\t";

      nodeFile << tree.multipole[i*3+0].x << "\t" << tree.multipole[i*3+0].w << "\n";
    }

    nodeFile.close();


    sprintf(fileName, "fullTreeStructureParticles-%d.txt", mpiGetRank());
    ofstream partFile;
    partFile.open(fileName);
    tree.bodies_pos.d2h();

    for(int i=0; i < tree.n; i++)
    {
      float4  pos =  tree.bodies_pos[i];
      partFile << i << "\t" << pos.x << "\t" << pos.y << "\t" << pos.z << endl;
    }
    partFile.close();

  #endif

}
Exemple #6
0
// Sorts N uint4 entries of srcValues into 'output' by running up to three
// consecutive 32-bit sorts (one per sub-item, selected by intIdx) and
// reordering the uint4 data after each of them.
//
// If srcValues and buffer are different, then the original values
// are preserved, if they are the same srcValues will be overwritten
//
// Parameters:
//   devContext   - device context used for the temporary buffers and the
//                  final clFinish
//   srcValues    - input data
//   output       - receives the final result on every return path
//   buffer       - intermediate target of the second pass
//   N            - number of entries
//   numberOfBits - not referenced in this function; every call to
//                  gpuSort_32b below requests 32 bits
//   subItems     - number of 32-bit sub-items to sort on (1, 2 or 3 are
//                  handled explicitly below)
//   tree         - supplies generalBuffer1, from which the temporary
//                  buffers are carved
void  octree::gpuSort(my_dev::context &devContext,
                      my_dev::dev_mem<uint4> &srcValues,
                      my_dev::dev_mem<uint4> &output,
                      my_dev::dev_mem<uint4> &buffer,
                      int N, int numberOfBits, int subItems,
                      tree_structure &tree) {

  //Temporary buffers. They are not allocated separately; each one is
  //attached below (cmalloc_copy) to a slice of tree.generalBuffer1.
  
  
  my_dev::dev_mem<uint> simpleKeys(devContext);    //32-bit keys extracted from the uint4 data
  my_dev::dev_mem<uint> permutation(devContext);   //Permutation values, for sorting the int4 data
  my_dev::dev_mem<int>  output32b(devContext);       //Key output buffer handed to gpuSort_32b
  my_dev::dev_mem<uint> valuesOutput(devContext);  //Buffers for the values which are the indexes
  
  int prevOffsetSum = getAllignmentOffset(4*N); //The offset of output

  
  //Slice starting at uint offset 8*N (offset 8 since we have 2 uint4 before)
  simpleKeys.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*N], 8*N,
                          N, prevOffsetSum + getAllignmentOffset(8*N + prevOffsetSum));
  
  //Accumulate the alignment shifts so that every following slice is
  //placed behind the (shifted) previous one
  prevOffsetSum += getAllignmentOffset(8*N + prevOffsetSum);
  
  //N elements after simpleKeys
  permutation.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[9*N], 9*N,
                          N, prevOffsetSum + getAllignmentOffset(9*N + prevOffsetSum));

  prevOffsetSum += getAllignmentOffset(9*N + prevOffsetSum);
  

  //N elements after permutation
  output32b.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[10*N], 10*N,
                          N, prevOffsetSum + getAllignmentOffset(10*N + prevOffsetSum));
  
  prevOffsetSum += getAllignmentOffset(10*N + prevOffsetSum);
  
  //N elements after output32b
  valuesOutput.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[11*N], 11*N,
                          N, prevOffsetSum + getAllignmentOffset(11*N + prevOffsetSum));

    
  //Dimensions for the kernels that shuffle and extract data:
  //ng groups of blockSize items, arranged as an nx * ny grid
  const int blockSize = 256;
  int ng = (N)/blockSize + 1;
  int nx = (int)sqrt(ng);
  int ny = (ng-1)/nx + 1;

  vector<size_t> localWork(2), globalWork(2);
  globalWork[0] = nx*blockSize;   globalWork[1] = ny;
  localWork [0] = blockSize;       localWork[1] = 1;

  extractInt.setWork(globalWork, localWork);
  fillSequence.setWork(globalWork, localWork);
  reOrderKeysValues.setWork(globalWork, localWork);
  
  //---------------- First 32-bit pass ----------------
  //Idx depends on subitems, z goes first, x last if subitems = 3
  //subitems = 3, than idx=2
  //subitems = 2, than idx=1
  //subitems = 1, than idx=0
  //intIdx = subItems-1   
  int intIdx = subItems-1;

  extractInt.set_arg<cl_mem>(0, srcValues.p());
  extractInt.set_arg<cl_mem>(1, simpleKeys.p());
  extractInt.set_arg<uint>(2, &N);
  extractInt.set_arg<int>(3, &intIdx);//index of the sub-item to extract

  fillSequence.set_arg<cl_mem>(0, permutation.p());
  fillSequence.set_arg<uint>(1, &N);

  reOrderKeysValues.set_arg<cl_mem>(0, srcValues.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());
  reOrderKeysValues.set_arg<cl_mem>(2, valuesOutput.p());
  reOrderKeysValues.set_arg<uint>(3, &N);

  extractInt.execute();
  fillSequence.execute();

  //Now sort the first 32bit keys
  //Using 32bit sort with key and value seperated.
  //simpleKeys / permutation are passed both as the source and as the
  //ping buffers of gpuSort_32b.
  gpuSort_32b(devContext, 
                   simpleKeys, permutation,
                   output32b, simpleKeys,
                   valuesOutput,permutation,
                   N, 32);


  //Now reorder the main keys: srcValues -> output
  reOrderKeysValues.execute();
  
  if(subItems == 1)
  {
    //Only doing one 32bit sort. Data is already in output so done
    return;
  }


  //---------------- Second 32-bit pass ----------------
  //Idx depends on subitems, z goes first, x last if subitems = 3  
  //subitems = 3, than idx=1
  //subitems = 2, than idx=0
  //subitems = 1, completed previous round
  //intIdx = subItems-2   
  intIdx = subItems-2;
  
  extractInt.set_arg<cl_mem>(0, output.p());
  extractInt.set_arg<int>(3, &intIdx);//index of the sub-item to extract

  //This pass reorders output -> buffer
  reOrderKeysValues.set_arg<cl_mem>(0, output.p());
  reOrderKeysValues.set_arg<cl_mem>(1, buffer.p());
 
  extractInt.execute();
  
  fillSequence.execute();

  //Now sort the 2nd 32bit keys
  //Using 32bit sort with key and value seperated    
  gpuSort_32b(devContext, 
                   simpleKeys, permutation,
                   output32b, simpleKeys,
                   valuesOutput,permutation,
                   N, 32);
                   
  reOrderKeysValues.execute();
  

  if(subItems == 2)
  {
    //Doing two 32bit sorts. Data is in buffer
    //so move the data from buffer to output    
    output.copy(buffer, buffer.get_size());    
    return;
  }

  //---------------- Third 32-bit pass ----------------
  //Idx depends on subitems, z goes first, x last if subitems = 3  
  //subitems = 3, than idx=0
  //subitems = 2, completed previous round
  //subitems = 1, completed previous round
  //intIdx is therefore always 0 here
  intIdx = 0;
  
  extractInt.set_arg<cl_mem>(0, buffer.p());
  extractInt.set_arg<int>(3, &intIdx);//index of the sub-item to extract

  //This pass reorders buffer -> output
  reOrderKeysValues.set_arg<cl_mem>(0, buffer.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());

  extractInt.execute();
  fillSequence.execute();
  //Now sort the 3rd 32bit keys
  //Using 32bit sort with key and value seperated    
  gpuSort_32b(devContext, 
              simpleKeys, permutation,
              output32b, simpleKeys,
              valuesOutput,permutation,
              N, 32);  

  reOrderKeysValues.execute();

  //Only this three-pass path waits for the device; the early returns
  //above leave without calling clFinish
  clFinish(devContext.get_command_queue());

}
Exemple #7
0
//Splits an array of integers. Each entry of srcValues carries a validity
//flag (1 == valid, anything else counts as invalid); 'output' receives all
//valid entries first, followed by the invalid ones.
//
//The kernels compactCount, exScanBlock and splitMove are launched and the
//function then waits for the device to finish.
//
//  devContext - device context; its command queue is used for the
//               final synchronisation in the non-CUDA build
//  srcValues  - input values including the validity flag
//  output     - receives the split values
//  N          - number of items
//  validCount - if non-NULL, receives the total number of valid items;
//               if NULL the count is left on the device in devMemCountsx
void octree::gpuSplit(my_dev::context &devContext, 
                        my_dev::dev_mem<uint> &srcValues,
                        my_dev::dev_mem<uint> &output,                        
                        int N, int *validCount)
{
  //devMemCounts and devMemCountsx are expected to be allocated outside
  //of this function.

  //Kernel configuration parameters
  setupParams sParam;
  sParam.jobs = (N / 64) / 480  ; //64=32*2 2 items per look, 480 is 120*4, number of procs
  sParam.blocksWithExtraJobs = (N / 64) % 480; 
  sParam.extraElements = N % 64;
  sParam.extraOffset = N - sParam.extraElements;

  //---- Stage 1: count the valid items per block ----
  compactCount.set_arg<cl_mem>(0, srcValues.p());
  compactCount.set_arg<cl_mem>(1, this->devMemCounts.p());
  compactCount.set_arg<uint>(2, &N);
  compactCount.set_arg<int>(3, NULL, 128);
  compactCount.set_arg<setupParams>(4, &sParam);

  vector<size_t> localWork(2), globalWork(2);
  globalWork[0] = 32*120;   globalWork[1] = 4;
  localWork [0] = 32;       localWork[1] = 4;   
  compactCount.setWork(globalWork, localWork);

  //---- Stage 2: exclusive scan over the per-block counts ----
  exScanBlock.set_arg<cl_mem>(0, this->devMemCounts.p());  
  int blocks = 120*4;
  exScanBlock.set_arg<int>(1, &blocks);
  exScanBlock.set_arg<cl_mem>(2, this->devMemCountsx.p());
  exScanBlock.set_arg<int>(3, NULL, 512); //shared memory allocation

  globalWork[0] = 512; globalWork[1] = 1;
  localWork [0] = 512; localWork [1] = 1;

  exScanBlock.setWork(globalWork, localWork);

  //---- Stage 3: move the items to their final position ----
  splitMove.set_arg<cl_mem>(0, srcValues.p());
  splitMove.set_arg<cl_mem>(1, output.p());
  splitMove.set_arg<cl_mem>(2, this->devMemCounts.p());
  splitMove.set_arg<uint>(3, &N);
  splitMove.set_arg<uint>(4, NULL, 192); //Dynamic shared memory
  splitMove.set_arg<setupParams>(5, &sParam);

  globalWork[0] = 120*32;  globalWork[1] = 4;
  localWork [0] = 32;      localWork [1] = 4;

  splitMove.setWork(globalWork, localWork);

  //---- Launch the three stages ----
  compactCount.execute();
  exScanBlock.execute();
  splitMove.execute();

  //TODO fix the damn clFinish function
  #ifdef USE_CUDA
    cuCtxSynchronize();
  #else
    clFinish(devContext.get_command_queue());
  #endif

  //Only fetch the count when the caller asked for it; a NULL pointer
  //means the count stays on the device and must not be dereferenced
  if (validCount) {
    this->devMemCountsx.d2h();
    *validCount = this->devMemCountsx[0];
  }
}
/**
 * Launches the kernels that compute the per-node properties of the tree.
 *
 * Three kernels are run in sequence:
 *   1. propsLeafD     - once, over the leaf nodes.
 *   2. propsNonLeafD  - once per level, from level tree.n_levels down to 1.
 *   3. propsScalingD  - once, over all nodes; it additionally receives
 *                       tree.multipole, tree.boxSizeInfo and
 *                       tree.boxCenterInfo (presumably its outputs - confirm
 *                       against the kernel source).
 *
 * The intermediate buffers (multipoleD, nodeLowerBounds, nodeUpperBounds)
 * are not separately allocated; they are set up with cmalloc_copy on top of
 * tree.generalBuffer1, which is enlarged first when it is too small.
 *
 * @param tree        Tree whose node properties are computed.
 * @param bodies_pos  Particle positions, passed to the leaf kernel.
 * @param n_bodies    Not referenced by this function.
 */
void octree::compute_properties(tree_structure &tree,  my_dev::dev_mem<float4>  &bodies_pos, int n_bodies) {

    /*****************************************************
      Assign the memory buffers, note that we check the size first
      and if needed we increase the size of the generalBuffer1
      Size required:
        - multipoleD -> double4*3_n_nodes -> 2*3*n_nodes*uint4
        - lower/upperbounds ->               2*n_nodes*uint4
        - node lower/upper  ->               2*n_nodes*uint4
        - SUM:                               10*n_nodes*uint4
        - generalBuffer1 has default size: 3*N*uint4

      check if 10*n_nodes < 3*N if so realloc
      (Note that generalBuffer might be larger because of tree-walk stack)
     *****************************************************/

    if(10*tree.n_nodes > 3*tree.n)
    {
#ifdef _DEBUG_PRINT_
        fprintf(stderr, "Resizeing the generalBuffer1 \n");
#endif
        //10*n_nodes uint4 elements, expressed in uint (hence the *4)
        tree.generalBuffer1.cresize(10*tree.n_nodes*4, false);
    }

    my_dev::dev_mem<double4> multipoleD(devContext);
    my_dev::dev_mem<real4>  nodeLowerBounds(devContext); //Lower bounds used for scaling? TODO
    my_dev::dev_mem<real4>  nodeUpperBounds(devContext); //Upper bounds used for scaling? TODO

    //multipoleD starts at the beginning of generalBuffer1: 3 double4 per node
    multipoleD.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                            tree.generalBuffer1.get_flags(),
                            tree.generalBuffer1.get_devMem(),
                            &tree.generalBuffer1[0], 0,
                            3*tree.n_nodes, getAllignmentOffset(0));

    //Offset is in uint, so: double4 = 8uint*3*n_nodes
    nodeLowerBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                                 tree.generalBuffer1.get_flags(),
                                 tree.generalBuffer1.get_devMem(),
                                 &tree.generalBuffer1[8*3*tree.n_nodes],  8*3*tree.n_nodes,
                                 tree.n_nodes, getAllignmentOffset(8*3*tree.n_nodes));

    int prevOffsetSum = getAllignmentOffset(8*3*tree.n_nodes); //The offset of output

    //Upper bounds follow the lower bounds (real4 = 4 uint per node); the
    //alignment offset already applied to the previous buffer is carried along
    nodeUpperBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                                 tree.generalBuffer1.get_flags(),
                                 tree.generalBuffer1.get_devMem(),
                                 &tree.generalBuffer1[8*3*tree.n_nodes + 4*tree.n_nodes],
                                 8*3*tree.n_nodes + 4*tree.n_nodes,
                                 tree.n_nodes,
                                 prevOffsetSum + getAllignmentOffset(8*3*tree.n_nodes + 4*tree.n_nodes + prevOffsetSum));

    //Computes the tree-properties (size, cm, monopole, quadrupole, etc)
    //start the kernel for the leaf-type nodes
    propsLeafD.set_arg<int>(0,    &tree.n_leafs);
    propsLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
    propsLeafD.set_arg<cl_mem>(2, tree.node_bodies.p());
    propsLeafD.set_arg<cl_mem>(3, bodies_pos.p());
    propsLeafD.set_arg<cl_mem>(4, multipoleD.p());
    propsLeafD.set_arg<cl_mem>(5, nodeLowerBounds.p());
    propsLeafD.set_arg<cl_mem>(6, nodeUpperBounds.p());
//   propsLeafD.set_arg<cl_mem>(7, tree.bodies_Pvel.p());  //Velocity to get max eps

    propsLeafD.setWork(tree.n_leafs, 128);
#ifdef _DEBUG_PRINT_
    printf("PropsLeaf: ");
    propsLeafD.printWorkSize();
#endif
    propsLeafD.execute();


    //Arguments shared by all levels. Argument 0 (the level) is set inside
    //the loop below, right before each launch.
    propsNonLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
    propsNonLeafD.set_arg<cl_mem>(2, tree.node_level_list.p());
    propsNonLeafD.set_arg<cl_mem>(3, tree.n_children.p());
    propsNonLeafD.set_arg<cl_mem>(4, multipoleD.p());
    propsNonLeafD.set_arg<cl_mem>(5, nodeLowerBounds.p());
    propsNonLeafD.set_arg<cl_mem>(6, nodeUpperBounds.p());

    //Work from the bottom up
    for(int i=tree.n_levels; i >= 1; i--)
    {
        //Number of nodes on level i, taken from the host copy of node_level_list
        const int totalOnThisLevel = tree.node_level_list[i]-tree.node_level_list[i-1];

        propsNonLeafD.set_arg<int>(0,    &i); //set the level
        propsNonLeafD.setWork(totalOnThisLevel, 128);

#ifdef _DEBUG_PRINT_
        printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t", i, totalOnThisLevel,
               tree.node_level_list[i-1], tree.node_level_list[i]);
        propsNonLeafD.printWorkSize();
#endif
        propsNonLeafD.execute();
    }

    propsScalingD.set_arg<int>(0,    &tree.n_nodes);
    propsScalingD.set_arg<cl_mem>(1, multipoleD.p());
    propsScalingD.set_arg<cl_mem>(2, nodeLowerBounds.p());
    propsScalingD.set_arg<cl_mem>(3, nodeUpperBounds.p());
    propsScalingD.set_arg<cl_mem>(4, tree.n_children.p());
    propsScalingD.set_arg<cl_mem>(5, tree.multipole.p());
    propsScalingD.set_arg<float >(6, &theta);
    propsScalingD.set_arg<cl_mem>(7, tree.boxSizeInfo.p());
    propsScalingD.set_arg<cl_mem>(8, tree.boxCenterInfo.p());
    propsScalingD.set_arg<cl_mem>(9, tree.node_bodies.p());

    propsScalingD.setWork(tree.n_nodes, 128);
#ifdef _DEBUG_PRINT_
    printf("propsScaling: \t ");
    propsScalingD.printWorkSize();
#endif
    propsScalingD.execute();


    //Disabled: group-property setup and local boundary computation
#if 0
#ifdef INDSOFT
    //If we use individual softening we need to get the max softening value
    //to be broadcasted during the exchange of the LET boundaries.
    //Only copy the root node that contains the max value
    my_dev::dev_stream memCpyStream;
    tree.multipole.d2h(3, false, memCpyStream.s());
#endif


    //Set the group properties, note that it is not based on the nodes anymore
    //but on self created groups based on particle order setPHGroupData
    copyNodeDataToGroupData.set_arg<int>(0,    &tree.n_groups);
    copyNodeDataToGroupData.set_arg<int>(1,    &tree.n);
    copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.bodies_Ppos.p());
    copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_list_test.p());
    copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.groupCenterInfo.p());
    copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.groupSizeInfo.p());
    copyNodeDataToGroupData.setWork(-1, NCRIT, tree.n_groups);
    copyNodeDataToGroupData.printWorkSize();
    copyNodeDataToGroupData.execute();

#ifdef INDSOFT
    memCpyStream.sync();
    this->maxLocalEps = tree.multipole[0*3 + 1].w; //Softening value
#else
#endif

    //Get the local domain boundary based on group positions and sizes
    real4 r_min, r_max;
    getBoundariesGroups(tree, r_min, r_max);

#endif

}