Example #1
0
void octree::compute_properties_double(tree_structure &tree) {

  /*****************************************************          
    Assign the memory buffers, note that we check the size first
    and if needed we increase the size of the generalBuffer1
    Size required:
      - multipoleD -> double4*3_n_nodes -> 6*n_nodes*uint4 
      - lower/upperbounds ->               2*n_nodes*uint4
      - node lower/upper  ->               2*n_nodes*uint4
      - SUM: 10*n_nodes*uint4 
      - generalBuffer1 has default size: 3*N*uint4
      
    check if 10*n_nodes < 3*N if so realloc
    
   *****************************************************/
  
  if(10*tree.n_nodes > 3*tree.n)
  {
    fprintf(stderr, "Resizeing the generalBuffer1 \n");
    tree.generalBuffer1.cresize(8*tree.n_nodes*4, false);
  }
  
  my_dev::dev_mem<double4> multipoleD(devContext);
  
  multipoleD.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[0], 0, 
                          3*tree.n_nodes, getAllignmentOffset(0));

  //Offset is in uint, so: double4 = 8uint*3*n_nodes
  tree.nodeLowerBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*3*tree.n_nodes],  8*3*tree.n_nodes,
                          tree.n_nodes, getAllignmentOffset(8*3*tree.n_nodes));
                         
  int prevOffsetSum = getAllignmentOffset(8*3*tree.n_nodes); //The offset of output
                          
  tree.nodeUpperBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                          tree.generalBuffer1.get_flags(), 
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*3*tree.n_nodes + 4*tree.n_nodes], 
                          8*3*tree.n_nodes + 4*tree.n_nodes, 
                          tree.n_nodes, 
                          prevOffsetSum + getAllignmentOffset(8*3*tree.n_nodes + 4*tree.n_nodes + prevOffsetSum));     
       
  
  //Computes the tree-properties (size, cm, monopole, quadropole, etc)
  //start the kernel for the leaf-type nodes
  propsLeafD.set_arg<int>(0,    &tree.n_leafs);
  propsLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsLeafD.set_arg<cl_mem>(2, tree.node_bodies.p());
  propsLeafD.set_arg<cl_mem>(3, tree.bodies_Ppos.p());  
  propsLeafD.set_arg<cl_mem>(4, multipoleD.p());
  propsLeafD.set_arg<cl_mem>(5, tree.nodeLowerBounds.p());
  propsLeafD.set_arg<cl_mem>(6, tree.nodeUpperBounds.p());
  propsLeafD.set_arg<cl_mem>(7, tree.bodies_Pvel.p());  //Velocity to get max eps
  
  propsLeafD.setWork(tree.n_leafs, 128);
  printf("PropsLeaf: "); propsLeafD.printWorkSize();
  propsLeafD.execute(); 
   
  
  int temp = tree.n_nodes-tree.n_leafs;
  propsNonLeafD.set_arg<int>(0,    &temp);
  propsNonLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsNonLeafD.set_arg<cl_mem>(2, tree.node_level_list.p());
  propsNonLeafD.set_arg<cl_mem>(3, tree.n_children.p());  
  propsNonLeafD.set_arg<cl_mem>(4, multipoleD.p());
  propsNonLeafD.set_arg<cl_mem>(5, tree.nodeLowerBounds.p());
  propsNonLeafD.set_arg<cl_mem>(6, tree.nodeUpperBounds.p());

  //Work from the bottom up
  for(int i=tree.n_levels; i >= 1; i--)
  {   
      propsNonLeafD.set_arg<int>(0,    &i);  
      {    
        vector<size_t> localWork(2), globalWork(2);
        int totalOnThisLevel;
      
        totalOnThisLevel = tree.node_level_list[i]-tree.node_level_list[i-1];
        
        propsNonLeafD.setWork(totalOnThisLevel, 128);
        
        printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t", i, totalOnThisLevel,
               tree.node_level_list[i-1], tree.node_level_list[i]); 
        propsNonLeafD.printWorkSize();
      }      
      propsNonLeafD.set_arg<int>(0,    &i); //set the level
      propsNonLeafD.execute();     
  }
  

  float theta2 = theta;
  
  propsScalingD.set_arg<int>(0,    &tree.n_nodes);
  propsScalingD.set_arg<real4>(1,  &tree.corner);
  propsScalingD.set_arg<cl_mem>(2, multipoleD.p());
  propsScalingD.set_arg<cl_mem>(3, tree.nodeLowerBounds.p());
  propsScalingD.set_arg<cl_mem>(4, tree.nodeUpperBounds.p());
  propsScalingD.set_arg<cl_mem>(5, tree.n_children.p());  
  propsScalingD.set_arg<cl_mem>(6, tree.multipole.p());
  propsScalingD.set_arg<float >(7, &theta2);
  propsScalingD.set_arg<cl_mem>(8, tree.boxSizeInfo.p());
  propsScalingD.set_arg<cl_mem>(9, tree.boxCenterInfo.p());
  propsScalingD.set_arg<cl_mem>(10, tree.node_bodies.p());
  
  propsScalingD.setWork(tree.n_nodes, 128);
  printf("propsScaling: \t "); propsScalingD.printWorkSize();
  propsScalingD.execute();   


  #ifdef INDSOFT
    //If we use individual softening we need to get the max softening value
    //to be broadcasted during the exchange of the LET boundaries.
    //Only copy the root node that contains the max value
    my_dev::dev_stream memCpyStream;
    tree.multipole.d2h(3, false, memCpyStream.s());
  #endif

    
  //Set the group properties, note that it is not based on the nodes anymore
  //but on self created groups based on particle order setPHGroupData    
  copyNodeDataToGroupData.set_arg<int>(0,    &tree.n_groups);
  copyNodeDataToGroupData.set_arg<int>(1,    &tree.n);
  copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.bodies_Ppos.p());  
  copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_list_test.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.groupCenterInfo.p());  
  copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.groupSizeInfo.p());
  copyNodeDataToGroupData.setWork(-1, NCRIT, tree.n_groups);    
  copyNodeDataToGroupData.printWorkSize();
  copyNodeDataToGroupData.execute();
  
  #ifdef INDSOFT  
    memCpyStream.sync();  
    this->maxLocalEps = tree.multipole[0*3 + 1].w; //Softening value
  #else
  #endif
  
  //Get the local domain boundary based on group positions and sizes
  real4 r_min, r_max;
  getBoundariesGroups(tree, r_min, r_max); 
  
  #if 0
    //Write the tree structure to file

    string nodeFileName = "fullTreeStructure.txt";
    char fileName[256];
    sprintf(fileName, "fullTreeStructure-%d.txt", mpiGetRank());
    ofstream nodeFile;
    //nodeFile.open(nodeFileName.c_str());
    nodeFile.open(fileName);
    tree.multipole.d2h();
    tree.boxSizeInfo.d2h();
    tree.boxCenterInfo.d2h();
    
    for(int i=0; i < tree.n_nodes; i++)
    {
      //nodeFile << i << "\t" << tree.boxCenterInfo[i].x << "\t" << tree.boxCenterInfo[i].y;
      //nodeFile << "\t" << 2*tree.boxSizeInfo[i].x << "\t" << 2*tree.boxSizeInfo[i].y << "\t";
      
      nodeFile << i << "\t" << tree.boxCenterInfo[i].x-tree.boxSizeInfo[i].x << "\t" << tree.boxCenterInfo[i].y-tree.boxSizeInfo[i].y;
      nodeFile << "\t"      << tree.boxCenterInfo[i].x+tree.boxSizeInfo[i].x << "\t" << tree.boxCenterInfo[i].y+tree.boxSizeInfo[i].y << "\t";
      
      
      nodeFile << tree.multipole[i*3+0].x << "\t" << tree.multipole[i*3+0].w << "\n";
    }
    
    nodeFile.close();
    

    sprintf(fileName, "fullTreeStructureParticles-%d.txt", mpiGetRank());
    ofstream partFile;
    partFile.open(fileName);
    tree.bodies_pos.d2h();
                                                                                                                    
    for(int i=0; i < tree.n; i++)                                                                                     
    {                                                                                                                 
      float4  pos =  tree.bodies_pos[i];                                                                              
      partFile << i << "\t" << pos.x << "\t" << pos.y << "\t" << pos.z << endl;                                                        
    }                                                                                                                 
    partFile.close(); 
    

  
  #endif
 
}
void octree::compute_properties(tree_structure &tree,  my_dev::dev_mem<float4>  &bodies_pos, int n_bodies) {

    /*****************************************************
      Assign the memory buffers, note that we check the size first
      and if needed we increase the size of the generalBuffer1
      Size required:
        - multipoleD -> double4*3_n_nodes -> 2*3*n_nodes*uint4
        - lower/upperbounds ->               2*n_nodes*uint4
        - node lower/upper  ->               2*n_nodes*uint4
        - SUM:                               10*n_nodes*uint4
        - generalBuffer1 has default size: 3*N*uint4

      check if 10*n_nodes < 3*N if so realloc
      (Note that generalBuffer might be larger because of tree-walk stack)
     *****************************************************/

    if(10*tree.n_nodes > 3*tree.n)
    {
#ifdef _DEBUG_PRINT_
        fprintf(stderr, "Resizeing the generalBuffer1 \n");
#endif
        tree.generalBuffer1.cresize(10*tree.n_nodes*4, false);
    }

    my_dev::dev_mem<double4> multipoleD(devContext);
    my_dev::dev_mem<real4>  nodeLowerBounds(devContext); //Lower bounds used for scaling? TODO
    my_dev::dev_mem<real4>  nodeUpperBounds(devContext); //Upper bounds used for scaling? TODO

    multipoleD.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                            tree.generalBuffer1.get_flags(),
                            tree.generalBuffer1.get_devMem(),
                            &tree.generalBuffer1[0], 0,
                            3*tree.n_nodes, getAllignmentOffset(0));

    //Offset is in uint, so: double4 = 8uint*3*n_nodes
    nodeLowerBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                                 tree.generalBuffer1.get_flags(),
                                 tree.generalBuffer1.get_devMem(),
                                 &tree.generalBuffer1[8*3*tree.n_nodes],  8*3*tree.n_nodes,
                                 tree.n_nodes, getAllignmentOffset(8*3*tree.n_nodes));

    int prevOffsetSum = getAllignmentOffset(8*3*tree.n_nodes); //The offset of output

    nodeUpperBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                                 tree.generalBuffer1.get_flags(),
                                 tree.generalBuffer1.get_devMem(),
                                 &tree.generalBuffer1[8*3*tree.n_nodes + 4*tree.n_nodes],
                                 8*3*tree.n_nodes + 4*tree.n_nodes,
                                 tree.n_nodes,
                                 prevOffsetSum + getAllignmentOffset(8*3*tree.n_nodes + 4*tree.n_nodes + prevOffsetSum));

    //Computes the tree-properties (size, cm, monopole, quadropole, etc)
    //start the kernel for the leaf-type nodes
    propsLeafD.set_arg<int>(0,    &tree.n_leafs);
    propsLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
    propsLeafD.set_arg<cl_mem>(2, tree.node_bodies.p());
    propsLeafD.set_arg<cl_mem>(3, bodies_pos.p());
    propsLeafD.set_arg<cl_mem>(4, multipoleD.p());
    propsLeafD.set_arg<cl_mem>(5, nodeLowerBounds.p());
    propsLeafD.set_arg<cl_mem>(6, nodeUpperBounds.p());
//   propsLeafD.set_arg<cl_mem>(7, tree.bodies_Pvel.p());  //Velocity to get max eps

    propsLeafD.setWork(tree.n_leafs, 128);
#ifdef _DEBUG_PRINT_
    printf("PropsLeaf: ");
    propsLeafD.printWorkSize();
#endif
    propsLeafD.execute();


    int temp = tree.n_nodes-tree.n_leafs;
    propsNonLeafD.set_arg<int>(0,    &temp);
    propsNonLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
    propsNonLeafD.set_arg<cl_mem>(2, tree.node_level_list.p());
    propsNonLeafD.set_arg<cl_mem>(3, tree.n_children.p());
    propsNonLeafD.set_arg<cl_mem>(4, multipoleD.p());
    propsNonLeafD.set_arg<cl_mem>(5, nodeLowerBounds.p());
    propsNonLeafD.set_arg<cl_mem>(6, nodeUpperBounds.p());

    //Work from the bottom up
    for(int i=tree.n_levels; i >= 1; i--)
    {
        propsNonLeafD.set_arg<int>(0,    &i);
        {
            vector<size_t> localWork(2), globalWork(2);
            int totalOnThisLevel;

            totalOnThisLevel = tree.node_level_list[i]-tree.node_level_list[i-1];

            propsNonLeafD.setWork(totalOnThisLevel, 128);

#ifdef _DEBUG_PRINT_
            printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t", i, totalOnThisLevel,
                   tree.node_level_list[i-1], tree.node_level_list[i]);
#endif
            propsNonLeafD.printWorkSize();
        }
        propsNonLeafD.set_arg<int>(0,    &i); //set the level
        propsNonLeafD.execute();
    }

    propsScalingD.set_arg<int>(0,    &tree.n_nodes);
    propsScalingD.set_arg<cl_mem>(1, multipoleD.p());
    propsScalingD.set_arg<cl_mem>(2, nodeLowerBounds.p());
    propsScalingD.set_arg<cl_mem>(3, nodeUpperBounds.p());
    propsScalingD.set_arg<cl_mem>(4, tree.n_children.p());
    propsScalingD.set_arg<cl_mem>(5, tree.multipole.p());
    propsScalingD.set_arg<float >(6, &theta);
    propsScalingD.set_arg<cl_mem>(7, tree.boxSizeInfo.p());
    propsScalingD.set_arg<cl_mem>(8, tree.boxCenterInfo.p());
    propsScalingD.set_arg<cl_mem>(9, tree.node_bodies.p());

    propsScalingD.setWork(tree.n_nodes, 128);
#ifdef _DEBUG_PRINT_
    printf("propsScaling: \t ");
    propsScalingD.printWorkSize();
#endif
    propsScalingD.execute();


#if 0
#ifdef INDSOFT
    //If we use individual softening we need to get the max softening value
    //to be broadcasted during the exchange of the LET boundaries.
    //Only copy the root node that contains the max value
    my_dev::dev_stream memCpyStream;
    tree.multipole.d2h(3, false, memCpyStream.s());
#endif


    //Set the group properties, note that it is not based on the nodes anymore
    //but on self created groups based on particle order setPHGroupData
    copyNodeDataToGroupData.set_arg<int>(0,    &tree.n_groups);
    copyNodeDataToGroupData.set_arg<int>(1,    &tree.n);
    copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.bodies_Ppos.p());
    copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_list_test.p());
    copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.groupCenterInfo.p());
    copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.groupSizeInfo.p());
    copyNodeDataToGroupData.setWork(-1, NCRIT, tree.n_groups);
    copyNodeDataToGroupData.printWorkSize();
    copyNodeDataToGroupData.execute();

#ifdef INDSOFT
    memCpyStream.sync();
    this->maxLocalEps = tree.multipole[0*3 + 1].w; //Softening value
#else
#endif

    //Get the local domain boundary based on group positions and sizes
    real4 r_min, r_max;
    getBoundariesGroups(tree, r_min, r_max);

#endif

}