예제 #1
0
/**
 * Configures and launches the approxGrav kernel for one tree.
 *
 * The function clears tree.activePartList and this->atomicValues, binds all
 * kernel arguments (their order must match the kernel's parameter list, so do
 * not reorder the set_arg calls), and executes the kernel on execStream.
 * No stream synchronization happens here unless the timing toggle below is
 * switched on. Memory usage is printed at the end of every call.
 *
 * @param tree           Tree whose level list, group list, multipoles and box
 *                       info buffers are handed to the kernel.
 * @param j_bodies_pos   Positions of the bodies that are part of the tree-structure.
 * @param j_bodies_h     Per-body float4 data for the tree bodies
 *                       (NOTE(review): meaning of "h" not visible here - confirm).
 * @param j_bodies_idx   Particle IDs of the tree bodies.
 * @param i_bodies_pos   Positions of the bodies that are part of the groups.
 * @param i_bodies_h     Per-body float4 data for the group bodies (see j_bodies_h).
 * @param i_bodies_idx   Particle IDs of the group bodies.
 * @param n_groupBodies  Number of bodies that are part of the groups. Only read
 *                       by the disabled diagnostics below.
 * @param i_bodies_acc   Acceleration buffer passed to the kernel
 *                       (presumably written by it - confirm against the kernel).
 * @param i_bodies_ds2   Per-body "ds2" buffer passed to the kernel.
 * @param i_bodies_ngb   Per-body neighbour buffer passed to the kernel.
 */
void octree::approximate_gravity(tree_structure &tree,
                                 my_dev::dev_mem<float4>  &j_bodies_pos,  //Bodies that are part of the tree-structure
                                 my_dev::dev_mem<float4>  &j_bodies_h,
                                 my_dev::dev_mem<int>     &j_bodies_idx,
                                 my_dev::dev_mem<float4>  &i_bodies_pos,  //Bodies that are part of the groups
                                 my_dev::dev_mem<float4>  &i_bodies_h,
                                 my_dev::dev_mem<int>     &i_bodies_idx,
                                 int n_groupBodies,                     //Number of bodies that are part of the groups
                                 my_dev::dev_mem<real4>  &i_bodies_acc,
                                 my_dev::dev_mem<real>   &i_bodies_ds2,
                                 my_dev::dev_mem<int>    &i_bodies_ngb)
{
  //Node range given to the kernel: the (x,y) pair stored for tree level 2
  const int level_start = 2;
  uint2 node_begend;
  node_begend.x   = tree.level_list[level_start].x;
  node_begend.y   = tree.level_list[level_start].y;

  //Reset the active particles
  tree.activePartList.zeroMem();
  this->atomicValues.zeroMem();

  //Local copy, set_arg takes the address of the value
  float eps2 = this->eps2;

  //Set the kernel parameters, many!
  //The sequence below defines the kernel argument indices; keep it in sync
  //with the kernel signature.
  int argIdx = 0;
  approxGrav.set_arg<int>(argIdx++,    &tree.n_groups);
  approxGrav.set_arg<float>(argIdx++,  &eps2);
  approxGrav.set_arg<uint2>(argIdx++,  &node_begend);
  approxGrav.set_arg<cl_mem>(argIdx++, this->atomicValues.p());
  approxGrav.set_arg<cl_mem>(argIdx++, j_bodies_pos.p());
  approxGrav.set_arg<cl_mem>(argIdx++, j_bodies_h  .p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_acc.p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_pos.p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_h  .p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_ds2.p());
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_ngb.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.activePartList.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.interactions.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.group_list.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.multipole.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.boxSizeInfo.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.boxCenterInfo.p());
  approxGrav.set_arg<cl_mem>(argIdx++, tree.generalBuffer1.p()); //Instead of using Local memory

  approxGrav.set_arg<cl_mem>(argIdx++, j_bodies_idx.p()); //Particle IDs, j bodies (the tree-particles)
  approxGrav.set_arg<cl_mem>(argIdx++, i_bodies_idx.p()); //Particle IDs, i bodies (the group particles)

  //Named bindings (presumably texture references, judging by the names - confirm)
  approxGrav.set_arg<real4>(argIdx++, tree.boxSizeInfo, 4, "texNodeSize");
  approxGrav.set_arg<real4>(argIdx++, tree.boxCenterInfo, 4, "texNodeCenter");
  approxGrav.set_arg<real4>(argIdx++, tree.multipole, 4, "texMultipole");
  approxGrav.set_arg<real4>(argIdx++, j_bodies_pos, 4, "texBody");
  approxGrav.setWork(-1, NTHREAD, nBlocksForTreeWalk);
//   approxGrav.setWork(-1, NTHREAD, 1);

  //Timing toggle: enable both "#if 0" blocks around execute() together
#if 0
  devContext.startTiming();
#endif
  approxGrav.execute(execStream->s());  //First half

#if 0
  execStream->sync();
  devContext.stopTiming("Gravity", 1);
#endif

  //Print interaction statistics (diagnostics, disabled by default)
  #if 0
  tree.n = n_groupBodies;

  tree.body2group_list.d2h();
  tree.interactions.d2h();

  if(tree.n > 0) //Nothing to average over otherwise
  {
    long long directSum  = 0;
    long long apprSum    = 0;
    long long directSum2 = 0;
    long long apprSum2   = 0;

    int maxDir  = -1;
    int maxAppr = -1;

    for(int i=0; i < tree.n; i++)
    {
      //Widen before squaring so the product cannot overflow int
      const long long nAppr = tree.interactions[i].x;
      const long long nDir  = tree.interactions[i].y;

      apprSum     += nAppr;
      directSum   += nDir;

      maxAppr = max(maxAppr,tree.interactions[i].x);
      maxDir  = max(maxDir,tree.interactions[i].y);

      apprSum2     += nAppr*nAppr;
      directSum2   += nDir*nDir;
    }

    //Standard deviation as sqrt(E[x^2] - E[x]^2), evaluated in double.
    //Rounding can push the variance marginally below zero, hence the clamp.
    const double count    = static_cast<double>(tree.n);
    const double meanDir  = static_cast<double>(directSum) / count;
    const double meanAppr = static_cast<double>(apprSum)   / count;
    double varDir  = static_cast<double>(directSum2) / count - meanDir*meanDir;
    double varAppr = static_cast<double>(apprSum2)   / count - meanAppr*meanAppr;
    if(varDir  < 0.0) varDir  = 0.0;
    if(varAppr < 0.0) varAppr = 0.0;

    cout << "Interaction at (rank= " << 0 << " ) iter: " << 0 << "\tdirect: " << directSum << "\tappr: " << apprSum << "\t";
    cout << "avg dir: " << directSum / tree.n << "\tavg appr: " << apprSum / tree.n << "\tMaxdir: " << maxDir << "\tmaxAppr: " << maxAppr <<  endl;
    cout << "sigma dir: " << std::sqrt(varDir) << "\tsigma appr: " << std::sqrt(varAppr)  <<  endl;

      #if 0
      //Histogram of number of interactions
      const int bins = 256;
      const int jump = 15;
      int histoIDX[bins+1] = {0}; //Slot [bins] collects every count >= bins*jump

      for(int i=0; i < tree.n; i++)
      {
          int idx = tree.interactions[i].x / jump;
          if(idx >= bins)
            idx = bins;
          histoIDX[idx]++;
      }
      //Include the overflow slot so no body is silently dropped from the output
      for(int i=0; i <= bins; i++)
      {
        if(histoIDX[i] == 0)
          fprintf(stderr, "HISTO %d\t-\n", i*jump);
        else
          fprintf(stderr, "HISTO %d\t%d\n", i*jump, histoIDX[i]);
      }
      #endif
  }
  #endif

  //Dump per-body results (diagnostics, disabled by default)
  #if 0
    i_bodies_ngb.d2h();
    i_bodies_acc.d2h();
    i_bodies_ds2.d2h();

    for(int i=0; i < n_groupBodies; i++)
    {
        fprintf(stderr, "%d\t Acc: %f %f %f %f  \t Ds2: %f \t NGB: %d \n",
                i, i_bodies_acc[i].x, i_bodies_acc[i].y,
                i_bodies_acc[i].z, i_bodies_acc[i].w,
                i_bodies_ds2[i],
                i_bodies_ngb[i]);
    }
  #endif

  my_dev::base_mem::printMemUsage();
}
예제 #2
0
/**
 * Configures and launches the getNGB kernel for one tree, then waits for it.
 *
 * The function clears tree.activePartList and this->atomicValues, binds all
 * kernel arguments (their order must match the kernel's parameter list, so do
 * not reorder the set_arg calls), executes the kernel on execStream,
 * synchronizes that stream and reports the elapsed time under the label
 * "GetNGB". Memory usage is printed at the end of every call.
 *
 * @param tree          Tree whose level list, group list and box info buffers
 *                      are handed to the kernel.
 * @param j_bodies_pos  Positions of the bodies that are part of the tree-structure.
 * @param j_bodies_idx  Particle IDs of the tree bodies.
 * @param i_bodies_pos  Positions of the bodies that are part of the groups.
 * @param i_bodies_idx  Particle IDs of the group bodies.
 * @param i_bodies_ngb  Per-body neighbour buffer passed to the kernel
 *                      (presumably written by it - confirm against the kernel).
 */
void octree::get_ngb(tree_structure &tree,
                      my_dev::dev_mem<float4>  &j_bodies_pos,  //Bodies that are part of the tree-structure
                      my_dev::dev_mem<int>     &j_bodies_idx,
                      my_dev::dev_mem<float4>  &i_bodies_pos,  //Bodies that are part of the groups
                      my_dev::dev_mem<int>     &i_bodies_idx,
                      my_dev::dev_mem<int>    &i_bodies_ngb
){
  //Node range given to the kernel: the (x,y) pair stored for tree level 2
  const int level_start = 2;
  uint2 node_begend;
  node_begend.x   = tree.level_list[level_start].x;
  node_begend.y   = tree.level_list[level_start].y;

  //Reset the active particles
  tree.activePartList.zeroMem();
  this->atomicValues.zeroMem();

  //Set the kernel parameters, many!
  //The sequence below defines the kernel argument indices; keep it in sync
  //with the kernel signature.
  int argIdx = 0;
  getNGB.set_arg<int>(argIdx++,    &tree.n_groups);
  getNGB.set_arg<uint2>(argIdx++,  &node_begend);
  getNGB.set_arg<cl_mem>(argIdx++, this->atomicValues.p());
  getNGB.set_arg<cl_mem>(argIdx++, j_bodies_pos.p());
  getNGB.set_arg<cl_mem>(argIdx++, i_bodies_pos.p());
  getNGB.set_arg<cl_mem>(argIdx++, i_bodies_ngb.p());
  getNGB.set_arg<cl_mem>(argIdx++, tree.activePartList.p());
  getNGB.set_arg<cl_mem>(argIdx++, tree.interactions.p());
  getNGB.set_arg<cl_mem>(argIdx++, tree.group_list.p());

  getNGB.set_arg<cl_mem>(argIdx++, tree.boxSizeInfo.p());
  getNGB.set_arg<cl_mem>(argIdx++, tree.boxCenterInfo.p());

  getNGB.set_arg<cl_mem>(argIdx++, j_bodies_idx.p()); //Particle IDs, j bodies (the tree-particles)
  getNGB.set_arg<cl_mem>(argIdx++, i_bodies_idx.p()); //Particle IDs, i bodies (the group particles)

  getNGB.set_arg<cl_mem>(argIdx++, tree.generalBuffer1.p()); //Instead of using Local memory

  //Named bindings (presumably texture references, judging by the names - confirm)
  getNGB.set_arg<real4>(argIdx++, tree.boxSizeInfo, 4, "texNodeSize");
  getNGB.set_arg<real4>(argIdx++, tree.boxCenterInfo, 4, "texNodeCenter");
  getNGB.set_arg<real4>(argIdx++, j_bodies_pos, 4, "texBody");
  getNGB.setWork(-1, NTHREAD, nBlocksForTreeWalk);

  //Time the kernel; the sync makes sure it has finished before the timer stops
  devContext.startTiming();
  getNGB.execute(execStream->s());  //First half

  execStream->sync();
  devContext.stopTiming("GetNGB", 1);

  my_dev::base_mem::printMemUsage();
}