Esempio n. 1
0
/**
 * Computes the properties of all tree nodes (multipoles and bounding boxes)
 * through a double-precision intermediate buffer, then fills the group data
 * and determines the local domain boundaries.
 *
 * Steps performed below:
 *   1. Carve multipoleD, tree.nodeLowerBounds and tree.nodeUpperBounds out of
 *      tree.generalBuffer1 (growing that buffer first when it is too small).
 *   2. Launch propsLeafD for the leaf nodes.
 *   3. Launch propsNonLeafD once per level, from tree.n_levels down to 1.
 *   4. Launch propsScalingD, which receives tree.multipole, tree.boxSizeInfo
 *      and tree.boxCenterInfo (presumably as its outputs).
 *   5. Launch copyNodeDataToGroupData and call getBoundariesGroups().
 *
 * With INDSOFT defined, the first 3 elements of tree.multipole are copied to
 * the host on a separate stream and maxLocalEps is read from element 1.
 *
 * @param tree  Tree whose node properties are computed. Its generalBuffer1 is
 *              used as scratch space and may be resized.
 */
void octree::compute_properties_double(tree_structure &tree) {

  /*****************************************************
    Assign the memory buffers, note that we check the size first
    and if needed we increase the size of the generalBuffer1
    Size required:
      - multipoleD -> double4*3*n_nodes -> 6*n_nodes*uint4
      - lower/upperbounds ->               2*n_nodes*uint4
      - node lower/upper  ->               2*n_nodes*uint4
      - SUM: 10*n_nodes*uint4
      - generalBuffer1 has default size: 3*N*uint4

    check if 10*n_nodes > 3*N, if so realloc

   *****************************************************/

  if(10*tree.n_nodes > 3*tree.n)
  {
    fprintf(stderr, "Resizeing the generalBuffer1 \n");
    //The size is expressed in uint (like the offsets below), so
    //10*n_nodes uint4 = 10*n_nodes*4 uint. Resizing to only 8*n_nodes*4
    //would leave no room for the alignment padding added by
    //getAllignmentOffset and could even be smaller than the default size.
    tree.generalBuffer1.cresize(10*tree.n_nodes*4, false);
  }

  my_dev::dev_mem<double4> multipoleD(devContext);

  //multipoleD occupies the start of the general buffer
  multipoleD.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[0], 0,
                          3*tree.n_nodes, getAllignmentOffset(0));

  //Offset is in uint, so: double4 = 8uint*3*n_nodes
  tree.nodeLowerBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*3*tree.n_nodes],  8*3*tree.n_nodes,
                          tree.n_nodes, getAllignmentOffset(8*3*tree.n_nodes));

  //Padding already inserted in front of nodeLowerBounds, it has to be
  //carried over to every buffer that follows
  int prevOffsetSum = getAllignmentOffset(8*3*tree.n_nodes);

  tree.nodeUpperBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                          tree.generalBuffer1.get_flags(),
                          tree.generalBuffer1.get_devMem(),
                          &tree.generalBuffer1[8*3*tree.n_nodes + 4*tree.n_nodes],
                          8*3*tree.n_nodes + 4*tree.n_nodes,
                          tree.n_nodes,
                          prevOffsetSum + getAllignmentOffset(8*3*tree.n_nodes + 4*tree.n_nodes + prevOffsetSum));


  //Computes the tree-properties (size, cm, monopole, quadropole, etc)
  //start the kernel for the leaf-type nodes
  propsLeafD.set_arg<int>(0,    &tree.n_leafs);
  propsLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsLeafD.set_arg<cl_mem>(2, tree.node_bodies.p());
  propsLeafD.set_arg<cl_mem>(3, tree.bodies_Ppos.p());
  propsLeafD.set_arg<cl_mem>(4, multipoleD.p());
  propsLeafD.set_arg<cl_mem>(5, tree.nodeLowerBounds.p());
  propsLeafD.set_arg<cl_mem>(6, tree.nodeUpperBounds.p());
  propsLeafD.set_arg<cl_mem>(7, tree.bodies_Pvel.p());  //Velocity to get max eps

  propsLeafD.setWork(tree.n_leafs, 128);
  printf("PropsLeaf: "); propsLeafD.printWorkSize();
  propsLeafD.execute();


  //Argument 0 is overwritten with the level index before every launch in
  //the loop below
  int temp = tree.n_nodes-tree.n_leafs;
  propsNonLeafD.set_arg<int>(0,    &temp);
  propsNonLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
  propsNonLeafD.set_arg<cl_mem>(2, tree.node_level_list.p());
  propsNonLeafD.set_arg<cl_mem>(3, tree.n_children.p());
  propsNonLeafD.set_arg<cl_mem>(4, multipoleD.p());
  propsNonLeafD.set_arg<cl_mem>(5, tree.nodeLowerBounds.p());
  propsNonLeafD.set_arg<cl_mem>(6, tree.nodeUpperBounds.p());

  //Work from the bottom up
  for(int i=tree.n_levels; i >= 1; i--)
  {
      //node_level_list holds the end position of each level, so the
      //difference of two consecutive entries is the node count of level i
      const int totalOnThisLevel = tree.node_level_list[i]-tree.node_level_list[i-1];

      propsNonLeafD.setWork(totalOnThisLevel, 128);

      printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t", i, totalOnThisLevel,
             tree.node_level_list[i-1], tree.node_level_list[i]);
      propsNonLeafD.printWorkSize();

      propsNonLeafD.set_arg<int>(0,    &i); //set the level
      propsNonLeafD.execute();
  }


  float theta2 = theta;

  propsScalingD.set_arg<int>(0,    &tree.n_nodes);
  propsScalingD.set_arg<real4>(1,  &tree.corner);
  propsScalingD.set_arg<cl_mem>(2, multipoleD.p());
  propsScalingD.set_arg<cl_mem>(3, tree.nodeLowerBounds.p());
  propsScalingD.set_arg<cl_mem>(4, tree.nodeUpperBounds.p());
  propsScalingD.set_arg<cl_mem>(5, tree.n_children.p());
  propsScalingD.set_arg<cl_mem>(6, tree.multipole.p());
  propsScalingD.set_arg<float >(7, &theta2);
  propsScalingD.set_arg<cl_mem>(8, tree.boxSizeInfo.p());
  propsScalingD.set_arg<cl_mem>(9, tree.boxCenterInfo.p());
  propsScalingD.set_arg<cl_mem>(10, tree.node_bodies.p());

  propsScalingD.setWork(tree.n_nodes, 128);
  printf("propsScaling: \t "); propsScalingD.printWorkSize();
  propsScalingD.execute();


  #ifdef INDSOFT
    //If we use individual softening we need to get the max softening value
    //to be broadcasted during the exchange of the LET boundaries.
    //Only copy the root node that contains the max value
    my_dev::dev_stream memCpyStream;
    tree.multipole.d2h(3, false, memCpyStream.s());
  #endif


  //Set the group properties, note that it is not based on the nodes anymore
  //but on self created groups based on particle order setPHGroupData
  copyNodeDataToGroupData.set_arg<int>(0,    &tree.n_groups);
  copyNodeDataToGroupData.set_arg<int>(1,    &tree.n);
  copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.bodies_Ppos.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_list_test.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.groupCenterInfo.p());
  copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.groupSizeInfo.p());
  copyNodeDataToGroupData.setWork(-1, NCRIT, tree.n_groups);
  copyNodeDataToGroupData.printWorkSize();
  copyNodeDataToGroupData.execute();

  #ifdef INDSOFT
    //Wait for the copy started above before reading the host side value
    memCpyStream.sync();
    this->maxLocalEps = tree.multipole[0*3 + 1].w; //Softening value
  #endif

  //Get the local domain boundary based on group positions and sizes
  real4 r_min, r_max;
  getBoundariesGroups(tree, r_min, r_max);

  #if 0
    //Write the tree structure to file

    string nodeFileName = "fullTreeStructure.txt";
    char fileName[256];
    sprintf(fileName, "fullTreeStructure-%d.txt", mpiGetRank());
    ofstream nodeFile;
    //nodeFile.open(nodeFileName.c_str());
    nodeFile.open(fileName);
    tree.multipole.d2h();
    tree.boxSizeInfo.d2h();
    tree.boxCenterInfo.d2h();

    for(int i=0; i < tree.n_nodes; i++)
    {
      //nodeFile << i << "\t" << tree.boxCenterInfo[i].x << "\t" << tree.boxCenterInfo[i].y;
      //nodeFile << "\t" << 2*tree.boxSizeInfo[i].x << "\t" << 2*tree.boxSizeInfo[i].y << "\t";

      nodeFile << i << "\t" << tree.boxCenterInfo[i].x-tree.boxSizeInfo[i].x << "\t" << tree.boxCenterInfo[i].y-tree.boxSizeInfo[i].y;
      nodeFile << "\t"      << tree.boxCenterInfo[i].x+tree.boxSizeInfo[i].x << "\t" << tree.boxCenterInfo[i].y+tree.boxSizeInfo[i].y << "\t";


      nodeFile << tree.multipole[i*3+0].x << "\t" << tree.multipole[i*3+0].w << "\n";
    }

    nodeFile.close();


    sprintf(fileName, "fullTreeStructureParticles-%d.txt", mpiGetRank());
    ofstream partFile;
    partFile.open(fileName);
    tree.bodies_pos.d2h();

    for(int i=0; i < tree.n; i++)
    {
      float4  pos =  tree.bodies_pos[i];
      partFile << i << "\t" << pos.x << "\t" << pos.y << "\t" << pos.z << endl;
    }
    partFile.close();



  #endif

}
Esempio n. 2
0
/**
 * Computes the keys of the given bodies, sorts them on the device and
 * produces the permutation needed to reorder the other particle properties.
 *
 * Three scratch arrays of n_bodies uint4 each are laid over
 * tree.generalBuffer1 at uint offsets 0, 4*n_bodies and 8*n_bodies. The last
 * region is also what gpuSort() uses for its own scratch arrays, so the
 * 64-bit keys stored there are converted before the sort and rewritten by
 * extractKeyAndPerm after it.
 *
 * @param tree             Tree providing generalBuffer1 as scratch space.
 * @param bodies_pos       Body positions, assumed to be on the device already.
 * @param sortPermutation  Passed to extractKeyAndPerm to receive the
 *                         permutation.
 * @param n_bodies         Number of bodies to sort.
 */
void octree::sort_bodies(tree_structure &tree, my_dev::dev_mem<float4>  &bodies_pos, 
                         my_dev::dev_mem<uint>  &sortPermutation, int n_bodies) {

  //Positions are expected to reside on the device at this point
  devContext.startTiming();

  this->allocateParticleSpecificBuffers(n_bodies);

  //gpuSort works on uint4 records (room for a 96 bit key), so the 64 bit
  //keys are widened before sorting and narrowed again afterwards
  my_dev::dev_mem<uint4>  wideKeys(devContext);
  my_dev::dev_mem<uint4>  sortedWideKeys(devContext);
  my_dev::dev_mem<uint4>  keys64(devContext);

  //Start positions (in uint) of the three regions inside generalBuffer1,
  //which holds 3*N uint4 = 12*N uint by default
  const int sortedStart = 4*n_bodies;
  const int keysStart   = 8*n_bodies;

  //Region 1: [0, 4*N)
  wideKeys.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                        tree.generalBuffer1.get_flags(),
                        tree.generalBuffer1.get_devMem(),
                        &tree.generalBuffer1[0], 0,
                        n_bodies, getAllignmentOffset(0));

  //Region 2: [4*N, 8*N)
  sortedWideKeys.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                              tree.generalBuffer1.get_flags(),
                              tree.generalBuffer1.get_devMem(),
                              &tree.generalBuffer1[sortedStart], sortedStart,
                              n_bodies, getAllignmentOffset(sortedStart));

  //Padding inserted before region 2, carried over to region 3
  int paddingSoFar = getAllignmentOffset(sortedStart);

  //Region 3: [8*N, 12*N)
  keys64.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                      tree.generalBuffer1.get_flags(),
                      tree.generalBuffer1.get_devMem(),
                      &tree.generalBuffer1[keysStart], keysStart,
                      n_bodies,
                      paddingSoFar + getAllignmentOffset(keysStart + paddingSoFar));

  //Key computation lives in its own function because it is also used
  //before the tree structure is built. The corner and domain factor it
  //reports are discarded: sorting does not require a tree.
  float4 discardedCorner;
  float  discardedDomainFac;
  compute_keys(bodies_pos, keys64, n_bodies, discardedCorner, discardedDomainFac);

  //Widen the keys into the uint4 records used by the sort
  convertKey64to96.set_arg<cl_mem>(0, keys64.p());
  convertKey64to96.set_arg<cl_mem>(1, wideKeys.p());
  convertKey64to96.set_arg<int>(2,    &n_bodies);
  convertKey64to96.setWork(n_bodies, 256);
  convertKey64to96.execute();

  //Sort on three 32 bit words. The same array is passed as source and as
  //intermediate buffer, which means the source values get overwritten.
  gpuSort(devContext, wideKeys, sortedWideKeys, wideKeys, n_bodies, 32, 3, tree);

  //Narrow the sorted records back into keys and obtain the permutation
  //used to reorder the remaining particle properties
  extractKeyAndPerm.set_arg<cl_mem>(0, sortedWideKeys.p());
  extractKeyAndPerm.set_arg<cl_mem>(1, keys64.p());
  extractKeyAndPerm.set_arg<cl_mem>(2, sortPermutation.p());
  extractKeyAndPerm.set_arg<int>(3,    &n_bodies);
  extractKeyAndPerm.setWork(n_bodies, 256);
  extractKeyAndPerm.execute();

  devContext.stopTiming("Sorting", 0);
}
Esempio n. 3
0
  extractKeyAndPerm.set_arg<int>(3,      &n_bodies);
  extractKeyAndPerm.setWork(n_bodies, 256);
  extractKeyAndPerm.execute();  
  
  devContext.stopTiming("Sorting", 0);  
}

#if 0
  //Use calloc here so valgrind does not complain about uninitialized values
  my_dev::dev_mem<real4>  real4Buffer(devContext);

  real4Buffer.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                         tree.generalBuffer1.get_flags(), 
                         tree.generalBuffer1.get_devMem(),
                         &tree.generalBuffer1[4*tree.n], 4*tree.n, 
                         tree.n, getAllignmentOffset(4*tree.n));      
  


  //Call the reorder data function

  //For the position
  dataReorderR4.set_arg<int>(0,      &tree.n);
  dataReorderR4.set_arg<cl_mem>(1,   tree.bodies_pos.p());
  dataReorderR4.set_arg<cl_mem>(2,   real4Buffer.p());
  dataReorderR4.set_arg<cl_mem>(3,   sortPermutation.p()); 
  
  dataReorderR4.setWork(tree.n, 256);  
  devContext.startTiming();

  dataReorderR4.execute();  
Esempio n. 4
0
/**
 * Sorts N uint4 records on the device by running one 32-bit key sort per
 * selected word of the record.
 *
 * The word with index subItems-1 is processed first and word 0 last. After
 * every pass the full records are reordered with reOrderKeysValues, moving
 * between srcValues, output and buffer:
 *   - subItems == 1 : srcValues -> output
 *   - subItems == 2 : srcValues -> output -> buffer, then copied to output
 *   - otherwise     : srcValues -> output -> buffer -> output
 * If srcValues and buffer are different arrays the original values are
 * preserved, if they are the same array srcValues is overwritten.
 *
 * Four scratch arrays of N 32-bit elements each are laid over
 * tree.generalBuffer1, starting at uint offsets 8*N, 9*N, 10*N and 11*N.
 *
 * @param devContext    Device context used for the scratch arrays, the
 *                      32-bit sort and the final clFinish.
 * @param srcValues     Records to sort.
 * @param output        Receives the sorted records.
 * @param buffer        Intermediate array for the second pass.
 * @param N             Number of records.
 * @param numberOfBits  Not used by this function.
 * @param subItems      Number of 32-bit words that make up the sort key.
 * @param tree          Tree providing generalBuffer1 as scratch space.
 */
void  octree::gpuSort(my_dev::context &devContext,
                      my_dev::dev_mem<uint4> &srcValues,
                      my_dev::dev_mem<uint4> &output,
                      my_dev::dev_mem<uint4> &buffer,
                      int N, int numberOfBits, int subItems,
                      tree_structure &tree) {

  //Scratch arrays, all mapped onto generalBuffer1 instead of being
  //allocated separately
  my_dev::dev_mem<uint> keyWords(devContext);        //One 32 bit word per record
  my_dev::dev_mem<uint> indexSequence(devContext);   //Index sequence that is sorted along with the words
  my_dev::dev_mem<int>  sortedKeyWords(devContext);  //Key output of the 32 bit sort
  my_dev::dev_mem<uint> sortedIndices(devContext);   //Value output of the 32 bit sort

  //Start positions in uint. The first 8*N uint hold two uint4 arrays.
  const int keyWordsStart       = 8*N;
  const int indexSequenceStart  = 9*N;
  const int sortedKeyWordsStart = 10*N;
  const int sortedIndicesStart  = 11*N;

  //Running sum of the alignment padding, starting with the padding of the
  //array at offset 4*N
  int paddingSum = getAllignmentOffset(4*N);

  keyWords.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                        tree.generalBuffer1.get_flags(),
                        tree.generalBuffer1.get_devMem(),
                        &tree.generalBuffer1[keyWordsStart], keyWordsStart,
                        N, paddingSum + getAllignmentOffset(keyWordsStart + paddingSum));

  paddingSum += getAllignmentOffset(keyWordsStart + paddingSum);

  //N elements after keyWords
  indexSequence.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                             tree.generalBuffer1.get_flags(),
                             tree.generalBuffer1.get_devMem(),
                             &tree.generalBuffer1[indexSequenceStart], indexSequenceStart,
                             N, paddingSum + getAllignmentOffset(indexSequenceStart + paddingSum));

  paddingSum += getAllignmentOffset(indexSequenceStart + paddingSum);

  //N elements after indexSequence
  sortedKeyWords.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                              tree.generalBuffer1.get_flags(),
                              tree.generalBuffer1.get_devMem(),
                              &tree.generalBuffer1[sortedKeyWordsStart], sortedKeyWordsStart,
                              N, paddingSum + getAllignmentOffset(sortedKeyWordsStart + paddingSum));

  paddingSum += getAllignmentOffset(sortedKeyWordsStart + paddingSum);

  //N elements after sortedKeyWords
  sortedIndices.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                             tree.generalBuffer1.get_flags(),
                             tree.generalBuffer1.get_devMem(),
                             &tree.generalBuffer1[sortedIndicesStart], sortedIndicesStart,
                             N, paddingSum + getAllignmentOffset(sortedIndicesStart + paddingSum));

  //Launch configuration shared by the extract / fill / reorder kernels:
  //the blocks are spread over a two dimensional grid
  const int threadsPerBlock = 256;
  int blockCount = (N)/threadsPerBlock + 1;
  int gridX      = (int)sqrt(blockCount);
  int gridY      = (blockCount-1)/gridX + 1;

  vector<size_t> localSize(2), globalSize(2);
  globalSize[0] = gridX*threadsPerBlock;
  globalSize[1] = gridY;
  localSize[0]  = threadsPerBlock;
  localSize[1]  = 1;

  extractInt.setWork(globalSize, localSize);
  fillSequence.setWork(globalSize, localSize);
  reOrderKeysValues.setWork(globalSize, localSize);

  //First pass uses the highest selected word:
  //  subItems = 3 -> word 2, subItems = 2 -> word 1, subItems = 1 -> word 0
  int wordIdx = subItems-1;

  extractInt.set_arg<cl_mem>(0, srcValues.p());
  extractInt.set_arg<cl_mem>(1, keyWords.p());
  extractInt.set_arg<uint>(2, &N);
  extractInt.set_arg<int>(3, &wordIdx);

  fillSequence.set_arg<cl_mem>(0, indexSequence.p());
  fillSequence.set_arg<uint>(1, &N);

  reOrderKeysValues.set_arg<cl_mem>(0, srcValues.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());
  reOrderKeysValues.set_arg<cl_mem>(2, sortedIndices.p());
  reOrderKeysValues.set_arg<uint>(3, &N);

  extractInt.execute();
  fillSequence.execute();

  //32 bit sort with keys and values in separate arrays
  gpuSort_32b(devContext,
              keyWords, indexSequence,
              sortedKeyWords, keyWords,
              sortedIndices, indexSequence,
              N, 32);

  //Reorder the full records: srcValues -> output
  reOrderKeysValues.execute();

  if(subItems == 1)
  {
    //A single pass was requested and the result already sits in output
    return;
  }


  //Second pass:
  //  subItems = 3 -> word 1, subItems = 2 -> word 0
  wordIdx = subItems-2;

  extractInt.set_arg<cl_mem>(0, output.p());
  extractInt.set_arg<int>(3, &wordIdx);

  reOrderKeysValues.set_arg<cl_mem>(0, output.p());
  reOrderKeysValues.set_arg<cl_mem>(1, buffer.p());

  extractInt.execute();

  fillSequence.execute();

  gpuSort_32b(devContext,
              keyWords, indexSequence,
              sortedKeyWords, keyWords,
              sortedIndices, indexSequence,
              N, 32);

  //Reorder the full records: output -> buffer
  reOrderKeysValues.execute();


  if(subItems == 2)
  {
    //Two passes were requested. The result sits in buffer, so it is
    //copied over to output.
    output.copy(buffer, buffer.get_size());
    return;
  }

  //Third pass always uses word 0
  wordIdx = 0;

  extractInt.set_arg<cl_mem>(0, buffer.p());
  extractInt.set_arg<int>(3, &wordIdx);

  reOrderKeysValues.set_arg<cl_mem>(0, buffer.p());
  reOrderKeysValues.set_arg<cl_mem>(1, output.p());

  extractInt.execute();
  fillSequence.execute();

  gpuSort_32b(devContext,
              keyWords, indexSequence,
              sortedKeyWords, keyWords,
              sortedIndices, indexSequence,
              N, 32);

  //Reorder the full records: buffer -> output
  reOrderKeysValues.execute();

  //Only the three pass path waits for the command queue to finish
  clFinish(devContext.get_command_queue());
}
Esempio n. 5
0
// Builds the tree structure for the bodies in `tree`.
//
// Phases, in the order they appear below:
//   1. Map validList and compactList (tree.n*2 uint each) onto
//      tree.generalBuffer1.
//   2. Run build_key_list, then for every level run build_valid_list,
//      gpuCompact and build_nodes until a level yields no nodes or MAXLEVELS
//      is reached. tree.level_list[level] stores the node range of each level.
//   3. If MAXLEVELS was reached: set maxlevels_exceeded and return early.
//   4. Run link_tree, split leaf and non-leaf node ids with gpuSplit and
//      build tree.node_level_list (compressed on the host to end positions).
//   5. Define the particle groups (define_groups, gpuCompact, store_groups)
//      and size the active group lists.
//
// Several kernel arguments are bound to the locals level, validCount and
// offset and are set again right before the launches that use them.
void octree::build (tree_structure &tree) {

  int level      = 0;
  int validCount = 0;
  int offset     = 0;


  /******** load kernels **********/

  /******** create memory buffers **********/


  my_dev::dev_mem<uint>  validList(devContext);
  my_dev::dev_mem<uint>  compactList(devContext);
  
  //validList uses the first tree.n*2 uint of the general buffer
  validList.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                                    tree.generalBuffer1.get_flags(), 
                                    tree.generalBuffer1.get_devMem(),
                                    &tree.generalBuffer1[0], 0,
                                    tree.n*2, getAllignmentOffset(0));
  validList.zeroMem(); 
                                    
  //compactList uses the tree.n*2 uint that follow validList
  compactList.cmalloc_copy(tree.generalBuffer1.get_pinned(), 
                                    tree.generalBuffer1.get_flags(), 
                                    tree.generalBuffer1.get_devMem(),
                                    &tree.generalBuffer1[tree.n*2], tree.n*2,
                                    tree.n*2, getAllignmentOffset(tree.n*2));
                                                                        

  
  /******** set kernels parameters **********/
  

  build_key_list.set_arg<cl_mem>(0,   tree.bodies_key.p());
  build_key_list.set_arg<cl_mem>(1,   tree.bodies_Ppos.p());
  build_key_list.set_arg<int>(2,      &tree.n);
  build_key_list.set_arg<real4>(3,    &tree.corner);
  build_key_list.setWork(tree.n, 128);
  
  build_valid_list.set_arg<int>(0, &tree.n);
  build_valid_list.set_arg<int>(1, &level);
  build_valid_list.set_arg<cl_mem>(2,  tree.bodies_key.p());
  build_valid_list.set_arg<cl_mem>(3,  validList.p());  
  build_valid_list.setWork(tree.n, 128);
  


  build_nodes.set_arg<int>(0,     &level);
  build_nodes.set_arg<int>(1,     &validCount);
  build_nodes.set_arg<int>(2,     &offset);
  build_nodes.set_arg<cl_mem>(3,  compactList.p());
  build_nodes.set_arg<cl_mem>(4,  tree.bodies_key.p());
  build_nodes.set_arg<cl_mem>(5,  tree.node_key.p());
  build_nodes.set_arg<cl_mem>(6,  tree.n_children.p());
  build_nodes.set_arg<cl_mem>(7,  tree.node_bodies.p());

  link_tree.set_arg<int>(0,     &offset);
  link_tree.set_arg<cl_mem>(1,  tree.n_children.p());
  link_tree.set_arg<cl_mem>(2,  tree.node_bodies.p());
  link_tree.set_arg<cl_mem>(3,  tree.bodies_Ppos.p());
  link_tree.set_arg<real4>(4,   &tree.corner);
  link_tree.set_arg<cl_mem>(5,  tree.level_list.p());
  link_tree.set_arg<cl_mem>(6,  validList.p()); 
  link_tree.set_arg<cl_mem>(7,  tree.node_key.p());
  link_tree.set_arg<cl_mem>(8,  tree.bodies_key.p());
  link_tree.set_arg<int>(9,     &level);


  /********** build  list of keys ********/
  
  build_key_list.execute();  
  
  /******  build the levels *********/
  
  int nodeSum = 0;
  for (level = 0; level < MAXLEVELS; level++) {
    // mark bodies to be combined into nodes
    build_valid_list.set_arg<int>(1, &level);
    build_valid_list.execute();
      
    //gpuCompact to get number of created nodes    
    gpuCompact(devContext, validList, compactList, tree.n*2, &validCount);
                 
    //validCount counts two entries per node, hence the division by two
    nodeSum += validCount / 2;
    printf("ValidCount (%d): %d \tSum: %d Offset: %d\n", mpiGetRank(), validCount, nodeSum, offset);
    
    validCount /= 2;     
                  
    //No nodes on this level means the tree is complete
    if (validCount == 0) break;                 
      
    // assemble nodes           
    build_nodes.setWork(validCount, 128);
    build_nodes.set_arg<int>(0, &level);
    build_nodes.set_arg<int>(1, &validCount);
    build_nodes.set_arg<int>(2, &offset);    
    build_nodes.execute();
                 
    //Store the [start, end) node range of this level, offset is the
    //running total of nodes created so far
    tree.level_list[level] = (uint2){offset, offset + validCount};
    offset += validCount;

  } //end for lvl
  

  //Put the last level + 1 index to 0,0 
  //so we dont need an extra if statement in the linking phase
  //NOTE(review): when the loop above ran to completion, level == MAXLEVELS
  //here and this write happens before the MAXLEVELS check further down.
  //Confirm that level_list has at least MAXLEVELS+1 entries.
  tree.level_list[level] = (uint2){0, 0};
  tree.level_list.h2d();
    
  int n_nodes  = offset;
  tree.n_nodes = n_nodes;
  
 
  /***** Link the tree ******/
  
  link_tree.set_arg<int>(0, &offset);   //Offset=number of nodes
  link_tree.set_arg<int>(9, &level);   //level=highest number of levels
  
  //The maximum number of levels that can be used is MAXLEVELS.
  //If that limit is reached the build is aborted: maxlevels_exceeded is set
  //and the function returns without linking the tree (it does not exit).
  printf("Max level : %d \n", level);
  if(level >= MAXLEVELS)
  {
    cerr << "The tree has become too deep, the program will exit. \n";
    cerr << "Consider the removal of far away particles to prevent a too large box. \n";
    maxlevels_exceeded = true;
    return;
    //exit(0);
  }
  
  link_tree.setWork(n_nodes, 128);
  printf("Link_tree: "); link_tree.printWorkSize();
  
  tree.n_levels = level-1;

  //Print the node range of every level
  for(int i=0; i < level; i++)
    printf("%d\t%d\t%d\n", i, tree.level_list[i].x, tree.level_list[i].y);
 
  //Link the tree      
  link_tree.execute();
  

  //After executing link_tree, the id_list contains for each node
  //the ID of its parent.
  //Valid_list contains for each node if its a leaf (valid) or a normal
  //node -> non_valid
  //Execute a split on the validList to get separate id lists 
  //for the leafs and nodes. Used when computing multipoles
    
  //NOTE(review): leafNodeIdx, node_level_list and group_list_test are
  //cmalloc'ed unconditionally on every call, whereas active_group_list below
  //uses cresize when it already has a size. Confirm that cmalloc copes with
  //a buffer that is already allocated.
  tree.leafNodeIdx.cmalloc(tree.n_nodes , false);
  
  //Split the leaf ids and non-leaf node ids
  gpuSplit(devContext, validList, tree.leafNodeIdx, tree.n_nodes, &tree.n_leafs);     
                 
  printf("Total nodes: %d N_leafs: %d  non-leafs: %d \n", tree.n_nodes, tree.n_leafs, tree.n_nodes - tree.n_leafs);
  

  build_level_list.set_arg<int>(0, &tree.n_nodes);
  build_level_list.set_arg<int>(1, &tree.n_leafs);
  build_level_list.set_arg<cl_mem>(2, tree.leafNodeIdx.p());
  build_level_list.set_arg<cl_mem>(3, tree.node_bodies.p());
  build_level_list.set_arg<cl_mem>(4, validList.p());  
  build_level_list.setWork(tree.n_nodes-tree.n_leafs, 128);
  
  //validList is reused as the output of build_level_list, so clear it first
  validList.zeroMem();  

  //Build the level list based on the leafIdx list
  //required for easy access in the compute node properties
  build_level_list.execute();  

  tree.node_level_list.cmalloc(level*2 , false);

  //Receives the number of entries that gpuCompact writes to node_level_list
  int levelThing;
  
  gpuCompact(devContext, validList, tree.node_level_list, 
             2*(tree.n_nodes-tree.n_leafs), &levelThing);             
  
  tree.node_level_list.d2h();
  
  //We only care about end positions, so compress the list:
  //keep every second entry (host side, in place)
  int j=0;
  for(int i=0; i < levelThing; i+=2, j++)
    tree.node_level_list[j] = tree.node_level_list[i];
  
  //NOTE(review): if levelThing is 0 (no entries compacted) the index
  //levelThing-1 is -1. Confirm gpuCompact always yields at least one entry.
  tree.node_level_list[j] =tree.node_level_list[levelThing-1]+1; //Add 1 to make it the end position
  levelThing = j+1;
  tree.node_level_list.h2d();
  
  printf("Finished level list \n");
  
  for(int i=0; i < levelThing; i++)
  {
    printf("node_level_list: %d \t%d\n", i, tree.node_level_list[i]);
  }
  
  ///******   Start building the particle groups *******///////

  //Compute the box size, the max length of one of the sides of the rectangle
  //(size is only used in the diagnostic print below)
  real size     = fmax(fabs(rMaxLocalTree.z - rMinLocalTree.z), 
                  fmax(fabs(rMaxLocalTree.y - rMinLocalTree.y),
                       fabs(rMaxLocalTree.x - rMinLocalTree.x)));
  //Squared length of the diagonal of the local bounding box
  real dist     = ((rMaxLocalTree.z - rMinLocalTree.z) * (rMaxLocalTree.z - rMinLocalTree.z) + 
                   (rMaxLocalTree.y - rMinLocalTree.y) * (rMaxLocalTree.y - rMinLocalTree.y) +
                   (rMaxLocalTree.x - rMinLocalTree.x) * (rMaxLocalTree.x - rMinLocalTree.x));      
                   
  //One tenth of the box diagonal
  float maxDist = sqrt(dist) / 10;
  maxDist *= maxDist; //Square since we dont do sqrt on device
                       
  fprintf(stderr,"Box max size: %f en max dist: %f \t %f en %f  \n", size, dist, sqrt(dist), maxDist);
  
  //maxDist = 50;
  
  validList.zeroMem();
  //The newest group creation method!
  define_groups.set_arg<int>(0, &tree.n);  
  define_groups.set_arg<cl_mem>(1, validList.p());    
  define_groups.set_arg<cl_mem>(2, tree.bodies_Ppos.p());
  define_groups.set_arg<float>(3, &maxDist);     
  define_groups.setWork(tree.n, 128);  
  define_groups.execute();
  
  //gpuCompact    
  gpuCompact(devContext, validList, compactList, tree.n*2, &validCount);
  
  //As with the nodes, validCount holds two entries per group
  printf("Found number of groups: %d \n", validCount/2);

  tree.n_groups = validCount/2;
  //Now compact validList to get the list of group ids
  tree.group_list_test.cmalloc(tree.n_groups , false);  
  
  store_groups.set_arg<int>(0, &tree.n);  
  store_groups.set_arg<int>(1, &tree.n_groups);  
  store_groups.set_arg<cl_mem>(2, compactList.p());    
  store_groups.set_arg<cl_mem>(3, tree.body2group_list.p());     
  store_groups.set_arg<cl_mem>(4, tree.group_list_test.p());     
  store_groups.setWork(-1, NCRIT, tree.n_groups);  
  store_groups.execute();  

  //Memory allocation for the valid group lists:
  //resize when they already have a size, allocate otherwise
  if(tree.active_group_list.get_size() > 0)
  {
    tree.active_group_list.cresize(tree.n_groups, false);
    tree.activeGrpList.cresize(tree.n_groups, false);     
    
  }
  else
  {
    tree.active_group_list.cmalloc(tree.n_groups, false);
    tree.activeGrpList.cmalloc(tree.n_groups, false);     
  }


  printf("Tree built complete!\n");

  /*************************/

}
/**
 * Computes the properties of all tree nodes (multipoles and bounding boxes)
 * for the given body positions through a double-precision intermediate
 * buffer.
 *
 * Steps performed below:
 *   1. Carve multipoleD, nodeLowerBounds and nodeUpperBounds out of
 *      tree.generalBuffer1 (growing that buffer first when it is too small).
 *   2. Launch propsLeafD for the leaf nodes.
 *   3. Launch propsNonLeafD once per level, from tree.n_levels down to 1.
 *   4. Launch propsScalingD, which receives tree.multipole, tree.boxSizeInfo
 *      and tree.boxCenterInfo (presumably as its outputs).
 * The group-data and boundary steps are compiled out (#if 0).
 *
 * @param tree        Tree whose node properties are computed. Its
 *                    generalBuffer1 is used as scratch space and may be
 *                    resized.
 * @param bodies_pos  Body positions handed to the leaf kernel.
 * @param n_bodies    Not used by this function.
 */
void octree::compute_properties(tree_structure &tree,  my_dev::dev_mem<float4>  &bodies_pos, int n_bodies) {

    /*****************************************************
      Assign the memory buffers, note that we check the size first
      and if needed we increase the size of the generalBuffer1
      Size required:
        - multipoleD -> double4*3*n_nodes -> 2*3*n_nodes*uint4
        - lower/upperbounds ->               2*n_nodes*uint4
        - node lower/upper  ->               2*n_nodes*uint4
        - SUM:                               10*n_nodes*uint4
        - generalBuffer1 has default size: 3*N*uint4

      check if 10*n_nodes > 3*N, if so realloc
      (Note that generalBuffer might be larger because of tree-walk stack)
     *****************************************************/

    if(10*tree.n_nodes > 3*tree.n)
    {
#ifdef _DEBUG_PRINT_
        fprintf(stderr, "Resizeing the generalBuffer1 \n");
#endif
        //The size is expressed in uint (like the offsets below)
        tree.generalBuffer1.cresize(10*tree.n_nodes*4, false);
    }

    my_dev::dev_mem<double4> multipoleD(devContext);
    my_dev::dev_mem<real4>  nodeLowerBounds(devContext); //Lower bounds used for scaling? TODO
    my_dev::dev_mem<real4>  nodeUpperBounds(devContext); //Upper bounds used for scaling? TODO

    //multipoleD occupies the start of the general buffer
    multipoleD.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                            tree.generalBuffer1.get_flags(),
                            tree.generalBuffer1.get_devMem(),
                            &tree.generalBuffer1[0], 0,
                            3*tree.n_nodes, getAllignmentOffset(0));

    //Offset is in uint, so: double4 = 8uint*3*n_nodes
    nodeLowerBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                                 tree.generalBuffer1.get_flags(),
                                 tree.generalBuffer1.get_devMem(),
                                 &tree.generalBuffer1[8*3*tree.n_nodes],  8*3*tree.n_nodes,
                                 tree.n_nodes, getAllignmentOffset(8*3*tree.n_nodes));

    //Padding already inserted in front of nodeLowerBounds, it has to be
    //carried over to every buffer that follows
    int prevOffsetSum = getAllignmentOffset(8*3*tree.n_nodes);

    nodeUpperBounds.cmalloc_copy(tree.generalBuffer1.get_pinned(),
                                 tree.generalBuffer1.get_flags(),
                                 tree.generalBuffer1.get_devMem(),
                                 &tree.generalBuffer1[8*3*tree.n_nodes + 4*tree.n_nodes],
                                 8*3*tree.n_nodes + 4*tree.n_nodes,
                                 tree.n_nodes,
                                 prevOffsetSum + getAllignmentOffset(8*3*tree.n_nodes + 4*tree.n_nodes + prevOffsetSum));

    //Computes the tree-properties (size, cm, monopole, quadropole, etc)
    //start the kernel for the leaf-type nodes
    propsLeafD.set_arg<int>(0,    &tree.n_leafs);
    propsLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
    propsLeafD.set_arg<cl_mem>(2, tree.node_bodies.p());
    propsLeafD.set_arg<cl_mem>(3, bodies_pos.p());
    propsLeafD.set_arg<cl_mem>(4, multipoleD.p());
    propsLeafD.set_arg<cl_mem>(5, nodeLowerBounds.p());
    propsLeafD.set_arg<cl_mem>(6, nodeUpperBounds.p());
//   propsLeafD.set_arg<cl_mem>(7, tree.bodies_Pvel.p());  //Velocity to get max eps

    propsLeafD.setWork(tree.n_leafs, 128);
#ifdef _DEBUG_PRINT_
    printf("PropsLeaf: ");
    propsLeafD.printWorkSize();
#endif
    propsLeafD.execute();


    //Argument 0 is overwritten with the level index before every launch in
    //the loop below
    int temp = tree.n_nodes-tree.n_leafs;
    propsNonLeafD.set_arg<int>(0,    &temp);
    propsNonLeafD.set_arg<cl_mem>(1, tree.leafNodeIdx.p());
    propsNonLeafD.set_arg<cl_mem>(2, tree.node_level_list.p());
    propsNonLeafD.set_arg<cl_mem>(3, tree.n_children.p());
    propsNonLeafD.set_arg<cl_mem>(4, multipoleD.p());
    propsNonLeafD.set_arg<cl_mem>(5, nodeLowerBounds.p());
    propsNonLeafD.set_arg<cl_mem>(6, nodeUpperBounds.p());

    //Work from the bottom up
    for(int i=tree.n_levels; i >= 1; i--)
    {
        //node_level_list holds the end position of each level, so the
        //difference of two consecutive entries is the node count of level i
        const int totalOnThisLevel = tree.node_level_list[i]-tree.node_level_list[i-1];

        propsNonLeafD.setWork(totalOnThisLevel, 128);

#ifdef _DEBUG_PRINT_
        printf("PropsNonLeaf, nodes on level %d : %d (start: %d end: %d) , config: \t", i, totalOnThisLevel,
               tree.node_level_list[i-1], tree.node_level_list[i]);
        //Kept inside the debug guard together with its label, like the
        //other printWorkSize calls in this function
        propsNonLeafD.printWorkSize();
#endif
        propsNonLeafD.set_arg<int>(0,    &i); //set the level
        propsNonLeafD.execute();
    }

    propsScalingD.set_arg<int>(0,    &tree.n_nodes);
    propsScalingD.set_arg<cl_mem>(1, multipoleD.p());
    propsScalingD.set_arg<cl_mem>(2, nodeLowerBounds.p());
    propsScalingD.set_arg<cl_mem>(3, nodeUpperBounds.p());
    propsScalingD.set_arg<cl_mem>(4, tree.n_children.p());
    propsScalingD.set_arg<cl_mem>(5, tree.multipole.p());
    propsScalingD.set_arg<float >(6, &theta);
    propsScalingD.set_arg<cl_mem>(7, tree.boxSizeInfo.p());
    propsScalingD.set_arg<cl_mem>(8, tree.boxCenterInfo.p());
    propsScalingD.set_arg<cl_mem>(9, tree.node_bodies.p());

    propsScalingD.setWork(tree.n_nodes, 128);
#ifdef _DEBUG_PRINT_
    printf("propsScaling: \t ");
    propsScalingD.printWorkSize();
#endif
    propsScalingD.execute();


#if 0
#ifdef INDSOFT
    //If we use individual softening we need to get the max softening value
    //to be broadcasted during the exchange of the LET boundaries.
    //Only copy the root node that contains the max value
    my_dev::dev_stream memCpyStream;
    tree.multipole.d2h(3, false, memCpyStream.s());
#endif


    //Set the group properties, note that it is not based on the nodes anymore
    //but on self created groups based on particle order setPHGroupData
    copyNodeDataToGroupData.set_arg<int>(0,    &tree.n_groups);
    copyNodeDataToGroupData.set_arg<int>(1,    &tree.n);
    copyNodeDataToGroupData.set_arg<cl_mem>(2, tree.bodies_Ppos.p());
    copyNodeDataToGroupData.set_arg<cl_mem>(3, tree.group_list_test.p());
    copyNodeDataToGroupData.set_arg<cl_mem>(4, tree.groupCenterInfo.p());
    copyNodeDataToGroupData.set_arg<cl_mem>(5, tree.groupSizeInfo.p());
    copyNodeDataToGroupData.setWork(-1, NCRIT, tree.n_groups);
    copyNodeDataToGroupData.printWorkSize();
    copyNodeDataToGroupData.execute();

#ifdef INDSOFT
    memCpyStream.sync();
    this->maxLocalEps = tree.multipole[0*3 + 1].w; //Softening value
#else
#endif

    //Get the local domain boundary based on group positions and sizes
    real4 r_min, r_max;
    getBoundariesGroups(tree, r_min, r_max);

#endif

}