/*
 * Tree barrier, "pull up / push down" flavour.
 *
 * Arrival: spin on the current-set arrival flag of every thread listed in
 * handle->barrier_children, then raise this thread's own arrival flag.
 * Release: every thread other than handle->barrier_root spins on its own
 * release flag (index 2+set) and lowers it once seen; each thread then raises
 * the release flag of all of its children.
 * handle->barrier_flag_set is toggled before returning, so consecutive calls
 * alternate between the two flag sets.  The flags argument is not consulted.
 */
void smp_coll_barrier_tree_pull_push(smp_coll_t handle, int flags) {
  const int cur_set = handle->barrier_flag_set;
  int child = 0;

  gasnett_local_wmb();

  /* arrival: poll each child's arrival flag for this set, one child at a time */
  while (child < handle->barrier_num_children) {
    gasneti_waitwhile(SMP_COLL_GET_BARRIER_FLAG(handle, handle->barrier_children[child], cur_set)==0);
    ++child;
  }

  /* retire my arrival flag that belongs to the other flag set */
  SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, !cur_set, 0);

  /* publish my arrival for this flag set */
  SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, cur_set, 1);

  if (handle->barrier_root != handle->MYTHREAD) {
    /* spin until my release flag is raised (presumably by my parent's release
       loop below), then lower it again */
    gasneti_waitwhile(SMP_COLL_GET_BARRIER_FLAG(handle, handle->MYTHREAD, 2+cur_set)==0);
    SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, 2+cur_set, 0);
  }

  /* release: raise the release flag of every child */
  child = 0;
  while (child < handle->barrier_num_children) {
    SMP_COLL_SET_BARRIER_FLAG(handle, handle->barrier_children[child], 2+cur_set, 1);
    ++child;
  }

  /* switch flag sets for the next call */
  handle->barrier_flag_set = !handle->barrier_flag_set;
  gasnett_local_rmb();
}
/*
 * Tree barrier, "pull up / pull down" flavour.
 *
 * Arrival: spin on the current-set arrival flag of every thread listed in
 * handle->barrier_children, then raise this thread's own arrival flag.
 * Release: every thread other than handle->barrier_root spins on the release
 * flag (index 2+set) stored for handle->barrier_parent; afterwards each thread
 * lowers its own release flag of the other set and raises its own release flag
 * of the current set.
 * handle->barrier_flag_set is toggled before returning.  The flags argument
 * is not consulted.
 */
void smp_coll_barrier_tree_pull_pull(smp_coll_t handle, int flags) {
  const int cur_set = handle->barrier_flag_set;
  int child = 0;

  gasnett_local_wmb();

  /* arrival: poll each child's arrival flag for this set */
  while (child < handle->barrier_num_children) {
    gasneti_waitwhile(SMP_COLL_GET_BARRIER_FLAG(handle, handle->barrier_children[child], cur_set)==0);
    ++child;
  }

  /* retire my arrival flag of the other set, then publish the one for this set */
  SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, !cur_set, 0);
  SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, cur_set, 1);

  /* release: everyone except the root polls the parent's release flag */
  if (handle->barrier_root != handle->MYTHREAD) {
    gasneti_waitwhile(SMP_COLL_GET_BARRIER_FLAG(handle, handle->barrier_parent, 2+cur_set)==0);
  }

  /* retire my release flag left over from the other set */
  SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, 2+(!cur_set), 0);

  /* raise my release flag for this set (the flag that threads naming me as
     barrier_parent poll above) */
  SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, 2+cur_set, 1);

  /* switch flag sets for the next call */
  handle->barrier_flag_set = !handle->barrier_flag_set;
  gasnett_local_rmb();
}
/*
 * Tree barrier, "push up / push down" flavour: each thread signals by writing
 * into state owned by the thread it wants to notify.
 *
 * Arrival: spin until this thread's atomic counter 0 (current atomic set)
 * equals handle->barrier_num_children, then reset it.  Every thread other than
 * handle->barrier_root increments counter 0 of handle->barrier_parent.
 * Release: non-root threads spin on their own current-set flag and lower it
 * once seen; each thread then raises that flag on all of its children.
 * Both handle->barrier_flag_set and handle->curr_atomic_set are toggled before
 * returning.  The flags argument is not consulted.
 */
void smp_coll_barrier_tree_push_push(smp_coll_t handle, int flags) {
  const int cur_flag_set = handle->barrier_flag_set;
  const int cur_atomic_set = handle->curr_atomic_set;
  int child = 0;

  gasnett_local_wmb();

  /* arrival: wait until every child has bumped my counter, then clear it */
  gasneti_waitwhile(SMP_COLL_READ_ATOMIC(handle, handle->MYTHREAD, 0, cur_atomic_set)!=handle->barrier_num_children);
  SMP_COLL_RESET_ATOMIC(handle, handle->MYTHREAD, 0, cur_atomic_set);

  if (handle->barrier_root != handle->MYTHREAD) {
    /* report arrival to the parent ... */
    SMP_COLL_INC_ATOMIC(handle, handle->barrier_parent, 0, cur_atomic_set);
    /* ... then spin until my own flag is raised and lower it again */
    gasneti_waitwhile(SMP_COLL_GET_BARRIER_FLAG(handle, handle->MYTHREAD, cur_flag_set)==0);
    SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, cur_flag_set, 0);
  }

  /* release: raise the flag of every child */
  while (child < handle->barrier_num_children) {
    SMP_COLL_SET_BARRIER_FLAG(handle, handle->barrier_children[child], cur_flag_set, 1);
    ++child;
  }

  /* switch flag and counter sets for the next call */
  handle->barrier_flag_set = !handle->barrier_flag_set;
  handle->curr_atomic_set = !handle->curr_atomic_set;
  gasnett_local_rmb();
}
/*
 * Tree barrier, "push up / pull down" flavour.
 *
 * Arrival: spin until this thread's atomic counter 0 (current atomic set)
 * equals handle->barrier_num_children, then reset it.  Every thread other than
 * handle->barrier_root increments counter 0 of handle->barrier_parent.
 * Release: non-root threads spin on the current-set flag stored for
 * handle->barrier_parent.  Each thread then writes 0 to its own current-set
 * flag, 0 to its own other-set flag, and finally 1 to its own current-set
 * flag.
 * Both handle->barrier_flag_set and handle->curr_atomic_set are toggled before
 * returning.  The flags argument is not consulted.
 */
void smp_coll_barrier_tree_push_pull(smp_coll_t handle, int flags) {
  const int cur_set = handle->barrier_flag_set;

  gasnett_local_wmb();

  /* arrival: wait until every child has bumped my counter, then clear it */
  gasneti_waitwhile(SMP_COLL_READ_ATOMIC(handle, handle->MYTHREAD, 0, handle->curr_atomic_set)!=handle->barrier_num_children);
  SMP_COLL_RESET_ATOMIC(handle, handle->MYTHREAD, 0, handle->curr_atomic_set);

  if (handle->barrier_root != handle->MYTHREAD) {
    /* report arrival to the parent, then poll the parent's flag for this set */
    SMP_COLL_INC_ATOMIC(handle, handle->barrier_parent, 0, handle->curr_atomic_set);
    gasneti_waitwhile(SMP_COLL_GET_BARRIER_FLAG(handle, handle->barrier_parent, cur_set)==0);
  }

  /* lower my flag for this set before it is raised again below */
  SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, cur_set, 0);

  /* lower my flag left over from the other set */
  SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, (!cur_set), 0);

  /* raise my flag for this set (the flag that threads naming me as
     barrier_parent poll above) */
  SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, cur_set, 1);

  /* switch flag and counter sets for the next call */
  handle->barrier_flag_set = !handle->barrier_flag_set;
  handle->curr_atomic_set = !handle->curr_atomic_set;
  gasnett_local_rmb();
}
/*
 * Barrier that uses a single flag (index 0) per thread for both directions.
 *
 * Up phase: for every digit position at which this thread is a "sender" (see
 * the test in the loop), spin until each in-range thread produced by
 * SMP_COLL_MAKE_NUM_POWER2RADIX has raised its flag 0; then raise our own.
 * Down phase: every thread except barrier_root spins until its own flag 0 has
 * been written back to 0, then writes 0 into flag 0 of the same set of threads
 * it waited on in the up phase.
 *
 * The radix parameters come from handle->barrier_radix,
 * handle->barrier_log_2_radix and handle->barrier_log_radix_THREADS.
 * The flags argument is not consulted; idx, j and parent are written but
 * never read in this function.
 */
void smp_coll_barrier_tree_flag(smp_coll_t handle, int flags) {
  int idx = 0;
  int num_digits = handle->barrier_log_radix_THREADS;
  int radixlog2 = handle->barrier_log_2_radix;
  int radix = handle->barrier_radix;
  int i,j,k;
  int parent=-1;
  
  gasnett_local_wmb();

  /* up phase: wait for every child to raise its flag */
  for(i=num_digits-1,j=0; i>=0; i--,j++) {
    /*if my i^th digit and all digits below it are 0, the threads obtained by
      replacing digit i with 1..radix-1 are handled by me at this level*/
    if(SMP_COLL_GET_ITH_DIGIT_POWER2RADIX(handle->MYTHREAD, i, radix, radixlog2)==0 && 
       SMP_COLL_GET_LOWER_K_DIGITS_POWER2RADIX(handle->MYTHREAD, i, radix, radixlog2)==0) {
      for(k=1;k<radix;k++) {
        int dest = SMP_COLL_MAKE_NUM_POWER2RADIX(handle->MYTHREAD, i, k, radix, radixlog2);
        /* skip ids that fall beyond the last thread */
        if(dest<handle->THREADS) {
          /*spin until dest has raised its flag 0*/
          gasneti_waitwhile(SMP_COLL_GET_BARRIER_FLAG(handle, dest, 0)==0);
        }
      }
    } 
  }
  
  /*raise my own flag: everything I waited on above has arrived*/
  SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, 0, 1);
  
  /* NOTE(review): barrier_root is used unqualified here, whereas the other
     barriers in this file read handle->barrier_root; no declaration is
     visible in this part of the file - confirm which symbol this resolves to. */
  if(handle->MYTHREAD!=barrier_root) {
    /*spin until my flag 0 is written back to 0 (done by the down-phase loop
      of another thread, presumably my parent)*/
    gasneti_waitwhile(SMP_COLL_GET_BARRIER_FLAG(handle, handle->MYTHREAD, 0)!=0);
  }

  /* down phase: release the same threads I waited on in the up phase */
  for(i=num_digits-1,j=0; i>=0; i--,j++) {
    /*same sender test as in the up phase*/
    if(SMP_COLL_GET_ITH_DIGIT_POWER2RADIX(handle->MYTHREAD, i, radix, radixlog2)==0 && 
       SMP_COLL_GET_LOWER_K_DIGITS_POWER2RADIX(handle->MYTHREAD, i, radix, radixlog2)==0) {
      for(k=1;k<radix;k++) {
        int dest = SMP_COLL_MAKE_NUM_POWER2RADIX(handle->MYTHREAD, i, k, radix, radixlog2);
        if(dest<handle->THREADS) {
	  /*NOTE(review): no write memory barrier is issued at this point; the
	    only wmb in this function is the one at entry*/

          /* clear dest's flag 0, which is what dest is spinning on */
          SMP_COLL_SET_BARRIER_FLAG(handle, dest, 0, 0);
        }
      }
    } 
  }
  gasnett_local_rmb();

}
/*
 * Barrier that signals up a radix tree with per-thread arrival flags and
 * releases downward by having each thread poll a release flag of its parent.
 *
 * Up phase: for every digit position at which this thread is a "sender" (see
 * the test in the loop), spin until each in-range thread produced by
 * SMP_COLL_MAKE_NUM_POWER2RADIX has raised its current-set arrival flag; then
 * raise our own.
 * Down phase: every thread except barrier_root spins on the release flag
 * (index 2+set) stored for `parent`, then lowers its own arrival flag, lowers
 * its own release flag of the other set and raises its own release flag of
 * the current set.
 * handle->barrier_flag_set is toggled before returning.  The flags argument
 * is not consulted.
 *
 * Like the other barrier variants in this file, the body is bracketed by
 * gasnett_local_wmb() on entry and gasnett_local_rmb() on exit so that memory
 * accesses on either side of the barrier are ordered against the flag
 * traffic.
 */
void smp_coll_barrier_flag_tree_up_tree_down(smp_coll_t handle, int flags) {
  int num_digits = handle->barrier_log_radix_THREADS;
  int radixlog2 = handle->barrier_log_2_radix;
  int radix = handle->barrier_radix;
  int i,k;

  int flagset = handle->barrier_flag_set;
  /* NOTE(review): this only rewrites digit 0 of my id.  For a thread whose
     digit 0 is already 0 this presumably yields the thread's own id rather
     than its tree parent (smp_coll_broadcast_tree_atomic instead clears the
     lowest non-zero digit) - confirm against the macro definition. */
  int parent = SMP_COLL_MAKE_NUM_POWER2RADIX(handle->MYTHREAD, 0, 0, radix, radixlog2);

  /* make writes issued before the barrier visible ahead of my arrival flag */
  gasnett_local_wmb();

  /* up phase: wait for every child to raise its arrival flag */
  for(i=num_digits-1; i>=0; i--) {
    /*if my i^th digit and all digits below it are 0, the threads obtained by
      replacing digit i with 1..radix-1 are handled by me at this level*/
    if(SMP_COLL_GET_ITH_DIGIT_POWER2RADIX(handle->MYTHREAD, i, radix, radixlog2)==0 && 
       SMP_COLL_GET_LOWER_K_DIGITS_POWER2RADIX(handle->MYTHREAD, i, radix, radixlog2)==0) {
      for(k=1;k<radix;k++) {
        int dest = SMP_COLL_MAKE_NUM_POWER2RADIX(handle->MYTHREAD, i, k, radix, radixlog2);
        /* skip ids that fall beyond the last thread */
        if(dest<handle->THREADS) {
          /*spin until dest has raised its arrival flag for this set*/
          gasneti_waitwhile(SMP_COLL_GET_BARRIER_FLAG(handle, dest, flagset)==0);
        }
      }
    } 
  }

  /*raise my arrival flag: everything I waited on above has arrived*/
  SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, flagset, 1);

  /* NOTE(review): barrier_root is used unqualified here, whereas the other
     barriers in this file read handle->barrier_root; no declaration is
     visible in this part of the file - confirm which symbol this resolves to. */
  /*wait for parent to raise its release flag*/
  if(handle->MYTHREAD!=barrier_root) {
    gasneti_waitwhile(SMP_COLL_GET_BARRIER_FLAG(handle, parent, 2+flagset)==0);
  }

  /*parent has now acked my signal so we can clear the up signal*/
  SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, flagset, 0);

  /*clear my down flag from the previous round*/
  SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, 2+(!flagset), 0);

  /*set my down flag for this round*/
  SMP_COLL_SET_BARRIER_FLAG(handle, handle->MYTHREAD, 2+flagset, 1);

  /* switch flag sets for the next call */
  handle->barrier_flag_set = !handle->barrier_flag_set;

  /* keep reads issued after the barrier from being satisfied early */
  gasnett_local_rmb();
}
/*
 * Broadcast nbytes starting at src into the per-thread buffers in dstlist,
 * forwarding the data along a radix tree and synchronising with one broadcast
 * flag (index 0) per thread.
 *
 * Each thread raises its own flag on entry.  Thread 0 copies src into
 * dstlist[0]; every other thread spins until its flag has been written back
 * to 0.  A thread then copies its own dstlist entry into the buffer of each
 * thread it is responsible for (see the sender test below), waiting for that
 * thread's flag to be raised first and clearing it afterwards.
 *
 * handle    - per-thread collective handle (MYTHREAD, THREADS are read)
 * num_addrs - not used by this implementation
 * dstlist   - destination buffers, indexed by thread id
 * src       - source buffer; only read by thread 0
 * nbytes    - number of bytes to broadcast
 * flags     - if SMP_COLL_ALL_SYNC is set, smp_coll_barrier() is called on
 *             entry and again on exit
 * radix     - tree radix; the *_POWER2RADIX macros suggest it must be a power
 *             of two (TODO confirm)
 */
void smp_coll_broadcast_tree_flag(smp_coll_t handle, int num_addrs, void * const dstlist[], const void *src, 
                                    size_t nbytes, int flags, int radix){
  const int ndigits = smp_coll_mylogn(handle->THREADS, radix);
  const int log2radix = smp_coll_mylogn(radix,2);
  int digit, k;

  if ((flags & SMP_COLL_ALL_SYNC)) smp_coll_barrier(handle, flags);

  /* announce arrival by raising my own flag */
  SMP_COLL_SET_BCAST_FLAG(handle, handle->MYTHREAD, 0, 1);

  /* the identity of my parent is not needed: I only watch my own flag */
  if (handle->MYTHREAD == 0) {
    /* thread 0 seeds its own destination buffer from src */
    memcpy(dstlist[0], src, nbytes);
  } else {
    /* spin until my flag is written back to 0, i.e. my buffer has been filled */
    gasneti_waitwhile(SMP_COLL_GET_BCAST_FLAG(handle, handle->MYTHREAD,0)!=0);
  }

  for (digit = ndigits-1; digit >= 0; digit--) {
    /* I forward at this level only when my digit here and all lower digits are 0 */
    if (SMP_COLL_GET_ITH_DIGIT_POWER2RADIX(handle->MYTHREAD, digit, radix, log2radix) != 0) continue;
    if (SMP_COLL_GET_LOWER_K_DIGITS_POWER2RADIX(handle->MYTHREAD, digit, radix, log2radix) != 0) continue;

    for (k = 1; k < radix; k++) {
      int dest = SMP_COLL_MAKE_NUM_POWER2RADIX(handle->MYTHREAD, digit, k, radix, log2radix);

      /* ids beyond the last thread do not exist */
      if (!(dest < handle->THREADS)) continue;

      /* wait until dest has raised its flag before touching its buffer */
      gasneti_waitwhile(SMP_COLL_GET_BCAST_FLAG(handle, dest, 0)==0);
      memcpy(dstlist[dest], dstlist[handle->MYTHREAD], nbytes);
      /* the payload must be written before the flag that releases dest */
      gasnett_local_wmb();
      SMP_COLL_SET_BCAST_FLAG(handle, dest, 0, 0);
    }
  }

  if ((flags & SMP_COLL_ALL_SYNC)) smp_coll_barrier(handle, flags);
}
/*
 * Broadcast nbytes starting at src into the per-thread buffers in dstlist,
 * forwarding the data along a radix tree and synchronising with per-thread
 * atomic counter 0 of the current atomic set.
 *
 * A thread whose id has at least one non-zero digit has a parent; it spins
 * until its counter reads 1 and decrements it again.  A thread with no
 * non-zero digit copies src into dstlist[0].  Each thread then copies its own
 * dstlist entry into the buffer of every thread it is responsible for (see
 * the sender test below) and increments that thread's counter.
 * handle->curr_atomic_set is toggled before returning.
 *
 * handle    - per-thread collective handle
 * num_addrs - not used by this implementation
 * dstlist   - destination buffers, indexed by thread id
 * src       - source buffer; only read by the thread without a parent
 * nbytes    - number of bytes to broadcast
 * flags     - if SMP_COLL_ALL_SYNC is set, smp_coll_barrier() is called on
 *             entry and again on exit
 * radix     - tree radix; the *_POWER2RADIX macros suggest it must be a power
 *             of two (TODO confirm)
 */
void smp_coll_broadcast_tree_atomic(smp_coll_t handle, int num_addrs, void * const dstlist[], const void *src, 
                             size_t nbytes, int flags, int radix){
  const int ndigits = smp_coll_mylogn(handle->THREADS, radix);
  const int log2radix = smp_coll_mylogn(radix,2);
  int digit, k;
  int parent = -1;

  if ((flags & SMP_COLL_ALL_SYNC)) smp_coll_barrier(handle, flags);

  /* my parent is my id with its lowest non-zero digit cleared, if there is one */
  for (digit = 0; digit < ndigits; digit++) {
    if (SMP_COLL_GET_ITH_DIGIT_POWER2RADIX(handle->MYTHREAD, digit, radix, log2radix) == 0) continue;
    parent = SMP_COLL_MAKE_NUM_POWER2RADIX(handle->MYTHREAD, digit, 0,  radix, log2radix);
    break;
  }

  if (parent == -1) {
    /* no parent: seed my own destination buffer from src */
    memcpy(dstlist[0], src, nbytes);
  } else {
    /* spin until my counter has been bumped to 1, then take it back to 0 */
    gasneti_waitwhile(SMP_COLL_READ_ATOMIC(handle, handle->MYTHREAD, 0, handle->curr_atomic_set)!=1);
    SMP_COLL_DEC_ATOMIC(handle, handle->MYTHREAD, 0, handle->curr_atomic_set);
    gasnett_local_rmb();
  }

  for (digit = ndigits-1; digit >= 0; digit--) {
    /* I forward at this level only when my digit here and all lower digits are 0 */
    if (SMP_COLL_GET_ITH_DIGIT_POWER2RADIX(handle->MYTHREAD, digit, radix, log2radix) != 0) continue;
    if (SMP_COLL_GET_LOWER_K_DIGITS_POWER2RADIX(handle->MYTHREAD, digit, radix, log2radix) != 0) continue;

    for (k = 1; k < radix; k++) {
      int dest = SMP_COLL_MAKE_NUM_POWER2RADIX(handle->MYTHREAD, digit, k, radix, log2radix);

      /* ids beyond the last thread do not exist */
      if (!(dest < handle->THREADS)) continue;

      GASNETE_FAST_UNALIGNED_MEMCPY(dstlist[dest], dstlist[handle->MYTHREAD], nbytes);
      /* the payload must be written before the counter that releases dest */
      gasnett_local_wmb();
      SMP_COLL_INC_ATOMIC(handle, dest, 0, handle->curr_atomic_set);
    }
  }

  if ((flags & SMP_COLL_ALL_SYNC)) smp_coll_barrier(handle, flags);

  /* switch counter sets for the next call */
  handle->curr_atomic_set = !handle->curr_atomic_set;
}
/*
 * Dissemination barrier built on per-thread atomic counters
 * (original note: log_r(n) rounds with O(n) messages per round).
 *
 * For every phase p listed in handle->dissem_info that has at least one peer,
 * this thread increments counter p on each peer in barrier_order[p].elem_list,
 * then spins until its own counter p equals barrier_order[p].n and resets it.
 * handle->curr_atomic_set is toggled before returning.  The flags argument is
 * not consulted.
 */
void smp_coll_barrier_dissem_atomic(smp_coll_t handle, int flags) {
  const dissem_info_t *info = handle->dissem_info;
  const dissem_vector_t *order = info->barrier_order;
  int phase;

  gasnett_local_wmb();

  for (phase = 0; phase < info->dissemination_phases; phase++) {
    const int *peers;
    int p;

    /* nothing to exchange in a phase without peers */
    if (!(order[phase].n > 0)) continue;

    peers = order[phase].elem_list;

    /* signal: bump counter `phase` on every peer of this phase */
    p = 0;
    while (p < order[phase].n) {
      int dest = peers[p];
      SMP_COLL_INC_ATOMIC(handle, dest, phase, handle->curr_atomic_set);
      ++p;
    }

    /* wait until my counter `phase` has reached the peer count, then clear it */
    gasneti_waitwhile(SMP_COLL_READ_ATOMIC(handle, handle->MYTHREAD, phase, handle->curr_atomic_set)!=order[phase].n);
    SMP_COLL_RESET_ATOMIC(handle, handle->MYTHREAD, phase, handle->curr_atomic_set);
  }

  /* switch counter sets for the next call */
  handle->curr_atomic_set = !handle->curr_atomic_set;
  gasnett_local_rmb();
}
/*
 * Allgather implemented on top of a PAMI allgatherv_int collective.
 *
 * Every calling thread copies its nbytes contribution from src into
 * team->pami.scratch_space, at the slot selected by td->my_local_image.  The
 * thread for which gasnete_coll_pami_images_barrier() returned non-zero (the
 * "leader") then issues the PAMI collective from the scratch space into its
 * own dst, polls until `done` becomes non-zero, and publishes dst through
 * team->pami.tmp_addr.  All other threads wait for tmp_addr to become
 * non-NULL and copy nbytes * team->total_images bytes from it into their own
 * dst.
 *
 * team   - team descriptor; its pami.* fields are read and updated here
 * dst    - this thread's destination buffer
 * src    - this thread's nbytes-sized contribution
 * nbytes - size of one contribution
 * flags  - GASNET_COLL_IN_ALLSYNC / GASNET_COLL_OUT_ALLSYNC each add a
 *          gasnetc_fast_barrier() on the leader followed by an images barrier
 */
static void
gasnete_coll_pami_allgavi(const gasnet_team_handle_t team,
                          void *dst, const void *src,
                          size_t nbytes, int flags GASNETE_THREAD_FARG)
{
    /* the barrier's return value selects which calling thread acts as leader */
    int i_am_leader = gasnete_coll_pami_images_barrier(team); /* XXX: over-synced for IN_NO and IN_MY */
    const gasnete_coll_threaddata_t * const td = GASNETE_COLL_MYTHREAD_NOALLOC;

    if (flags & GASNET_COLL_IN_ALLSYNC) {
        if (i_am_leader) gasnetc_fast_barrier();
        (void) gasnete_coll_pami_images_barrier(team);
    }

    /* stage my contribution into my slot of the shared scratch space */
    GASNETE_FAST_UNALIGNED_MEMCPY(gasnete_coll_scale_ptr(team->pami.scratch_space,
                                                         td->my_local_image,
                                                         nbytes),
                                  src, nbytes);
    /* all local contributions must be staged before the leader sends them */
    (void) gasnete_coll_pami_images_barrier(team);

    if (i_am_leader) {
        /* completion indicator handed to PAMI via op.cookie; presumably set by
           the completion callback carried in the op template */
        volatile unsigned int done = 0;
        pami_result_t rc;
        pami_xfer_t op;

        op = gasnete_op_template_allgavi; /* allgatherv_int */
        op.cookie = (void *)&done;
        op.algorithm = team->pami.allgavi_alg;
        op.cmd.xfer_allgatherv_int.sndbuf = team->pami.scratch_space;
        op.cmd.xfer_allgatherv_int.stypecount = nbytes * team->my_images;

        op.cmd.xfer_allgatherv_int.rcvbuf = dst;
        op.cmd.xfer_allgatherv_int.rtypecounts = team->pami.counts;
        op.cmd.xfer_allgatherv_int.rdispls = team->pami.displs;
        /* counts/displs are cached in the team; rebuild them only when nbytes
           differs from the value they were last computed for */
        if (team->pami.prev_nbytes != nbytes) {
            int i;
            for (i = 0; i < team->total_ranks; ++i) {
                op.cmd.xfer_allgatherv_int.rtypecounts[i] = nbytes * team->all_images[i];
                op.cmd.xfer_allgatherv_int.rdispls[i] = nbytes * team->all_offset[i];
            }
            team->pami.prev_nbytes = nbytes;
        }

        /* start the collective while holding the context lock */
        GASNETC_PAMI_LOCK(gasnetc_context);
        rc = PAMI_Collective(gasnetc_context, &op);
        GASNETC_PAMI_UNLOCK(gasnetc_context);
        GASNETC_PAMI_CHECK(rc, "initiating blocking allgatherv_int");

        /* block until the collective has completed */
        gasneti_polluntil(done);

        gasneti_assert(NULL == team->pami.tmp_addr);
        gasneti_sync_writes(); /* XXX: is this necessary? */
        team->pami.tmp_addr = dst; /* wakes pollers, below */
        (void) gasnete_coll_pami_images_barrier(team); /* matches instance below vvvv */
        /* every follower has finished copying once the barrier completes */
        team->pami.tmp_addr = NULL;
    } else {
        /* wait for the leader to publish its result buffer, then copy all of it */
        gasneti_waitwhile(NULL == team->pami.tmp_addr);
        GASNETE_FAST_UNALIGNED_MEMCPY(dst, team->pami.tmp_addr, nbytes * team->total_images);
        (void) gasnete_coll_pami_images_barrier(team); /* matches instance above ^^^^ */
    }
      
    if (flags & GASNET_COLL_OUT_ALLSYNC) {
        if (i_am_leader) gasnetc_fast_barrier();
        (void) gasnete_coll_pami_images_barrier(team);
    }
}
/*
 * Scatter implemented on top of a PAMI scatterv_int collective.
 *
 * One thread per caller group (the "leader") issues the PAMI collective,
 * receiving nbytes * team->my_images bytes into team->pami.scratch_space, and
 * then publishes the scratch space through team->pami.tmp_addr.  Every thread
 * (leader included) copies its own nbytes-sized slot, selected by
 * td->my_local_image, from tmp_addr into dst.  After an images barrier the
 * leader resets tmp_addr to NULL.
 *
 * team     - team descriptor; its pami.* fields are read and updated here
 * dst      - this thread's nbytes-sized destination buffer
 * srcimage - image that owns the data being scattered
 * src      - source buffer; only passed to PAMI when srcimage is local
 * nbytes   - size of the piece delivered to each image
 * flags    - GASNET_COLL_IN_ALLSYNC adds a gasnetc_fast_barrier() on the
 *            leader before the collective; GASNET_COLL_OUT_ALLSYNC adds one
 *            after it, followed by an images barrier; GASNET_COLL_LOCAL
 *            affects leader selection (see below)
 */
static void
gasnete_coll_pami_scattvi(const gasnet_team_handle_t team, void *dst,
                          gasnet_image_t srcimage, const void *src,
                          size_t nbytes, int flags GASNETI_THREAD_FARG)
{
    const int i_am_root = gasnete_coll_image_is_local(team, srcimage);
    /* the barrier's return value selects which calling thread acts as leader */
    int i_am_leader = gasnete_coll_pami_images_barrier(team); /* XXX: over-synced for IN_NO and IN_MY */
    const gasnete_coll_threaddata_t * const td = GASNETE_COLL_MYTHREAD_NOALLOC;

    if ((flags & GASNET_COLL_LOCAL) && i_am_root) {
        /* root thread must be leader for its node */
        /* (overrides the barrier's choice: only the thread whose image is
           srcimage stays leader) */
        i_am_leader = (srcimage == td->my_image);
    }

    if (i_am_leader) {
        /* completion indicator handed to PAMI via op.cookie; presumably set by
           the completion callback carried in the op template */
        volatile unsigned int done = 0;
        pami_result_t rc;
        pami_xfer_t op;

        if (flags & GASNET_COLL_IN_ALLSYNC) gasnetc_fast_barrier();

        op = gasnete_op_template_scattvi; /* scatterv_int */
        op.cookie = (void *)&done;
        op.algorithm = team->pami.scattvi_alg;
        op.cmd.xfer_scatterv_int.root = gasnetc_endpoint(GASNETE_COLL_REL2ACT(team,gasnete_coll_image_node(team, srcimage)));
        op.cmd.xfer_scatterv_int.rcvbuf = team->pami.scratch_space;
        op.cmd.xfer_scatterv_int.rtypecount = nbytes * team->my_images;

        /* send-side arguments are filled in only where the source image lives */
        if (i_am_root) {
            op.cmd.xfer_scatterv_int.sndbuf = (/*not-const*/ void *)src;
            op.cmd.xfer_scatterv_int.stypecounts = team->pami.counts;
            op.cmd.xfer_scatterv_int.sdispls = team->pami.displs;
            /* counts/displs are cached in the team; rebuild them only when
               nbytes differs from the value they were last computed for */
            if (team->pami.prev_nbytes != nbytes) {
                int i;
                for (i = 0; i < team->total_ranks; ++i) {
                    op.cmd.xfer_scatterv_int.stypecounts[i] = nbytes * team->all_images[i];
                    op.cmd.xfer_scatterv_int.sdispls[i] = nbytes * team->all_offset[i];
                }
                team->pami.prev_nbytes = nbytes;
            }
        }

        /* start the collective while holding the context lock */
        GASNETC_PAMI_LOCK(gasnetc_context);
        rc = PAMI_Collective(gasnetc_context, &op);
        GASNETC_PAMI_UNLOCK(gasnetc_context);
        GASNETC_PAMI_CHECK(rc, "initiating blocking scatterv_int");

        /* block until the collective has completed */
        gasneti_polluntil(done);

        gasneti_assert(NULL == team->pami.tmp_addr);
        gasneti_sync_writes();
        team->pami.tmp_addr = team->pami.scratch_space; /* wakes pollers, below */
    } else {
        /* wait for the leader to publish the received data */
        gasneti_waitwhile(NULL == team->pami.tmp_addr);
    }

    /* every thread, leader included, extracts its own slot */
    GASNETI_MEMCPY               (dst,
                                  gasnete_coll_scale_ptr(team->pami.tmp_addr,
                                                         td->my_local_image,
                                                         nbytes),
                                  nbytes);
    /* all copies must be finished before tmp_addr is cleared below */
    (void) gasnete_coll_pami_images_barrier(team);

    if (i_am_leader) {
        team->pami.tmp_addr = NULL;
    }

    if (flags & GASNET_COLL_OUT_ALLSYNC) {
        if (i_am_leader) gasnetc_fast_barrier();
        (void) gasnete_coll_pami_images_barrier(team);
    }
}