void op_exchange_halo(op_arg* arg)
{
  //int my_rank, comm_size;
  //MPI_Comm_rank(OP_MPI_WORLD, &my_rank);
  //MPI_Comm_size(OP_MPI_WORLD, &comm_size);

  op_dat dat = arg->dat;

  if(arg->sent == 1)
  {
    printf("Error: Halo exchange already in flight for dat %s\n", dat->name);
    fflush(stdout);
    MPI_Abort(OP_MPI_WORLD, 2);
  }

  //need to exchange both direct and indirect data sets if they are dirty
  if((arg->acc == OP_READ || arg->acc == OP_RW /* good for debug || arg->acc == OP_INC*/) &&
     (dat->dirtybit == 1))
  {
    //printf("Exchanging Halo of data array %10s\n",dat->name);
    halo_list imp_exec_list = OP_import_exec_list[dat->set->index];
    halo_list imp_nonexec_list = OP_import_nonexec_list[dat->set->index];

    halo_list exp_exec_list = OP_export_exec_list[dat->set->index];
    halo_list exp_nonexec_list = OP_export_nonexec_list[dat->set->index];

    //-------first exchange exec elements related to this data array--------

    //sanity checks
    if(compare_sets(imp_exec_list->set,dat->set) == 0)
    {
      printf("Error: Import list and set mismatch\n");
      MPI_Abort(OP_MPI_WORLD, 2);
    }
    if(compare_sets(exp_exec_list->set,dat->set) == 0)
    {
      printf("Error: Export list and set mismatch\n");
      MPI_Abort(OP_MPI_WORLD, 2);
    }

    int set_elem_index;
    for(int i=0; i<exp_exec_list->ranks_size; i++) {
      for(int j = 0; j < exp_exec_list->sizes[i]; j++)
      {
        set_elem_index = exp_exec_list->list[exp_exec_list->disps[i]+j];
        memcpy(&((op_mpi_buffer)(dat->mpi_buffer))->
          buf_exec[exp_exec_list->disps[i]*dat->size+j*dat->size],
          (void *)&dat->data[dat->size*(set_elem_index)],dat->size);
      }
      //printf("export from %d to %d data %10s, number of elements of size %d | sending:\n ",
      //          my_rank, exp_exec_list->ranks[i], dat->name,exp_exec_list->sizes[i]);
      MPI_Isend(&((op_mpi_buffer)(dat->mpi_buffer))->
          buf_exec[exp_exec_list->disps[i]*dat->size],
          dat->size*exp_exec_list->sizes[i],
          MPI_CHAR, exp_exec_list->ranks[i],
          dat->index, OP_MPI_WORLD,
          &((op_mpi_buffer)(dat->mpi_buffer))->
          s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]);
    }


    int init = dat->set->size*dat->size;
    for(int i=0; i < imp_exec_list->ranks_size; i++) {
     // printf("import on to %d from %d data %10s, number of elements of size %d | recieving:\n ",
     //       my_rank, imp_exec_list->ranks[i], dat->name, imp_exec_list->sizes[i]);
      MPI_Irecv(&(dat->data[init+imp_exec_list->disps[i]*dat->size]),
          dat->size*imp_exec_list->sizes[i],
          MPI_CHAR, imp_exec_list->ranks[i],
          dat->index, OP_MPI_WORLD,
          &((op_mpi_buffer)(dat->mpi_buffer))->
          r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]);
    }

    //-----second exchange nonexec elements related to this data array------
    //sanity checks
    if(compare_sets(imp_nonexec_list->set,dat->set) == 0)
    {
      printf("Error: Non-Import list and set mismatch");
      MPI_Abort(OP_MPI_WORLD, 2);
    }
    if(compare_sets(exp_nonexec_list->set,dat->set)==0)
    {
      printf("Error: Non-Export list and set mismatch");
      MPI_Abort(OP_MPI_WORLD, 2);
    }


    for(int i=0; i<exp_nonexec_list->ranks_size; i++) {
      for(int j = 0; j < exp_nonexec_list->sizes[i]; j++)
      {
        set_elem_index = exp_nonexec_list->list[exp_nonexec_list->disps[i]+j];
        memcpy(&((op_mpi_buffer)(dat->mpi_buffer))->
            buf_nonexec[exp_nonexec_list->disps[i]*dat->size+j*dat->size],
            (void *)&dat->data[dat->size*(set_elem_index)],dat->size);
      }
      //printf("export from %d to %d data %10s, number of elements of size %d | sending:\n ",
      //          my_rank, exp_nonexec_list->ranks[i], dat->name,exp_nonexec_list->sizes[i]);
      MPI_Isend(&((op_mpi_buffer)(dat->mpi_buffer))->
          buf_nonexec[exp_nonexec_list->disps[i]*dat->size],
          dat->size*exp_nonexec_list->sizes[i],
          MPI_CHAR, exp_nonexec_list->ranks[i],
          dat->index, OP_MPI_WORLD,
          &((op_mpi_buffer)(dat->mpi_buffer))->
          s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]);
    }

    int nonexec_init = (dat->set->size+imp_exec_list->size)*dat->size;
    for(int i=0; i<imp_nonexec_list->ranks_size; i++) {
      //printf("import on to %d from %d data %10s, number of elements of size %d | recieving:\n ",
      //      my_rank, imp_nonexec_list->ranks[i], dat->name, imp_nonexec_list->sizes[i]);
      MPI_Irecv(&(dat->data[nonexec_init+imp_nonexec_list->disps[i]*dat->size]),
          dat->size*imp_nonexec_list->sizes[i],
          MPI_CHAR, imp_nonexec_list->ranks[i],
          dat->index, OP_MPI_WORLD,
          &((op_mpi_buffer)(dat->mpi_buffer))->
          r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]);
    }
    //clear dirty bit
    dat->dirtybit = 0;
    arg->sent = 1;
  }
}
void op_exchange_halo_partial(op_arg* arg, int exec_flag)
{
  op_dat dat = arg->dat;

  if (arg->opt == 0) return;

  if(arg->sent == 1)
  {
    printf("Error: Halo exchange already in flight for dat %s\n", dat->name);
    fflush(stdout);
    MPI_Abort(OP_MPI_WORLD, 2);
  }
  arg->sent = 0; //reset flag

  //need to exchange indirect data sets if they are dirty
  if((arg->acc == OP_READ || arg->acc == OP_RW /* good for debug || arg->acc == OP_INC*/) &&
     (dat->dirtybit == 1))
  {
    halo_list imp_nonexec_list = OP_import_nonexec_permap[arg->map->index];
    halo_list exp_nonexec_list = OP_export_nonexec_permap[arg->map->index];
    //-------exchange nonexec elements related to this data array and map--------

    //sanity checks
    if(compare_sets(imp_nonexec_list->set,dat->set) == 0)
    {
      printf("Error: Import list and set mismatch\n");
      MPI_Abort(OP_MPI_WORLD, 2);
    }
    if(compare_sets(exp_nonexec_list->set,dat->set) == 0)
    {
      printf("Error: Export list and set mismatch\n");
      MPI_Abort(OP_MPI_WORLD, 2);
    }

    int set_elem_index;
    for(int i=0; i<exp_nonexec_list->ranks_size; i++) {
      for(int j = 0; j < exp_nonexec_list->sizes[i]; j++)
      {
        set_elem_index = exp_nonexec_list->list[exp_nonexec_list->disps[i]+j];
        memcpy(&((op_mpi_buffer)(dat->mpi_buffer))->
          buf_nonexec[exp_nonexec_list->disps[i]*dat->size+j*dat->size],
          (void *)&dat->data[dat->size*(set_elem_index)],dat->size);
      }
      MPI_Isend(&((op_mpi_buffer)(dat->mpi_buffer))->
          buf_nonexec[exp_nonexec_list->disps[i]*dat->size],
          dat->size*exp_nonexec_list->sizes[i],
          MPI_CHAR, exp_nonexec_list->ranks[i],
          dat->index, OP_MPI_WORLD,
          &((op_mpi_buffer)(dat->mpi_buffer))->
          s_req[((op_mpi_buffer)(dat->mpi_buffer))->s_num_req++]);
    }

    int init = exp_nonexec_list->size;
    for(int i=0; i < imp_nonexec_list->ranks_size; i++) {
      MPI_Irecv(&((op_mpi_buffer)(dat->mpi_buffer))->
          buf_nonexec[(init+imp_nonexec_list->disps[i])*dat->size],
          dat->size*imp_nonexec_list->sizes[i],
          MPI_CHAR, imp_nonexec_list->ranks[i],
          dat->index, OP_MPI_WORLD,
          &((op_mpi_buffer)(dat->mpi_buffer))->
          r_req[((op_mpi_buffer)(dat->mpi_buffer))->r_num_req++]);
    }

    //note that we are not settinging the dirtybit to 0, since it's not a full exchange
    arg->sent = 1;
  }
}