void run_rma_test(int nprocs_per_node)
{
  int myrank, nprocs;
  int mem_rank;
  MPI_Win win;
  int *baseptr;
  MPI_Aint local_size;

  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  if (nprocs < nprocs_per_node * 2)
  {
    if (!myrank) printf("This program must be started with at least %d processes\n", nprocs_per_node * 2);
    MPI_Finalize();
    exit(EXIT_FAILURE);
  }


  mem_rank = nprocs_per_node + nprocs_per_node / 2;

  local_size = (myrank == mem_rank) ? COUNT : 0;

  MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &win);

  MPI_Win_lock_all(0, win);



  int type_size;
  MPI_Type_size(MPI_INT, &type_size);

  size_t nbytes = COUNT * type_size;

  assert(MPI_Alloc_mem(nbytes, MPI_INFO_NULL, &baseptr) == MPI_SUCCESS);
  assert(MPI_Win_attach(win, baseptr, nbytes) == MPI_SUCCESS);

  MPI_Aint ldisp;
  MPI_Aint *disps = malloc(nprocs * sizeof(MPI_Aint));

  assert(MPI_Get_address(baseptr, &ldisp) == MPI_SUCCESS);

  assert(MPI_Allgather(&ldisp, 1, MPI_AINT, disps, 1, MPI_AINT, MPI_COMM_WORLD) == MPI_SUCCESS);

  if (myrank == 0)
  {
    for (size_t idx = 0; idx < COUNT; ++idx) {
      baseptr[idx] = idx * 10 + 1;   /* must match the check on the target side below */
    }
  }

  MPI_Barrier(MPI_COMM_WORLD);

  if (myrank == mem_rank) {
    assert(MPI_Get(baseptr, 10, MPI_INT, 0, disps[0], 10, MPI_INT, win) == MPI_SUCCESS);
    assert(MPI_Win_flush(0, win) == MPI_SUCCESS);

    /* only the 10 elements fetched by MPI_Get above are defined here */
    for (size_t idx = 0; idx < 10; ++idx) {
      assert(baseptr[idx] == idx * 10 + 1);
    }
  }

  MPI_Barrier(MPI_COMM_WORLD);

  MPI_Win_unlock_all(win);

  MPI_Barrier(MPI_COMM_WORLD);

  MPI_Win_free(&win);

  MPI_Free_mem(baseptr);
  free(disps);

  printf("Test finished\n");
}
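
A minimal driver for the test above might look like the following sketch. It is an assumption rather than part of the original example: it presumes the usual mpi.h/stdlib.h includes and the COUNT definition used by run_rma_test() are in scope, and it reads the per-node process count from the command line (defaulting to 1).

/* Hypothetical driver for run_rma_test() (not part of the original example). */
int main(int argc, char **argv)
{
  int nprocs_per_node = (argc > 1) ? atoi(argv[1]) : 1;

  MPI_Init(&argc, &argv);
  run_rma_test(nprocs_per_node);   /* finalizes and exits on a bad process count */
  MPI_Finalize();
  return 0;
}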
Example 2
File: MPI-api.c Project: 8l/rose
void declareBindings (void)
{
  /* === Point-to-point === */
  void* buf;
  int count;
  MPI_Datatype datatype;
  int dest;
  int tag;
  MPI_Comm comm;
  MPI_Send (buf, count, datatype, dest, tag, comm); // L12
  int source;
  MPI_Status status;
  MPI_Recv (buf, count, datatype, source, tag, comm, &status); // L15
  MPI_Get_count (&status, datatype, &count);
  MPI_Bsend (buf, count, datatype, dest, tag, comm);
  MPI_Ssend (buf, count, datatype, dest, tag, comm);
  MPI_Rsend (buf, count, datatype, dest, tag, comm);
  void* buffer;
  int size;
  MPI_Buffer_attach (buffer, size); // L22
  MPI_Buffer_detach (buffer, &size);
  MPI_Request request;
  MPI_Isend (buf, count, datatype, dest, tag, comm, &request); // L25
  MPI_Ibsend (buf, count, datatype, dest, tag, comm, &request);
  MPI_Issend (buf, count, datatype, dest, tag, comm, &request);
  MPI_Irsend (buf, count, datatype, dest, tag, comm, &request);
  MPI_Irecv (buf, count, datatype, source, tag, comm, &request);
  MPI_Wait (&request, &status);
  int flag;
  MPI_Test (&request, &flag, &status); // L32
  MPI_Request_free (&request);
  MPI_Request* array_of_requests;
  int index;
  MPI_Waitany (count, array_of_requests, &index, &status); // L36
  MPI_Testany (count, array_of_requests, &index, &flag, &status);
  MPI_Status* array_of_statuses;
  MPI_Waitall (count, array_of_requests, array_of_statuses); // L39
  MPI_Testall (count, array_of_requests, &flag, array_of_statuses);
  int incount;
  int outcount;
  int* array_of_indices;
  MPI_Waitsome (incount, array_of_requests, &outcount, array_of_indices,
		array_of_statuses); // L44--45
  MPI_Testsome (incount, array_of_requests, &outcount, array_of_indices,
		array_of_statuses); // L46--47
  MPI_Iprobe (source, tag, comm, &flag, &status); // L48
  MPI_Probe (source, tag, comm, &status);
  MPI_Cancel (&request);
  MPI_Test_cancelled (&status, &flag);
  MPI_Send_init (buf, count, datatype, dest, tag, comm, &request);
  MPI_Bsend_init (buf, count, datatype, dest, tag, comm, &request);
  MPI_Ssend_init (buf, count, datatype, dest, tag, comm, &request);
  MPI_Rsend_init (buf, count, datatype, dest, tag, comm, &request);
  MPI_Recv_init (buf, count, datatype, source, tag, comm, &request);
  MPI_Start (&request);
  MPI_Startall (count, array_of_requests);
  void* sendbuf;
  int sendcount;
  MPI_Datatype sendtype;
  int sendtag;
  void* recvbuf;
  int recvcount;
  MPI_Datatype recvtype;
  int recvtag;
  MPI_Sendrecv (sendbuf, sendcount, sendtype, dest, sendtag,
		recvbuf, recvcount, recvtype, source, recvtag,
		comm, &status); // L67--69
  MPI_Sendrecv_replace (buf, count, datatype, dest, sendtag, source, recvtag,
			comm, &status); // L70--71
  MPI_Datatype oldtype;
  MPI_Datatype newtype;
  MPI_Type_contiguous (count, oldtype, &newtype); // L74
  int blocklength;
  {
    int stride;
    MPI_Type_vector (count, blocklength, stride, oldtype, &newtype); // L78
  }
  {
    MPI_Aint stride;
    MPI_Type_hvector (count, blocklength, stride, oldtype, &newtype); // L82
  }
  int* array_of_blocklengths;
  {
    int* array_of_displacements;
    MPI_Type_indexed (count, array_of_blocklengths, array_of_displacements,
		      oldtype, &newtype); // L87--88
  }
  {
    MPI_Aint* array_of_displacements;
    MPI_Type_hindexed (count, array_of_blocklengths, array_of_displacements,
                       oldtype, &newtype); // L92--93
    MPI_Datatype* array_of_types;
    MPI_Type_struct (count, array_of_blocklengths, array_of_displacements,
                     array_of_types, &newtype); // L95--96
  }
  void* location;
  MPI_Aint address;
  MPI_Address (location, &address); // L100
  MPI_Aint extent;
  MPI_Type_extent (datatype, &extent); // L102
  MPI_Type_size (datatype, &size);
  MPI_Aint displacement;
  MPI_Type_lb (datatype, &displacement); // L105
  MPI_Type_ub (datatype, &displacement);
  MPI_Type_commit (&datatype);
  MPI_Type_free (&datatype);
  MPI_Get_elements (&status, datatype, &count);
  void* inbuf;
  void* outbuf;
  int outsize;
  int position;
  MPI_Pack (inbuf, incount, datatype, outbuf, outsize, &position, comm); // L114
  int insize;
  MPI_Unpack (inbuf, insize, &position, outbuf, outcount, datatype,
	      comm); // L116--117
  MPI_Pack_size (incount, datatype, comm, &size);

  /* === Collectives === */
  MPI_Barrier (comm); // L121
  int root;
  MPI_Bcast (buffer, count, datatype, root, comm); // L123
  MPI_Gather (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
	      root, comm); // L124--125
  int* recvcounts;
  int* displs;
  MPI_Gatherv (sendbuf, sendcount, sendtype,
               recvbuf, recvcounts, displs, recvtype,
	       root, comm); // L128--130
  MPI_Scatter (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
               root, comm); // L131--132
  int* sendcounts;
  MPI_Scatterv (sendbuf, sendcounts, displs, sendtype,
		recvbuf, recvcount, recvtype, root, comm); // L134--135
  MPI_Allgather (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
                 comm); // L136--137
  MPI_Allgatherv (sendbuf, sendcount, sendtype,
		  recvbuf, recvcounts, displs, recvtype,
		  comm); // L138--140
  MPI_Alltoall (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
		comm); // L141--142
  int* sdispls;
  int* rdispls;
  MPI_Alltoallv (sendbuf, sendcounts, sdispls, sendtype,
                 recvbuf, recvcounts, rdispls, recvtype,
		 comm); // L145--147
  MPI_Op op;
  MPI_Reduce (sendbuf, recvbuf, count, datatype, op, root, comm); // L149
#if 0
  MPI_User_function function;
  int commute;
  MPI_Op_create (function, commute, &op); // L153
#endif
  MPI_Op_free (&op); // L155
  MPI_Allreduce (sendbuf, recvbuf, count, datatype, op, comm);
  MPI_Reduce_scatter (sendbuf, recvbuf, recvcounts, datatype, op, comm);
  MPI_Scan (sendbuf, recvbuf, count, datatype, op, comm);

  /* === Groups, contexts, and communicators === */
  MPI_Group group;
  MPI_Group_size (group, &size); // L162
  int rank;
  MPI_Group_rank (group, &rank); // L164
  MPI_Group group1;
  int n;
  int* ranks1;
  MPI_Group group2;
  int* ranks2;
  MPI_Group_translate_ranks (group1, n, ranks1, group2, ranks2); // L170
  int result;
  MPI_Group_compare (group1, group2, &result); // L172
  MPI_Group newgroup;
  MPI_Group_union (group1, group2, &newgroup); // L174
  MPI_Group_intersection (group1, group2, &newgroup);
  MPI_Group_difference (group1, group2, &newgroup);
  int* ranks;
  MPI_Group_incl (group, n, ranks, &newgroup); // L178
  MPI_Group_excl (group, n, ranks, &newgroup);
  extern int ranges[][3];
  MPI_Group_range_incl (group, n, ranges, &newgroup); // L181
  MPI_Group_range_excl (group, n, ranges, &newgroup);
  MPI_Group_free (&group);
  MPI_Comm_size (comm, &size);
  MPI_Comm_rank (comm, &rank);
  MPI_Comm comm1;
  MPI_Comm comm2;
  MPI_Comm_compare (comm1, comm2, &result);
  MPI_Comm newcomm;
  MPI_Comm_dup (comm, &newcomm);
  MPI_Comm_create (comm, group, &newcomm);
  int color;
  int key;
  MPI_Comm_split (comm, color, key, &newcomm); // L194
  MPI_Comm_free (&comm);
  MPI_Comm_test_inter (comm, &flag);
  MPI_Comm_remote_size (comm, &size);
  MPI_Comm_remote_group (comm, &group);
  MPI_Comm local_comm;
  int local_leader;
  MPI_Comm peer_comm;
  int remote_leader;
  MPI_Comm newintercomm;
  MPI_Intercomm_create (local_comm, local_leader, peer_comm, remote_leader, tag,
			&newintercomm); // L204--205
  MPI_Comm intercomm;
  MPI_Comm newintracomm;
  int high;
  MPI_Intercomm_merge (intercomm, high, &newintracomm); // L209
  int keyval;
#if 0
  MPI_Copy_function copy_fn;
  MPI_Delete_function delete_fn;
  void* extra_state;
  MPI_Keyval_create (copy_fn, delete_fn, &keyval, extra_state); // L215
#endif
  MPI_Keyval_free (&keyval); // L217
  void* attribute_val;
  MPI_Attr_put (comm, keyval, attribute_val); // L219
  MPI_Attr_get (comm, keyval, attribute_val, &flag);
  MPI_Attr_delete (comm, keyval);

  /* === Environmental inquiry === */
  char* name;
  int resultlen;
  MPI_Get_processor_name (name, &resultlen); // L226
  MPI_Errhandler errhandler;
#if 0
  MPI_Handler_function function;
  MPI_Errhandler_create (function, &errhandler); // L230
#endif
  MPI_Errhandler_set (comm, errhandler); // L232
  MPI_Errhandler_get (comm, &errhandler);
  MPI_Errhandler_free (&errhandler);
  int errorcode;
  char* string;
  MPI_Error_string (errorcode, string, &resultlen); // L237
  int errorclass;
  MPI_Error_class (errorcode, &errorclass); // L239
  MPI_Wtime ();
  MPI_Wtick ();
  int argc;
  char** argv;
  MPI_Init (&argc, &argv); // L244
  MPI_Finalize ();
  MPI_Initialized (&flag);
  MPI_Abort (comm, errorcode);
}
Example 3
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
				   MPI_Datatype datatype,
				   int file_ptr_type, ADIO_Offset offset,
				   ADIO_Status *status, int *error_code)
{
    /* Uses a generalized version of the extended two-phase method described
     * in "An Extended Two-Phase Method for Accessing Sections of
     * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
     * Scientific Programming, (5)4:301--317, Winter 1996.
     * http://www.mcs.anl.gov/home/thakur/ext2ph.ps
     */

    ADIOI_Access *my_req;
    /* array of nprocs access structures, one for each other process whose
       file domain holds part of this process's request */

    ADIOI_Access *others_req;
    /* array of nprocs access structures, one for each other process
       whose request is written by this process. */

    int i, filetype_is_contig, nprocs, myrank, do_collect = 0;
    int contig_access_count = 0, buftype_is_contig, interleave_count = 0;
    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
    ADIO_Offset orig_fp, start_offset, end_offset, off;
    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL;
    ADIO_Offset *len_list = NULL;
    int **buf_idx = NULL, *striping_info = NULL;
    int old_error, tmp_error;

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

    orig_fp = fd->fp_ind;

    /* I/O pattern identification if cb_write isn't disabled */
    if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
	/* For this process's request, calculate the list of offsets and
	   lengths in the file and determine the start and end offsets. */

	/* Note: end_offset points to the last byte-offset that will be accessed.
         * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99
         */

	ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
	                      &offset_list, &len_list, &start_offset,
	                      &end_offset, &contig_access_count);

	/* each process communicates its start and end offsets to other
         * processes. The result is an array each of start and end offsets
         * stored in order of process rank.
         */
	st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
	end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
	MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
		      ADIO_OFFSET, fd->comm);
	MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
		      ADIO_OFFSET, fd->comm);
	/* are the accesses of different processes interleaved? */
	for (i = 1; i < nprocs; i++)
	    if ((st_offsets[i] < end_offsets[i-1]) &&
                (st_offsets[i] <= end_offsets[i]))
                interleave_count++;
	/* This is a rudimentary check for interleaving, but should suffice
	   for the moment. */
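	/* Illustration (values hypothetical): with two processes writing byte
	 * ranges [0,99] and [50,149], st_offsets = {0,50} and end_offsets =
	 * {99,149}; since 50 < 99 and 50 <= 149 the accesses interleave.  If
	 * the second range were [100,199] instead, 100 < 99 fails and
	 * interleave_count stays 0. */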

	/* Two typical access patterns can benefit from collective write.
         *   1) the processes are interleaved, and
         *   2) the req size is small.
         */
        if (interleave_count > 0) {
	    do_collect = 1;
        } else {
            do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count,
			                        len_list, nprocs);
        }
    }
    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);

    /* Decide if collective I/O should be done */
    if ((!do_collect && fd->hints->cb_write == ADIOI_HINT_AUTO) ||
        fd->hints->cb_write == ADIOI_HINT_DISABLE) {

	/* use independent accesses */
	if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
	    ADIOI_Free(offset_list);
	    ADIOI_Free(len_list);
            ADIOI_Free(st_offsets);
            ADIOI_Free(end_offsets);
	}

	fd->fp_ind = orig_fp;
	ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
	if (buftype_is_contig && filetype_is_contig) {
	    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
                off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset;
		ADIO_WriteContig(fd, buf, count, datatype,
				 ADIO_EXPLICIT_OFFSET,
				 off, status, error_code);
	    } else
		ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
				 0, status, error_code);
	} else {
	    ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
			      offset, status, error_code);
	}
	return;
    }

    /* Get Lustre hints information */
    ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1);

    /* calculate what portions of the access requests of this process are
     * located in which process
     */
    ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
                             striping_info, nprocs, &count_my_req_procs,
                             &count_my_req_per_proc, &my_req,
                             &buf_idx);

    /* based on everyone's my_req, calculate what requests of other processes
     * will be accessed by this process.
     * count_others_req_procs = number of processes whose requests (including
     * this process itself) will be accessed by this process
     * count_others_req_per_proc[i] indicates how many separate contiguous
     * requests of proc. i will be accessed by this process.
     */

    ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc,
                          my_req, nprocs, myrank, &count_others_req_procs,
                          &others_req);
    ADIOI_Free(count_my_req_per_proc);

    /* exchange data and write in sizes of no more than stripe_size. */
    ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank,
                                others_req, my_req, offset_list, len_list,
                                contig_access_count, striping_info,
                                buf_idx, error_code);

    /* If this collective write is followed by an independent write,
     * it's possible to have those subsequent writes on other processes
     * race ahead and sneak in before the read-modify-write completes.
     * We carry out a collective communication at the end here so no one
     * can start independent i/o before collective I/O completes.
     *
     * need to do some gymnastics with the error codes so that if something
     * went wrong, all processes report error, but if a process has a more
     * specific error code, we can still have that process report the
     * additional information */

    old_error = *error_code;
    if (*error_code != MPI_SUCCESS)
	*error_code = MPI_ERR_IO;

    /* optimization: if only one process is performing I/O, we can perform
     * a less-expensive Bcast instead of an Allreduce */
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL);
#endif
    if (fd->hints->cb_nodes == 1)
	MPI_Bcast(error_code, 1, MPI_INT,
		  fd->hints->ranklist[0], fd->comm);
    else {
	tmp_error = *error_code;
	MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT,
		      MPI_MAX, fd->comm);
    }
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL);
#endif

    if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO))
	*error_code = old_error;


    if (!buftype_is_contig)
	ADIOI_Delete_flattened(datatype);

    /* free all memory allocated for collective I/O */
    /* free others_req */
    for (i = 0; i < nprocs; i++) {
	if (others_req[i].count) {
	    ADIOI_Free(others_req[i].offsets);
	    ADIOI_Free(others_req[i].lens);
	    ADIOI_Free(others_req[i].mem_ptrs);
	}
    }
    ADIOI_Free(others_req);
    /* free my_req here */
    for (i = 0; i < nprocs; i++) {
	if (my_req[i].count) {
	    ADIOI_Free(my_req[i].offsets);
	    ADIOI_Free(my_req[i].lens);
	}
    }
    ADIOI_Free(my_req);
    for (i = 0; i < nprocs; i++) {
        ADIOI_Free(buf_idx[i]);
    }
    ADIOI_Free(buf_idx);
    ADIOI_Free(offset_list);
    ADIOI_Free(len_list);
    ADIOI_Free(st_offsets);
    ADIOI_Free(end_offsets);
    ADIOI_Free(striping_info);

#ifdef HAVE_STATUS_SET_BYTES
    if (status) {
	int bufsize, size;
	/* Don't set status if it isn't needed */
	MPI_Type_size(datatype, &size);
	bufsize = size * count;
	MPIR_Status_set_bytes(status, datatype, bufsize);
    }
    /* This is a temporary way of filling in status. The right way is to
     * keep track of how much data was actually written during collective I/O.
     */
#endif

    fd->fp_sys_posn = -1;	/* set it to null. */
}
Example 4
void all_gather(const T& elem, std::vector<T>& results) {
#ifdef HAS_MPI
    // Get the mpi rank and size
    size_t mpi_size(size());
    if(results.size() != mpi_size) results.resize(mpi_size);

    // Serialize the local map
    graphlab::charstream cstrm(128);
    graphlab::oarchive oarc(cstrm);
    oarc << elem;
    cstrm.flush();
    char* send_buffer = cstrm->c_str();
    int send_buffer_size = (int)cstrm->size();
    assert(send_buffer_size >= 0);

    // Per-process serialized sizes (filled in by the Allgather below)
    std::vector<int> recv_sizes(mpi_size, -1);
    // Gather the serialized size from every process
    int error = MPI_Allgather(&send_buffer_size,  // Send buffer
                              1,                  // send count
                              MPI_INT,            // send type
                              &(recv_sizes[0]),  // recvbuffer
                              1,                  // recvcount
                              MPI_INT,           // recvtype
                              MPI_COMM_WORLD);
    assert(error == MPI_SUCCESS);
    for(size_t i = 0; i < recv_sizes.size(); ++i)
        assert(recv_sizes[i] >= 0);


    // Construct offsets
    std::vector<int> recv_offsets(recv_sizes);
    int sum = 0, tmp = 0;
    for(size_t i = 0; i < recv_offsets.size(); ++i) {
        tmp = recv_offsets[i];
        recv_offsets[i] = sum;
        sum += tmp;
    }
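    // Illustration: if recv_sizes were {3, 1, 2}, the loop above turns
    // recv_offsets into the exclusive prefix sums {0, 3, 4} and sum ends up
    // as 6, the total number of bytes gathered by MPI_Allgatherv below.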

    // allocate the receive buffer large enough for all serialized elements
    std::vector<char> recv_buffer(sum);

    // recv all the maps
    error = MPI_Allgatherv(send_buffer,         // send buffer
                           send_buffer_size,    // how much to send
                           MPI_BYTE,            // send type
                           &(recv_buffer[0]),   // recv buffer
                           &(recv_sizes[0]),    // amount to recv
                           // from each process
                           &(recv_offsets[0]),  // where to place data
                           MPI_BYTE,
                           MPI_COMM_WORLD);
    assert(error == MPI_SUCCESS);
    // Update the local map
    namespace bio = boost::iostreams;
    typedef bio::stream<bio::array_source> icharstream;
    icharstream strm(&(recv_buffer[0]), recv_buffer.size());
    graphlab::iarchive iarc(strm);
    for(size_t i = 0; i < results.size(); ++i) {
        iarc >> results[i];
    }
#else
    logstream(LOG_FATAL) << "MPI not installed!" << std::endl;
#endif
} // end of mpi all gather
int uctSort(NimGameState rootState, int maximumIterations, bool useRanks)
{
    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    // Get the number of processes
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    MctsNode root(0, NULL, &rootState);
    for(int i=0; i<maximumIterations; i++)
    {
        MctsNode* node = &root;
        NimGameState state = rootState.clone();

        // std::cout<< "Copy Repr " << node->representation() << std::endl;
        while(node->actionsNotTaken.empty() && !node->childNodes.empty())
        {
            // std::cout<< "Entering Selection Step" << std::endl;
            node = node->selectNextChildNode();
            state.performAction(node->previousAction);
        }

        if(!node->actionsNotTaken.empty())
        {
            // std::cout<< "Entering Expansion Step" << std::endl;
            std::random_shuffle(node->actionsNotTaken.begin(), node->actionsNotTaken.end());
            int action = node->actionsNotTaken.back();
            // std::cout<< "Action: " << action << std::endl;

            state.performAction(action);
            // std::cout<< state.representation() << std::endl;

            node = node->addChildNode(action, &state);
            // std::cout<< node->representation() << std::endl;
        }

        while(!state.getAvailableActions().empty())
        {
            // std::cout<< "Entering Simulation Step" << std::endl;
            // random playout: pick a random action from the current *state*,
            // not from the expanded node's untried actions (assumes the
            // available actions are returned as a std::vector<int>)
            std::vector<int> available = state.getAvailableActions();
            std::random_shuffle(available.begin(), available.end());
            state.performAction(available.back());
        }

        while(node->parentNode != NULL)
        {
            // std::cout<< "Entering Backpropagation Step" << std::endl;
            node->update(state.getValue(node->lastActivePlayer));
            node = node->parentNode;
        }
    }


    if(useRanks && world_size > 1)
    {
        // Root synchronization
        std::string serialized = MctsNodeSerializer::Serialize(root);
        serialized += "#";
        if (world_rank == 0)
        {
            //std::cout << "Sending:" << serialized.length() << "/" << DEFAULT_MESSAGE_SIZE << std::endl;
            //std::cout << "Rank 0, root looks before merge: " << serialized << std::endl;
        }

        if (serialized.length() > DEFAULT_MESSAGE_SIZE)
        {
            std::cout << "Error:" << serialized.length() << "/" << DEFAULT_MESSAGE_SIZE << std::endl;
            std::cout << "Seralized tree is too big!" << std::endl;
            return -1;
        }

        if (serialized.length() < DEFAULT_MESSAGE_SIZE)
        {
            serialized.resize(DEFAULT_MESSAGE_SIZE, '@');
        }
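        // Illustration of the fixed-size exchange (values hypothetical): with
        // DEFAULT_MESSAGE_SIZE = 8, a serialized tree "T1#" is padded to
        // "T1#@@@@@".  After the MPI_Allgather below, the '@' padding is
        // stripped and the concatenated buffer is split on '#' to recover one
        // serialized tree per rank.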

        char *rcv_buffer = new char[DEFAULT_MESSAGE_SIZE * world_size];

        MPI_Allgather((void*)serialized.c_str(), DEFAULT_MESSAGE_SIZE, MPI::CHAR,
                      rcv_buffer, DEFAULT_MESSAGE_SIZE, MPI::CHAR,
                      MPI_COMM_WORLD);


        std::string received(rcv_buffer, (unsigned long) (DEFAULT_MESSAGE_SIZE * world_size));
        delete[] rcv_buffer;

        //long ct = std::count(received.begin(), received.end(), '@');

        received.erase(std::remove(received.begin(), received.end(), '@'), received.end());
        std::stringstream receivedDataStream(received);
        //std::cout << "All data: " << received << std::endl;
        std::string serializedTree;
        while (std::getline(receivedDataStream, serializedTree, '#'))
        {
            serializedTree = serializedTree.substr(0, serializedTree.length() - 1);
            if (world_rank == 0)
            {
                //std::cout << "Rank 0, deserialized new tree before unboxing: " << serializedTree << std::endl;
                //std::cout << serialized.length() << std::endl;
                //std::cout << "There was " << ct << "/" << DEFAULT_MESSAGE_SIZE*world_size << "empty chars in data for iteration " << i << std::endl;
                //std::cout << "Attempting to deserialize data: '" << serializedTree << "'" <<  std::endl;
                //std::cout << "Attempting to deserialize data.. '" <<  std::endl;
                MctsNode remoteTree = MctsNodeDeserializer::Deserialize(serializedTree);


                //std::cout << "Rank 0, root before merge: " << MctsNodeSerializer::Serialize(root) << std::endl;
                //std::cout << "Rank 0, newTree before merge: " << MctsNodeSerializer::Serialize(root) << std::endl;
                //std::cout << "Data deserialized" << std::endl;
                //std::cout << "Attempting to merge" << std::endl;
                MctsTreeMerger::MergeTrees(&root, &remoteTree);

                std::cout << "Rank 0, after merge: " << MctsNodeSerializer::Serialize(root) << std::endl;
                //std::cout << "Merge is done" << std::endl;
            }
            else
            {
                MctsNode remoteTree = MctsNodeDeserializer::Deserialize(serializedTree);
                MctsTreeMerger::MergeTrees(&root, &remoteTree);
            }
        }
    }

    std::sort(root.childNodes.begin(), root.childNodes.end(), compareNodesByVisists);
    MctsNode lastItem = root.childNodes.back();
//    if(world_rank == 0)
//    {
//        //std::cout << MctsNodeSerializer::Serialize(root) << std::endl;
//        for(std::vector<MctsNode>::iterator it = root.childNodes.begin(); it != root.childNodes.end(); ++it)
//        {
//            std::cout << it->representation() << std::endl;
//        }
//    }
    // std::cout<< "RootState @exit: "<< rootState.representation() << std::endl;
    return lastItem.previousAction;
}
Example 6
/*  data1, odata1 and odata2 are packed in the format (for communication):
       data[0]          = is_max, no of is
       data[1]          = size of is[0]
        ...
       data[is_max]     = size of is[is_max-1]
       data[is_max + 1] = data(is[0])
        ...
       data[is_max+1+sum(size of is[k]), k=0,...,i-1] = data(is[i])
        ...
   data2 is packed in the format (for creating output is[]):
       data[0]          = is_max, no of is
       data[1]          = size of is[0]
        ...
       data[is_max]     = size of is[is_max-1]
       data[is_max + 1] = data(is[0])
        ...
       data[is_max + 1 + Mbs*i) = data(is[i])
        ...
*/
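/* Worked example (illustration only): for is_max = 2 with is[0] of size 3
   and is[1] of size 2, data1 is packed as
       data[0]    = 2           (is_max)
       data[1]    = 3           (size of is[0])
       data[2]    = 2           (size of is[1])
       data[3..5] = indices of is[0]
       data[6..7] = indices of is[1]
   while in data2 the indices of is[i] always start at data[is_max + 1 + Mbs*i],
   i.e. at data[3 + Mbs*i] here. */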
static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C,PetscInt is_max,IS is[])
{
    Mat_MPISBAIJ   *c = (Mat_MPISBAIJ*)C->data;
    PetscErrorCode ierr;
    PetscMPIInt    size,rank,tag1,tag2,*len_s,nrqr,nrqs,*id_r1,*len_r1,flag,len,*iwork;
    const PetscInt *idx_i;
    PetscInt       idx,isz,col,*n,*data1,**data1_start,*data2,*data2_i,*data,*data_i;
    PetscInt       Mbs,i,j,k,*odata1,*odata2;
    PetscInt       proc_id,**odata2_ptr,*ctable=0,*btable,len_max,len_est;
    PetscInt       proc_end=0,len_unused,nodata2;
    PetscInt       ois_max; /* max no. of is[] over all processors */
    char           *t_p;
    MPI_Comm       comm;
    MPI_Request    *s_waits1,*s_waits2,r_req;
    MPI_Status     *s_status,r_status;
    PetscBT        *table;  /* mark indices of this processor's is[] */
    PetscBT        table_i;
    PetscBT        otable; /* mark indices of other processors' is[] */
    PetscInt       bs=C->rmap->bs,Bn = c->B->cmap->n,Bnbs = Bn/bs,*Bowners;
    IS             garray_local,garray_gl;

    PetscFunctionBegin;
    ierr = PetscObjectGetComm((PetscObject)C,&comm);
    CHKERRQ(ierr);
    size = c->size;
    rank = c->rank;
    Mbs  = c->Mbs;

    ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);
    CHKERRQ(ierr);
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);
    CHKERRQ(ierr);

    /* create tables used in
       step 1: table[i] - mark c->garray of proc [i]
       step 3: table[i] - mark indices of is[i] when whose=MINE
               table[0] - mark indices of is[] when whose=OTHER */
    len  = PetscMax(is_max, size);
    ierr = PetscMalloc2(len,&table,(Mbs/PETSC_BITS_PER_BYTE+1)*len,&t_p);
    CHKERRQ(ierr);
    for (i=0; i<len; i++) {
        table[i] = t_p  + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
    }

    ierr = MPIU_Allreduce(&is_max,&ois_max,1,MPIU_INT,MPI_MAX,comm);
    CHKERRQ(ierr);

    /* 1. Send this processor's is[] to other processors */
    /*---------------------------------------------------*/
    /* allocate spaces */
    ierr = PetscMalloc1(is_max,&n);
    CHKERRQ(ierr);
    len  = 0;
    for (i=0; i<is_max; i++) {
        ierr = ISGetLocalSize(is[i],&n[i]);
        CHKERRQ(ierr);
        len += n[i];
    }
    if (!len) {
        is_max = 0;
    } else {
        len += 1 + is_max; /* max length of data1 for one processor */
    }


    ierr = PetscMalloc1(size*len+1,&data1);
    CHKERRQ(ierr);
    ierr = PetscMalloc1(size,&data1_start);
    CHKERRQ(ierr);
    for (i=0; i<size; i++) data1_start[i] = data1 + i*len;

    ierr = PetscMalloc4(size,&len_s,size,&btable,size,&iwork,size+1,&Bowners);
    CHKERRQ(ierr);

    /* gather c->garray from all processors */
    ierr = ISCreateGeneral(comm,Bnbs,c->garray,PETSC_COPY_VALUES,&garray_local);
    CHKERRQ(ierr);
    ierr = ISAllGather(garray_local, &garray_gl);
    CHKERRQ(ierr);
    ierr = ISDestroy(&garray_local);
    CHKERRQ(ierr);
    ierr = MPI_Allgather(&Bnbs,1,MPIU_INT,Bowners+1,1,MPIU_INT,comm);
    CHKERRQ(ierr);

    Bowners[0] = 0;
    for (i=0; i<size; i++) Bowners[i+1] += Bowners[i];

    if (is_max) {
        /* hash table ctable which maps c->row to proc_id */
        ierr = PetscMalloc1(Mbs,&ctable);
        CHKERRQ(ierr);
        for (proc_id=0,j=0; proc_id<size; proc_id++) {
            for (; j<C->rmap->range[proc_id+1]/bs; j++) ctable[j] = proc_id;
        }

        /* hash tables marking c->garray */
        ierr = ISGetIndices(garray_gl,&idx_i);
        CHKERRQ(ierr);
        for (i=0; i<size; i++) {
            table_i = table[i];
            ierr    = PetscBTMemzero(Mbs,table_i);
            CHKERRQ(ierr);
            for (j = Bowners[i]; j<Bowners[i+1]; j++) { /* go through B cols of proc[i]*/
                ierr = PetscBTSet(table_i,idx_i[j]);
                CHKERRQ(ierr);
            }
        }
        ierr = ISRestoreIndices(garray_gl,&idx_i);
        CHKERRQ(ierr);
    }  /* if (is_max) */
    ierr = ISDestroy(&garray_gl);
    CHKERRQ(ierr);

    /* evaluate communication - mesg to who, length, and buffer space */
    for (i=0; i<size; i++) len_s[i] = 0;

    /* header of data1 */
    for (proc_id=0; proc_id<size; proc_id++) {
        iwork[proc_id]        = 0;
        *data1_start[proc_id] = is_max;
        data1_start[proc_id]++;
        for (j=0; j<is_max; j++) {
            if (proc_id == rank) {
                *data1_start[proc_id] = n[j];
            } else {
                *data1_start[proc_id] = 0;
            }
            data1_start[proc_id]++;
        }
    }

    for (i=0; i<is_max; i++) {
        ierr = ISGetIndices(is[i],&idx_i);
        CHKERRQ(ierr);
        for (j=0; j<n[i]; j++) {
            idx                = idx_i[j];
            *data1_start[rank] = idx;
            data1_start[rank]++; /* for local processing */
            proc_end           = ctable[idx];
            for (proc_id=0; proc_id<=proc_end; proc_id++) {  /* for others to process */
                if (proc_id == rank) continue; /* done before this loop */
                if (proc_id < proc_end && !PetscBTLookup(table[proc_id],idx)) continue; /* no need for sending idx to [proc_id] */
                *data1_start[proc_id] = idx;
                data1_start[proc_id]++;
                len_s[proc_id]++;
            }
        }
        /* update header data */
        for (proc_id=0; proc_id<size; proc_id++) {
            if (proc_id== rank) continue;
            *(data1 + proc_id*len + 1 + i) = len_s[proc_id] - iwork[proc_id];
            iwork[proc_id]                 = len_s[proc_id];
        }
        ierr = ISRestoreIndices(is[i],&idx_i);
        CHKERRQ(ierr);
    }

    nrqs = 0;
    nrqr = 0;
    for (i=0; i<size; i++) {
        data1_start[i] = data1 + i*len;
        if (len_s[i]) {
            nrqs++;
            len_s[i] += 1 + is_max; /* add no. of header msg */
        }
    }

    for (i=0; i<is_max; i++) {
        ierr = ISDestroy(&is[i]);
        CHKERRQ(ierr);
    }
    ierr = PetscFree(n);
    CHKERRQ(ierr);
    ierr = PetscFree(ctable);
    CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and their from-ids */
    ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrqr);
    CHKERRQ(ierr);
    ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,len_s,&id_r1,&len_r1);
    CHKERRQ(ierr);

    /*  Now  post the sends */
    ierr = PetscMalloc2(size,&s_waits1,size,&s_waits2);
    CHKERRQ(ierr);
    k    = 0;
    for (proc_id=0; proc_id<size; proc_id++) {  /* send data1 to processor [proc_id] */
        if (len_s[proc_id]) {
            ierr = MPI_Isend(data1_start[proc_id],len_s[proc_id],MPIU_INT,proc_id,tag1,comm,s_waits1+k);
            CHKERRQ(ierr);
            k++;
        }
    }

    /* 2. Receive other's is[] and process. Then send back */
    /*-----------------------------------------------------*/
    len = 0;
    for (i=0; i<nrqr; i++) {
        if (len_r1[i] > len) len = len_r1[i];
    }
    ierr = PetscFree(len_r1);
    CHKERRQ(ierr);
    ierr = PetscFree(id_r1);
    CHKERRQ(ierr);

    for (proc_id=0; proc_id<size; proc_id++) len_s[proc_id] = iwork[proc_id] = 0;

    ierr = PetscMalloc1(len+1,&odata1);
    CHKERRQ(ierr);
    ierr = PetscMalloc1(size,&odata2_ptr);
    CHKERRQ(ierr);
    ierr = PetscBTCreate(Mbs,&otable);
    CHKERRQ(ierr);

    len_max = ois_max*(Mbs+1); /* max space storing all is[] for each receive */
    len_est = 2*len_max;       /* estimated space of storing is[] for all receiving messages */
    ierr    = PetscMalloc1(len_est+1,&odata2);
    CHKERRQ(ierr);
    nodata2 = 0;               /* nodata2+1 = number of odata2 buffers allocated so far */

    odata2_ptr[nodata2] = odata2;

    len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max  */

    k = 0;
    while (k < nrqr) {
        /* Receive messages */
        ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag1,comm,&flag,&r_status);
        CHKERRQ(ierr);
        if (flag) {
            ierr    = MPI_Get_count(&r_status,MPIU_INT,&len);
            CHKERRQ(ierr);
            proc_id = r_status.MPI_SOURCE;
            ierr    = MPI_Irecv(odata1,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);
            CHKERRQ(ierr);
            ierr    = MPI_Wait(&r_req,&r_status);
            CHKERRQ(ierr);

            /*  Process messages */
            /*  make sure there is enough unused space in odata2 array */
            if (len_unused < len_max) { /* allocate more space for odata2 */
                ierr = PetscMalloc1(len_est+1,&odata2);
                CHKERRQ(ierr);

                odata2_ptr[++nodata2] = odata2;

                len_unused = len_est;
            }

            ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,odata1,OTHER,odata2,&otable);
            CHKERRQ(ierr);
            len  = 1 + odata2[0];
            for (i=0; i<odata2[0]; i++) len += odata2[1 + i];

            /* Send messages back */
            ierr = MPI_Isend(odata2,len,MPIU_INT,proc_id,tag2,comm,s_waits2+k);
            CHKERRQ(ierr);
            k++;
            odata2        += len;
            len_unused    -= len;
            len_s[proc_id] = len; /* num of messages sending back to [proc_id] by this proc */
        }
    }
    ierr = PetscFree(odata1);
    CHKERRQ(ierr);
    ierr = PetscBTDestroy(&otable);
    CHKERRQ(ierr);

    /* 3. Do local work on this processor's is[] */
    /*-------------------------------------------*/
    /* make sure there is enough unused space in odata2(=data) array */
    len_max = is_max*(Mbs+1); /* max space storing all is[] for this processor */
    if (len_unused < len_max) { /* allocate more space for odata2 */
        ierr = PetscMalloc1(len_est+1,&odata2);
        CHKERRQ(ierr);

        odata2_ptr[++nodata2] = odata2;
    }

    data = odata2;
    ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,data1_start[rank],MINE,data,table);
    CHKERRQ(ierr);
    ierr = PetscFree(data1_start);
    CHKERRQ(ierr);

    /* 4. Receive work done on other processors, then merge */
    /*------------------------------------------------------*/
    /* get max number of messages that this processor expects to recv */
    ierr = MPIU_Allreduce(len_s,iwork,size,MPI_INT,MPI_MAX,comm);
    CHKERRQ(ierr);
    ierr = PetscMalloc1(iwork[rank]+1,&data2);
    CHKERRQ(ierr);
    ierr = PetscFree4(len_s,btable,iwork,Bowners);
    CHKERRQ(ierr);

    k = 0;
    while (k < nrqs) {
        /* Receive messages */
        ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag2,comm,&flag,&r_status);
        CHKERRQ(ierr);
        if (flag) {
            ierr = MPI_Get_count(&r_status,MPIU_INT,&len);
            CHKERRQ(ierr);

            proc_id = r_status.MPI_SOURCE;

            ierr = MPI_Irecv(data2,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);
            CHKERRQ(ierr);
            ierr = MPI_Wait(&r_req,&r_status);
            CHKERRQ(ierr);
            if (len > 1+is_max) { /* Add data2 into data */
                data2_i = data2 + 1 + is_max;
                for (i=0; i<is_max; i++) {
                    table_i = table[i];
                    data_i  = data + 1 + is_max + Mbs*i;
                    isz     = data[1+i];
                    for (j=0; j<data2[1+i]; j++) {
                        col = data2_i[j];
                        if (!PetscBTLookupSet(table_i,col)) data_i[isz++] = col;
                    }
                    data[1+i] = isz;
                    if (i < is_max - 1) data2_i += data2[1+i];
                }
            }
            k++;
        }
    }
    ierr = PetscFree(data2);
    CHKERRQ(ierr);
    ierr = PetscFree2(table,t_p);
    CHKERRQ(ierr);

    /* phase 1 sends are complete */
    ierr = PetscMalloc1(size,&s_status);
    CHKERRQ(ierr);
    if (nrqs) {
        ierr = MPI_Waitall(nrqs,s_waits1,s_status);
        CHKERRQ(ierr);
    }
    ierr = PetscFree(data1);
    CHKERRQ(ierr);

    /* phase 2 sends are complete */
    if (nrqr) {
        ierr = MPI_Waitall(nrqr,s_waits2,s_status);
        CHKERRQ(ierr);
    }
    ierr = PetscFree2(s_waits1,s_waits2);
    CHKERRQ(ierr);
    ierr = PetscFree(s_status);
    CHKERRQ(ierr);

    /* 5. Create new is[] */
    /*--------------------*/
    for (i=0; i<is_max; i++) {
        data_i = data + 1 + is_max + Mbs*i;
        ierr   = ISCreateGeneral(PETSC_COMM_SELF,data[1+i],data_i,PETSC_COPY_VALUES,is+i);
        CHKERRQ(ierr);
    }
    for (k=0; k<=nodata2; k++) {
        ierr = PetscFree(odata2_ptr[k]);
        CHKERRQ(ierr);
    }
    ierr = PetscFree(odata2_ptr);
    CHKERRQ(ierr);
    PetscFunctionReturn(0);
}
/*-----------------------------------------------------------------------------
 * Function:  Parallel_daypx
 * Purpose:   compute y = a*y + x on the local block, then gather the complete
 *            solution vector onto each process
 * In args:   a, loc_x, loc_y, n, comm
 * Out args:  loc_y, y
 */
void Parallel_daypx(double a, double loc_x[], double loc_y[], double y[],
                    int n, MPI_Comm comm) {

    daypx(a, loc_x, loc_y, n);
    MPI_Allgather(loc_y, n, MPI_DOUBLE, y, n, MPI_DOUBLE, comm);
}   /* Parallel_daypx */
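
The local kernel daypx() is not shown in this listing. A minimal sketch consistent with the comment above (y = a*y + x on the locally owned block) could look like the following; the exact signature is an assumption inferred from the call site.

/* Hypothetical sketch of the local kernel: y[i] = a*y[i] + x[i] for the
 * n elements owned by this process (signature inferred from the call site). */
void daypx(double a, double x[], double y[], int n) {
   int i;
   for (i = 0; i < n; i++)
      y[i] = a * y[i] + x[i];
}   /* daypx */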
Example 8
/* communicate integers and doubles using point to point communication */
int COM (MPI_Comm comm, int tag,
         COMDATA *send, int nsend,
	 COMDATA **recv, int *nrecv) /* recv is contiguous => free (*recv) releases all memory */
{
  COMDATA *cd;
  int rank,
      ncpu,
      send_size,
    (*send_sizes) [3],
     *send_position,
     *send_rank,
      send_count,
     *send_rank_all,
     *send_count_all,
     *send_rank_disp,
     *recv_rank,
    (*recv_sizes) [3],
      recv_count,
      i, j, k, l;
  char **send_data,
       **recv_data;
  MPI_Request *req;
  MPI_Status *sta;
  void *p;

  MPI_Comm_rank (comm, &rank);
  MPI_Comm_size (comm, &ncpu);

  ERRMEM (send_sizes = MEM_CALLOC (ncpu * sizeof (int [3])));
  ERRMEM (send_position = MEM_CALLOC (ncpu * sizeof (int)));
  ERRMEM (send_rank = malloc (ncpu * sizeof (int)));
  ERRMEM (send_data = malloc (ncpu * sizeof (char*)));

  /* compute send sizes */
  for (i = 0, cd = send; i < nsend; i ++, cd ++)
  {
    send_sizes [cd->rank][0] += cd->ints;
    send_sizes [cd->rank][1] += cd->doubles;
    MPI_Pack_size (cd->ints, MPI_INT, comm, &j);
    MPI_Pack_size (cd->doubles, MPI_DOUBLE, comm, &k);
    send_sizes [cd->rank][2] += (j + k);
  }

  /* allocate send buffers */
  for (send_size = i = 0; i < ncpu; i ++)
  {
    if (send_sizes [i][2])
    {
      ERRMEM (send_data [i] = malloc (send_sizes [i][2]));
      send_position [i] = 0;
      send_size += send_sizes [i][2];
    }
  }

  /* pack ints */
  for (i = 0, cd = send; i < nsend; i ++, cd ++)
  {
    if (cd->ints)
    {
      MPI_Pack (cd->i, cd->ints, MPI_INT, send_data [cd->rank], send_sizes [cd->rank][2], &send_position [cd->rank], comm);
    }
  }

  /* pack doubles */
  for (i = 0, cd = send; i < nsend; i ++, cd ++)
  {
    if (cd->doubles)
    {
      MPI_Pack (cd->d, cd->doubles, MPI_DOUBLE, send_data [cd->rank], send_sizes [cd->rank][2], &send_position [cd->rank], comm);
    }
  }

#if DEBUG
  for (i = 0; i < ncpu; i ++)
  {
    ASSERT_DEBUG (send_position [i] <= send_sizes [i][2], "Incorrect packing");
  }
#endif

  /* compute send ranks and move data */
  for (send_count = i = 0; i < ncpu; i ++)
  {
    if (send_sizes [i][2])
    {
      send_rank [send_count] = i;
      send_data [send_count] = send_data [i];
      send_sizes [send_count][0] = send_sizes [i][0];
      send_sizes [send_count][1] = send_sizes [i][1];
      send_sizes [send_count][2] = send_sizes [i][2];
      send_count ++;
    }
  }

  ERRMEM (send_count_all = malloc (ncpu * sizeof (int)));
  ERRMEM (recv_rank = malloc (ncpu * sizeof (int)));

  /* gather all send ranks */
  MPI_Allgather (&send_count, 1, MPI_INT, send_count_all, 1, MPI_INT, comm);
  ERRMEM (send_rank_disp = malloc (ncpu * sizeof (int)));
  for (send_rank_disp [0] = l = i = 0; i < ncpu; i ++)
  { l += send_count_all [i]; if (i < ncpu-1) send_rank_disp [i+1] = l; }
  ERRMEM (send_rank_all = malloc (l * sizeof (int)));
  MPI_Allgatherv (send_rank, send_count, MPI_INT, send_rank_all, send_count_all, send_rank_disp, MPI_INT, comm);

  /* compute receive ranks */
  for (recv_count = k = i = 0; i < l; i += send_count_all [k], k ++)
  {
    for (j = 0; j < send_count_all [k]; j ++)
    {
      if (send_rank_all [i+j] == rank) /* 'k'th rank is sending here */
      {
	recv_rank [recv_count] = k;
	recv_count ++;
	break;
      }
    }
  }
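  /* Illustration: on 3 ranks, if rank 0 sends to {1,2}, rank 1 sends to {0}
   * and rank 2 sends to {1}, then send_count_all = {2,1,1} and
   * send_rank_all = {1,2, 0, 1}.  Scanning this on rank 1 finds ranks 0 and 2
   * listing it as a destination, so recv_rank = {0,2} and recv_count = 2. */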

  ERRMEM (recv_sizes = malloc (recv_count * sizeof (int [3])));
  ERRMEM (req = malloc (recv_count * sizeof (MPI_Request)));
  ERRMEM (sta = malloc (recv_count * sizeof (MPI_Status)));

  /* communicate receive sizes */
  for (i = 0; i < recv_count; i ++)
  {
    MPI_Irecv (recv_sizes [i], 3, MPI_INT, recv_rank [i], tag, comm, &req [i]);
  }
  MPI_Barrier (comm);
  for (i = 0; i < send_count; i ++)
  {
    MPI_Rsend (send_sizes [i], 3, MPI_INT, send_rank [i], tag, comm);
  }
  MPI_Waitall (recv_count, req, sta);

  /* contiguous receive size */
  j = recv_count * sizeof (COMDATA);
  for (i = 0; i < recv_count; i ++)
  {
    j += recv_sizes [i][0] * sizeof (int) + 
         recv_sizes [i][1] * sizeof (double);
  }

  /* prepare receive buffers */
  ERRMEM (recv_data = malloc (recv_count * sizeof (char*)));
  ERRMEM ((*recv) = malloc (j));
  p = (*recv) + recv_count;
  *nrecv = recv_count;
  for (i = 0, cd = *recv; i < recv_count; i ++, cd ++)
  {
    cd->rank = recv_rank [i];
    cd->ints = recv_sizes [i][0];
    cd->doubles = recv_sizes [i][1];
    cd->i = p; p = (cd->i + cd->ints);
    cd->d = p; p = (cd->d + cd->doubles);
    ERRMEM (recv_data [i] = malloc (recv_sizes [i][2]));
  }

  /* communicate data */
  for (i = 0; i < recv_count; i ++)
  {
    MPI_Irecv (recv_data [i], recv_sizes [i][2], MPI_PACKED, recv_rank [i], tag, comm, &req [i]);
  }
  MPI_Barrier (comm);
  for (i = 0; i < send_count; i ++)
  {
    MPI_Rsend (send_data [i], send_sizes [i][2], MPI_PACKED, send_rank [i], tag, comm);
  }
  MPI_Waitall (recv_count, req, sta);

  /* unpack data */
  for (i = j = 0; i < recv_count; i ++, j = 0)
  {
    MPI_Unpack (recv_data [i], recv_sizes [i][2], &j, (*recv) [i].i, (*recv) [i].ints, MPI_INT, comm);
    MPI_Unpack (recv_data [i], recv_sizes [i][2], &j, (*recv) [i].d, (*recv) [i].doubles, MPI_DOUBLE, comm);
  }

  /* cleanup */
  free (send_rank_disp);
  free (send_sizes);
  free (send_position);
  free (send_rank);
  for (i = 0; i < send_count; i ++)
    free (send_data [i]);
  free (send_data);
  free (send_count_all);
  free (send_rank_all);
  free (recv_rank);
  free (recv_sizes);
  for (i = 0; i < recv_count; i ++)
    free (recv_data [i]);
  free (recv_data);
  free (req);
  free (sta);

  return send_size;
}
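
A minimal usage sketch of COM() follows. It is an assumption based only on the COMDATA fields the function itself reads (rank, ints, doubles, i, d); the tag and payload values are arbitrary.

/* Hypothetical usage of COM(): each rank sends two integers to the next rank. */
static void com_usage_sketch (MPI_Comm comm)
{
  COMDATA send, *recv;
  int nrecv, rank, ncpu, payload [2] = {1, 2};

  MPI_Comm_rank (comm, &rank);
  MPI_Comm_size (comm, &ncpu);

  send.rank = (rank + 1) % ncpu; /* destination rank */
  send.ints = 2;
  send.i = payload;
  send.doubles = 0;
  send.d = NULL;

  COM (comm, 1000, &send, 1, &recv, &nrecv);

  /* recv is contiguous => a single free releases all received data */
  free (recv);
}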
int main(int argc, char *argv[])
{

    int i,j,n;
    double **mat, *x, **mymat, *myx;
    double sum,temp,diff,bb;
    double e;
    int iter=0;
    int proc_num, my_rank;

    MPI_Init(&argc, &argv);

    // get the number of procs and rank in the comm
    MPI_Comm_size(MPI_COMM_WORLD, &proc_num);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    double start_time, end_time, total_time;

    MPI_Status status;

    if(my_rank == 0) {
        // Proc 0 is in charge of reading matrix and distribute data

        if (argc != 3) {
            printf("Usage: mpirun -np 4 hw1_3b_mpi <filename> <error>\n");
            MPI_Abort(MPI_COMM_WORLD, 1); /* returning only on rank 0 would leave the other ranks blocked in MPI_Bcast */
        }
        printf("\nInput File: %s\n", argv[1]);

        e = (double)atof(argv[2]);
        printf("error= %f\n", e);

        /* Opening the input file */
        FILE *fp = fopen(argv[1], "r");
        if (fp == NULL) {
            printf("Error in opening a file: %s", argv[1]);
            MPI_Abort(MPI_COMM_WORLD, 1); /* abort all ranks if the input file cannot be opened */
        }

        /* Reading the maxtrix dimension */
        fscanf(fp, "%d",&n);
        printf("n= %d\n", n);

        // store in row major
        mat = (double**) malloc( n * sizeof(double*));
        for (i= 0; i<n; i++)
            mat[i]= (double*) malloc((n+1) * sizeof(double));

        /* Reading the input matrix */
        for(i=0; i<n; i++) {
            for(j=0; j<n+1; j++) {
                fscanf(fp, "%lf", &mat[i][j]);
            }
        }

        fclose(fp);

    }


    /* Solving the given matrix iteratively */
    
      /*  if(my_rank == 1){

        	int dowait = 1;
        	while(dowait){
        		;
        	}
        }
*/
    

    // Broadcast matrix dimension 'n' and convergence 'e'
    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&e, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    int myrows = n / proc_num;

    // mymat stores each proc's share of the matrix rows
    mymat = (double**) malloc( myrows * sizeof(double*));
    for (i= 0; i<myrows; i++)
        mymat[i]= (double*) malloc((n+1) * sizeof(double));
    myx = (double*) malloc((myrows) * sizeof(double));
    x = (double*) malloc((n) * sizeof(double));
    for(i=0; i<myrows; i++) {
        myx[i]=0.0;
    }

    // start time; total time should include distributing the data
    // to other processes as part of the parallelization
    start_time = MPI_Wtime();

    // make sure every proc has its myrows rows of the mat
    if(my_rank == 0){
        int dest = 0;
        for(i = myrows; i < n; i++){   
            dest = i / myrows;         
            MPI_Send(&mat[i][0], n + 1, MPI_DOUBLE, dest, i, MPI_COMM_WORLD);
        }

        for(i = 0; i < myrows; i++){   
            for(j = 0; j < n + 1; j++)         
                mymat[i][j] = mat[i][j];
        }  

    }
    else{
        for(i = 0; i < myrows; i++){            
            MPI_Recv(&mymat[i][0], n + 1, MPI_DOUBLE, 0, my_rank * myrows + i, MPI_COMM_WORLD, &status);

        }
    }
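    /* Illustration: with n = 8 and proc_num = 4, myrows = 2; rank 0 keeps
     * rows 0-1 and sends row i (tagged with i) to rank i / myrows, so rank 1
     * receives rows 2-3, rank 2 rows 4-5 and rank 3 rows 6-7.  Note that this
     * distribution assumes n is divisible by proc_num. */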

    iter=0;
    double allbb;
    double compute_time = MPI_Wtime();
    do {
        bb=0;

        // all proc get all x
        MPI_Allgather(myx, myrows, MPI_DOUBLE, x, myrows, MPI_DOUBLE, MPI_COMM_WORLD);

        for(i=0;i<myrows;i++){
            sum=0;
            for(j=0;j<n;j++){
                if(j!=i+myrows*my_rank){
                    sum=sum+mymat[i][j]*x[j];
                }
            }
			temp=(mymat[i][n]-sum) / mymat[i][i+myrows*my_rank];
            diff=fabs(x[i]-temp);
            if(diff>bb){
				bb=diff;
            }
            myx[i]=temp;

        }

        // reduce the local maxima with MPI_MAX so every process sees the
        // global maximum difference and all ranks leave the loop together
        // only after every block has converged
        MPI_Allreduce( &bb, &allbb, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD );
		iter++;

    }
    while(allbb>=e);

    // gather final x for print
    MPI_Allgather(myx, myrows, MPI_DOUBLE, x, myrows, MPI_DOUBLE, MPI_COMM_WORLD);

    if(my_rank ==0 ){
        // record end time of computation
        end_time = MPI_Wtime();
        total_time = end_time - start_time;
		printf("Total time:%lf; Computation time is:%lf\n", total_time, end_time - compute_time);

    }

#if DEBUG
    /* prints the solution */
    printf("\nAnswer >>", i, x[i]);
    for(i=0; i<n; i++) {
        printf("\nx[%d]=%f", i, x[i]);
    }
    printf("\n");
#endif

    if(my_rank == 0) {

        print_x(iter, n, x);
        printf("\ndone\n");

    }


    // free allocated memory
    for (i=0; i<myrows; i++) {
        free(mymat[i]);
    }
    free(mymat);
    free(myx);
    free(x);

    if(my_rank == 0) {

        for (i=0; i<n; i++) {
            free(mat[i]);
        }
        free(mat);
    }

    MPI_Finalize();

    return 0;
}
Example 10
/*
 * Compute the aggregator-related parameters that are required in 2-phase
 * collective IO of ADIO.
 * The parameters are
 * 	. the number of aggregators (proxies) : fd->hints->cb_nodes
 *	. the ranks of the aggregators :        fd->hints->ranklist
 * If MP_IONODEFILE is defined, POE determines all tasks on every node listed
 * in the node file and defines MP_IOTASKLIST with them, making them all
 * aggregators.  Alternatively, the user can explicitly set MP_IOTASKLIST
 * themselves.  The format of the MP_IOTASKLIST is a colon-delimited list of
 * task ids, the first entry being the total number of aggregators, for example
 * to specify 4 aggregators on task ids 0,8,16,24  the value would be:
 * 4:0:8:16:24.  If there is no MP_IONODEFILE, or MP_IOTASKLIST, then the
 * default aggregator selection is 1 task per node for every node of the job -
 * additionally, an environment variable MP_IOAGGR_CNT  can be specified, which
 * defines the total number of aggregators, spread evenly across all the nodes.
 * The romio_cb_nodes and romio_cb_config_list hint user settings are ignored.
 */
int
ADIOI_PE_gen_agg_ranklist(ADIO_File fd)
{

    int numAggs = 0;
    char *ioTaskList = getenv( "MP_IOTASKLIST" );
    char *ioAggrCount = getenv("MP_IOAGGR_CNT");
    int i,j;
    int inTERcommFlag = 0;

    int myRank,commSize;
    MPI_Comm_rank(fd->comm, &myRank);
    MPI_Comm_size(fd->comm, &commSize);

    MPI_Comm_test_inter(fd->comm, &inTERcommFlag);
    if (inTERcommFlag) {
      FPRINTF(stderr,"ERROR: ATTENTION: inTERcomms are not supported in MPI-IO - aborting....\n");
      perror("ADIOI_PE_gen_agg_ranklist:");
      MPI_Abort(MPI_COMM_WORLD, 1);
    }

    if (ioTaskList) {
      int ioTaskListLen = strlen(ioTaskList);
      int ioTaskListPos = 0;
      char tmpBuf[8];   /* Big enough for 1M tasks (7 digits task ID). */
      tmpBuf[7] = '\0';
      for (i=0; i<7; i++) {
         tmpBuf[i] = *ioTaskList++;      /* Maximum is 7 digits for 1 million. */
         ioTaskListPos++;
         if (*ioTaskList == ':') {       /* If the next char is a ':', the number ends here. */
             tmpBuf[i+1] = '\0';
             break;
         }
      }
      numAggs = atoi(tmpBuf);
      if (numAggs == 0)
        FPRINTF(stderr,"ERROR: ATTENTION: Number of aggregators specified in MP_IOTASKLIST set at 0 - default aggregator selection will be used.\n");
      else if (!((numAggs > 0 ) && (numAggs <= commSize))) {
        FPRINTF(stderr,"ERROR: ATTENTION: The number of aggregators (%s) specified in MP_IOTASKLIST is outside the communicator task range of %d.\n",tmpBuf,commSize);
        numAggs = commSize;
      }
      fd->hints->ranklist = (int *) ADIOI_Malloc (numAggs * sizeof(int));

      int aggIndex = 0;
      while (aggIndex < numAggs) {
         ioTaskList++;                /* Advance past the ':' */
         ioTaskListPos++;
         int allDigits=1;
         for (i=0; i<7; i++) {
            if (*ioTaskList < '0' || *ioTaskList > '9')
              allDigits=0;
            tmpBuf[i] = *ioTaskList++;
            ioTaskListPos++;
            if ( (*ioTaskList == ':') || (*ioTaskList == '\0') ) {
                tmpBuf[i+1] = '\0';
                break;
            }
         }
         if (allDigits) {
           int newAggRank = atoi(tmpBuf);
           if (!((newAggRank >= 0 ) && (newAggRank < commSize))) {
             FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%s' specified in MP_IOTASKLIST is not within the communicator task range of 0 to %d  - it will be ignored.\n",tmpBuf,commSize-1);
           }
           else {
             int aggAlreadyAdded = 0;
             for (i=0;i<aggIndex;i++)
               if (fd->hints->ranklist[i] == newAggRank) {
                 aggAlreadyAdded = 1;
                 break;
               }
             if (!aggAlreadyAdded)
               fd->hints->ranklist[aggIndex++] = newAggRank;
             else
               FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%d' is specified multiple times in MP_IOTASKLIST - duplicates are ignored.\n",newAggRank);
           }
         }
         else {
           FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%s' specified in MP_IOTASKLIST is not a valid integer task id  - it will be ignored.\n",tmpBuf);
         }

         /* At the end check whether the list is shorter than specified. */
         if (ioTaskListPos == ioTaskListLen) {
           if (aggIndex == 0) {
             FPRINTF(stderr,"ERROR: ATTENTION: No aggregators were correctly specified in MP_IOTASKLIST - default aggregator selection will be used.\n");
             ADIOI_Free(fd->hints->ranklist);
           }
           else if (aggIndex < numAggs)
             FPRINTF(stderr,"ERROR: ATTENTION: %d aggregators were specified in MP_IOTASKLIST but only %d were correctly specified - setting the number of aggregators to %d.\n",numAggs, aggIndex,aggIndex);
           numAggs = aggIndex;
         }
      }
    }
    if (numAggs == 0)  {
      MPID_Comm *mpidCommData;

      MPID_Comm_get_ptr(fd->comm,mpidCommData);
      int localSize = mpidCommData->local_size;

      // get my node rank
      int myNodeRank = mpidCommData->intranode_table[mpidCommData->rank];

      int *allNodeRanks = (int *) ADIOI_Malloc (localSize * sizeof(int));

      allNodeRanks[myRank] = myNodeRank;
      MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, allNodeRanks, 1, MPI_INT, fd->comm);

#ifdef AGG_DEBUG
      printf("MPID_Comm data: local_size is %d\nintranode_table entries:\n",mpidCommData->local_size);
      for (i=0;i<localSize;i++) {
        printf("%d ",mpidCommData->intranode_table[i]);
      }
      printf("\ninternode_table entries:\n");
      for (i=0;i<localSize;i++) {
        printf("%d ",mpidCommData->internode_table[i]);
      }
      printf("\n");

      printf("\nallNodeRanks entries:\n");
      for (i=0;i<localSize;i++) {
        printf("%d ",allNodeRanks[i]);
      }
      printf("\n");

#endif

      if (ioAggrCount) {
        int cntType = -1;

        if ( strcasecmp(ioAggrCount, "ALL") ) {
           if ( (cntType = atoi(ioAggrCount)) <= 0 ) {
              /* Input is non-numeric or less than 1, so assume       */
              /* 1 aggregator per node.  Note: atoi("-1") returns -1. */
              /* No warning message given here -- done earlier.      */
              cntType = -1;
           }
        }
        else {
           /* ALL was specified - set the aggregator count to localSize */
           cntType = -2;
        }
        switch(cntType) {
           case -1:
              /* 1 aggr/node case */
            {
             int rankListIndex = 0;
             fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int));
             for (i=0;i<localSize;i++) {
               if (allNodeRanks[i] == 0) {
                 fd->hints->ranklist[rankListIndex++] = i;
                 numAggs++;
               }
             }
            }
              break;
           case -2:
              /* ALL tasks case */
             fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int));
             for (i=0;i<localSize;i++) {
               fd->hints->ranklist[i] = i;
               numAggs++;
             }
              break;
           default:
              /* Specific aggr count case -- must not exceed localSize, otherwise it is capped at localSize */
             if (cntType > localSize)
               cntType = localSize;

             numAggs = cntType;
             // Round-robin thru allNodeRanks - pick the 0's, then the 1's, etc
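             // e.g. two 4-task nodes give allNodeRanks = {0,1,2,3,0,1,2,3};
             // requesting 3 aggregators selects global ranks 0 and 4 (the
             // node-rank 0 tasks) and then rank 1 (the first node-rank 1 task)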
             int currentNodeRank = 0;  // node rank currently being selected as aggregator
             int rankListIndex = 0;
             int currentAllNodeIndex = 0;

             fd->hints->ranklist = (int *) ADIOI_Malloc (numAggs * sizeof(int));

             while (rankListIndex < numAggs) {
               int foundEntry = 0;
               while (!foundEntry && (currentAllNodeIndex < localSize)) {
                 if (allNodeRanks[currentAllNodeIndex] == currentNodeRank) {
                   fd->hints->ranklist[rankListIndex++] = currentAllNodeIndex;
                   foundEntry = 1;
                 }
                 currentAllNodeIndex++;
               }
               if (!foundEntry) {
                 currentNodeRank++;
                 currentAllNodeIndex = 0;
               }
             } // while
          break;
        } // switch(cntType)
      } // if (ioAggrCount)

      else { // default is 1 aggregator per node
        // take the 0 entries from allNodeRanks
        int rankListIndex = 0;
        fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int));
        for (i=0;i<localSize;i++) {
          if (allNodeRanks[i] == 0) {
            fd->hints->ranklist[rankListIndex++] = i;
            numAggs++;
          }
        }
      }

      ADIOI_Free(allNodeRanks);

    }

    if ( getenv("MP_I_SHOW_AGGRS") ) {
      if (myRank == 0) {
        printf("Agg rank list of %d generated:\n", numAggs);
        for (i=0;i<numAggs;i++) {
          printf("%d ",fd->hints->ranklist[i]);
        }
        printf("\n");
      }
    }

    fd->hints->cb_nodes = numAggs;

    return 0;
}
Example No. 11
/* create a repetitive point to point communication pattern;
 * ranks and sizes must not change during the communication;
 * pointers to the send and receive buffer data must not change */
void* COM_Pattern (MPI_Comm comm, int tag,
                   COMDATA *send, int nsend,
	           COMDATA **recv, int *nrecv) /* recv is contiguous => free (*recv) releases all memory */
{
  COMPATTERN *pattern;
  COMDATA *cd;
  int rank,
      ncpu,
     *send_rank_all,
     *send_count_all,
     *send_rank_disp,
      i, j, k, l;
  void *p;

  MPI_Comm_rank (comm, &rank);
  MPI_Comm_size (comm, &ncpu);

  ERRMEM (pattern = malloc (sizeof (COMPATTERN)));
  ERRMEM (pattern->rankmap = MEM_CALLOC (ncpu * sizeof (int)));
  ERRMEM (pattern->send_sizes = MEM_CALLOC (ncpu * sizeof (int [3])));
  ERRMEM (pattern->send_position = MEM_CALLOC (ncpu * sizeof (int)));
  ERRMEM (pattern->send_rank = malloc (ncpu * sizeof (int)));
  ERRMEM (pattern->send_data = malloc (ncpu * sizeof (char*)));
  pattern->nsend = nsend;
  pattern->send = send;
  pattern->comm = comm;
  pattern->tag = tag;

  /* compute send sizes */
  for (i = 0, cd = send; i < nsend; i ++, cd ++)
  {
    pattern->send_sizes [cd->rank][0] += cd->ints;
    pattern->send_sizes [cd->rank][1] += cd->doubles;
    MPI_Pack_size (cd->ints, MPI_INT, comm, &j);
    MPI_Pack_size (cd->doubles, MPI_DOUBLE, comm, &k);
    pattern->send_sizes [cd->rank][2] += (j + k);
  }

  /* allocate send buffers and prepare rank map */
  for (pattern->send_size = i = j = 0; i < ncpu; i ++)
  {
    if (pattern->send_sizes [i][2])
    {
      ERRMEM (pattern->send_data [i] = malloc (pattern->send_sizes [i][2]));
      pattern->rankmap [i] = j;
      pattern->send_size += pattern->send_sizes [i][2];
      j ++;
    }
  }

  /* compute send ranks and move data */
  for (pattern->send_count = i = 0; i < ncpu; i ++)
  {
    if (pattern->send_sizes [i][2])
    {
      pattern->send_rank [pattern->send_count] = i;
      pattern->send_data [pattern->send_count] = pattern->send_data [i];
      pattern->send_sizes [pattern->send_count][0] = pattern->send_sizes [i][0];
      pattern->send_sizes [pattern->send_count][1] = pattern->send_sizes [i][1];
      pattern->send_sizes [pattern->send_count][2] = pattern->send_sizes [i][2];
      pattern->send_count ++;
    }
  }

  ERRMEM (send_count_all = malloc (ncpu * sizeof (int)));
  ERRMEM (pattern->recv_rank = malloc (ncpu * sizeof (int)));

  /* gather all send ranks */
  MPI_Allgather (&pattern->send_count, 1, MPI_INT, send_count_all, 1, MPI_INT, comm);
  ERRMEM (send_rank_disp = malloc (ncpu * sizeof (int)));
  for (send_rank_disp [0] = l = i = 0; i < ncpu; i ++)
  { l += send_count_all [i]; if (i < ncpu-1) send_rank_disp [i+1] = l; }
  ERRMEM (send_rank_all = malloc (l * sizeof (int)));
  MPI_Allgatherv (pattern->send_rank, pattern->send_count, MPI_INT, send_rank_all, send_count_all, send_rank_disp, MPI_INT, comm);

  /* compute receive ranks */
  for (pattern->recv_count = k = i = 0; i < l; i += send_count_all [k], k ++)
  {
    for (j = 0; j < send_count_all [k]; j ++)
    {
      if (send_rank_all [i+j] == rank) /* 'k'th rank is sending here */
      {
	pattern->recv_rank [pattern->recv_count] = k;
	pattern->recv_count ++;
	break;
      }
    }
  }

  ERRMEM (pattern->recv_sizes = malloc (pattern->recv_count * sizeof (int [3])));
  ERRMEM (pattern->recv_req = malloc (pattern->recv_count * sizeof (MPI_Request)));
  ERRMEM (pattern->recv_sta = malloc (pattern->recv_count * sizeof (MPI_Status)));
  ERRMEM (pattern->send_req = malloc (pattern->send_count * sizeof (MPI_Request)));
  ERRMEM (pattern->send_sta = malloc (pattern->send_count * sizeof (MPI_Status)));

  /* communicate receive sizes */
  for (i = 0; i < pattern->recv_count; i ++)
  {
    MPI_Irecv (pattern->recv_sizes [i], 3, MPI_INT, pattern->recv_rank [i], tag, comm, &pattern->recv_req [i]);
  }
  MPI_Barrier (comm);
  for (i = 0; i < pattern->send_count; i ++)
  {
    MPI_Rsend (pattern->send_sizes [i], 3, MPI_INT, pattern->send_rank [i], tag, comm);
  }
  MPI_Waitall (pattern->recv_count, pattern->recv_req, pattern->recv_sta);

  /* contiguous receive size */
  j = pattern->recv_count * sizeof (COMDATA);
  for (i = 0; i < pattern->recv_count; i ++)
  {
    j += pattern->recv_sizes [i][0] * sizeof (int) + 
         pattern->recv_sizes [i][1] * sizeof (double);
  }

  /* prepare receive buffers */
  ERRMEM (pattern->recv_data = malloc (pattern->recv_count * sizeof (char*)));
  ERRMEM (pattern->recv = malloc (j));
  p = pattern->recv + pattern->recv_count;
  pattern->nrecv = pattern->recv_count;
  for (i = 0, cd = pattern->recv; i < pattern->recv_count; i ++, cd ++)
  {
    cd->rank = pattern->recv_rank [i];
    cd->ints = pattern->recv_sizes [i][0];
    cd->doubles = pattern->recv_sizes [i][1];
    cd->i = p; p = (cd->i + cd->ints);
    cd->d = p; p = (cd->d + cd->doubles);
    ERRMEM (pattern->recv_data [i] = malloc (pattern->recv_sizes [i][2]));
  }

  /* truncate */
  if (pattern->send_count)
  {
    ERRMEM (pattern->send_sizes = realloc (pattern->send_sizes, pattern->send_count * sizeof (int [3])));
    ERRMEM (pattern->send_position = realloc (pattern->send_position, pattern->send_count * sizeof (int)));
    ERRMEM (pattern->send_rank = realloc (pattern->send_rank, pattern->send_count * sizeof (int)));
    ERRMEM (pattern->send_data = realloc (pattern->send_data, pattern->send_count * sizeof (char*)));
  }
  if (pattern->recv_count) ERRMEM (pattern->recv_rank = realloc (pattern->recv_rank, pattern->recv_count * sizeof (int)));
 
  /* cleanup */
  free (send_rank_disp);
  free (send_count_all);
  free (send_rank_all);

  /* output */
  *nrecv = pattern->nrecv;
  *recv = pattern->recv;

  return pattern;
}
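A hypothetical usage sketch (not from the original project) may make the calling convention clearer; it relies only on the COM_Pattern signature and the COMDATA fields used above (rank, ints, doubles, i, d), and it assumes that companion routines, not shown in this excerpt, later drive the repeated exchange and release the pattern.

void pattern_usage_sketch (MPI_Comm comm)
{
  int rank, ncpu, nrecv, ival;
  double dval;
  COMDATA send, *recv;
  void *pattern;

  MPI_Comm_rank (comm, &rank);
  MPI_Comm_size (comm, &ncpu);

  ival = rank;                       /* one integer payload */
  dval = 0.5 * rank;                 /* one double payload */

  send.rank    = (rank + 1) % ncpu;  /* destination rank */
  send.ints    = 1;
  send.doubles = 1;
  send.i       = &ival;
  send.d       = &dval;

  /* build the reusable pattern; each rank learns who sends to it and how
   * much, and the receive side is laid out contiguously */
  pattern = COM_Pattern (comm, 7, &send, 1, &recv, &nrecv);
  (void) pattern; /* kept for the later exchange/release calls (not shown) */

  /* ... the repeated data exchange and the release of 'pattern' would be
   * handled by companion routines that are not part of this excerpt ... */

  free (recv);                       /* recv is contiguous => one free suffices */
}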
Example No. 12
void splitSources(std::vector<double>& expandSources, std::vector<double>& directSources, 
    std::vector<ot::TreeNode>& fgtList, std::vector<double>& sources, const unsigned int minPtsInFgt, 
    const unsigned int FgtLev, MPI_Comm comm) {
  PetscLogEventBegin(splitSourcesEvent, 0, 0, 0, 0);

  int numPts = ((sources.size())/4);

#ifdef DEBUG
  assert(!(sources.empty()));
  assert(fgtList.empty());
#endif
  {
    unsigned int px = static_cast<unsigned int>(sources[0]*(__DTPMD__));
    unsigned int py = static_cast<unsigned int>(sources[1]*(__DTPMD__));
    unsigned int pz = static_cast<unsigned int>(sources[2]*(__DTPMD__));
    ot::TreeNode ptOct(px, py, pz, __MAX_DEPTH__, __DIM__, __MAX_DEPTH__);
    ot::TreeNode newFgt = ptOct.getAncestor(FgtLev);
    fgtList.push_back(newFgt);
  }

  for(int i = 1; i < numPts; ++i) {
    unsigned int px = static_cast<unsigned int>(sources[4*i]*(__DTPMD__));
    unsigned int py = static_cast<unsigned int>(sources[(4*i)+1]*(__DTPMD__));
    unsigned int pz = static_cast<unsigned int>(sources[(4*i)+2]*(__DTPMD__));
    ot::TreeNode ptOct(px, py, pz, __MAX_DEPTH__, __DIM__, __MAX_DEPTH__);
    ot::TreeNode newFgt = ptOct.getAncestor(FgtLev);
    if(fgtList[fgtList.size() - 1] == newFgt) {
      fgtList[fgtList.size() - 1].addWeight(1);
    } else {
      fgtList.push_back(newFgt);
    }
  }//end for i

#ifdef DEBUG
  assert(!(fgtList.empty()));
#endif

  int rank;
  int npes;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &npes);

  int localFlag = 0;
  if( (rank > 0) && (rank < (npes - 1)) && ((fgtList.size()) == 1) ) {
    localFlag = 1;
  }

  int globalFlag;
  MPI_Allreduce(&localFlag, &globalFlag, 1, MPI_INT, MPI_SUM, comm);

  int prevRank = rank - 1;
  int nextRank = rank + 1;

  if(globalFlag > 0) {
    int gatherSendBuf = 0;
    if( (rank > 0) && (rank < (npes - 1)) && (fgtList.size() == 1) ) {
      gatherSendBuf = sources.size();
    }

    int* gatherList = new int[npes];

    MPI_Allgather((&gatherSendBuf), 1, MPI_INT, gatherList, 1, MPI_INT, comm);

    if(rank > 0) {
      while(gatherList[prevRank] > 0) {
        --prevRank;
      }//end while
    }

    if(rank < (npes - 1)) {
      while(gatherList[nextRank] > 0) {
        ++nextRank;
      }//end while
    }

    int* sendFgtCnts = new int[npes];
    int* recvFgtCnts = new int[npes];

    int* sendSourceCnts = new int[npes];
    int* recvSourceCnts = new int[npes];

    for(int i = 0; i < npes; ++i) {
      sendFgtCnts[i] = 0;
      recvFgtCnts[i] = 0;
      sendSourceCnts[i] = 0;
      recvSourceCnts[i] = 0;
    }//end i

    if(gatherSendBuf > 0) {
      sendFgtCnts[prevRank] = 1;
      sendSourceCnts[prevRank] = gatherSendBuf;
    }
    for(int i = rank + 1; i < nextRank; ++i) {
      recvFgtCnts[i] = 1;
      recvSourceCnts[i] = gatherList[i];
    }//end i

    delete [] gatherList;

    int* sendFgtDisps = new int[npes];
    int* recvFgtDisps = new int[npes];
    sendFgtDisps[0] = 0;
    recvFgtDisps[0] = 0;
    for(int i = 1; i < npes; ++i) {
      sendFgtDisps[i] = sendFgtDisps[i - 1] + sendFgtCnts[i - 1];
      recvFgtDisps[i] = recvFgtDisps[i - 1] + recvFgtCnts[i - 1];
    }//end i

    std::vector<ot::TreeNode> tmpFgtList(recvFgtDisps[npes - 1] + recvFgtCnts[npes - 1]);

    ot::TreeNode* recvFgtBuf = NULL;
    if(!(tmpFgtList.empty())) {
      recvFgtBuf = (&(tmpFgtList[0]));
    }

    MPI_Alltoallv( (&(fgtList[0])), sendFgtCnts, sendFgtDisps, par::Mpi_datatype<ot::TreeNode>::value(),
        recvFgtBuf, recvFgtCnts, recvFgtDisps, par::Mpi_datatype<ot::TreeNode>::value(), comm);

    if(gatherSendBuf > 0) {
      fgtList.clear();
    } else {
      for(int i = 0; i < tmpFgtList.size(); ++i) {
        if(tmpFgtList[i] == fgtList[fgtList.size() - 1]) {
          fgtList[fgtList.size() - 1].addWeight(tmpFgtList[i].getWeight());
        } else {
          fgtList.push_back(tmpFgtList[i]);
        }
      }//end i
    }

    delete [] sendFgtCnts;
    delete [] recvFgtCnts;
    delete [] sendFgtDisps;
    delete [] recvFgtDisps;

    int* sendSourceDisps = new int[npes];
    int* recvSourceDisps = new int[npes];
    sendSourceDisps[0] = 0;
    recvSourceDisps[0] = 0;
    for(int i = 1; i < npes; ++i) {
      sendSourceDisps[i] = sendSourceDisps[i - 1] + sendSourceCnts[i - 1];
      recvSourceDisps[i] = recvSourceDisps[i - 1] + recvSourceCnts[i - 1];
    }//end i

    std::vector<double> tmpSources(recvSourceDisps[npes - 1] + recvSourceCnts[npes - 1]);

    double* recvSourceBuf = NULL;
    if(!(tmpSources.empty())) {
      recvSourceBuf = (&(tmpSources[0]));
    }

    MPI_Alltoallv( (&(sources[0])), sendSourceCnts, sendSourceDisps, MPI_DOUBLE,
        recvSourceBuf, recvSourceCnts, recvSourceDisps, MPI_DOUBLE, comm);

    if(gatherSendBuf > 0) {
      sources.clear();
    } else {
      if(!(tmpSources.empty())) {
        sources.insert(sources.end(), tmpSources.begin(), tmpSources.end());
      }
    }

    delete [] sendSourceCnts;
    delete [] recvSourceCnts;
    delete [] sendSourceDisps;
    delete [] recvSourceDisps;
  }

  if(!(fgtList.empty())) {
    ot::TreeNode prevFgt;
    ot::TreeNode nextFgt;
    ot::TreeNode firstFgt = fgtList[0];
    ot::TreeNode lastFgt = fgtList[fgtList.size() - 1];
    MPI_Request recvPrevReq;
    MPI_Request recvNextReq;
    MPI_Request sendFirstReq;
    MPI_Request sendLastReq;
    if(rank > 0) {
      MPI_Irecv(&prevFgt, 1, par::Mpi_datatype<ot::TreeNode>::value(),
          prevRank, 1, comm, &recvPrevReq);
      MPI_Isend(&firstFgt, 1, par::Mpi_datatype<ot::TreeNode>::value(),
          prevRank, 2, comm, &sendFirstReq);
    }
    if(rank < (npes - 1)) {
      MPI_Irecv(&nextFgt, 1, par::Mpi_datatype<ot::TreeNode>::value(),
          nextRank, 2, comm, &recvNextReq);
      MPI_Isend(&lastFgt, 1, par::Mpi_datatype<ot::TreeNode>::value(),
          nextRank, 1, comm, &sendLastReq);
    }

    if(rank > 0) {
      MPI_Status status;
      MPI_Wait(&recvPrevReq, &status);
      MPI_Wait(&sendFirstReq, &status);
    }
    if(rank < (npes - 1)) {
      MPI_Status status;
      MPI_Wait(&recvNextReq, &status);
      MPI_Wait(&sendLastReq, &status);
    }

    bool removeFirst = false;
    bool addToLast = false;
    if(rank > 0) {
      if(prevFgt == firstFgt) {
        removeFirst = true;
      }
    }
    if(rank < (npes - 1)) {
      if(nextFgt == lastFgt) {
        addToLast = true;
      }
    }

    MPI_Request recvPtsReq;
    if(addToLast) {
      numPts = ((sources.size())/4);
      sources.resize(4*(numPts + (nextFgt.getWeight())));
      fgtList[fgtList.size() - 1].addWeight(nextFgt.getWeight());
      MPI_Irecv((&(sources[4*numPts])), (4*(nextFgt.getWeight())), MPI_DOUBLE, nextRank,
          3, comm, &recvPtsReq);
    }
    if(removeFirst) {
      MPI_Send((&(sources[0])), (4*(firstFgt.getWeight())), MPI_DOUBLE, prevRank, 3, comm);
      fgtList.erase(fgtList.begin());
    }
    if(addToLast) {
      MPI_Status status;
      MPI_Wait(&recvPtsReq, &status);
    }
    if(removeFirst) {
      sources.erase(sources.begin(), sources.begin() + (4*(firstFgt.getWeight())));
    }
  } 

#ifdef DEBUG
  assert(expandSources.empty());
  assert(directSources.empty());
#endif
  std::vector<ot::TreeNode> dummyList;
  int sourceIdx = 0;
  for(size_t i = 0; i < fgtList.size(); ++i) {
    if((fgtList[i].getWeight()) < minPtsInFgt) {
      directSources.insert(directSources.end(), (sources.begin() + sourceIdx),
          (sources.begin() + sourceIdx + (4*(fgtList[i].getWeight()))));
    } else {
      dummyList.push_back(fgtList[i]);
      expandSources.insert(expandSources.end(), (sources.begin() + sourceIdx), 
          (sources.begin() + sourceIdx + (4*(fgtList[i].getWeight()))));
    }
    sourceIdx += (4*(fgtList[i].getWeight()));
  }//end i
  swap(dummyList, fgtList);
#ifdef DEBUG
  assert((sources.size()) == ((directSources.size()) + (expandSources.size())));
#endif

  PetscLogEventEnd(splitSourcesEvent, 0, 0, 0, 0);
}
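The send-count / displacement bookkeeping above (exclusive prefix sums feeding MPI_Alltoallv) is a recurring pattern; the following minimal, self-contained sketch shows the same idea in isolation. The payload (rank r sends r+1 integers to every task) and the use of MPI_Alltoall to learn the receive counts are invented for illustration and are not part of splitSources.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main (int argc, char **argv)
{
  int rank, npes;
  MPI_Init (&argc, &argv);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);
  MPI_Comm_size (MPI_COMM_WORLD, &npes);

  int *sendCnts = (int *) malloc (npes * sizeof(int));
  int *recvCnts = (int *) malloc (npes * sizeof(int));
  for (int i = 0; i < npes; ++i) sendCnts[i] = rank + 1;   /* what I send to task i */

  /* every task learns how much it will receive from every other task */
  MPI_Alltoall (sendCnts, 1, MPI_INT, recvCnts, 1, MPI_INT, MPI_COMM_WORLD);

  /* exclusive prefix sums give the displacements, exactly as in splitSources */
  int *sendDisps = (int *) malloc (npes * sizeof(int));
  int *recvDisps = (int *) malloc (npes * sizeof(int));
  sendDisps[0] = recvDisps[0] = 0;
  for (int i = 1; i < npes; ++i) {
    sendDisps[i] = sendDisps[i - 1] + sendCnts[i - 1];
    recvDisps[i] = recvDisps[i - 1] + recvCnts[i - 1];
  }

  int sendTotal = sendDisps[npes - 1] + sendCnts[npes - 1];
  int recvTotal = recvDisps[npes - 1] + recvCnts[npes - 1];
  int *sendBuf = (int *) malloc (sendTotal * sizeof(int));
  int *recvBuf = (int *) malloc (recvTotal * sizeof(int));
  for (int i = 0; i < sendTotal; ++i) sendBuf[i] = rank;

  MPI_Alltoallv (sendBuf, sendCnts, sendDisps, MPI_INT,
                 recvBuf, recvCnts, recvDisps, MPI_INT, MPI_COMM_WORLD);

  if (rank == 0) printf ("task 0 received %d ints in total\n", recvTotal);

  free (sendCnts); free (recvCnts); free (sendDisps); free (recvDisps);
  free (sendBuf);  free (recvBuf);
  MPI_Finalize ();
  return 0;
}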
Example No. 13
template <class U> template <class T> PCrsMatrix<U>::PCrsMatrix(const CrsMatrix<T>& s, const MPI_Comm ncomm)
{
    mpi_init(ncomm);
    
    int iamsender=(s.rows()>0?1:0);    //! we rely on the fact that only one node has this cond. fulfilled.

    // now we let everyone know who is the sender
    std::valarray<int> slist(mysize);
    MPI_Allgather(&iamsender, 1, MPI_INT, &slist[0], 1, MPI_INT, mycomm);
    int sender=-1;
    
    for (int i=0; i< mysize; ++i) 
        if (slist[i]==1) { if (sender==-1) sender=i; else ERROR("More than one process qualified as sender!"); }
    if (sender==-1) ERROR("No process qualified as sender!");
    
    // now we get the matrix size and resize it.
    typename PCrsMatrix<U>::index_type dim[2];
    if (iamsender) { dim[0]=s.rows(); dim[1]=s.cols(); }
    MPI_Bcast(dim,2,mpi_index,sender,mycomm);
    resize(dim[0],dim[1]);
    
    // now we copy the options, as if it were an array of char...
    MPI_Bcast(&(const_cast<CrsMatrix<T> &>(s).pops),sizeof(pops),MPI_CHAR,sender,mycomm);
    setops(s.pops);
    
    // now we can send out the row indices to the nodes.
    unsigned long nmyrows=nroots[myrank+1]-nroots[myrank];
    MPI_Request rreq;
    MPI_Irecv(&pmat.rpoints[0],nmyrows+1,mpi_index,sender,101,mycomm,&rreq);
    
    if (iamsender) {
        for (int i=0; i<mysize; ++i)
            MPI_Send(&(const_cast<CrsMatrix<T> &>(s).rpoints[nroots[i]]),nroots[i+1]-nroots[i]+1,mpi_index,i,101,mycomm);
    };
    
    //wait for receive
    MPI_Status rstatus;
    MPI_Wait(&rreq, &rstatus);
    //then shift the indices as necessary, since we are getting chunks of data
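    //e.g. a received chunk of row pointers {7,9,12} becomes {0,2,5}:
    //local row pointers always start at zero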
    for (typename PCrsMatrix<U>::index_type i=1;i<=nmyrows; ++i)
        pmat.rpoints[i]-=pmat.rpoints[0];
    pmat.rpoints[0]=0;
    pmat.presize(pmat.rpoints[nmyrows]);
    
    //very well. now we can share the column indices and the data!
    MPI_Request rreq_i, rreq_d;
    MPI_Irecv(&pmat.indices[0],pmat.rpoints[nmyrows],mpi_index,sender,202,mycomm,&rreq_i);
    
    if (iamsender) {
        for (int i=0; i<mysize; ++i)
            MPI_Send(&(const_cast<CrsMatrix<T> &>(s).indices[s.rpoints[nroots[i]]]),
                     s.rpoints[nroots[i+1]]-s.rpoints[nroots[i]],mpi_index,i,202,mycomm);
    };
    
    MPI_Irecv(&pmat.values[0],pmat.rpoints[nmyrows],mpi_data,sender,303,mycomm,&rreq_d);
    
    if (iamsender) {
        for (int i=0; i<mysize; ++i)
            MPI_Send(&(const_cast<CrsMatrix<T> &>(s).values[s.rpoints[nroots[i]]]),
                     s.rpoints[nroots[i+1]]-s.rpoints[nroots[i]],mpi_data,i,303,mycomm);
    };
    MPI_Wait(&rreq_i, &rstatus);
    MPI_Wait(&rreq_d, &rstatus);
}
Example No. 14
/*------------------------------------------------*/
int main (int argc, char **argv)
{
  int cols, rows, iter, particles, x, y;
  int *pic;
  PartStr *p, *changes, *totalChanges;
  int rank, num, i, numChanges, numTotalChanges;
  int *changesPerNode, *buffDispl;
  MPI_Init (&argc, &argv);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);
  MPI_Comm_size (MPI_COMM_WORLD, &num);
  
  if (argc < 5)			// use default values unless the user specifies all four arguments
    {
      cols = PIC_SIZE + 2;
      rows = PIC_SIZE + 2;
      iter = MAX_ITER;
      particles = PARTICLES;
    }
  else
    {
      cols = atoi (argv[1]) + 2;
      rows = atoi (argv[2]) + 2;
      particles = atoi (argv[3]);
      iter = atoi (argv[4]);
    }

  // initialize the random number generator
  srand(rank);
  // srand(time(0)); // this should be used instead if the program runs on multiple hosts

    
  int particlesPerNode = particles / num;
  if (rank == num - 1)
    particlesPerNode = particles - particlesPerNode * (num - 1);	// in case particles cannot be split evenly
// printf("%i has %i\n", rank, particlesPerNode);
  pic = (int *) malloc (sizeof (int) * cols * rows);
  p = (PartStr *) malloc (sizeof (PartStr) * particlesPerNode);
  changes = (PartStr *) malloc (sizeof (PartStr) * particlesPerNode);
  totalChanges = (PartStr *) malloc (sizeof (PartStr) * particlesPerNode);
  changesPerNode = (int *) malloc (sizeof (int) * num);
  buffDispl = (int *) malloc (sizeof (int) * num);
  assert (pic != 0 && p != 0 && changes != 0 && totalChanges != 0
	  && changesPerNode != 0 && buffDispl != 0);



  // MPI user type declaration
  int lengths[2] = { 1, 1 };
  MPI_Datatype types[2] = { MPI_INT, MPI_INT };
  MPI_Aint add1, add2;
  MPI_Aint displ[2];
  MPI_Datatype Point;
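  /* Note: MPI_Address and MPI_Type_struct were deprecated in MPI-2.0 and removed
     in MPI-3.0; MPI_Get_address and MPI_Type_create_struct are the drop-in
     replacements with identical argument lists. */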

  MPI_Address (p, &add1);
  MPI_Address (&(p[0].y), &add2);
  displ[0] = 0;
  displ[1] = add2 - add1;

  MPI_Type_struct (2, lengths, displ, types, &Point);
  MPI_Type_commit (&Point);


  dla_init_plist (pic, rows, cols, p, particlesPerNode, 1);
  while (--iter)
    {
      dla_evolve_plist (pic, rows, cols, p, &particlesPerNode, changes, &numChanges);      
//       printf("%i changed %i on iter %i : ",rank, numChanges, iter);
//       for(i=0;i<numChanges;i++) printf("(%i, %i) ", changes[i].x, changes[i].y);
//       printf("\n");
      
      //exchange information with other nodes
      MPI_Allgather (&numChanges, 1, MPI_INT, changesPerNode, 1, MPI_INT, MPI_COMM_WORLD);
      //calculate offsets
      numTotalChanges = 0;
      for (i = 0; i < num; i++)
	{
	  buffDispl[i] = numTotalChanges;
	  numTotalChanges += changesPerNode[i];
	}
//        if(rank==0)
//        {
//  	for(i=0;i<num;i++)
//  	  printf("%i tries to send %i\n",i,changesPerNode[i]);
//  	printf("-----------\n");
//        }
      if(numTotalChanges >0)
      {
      MPI_Allgatherv (changes, numChanges, Point,
		      totalChanges, changesPerNode, buffDispl, Point,
		      MPI_COMM_WORLD);
      apply_changes (pic, rows, cols, totalChanges, numTotalChanges);

	
//       if(rank==0)
//       {
//         printf("Total changes %i : ", numTotalChanges);
//         for(i=0;i<numTotalChanges;i++) printf("(%i, %i) ", totalChanges[i].x, totalChanges[i].y);
// 	
//         printf("\n");
// 	printf("-----------\n");
//       }
      }
    }

  /* Print to stdout a PBM picture of the simulation space */
  if (rank == 0)
    {
      printf ("P1\n%i %i\n", cols - 2, rows - 2);

      for (y = 1; y < rows - 1; y++)
	{
	  for (x = 1; x < cols - 1; x++)
	    {
	      if (pic[y * cols + x] < 0)
		printf ("1 ");
	      else
		printf ("0 ");
	    }
	  printf ("\n");
	}
    }
    
  MPI_Reduce(&particlesPerNode, &particles, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);  
  if(rank==0) fprintf(stderr, "Remaining particles %i\n", particles);
    
  free (pic);
  free (p);
  free (changes);
  free (totalChanges);
  free (changesPerNode);
  free (buffDispl);
  MPI_Type_free (&Point);
  MPI_Finalize ();
  return 0;
}
Example No. 15
	int MeshData::getNodesWithKnownValues(pMesh theMesh){
		pEntity node, edge, face;
		int i,ID;
		double coord1[3],coord2[3],x1,y1,x2,y2;

		// search for flagged nodes
		VIter vit = M_vertexIter( theMesh );
		while ( (node = VIter_next(vit)) ){
			int flag = (!node->getClassification())?0:GEN_tag(node->getClassification());
			if ( !pSimPar->isNodeFree(flag) ){
				ID = get_AppToPETSc_Ordering(EN_id(node));
				dirichlet[ID] = pSimPar->getBC_Value(flag);
			}
		}
		VIter_delete(vit);

		// search for flagged edges
		EIter eit = M_edgeIter( theMesh );
		while ( (edge = EIter_next(eit)) ){
			if (!theMesh->getRefinementDepth(edge)){
				int flag = EN_getFlag(edge);
				/*
				 * This is a way to use the simulator to evaluate an elliptic equation without screwing up the input data procedure.
				 */
				V_coord(edge->get(0,0),coord1); x1 = coord1[0]; y1 = coord1[1];
				V_coord(edge->get(0,1),coord2); x2 = coord2[0]; y2 = coord2[1];

				#ifdef CRUMPTON_EXAMPLE
				if (flag >= 2000 && flag <3000 &&  flag != 2005 ){	// take only external boundary edges
					// boundary conditions:
					// u(x,y) = [2*sin(y)+cos(y)]alpha*x + sin(y),	x <= 0
					//			exp(x)sin(y),						x > 0

					ID = get_AppToPETSc_Ordering(EN_id(edge->get(0,0)));
					dirichlet[ID] = (x1 <= .0)?((2.*sin(y1)+cos(y1))*ALPHA*x1 + sin(y1)):exp(x1)*sin(y1);
					ID = get_AppToPETSc_Ordering(EN_id(edge->get(0,1)));
					dirichlet[ID] = (x2 <= .0)?((2.*sin(y2)+cos(y2))*ALPHA*x2 + sin(y2)):exp(x2)*sin(y2);
				}
				#else
				if ( !pSimPar->isNodeFree(flag) )
					for (i=0; i<2; i++){
						ID = get_AppToPETSc_Ordering(EN_id(edge->get(0,i)));
						dirichlet[ID] = pSimPar->getBC_Value(flag);
					}
				#endif
			}
		}
		EIter_delete(eit);

		// search for flagged faces (on boundary only)
		// -------------------------------------------------------
		if (theMesh->getDim()==3){

			// External boundary condition defined
			// -------------------------------------------------------
			if (pSimPar->SimulationHas_BC_ExternalDefinition()){
				double coords[3], x, y, z;
				FIter fit = M_faceIter( theMesh );
				while ( (face = FIter_next(fit)) ){
					if (!theMesh->getRefinementDepth(face)){
						int flag = EN_getFlag(face);
						if ( !pSimPar->isNodeFree(flag) ){
							for (i=0; i<3; i++){
								ID = EN_id(face->get(0,i));
								pVertex v = (pVertex)theMesh->getVertex(ID);
								ID = get_AppToPETSc_Ordering(ID);

								V_coord(v,coords);
								x = coords[0];
								y = coords[1];
								z = coords[2];

								dirichlet[ID] = pSimPar->exact_solution(x,y,z);
							}
						}
					}
				}
				FIter_delete(fit);
			}
			else{
				// conventional (Dirichlet) boundary condition: specified in numeric.dat
				// --------------------------------------------------------------------
				FIter fit = M_faceIter( theMesh );
				while ( (face = FIter_next(fit)) ){
					if (!theMesh->getRefinementDepth(face)){
						int flag = EN_getFlag(face);
						if ( !pSimPar->isNodeFree(flag) ){
							for (i=0; i<3; i++){
								ID = get_AppToPETSc_Ordering(EN_id(face->get(0,i)));
								dirichlet[ID] = pSimPar->getBC_Value(flag);
							}
						}
					}
				}
				FIter_delete(fit);
			}
		}

		if (!P_getSumInt(dirichlet.size()) ){
			throw Exception(__LINE__,__FILE__,"Prescribed (dirichlet) nodes were not found. Simulation cannot proceed.\n");
		}

		// go ahead only if parallel
		if (P_size()==1) return 0;
		// now, all partitions must know all prescribed nodes and their flags
		// first of all, let every partition know how many prescribed nodes exist on each one
		// a partition may have no prescribed nodes; in that case nLPN is 0 and the gathers below simply contribute zero elements
		int nLPN = dirichlet.size();
		int *recvLP = new int[P_size()];
		MPI_Allgather ( &nLPN, 1, MPI_INT, recvLP, 1, MPI_INT, MPI_COMM_WORLD );
		// number of global prescribed nodes
		// Note that nGPN is not necessarily the real number of global prescribed nodes
		// Nodes on partition boundary can be counted twice or more
		int nGPN=0;
		for (i=0; i<P_size(); i++) nGPN += recvLP[i];
		// sPIDs = send prescribed IDs    sPFlags = send prescribed flags

		i=0;
		int *sPIDs = new int[nLPN];
		double *sPFlags = new double[nLPN];
		for(MIter mit = dirichlet.begin(); mit != dirichlet.end(); mit++){
			sPIDs[i] = mit->first;
			sPFlags[i] = mit->second;
			i++;
		}

		// rcounts says how many values each rank will send
		int *rcounts = recvLP;
		// displs says where to start to read in recv buffer
		int *displs = new int[P_size()];
		displs[0] = 0;
		for (i=1; i<P_size(); i++) displs[i] = displs[i-1]+recvLP[i-1];
		int *rPIDs = new int[nGPN];
		// get all prescribed nodes
		MPI_Allgatherv(sPIDs,nLPN,MPI_INT,rPIDs,rcounts,displs,MPI_INT,MPI_COMM_WORLD);
		double *rPFlags = new double[nGPN];
		// get flags from all prescribed nodes
		MPI_Allgatherv(sPFlags,nLPN,MPI_DOUBLE,rPFlags,rcounts,displs,MPI_DOUBLE,MPI_COMM_WORLD);
		for (i=0; i<nGPN; i++)  dirichlet.insert( pair<int,double>(rPIDs[i],rPFlags[i]) );
		delete[] sPIDs; sPIDs=0;
		delete[] sPFlags; sPFlags=0;
		delete[] rPIDs; rPIDs=0;
		delete[] rcounts; rcounts=0;
		delete[] displs; displs=0;
		return 0;
	}
Example No. 16
int
main (int argc, char **argv)
{
  int mpi_rank, mpi_size;

  int n, ln;
  double *A, *b, *xA, *xB, *xC, *r, *X, *x;
  double lmax, max;
  struct timeval before, after;

  MPI_Init (&argc, &argv);
  MPI_Comm_size (MPI_COMM_WORLD, &mpi_size);
  MPI_Comm_rank (MPI_COMM_WORLD, &mpi_rank);
  
  /* Get the argument that indicates the problem size */
  n = JACOBI_DEF_SIZE;
  if (argc == 2)
    n = atoi (argv[1]);
  if (mpi_rank == ROOT_NODE)
    fprintf (stdout, "n = %d\n", n);
  ln = n / mpi_size;
  
  /* Initialize the random seed */
  /*srandom((unsigned int) getpid ());*/

  /* Allocate memory */
  A = (double *) calloc (ln * n, sizeof(double));
  if (A == NULL)
    {
      perror ("calloc");
      MPI_Finalize ();
      return EXIT_FAILURE;
    }

  b = (double *) calloc (ln, sizeof(double));
  if (b == NULL)
    {
      perror ("calloc");
      MPI_Finalize ();
      return EXIT_FAILURE;
    }
  
  xA = (double *) calloc (ln, sizeof(double));
  if (xA == NULL)
    {
      perror ("calloc");
      MPI_Finalize ();
      return EXIT_FAILURE;
    }

  xB = (double *) calloc (ln, sizeof(double));
  if (xB == NULL)
    {
      perror ("calloc");
      MPI_Finalize ();
      return EXIT_FAILURE;
    }

  xC = (double *) calloc (ln, sizeof(double));
  if (xC == NULL)
    {
      perror ("calloc");
      MPI_Finalize ();
      return EXIT_FAILURE;
    }

  X = (double *) calloc (n, sizeof(double));
  if (X == NULL)
    {
      perror ("calloc");
      MPI_Finalize ();
      return EXIT_FAILURE;
    }

  r = (double *) calloc (ln, sizeof(double));
  if (r == NULL)
    {
      perror ("calloc");
      MPI_Finalize ();
      return EXIT_FAILURE;
    }

  generate_matrix (A, ln, n, mpi_rank);
  generate_vector (b, ln);
  
  gettimeofday(&before, NULL);
  x = jacobi (xA, xB, xC, A, b, ln, n, mpi_rank, mpi_size);
  gettimeofday(&after, NULL);

  MPI_Allgather (x, ln, MPI_DOUBLE,
		 X, ln, MPI_DOUBLE, MPI_COMM_WORLD);

  /* Compute the residual */
  compute_residual (r, A, X, b, ln, n);

  /* Compute the maximum absolute value of the residual */
  lmax = find_max_abs (r, ln);
  MPI_Reduce (&lmax, &max, 1, MPI_DOUBLE, MPI_MAX, ROOT_NODE, MPI_COMM_WORLD);
  
  if (mpi_rank == ROOT_NODE)
    display_info (A, x, b, r, n, max, &before, &after);

  /* Free the memory */
  free (A);
  free (b);
  free (xA);
  free (xB);
  free (xC);
  free (X);
  free (r);

  /* Return success */
  MPI_Finalize ();
  return 0;
}
Example No. 17
	void MeshData::reorderVerticesIds(pMesh theMesh, int (*pFunc_numRemoteCopies)(pEntity)){
		// rank p must take all remote nodes
		set<int> remoteNodesSet;
		set<int>::iterator Iter;
		int numUniqueNodes = 0;

		int i = 0;
		pEntity node;
		VIter vit = M_vertexIter(theMesh);
		while ( (node=VIter_next(vit)) ){
//			if ( pFunc_numRemoteCopies(node) ){
//				remoteNodesSet.insert(EN_id(node));
//			}
//			else{
				numUniqueNodes++;
//			}
		}

		i = 0;
		int numRemoteNodes = remoteNodesSet.size();
		int *remoteNodes = new int[numRemoteNodes];

		// transfer nodes from set to an array
		for (Iter=remoteNodesSet.begin(); Iter != remoteNodesSet.end(); Iter++) remoteNodes[i++] = *Iter;

		// each rank must know how many nodes it will receive from all other ranks
		int *numAllRemoteNodes = new int[P_size()];
		MPI_Allgather(&numRemoteNodes,1,MPI_INT,numAllRemoteNodes,1,MPI_INT,MPI_COMM_WORLD);

		int total = 0;
		for (i=0; i<P_size(); i++) total += numAllRemoteNodes[i];
		int *allRemoteNodes = new int[total];

		int *displs = new int[P_size()];
		displs[0] = 0;
		for (i=1; i<P_size(); i++) displs[i] = displs[i-1] + numAllRemoteNodes[i-1];

		// receive remote nodes from all other ranks
		MPI_Allgatherv (remoteNodes,numRemoteNodes,MPI_INT,allRemoteNodes,numAllRemoteNodes,displs,MPI_INT,MPI_COMM_WORLD);

		delete[] displs;
		remoteNodesSet.clear();

		// how many nodes from allRemoteNodes will be checked by rank p
		total = 0;
		for (i=0;i<P_pid(); i++) total += numAllRemoteNodes[i];
		delete[] numAllRemoteNodes;

		// transfer to a set container only those remote nodes from allRemoteNodes from ranks
		// lower than rank p
		for (i=0;i<total;i++) remoteNodesSet.insert(allRemoteNodes[i]);
		delete[] allRemoteNodes;

		// duplicated numbers in different ranks must be avoided so as not to confuse PETSc!
		// each rank will check which remote nodes from rank p belong also to ranks lower than it
		// store in another set container those nodes that do not belong to ranks lower than rank p
		set<int> remoteNodesLowSet;
		for (i=0;i<numRemoteNodes;i++){
			Iter = remoteNodesSet.find(remoteNodes[i]);
			if ( Iter == remoteNodesSet.end()) remoteNodesLowSet.insert(remoteNodes[i]);
		}
		remoteNodesSet.clear();
		delete[] remoteNodes;

		// N means how many nodes rank p should map.
		int N = numUniqueNodes + remoteNodesLowSet.size();
		// rank 0: mapping = 1,2,3,...,N0
		// rank 1: mapping = N0+1,N0+2,N0+3,...,N0+N1
		// rank 2: mapping = N0+N1+1,N0+N1+2,N0+N1+3,...,N0+N1+N2
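		// e.g. with N0=4, N1=3, N2=5: rank 0 maps its IDs to 1..4, rank 1 to 5..7, rank 2 to 8..12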

		this->numGN = P_getSumInt(N);

		// each rank should know all Ni's
		int *recvNs = new int[P_size()];
		MPI_Allgather(&N,1,MPI_INT,recvNs,1,MPI_INT,MPI_COMM_WORLD);

		// start mapping
		int *apOrdering = new int[N];
		int *petscOrdering = new int[N];

		int from=1, ID, j=0;
		for (i=0; i<P_pid(); i++) from += recvNs[i];

		i = from;
		vit = M_vertexIter(theMesh);
		while ( (node=VIter_next(vit)) ){
			ID = EN_id(node);
//			if ( pFunc_numRemoteCopies(node) ){
//				Iter = remoteNodesLowSet.find( ID );
//				if ( Iter != remoteNodesLowSet.end() ){
//					apOrdering[j] = ID;
//					petscOrdering[j] = i++;
//					j++;
//				}
//			}
//			else{
				apOrdering[j] = ID;
				petscOrdering[j] = i++;
				j++;
//			}
		}
		remoteNodesLowSet.clear();
		delete[] recvNs;

		// Petsc will be used to make the parallel job
		AOCreateMapping(PETSC_COMM_WORLD,N,apOrdering,petscOrdering,&ao);
		delete[] apOrdering;
		delete[] petscOrdering;
	}
Example No. 18
int Zoltan_PHG_Gather_To_All_Procs(
  ZZ *zz, 
  HGraph *phg,           /* Input:   Local part of distributed hypergraph */
  PHGPartParams *hgp,        /* Input:   Hypergraph parameters */
  PHGComm *scomm,        /* Input:   Serial PHGComm for use by shg. */
  HGraph **gathered_hg   /* Output:  combined hypergraph combined to proc */
)
{
/* 
 * Function to gather distributed hypergraph onto each processor for
 * coarsest partitioning.
 * First, hypergraph arrays for the hypergraph on a column of processors
 * are built using MPI_Allgathers down the processor columns.
 * These hypergraph arrays contain complete info about a subset of vertices.
 * Second, the column hypergraphs are gathered along processor rows.
 * Each processor then has a complete description of the hypergraph.
 */
char *yo = "Zoltan_PHG_Gather_To_All_Procs";
int ierr = ZOLTAN_OK;
int i, tmp, sum;
int *each = NULL,
    *disp = NULL;      /* Size and displacement arrays for MPI_Allgatherv */
int *send_buf = NULL;    /* Buffer of values to be sent */
int send_size;           /* Size of buffer send_buf */
int *col_vedge = NULL;   /* vedge array for the proc-column hypergraph */
int *col_vindex = NULL;  /* vindex array for the proc-column hypergraph */
int *col_hvertex = NULL; /* hvertex array for the proc-column hypergraph */
int *col_hindex = NULL;  /* hindex array for the proc-column hypergraph */
int col_nVtx;            /* Number of vertices in processor column */
int col_nEdge;           /* Number of edges in processor column */
int col_nPin;            /* Number of pins in processor column */

int *recv_size = NULL;   /* nPins for each proc in col or row */

HGraph *shg;             /* Pointer to the serial hypergraph to be
                            returned by this function. */

int myProc_x = phg->comm->myProc_x;
int nProc_x = phg->comm->nProc_x;
int nProc_y = phg->comm->nProc_y;
int max_nProc_xy = MAX(nProc_x, nProc_y);

  if (phg->comm->nProc == 1) {
    ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Do not call this routine on one proc.");
    return ZOLTAN_FATAL;
  }

#ifdef KDDKDD_CHECK
  Zoltan_HG_Print(zz, phg, NULL, stdout, "GatherBefore");/* NULL parts for now;
                                                           add non-NULL later */
#endif

  /******************************************************************
   *  0. Allocate the hypergraph to be returned. 
   *  Set values that we already know. 
   ******************************************************************/

  shg = *gathered_hg = (HGraph *) ZOLTAN_MALLOC(sizeof(HGraph));
  if (!shg) MEMORY_ERROR;

  Zoltan_HG_HGraph_Init(shg);
  shg->nVtx = phg->dist_x[nProc_x];    /* TODO64 - can this exceed 2B? */
  shg->nEdge = phg->dist_y[nProc_y];

  shg->dist_x = (ZOLTAN_GNO_TYPE *) ZOLTAN_MALLOC(2 * sizeof(ZOLTAN_GNO_TYPE));
  shg->dist_y = (ZOLTAN_GNO_TYPE *) ZOLTAN_MALLOC(2 * sizeof(ZOLTAN_GNO_TYPE));
  if (!shg->dist_x || !shg->dist_y) MEMORY_ERROR;

  shg->dist_x[0] = shg->dist_y[0] = 0;
  shg->dist_x[1] = shg->nVtx;
  shg->dist_y[1] = shg->nEdge;

  shg->comm = scomm;

  shg->EdgeWeightDim = phg->EdgeWeightDim;
  shg->VtxWeightDim = phg->VtxWeightDim;
  if (shg->VtxWeightDim && shg->nVtx)
    shg->vwgt = (float *) ZOLTAN_MALLOC(shg->nVtx * shg->VtxWeightDim 
                                                  * sizeof(float));
  if (shg->EdgeWeightDim && shg->nEdge)
    shg->ewgt = (float *) ZOLTAN_MALLOC(shg->nEdge * shg->EdgeWeightDim 
                                                  * sizeof(float));
  /* Fixed vertices */
  shg->bisec_split = phg->bisec_split;
  if (hgp->UseFixedVtx)
    shg->fixed_part = (int *) ZOLTAN_MALLOC(shg->nVtx * sizeof(int));
  if (hgp->UsePrefPart)
    shg->pref_part = (int *) ZOLTAN_MALLOC(shg->nVtx * sizeof(int));
  
  /* Allocate arrays for use in gather operations */
  recv_size = (int *) ZOLTAN_MALLOC(3 * max_nProc_xy * sizeof(int));
  each = recv_size + max_nProc_xy;
  disp = each + max_nProc_xy;
 
  /* TODO64 - phg->dist_y[nProc_y] could exceed 2 Billion, NO? */
  send_size = MAX(phg->dist_x[myProc_x+1] - phg->dist_x[myProc_x], 
                  phg->dist_y[nProc_y]);
  send_buf = (int *) ZOLTAN_MALLOC(send_size * sizeof(int));
  

  if ((shg->VtxWeightDim && shg->nVtx && !shg->vwgt) ||
      (shg->EdgeWeightDim && shg->nEdge && !shg->ewgt) || !recv_size ||
      (send_size && !send_buf)) 
    MEMORY_ERROR;
  

  /*************************************************************
   *  1. Gather all non-zeros for vertices in processor column *
   *************************************************************/
  
  if (nProc_y == 1) {
    /* 
     * Don't need a gather; just set pointers appropriately for row-gather
     * in Step 2 below.
     */

    col_nVtx = phg->nVtx;
    col_nEdge = phg->nEdge;
    col_nPin = phg->nPins;
    col_vindex = phg->vindex;
    col_vedge = phg->vedge;
    col_hindex = phg->hindex;
    col_hvertex = phg->hvertex;

    for (i = 0; i < shg->EdgeWeightDim * shg->nEdge; i++)
      shg->ewgt[i] = phg->ewgt[i];
  }

  else {

    /* Gather local size info for each proc in column */

    MPI_Allgather(&(phg->nPins), 1, MPI_INT, recv_size, 1, MPI_INT, 
                  phg->comm->col_comm);
  
    /* Compute number of vtx, edge, and nnz in column */
    col_nVtx = (int)(phg->dist_x[myProc_x+1] - phg->dist_x[myProc_x]);
    col_nEdge = phg->dist_y[nProc_y];   /* SCHEMEA */
    col_nPin = 0;
    for (i = 0; i < nProc_y; i++) {
      col_nPin += recv_size[i];
    }
    
    /* Allocate arrays for column hypergraph */
    col_hindex = (int *) ZOLTAN_CALLOC((col_nEdge+1), sizeof(int));
    col_hvertex = (int *) ZOLTAN_MALLOC(col_nPin * sizeof(int));
  
    col_vindex = (int *) ZOLTAN_CALLOC((col_nVtx+1), sizeof(int));
    col_vedge = (int *) ZOLTAN_MALLOC(col_nPin * sizeof(int));
  
    if (!col_vindex || !col_hindex || 
        (col_nPin && (!col_vedge || !col_hvertex)))
      MEMORY_ERROR;
    
    /* Gather hvertex data for all procs in column */
  
    /* SCHEMEA uses same vertex LNO on each proc in column. */
    /* SCHEMEB would require conversion from vertex LNO to GNO here. */
  
    disp[0] = 0;
    for (i = 1; i < nProc_y; i++)
      disp[i] = disp[i-1] + recv_size[i-1];
  
    MPI_Allgatherv(phg->hvertex, phg->nPins, MPI_INT,
                   col_hvertex, recv_size, disp, MPI_INT, phg->comm->col_comm);
  
    /* SCHEMEA uses same vertex LNO on each proc in column. */
    /* SCHEMEB would require conversion from vertex GNO to LNO here */
  
    /* Gather hindex data for all procs in column */
  
    for (i = 0; i < phg->nEdge; i++)
      send_buf[i] = phg->hindex[i+1] - phg->hindex[i];
  
    /* SCHEMEA can assume a recv for each edge;
     * SCHEMEB needs to gather the number of edges recv'd from each proc. */
  
    for (i = 0; i < nProc_y; i++) 
      each[i] = phg->dist_y[i+1] - phg->dist_y[i];

    disp[0] = 0;  /* Can't use dist_y because it may not be sizeof(int) */
    for (i=1; i < nProc_y; i++){
      disp[i] = disp[i-1] + each[i-1];
    }
  
    /* SCHEMEA can use phg->dist_y for displacement array.
     * SCHEMEB requires separate displacement array. */

    MPI_Allgatherv(send_buf, phg->nEdge, MPI_INT, 
                   col_hindex, each, disp, MPI_INT, phg->comm->col_comm);
  
    /* Perform prefix sum on col_hindex */
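    /* e.g. gathered per-edge pin counts {2,0,3} become offsets {0,2,2}, total 5 */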
    sum = 0;
    for (i = 0; i < col_nEdge; i++) {
      tmp = col_hindex[i];
      col_hindex[i] = sum;
      sum += tmp;
    }
    col_hindex[col_nEdge] = sum;

    /* Sanity check */
    if (col_hindex[col_nEdge] != col_nPin) {
      printf("%d Sanity check failed:  "
             "col_hindex[col_nEdge] %d != col_nPin %d\n", 
              zz->Proc, col_hindex[col_nEdge], col_nPin);
      exit(-1);
    }
  
    /* Gather edge weights, if any. */
    if (shg->EdgeWeightDim) {
  
      /* Can use nearly the same each array. */
      /* Need to compute new disp array. */
  
      disp[0] = 0;
      each[0] *= phg->EdgeWeightDim;
      for (i = 1; i < nProc_y; i++) {
        each[i] *= phg->EdgeWeightDim;
        disp[i] = disp[i-1] + each[i-1];
      }
      
      MPI_Allgatherv(phg->ewgt, phg->nEdge*phg->EdgeWeightDim, MPI_FLOAT, 
                     shg->ewgt, each, disp, MPI_FLOAT, phg->comm->col_comm);
    }
   
  
    Zoltan_HG_Mirror(col_nEdge, col_hindex, col_hvertex, 
                     col_nVtx, col_vindex, col_vedge);
  
  }  /* End column-gather */
  
  /*************************************************************
   *  2. Gather all non-zeros for edges in processor rows      *
   *  All processors in a processor column now have the same   *
   *  hypergraph; we now gather it across rows.                *
   *************************************************************/

  if (nProc_x == 1) {
    /* 
     * Don't need a gather across the row; just set pointers appropriately
     * in shg.
     */
    shg->vindex = col_vindex;
    shg->vedge = col_vedge;
    shg->hindex = col_hindex;
    shg->hvertex = col_hvertex;

    /* Copy vwgt and fixed arrays so shg owns this memory */
    for (i = 0; i < shg->VtxWeightDim*shg->nVtx; i++)
      shg->vwgt[i] = phg->vwgt[i];
    if (hgp->UseFixedVtx)
      for (i = 0; i < shg->nVtx; i++)
        shg->fixed_part[i] = phg->fixed_part[i];
    if (hgp->UsePrefPart)
      for (i = 0; i < shg->nVtx; i++)
        shg->pref_part[i] = phg->pref_part[i];
  }

  else {

    /* Gather info about size within the row */

    MPI_Allgather(&col_nPin, 1, MPI_INT, recv_size, 1, MPI_INT, 
                  phg->comm->row_comm);
  
    tmp = 0;
    for (i = 0; i < nProc_x; i++) 
      tmp += recv_size[i];

    shg->nPins = tmp;
  
    shg->vindex = (int *) ZOLTAN_CALLOC((shg->nVtx+1), sizeof(int));
    shg->vedge = (int *) ZOLTAN_MALLOC(shg->nPins * sizeof(int));
    shg->hindex = (int *) ZOLTAN_CALLOC((shg->nEdge+1), sizeof(int));
    shg->hvertex = (int *) ZOLTAN_MALLOC(shg->nPins * sizeof(int));
   
    if (!shg->vindex || !shg->hindex ||
        (shg->nPins && (!shg->vedge || !shg->hvertex)))
      MEMORY_ERROR;
    
    /* Gather vedge data for all procs in row */
  
    /* SCHEMEA can send local edge numbers; 
       SCHEMEB requires edge LNO to GNO conversion. */
  
    disp[0] = 0;
    for (i = 1; i < nProc_x; i++)
      disp[i] = disp[i-1] + recv_size[i-1];
  
    MPI_Allgatherv(col_vedge, col_nPin, MPI_INT,
                   shg->vedge, recv_size, disp, MPI_INT, phg->comm->row_comm);
  
    /* Gather vindex data for all procs in row */
  
    for (i = 0; i < col_nVtx; i++)
      send_buf[i] = col_vindex[i+1] - col_vindex[i];
  
    /* SCHEMEA can assume a recv for each vertex;
     * SCHEMEB would need to gather the number of vtxs recv'd from each proc. */
  
    for (i = 0; i < nProc_x; i++) 
      each[i] = (int)(phg->dist_x[i+1] - phg->dist_x[i]);

    disp[0] = 0;  /* Can't use dist_x, may not be sizeof(int) */
    for (i = 1; i < nProc_x; i++) 
      disp[i] = disp[i-1] + each[i-1];

    /* SCHEMEA can use phg->dist_x as displacement array;
     * SCHEMEB requires separate displacement array. */

    MPI_Allgatherv(send_buf, col_nVtx, MPI_INT, 
                   shg->vindex, each, disp,
                   MPI_INT, phg->comm->row_comm);

    /* Perform prefix sum on shg->vindex */
    sum = 0;
    for (i = 0; i < shg->nVtx; i++) {
      tmp = shg->vindex[i];
      shg->vindex[i] = sum;
      sum += tmp;
    }
    shg->vindex[shg->nVtx] = sum;
  
    /* Sanity check */
    if (shg->vindex[shg->nVtx] != shg->nPins) {
      printf("%d Sanity check failed:  "
             "shg->vindex %d != nPins %d\n", 
              zz->Proc, shg->vindex[shg->nVtx], shg->nPins);
      exit(-1);
    }
  
    /* Gather fixed array, if any  */
    if (hgp->UseFixedVtx){
  
#ifdef DEBUG_
      uprintf(phg->comm, "Debug in PHG_gather before gather. phg->fixed =");
      for (i=0; i<phg->nVtx; i++){
        printf(" %d ", phg->fixed_part[i]);
      }
      printf("\n");
#endif

      /* Can use the same each array. */
      /* Need to compute new disp array. */
  
      disp[0] = 0;
      for (i = 1; i < nProc_x; i++) {
        disp[i] = disp[i-1] + each[i-1];
      }
      
      MPI_Allgatherv(phg->fixed_part, phg->nVtx, MPI_INT,
                     shg->fixed_part, each, disp, MPI_INT, phg->comm->row_comm);

#ifdef DEBUG_
      uprintf(phg->comm, "Debug in PHG_gather after gather. shg->fixed =");
      for (i=0; i<shg->nVtx; i++){
        printf(" %d ", shg->fixed_part[i]);
      }
      printf("\n");
#endif
    }
    /* Gather pref part array, if any  */
    if (hgp->UsePrefPart){
      /* Can use the same each array. */
      /* Need to compute new disp array. */
      disp[0] = 0;
      for (i = 1; i < nProc_x; i++) {
        disp[i] = disp[i-1] + each[i-1];
      }
      
      MPI_Allgatherv(phg->pref_part, phg->nVtx, MPI_INT,
                     shg->pref_part, each, disp, MPI_INT, phg->comm->row_comm);
    }
    
    /* Gather vertex weights, if any. */
    if (shg->VtxWeightDim) {
  
      /* Can use nearly the same each array. */
      /* Need to compute new disp array. */
  
      disp[0] = 0;
      each[0] *= phg->VtxWeightDim;
      for (i = 1; i < nProc_x; i++) {
        each[i] *= phg->VtxWeightDim;
        disp[i] = disp[i-1] + each[i-1];
      }
      
      MPI_Allgatherv(phg->vwgt, phg->nVtx*phg->VtxWeightDim, MPI_FLOAT, 
                     shg->vwgt, each, disp, MPI_FLOAT, phg->comm->row_comm);
    }
  
    Zoltan_HG_Mirror(shg->nVtx, shg->vindex, shg->vedge, 
                     shg->nEdge, shg->hindex, shg->hvertex);

  }  /* End row gather */
  
#ifdef KDDKDD_CHECK
  Zoltan_HG_Print(zz, shg, NULL, stdout, "GatherAfter");/* NULL parts for now;
                                                           add non-NULL later */
  Zoltan_PHG_Plot_2D_Distrib(zz, phg);
  Zoltan_PHG_Plot_2D_Distrib(zz, shg);
#endif

End:

  if (ierr < 0) {
    Zoltan_HG_HGraph_Free(*gathered_hg);
    ZOLTAN_FREE(gathered_hg);
  }

  Zoltan_Multifree(__FILE__, __LINE__, 2, &send_buf, 
                                          &recv_size);

  if (nProc_x > 1 && nProc_y > 1) 
    Zoltan_Multifree(__FILE__, __LINE__, 4, &col_vedge,
                                            &col_vindex,
                                            &col_hvertex,
                                            &col_hindex);
  return ierr;
}
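The column-then-row gather described in the opening comment can be seen in miniature in the following self-contained sketch; it is not Zoltan code, and the fixed two-column process grid, the single-integer payload, and the requirement of an even number of ranks are assumptions made purely for illustration.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main (int argc, char **argv)
{
  int rank, size;
  MPI_Init (&argc, &argv);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);
  MPI_Comm_size (MPI_COMM_WORLD, &size);

  int nProc_x = 2;                       /* fixed 2-column grid for the sketch */
  if (size % nProc_x) {
    if (!rank) printf ("run with an even number of ranks\n");
    MPI_Finalize ();
    return 0;
  }
  int nProc_y  = size / nProc_x;
  int myProc_x = rank % nProc_x;         /* my column */
  int myProc_y = rank / nProc_x;         /* my row */

  MPI_Comm col_comm, row_comm;
  MPI_Comm_split (MPI_COMM_WORLD, myProc_x, myProc_y, &col_comm); /* same column */
  MPI_Comm_split (MPI_COMM_WORLD, myProc_y, myProc_x, &row_comm); /* same row */

  /* phase 1: gather one value from every process in my column */
  int mine = rank;
  int *col_vals = (int *) malloc (nProc_y * sizeof(int));
  MPI_Allgather (&mine, 1, MPI_INT, col_vals, 1, MPI_INT, col_comm);

  /* phase 2: gather the column results from every process in my row;
   * afterwards every rank holds one value per process in the whole grid */
  int *all_vals = (int *) malloc (size * sizeof(int));
  MPI_Allgather (col_vals, nProc_y, MPI_INT, all_vals, nProc_y, MPI_INT, row_comm);

  if (rank == 0) {
    for (int i = 0; i < size; i++) printf ("%d ", all_vals[i]);
    printf ("\n");
  }

  free (col_vals); free (all_vals);
  MPI_Comm_free (&col_comm);
  MPI_Comm_free (&row_comm);
  MPI_Finalize ();
  return 0;
}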
Example No. 19
PetscErrorCode  DMSetUp_DA_2D(DM da)
{
  DM_DA            *dd = (DM_DA*)da->data;
  const PetscInt   M            = dd->M;
  const PetscInt   N            = dd->N;
  PetscInt         m            = dd->m;
  PetscInt         n            = dd->n;
  const PetscInt   dof          = dd->w;
  const PetscInt   s            = dd->s;
  DMDABoundaryType bx           = dd->bx;
  DMDABoundaryType by           = dd->by;
  DMDAStencilType  stencil_type = dd->stencil_type;
  PetscInt         *lx          = dd->lx;
  PetscInt         *ly          = dd->ly;
  MPI_Comm         comm;
  PetscMPIInt      rank,size;
  PetscInt         xs,xe,ys,ye,x,y,Xs,Xe,Ys,Ye,start,end,IXs,IXe,IYs,IYe;
  PetscInt         up,down,left,right,i,n0,n1,n2,n3,n5,n6,n7,n8,*idx,nn,*idx_cpy;
  const PetscInt   *idx_full;
  PetscInt         xbase,*bases,*ldims,j,x_t,y_t,s_t,base,count;
  PetscInt         s_x,s_y; /* s proportionalized to w */
  PetscInt         sn0 = 0,sn2 = 0,sn6 = 0,sn8 = 0;
  Vec              local,global;
  VecScatter       ltog,gtol;
  IS               to,from,ltogis;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (stencil_type == DMDA_STENCIL_BOX && (bx == DMDA_BOUNDARY_MIRROR || by == DMDA_BOUNDARY_MIRROR)) SETERRQ(PetscObjectComm((PetscObject)da),PETSC_ERR_SUP,"Mirror boundary and box stencil");
  ierr = PetscObjectGetComm((PetscObject)da,&comm);CHKERRQ(ierr);
#if !defined(PETSC_USE_64BIT_INDICES)
  if (((Petsc64bitInt) M)*((Petsc64bitInt) N)*((Petsc64bitInt) dof) > (Petsc64bitInt) PETSC_MPI_INT_MAX) SETERRQ3(comm,PETSC_ERR_INT_OVERFLOW,"Mesh of %D by %D by %D (dof) is too large for 32 bit indices",M,N,dof);
#endif

  if (dof < 1) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Must have 1 or more degrees of freedom per node: %D",dof);
  if (s < 0) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Stencil width cannot be negative: %D",s);

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  if (m != PETSC_DECIDE) {
    if (m < 1) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Non-positive number of processors in X direction: %D",m);
    else if (m > size) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"Too many processors in X direction: %D %d",m,size);
  }
  if (n != PETSC_DECIDE) {
    if (n < 1) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Non-positive number of processors in Y direction: %D",n);
    else if (n > size) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"Too many processors in Y direction: %D %d",n,size);
  }

  if (m == PETSC_DECIDE || n == PETSC_DECIDE) {
    if (n != PETSC_DECIDE) {
      m = size/n;
    } else if (m != PETSC_DECIDE) {
      n = size/m;
    } else {
      /* try for squarish distribution */
      m = (PetscInt)(0.5 + PetscSqrtReal(((PetscReal)M)*((PetscReal)size)/((PetscReal)N)));
      if (!m) m = 1;
      while (m > 0) {
        n = size/m;
        if (m*n == size) break;
        m--;
      }
      if (M > N && m < n) {PetscInt _m = m; m = n; n = _m;}
    }
    if (m*n != size) SETERRQ(comm,PETSC_ERR_PLIB,"Unable to create partition, check the size of the communicator and input m and n ");
  } else if (m*n != size) SETERRQ(comm,PETSC_ERR_ARG_OUTOFRANGE,"Given Bad partition");

  if (M < m) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"Partition in x direction is too fine! %D %D",M,m);
  if (N < n) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"Partition in y direction is too fine! %D %D",N,n);

  /*
     Determine locally owned region
     xs is the first local node number, x is the number of local nodes
  */
  if (!lx) {
    ierr = PetscMalloc(m*sizeof(PetscInt), &dd->lx);CHKERRQ(ierr);
    lx   = dd->lx;
    for (i=0; i<m; i++) {
      lx[i] = M/m + ((M % m) > i);
    }
  }
  x  = lx[rank % m];
  xs = 0;
  for (i=0; i<(rank % m); i++) {
    xs += lx[i];
  }
#if defined(PETSC_USE_DEBUG)
  left = xs;
  for (i=(rank % m); i<m; i++) {
    left += lx[i];
  }
  if (left != M) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Sum of lx across processors not equal to M: %D %D",left,M);
#endif

  /*
     Determine locally owned region
     ys is the first local node number, y is the number of local nodes
  */
  if (!ly) {
    ierr = PetscMalloc(n*sizeof(PetscInt), &dd->ly);CHKERRQ(ierr);
    ly   = dd->ly;
    for (i=0; i<n; i++) {
      ly[i] = N/n + ((N % n) > i);
    }
  }
  y  = ly[rank/m];
  ys = 0;
  for (i=0; i<(rank/m); i++) {
    ys += ly[i];
  }
#if defined(PETSC_USE_DEBUG)
  left = ys;
  for (i=(rank/m); i<n; i++) {
    left += ly[i];
  }
  if (left != N) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Sum of ly across processors not equal to N: %D %D",left,N);
#endif

  /*
   check if the scatter requires more than one process neighbor or wraps around
   the domain more than once
  */
  if ((x < s) && ((m > 1) || (bx == DMDA_BOUNDARY_PERIODIC))) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Local x-width of domain x %D is smaller than stencil width s %D",x,s);
  if ((y < s) && ((n > 1) || (by == DMDA_BOUNDARY_PERIODIC))) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Local y-width of domain y %D is smaller than stencil width s %D",y,s);
  xe = xs + x;
  ye = ys + y;

  /* determine ghost region (Xs) and region scattered into (IXs)  */
  if (xs-s > 0) {
    Xs = xs - s; IXs = xs - s;
  } else {
    if (bx) {
      Xs = xs - s;
    } else {
      Xs = 0;
    }
    IXs = 0;
  }
  if (xe+s <= M) {
    Xe = xe + s; IXe = xe + s;
  } else {
    if (bx) {
      Xs = xs - s; Xe = xe + s;
    } else {
      Xe = M;
    }
    IXe = M;
  }

  if (bx == DMDA_BOUNDARY_PERIODIC || bx == DMDA_BOUNDARY_MIRROR) {
    IXs = xs - s;
    IXe = xe + s;
    Xs  = xs - s;
    Xe  = xe + s;
  }

  if (ys-s > 0) {
    Ys = ys - s; IYs = ys - s;
  } else {
    if (by) {
      Ys = ys - s;
    } else {
      Ys = 0;
    }
    IYs = 0;
  }
  if (ye+s <= N) {
    Ye = ye + s; IYe = ye + s;
  } else {
    if (by) {
      Ye = ye + s;
    } else {
      Ye = N;
    }
    IYe = N;
  }

  if (by == DMDA_BOUNDARY_PERIODIC || by == DMDA_BOUNDARY_MIRROR) {
    IYs = ys - s;
    IYe = ye + s;
    Ys  = ys - s;
    Ye  = ye + s;
  }

  /* stencil length in each direction */
  s_x = s;
  s_y = s;

  /* determine starting point of each processor */
  nn       = x*y;
  ierr     = PetscMalloc2(size+1,PetscInt,&bases,size,PetscInt,&ldims);CHKERRQ(ierr);
  ierr     = MPI_Allgather(&nn,1,MPIU_INT,ldims,1,MPIU_INT,comm);CHKERRQ(ierr);
  bases[0] = 0;
  for (i=1; i<=size; i++) {
    bases[i] = ldims[i-1];
  }
  for (i=1; i<=size; i++) {
    bases[i] += bases[i-1];
  }
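  /* bases[] now holds the exclusive prefix sum of the local sizes, so
     bases[rank] is the global index of this process's first locally owned node */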
  base = bases[rank]*dof;

  /* allocate the base parallel and sequential vectors */
  dd->Nlocal = x*y*dof;
  ierr       = VecCreateMPIWithArray(comm,dof,dd->Nlocal,PETSC_DECIDE,0,&global);CHKERRQ(ierr);
  dd->nlocal = (Xe-Xs)*(Ye-Ys)*dof;
  ierr       = VecCreateSeqWithArray(PETSC_COMM_SELF,dof,dd->nlocal,0,&local);CHKERRQ(ierr);

  /* generate appropriate vector scatters */
  /* local to global inserts non-ghost point region into global */
  ierr = VecGetOwnershipRange(global,&start,&end);CHKERRQ(ierr);
  ierr = ISCreateStride(comm,x*y*dof,start,1,&to);CHKERRQ(ierr);

  ierr  = PetscMalloc(x*y*sizeof(PetscInt),&idx);CHKERRQ(ierr);
  left  = xs - Xs; right = left + x;
  down  = ys - Ys; up = down + y;
  count = 0;
  for (i=down; i<up; i++) {
    for (j=left; j<right; j++) {
      idx[count++] = i*(Xe-Xs) + j;
    }
  }

  ierr = ISCreateBlock(comm,dof,count,idx,PETSC_OWN_POINTER,&from);CHKERRQ(ierr);
  ierr = VecScatterCreate(local,from,global,to,&ltog);CHKERRQ(ierr);
  ierr = PetscLogObjectParent(dd,ltog);CHKERRQ(ierr);
  ierr = ISDestroy(&from);CHKERRQ(ierr);
  ierr = ISDestroy(&to);CHKERRQ(ierr);

  /* global to local must include ghost points within the domain,
     but not ghost points outside the domain that aren't periodic */
  if (stencil_type == DMDA_STENCIL_BOX) {
    count = (IXe-IXs)*(IYe-IYs);
    ierr  = PetscMalloc(count*sizeof(PetscInt),&idx);CHKERRQ(ierr);

    left  = IXs - Xs; right = left + (IXe-IXs);
    down  = IYs - Ys; up = down + (IYe-IYs);
    count = 0;
    for (i=down; i<up; i++) {
      for (j=left; j<right; j++) {
        idx[count++] = j + i*(Xe-Xs);
      }
    }
    ierr = ISCreateBlock(comm,dof,count,idx,PETSC_OWN_POINTER,&to);CHKERRQ(ierr);

  } else {
    /* must drop into cross shape region */
    /*       ---------|
            |  top    |
         |---         ---| up
         |   middle      |
         |               |
         ----         ---- down
            | bottom  |
            -----------
         Xs xs        xe Xe */
    count = (ys-IYs)*x + y*(IXe-IXs) + (IYe-ye)*x;
    ierr  = PetscMalloc(count*sizeof(PetscInt),&idx);CHKERRQ(ierr);

    left  = xs - Xs; right = left + x;
    down  = ys - Ys; up = down + y;
    count = 0;
    /* bottom */
    for (i=(IYs-Ys); i<down; i++) {
      for (j=left; j<right; j++) {
        idx[count++] = j + i*(Xe-Xs);
      }
    }
    /* middle */
    for (i=down; i<up; i++) {
      for (j=(IXs-Xs); j<(IXe-Xs); j++) {
        idx[count++] = j + i*(Xe-Xs);
      }
    }
    /* top */
    for (i=up; i<up+IYe-ye; i++) {
      for (j=left; j<right; j++) {
        idx[count++] = j + i*(Xe-Xs);
      }
    }
    ierr = ISCreateBlock(comm,dof,count,idx,PETSC_OWN_POINTER,&to);CHKERRQ(ierr);
  }


  /* determine who lies on each side of us stored in    n6 n7 n8
                                                        n3    n5
                                                        n0 n1 n2
  */

  /* Assume the Non-Periodic Case */
  n1 = rank - m;
  if (rank % m) {
    n0 = n1 - 1;
  } else {
    n0 = -1;
  }
  if ((rank+1) % m) {
    n2 = n1 + 1;
    n5 = rank + 1;
    n8 = rank + m + 1; if (n8 >= m*n) n8 = -1;
  } else {
    n2 = -1; n5 = -1; n8 = -1;
  }
  if (rank % m) {
    n3 = rank - 1;
    n6 = n3 + m; if (n6 >= m*n) n6 = -1;
  } else {
    n3 = -1; n6 = -1;
  }
  n7 = rank + m; if (n7 >= m*n) n7 = -1;

  if (bx == DMDA_BOUNDARY_PERIODIC && by == DMDA_BOUNDARY_PERIODIC) {
    /* Modify for Periodic Cases */
    /* Handle all four corners */
    if ((n6 < 0) && (n7 < 0) && (n3 < 0)) n6 = m-1;
    if ((n8 < 0) && (n7 < 0) && (n5 < 0)) n8 = 0;
    if ((n2 < 0) && (n5 < 0) && (n1 < 0)) n2 = size-m;
    if ((n0 < 0) && (n3 < 0) && (n1 < 0)) n0 = size-1;

    /* Handle Top and Bottom Sides */
    if (n1 < 0) n1 = rank + m * (n-1);
    if (n7 < 0) n7 = rank - m * (n-1);
    if ((n3 >= 0) && (n0 < 0)) n0 = size - m + rank - 1;
    if ((n3 >= 0) && (n6 < 0)) n6 = (rank%m)-1;
    if ((n5 >= 0) && (n2 < 0)) n2 = size - m + rank + 1;
    if ((n5 >= 0) && (n8 < 0)) n8 = (rank%m)+1;

    /* Handle Left and Right Sides */
    if (n3 < 0) n3 = rank + (m-1);
    if (n5 < 0) n5 = rank - (m-1);
    if ((n1 >= 0) && (n0 < 0)) n0 = rank-1;
    if ((n1 >= 0) && (n2 < 0)) n2 = rank-2*m+1;
    if ((n7 >= 0) && (n6 < 0)) n6 = rank+2*m-1;
    if ((n7 >= 0) && (n8 < 0)) n8 = rank+1;
  } else if (by == DMDA_BOUNDARY_PERIODIC) {  /* Handle Top and Bottom Sides */
    if (n1 < 0) n1 = rank + m * (n-1);
    if (n7 < 0) n7 = rank - m * (n-1);
    if ((n3 >= 0) && (n0 < 0)) n0 = size - m + rank - 1;
    if ((n3 >= 0) && (n6 < 0)) n6 = (rank%m)-1;
    if ((n5 >= 0) && (n2 < 0)) n2 = size - m + rank + 1;
    if ((n5 >= 0) && (n8 < 0)) n8 = (rank%m)+1;
  } else if (bx == DMDA_BOUNDARY_PERIODIC) { /* Handle Left and Right Sides */
    if (n3 < 0) n3 = rank + (m-1);
    if (n5 < 0) n5 = rank - (m-1);
    if ((n1 >= 0) && (n0 < 0)) n0 = rank-1;
    if ((n1 >= 0) && (n2 < 0)) n2 = rank-2*m+1;
    if ((n7 >= 0) && (n6 < 0)) n6 = rank+2*m-1;
    if ((n7 >= 0) && (n8 < 0)) n8 = rank+1;
  }

  ierr = PetscMalloc(9*sizeof(PetscInt),&dd->neighbors);CHKERRQ(ierr);

  dd->neighbors[0] = n0;
  dd->neighbors[1] = n1;
  dd->neighbors[2] = n2;
  dd->neighbors[3] = n3;
  dd->neighbors[4] = rank;
  dd->neighbors[5] = n5;
  dd->neighbors[6] = n6;
  dd->neighbors[7] = n7;
  dd->neighbors[8] = n8;

  if (stencil_type == DMDA_STENCIL_STAR) {
    /* save corner processor numbers */
    sn0 = n0; sn2 = n2; sn6 = n6; sn8 = n8;
    n0  = n2 = n6 = n8 = -1;
  }

  ierr = PetscMalloc((Xe-Xs)*(Ye-Ys)*sizeof(PetscInt),&idx);CHKERRQ(ierr);
  ierr = PetscLogObjectMemory(da,(Xe-Xs)*(Ye-Ys)*sizeof(PetscInt));CHKERRQ(ierr);

  nn = 0;
  xbase = bases[rank];
  for (i=1; i<=s_y; i++) {
    if (n0 >= 0) { /* left below */
      x_t = lx[n0 % m];
      y_t = ly[(n0/m)];
      s_t = bases[n0] + x_t*y_t - (s_y-i)*x_t - s_x;
      for (j=0; j<s_x; j++) idx[nn++] = s_t++;
    }

    if (n1 >= 0) { /* directly below */
      x_t = x;
      y_t = ly[(n1/m)];
      s_t = bases[n1] + x_t*y_t - (s_y+1-i)*x_t;
      for (j=0; j<x_t; j++) idx[nn++] = s_t++;
    } else if (by == DMDA_BOUNDARY_MIRROR) {
      for (j=0; j<x; j++) idx[nn++] = bases[rank] + x*(s_y - i + 1)  + j;
    }

    if (n2 >= 0) { /* right below */
      x_t = lx[n2 % m];
      y_t = ly[(n2/m)];
      s_t = bases[n2] + x_t*y_t - (s_y+1-i)*x_t;
      for (j=0; j<s_x; j++) idx[nn++] = s_t++;
    }
  }

  for (i=0; i<y; i++) {
    if (n3 >= 0) { /* directly left */
      x_t = lx[n3 % m];
      /* y_t = y; */
      s_t = bases[n3] + (i+1)*x_t - s_x;
      for (j=0; j<s_x; j++) idx[nn++] = s_t++;
    } else if (bx == DMDA_BOUNDARY_MIRROR) {
      for (j=0; j<s_x; j++) idx[nn++] = bases[rank] + x*i + s_x - j;
    }

    for (j=0; j<x; j++) idx[nn++] = xbase++; /* interior */

    if (n5 >= 0) { /* directly right */
      x_t = lx[n5 % m];
      /* y_t = y; */
      s_t = bases[n5] + (i)*x_t;
      for (j=0; j<s_x; j++) idx[nn++] = s_t++;
    } else if (bx == DMDA_BOUNDARY_MIRROR) {
      for (j=0; j<s_x; j++) idx[nn++] = bases[rank] + x*(i + 1) - 2 - j;
    }
  }

  for (i=1; i<=s_y; i++) {
    if (n6 >= 0) { /* left above */
      x_t = lx[n6 % m];
      /* y_t = ly[(n6/m)]; */
      s_t = bases[n6] + (i)*x_t - s_x;
      for (j=0; j<s_x; j++) idx[nn++] = s_t++;
    }

    if (n7 >= 0) { /* directly above */
      x_t = x;
      /* y_t = ly[(n7/m)]; */
      s_t = bases[n7] + (i-1)*x_t;
      for (j=0; j<x_t; j++) idx[nn++] = s_t++;
    } else if (by == DMDA_BOUNDARY_MIRROR) {
      for (j=0; j<x; j++) idx[nn++] = bases[rank] + x*(y - i - 1)  + j;
    }

    if (n8 >= 0) { /* right above */
      x_t = lx[n8 % m];
      /* y_t = ly[(n8/m)]; */
      s_t = bases[n8] + (i-1)*x_t;
      for (j=0; j<s_x; j++) idx[nn++] = s_t++;
    }
  }

  ierr = ISCreateBlock(comm,dof,nn,idx,PETSC_COPY_VALUES,&from);CHKERRQ(ierr);
  ierr = VecScatterCreate(global,from,local,to,&gtol);CHKERRQ(ierr);
  ierr = PetscLogObjectParent(da,gtol);CHKERRQ(ierr);
  ierr = ISDestroy(&to);CHKERRQ(ierr);
  ierr = ISDestroy(&from);CHKERRQ(ierr);

  if (stencil_type == DMDA_STENCIL_STAR) {
    n0 = sn0; n2 = sn2; n6 = sn6; n8 = sn8;
  }

  if (((stencil_type == DMDA_STENCIL_STAR)  ||
       (bx && bx != DMDA_BOUNDARY_PERIODIC) ||
       (by && by != DMDA_BOUNDARY_PERIODIC))) {
    /*
        Recompute the local to global mappings, this time keeping the
      information about the cross corner processor numbers and any ghosted
      but not periodic indices.
    */
    nn    = 0;
    xbase = bases[rank];
    for (i=1; i<=s_y; i++) {
      if (n0 >= 0) { /* left below */
        x_t = lx[n0 % m];
        y_t = ly[(n0/m)];
        s_t = bases[n0] + x_t*y_t - (s_y-i)*x_t - s_x;
        for (j=0; j<s_x; j++) idx[nn++] = s_t++;
      } else if (xs-Xs > 0 && ys-Ys > 0) {
        for (j=0; j<s_x; j++) idx[nn++] = -1;
      }
      if (n1 >= 0) { /* directly below */
        x_t = x;
        y_t = ly[(n1/m)];
        s_t = bases[n1] + x_t*y_t - (s_y+1-i)*x_t;
        for (j=0; j<x_t; j++) idx[nn++] = s_t++;
      } else if (ys-Ys > 0) {
        if (by == DMDA_BOUNDARY_MIRROR) {
          for (j=0; j<x; j++) idx[nn++] = bases[rank] + x*(s_y - i + 1)  + j;
        } else {
          for (j=0; j<x; j++) idx[nn++] = -1;
        }
      }
      if (n2 >= 0) { /* right below */
        x_t = lx[n2 % m];
        y_t = ly[(n2/m)];
        s_t = bases[n2] + x_t*y_t - (s_y+1-i)*x_t;
        for (j=0; j<s_x; j++) idx[nn++] = s_t++;
      } else if (Xe-xe> 0 && ys-Ys > 0) {
        for (j=0; j<s_x; j++) idx[nn++] = -1;
      }
    }

    for (i=0; i<y; i++) {
      if (n3 >= 0) { /* directly left */
        x_t = lx[n3 % m];
        /* y_t = y; */
        s_t = bases[n3] + (i+1)*x_t - s_x;
        for (j=0; j<s_x; j++) idx[nn++] = s_t++;
      } else if (xs-Xs > 0) {
        if (bx == DMDA_BOUNDARY_MIRROR) {
          for (j=0; j<s_x; j++) idx[nn++] = bases[rank] + x*i + s_x - j;
        } else {
          for (j=0; j<s_x; j++) idx[nn++] = -1;
        }
      }

      for (j=0; j<x; j++) idx[nn++] = xbase++; /* interior */

      if (n5 >= 0) { /* directly right */
        x_t = lx[n5 % m];
        /* y_t = y; */
        s_t = bases[n5] + (i)*x_t;
        for (j=0; j<s_x; j++) idx[nn++] = s_t++;
      } else if (Xe-xe > 0) {
        if (bx == DMDA_BOUNDARY_MIRROR) {
          for (j=0; j<s_x; j++) idx[nn++] = bases[rank] + x*(i + 1) - 2 - j;
        } else {
          for (j=0; j<s_x; j++) idx[nn++] = -1;
        }
      }
    }

    for (i=1; i<=s_y; i++) {
      if (n6 >= 0) { /* left above */
        x_t = lx[n6 % m];
        /* y_t = ly[(n6/m)]; */
        s_t = bases[n6] + (i)*x_t - s_x;
        for (j=0; j<s_x; j++) idx[nn++] = s_t++;
      } else if (xs-Xs > 0 && Ye-ye > 0) {
        for (j=0; j<s_x; j++) idx[nn++] = -1;
      }
      if (n7 >= 0) { /* directly above */
        x_t = x;
        /* y_t = ly[(n7/m)]; */
        s_t = bases[n7] + (i-1)*x_t;
        for (j=0; j<x_t; j++) idx[nn++] = s_t++;
      } else if (Ye-ye > 0) {
        if (by == DMDA_BOUNDARY_MIRROR) {
          for (j=0; j<x; j++) idx[nn++] = bases[rank] + x*(y - i - 1)  + j;
        } else {
          for (j=0; j<x; j++) idx[nn++] = -1;
        }
      }
      if (n8 >= 0) { /* right above */
        x_t = lx[n8 % m];
        /* y_t = ly[(n8/m)]; */
        s_t = bases[n8] + (i-1)*x_t;
        for (j=0; j<s_x; j++) idx[nn++] = s_t++;
      } else if (Xe-xe > 0 && Ye-ye > 0) {
        for (j=0; j<s_x; j++) idx[nn++] = -1;
      }
    }
  }
  /*
     Set the local to global ordering in the global vector, this allows use
     of VecSetValuesLocal().
  */
  ierr = ISCreateBlock(comm,dof,nn,idx,PETSC_OWN_POINTER,&ltogis);CHKERRQ(ierr);
  ierr = PetscMalloc(nn*dof*sizeof(PetscInt),&idx_cpy);CHKERRQ(ierr);
  ierr = PetscLogObjectMemory(da,nn*dof*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISGetIndices(ltogis, &idx_full);CHKERRQ(ierr);
  ierr = PetscMemcpy(idx_cpy,idx_full,nn*dof*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISRestoreIndices(ltogis, &idx_full);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreateIS(ltogis,&da->ltogmap);CHKERRQ(ierr);
  ierr = PetscLogObjectParent(da,da->ltogmap);CHKERRQ(ierr);
  ierr = ISDestroy(&ltogis);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingBlock(da->ltogmap,dd->w,&da->ltogmapb);CHKERRQ(ierr);
  ierr = PetscLogObjectParent(da,da->ltogmap);CHKERRQ(ierr);

  ierr  = PetscFree2(bases,ldims);CHKERRQ(ierr);
  dd->m = m;  dd->n  = n;
  /* note petsc expects xs/xe/Xs/Xe to be multiplied by #dofs in many places */
  dd->xs = xs*dof; dd->xe = xe*dof; dd->ys = ys; dd->ye = ye; dd->zs = 0; dd->ze = 1;
  dd->Xs = Xs*dof; dd->Xe = Xe*dof; dd->Ys = Ys; dd->Ye = Ye; dd->Zs = 0; dd->Ze = 1;

  ierr = VecDestroy(&local);CHKERRQ(ierr);
  ierr = VecDestroy(&global);CHKERRQ(ierr);

  dd->gtol      = gtol;
  dd->ltog      = ltog;
  dd->idx       = idx_cpy;
  dd->Nl        = nn*dof;
  dd->base      = base;
  da->ops->view = DMView_DA_2d;
  dd->ltol      = NULL;
  dd->ao        = NULL;
  PetscFunctionReturn(0);
}
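The bases[] computation in the routine above is the recurring MPI_Allgather idiom in these examples: every rank contributes its local element count, and an exclusive prefix sum over the gathered counts yields each rank's global starting offset. Below is a minimal, PETSc-free sketch of that idiom; the variable names and the local count are illustrative, not taken from the code above.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

/* Gather each rank's local count and compute every rank's global
   starting offset (exclusive prefix sum over the gathered counts). */
int main(int argc, char **argv)
{
  int rank, size;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int nlocal = 100 + rank;                    /* illustrative local count */
  int *counts = malloc(size * sizeof(int));
  int *bases  = malloc((size + 1) * sizeof(int));

  /* one int per rank: both send and receive counts are 1 */
  MPI_Allgather(&nlocal, 1, MPI_INT, counts, 1, MPI_INT, MPI_COMM_WORLD);

  bases[0] = 0;
  for (int i = 0; i < size; i++) bases[i + 1] = bases[i] + counts[i];

  printf("rank %d owns global indices [%d, %d)\n",
         rank, bases[rank], bases[rank + 1]);

  free(counts); free(bases);
  MPI_Finalize();
  return 0;
}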
Example 20
//---------------------------------------------------------------------------//
int
all_gatherv(Node &send_node,
            Node &recv_node,
            MPI_Comm mpi_comm)
{
    Node n_snd_compact;
    send_node.compact_to(n_snd_compact);

    int m_size = mpi::size(mpi_comm);

    std::string schema_str = n_snd_compact.schema().to_json();

    int schema_len = schema_str.length() + 1;
    int data_len   = n_snd_compact.total_bytes_compact();
    
    // to do the conduit gatherv, first need a gather to get the 
    // schema and data buffer sizes
    
    int snd_sizes[] = {schema_len, data_len};

    Node n_rcv_sizes;

    Schema s;
    s["schema_len"].set(DataType::c_int());
    s["data_len"].set(DataType::c_int());
    n_rcv_sizes.list_of(s,m_size);

    int mpi_error = MPI_Allgather( snd_sizes, // local data
                                   2, // two ints per rank
                                   MPI_INT, // send ints
                                   n_rcv_sizes.data_ptr(),  // rcv buffer
                                   2,  // two ints per rank
                                   MPI_INT,  // rcv ints
                                   mpi_comm); // mpi com

    CONDUIT_CHECK_MPI_ERROR(mpi_error);
                                
    Node n_rcv_tmp;
    
    int  *schema_rcv_counts = NULL;
    int  *schema_rcv_displs = NULL;
    char *schema_rcv_buff   = NULL;

    int  *data_rcv_counts = NULL;
    int  *data_rcv_displs = NULL;
    char *data_rcv_buff   = NULL;


    // alloc data for the mpi gather counts and displ arrays
    n_rcv_tmp["schemas/counts"].set(DataType::c_int(m_size));
    n_rcv_tmp["schemas/displs"].set(DataType::c_int(m_size));

    n_rcv_tmp["data/counts"].set(DataType::c_int(m_size));
    n_rcv_tmp["data/displs"].set(DataType::c_int(m_size));

    // get pointers to counts and displs
    schema_rcv_counts = n_rcv_tmp["schemas/counts"].value();
    schema_rcv_displs = n_rcv_tmp["schemas/displs"].value();

    data_rcv_counts = n_rcv_tmp["data/counts"].value();
    data_rcv_displs = n_rcv_tmp["data/displs"].value();

    int schema_curr_displ = 0;
    int data_curr_displ   = 0;
    int i=0;
    
    NodeIterator itr = n_rcv_sizes.children();
    while(itr.has_next())
    {
        Node &curr = itr.next();

        int schema_curr_count = curr["schema_len"].value();
        int data_curr_count   = curr["data_len"].value();
        
        schema_rcv_counts[i] = schema_curr_count;
        schema_rcv_displs[i] = schema_curr_displ;
        schema_curr_displ   += schema_curr_count;
        
        data_rcv_counts[i] = data_curr_count;
        data_rcv_displs[i] = data_curr_displ;
        data_curr_displ   += data_curr_count;
        
        i++;
    }
    
    n_rcv_tmp["schemas/data"].set(DataType::c_char(schema_curr_displ));
    schema_rcv_buff = n_rcv_tmp["schemas/data"].value();

    mpi_error = MPI_Allgatherv( const_cast <char*>(schema_str.c_str()),
                                schema_len,
                                MPI_CHAR,
                                schema_rcv_buff,
                                schema_rcv_counts,
                                schema_rcv_displs,
                                MPI_CHAR,
                                mpi_comm);

    CONDUIT_CHECK_MPI_ERROR(mpi_error);

    // build all schemas from JSON, compact them.
    Schema rcv_schema;
    //TODO: should we make it easier to create a compact schema?
    Schema s_tmp;
    for(int i=0;i < m_size; i++)
    {
        Schema &s = s_tmp.append();
        s.set(&schema_rcv_buff[schema_rcv_displs[i]]);
    }
    
    s_tmp.compact_to(rcv_schema);

    // allocate data to hold the gather result
    recv_node.set(rcv_schema);
    data_rcv_buff = (char*)recv_node.data_ptr();
    
    mpi_error = MPI_Allgatherv( n_snd_compact.data_ptr(),
                                data_len,
                                MPI_CHAR,
                                data_rcv_buff,
                                data_rcv_counts,
                                data_rcv_displs,
                                MPI_CHAR,
                                mpi_comm);

    CONDUIT_CHECK_MPI_ERROR(mpi_error);

    return mpi_error;
}
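The function above follows the standard two-phase recipe for gathering variable-length data: a fixed-size MPI_Allgather of the per-rank lengths, then an MPI_Allgatherv whose counts and displacements are derived from those lengths. The sketch below shows the same recipe for plain C strings, without Conduit; the payload contents and buffer names are made up for the example.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Every rank contributes a string of a different length; afterwards
   every rank holds the concatenation of all strings. */
int main(int argc, char **argv)
{
  int rank, size;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  char local[64];
  snprintf(local, sizeof(local), "payload-from-rank-%d", rank);
  int len = (int)strlen(local) + 1;      /* include '\0', like schema_len above */

  int *counts = malloc(size * sizeof(int));
  int *displs = malloc(size * sizeof(int));

  /* phase 1: fixed-size gather of the lengths */
  MPI_Allgather(&len, 1, MPI_INT, counts, 1, MPI_INT, MPI_COMM_WORLD);

  displs[0] = 0;
  for (int i = 1; i < size; i++) displs[i] = displs[i-1] + counts[i-1];
  int total = displs[size-1] + counts[size-1];

  char *all = malloc(total);

  /* phase 2: variable-size gather of the payloads */
  MPI_Allgatherv(local, len, MPI_CHAR,
                 all, counts, displs, MPI_CHAR, MPI_COMM_WORLD);

  if (rank == 0)
    for (int i = 0; i < size; i++) printf("%s\n", all + displs[i]);

  free(counts); free(displs); free(all);
  MPI_Finalize();
  return 0;
}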
Example 21
int main(int argc, char* argv[])
{
  int status = MPI_SUCCESS; 
  pami_result_t result = PAMI_ERROR;

  int provided = MPI_THREAD_SINGLE;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
  /* IBM: --ranks-per-node 64 fails to init threads but this  */
  /* IBM: testcase doesn't really care so don't exit */
  TEST_ASSERT((provided>=MPI_THREAD_MULTIPLE),"MPI_Init_thread"); 

  /* initialize the second client */
  char * clientname = "test"; /* IBM: PE PAMI requires a client name */
  pami_client_t client;
  result = PAMI_Client_create(clientname, &client, NULL, 0);
  TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Client_create");

  /* query properties of the client */
  pami_configuration_t config[3];
  size_t num_contexts;

  config[0].name = PAMI_CLIENT_NUM_TASKS;
  config[1].name = PAMI_CLIENT_TASK_ID;
  config[2].name = PAMI_CLIENT_NUM_CONTEXTS;
  result = PAMI_Client_query(client, config, 3);
  TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Client_query");
  world_size   = config[0].value.intval;
  world_rank   = config[1].value.intval;
  num_contexts = config[2].value.intval;

  if (world_rank==0)
    printf("hello world from rank %ld of %ld, number of contexts %zu \n", world_rank, world_size, num_contexts );/*IBM: debug num_contexts */
  fflush(stdout);

  /* initialize the contexts */
  pami_context_t * contexts;
  contexts = (pami_context_t *) safemalloc( num_contexts * sizeof(pami_context_t) );

  result = PAMI_Context_createv( client, NULL, 0, contexts, num_contexts );
  TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Context_createv");

  /************************************************************************/
  /* IBM: Updating the test with the assumption that we will Rput the     */
  /* IBM: local byte array to our neighbor's shared byte array.           */

  int n = (argc>1 ? atoi(argv[1]) : 1000);

  size_t bytes = n * sizeof(int), bytes_out;/* IBM: debug - scale up testing */
  int *  shared = (int *) safemalloc(bytes);
  for (int i=0; i<n; i++)
    shared[i] = -1;   /*IBM: initialize with -1, replaced with neighbor's rank */

  pami_memregion_t shared_mr;
  result = PAMI_Memregion_create(contexts[0], shared, bytes, &bytes_out, &shared_mr);
  TEST_ASSERT(result == PAMI_SUCCESS && bytes==bytes_out,"PAMI_Memregion_create");

  int *  local  = (int *) safemalloc(bytes);
  for (int i=0; i<n; i++)
    local[i] = world_rank;   /*IBM: initialize with our rank */

  pami_memregion_t local_mr;
  result = PAMI_Memregion_create(contexts[0], local, bytes, &bytes_out, &local_mr); /* IBM: local */
  TEST_ASSERT(result == PAMI_SUCCESS && bytes==bytes_out,"PAMI_Memregion_create");

  status = MPI_Barrier(MPI_COMM_WORLD);
  TEST_ASSERT(status == MPI_SUCCESS,"MPI_Barrier");

  pami_memregion_t * shmrs = (pami_memregion_t *) safemalloc( world_size * sizeof(pami_memregion_t) );

  status = MPI_Allgather(&shared_mr, sizeof(pami_memregion_t), MPI_BYTE, 
                         shmrs,      sizeof(pami_memregion_t), MPI_BYTE, MPI_COMM_WORLD);
  TEST_ASSERT(status == MPI_SUCCESS,"MPI_Allgather");

  int target = (world_rank>0 ? world_rank-1 : world_size-1);
  pami_endpoint_t target_ep;
  result = PAMI_Endpoint_create(client, (pami_task_t) target, 0, &target_ep);
  TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Endpoint_create");

  int active = 2;
  pami_rput_simple_t parameters;
  parameters.rma.dest           = target_ep;
  parameters.rma.bytes          = bytes;
  parameters.rma.cookie         = &active;
  parameters.rma.done_fn        = cb_done;
  parameters.rdma.local.mr      = &local_mr;
  parameters.rdma.local.offset  = 0;
  parameters.rdma.remote.mr     = &shmrs[target]; /*IBM: target's mem region */
  parameters.rdma.remote.offset = 0;
  parameters.put.rdone_fn       = cb_done;
  result = PAMI_Rput(contexts[0], &parameters);
  TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Rput");

  while (active)
  {
    //result = PAMI_Context_advance( contexts[0], 100);
    //TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Context_advance");
    result = PAMI_Context_trylock_advancev(contexts, 1, 1000);
    TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Context_trylock_advancev");
  }

  /* IBM: I'm done with Rput but my world_rank + 1 neighbor might not be so need to advance */
  /* IBM: Could do a barrier or send/recv a completion message instead ....*/

  active = 10; /* IBM: Arbitrary - advance some more - 10*10000 good enough?  */
  while (--active)                                                         /* IBM*/
  {                                                                        /* IBM*/
    result = PAMI_Context_trylock_advancev(contexts, 1, 10000);            /* IBM*/
  /*TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Context_trylock_advancev");*/ /* IBM*/
  }                                                                        /* IBM*/

  int errors = 0;
  
  target = (world_rank<(world_size-1) ? world_rank+1 : 0);
  for (int i=0; i<n; i++)
    if ((shared[i] != target) ||
        (local[i] != world_rank)) /*IBM: also verify didn't change local */
       errors++;

  if (errors>0)
  {
      printf("%ld: %d errors :-( \n", (long)world_rank,  errors);  /*IBM: grep "errors" in scaled up output */
      for (int i=0; i<n; i++)
        printf("%ld: local[%d] = %d , shared[%d] = %d (%d) \n", (long)world_rank, i, local[i], i, shared[i], target); /*IBM: print both arrays */
  }
  else
    printf("%ld: no errors :-) \n", (long)world_rank); 


  MPI_Barrier(MPI_COMM_WORLD);

  result = PAMI_Memregion_destroy(contexts[0], &shared_mr);
  TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Memregion_destroy");

  result = PAMI_Memregion_destroy(contexts[0], &local_mr);
  TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Memregion_destroy");

  free(shmrs);
  free(local);
  free(shared);

  /************************************************************************/

  /* finalize the contexts */
  result = PAMI_Context_destroyv( contexts, num_contexts );
  TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Context_destroyv");

  free(contexts);

  /* finalize the client */
  result = PAMI_Client_destroy( &client );
  TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Client_destroy");

  status = MPI_Barrier(MPI_COMM_WORLD);
  TEST_ASSERT(status == MPI_SUCCESS,"MPI_Barrier");

  MPI_Finalize();

  if (world_rank==0)
    printf("%ld: end of test \n", world_rank );
  fflush(stdout);

  return 0;
}
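In the test above MPI_Allgather is used only for bootstrapping: the opaque pami_memregion_t handles are exchanged as raw bytes so every rank can address its ring neighbour's registered memory. The same trick works for any fixed-size, plain-old-data handle. A minimal sketch follows; reg_handle_t is a stand-in struct invented for the illustration, not a PAMI type.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for an opaque, fixed-size registration/RDMA handle. */
typedef struct { unsigned long key; size_t bytes; } reg_handle_t;

int main(int argc, char **argv)
{
  int rank, size;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  reg_handle_t mine = { 1000u + (unsigned long)rank, 4096 };
  reg_handle_t *all = malloc(size * sizeof(reg_handle_t));

  /* exchange the handles as raw bytes; every rank ends up with all of them */
  MPI_Allgather(&mine, sizeof(reg_handle_t), MPI_BYTE,
                all,   sizeof(reg_handle_t), MPI_BYTE, MPI_COMM_WORLD);

  /* ring neighbour, as in the PAMI Rput test above */
  int target = (rank > 0) ? rank - 1 : size - 1;
  printf("rank %d would target handle key %lu of rank %d\n",
         rank, all[target].key, target);

  free(all);
  MPI_Finalize();
  return 0;
}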
Example 22
/*************************************************************************
* This function compacts a graph by removing the vertex separator
**************************************************************************/
void CompactGraph(CtrlType *ctrl, GraphType *graph, idxtype *perm, WorkSpaceType *wspace)
{
  int i, j, l, nvtxs, cnvtxs, cfirstvtx, nparts, npes; 
  idxtype *xadj, *ladjncy, *adjwgt, *vtxdist, *where;
  idxtype *cmap, *cvtxdist, *newwhere;

  nparts = ctrl->nparts;
  npes = ctrl->npes;

  nvtxs = graph->nvtxs;
  xadj = graph->xadj;
  ladjncy = graph->adjncy;
  adjwgt = graph->adjwgt;
  where = graph->where;

  if (graph->cmap == NULL)
    graph->cmap = idxmalloc(nvtxs+graph->nrecv, "CompactGraph: cmap");
  cmap = graph->cmap;

  vtxdist = graph->vtxdist;

  /*************************************************************
  * Construct the cvtxdist of the contracted graph. Uses the fact
  * that lpwgts stores the local non separator vertices.
  **************************************************************/
  cvtxdist = wspace->pv1;
  cnvtxs = cvtxdist[npes] = idxsum(nparts, graph->lpwgts);

  MPI_Allgather((void *)(cvtxdist+npes), 1, IDX_DATATYPE, (void *)cvtxdist, 1, IDX_DATATYPE, ctrl->comm);
  MAKECSR(i, npes, cvtxdist);

#ifdef DEBUG_ORDER
  PrintVector(ctrl, npes+1, 0, cvtxdist, "cvtxdist");
#endif


  /*************************************************************
  * Construct the cmap vector 
  **************************************************************/
  cfirstvtx = cvtxdist[ctrl->mype];

  /* Create the cmap of what you know so far locally */
  for (cnvtxs=0, i=0; i<nvtxs; i++) {
    if (where[i] < nparts) {
      perm[cnvtxs] = perm[i];
      cmap[i] = cfirstvtx + cnvtxs++;
    }
  }

  CommInterfaceData(ctrl, graph, cmap, wspace->indices, cmap+nvtxs);


  /*************************************************************
  * Finally, compact the graph
  **************************************************************/
  newwhere = idxmalloc(cnvtxs, "CompactGraph: newwhere");
  cnvtxs = l = 0;
  for (i=0; i<nvtxs; i++) {
    if (where[i] < nparts) {
      for (j=xadj[i]; j<xadj[i+1]; j++) {
        if (where[i] == where[ladjncy[j]]) {
          ladjncy[l] = cmap[ladjncy[j]];
          adjwgt[l++] = adjwgt[j];
        }
#ifdef DEBUG_ORDER
        else if (where[ladjncy[j]] < nparts)
          printf("It seems that the separation has failed: %d %d\n", where[i], where[ladjncy[j]]);
#endif
      }

      xadj[cnvtxs] = l;
      graph->vwgt[cnvtxs] = graph->vwgt[i];
      newwhere[cnvtxs] = where[i];
      cnvtxs++;
    }
  }
  for (i=cnvtxs; i>0; i--)
    xadj[i] = xadj[i-1];
  xadj[0] = 0;

  GKfree((void **)&graph->match, (void **)&graph->cmap, (void **)&graph->lperm, (void **)&graph->where, (void **)&graph->label, (void **)&graph->rinfo,
         (void **)&graph->nrinfo, (void **)&graph->lpwgts, (void **)&graph->gpwgts, (void **)&graph->sepind, (void **)&graph->peind,
         (void **)&graph->sendptr, (void **)&graph->sendind, (void **)&graph->recvptr, (void **)&graph->recvind, 
         (void **)&graph->imap, (void **)&graph->rlens, (void **)&graph->slens, (void **)&graph->rcand, (void **)&graph->pexadj, 
         (void **)&graph->peadjncy, (void **)&graph->peadjloc, LTERM);
 
  graph->nvtxs = cnvtxs;
  graph->nedges = l;
  graph->gnvtxs = cvtxdist[npes];
  idxcopy(npes+1, cvtxdist, graph->vtxdist);
  graph->where = newwhere;

}
Example 23
PetscErrorCode  MatGetMultiProcBlock_MPIAIJ(Mat mat, MPI_Comm subComm, MatReuse scall,Mat *subMat)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij  = (Mat_MPIAIJ*)mat->data;
  Mat_SeqAIJ     *aijB = (Mat_SeqAIJ*)aij->B->data;
  PetscMPIInt    commRank,subCommSize,subCommRank;
  PetscMPIInt    *commRankMap,subRank,rank,commsize;
  PetscInt       *garrayCMap,col,i,j,*nnz,newRow,newCol;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&commsize);CHKERRQ(ierr);
  ierr = MPI_Comm_size(subComm,&subCommSize);CHKERRQ(ierr);

  /* create subMat object with the relevant layout */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = MatCreate(subComm,subMat);CHKERRQ(ierr);
    ierr = MatSetType(*subMat,MATMPIAIJ);CHKERRQ(ierr);
    ierr = MatSetSizes(*subMat,mat->rmap->n,mat->cmap->n,PETSC_DECIDE,PETSC_DECIDE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(*subMat,mat,mat);CHKERRQ(ierr);

    /* need to setup rmap and cmap before Preallocation */
    ierr = PetscLayoutSetUp((*subMat)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*subMat)->cmap);CHKERRQ(ierr);
  }

  /* create a map of comm_rank from subComm to comm - should commRankMap and garrayCMap be kept for reuse? */
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)mat),&commRank);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(subComm,&subCommRank);CHKERRQ(ierr);
  ierr = PetscMalloc1(subCommSize,&commRankMap);CHKERRQ(ierr);
  ierr = MPI_Allgather(&commRank,1,MPI_INT,commRankMap,1,MPI_INT,subComm);CHKERRQ(ierr);

  /* Traverse garray and identify column indices [of offdiag mat] that
   should be discarded. For the ones not discarded, store the newCol+1
   value in garrayCMap */
  ierr = PetscCalloc1(aij->B->cmap->n,&garrayCMap);CHKERRQ(ierr);
  for (i=0; i<aij->B->cmap->n; i++) {
    col = aij->garray[i];
    for (subRank=0; subRank<subCommSize; subRank++) {
      rank = commRankMap[subRank];
      if ((col >= mat->cmap->range[rank]) && (col < mat->cmap->range[rank+1])) {
        garrayCMap[i] = (*subMat)->cmap->range[subRank] + col - mat->cmap->range[rank]+1;
        break;
      }
    }
  }

  if (scall == MAT_INITIAL_MATRIX) {
    /* Now compute preallocation for the offdiag mat */
    ierr = PetscCalloc1(aij->B->rmap->n,&nnz);CHKERRQ(ierr);
    for (i=0; i<aij->B->rmap->n; i++) {
      for (j=aijB->i[i]; j<aijB->i[i+1]; j++) {
        if (garrayCMap[aijB->j[j]]) nnz[i]++;
      }
    }
    ierr = MatMPIAIJSetPreallocation(*(subMat),0,NULL,0,nnz);CHKERRQ(ierr);

    /* reuse diag block with the new submat */
    ierr = MatDestroy(&((Mat_MPIAIJ*)((*subMat)->data))->A);CHKERRQ(ierr);

    ((Mat_MPIAIJ*)((*subMat)->data))->A = aij->A;

    ierr = PetscObjectReference((PetscObject)aij->A);CHKERRQ(ierr);
  } else if (((Mat_MPIAIJ*)(*subMat)->data)->A != aij->A) {
    PetscObject obj = (PetscObject)((Mat_MPIAIJ*)((*subMat)->data))->A;

    ierr = PetscObjectReference((PetscObject)obj);CHKERRQ(ierr);

    ((Mat_MPIAIJ*)((*subMat)->data))->A = aij->A;

    ierr = PetscObjectReference((PetscObject)aij->A);CHKERRQ(ierr);
  }

  /* Now traverse aij->B and insert values into subMat */
  for (i=0; i<aij->B->rmap->n; i++) {
    newRow = (*subMat)->rmap->range[subCommRank] + i;
    for (j=aijB->i[i]; j<aijB->i[i+1]; j++) {
      newCol = garrayCMap[aijB->j[j]];
      if (newCol) {
        newCol--; /* remove the increment */
        ierr = MatSetValues(*subMat,1,&newRow,1,&newCol,(aijB->a+j),INSERT_VALUES);CHKERRQ(ierr);
      }
    }
  }

  /* assemble the submat */
  ierr = MatAssemblyBegin(*subMat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*subMat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  /* deallocate temporary data */
  ierr = PetscFree(commRankMap);CHKERRQ(ierr);
  ierr = PetscFree(garrayCMap);CHKERRQ(ierr);
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscFree(nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
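The collective to note here is the MPI_Allgather over subComm that records, for every sub-rank, its rank in the parent communicator; that table is what allows ownership ranges of the parent layout to be translated into the sub-layout. Below is a PETSc-free sketch of just that rank-mapping step; the MPI_Comm_split used to create the subcommunicator is purely illustrative.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

/* Build, on every member of subcomm, the table subrank -> rank in the parent. */
int main(int argc, char **argv)
{
  int prank, srank, ssize;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &prank);

  /* illustrative split: even and odd parent ranks form two subcommunicators */
  MPI_Comm subcomm;
  MPI_Comm_split(MPI_COMM_WORLD, prank % 2, prank, &subcomm);
  MPI_Comm_rank(subcomm, &srank);
  MPI_Comm_size(subcomm, &ssize);

  int *rank_map = malloc(ssize * sizeof(int));

  /* each sub-rank contributes its parent rank; the gather runs on subcomm */
  MPI_Allgather(&prank, 1, MPI_INT, rank_map, 1, MPI_INT, subcomm);

  if (srank == 0) {
    printf("subcomm of parent rank %d maps to parent ranks:", prank);
    for (int i = 0; i < ssize; i++) printf(" %d", rank_map[i]);
    printf("\n");
  }

  free(rank_map);
  MPI_Comm_free(&subcomm);
  MPI_Finalize();
  return 0;
}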
Example 24
/*-------------------------------------------------------------------------*/
int FTI_InitMpiICP(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
        FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
        FTIT_dataset* FTI_Data)
{
    int res;
    FTI_Print("I/O mode: MPI-IO.", FTI_DBUG);
    char str[FTI_BUFS], mpi_err[FTI_BUFS];

    // enable collective buffer optimization
    MPI_Info info;
    MPI_Info_create(&info);
    MPI_Info_set(info, "romio_cb_write", "enable");

    /* 
     * update ckpt file name (necessary for the restart!)
     * not very nice TODO we should think about another mechanism
     */
    snprintf(FTI_Exec->meta[0].ckptFile, FTI_BUFS,
            "Ckpt%d-Rank%d.fti", FTI_Exec->ckptID, FTI_Topo->myRank);

    // TODO enable setting the striping unit in the config file (maybe also other hints)
    // set striping unit to 4MB ("striping_unit" is the ROMIO hint name)
    MPI_Info_set(info, "striping_unit", "4194304");

    char gfn[FTI_BUFS], ckptFile[FTI_BUFS];
    snprintf(ckptFile, FTI_BUFS, "Ckpt%d-mpiio.fti", FTI_Exec->ckptID);
    snprintf(gfn, FTI_BUFS, "%s/%s", FTI_Conf->gTmpDir, ckptFile);
    // open parallel file (collective call)
    MPI_File pfh;

#ifdef LUSTRE
    if (FTI_Topo->splitRank == 0) {
        res = llapi_file_create(gfn, FTI_Conf->stripeUnit, FTI_Conf->stripeOffset, FTI_Conf->stripeFactor, 0);
        if (res) {
            char error_msg[FTI_BUFS];
            error_msg[0] = 0;
            strerror_r(-res, error_msg, FTI_BUFS);
            snprintf(str, FTI_BUFS, "[Lustre] %s.", error_msg);
            FTI_Print(str, FTI_WARN);
        } else {
            snprintf(str, FTI_BUFS, "[LUSTRE] file:%s striping_unit:%i striping_factor:%i striping_offset:%i",
                    ckptFile, FTI_Conf->stripeUnit, FTI_Conf->stripeFactor, FTI_Conf->stripeOffset);
            FTI_Print(str, FTI_DBUG);
        }
    }
#endif
    res = MPI_File_open(FTI_COMM_WORLD, gfn, MPI_MODE_WRONLY|MPI_MODE_CREATE, info, &pfh);

    // check if successful
    if (res != 0) {
        errno = 0;
        int reslen;
        MPI_Error_string(res, mpi_err, &reslen);
        snprintf(str, FTI_BUFS, "unable to create file %s [MPI ERROR - %i] %s", gfn, res, mpi_err);
        FTI_Print(str, FTI_EROR);
        return FTI_NSCS;
    }

    MPI_Offset chunkSize = FTI_Exec->ckptSize;

    // collect chunksizes of other ranks
    MPI_Offset* chunkSizes = talloc(MPI_Offset, FTI_Topo->nbApprocs * FTI_Topo->nbNodes);
    MPI_Allgather(&chunkSize, 1, MPI_OFFSET, chunkSizes, 1, MPI_OFFSET, FTI_COMM_WORLD);

    // set file offset
    MPI_Offset offset = 0;
    int i;
    for (i = 0; i < FTI_Topo->splitRank; i++) {
        offset += chunkSizes[i];
    }
    free(chunkSizes);

    FTI_Exec->iCPInfo.offset = offset;

    memcpy( FTI_Exec->iCPInfo.fh, &pfh, sizeof(FTI_MI_FH) );
    MPI_Info_free(&info);

    return FTI_SCES;

}
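The offset computation above is another common MPI_Allgather pattern: each rank's write offset into a shared file is the sum of the chunk sizes of all lower-ranked processes, obtained by gathering one MPI_Offset per rank and summing a prefix of the result. A reduced sketch with plain MPI-IO follows; the file name, chunk sizes, and the collective write are illustrative, and the Lustre hints and error handling of the FTI code are omitted.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
  int rank, size;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  MPI_Offset chunk = 1024 * (rank + 1);            /* illustrative per-rank size */
  MPI_Offset *chunks = malloc(size * sizeof(MPI_Offset));

  MPI_Allgather(&chunk, 1, MPI_OFFSET, chunks, 1, MPI_OFFSET, MPI_COMM_WORLD);

  MPI_Offset offset = 0;                           /* sum of lower ranks' chunks */
  for (int i = 0; i < rank; i++) offset += chunks[i];

  char *buf = calloc((size_t)chunk, 1);
  MPI_File fh;
  MPI_File_open(MPI_COMM_WORLD, "ckpt.bin",
                MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
  MPI_File_write_at_all(fh, offset, buf, (int)chunk, MPI_BYTE, MPI_STATUS_IGNORE);
  MPI_File_close(&fh);

  free(buf); free(chunks);
  MPI_Finalize();
  return 0;
}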
Example 25
int Zoltan_Preprocess_Graph(
  ZZ *zz,                               /* Zoltan structure */
  ZOLTAN_ID_PTR *global_ids,
  ZOLTAN_ID_PTR *local_ids,
  ZOLTAN_Third_Graph *gr,              /* Graph for third part libs */
  ZOLTAN_Third_Geom  *geo,
  ZOLTAN_Third_Part  *prt,
  ZOLTAN_Third_Vsize *vsp
)
{
  static char *yo = "Zoltan_Preprocess_Graph";

  int ierr;
  float *float_vwgt=NULL, *float_ewgts=NULL;
  char msg[256];
  ZG *graph = &(gr->graph);
  int local;

  char add_obj_weight[MAX_PARAM_STRING_LEN+1];

  ZOLTAN_TRACE_ENTER(zz, yo);

  /* Initialize all local pointers to NULL. This is necessary
   * because we free all non-NULL pointers upon errors.
   */
  gr->vtxdist = gr->xadj = gr->adjncy = NULL;
  gr->vwgt = gr->ewgts = NULL;
  float_vwgt = float_ewgts = NULL;

  if (gr->obj_wgt_dim >= 0) {
    /* Check weight dimensions */
    if (zz->Obj_Weight_Dim<0){
      sprintf(msg, "Object weight dimension is %d, "
	      "but should be >= 0. Using Obj_Weight_Dim = 0.",
	      zz->Obj_Weight_Dim);
      ZOLTAN_PRINT_WARN(zz->Proc, yo, msg);
      gr->obj_wgt_dim = 0;
    }
    else {
      gr->obj_wgt_dim = zz->Obj_Weight_Dim;
    }
  }
  else
    gr->obj_wgt_dim = 0;
  if (gr->edge_wgt_dim >= 0) {
    if (zz->Edge_Weight_Dim<0){
      sprintf(msg, "Edge weight dimension is %d, "
	      "but should be >= 0. Using Edge_Weight_Dim = 0.",
	      zz->Edge_Weight_Dim);
      ZOLTAN_PRINT_WARN(zz->Proc, yo, msg);
      gr->edge_wgt_dim = 0;
    }
    else if (zz->Edge_Weight_Dim>1){
      ZOLTAN_PRINT_WARN(zz->Proc, yo, "This method does not support "
			"multidimensional edge weights. Using Edge_Weight_Dim = 1.");
      gr->edge_wgt_dim = 1;
    }
    else {
      gr->edge_wgt_dim = zz->Edge_Weight_Dim;
    }
  }
  else
      gr->edge_wgt_dim = 0;

  /* Default graph type is GLOBAL. */

  /* Get parameter options shared by ParMetis and Jostle */
  gr->check_graph = 1;          /* default */
  gr->scatter = 1;              /* default */
  gr->final_output = 0;
  strcpy(add_obj_weight, "NONE");  /* default */
  Zoltan_Bind_Param(Graph_params, "CHECK_GRAPH", (void *) &gr->check_graph);
  Zoltan_Bind_Param(Graph_params, "SCATTER_GRAPH", (void *) &gr->scatter);
  Zoltan_Bind_Param(Graph_params, "FINAL_OUTPUT", (void *) &gr->final_output);
  Zoltan_Bind_Param(Graph_params, "ADD_OBJ_WEIGHT", (void *) add_obj_weight);
  Zoltan_Assign_Param_Vals(zz->Params, Graph_params, zz->Debug_Level, zz->Proc,
			   zz->Debug_Proc);


  /* If reorder is true, we already have the id lists. Ignore weights. */
  if ((*global_ids == NULL) || (!gr->id_known)){
    int *input_part = NULL;
    ierr = Zoltan_Get_Obj_List(zz, &gr->num_obj, global_ids, local_ids,
			       gr->obj_wgt_dim, &float_vwgt, &input_part);
    CHECK_IERR;
    if (prt) {
      prt->input_part = input_part;
    }
    else if (input_part) { /* Ordering, don't need part */
      ZOLTAN_FREE(&input_part);
    }
    if (ierr){
      /* Return error */
      ZOLTAN_PARMETIS_ERROR(ierr, "Get_Obj_List returned error.");
    }
  }
  /* Build Graph for third party library data structures, or just get vtxdist. */

  if (gr->get_data) {
    local = IS_LOCAL_GRAPH(gr->graph_type);
    ZOLTAN_FREE(&float_vwgt);
    ierr = Zoltan_ZG_Build (zz, graph, local); /* Normal graph */
    CHECK_IERR;
    ierr = Zoltan_ZG_Export (zz, graph,
			     &gr->num_obj, &gr->num_obj, &gr->obj_wgt_dim, &gr->edge_wgt_dim,
			     &gr->vtxdist, &gr->xadj, &gr->adjncy, &gr->adjproc,
			     &float_vwgt, &float_ewgts, NULL);
    /* TODO: support graph redistribution */
/*   if (prt) */
/*     ierr = Zoltan_ZG_Vertex_Info(zz, &graph, global_ids, &prt->input_part); */
/*   else */
/*     ierr = Zoltan_ZG_Vertex_Info(zz, &graph, global_ids, NULL); */

/*     /\* Just to try *\/ */
/*     if (prt) { */
/*       prt->input_part = (int *)ZOLTAN_CALLOC(gr->num_obj, sizeof(int)); */
/*     } */

    if (ierr != ZOLTAN_OK && ierr != ZOLTAN_WARN){
      ZOLTAN_PARMETIS_ERROR(ierr, "Zoltan_Build_Graph returned error.");
    }
  }
  else{ /* Only geometry */
    int i;
    /* No graph but still needs vtxdist*/
    gr->vtxdist = (int*) ZOLTAN_MALLOC ((zz->Num_Proc+1)*sizeof(int));
    if (gr->vtxdist == NULL)
      ZOLTAN_PARMETIS_ERROR(ZOLTAN_MEMERR, "Out of memory.");

    gr->vtxdist[0] = 0;
    MPI_Allgather(&gr->num_obj, 1, MPI_INT, gr->vtxdist+1, 1, MPI_INT, zz->Communicator);
    for (i=1 ; i <= zz->Num_Proc ; ++i) {
      gr->vtxdist[i] += gr->vtxdist[i-1];
    }
  }

  if (prt) {
    prt->part_sizes = prt->input_part_sizes;
    /* ParMETIS needs prt->part to be allocated, even when num_obj=0. */
    /* See Zoltan bug 4299. */
    prt->part = (indextype *)ZOLTAN_MALLOC((gr->num_obj+1) * sizeof(indextype));
    if (!prt->part) {
      /* Not enough memory */
      ZOLTAN_PARMETIS_ERROR(ZOLTAN_MEMERR, "Out of memory.");
    }
    if (gr->num_obj >0) {
      memcpy (prt->part, prt->input_part, (gr->num_obj) * sizeof(indextype));
    }
    else {
      prt->input_part = NULL;
    }
  }

  /* Convert from float. */

  /* Get vertex weights if needed */
  if (gr->obj_wgt_dim){
    ierr = Zoltan_Preprocess_Scale_Weights (gr, float_vwgt, &gr->vwgt,
					    gr->num_obj, gr->obj_wgt_dim, 1, zz,
					    "vertex", gr->vtxdist[zz->Proc]);
    if (ierr != ZOLTAN_OK && ierr != ZOLTAN_WARN){
      /* Return error code */
      ZOLTAN_PARMETIS_ERROR(ierr, "Error in scaling of weights.");
    }
    ZOLTAN_FREE(&float_vwgt);
  }

  if (strcasecmp(add_obj_weight, "NONE")){
    if (Zoltan_Preprocess_Add_Weight(zz, gr, prt, add_obj_weight) != ZOLTAN_OK)
      ZOLTAN_PARMETIS_ERROR(ierr, "Error in adding vertex weights.");
  }

  /* Get edge weights if needed */
  if (gr->get_data)
    gr->num_edges = gr->xadj[gr->num_obj];
  else {
    gr->num_edges = 0;
    gr->edge_wgt_dim = 0;
  }

  if (gr->edge_wgt_dim){
    ierr = Zoltan_Preprocess_Scale_Weights (gr, float_ewgts, &gr->ewgts,
					    gr->num_edges, gr->edge_wgt_dim, 1, zz,
					    "edge", 0);
    if (ierr != ZOLTAN_OK && ierr != ZOLTAN_WARN){
      /* Return error code */
      ZOLTAN_PARMETIS_ERROR(ierr, "Error in scaling of weights.");
    }
    if (!gr->final_output) {
      ZOLTAN_FREE(&float_ewgts);
    }
    else
      gr->float_ewgts = float_ewgts;
  }
  else {
    ZOLTAN_FREE(&float_ewgts);
  }

  if (geo){
    ierr = Zoltan_Preprocess_Extract_Geom (zz, global_ids, local_ids, gr, geo);
    if (ierr) {
      ZOLTAN_PARMETIS_ERROR(ZOLTAN_FATAL,
			    "Error returned from Zoltan_Preprocess_Extract_Geom");
    }
  }


  if (vsp) {
    ierr = Zoltan_Preprocess_Extract_Vsize (zz, global_ids, local_ids, gr, vsp);
    if (ierr) {
      ZOLTAN_PARMETIS_ERROR(ZOLTAN_FATAL,
			  "Error returned from Zoltan_Preprocess_Extract_Vsize");
    }
  }

  /* Scatter graph?
   * If data distribution is highly imbalanced, it is better to
   * redistribute the graph data structure before calling ParMetis.
   * After partitioning, the results must be mapped back.
   */
  if (gr->scatter < gr->scatter_min) gr->scatter = gr->scatter_min;

  if (gr->scatter>0) {
    ierr = Zoltan_Preprocess_Scatter_Graph (zz, gr, prt, geo, vsp);
    if (ierr != ZOLTAN_OK && ierr != ZOLTAN_WARN) {
      ZOLTAN_PARMETIS_ERROR(ZOLTAN_FATAL,
			    "Error returned from Zoltan_Preprocess_Scatter_Graph");
    }
  }


  /* Verify that graph is correct */
  if (gr->get_data){
    int flag;

    if (zz->Debug_Level >= ZOLTAN_DEBUG_ALL)
      flag = 2; /* Maximum output level */
    else
      flag = 1; /* Medium output level */
    ierr = Zoltan_Verify_Graph(zz->Communicator, gr->vtxdist, gr->xadj, gr->adjncy, gr->vwgt,
	      gr->ewgts, gr->obj_wgt_dim, gr->edge_wgt_dim, gr->graph_type, gr->check_graph, flag);

  }

 End:

  ZOLTAN_TRACE_EXIT(zz, yo);

  return (ierr);
}
Example 26
static int gather_and_build_remap(
  ZZ *zz, 
  int *new_map,               /* Upon return, flag indicating whether parts
                                 assignments were changed due to remap. */
  int HEcnt,                  /* # of HEs allocated. */
  int *HEinfo                 /* Array of HE info; for each HE, two pins and 
                                 one edge weight. Stored as a single vector
                                 to minimize communication calls.  */
)
{
char *yo = "gather_and_remap";
int ierr = ZOLTAN_OK;
int i, uidx, tmp;
int *each_size = NULL;        /* sizes (# HEs * HEINFO_ENTRIES) for each proc */
int *recvbuf = NULL;          /* Receive buffer for gatherv */
int *displs = NULL;           /* Displacement buffer for gatherv */
int send_size;                /* Local # HEs * HEINFO_ENTRIES */
int total_size;               /* Total # ints in gatherv */
int total_HEcnt;              /* Total (across all procs) number of HEs. */
int max0, max1;               /* Max values of pin 0 and pin 1 for each HE. */
int *match = NULL;            /* Vector describing the matching. 
                                 match[i] = j ==> match[j] = i ==> 
                                 vertices i and j are matched. */
int *used = NULL;             /* Vector indicating which partitions are used
                                 in the matching. */
int limit;                    /* Maximum number of matches that are allowed */
HGraph hg;                    /* Hypergraph for matching */
float before_remap = 0,       /* Amount of data that overlaps between old and */
      after_remap = 0;        /* new decomposition before and after remapping, 
                                 respectively. */
float with_oldremap = 0;      /* Amount of data that overlaps between old and
                                 new decomposition using the OldRemap vector
                                 (remapping from the previous decomposition). */


  /* Gather HEs from each processor into a local complete HG. */

  each_size = (int *) ZOLTAN_MALLOC(zz->Num_Proc * sizeof(int));
  if (!each_size) {
    ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error.");
    ierr = ZOLTAN_MEMERR;
    goto End;
  }
  send_size = HEcnt * HEINFO_ENTRIES;
  MPI_Allgather(&send_size, 1, MPI_INT, each_size, 1, MPI_INT,
                zz->Communicator);

  for (total_size = 0, i = 0; i < zz->Num_Proc; i++) {
    total_size += each_size[i];
  }

  recvbuf = (int *) ZOLTAN_MALLOC((zz->Num_Proc + total_size) * sizeof(int));
  displs = recvbuf + total_size;
  if (!recvbuf) {
    ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error.");
    ierr = ZOLTAN_MEMERR;
    goto End;
  }

  displs[0] = 0;
  for (i = 1; i < zz->Num_Proc; i++)
    displs[i] = displs[i-1] + each_size[i-1];

  MPI_Allgatherv(HEinfo, send_size, MPI_INT, 
                 recvbuf, each_size, displs, MPI_INT, zz->Communicator);

  total_HEcnt = total_size / HEINFO_ENTRIES;
  for (max0 = -1, max1 = -1, i = 0; i < total_HEcnt; i++) {
    tmp = i * HEINFO_ENTRIES;
    if (recvbuf[tmp] > max0) max0 = recvbuf[tmp];
    if (recvbuf[tmp+1] > max1) max1 = recvbuf[tmp+1];
  }
  /* Increment max0 and max1 so that they are the maximum number of unique
     pin values for pin0 and pin1 respectively; i.e., allow pin value == 0. */
  max0++;
  max1++;
  
  /* Sanity check */
  /* Ideally, max1 should equal LB.Num_Global_Parts, but ParMETIS3 sometimes
   * does not return the correct number of non-empty partitions, allowing
   * max1 to be less than LB.Num_Global_Parts. 
   * (e.g., ewgt.adaptive-partlocal1-v3.4.?).
   */
  if (max1 > zz->LB.Num_Global_Parts) 
    ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Unexpected value for max1.");

  /* Set up global HG */

  Zoltan_HG_HGraph_Init(&hg);
  if (total_HEcnt) {
    hg.nVtx = max0 + zz->LB.Num_Global_Parts;  
    hg.nEdge = total_HEcnt;
    hg.nPins = total_HEcnt * 2;   /* two pins per HE */
    hg.EdgeWeightDim = 1;
    hg.ewgt = (float *) ZOLTAN_MALLOC(total_HEcnt * sizeof(float));
    hg.hindex = (int *) ZOLTAN_MALLOC((total_HEcnt + 1) * sizeof(int));
    hg.hvertex = (int *) ZOLTAN_MALLOC((hg.nPins) * sizeof(int));
    if (!hg.ewgt || !hg.hindex || !hg.hvertex) {
      ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error.");
      ierr = ZOLTAN_MEMERR;
      goto End;
    }

    for (i = 0; i < total_HEcnt; i++) {
      tmp = i * HEINFO_ENTRIES;
      hg.hindex[i] = i+i; 
      hg.hvertex[i+i] = recvbuf[tmp];
      hg.hvertex[i+i+1] = recvbuf[tmp+1]+max0;
      hg.ewgt[i] = recvbuf[tmp+2];
    }
    hg.hindex[total_HEcnt] = total_HEcnt + total_HEcnt;

    ierr = Zoltan_HG_Create_Mirror(zz, &hg);
    if (ierr < 0) goto End;
  }

  before_remap = measure_stays(zz, &hg, max0, NULL, "BEFORE");

  /* Compute the amount of overlap when using the old remap vector. */

  with_oldremap = measure_stays(zz, &hg, max0, zz->LB.OldRemap, "WITHOLD");

  /* Do matching */

  match = (int *) ZOLTAN_CALLOC(hg.nVtx + zz->LB.Num_Global_Parts, sizeof(int));
  used = match + hg.nVtx;
  if (hg.nVtx && !match) {
    ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error.");
    ierr = ZOLTAN_MEMERR;
    goto End;
  }

  /* Max # matches allowed */
  limit = (max0 < zz->LB.Num_Global_Parts ? max0 : zz->LB.Num_Global_Parts); 
  do_match(zz, &hg, match, limit);

      
  /* Build remapping vector, if non-trivial matching was returned. */

  *new_map = 0;
  for (i = 0; i < zz->LB.Num_Global_Parts; i++) 
    if (match[i+max0] != i+max0) {
      *new_map = 1;
      break;
    }

  if (*new_map) {

    zz->LB.Remap = (int *) ZOLTAN_MALLOC(zz->LB.Num_Global_Parts * sizeof(int));
    if (!(zz->LB.Remap)) {
      ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error.");
      ierr = ZOLTAN_MEMERR;
      goto End;
    }


    /* First, process all parts that were matched. Mark matched parts as used.*/

    for (i = 0; i < zz->LB.Num_Global_Parts; i++) {
      zz->LB.Remap[i] = -1; 
      tmp = match[i+max0];
      if (tmp != i+max0) {
        zz->LB.Remap[i] = tmp;
        used[tmp] = 1;
      }
    }

    /* Second, process unmatched parts; if possible, keep same part number. */

    for (i = 0; i < zz->LB.Num_Global_Parts; i++) {
      if (zz->LB.Remap[i] > -1) continue;  /* Already processed part i */
      /* match[i+max0] == i+max0 */
      if (!used[i]) {  /* Keep the same part number if it is not used */
        zz->LB.Remap[i] = i;
        used[i] = 1;
      }
    }
  
    /* Third, process remaining unmatched parts; assign them to 
       unused partitions.*/
  
    for (uidx = 0, i = 0; i < zz->LB.Num_Global_Parts; i++) {
      if (zz->LB.Remap[i] > -1) continue;  /* Already processed part i */
      /* match[i+max0] == i+max0 */
      while (used[uidx]) uidx++;   /* Find next unused partition */
      zz->LB.Remap[i] = uidx;
      used[uidx] = 1;
    }
  }

  if (*new_map) 
    after_remap = measure_stays(zz, &hg, max0, zz->LB.Remap, "AFTER ");

  if ((before_remap >= after_remap) && (before_remap >= with_oldremap)) {
    /* No benefit from remapping; don't keep it! */
    ZOLTAN_FREE(&zz->LB.Remap);
    ZOLTAN_FREE(&zz->LB.OldRemap);
    *new_map = 0;
  }
  else if (with_oldremap >= after_remap) {
    /* The old remap vector is better than the new one; keep the old one. */
    ZOLTAN_FREE(&zz->LB.Remap);
    zz->LB.Remap = zz->LB.OldRemap;
    zz->LB.OldRemap = NULL;
    *new_map = 1;
  }
  else {
    /* Going to use the new remap vector; free the old one. */
    ZOLTAN_FREE(&zz->LB.OldRemap);
  }

  if (zz->Debug_Level >= ZOLTAN_DEBUG_ALL && zz->Proc == zz->Debug_Proc &&
      zz->LB.Remap) 
    for (i = 0; i < zz->LB.Num_Global_Parts; i++) 
      printf("%d REMAP Part %d to Part %d\n", zz->Proc, i, zz->LB.Remap[i]);

End:
  ZOLTAN_FREE(&match);
  ZOLTAN_FREE(&each_size);
  ZOLTAN_FREE(&recvbuf);
  Zoltan_HG_HGraph_Free(&hg);
  return ierr;
}
Example 27
/***************************************************
 **** K-SPACE CONTRIBUTION
 ***************************************************/
void ewald_compute_kspace(ewald_data_struct* d, 
    fcs_int num_particles,
    fcs_float *positions,
    fcs_float *charges,
    fcs_float *fields,
    fcs_float *potentials) {

  FCS_INFO(fprintf(stderr, "ewald_compute_kspace started...\n"));

  /* DISTRIBUTE ALL PARTICLE DATA TO ALL NODES */
  /* Gather all particle numbers */
  int node_num_particles = num_particles;
  int node_particles[d->comm_size]; 
  int node_particles3[d->comm_size]; 
  int displs[d->comm_size];
  int displs3[d->comm_size];
  fcs_int total_particles;

  /* printf("%d: num_particles=%d\n", d->comm_rank, num_particles); */
  MPI_Allgather(&node_num_particles, 1, MPI_INT, node_particles, 1, MPI_INT, d->comm);

  /* compute displacements for MPI_Allgatherv / MPI_Scatterv */
  total_particles = node_particles[0];
  node_particles3[0] = node_particles[0]*3;
  displs[0] = 0;
  displs3[0] = 0;
  for (fcs_int i=1; i < d->comm_size; i++) {
    total_particles += node_particles[i];
    node_particles3[i] = node_particles[i]*3;
    displs[i] = displs[i-1] + node_particles[i-1];
    displs3[i] = displs3[i-1] + node_particles3[i-1];
  }

  fcs_float *all_positions, *all_charges, *node_fields, *node_potentials;

  all_positions = malloc(sizeof(fcs_float) * 3 * total_particles);
  all_charges = malloc(sizeof(fcs_float) * total_particles);
  node_fields = malloc(sizeof(fcs_float) * 3 * total_particles);
  node_potentials = malloc(sizeof(fcs_float) * total_particles);

  /* gather all particle data at all nodes */
  MPI_Allgatherv(positions, num_particles*3, FCS_MPI_FLOAT, all_positions, node_particles3, displs3, FCS_MPI_FLOAT, d->comm);
  MPI_Allgatherv(charges, num_particles, FCS_MPI_FLOAT, all_charges, node_particles, displs, FCS_MPI_FLOAT, d->comm);

  /* for (fcs_int i=0; i < total_particles; i++) { */
  /*   printf("%d: all_positions[%d]={%lf, %lf, %lf}\n", d->comm_rank, i, all_positions[i*3], all_positions[3*i+1], all_positions[3*i+2]); */
  /*   printf("%d: all_charges[%d]=%lf\n", d->comm_rank, i, all_charges[i]); */
  /* } */

  /* INIT ALGORITHM */

  /* init fields and potentials */
  if (fields != NULL) {
    for (fcs_int i=0; i < total_particles; i++) {
      node_fields[3*i] = 0.0;
      node_fields[3*i+1] = 0.0;
      node_fields[3*i+2] = 0.0;
    }
  }
  if (potentials != NULL)
    for (fcs_int i=0; i < total_particles; i++)
      node_potentials[i] = 0.0;
  
  /* COMPUTE FAR FIELDS */

  /* evenly distribute the k-vectors onto all tasks */
  fcs_int num_k_per_dir = 2*d->kmax+1;
  fcs_int num_k = num_k_per_dir * num_k_per_dir * num_k_per_dir;
  for (int k_ind = d->comm_rank; k_ind < num_k; k_ind += d->comm_size) {
    /* compute fields and potentials */
    fcs_int nx = 
      k_ind % num_k_per_dir - d->kmax;
    fcs_int ny = 
      k_ind % (num_k_per_dir*num_k_per_dir) / num_k_per_dir - d->kmax;
    fcs_int nz = 
      k_ind / (num_k_per_dir*num_k_per_dir) - d->kmax;
    if (nx*nx + ny*ny + nz*nz <= d->kmax*d->kmax) {
      /* system length vector L_vec */
      const fcs_float Lx = d->box_l[0];
      const fcs_float Ly = d->box_l[1];
      const fcs_float Lz = d->box_l[2];
      /* reciprocal vector k_vec */
      const fcs_float kx = 2.0*M_PI*nx / Lx;
      const fcs_float ky = 2.0*M_PI*ny / Ly;
      const fcs_float kz = 2.0*M_PI*nz / Lz;
      /* reciprocal charge density rhohat */
      fcs_float rhohat_re = 0.0;
      fcs_float rhohat_im = 0.0;
      
      /* compute Deserno, Holm (1998) eq. (8) */
      for (fcs_int i=0; i < total_particles; i++) {
        /* charge q */
        const fcs_float q = all_charges[i];
        if (!fcs_float_is_zero(q)) {
          /* particle position r_vec */
          const fcs_float rx = all_positions[3*i];
          const fcs_float ry = all_positions[3*i+1];
          const fcs_float rz = all_positions[3*i+2];
          /* compute k_vec*r_vec */
          fcs_float kr = kx*rx + ky*ry + kz*rz;
          /* rhohat = qi * exp(-i*k_vec*r_vec) */
          rhohat_re += q * cos(kr);
          rhohat_im += q * -sin(kr);
        }
      }
      
/*      FCS_DEBUG(fprintf(stderr, "  n_vec= (%d, %d, %d) rhohat_re=%e rhohat_im=%e\n",
        nx, ny, nz, rhohat_re, rhohat_im));*/
      
      /* fetch influence function */
      fcs_float g = d->G[linindex(abs(nx), abs(ny), abs(nz), d->kmax)];
      for (fcs_int i=0; i < total_particles; i++) {
        /* particle position r_vec */
        const fcs_float rx = all_positions[3*i];
        const fcs_float ry = all_positions[3*i+1];
        const fcs_float rz = all_positions[3*i+2];
        /* compute k_vec*r_vec */
        fcs_float kr = kx*rx + ky*ry + kz*rz;

        if (fields != NULL) {
          /* compute field at position of particle i
             compare to Deserno, Holm (1998) eq. (15) */
          fcs_float fak1 = g * (rhohat_re*sin(kr) + rhohat_im*cos(kr));
          node_fields[3*i] += kx * fak1;
          node_fields[3*i+1] += ky * fak1;
          node_fields[3*i+2] += kz * fak1;
        }
        if (potentials != NULL) {
          /* compute potential at position of particle i
             compare to Deserno, Holm (1998) eq. (9) */
          node_potentials[i] += g * (rhohat_re*cos(kr) - rhohat_im*sin(kr));
        }
      }
    }
  }
  
  /* printf("%d: node_fields[0]=%lf\n", d->comm_rank, node_fields[0]); */

  /* REDISTRIBUTE COMPUTED FAR FIELDS AND POTENTIALS  */
  if (fields != NULL) {
    fcs_float all_fields[total_particles*3];
    for (fcs_int pid=0; pid < total_particles; pid++) {
      all_fields[3*pid] = 0.0;
      all_fields[3*pid+1] = 0.0;
      all_fields[3*pid+2] = 0.0;
    }

    /* Combine all fields on master */
    MPI_Reduce(node_fields, all_fields, total_particles*3, 
      FCS_MPI_FLOAT, MPI_SUM, 0, d->comm);
    /* if (d->comm_rank == 0) */
    /*   printf("%d: all_fields[0]=%lf\n", d->comm_rank, all_fields[0]); */
    /* Scatter the fields to the task that holds the particle */
    MPI_Scatterv(all_fields, node_particles3, displs3, FCS_MPI_FLOAT,
      fields, num_particles*3, FCS_MPI_FLOAT, 0, d->comm);
  }

  if (potentials != NULL) {
    fcs_float all_potentials[total_particles];
    for (fcs_int pid=0; pid < total_particles; pid++)
      all_potentials[pid] = 0.0;
    /* Combine all potentials on master */
    MPI_Reduce(node_potentials, all_potentials, total_particles, 
      FCS_MPI_FLOAT, MPI_SUM, 0, d->comm);
    /* Scatter the potentials to the task that holds the particle */
    MPI_Scatterv(all_potentials, node_particles, displs, FCS_MPI_FLOAT,
      potentials, num_particles, FCS_MPI_FLOAT, 0, d->comm);

    /* subtract self potential */
    FCS_INFO(fprintf(stderr, "  subtracting self potential...\n"));
    for (fcs_int i=0; i < num_particles; i++) {
      potentials[i] -= charges[i] * M_2_SQRTPI * d->alpha;
    }
  }

  free(all_positions);
  free(all_charges);
  free(node_fields);
  free(node_potentials);

  /* now each task should have its far field components */
  FCS_INFO(fprintf(stderr, "ewald_compute_kspace finished.\n"));
}
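The communication skeleton of ewald_compute_kspace is: replicate all particle data on every rank (MPI_Allgather of the counts, MPI_Allgatherv of positions and charges), let each rank process a subset of k-vectors, then combine the partial results with MPI_Reduce on rank 0 and hand each rank its own slice back with MPI_Scatterv. The sketch below shows only that return path for a generic array of per-particle values; all names and the equal per-rank count are illustrative.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

/* Each rank owns nloc values; every rank computes a partial contribution to
   all values; the partials are summed on rank 0 and scattered back so each
   rank receives the totals for the values it owns. */
int main(int argc, char **argv)
{
  int rank, size;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int nloc = 4;                                    /* illustrative, equal on all ranks */
  int *counts = malloc(size * sizeof(int));
  int *displs = malloc(size * sizeof(int));
  MPI_Allgather(&nloc, 1, MPI_INT, counts, 1, MPI_INT, MPI_COMM_WORLD);
  displs[0] = 0;
  for (int i = 1; i < size; i++) displs[i] = displs[i-1] + counts[i-1];
  int ntot = displs[size-1] + counts[size-1];

  /* this rank's partial contribution to every value */
  double *partial = malloc(ntot * sizeof(double));
  for (int i = 0; i < ntot; i++) partial[i] = (double)rank;

  double *summed = (rank == 0) ? malloc(ntot * sizeof(double)) : NULL;
  MPI_Reduce(partial, summed, ntot, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

  double *mine = malloc(nloc * sizeof(double));
  MPI_Scatterv(summed, counts, displs, MPI_DOUBLE,
               mine, nloc, MPI_DOUBLE, 0, MPI_COMM_WORLD);

  printf("rank %d: first owned total = %g\n", rank, mine[0]);

  free(counts); free(displs); free(partial); free(mine);
  if (rank == 0) free(summed);
  MPI_Finalize();
  return 0;
}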
Example 28
/**
 * If A[n*n], B[n*1], C[n*1] are matrices and A*C = B, then C = A^-1 * B.
 * The input is one augmented matrix [A|B] of size n*(n+1).
 * Scatter the matrix row blocks between the processes,
 * collect the local piece of B in each process,
 * Allgather B so that every process holds the full vector,
 * then do the multiplication.
 **/
int main(int argc, char  *argv[])
{
 	int my_rank;
 	int size ;
 	int tag;
 	int source;
 	int dest ;
 	MPI_Status status;


 	MPI_Init(&argc,&argv);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	MPI_Comm_rank(MPI_COMM_WORLD , &my_rank);
 	 
 	int input = 0 ;
 	int A_r = 0, A_c = 0,
		B_r = 0, B_c = 0;
	int *A,*B,*C;
 	 
	int N ;
 	
 	if(my_rank==0){
 		printf("Welcome to the matrix-vector multiplication program (Allgather version)!\n");
		FILE *file= fopen("test.txt","r+"); //readFromFile
		if(file!=NULL){
			
			fscanf(file,"%d",&N);
			 A_r = N ;A_c = N ;
			 B_r = N ;B_c = 1 ;
			// check that the matrix dimension is divisible by the number of processes
			if (A_c%size != 0)
		 	{
		 		printf("Matrix dimension must be divisible by the number of MPI tasks. Quitting...\n");
				MPI_Abort(MPI_COMM_WORLD, 0);
				exit(1);
		 	}
		 	// check that  A  [n*n], B  [n*1]
			if(A_c==B_r && A_c==A_r && B_c == 1){
				arr_alloc_file_input(file,&A,N,(N+1)); 
				arr_alloc(&C,A_r,B_c);			 
			}else{
				printf("Invalid matrix dimensions");
				MPI_Abort(MPI_COMM_WORLD, 0);
				exit(1);
			}
			fclose(file);
		}else{printf("unable to open file");}
 		printf("------ A ---------\n");
		printMatrix(A , N ,N+1);
		 
 		//scanf("%d",&N);
 
 	}
 	 
 	/* Bcast dimentions*/
 	MPI_Bcast (&A_c, 1, MPI_INT, 0, MPI_COMM_WORLD);
 	MPI_Bcast (&A_r, 1, MPI_INT, 0, MPI_COMM_WORLD);
 	MPI_Bcast (&B_c, 1, MPI_INT, 0, MPI_COMM_WORLD);
 	MPI_Bcast (&B_r, 1, MPI_INT, 0, MPI_COMM_WORLD);
 	// allocations
 	if (my_rank!=0){arr_alloc(&A,A_r,(A_r+1));}
 	// all process allocate B to make GatherAll on it
 	arr_alloc(&B,B_r,B_c);

 	// n is the matrix dimension
 	int n = A_r ;
   
   /**
 	* Scatter A into smallA and collect smallB from smallA
 	* 
 	* smallA.size = nRows*(n+1)
 	* nRows = n/size ;
 	* 
 	* smallB.size = n/size ;
 	**/

 	//allocate
 	int *smallA , *smallB;
 	int nRows = n/size ;
 	int smallA_size = nRows*(n+1);
 	int smallB_size = n/size ;

 	arr_alloc(&smallA, smallA_size, 1);
 	arr_alloc(&smallB, smallB_size, 1);
 	
 	//Scatter A
 	MPI_Scatter(A,smallA_size,MPI_INT,smallA,smallA_size,MPI_INT,0,MPI_COMM_WORLD);
 	
 	// collect smallB in each process
 	// index : position of the merged B element inside each row of smallA
 	int j , index=n ;
 	for ( j = 0; j < nRows; ++j)   /* nRows entries; <= would write past smallB */
 	{
 		smallB[j] = smallA[index];
 		index+= (n+1) ;
 	}

 	// printf("\n---------P%d-----smallA-------\n",my_rank);
 	// printMatrix(smallA, 1, smallA_size);


 	// printf("\n---------P%d-----smallB-------\n",my_rank);
 	// printMatrix(smallB, 1, smallB_size);

   /**
   	* each process Allgathers its smallB so that every process holds the full B
   	* B was already allocated above to receive the Allgather result
   	* 
   	* prepare matrix smallC to store the local result
   	* smallC.size = smallB.size = n/size ;
   	*
   	**/
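   	/*
   	 * Illustration (continuing the n = 4, size = 2 example): after the
   	 * MPI_Allgather below every process holds B = [smallB of rank 0,
   	 * smallB of rank 1], i.e. the blocks concatenated in rank order,
   	 * which reassembles the last column of the input matrix.
   	 */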

   	// allocation
	int *smallC ,smallC_size=n/size;
	
	
	arr_alloc(&smallC, smallC_size, 1);
	arr_init(smallC, smallC_size, 1, 0);


	MPI_Allgather( smallB , smallB_size,MPI_INT,B,smallB_size,MPI_INT,MPI_COMM_WORLD);
	

	if (my_rank==0)		
	{
		 printf("\n---------B after Allgather-------\n",my_rank);
 		 printMatrix(B, 1, n);

	}

 
  
 
	for ( j = 0; j < nRows; ++j) // for each row in smallA
	{
		int sum = 0 ;
		int j2 = 0 ; 
		for ( j2 = 0; j2 < n; ++j2)
		{
			sum+=(smallA[j2+(j*(n+1))] * B[j2]);
			/*
				smallA[j2+(j*(n+1))]
				  j2 -> step along the columns
				  (j*(n+1)) -> jump to row j, skipping the merged B column
			*/
			//printf("P%d: [%d - %d]\n",my_rank,smallA[j2+(j*(n+1))], B[j2] );
			
			
		}
		smallC[j] = sum ;
 	}
 

	// collect smallC
	MPI_Gather(smallC , smallC_size , MPI_INT , C , smallC_size , MPI_INT , 0 , MPI_COMM_WORLD);

	if (my_rank==0)
 	{
 		printf("------ Final C -------\n");
		printMatrix( C, A_r,B_c); 
 	} 


	MPI_Finalize();
 	return 0; 	
 }
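/*
 * Usage note (inferred from the code above, not from any accompanying
 * documentation): start the program with a process count that divides N,
 * e.g. "mpirun -np 4 ./a.out", and supply a test.txt whose first value is N
 * followed by the N*(N+1) entries that arr_alloc_file_input is expected to
 * read for the augmented matrix.
 */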
Esempio n. 29
0
PetscErrorCode ConvertMatToMatrix(MPI_Comm comm, Mat A,Mat AT,matrix **B)
{
  matrix                  *M;
  int                     i,j,col;
  int                     row_indx;
  int                     len,pe,local_indx,start_indx;
  int                     *mapping;
  PetscErrorCode          ierr;
  const int               *cols;
  const double            *vals;
  int                     n,mnl,nnl,nz,rstart,rend;
  PetscMPIInt             size,rank;
  struct compressed_lines *rows;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatGetSize(A,&n,&n);CHKERRQ(ierr);
  ierr = MatGetLocalSize(A,&mnl,&nnl);CHKERRQ(ierr);

  /*
    not sure why a barrier is required. commenting out
  ierr = MPI_Barrier(comm);CHKERRQ(ierr);
  */

  M = new_matrix((SPAI_Comm)comm);

  M->n              = n;
  M->bs             = 1;
  M->max_block_size = 1;

  M->mnls          = (int*)malloc(sizeof(int)*size);
  M->start_indices = (int*)malloc(sizeof(int)*size);
  M->pe            = (int*)malloc(sizeof(int)*n);
  M->block_sizes   = (int*)malloc(sizeof(int)*n);
  for (i=0; i<n; i++) M->block_sizes[i] = 1;

  ierr = MPI_Allgather(&mnl,1,MPI_INT,M->mnls,1,MPI_INT,comm);CHKERRQ(ierr);

  M->start_indices[0] = 0;
  for (i=1; i<size; i++) M->start_indices[i] = M->start_indices[i-1] + M->mnls[i-1];

  M->mnl            = M->mnls[M->myid];
  M->my_start_index = M->start_indices[M->myid];

  for (i=0; i<size; i++) {
    start_indx = M->start_indices[i];
    for (j=0; j<M->mnls[i]; j++) M->pe[start_indx+j] = i;
  }

  if (AT) {
    M->lines = new_compressed_lines(M->mnls[rank],1);CHKERRQ(ierr);
  } else {
    M->lines = new_compressed_lines(M->mnls[rank],0);CHKERRQ(ierr);
  }

  rows = M->lines;

  /* Determine the mapping from global indices to pointers */
  ierr       = PetscMalloc1(M->n,&mapping);CHKERRQ(ierr);
  pe         = 0;
  local_indx = 0;
  for (i=0; i<M->n; i++) {
    if (local_indx >= M->mnls[pe]) {
      pe++;
      local_indx = 0;
    }
    mapping[i] = local_indx + M->start_indices[pe];
    local_indx++;
  }

  /*********************************************************/
  /************** Set up the row structure *****************/
  /*********************************************************/

  ierr = MatGetOwnershipRange(A,&rstart,&rend);CHKERRQ(ierr);
  for (i=rstart; i<rend; i++) {
    row_indx = i - rstart;
    ierr     = MatGetRow(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
    /* allocate buffers */
    rows->ptrs[row_indx] = (int*)malloc(nz*sizeof(int));
    rows->A[row_indx]    = (double*)malloc(nz*sizeof(double));
    /* copy the matrix */
    for (j=0; j<nz; j++) {
      col = cols[j];
      len = rows->len[row_indx]++;

      rows->ptrs[row_indx][len] = mapping[col];
      rows->A[row_indx][len]    = vals[j];
    }
    rows->slen[row_indx] = rows->len[row_indx];

    ierr = MatRestoreRow(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
  }


  /***********************************************************/
  /************** Set up the column structure ****************/
  /***********************************************************/

  if (AT) {

    for (i=rstart; i<rend; i++) {
      row_indx = i - rstart;
      ierr     = MatGetRow(AT,i,&nz,&cols,&vals);CHKERRQ(ierr);
      /* allocate buffers */
      rows->rptrs[row_indx] = (int*)malloc(nz*sizeof(int));
      /* copy the matrix (i.e., the structure) */
      for (j=0; j<nz; j++) {
        col = cols[j];
        len = rows->rlen[row_indx]++;

        rows->rptrs[row_indx][len] = mapping[col];
      }
      ierr = MatRestoreRow(AT,i,&nz,&cols,&vals);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(mapping);CHKERRQ(ierr);

  order_pointers(M);
  M->maxnz = calc_maxnz(M);
  *B       = M;
  PetscFunctionReturn(0);
}
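
/*
 * Minimal sketch (not part of the SPAI interface above; the function name is
 * invented) of the MatGetOwnershipRange / MatGetRow / MatRestoreRow traversal
 * that the conversion routine relies on.  It simply counts the nonzeros of
 * the locally owned rows.
 */
static PetscErrorCode CountLocalNonzeros(Mat A,PetscInt *nz_local)
{
  PetscErrorCode    ierr;
  PetscInt          i,rstart,rend,nz;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  *nz_local = 0;
  ierr = MatGetOwnershipRange(A,&rstart,&rend);CHKERRQ(ierr);
  for (i=rstart; i<rend; i++) {
    ierr = MatGetRow(A,i,&nz,&cols,&vals);CHKERRQ(ierr);     /* borrow row i */
    *nz_local += nz;
    ierr = MatRestoreRow(A,i,&nz,&cols,&vals);CHKERRQ(ierr); /* give it back */
  }
  PetscFunctionReturn(0);
}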
Esempio n. 30
0
int main(int argc, char **argv) {

  const int MAX_ITER  = 20;
  const double TOL = 1e-12;
  
  int rank;
  int size;
  int P = 8; // number of blocks to update P <= size

  /* -----------------------------------
     mode controls the selection schemes, 
       mode =0, fixed P
       mode =1, dynamic update P
     ----------------------------------*/
  int mode=1; // number of processors used to update each time
  double lambda = 0.1;
  srand (time(NULL));
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Determine current running process
  MPI_Comm_size(MPI_COMM_WORLD, &size); // Total number of processes
  
  // data directory (you need to change the path to your own data directory)
  char* dataCenterDir = "../Data/Gaussian";
  char* big_dir;
  if(argc==2)
    big_dir = argv[1];
  else
    big_dir = "big1";

  /* Read in local data */
  
  FILE *f, *test;
  int m, n, j;
  int row, col;
  double entry, startTime, endTime;
  double total_start_time, total_end_time;
  /*
   * Subsystem n will look for files called An.dat and bn.dat
   * in the current directory; these are its local data and do not need to be
   * visible to any other processes. Note that
   * m and n here refer to the dimensions of the *local* coefficient matrix.
   */
  
  /* ------------
     Read in A 
     ------------*/
  if(rank ==0){
    printf("=============================\n");
    printf("|    Start to load data!     |\n");
    printf("=============================\n");
  }
  char s[100];
  sprintf(s, "%s/%s/A%d.dat",dataCenterDir,big_dir, rank + 1);
  printf("[%d] reading %s\n", rank, s);
  f = fopen(s, "r");
  if (f == NULL) {
    printf("[%d] ERROR: %s does not exist, exiting.\n", rank, s);
    exit(EXIT_FAILURE);
  }
  mm_read_mtx_array_size(f, &m, &n);
  gsl_matrix *A = gsl_matrix_calloc(m, n);
  for (int i = 0; i < m*n; i++) {
    row = i % m;
    col = floor(i/m);
    fscanf(f, "%lf", &entry);
    gsl_matrix_set(A, row, col, entry);
  }
  fclose(f);
  
  /* ------------
      Read in b 
     -------------*/
  sprintf(s, "%s/%s/b.dat", dataCenterDir, big_dir);
  printf("[%d] reading %s\n", rank, s);
  f = fopen(s, "r");
  if (f == NULL) {
    printf("[%d] ERROR: %s does not exist, exiting.\n", rank, s);
    exit(EXIT_FAILURE);
  }
  mm_read_mtx_array_size(f, &m, &n);
  gsl_vector *b = gsl_vector_calloc(m);
  for (int i = 0; i < m; i++) {
    fscanf(f, "%lf", &entry);
    gsl_vector_set(b, i, entry);
  }
  fclose(f);
  
  /* ------------
     Read in xs 
     ------------*/
  sprintf(s, "%s/%s/xs%d.dat", dataCenterDir, big_dir, rank + 1);
  printf("[%d] reading %s\n", rank, s);
  f = fopen(s, "r");
  if (f == NULL) {
    printf("[%d] ERROR: %s does not exist, exiting.\n", rank, s);
    exit(EXIT_FAILURE);
  }
  mm_read_mtx_array_size(f, &m, &n);
  gsl_vector *xs = gsl_vector_calloc(m);
  
  for (int i = 0; i < m; i++) {
    fscanf(f, "%lf", &entry);
    gsl_vector_set(xs, i, entry);
  }
  fclose(f);
  
  m = A->size1;
  n = A->size2;
  MPI_Barrier(MPI_COMM_WORLD);
  
  /*----------------------------------------
   * These are all variables related to GRock
   ----------------------------------------*/
  
  struct value table[size];
  gsl_vector *x        = gsl_vector_calloc(n);
  gsl_vector *As       = gsl_vector_calloc(n);
  gsl_vector *invAs    = gsl_vector_calloc(n);
  gsl_vector *local_b  = gsl_vector_calloc(m);
  gsl_vector *beta     = gsl_vector_calloc(n);
  gsl_vector *tmp      = gsl_vector_calloc(n);
  gsl_vector *d        = gsl_vector_calloc(n);
  gsl_vector *absd     = gsl_vector_calloc(n);
  gsl_vector *oldx     = gsl_vector_calloc(n);
  gsl_vector *tmpx     = gsl_vector_calloc(n);
  gsl_vector *z        = gsl_vector_calloc(m);
  gsl_vector *tmpz     = gsl_vector_calloc(m);
  gsl_vector *Ax       = gsl_vector_calloc(m);
  gsl_vector *Atmpx    = gsl_vector_calloc(m);
  gsl_vector *xdiff    = gsl_vector_calloc(n);
  gsl_permutation *idx = gsl_permutation_calloc(n);
  double send[1]; 
  double recv[1]; 
  double err;

  int num_upd = (int)(n*0.08);
  double sigma = 0.01;

  double xs_local_nrm[1], xs_nrm[1];
  double local_old_obj, global_old_obj, local_new_obj, global_new_obj;
  //calculate the 2 norm of xs
  xs_local_nrm[0] = gsl_blas_dnrm2(xs);
  xs_local_nrm[0] *=xs_local_nrm[0];
  MPI_Allreduce(xs_local_nrm, xs_nrm, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  xs_nrm[0] = sqrt(xs_nrm[0]);
  
  // evaluate the two norm of the columns of A
  for(j=0;j<n;j++){
    gsl_vector_view column = gsl_matrix_column(A, j);
    double d;
    d = gsl_blas_dnrm2(&column.vector);
    gsl_vector_set(As, j, d*d);
    gsl_vector_set(invAs, j, 1./(d*d));
  }
  
  if (rank == 0) {
    printf("=============================\n");
    printf("|GRock start to solve Lasso!|\n");
    printf("|---------------------------|\n");
    printf("|lambda=%1.2f, m=%d, n=%d  |\n", lambda, m, n*size);
    if(mode==1) printf("| Mode: dynamic update P.   |\n");
    else  printf("|   Mode: fixed update P    |\n");
    printf("=============================\n");
    printf("%3s %8s %8s %5s\n", "iter", "rel_err", "obj", "P");
    startTime = MPI_Wtime();
    sprintf(s, "results/test%d.m", size);
    test = fopen(s, "w");
    fprintf(test,"res = [ \n");
  }
  
  /* Main BCD loop */
  total_start_time = MPI_Wtime();
  int iter = 0;
  while (iter < MAX_ITER) {
    startTime = MPI_Wtime();

    /*---------- restore the old x ------------*/
    gsl_vector_memcpy(oldx, x);
    
    /*------- calculate local_b = b - sum_{j \neq i} Aj*xj--------- */ 
    gsl_blas_dgemv(CblasNoTrans, 1, A, x, 0, Ax); // Ax = A * x
    MPI_Allreduce(Ax->data, z->data,  m, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    gsl_vector_sub(z, b); // z = Ax - b
    gsl_vector_memcpy(local_b, Ax);
    gsl_vector_sub(local_b, z);
    
    /* -------calculate beta ------------------*/
    gsl_blas_dgemv(CblasTrans, -1, A, z, 0, beta); // beta = A'(b - Ax) + ||A_s||^2 .* x, assembled over the next lines
    gsl_vector_memcpy(tmp, As);    
    pointwise(tmp, x, n);
    gsl_vector_add(beta, tmp);
    shrink(beta, lambda);
    // x = 1/|xs|^2 * shrink(beta, lambda)
    gsl_vector_memcpy(x, beta);
    pointwise(x, invAs, n); 
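    /* In closed form the block above implements, per coordinate s,
       x_s = (1/||A_s||^2) * S_lambda( A_s'(b - A*x_old) + ||A_s||^2 * x_old_s ),
       assuming shrink() is the usual soft-thresholding
       S_lambda(t) = sign(t) * max(|t| - lambda, 0). */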
  
    /* ------calculate the proposed decrease -------- */
    gsl_vector_memcpy(d,x);
    gsl_vector_sub(d, oldx);
    if(mode ==1){
      gsl_vector_memcpy(absd, d);
      abs_vector(absd, n);
      // sort the local array d
      gsl_vector_scale(absd, -1.0);
      gsl_sort_vector_index(idx, absd);

      //    printf("|d(0)| = %lf, |d(1)| = %lf \n", gsl_vector_get(absd,0), gsl_vector_get(absd, 3));
      // calculate current objective value;
      local_old_obj = objective(oldx, lambda, z, size);
      MPI_Allreduce(&local_old_obj, &global_old_obj, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
      num_upd = fmin(num_upd+1, (int)(0.1*n));    
      gsl_vector_memcpy(tmpx, oldx);
      int upd_idx;
      double local_delta = 0, delta=0.0;
      for(int i=0; i<num_upd; i++){
	upd_idx = gsl_permutation_get(idx, i);
	//      printf("%d\n", upd_idx);
	gsl_vector_set(tmpx, upd_idx, gsl_vector_get(x, upd_idx));
	local_delta += gsl_vector_get(d, upd_idx) * gsl_vector_get(d, upd_idx);
      }
      MPI_Allreduce(&local_delta, &delta,  1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);    
      gsl_blas_dgemv(CblasNoTrans, 1, A, tmpx, 0, Atmpx); // Ax = A * x
      MPI_Allreduce(Atmpx->data, tmpz->data,  m, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
      gsl_vector_sub(tmpz, b); // z = Ax - b
    
      local_new_obj = objective(tmpx, lambda, tmpz, size);
      MPI_Allreduce(&local_new_obj, &global_new_obj, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

      while(global_new_obj - global_old_obj> -sigma * delta){
	num_upd = fmax(num_upd-1, 1);
	/* rebuild the trial point and the proposed decrease for the smaller num_upd */
	gsl_vector_memcpy(tmpx, oldx);
	local_delta = 0;
	for(int i=0; i<num_upd; i++){
	  upd_idx = gsl_permutation_get(idx, i);
	  gsl_vector_set(tmpx, upd_idx, gsl_vector_get(x, upd_idx));
	  local_delta += gsl_vector_get(d, upd_idx) * gsl_vector_get(d, upd_idx);
	}
	MPI_Allreduce(&local_delta, &delta,  1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
	gsl_blas_dgemv(CblasNoTrans, 1, A, tmpx, 0, Atmpx); // Ax = A * x
	MPI_Allreduce(Atmpx->data, tmpz->data,  m, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
	gsl_vector_sub(tmpz, b); // z = Ax - b
	
	local_new_obj = objective(tmpx, lambda, tmpz, size);
	MPI_Allreduce(&local_new_obj, &global_new_obj, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
	
	if(num_upd==1)
	  break;
      }

      gsl_vector_memcpy(x, tmpx);
    }  

    if(mode==0){
      CBLAS_INDEX_t id = gsl_blas_idamax(d);
      double *store = (double*)calloc(size, sizeof(double));
      double foo[1];
      foo[0] = gsl_vector_get(d,id);
      MPI_Allgather(foo, 1, MPI_DOUBLE, store, 1, MPI_DOUBLE, MPI_COMM_WORLD);
      for(int i=0;i<size;i++){
	table[i].ID   = i;
	table[i].data = fabs(store[i]);
      }
      // quick sort to decide which block to update
      qsort((void *) & table, size, sizeof(struct value), (compfn)compare );
      gsl_vector_memcpy(x, oldx);
      
      if(size>P){
	for(int i=0;i<P;i++){
	  if(rank == table[i].ID)
	    gsl_vector_set(x, id, gsl_vector_get(oldx, id) + gsl_vector_get(d, id));
	}
      }else
	gsl_vector_set(x, id, gsl_vector_get(oldx, id) + gsl_vector_get(d, id));
      local_new_obj = objective(x, lambda, z, size);
      MPI_Allreduce(&local_new_obj, &global_new_obj, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
      free(store); /* avoid leaking the per-iteration gather buffer */
    }
    
    /*------------------------------
      calculate the relative error
      ------------------------------*/
    gsl_vector_memcpy(xdiff,xs);
    gsl_vector_sub(xdiff, x);
    err = gsl_blas_dnrm2(xdiff);
    send[0] = err*err;
    MPI_Allreduce(send, recv, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    recv[0] = sqrt(recv[0])/xs_nrm[0];
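    /* i.e. recv[0] = sqrt( sum over ranks of ||x_local - xs_local||^2 ) / ||xs||,
       the global relative error assembled from the per-block squared norms */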
 
    endTime = MPI_Wtime();
    if(mode==1) P = num_upd*size;
    if (rank == 0) {
      if(iter%5 == 0)
	printf("%3d %10.2e %10.4f %3d\n", iter,
	       recv[0],  global_new_obj, P);
      fprintf(test, "%e \n",recv[0]);
    }

    /* termination check */
    if(recv[0] < TOL){
      break;
    }
    iter++;
  }
  total_end_time = MPI_Wtime();  
  /* Have the master write out the results to disk */
  if (rank == 0) {
    printf("=============================\n");
    printf("|    GRock solved Lasso!    |\n");
    printf("|---------------------------|\n");
    printf("|Summary:                   |\n");
    printf("|   # of iteration: %d      |\n", iter);
    printf("|   relative error: %4.2e|\n", recv[0]);
    printf("|  objective value: %4.2f    |\n", global_new_obj);
    printf("|             time: %4.1es|\n", total_end_time - total_start_time);
    printf("=============================\n");
    
    fprintf(test,"] \n");
    fprintf(test,"semilogy(1:length(res),res); \n");
    fprintf(test,"xlabel('# of iteration'); ylabel('||x - xs||');\n");
    fclose(test);
    f = fopen("results/solution.dat", "w");
    fprintf(f,"x = [ \n");
    gsl_vector_fprintf(f, x, "%lf");
    fprintf(f,"] \n");
    fclose(f);
    endTime = MPI_Wtime();
  }
  
  MPI_Finalize(); /* Shut down the MPI execution environment */
  
  /* Clear memory */
  gsl_matrix_free(A);
  gsl_vector_free(b);
  gsl_vector_free(x);
  gsl_vector_free(z);
  gsl_vector_free(xdiff);
  gsl_vector_free(Ax);
  gsl_vector_free(As);
  gsl_vector_free(invAs);
  gsl_vector_free(tmpx);
  gsl_vector_free(oldx);
  gsl_vector_free(local_b);
  gsl_vector_free(beta);
  gsl_vector_free(tmpz);
  gsl_vector_free(absd);
  gsl_vector_free(Atmpx);
  gsl_permutation_free(idx);

  return 0;
}