/* PMPI interposition wrapper: log every MPI_Address call (caller's rank,
 * the queried location, and the address of the output argument) to stderr,
 * then forward to the real implementation through its PMPI entry point. */
int MPI_Address (void *location, MPI_Aint *address)
{
  int rank;

  PMPI_Comm_rank (MPI_COMM_WORLD, &rank);
  fprintf (stderr, "MPI_ADDRESS[%d]: location %0" PRIxPTR " address %0" PRIxPTR "\n",
           rank, (uintptr_t) location, (uintptr_t) address);
  fflush (stderr);
  return PMPI_Address (location, address);
}
/* Minimal variant of the same wrapper: no tracing, just a straight
 * pass-through to PMPI_Address. Only one of the two definitions can be
 * linked into a given tool library. */
int MPI_Address (void *location, MPI_Aint *address)
{
  return PMPI_Address (location, address);
}
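/* Hypothetical usage sketch (not part of the original sources): a driver
 * that calls MPI_Address once so the interposed wrapper above fires. The
 * wrapper object name below is made up; it must be linked ahead of the
 * MPI library, e.g.
 *   mpicc mpi_address_wrapper.o driver.c -o driver
 * Note that MPI_Address is deprecated (removed in MPI-3.0), so this only
 * builds against MPI installations that still ship the legacy symbol. */
#include <mpi.h>
#include <stdio.h>

int main (int argc, char **argv)
{
  int value = 42;
  MPI_Aint addr;

  MPI_Init (&argc, &argv);
  MPI_Address (&value, &addr);  /* routed through the wrapper above */
  printf ("MPI_Address returned %ld\n", (long) addr);
  MPI_Finalize ();
  return 0;
}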
/*
 * mpiPi_collect_basics() - all tasks send their basic info to the
 * collectorRank.
 */
void
mpiPi_collect_basics ()
{
  int i = 0;
  double app_time = mpiPi.cumulativeTime;
  int cnt;
  mpiPi_task_info_t mti;
  int blockcounts[4] = { 1, 1, 1, MPIPI_HOSTNAME_LEN_MAX };
  MPI_Datatype types[4] = { MPI_DOUBLE, MPI_DOUBLE, MPI_INT, MPI_CHAR };
  MPI_Aint displs[4];
  MPI_Datatype mti_type;
  MPI_Request *recv_req_arr;

  mpiPi_msg_debug ("Collect Basics\n");

  cnt = 0;
  PMPI_Address (&mti.mpi_time, &displs[cnt++]);
  PMPI_Address (&mti.app_time, &displs[cnt++]);
  PMPI_Address (&mti.rank, &displs[cnt++]);
  PMPI_Address (&mti.hostname, &displs[cnt++]);

  /* Convert the absolute addresses into offsets relative to the first
     member; displs[0] must be adjusted last, hence the descending loop. */
  for (i = (cnt - 1); i >= 0; i--)
    {
      displs[i] -= displs[0];
    }

  PMPI_Type_struct (cnt, blockcounts, displs, types, &mti_type);
  PMPI_Type_commit (&mti_type);

  if (mpiPi.rank == mpiPi.collectorRank)
    {
      /* In the case where multiple reports are generated per run,
         only allocate memory for global_task_info once */
      if (mpiPi.global_task_info == NULL)
        {
          mpiPi.global_task_info =
            (mpiPi_task_info_t *) calloc (mpiPi.size,
                                          sizeof (mpiPi_task_info_t));
          if (mpiPi.global_task_info == NULL)
            mpiPi_abort ("Failed to allocate memory for global_task_info");

          mpiPi_msg_debug ("MEMORY : Allocated for global_task_info : %13ld\n",
                           mpiPi.size * sizeof (mpiPi_task_info_t));
        }
      bzero (mpiPi.global_task_info, mpiPi.size * sizeof (mpiPi_task_info_t));

      recv_req_arr =
        (MPI_Request *) malloc (sizeof (MPI_Request) * mpiPi.size);
      for (i = 0; i < mpiPi.size; i++)
        {
          mpiPi_task_info_t *p = &mpiPi.global_task_info[i];
          if (i != mpiPi.collectorRank)
            {
              PMPI_Irecv (p, 1, mti_type, i, mpiPi.tag, mpiPi.comm,
                          &(recv_req_arr[i]));
            }
          else
            {
              strcpy (p->hostname, mpiPi.hostname);
              p->app_time = app_time;
              p->rank = mpiPi.rank;
              recv_req_arr[i] = MPI_REQUEST_NULL;
            }
        }
      PMPI_Waitall (mpiPi.size, recv_req_arr, MPI_STATUSES_IGNORE);
      free (recv_req_arr);

      /* task MPI time is calculated from callsites data
         in mpiPi_insert_callsite_records. */
      for (i = 0; i < mpiPi.size; i++)
        mpiPi.global_task_info[i].mpi_time = 0.0;
    }
  else
    {
      strcpy (mti.hostname, mpiPi.hostname);
      mti.app_time = app_time;
      mti.rank = mpiPi.rank;
      PMPI_Send (&mti, 1, mti_type, mpiPi.collectorRank, mpiPi.tag,
                 mpiPi.comm);
    }

  PMPI_Type_free (&mti_type);
  return;
}
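/* Sketch, not from mpiP: the same struct-datatype construction written with
 * the standard MPI replacements (MPI_Get_address, MPI_Type_create_struct)
 * for the deprecated PMPI_Address/PMPI_Type_struct calls used above. The
 * type name and the hostname length 64 are illustrative stand-ins for
 * mpiPi_task_info_t and MPIPI_HOSTNAME_LEN_MAX. */
#include <mpi.h>

typedef struct
{
  double mpi_time;
  double app_time;
  int rank;
  char hostname[64];
} task_info_t;

static MPI_Datatype
make_task_info_type (void)
{
  task_info_t proto;
  int blockcounts[4] = { 1, 1, 1, 64 };
  MPI_Datatype types[4] = { MPI_DOUBLE, MPI_DOUBLE, MPI_INT, MPI_CHAR };
  MPI_Aint displs[4];
  MPI_Datatype newtype;
  int i;

  MPI_Get_address (&proto.mpi_time, &displs[0]);
  MPI_Get_address (&proto.app_time, &displs[1]);
  MPI_Get_address (&proto.rank, &displs[2]);
  MPI_Get_address (&proto.hostname, &displs[3]);

  /* Normalize to offsets relative to the first member, exactly as the
     descending loop in mpiPi_collect_basics does. */
  for (i = 3; i >= 0; i--)
    displs[i] -= displs[0];

  MPI_Type_create_struct (4, blockcounts, displs, types, &newtype);
  MPI_Type_commit (&newtype);
  return newtype;
}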
static int two_phase_read_and_exch(mca_io_ompio_file_t *fh,
                                   void *buf,
                                   MPI_Datatype datatype,
                                   mca_io_ompio_access_array_t *others_req,
                                   struct iovec *offset_len,
                                   int contig_access_count,
                                   OMPI_MPI_OFFSET_TYPE min_st_offset,
                                   OMPI_MPI_OFFSET_TYPE fd_size,
                                   OMPI_MPI_OFFSET_TYPE *fd_start,
                                   OMPI_MPI_OFFSET_TYPE *fd_end,
                                   Flatlist_node *flat_buf,
                                   size_t *buf_idx, int striping_unit,
                                   int two_phase_num_io_procs,
                                   int *aggregator_list)
{
    int ret = OMPI_SUCCESS, i = 0, j = 0, ntimes = 0, max_ntimes = 0;
    int m = 0;
    int *curr_offlen_ptr = NULL, *count = NULL, *send_size = NULL, *recv_size = NULL;
    int *partial_send = NULL, *start_pos = NULL, req_len = 0, flag = 0;
    int *recd_from_proc = NULL;
    MPI_Aint buftype_extent = 0;
    size_t byte_size = 0;
    OMPI_MPI_OFFSET_TYPE st_loc = -1, end_loc = -1, off = 0, done = 0, for_next_iter = 0;
    OMPI_MPI_OFFSET_TYPE size = 0, req_off = 0, real_size = 0, real_off = 0, len = 0;
    OMPI_MPI_OFFSET_TYPE for_curr_iter = 0;
    char *read_buf = NULL, *tmp_buf = NULL;
    MPI_Datatype byte = MPI_BYTE;
    int two_phase_cycle_buffer_size = 0;

    opal_datatype_type_size(&byte->super, &byte_size);

    /* Find the first request assigned to this aggregator, then widen
       [st_loc, end_loc] until it covers all of them. */
    for (i = 0; i < fh->f_size; i++) {
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }

    for (i = 0; i < fh->f_size; i++) {
        for (j = 0; j < others_req[i].count; j++) {
            st_loc = OMPIO_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = OMPIO_MAX(end_loc, (others_req[i].offsets[j] +
                                          others_req[i].lens[j] - 1));
        }
    }

    /* The aggregate range is covered in cycles of the configured
       collective buffer size. */
    fh->f_get_bytes_per_agg(&two_phase_cycle_buffer_size);
    ntimes = (int)((end_loc - st_loc + two_phase_cycle_buffer_size) /
                   two_phase_cycle_buffer_size);

    if ((st_loc == -1) && (end_loc == -1)) {
        ntimes = 0;
    }

    fh->f_comm->c_coll.coll_allreduce(&ntimes,
                                      &max_ntimes,
                                      1, MPI_INT,
                                      MPI_MAX,
                                      fh->f_comm,
                                      fh->f_comm->c_coll.coll_allreduce_module);

    if (ntimes) {
        read_buf = (char *) calloc(two_phase_cycle_buffer_size, sizeof(char));
        if (NULL == read_buf) {
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

    curr_offlen_ptr = (int *) calloc(fh->f_size, sizeof(int));
    if (NULL == curr_offlen_ptr) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    count = (int *) calloc(fh->f_size, sizeof(int));
    if (NULL == count) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    partial_send = (int *) calloc(fh->f_size, sizeof(int));
    if (NULL == partial_send) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    send_size = (int *) malloc(fh->f_size * sizeof(int));
    if (NULL == send_size) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    recv_size = (int *) malloc(fh->f_size * sizeof(int));
    if (NULL == recv_size) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    recd_from_proc = (int *) calloc(fh->f_size, sizeof(int));
    if (NULL == recd_from_proc) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    start_pos = (int *) calloc(fh->f_size, sizeof(int));
    if (NULL == start_pos) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    done = 0;
    off = st_loc;
    for_curr_iter = for_next_iter = 0;

    ompi_datatype_type_extent(datatype, &buftype_extent);

    for (m = 0; m < ntimes; m++) {

        size = OMPIO_MIN((unsigned) two_phase_cycle_buffer_size,
                         end_loc - st_loc + 1 - done);
        real_off = off - for_curr_iter;
        real_size = size + for_curr_iter;

        for (i = 0; i < fh->f_size; i++)
            count[i] = send_size[i] = 0;
        for_next_iter = 0;

        for (i = 0; i < fh->f_size; i++) {
            if (others_req[i].count) {
                start_pos[i] = curr_offlen_ptr[i];
                for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) {
                    if (partial_send[i]) {
                        /* this request may have been partially
                           satisfied in the previous iteration. */
                        req_off = others_req[i].offsets[j] + partial_send[i];
                        req_len = others_req[i].lens[j] - partial_send[i];
                        partial_send[i] = 0;
                        /* modify the off-len pair to reflect this change */
                        others_req[i].offsets[j] = req_off;
                        others_req[i].lens[j] = req_len;
                    }
                    else {
                        req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
                    }
                    if (req_off < real_off + real_size) {
                        count[i]++;
                        PMPI_Address(read_buf + req_off - real_off,
                                     &(others_req[i].mem_ptrs[j]));
                        send_size[i] += (int) (OMPIO_MIN(real_off + real_size - req_off,
                                                         (OMPI_MPI_OFFSET_TYPE) req_len));

                        if (real_off + real_size - req_off <
                            (OMPI_MPI_OFFSET_TYPE) req_len) {
                            partial_send[i] = (int) (real_off + real_size - req_off);
                            if ((j + 1 < others_req[i].count) &&
                                (others_req[i].offsets[j + 1] < real_off + real_size)) {
                                /* the next request of this process also
                                   begins inside the current window, so
                                   part of the window must be carried over
                                   into the next iteration. */
                                for_next_iter = OMPIO_MAX(for_next_iter,
                                                          real_off + real_size -
                                                          others_req[i].offsets[j + 1]);
                                /* max because it must cover requests
                                   from different processes */
                            }
                            break;
                        }
                    }
                    else
                        break;
                }
                curr_offlen_ptr[i] = j;
            }
        }

        flag = 0;
        for (i = 0; i < fh->f_size; i++)
            if (count[i])
                flag = 1;

        if (flag) {
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_read_time = MPI_Wtime();
#endif
            len = size * byte_size;
            fh->f_io_array = (mca_io_ompio_io_array_t *)
                calloc(1, sizeof(mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            fh->f_io_array[0].offset = (IOVBASE_TYPE *)(intptr_t) off;
            fh->f_io_array[0].length = len;
            fh->f_io_array[0].memory_address = read_buf + for_curr_iter;
            fh->f_num_of_io_entries = 1;

            if (fh->f_num_of_io_entries) {
                if (0 > fh->f_fbtl->fbtl_preadv(fh)) {
                    opal_output(1, "READ FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }

#if 0
            int ii;
            printf("%d: len/4 : %lld\n", fh->f_rank, len / 4);
            for (ii = 0; ii < len / 4; ii++) {
                printf("%d: read_buf[%d]: %d\n", fh->f_rank, ii,
                       ((int *) read_buf)[ii]);
            }
#endif

            fh->f_num_of_io_entries = 0;
            if (NULL != fh->f_io_array) {
                free(fh->f_io_array);
                fh->f_io_array = NULL;
            }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_read_time = MPI_Wtime();
            read_time += (end_read_time - start_read_time);
#endif
        }

        for_curr_iter = for_next_iter;

        for (i = 0; i < fh->f_size; i++) {
            recv_size[i] = 0;
        }
        two_phase_exchange_data(fh, buf, offset_len,
                                send_size, start_pos, recv_size, count,
                                partial_send, recd_from_proc,
                                contig_access_count,
                                min_st_offset, fd_size, fd_start, fd_end,
                                flat_buf, others_req, m, buf_idx,
                                buftype_extent, striping_unit,
                                two_phase_num_io_procs, aggregator_list);

        if (for_next_iter) {
            /* Carry the tail of this cycle's data over into the buffer
               used for the next cycle. */
            tmp_buf = (char *) calloc(for_next_iter, sizeof(char));
            if (NULL == tmp_buf) {
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            memcpy(tmp_buf, read_buf + real_size - for_next_iter, for_next_iter);
            free(read_buf);
            read_buf = (char *) malloc(for_next_iter + two_phase_cycle_buffer_size);
            if (NULL == read_buf) {
                free(tmp_buf);
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            memcpy(read_buf, tmp_buf, for_next_iter);
            free(tmp_buf);
        }

        off += size;
        done += size;
    }

    for (i = 0; i < fh->f_size; i++)
        count[i] = send_size[i] = 0;

    /* Processes that finished their own cycles early still have to take
       part in the remaining collective exchanges. */
    for (m = ntimes; m < max_ntimes; m++)
        two_phase_exchange_data(fh, buf, offset_len,
                                send_size, start_pos, recv_size, count,
                                partial_send, recd_from_proc,
                                contig_access_count,
                                min_st_offset, fd_size, fd_start, fd_end,
                                flat_buf, others_req, m, buf_idx,
                                buftype_extent, striping_unit,
                                two_phase_num_io_procs, aggregator_list);

exit:
    free(read_buf);
    free(curr_offlen_ptr);
    free(count);
    free(partial_send);
    free(send_size);
    free(recv_size);
    free(recd_from_proc);
    free(start_pos);

    return ret;
}
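/* Standalone sketch (names are illustrative, not from the OMPIO sources)
 * of the tiling arithmetic above: the aggregator covers the aggregate
 * byte range [st_loc, end_loc] in ntimes cycles of at most cycle_buf_size
 * bytes each. */
#include <stdio.h>

static void show_two_phase_tiling (long long st_loc, long long end_loc,
                                   long long cycle_buf_size)
{
    long long extent = end_loc - st_loc + 1;
    /* Same rounding-up division as two_phase_read_and_exch uses:
       (end_loc - st_loc + cycle_buf_size) / cycle_buf_size. */
    long long ntimes = (extent - 1 + cycle_buf_size) / cycle_buf_size;
    long long off = st_loc, done = 0, m;

    for (m = 0; m < ntimes; m++) {
        long long remaining = extent - done;
        long long size = remaining < cycle_buf_size ? remaining : cycle_buf_size;
        printf ("cycle %lld: file offsets [%lld, %lld)\n", m, off, off + size);
        off += size;
        done += size;
    }
}

int main (void)
{
    /* A 10 MB aggregate range with a 4 MB cycle buffer needs 3 cycles. */
    show_two_phase_tiling (0, 10LL * 1024 * 1024 - 1, 4LL * 1024 * 1024);
    return 0;
}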