static int test_upper( unsigned int length ) { ompi_datatype_t *pdt; opal_convertor_t * pConv; int rc = OMPI_SUCCESS; unsigned int i, iov_count, split_chunk, total_length; size_t max_data; struct iovec iov[5]; TIMER_DATA_TYPE start, end; long total_time; printf( "test upper matrix\n" ); pdt = upper_matrix( length ); /*dt_dump( pdt );*/ total_length = length * (length + 1) * ( sizeof(double) / 2); pConv = opal_convertor_create( remote_arch, 0 ); if( OMPI_SUCCESS != opal_convertor_prepare_for_send( pConv, &(pdt->super), 1, NULL ) ) { printf( "Cannot attach the datatype to a convertor\n" ); return OMPI_ERROR; } GET_TIME( start ); split_chunk = (length + 1) * sizeof(double); /* split_chunk = (total_length + 1) * sizeof(double); */ for( i = total_length; i > 0; ) { iov_count = 5; max_data = 0; opal_convertor_raw( pConv, iov, &iov_count, &max_data ); i -= max_data; } GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); printf( "complete raw in %ld microsec\n", total_time ); /* test the automatic destruction pf the data */ ompi_datatype_destroy( &pdt ); assert( pdt == NULL ); OBJ_RELEASE( pConv ); return rc; }
/*
 *	gatherv_inter
 *
 *	Function:	- gatherv operation using a local gather on c_local_comm
 *	Accepts:	- same arguments as MPI_Gatherv()
 *	Returns:	- MPI_SUCCESS or error code
 *
 *	Behaviour by value of root:
 *	- MPI_PROC_NULL: nothing to do, returns OMPI_SUCCESS.
 *	- MPI_ROOT: builds an indexed datatype from rcounts/disps over rdtype
 *	  and receives a single element of it from rank 0 through comm.
 *	- otherwise: every process contributes scount to a gather on
 *	  c_local_comm; local rank 0 derives the displacements, collects the
 *	  data with a local gatherv into a temporary buffer and sends the
 *	  whole buffer to root.
 */
int
mca_coll_inter_gatherv_inter(const void *sbuf, int scount,
                             struct ompi_datatype_t *sdtype,
                             void *rbuf, const int *rcounts, const int *disps,
                             struct ompi_datatype_t *rdtype, int root,
                             struct ompi_communicator_t *comm,
                             mca_coll_base_module_t *module)
{
    int i, rank, size, size_local, total=0, err;
    int *count=NULL, *displace=NULL;
    char *ptmp=NULL;
    MPI_Aint incr;
    MPI_Aint extent;
    MPI_Aint lb;
    ompi_datatype_t *ndtype = NULL;

    if (MPI_PROC_NULL == root) { /* do nothing */
        return OMPI_SUCCESS;
    }
    size = ompi_comm_remote_size(comm);
    rank = ompi_comm_rank(comm);
    size_local = ompi_comm_size(comm);

    if (MPI_ROOT == root) { /* I am the root, receiving the data from zero. */
        /* Do not post the receive with a datatype that failed to build. */
        err = ompi_datatype_create_indexed(size, rcounts, disps, rdtype, &ndtype);
        if (OMPI_SUCCESS != err) {
            return err;
        }
        err = ompi_datatype_commit(&ndtype);
        if (OMPI_SUCCESS != err) {
            ompi_datatype_destroy(&ndtype);
            return err;
        }

        err = MCA_PML_CALL(recv(rbuf, 1, ndtype, 0,
                                MCA_COLL_BASE_TAG_GATHERV,
                                comm, MPI_STATUS_IGNORE));
        ompi_datatype_destroy(&ndtype);
        return err;
    }

    /* Only local rank 0 needs the per-process counts and displacements. */
    if (0 == rank) {
        count = (int *)malloc(sizeof(int) * size_local);
        displace = (int *)malloc(sizeof(int) * size_local);
        if ((NULL == displace) || (NULL == count)) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

    err = comm->c_local_comm->c_coll.coll_gather(&scount, 1, MPI_INT,
                                                 count, 1, MPI_INT,
                                                 0, comm->c_local_comm,
                                                 comm->c_local_comm->c_coll.coll_gather_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    if (0 == rank) {
        displace[0] = 0;
        for (i = 1; i < size_local; i++) {
            displace[i] = displace[i-1] + count[i-1];
        }
        /* Perform the gatherv locally with the first process as root */
        err = ompi_datatype_get_extent(sdtype, &lb, &extent);
        if (OMPI_SUCCESS != err) {
            err = OMPI_ERROR;
            goto exit;
        }
        /* Size of the staging buffer: extent of sdtype times all counts. */
        incr = 0;
        for (i = 0; i < size_local; i++) {
            incr = incr + extent*count[i];
        }
        if ( incr > 0 ) {
            ptmp = (char*)malloc(incr);
            if (NULL == ptmp) {
                err = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }
    }

    err = comm->c_local_comm->c_coll.coll_gatherv(sbuf, scount, sdtype,
                                                  ptmp, count, displace,
                                                  sdtype,0, comm->c_local_comm,
                                                  comm->c_local_comm->c_coll.coll_gatherv_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    if (0 == rank) {
        for (i = 0; i < size_local; i++) {
            total = total + count[i];
        }
        /* First process sends data to the root */
        err = MCA_PML_CALL(send(ptmp, total, sdtype, root,
                                MCA_COLL_BASE_TAG_GATHERV,
                                MCA_PML_BASE_SEND_STANDARD, comm));
    }

 exit:
    if (NULL != ptmp) {
        free(ptmp);
    }
    if (NULL != displace) {
        free(displace);
    }
    if (NULL != count) {
        free(count);
    }

    /* All done */
    return err;
}
int mca_fcoll_static_file_read_all (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { int ret = OMPI_SUCCESS, iov_size=0, *bytes_remaining=NULL; int i, j, l,cycles=0, local_cycles=0, *current_index=NULL; int index, *disp_index=NULL, *bytes_per_process=NULL, current_position=0; int **blocklen_per_process=NULL, *iovec_count_per_process=NULL; int *displs=NULL, *sorted=NULL ,entries_per_aggregator=0; int *sorted_file_offsets=NULL, temp_index=0, position=0, *temp_disp_index=NULL; MPI_Aint **displs_per_process=NULL, global_iov_count=0, global_count=0; MPI_Aint *memory_displacements=NULL; int bytes_to_read_in_cycle=0; size_t max_data=0, bytes_per_cycle=0; uint32_t iov_count=0, iov_index=0; struct iovec *decoded_iov=NULL, *iov=NULL; mca_fcoll_static_local_io_array *local_iov_array=NULL, *global_iov_array=NULL; mca_fcoll_static_local_io_array *file_offsets_for_agg=NULL; char *global_buf=NULL, *receive_buf=NULL; int blocklen[3] = {1, 1, 1}; int static_num_io_procs=1; OPAL_PTRDIFF_TYPE d[3], base; ompi_datatype_t *types[3]; ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL; ompi_datatype_t **sendtype = NULL; MPI_Request *send_req=NULL, recv_req=NULL; int my_aggregator=-1; bool recvbuf_is_contiguous=false; size_t ftype_size; OPAL_PTRDIFF_TYPE ftype_extent, lb; #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0; double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0; double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0; mca_common_ompio_print_entry nentry; #endif #if DEBUG_ON MPI_Aint gc_in; #endif opal_datatype_type_size ( &datatype->super, &ftype_size ); opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent ); /************************************************************************** ** 1. 
In case the data is not contigous in memory, decode it into an iovec **************************************************************************/ if ( ( ftype_extent == (OPAL_PTRDIFF_TYPE) ftype_size) && opal_datatype_is_contiguous_memory_layout(&datatype->super,1) && 0 == lb ) { recvbuf_is_contiguous = true; } /* In case the data is not contigous in memory, decode it into an iovec */ if (!recvbuf_is_contiguous ) { fh->f_decode_datatype ( (struct mca_io_ompio_file_t *)fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); } else { max_data = count * datatype->super.size; } if ( MPI_STATUS_IGNORE != status ) { status->_ucount = max_data; } fh->f_get_num_aggregators ( &static_num_io_procs ); fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh, static_num_io_procs, max_data); my_aggregator = fh->f_procs_in_group[fh->f_aggregator_index]; /* printf("max_data %ld\n", max_data); */ ret = fh->f_generate_current_file_view((struct mca_io_ompio_file_t *)fh, max_data, &iov, &iov_size); if (ret != OMPI_SUCCESS){ goto exit; } if ( iov_size > 0 ) { local_iov_array = (mca_fcoll_static_local_io_array *)malloc (iov_size * sizeof(mca_fcoll_static_local_io_array)); if ( NULL == local_iov_array){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (j=0; j < iov_size; j++){ local_iov_array[j].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) iov[j].iov_base; local_iov_array[j].length = (size_t)iov[j].iov_len; local_iov_array[j].process_id = fh->f_rank; } } else { /* Allocate at least one element to correctly create the derived data type */ local_iov_array = (mca_fcoll_static_local_io_array *)malloc (sizeof(mca_fcoll_static_local_io_array)); if ( NULL == local_iov_array){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } local_iov_array[0].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) 0; local_iov_array[0].length = (size_t) 0; local_iov_array[0].process_id = fh->f_rank; } d[0] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0]; d[1] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].length; d[2] = 
(OPAL_PTRDIFF_TYPE)&local_iov_array[0].process_id; base = d[0]; for (i=0 ; i<3 ; i++) { d[i] -= base; } /* io_array datatype for using in communication*/ types[0] = &ompi_mpi_long.dt; types[1] = &ompi_mpi_long.dt; types[2] = &ompi_mpi_int.dt; ompi_datatype_create_struct (3, blocklen, d, types, &io_array_type); ompi_datatype_commit (&io_array_type); /* #########################################################*/ fh->f_get_bytes_per_agg ( (int*) &bytes_per_cycle); local_cycles = ceil((double)max_data*fh->f_procs_per_group/bytes_per_cycle); #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif ret = fh->f_comm->c_coll.coll_allreduce (&local_cycles, &cycles, 1, MPI_INT, MPI_MAX, fh->f_comm, fh->f_comm->c_coll.coll_allreduce_module); if (OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (my_aggregator == fh->f_rank) { disp_index = (int *) malloc (fh->f_procs_per_group * sizeof(int)); if (NULL == disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int )); if (NULL == bytes_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } bytes_remaining = (int *) calloc (fh->f_procs_per_group, sizeof(int)); if (NULL == bytes_remaining){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } current_index = (int *) calloc (fh->f_procs_per_group, sizeof(int)); if (NULL == current_index){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*)); if (NULL == blocklen_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*)); if (NULL == displs_per_process) { opal_output 
(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } iovec_count_per_process = (int *) calloc (fh->f_procs_per_group, sizeof(int)); if (NULL == iovec_count_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs = (int *) calloc (fh->f_procs_per_group, sizeof(int)); if (NULL == displs){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif ret = fcoll_base_coll_allgather_array (&iov_size, 1, MPI_INT, iovec_count_per_process, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (my_aggregator == fh->f_rank) { displs[0] = 0; global_iov_count = iovec_count_per_process[0]; for (i=1 ; i<fh->f_procs_per_group ; i++) { global_iov_count += iovec_count_per_process[i]; displs[i] = displs[i-1] + iovec_count_per_process[i-1]; } } if ( (my_aggregator == fh->f_rank) && (global_iov_count > 0 )) { global_iov_array = (mca_fcoll_static_local_io_array *) malloc (global_iov_count * sizeof(mca_fcoll_static_local_io_array)); if (NULL == global_iov_array){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif ret = fcoll_base_coll_gatherv_array (local_iov_array, iov_size, io_array_type, global_iov_array, iovec_count_per_process, displs, io_array_type, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ fprintf(stderr,"global_iov_array gather error!\n"); goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (NULL != local_iov_array){ free(local_iov_array); local_iov_array = NULL; } if ( ( 
my_aggregator == fh->f_rank) && ( global_iov_count > 0 )) { sorted = (int *)malloc (global_iov_count * sizeof(int)); if (NULL == sorted) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } read_local_heap_sort (global_iov_array, global_iov_count, sorted); send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request)); if (NULL == send_req){ opal_output ( 1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *)); if (NULL == sendtype) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for ( i=0; i<fh->f_procs_per_group; i++ ) { sendtype[i] = MPI_DATATYPE_NULL; } if (NULL == bytes_per_process){ bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int)); if (NULL == bytes_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } } #if DEBUG_ON if (my_aggregator == fh->f_rank) { for (gc_in=0; gc_in<global_iov_count; gc_in++){ printf("%d: Offset[%ld]: %lld, Length[%ld]: %ld\n", global_iov_array[sorted[gc_in]].process_id, gc_in, global_iov_array[sorted[gc_in]].offset, gc_in, global_iov_array[sorted[gc_in]].length); } } #endif #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif for (index = 0; index < cycles; index++){ if (my_aggregator == fh->f_rank) { fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if (NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements= NULL; } if ( NULL != sendtype ) { for ( i=0; i<fh->f_procs_per_group; i++ ) { if ( MPI_DATATYPE_NULL != sendtype[i] ) { 
ompi_datatype_destroy (&sendtype[i] ); sendtype[i] = MPI_DATATYPE_NULL; } } } for(l=0;l<fh->f_procs_per_group;l++){ disp_index[l] = 1; if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } blocklen_per_process[l] = (int *) calloc (1, sizeof(int)); if (NULL == blocklen_per_process[l]) { opal_output (1, "OUT OF MEMORY for blocklen\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint)); if (NULL == displs_per_process[l]){ opal_output (1, "OUT OF MEMORY for displs\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } } if (index < local_cycles ) { if ((index == local_cycles-1) && (max_data % (bytes_per_cycle/fh->f_procs_per_group))) { bytes_to_read_in_cycle = max_data - position; } else if (max_data <= bytes_per_cycle/fh->f_procs_per_group) { bytes_to_read_in_cycle = max_data; } else { bytes_to_read_in_cycle = bytes_per_cycle/fh->f_procs_per_group; } } else { bytes_to_read_in_cycle = 0; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif fcoll_base_coll_gather_array (&bytes_to_read_in_cycle, 1, MPI_INT, bytes_per_process, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (recvbuf_is_contiguous ) { receive_buf = &((char*)buf)[position]; } else if (bytes_to_read_in_cycle) { receive_buf = (char *) malloc (bytes_to_read_in_cycle * sizeof(char)); if ( NULL == receive_buf){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif ret = MCA_PML_CALL(irecv(receive_buf, bytes_to_read_in_cycle, MPI_BYTE, my_aggregator, 123, fh->f_comm, &recv_req)); if (OMPI_SUCCESS != ret){ goto exit; } #if 
OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (my_aggregator == fh->f_rank) { for (i=0;i<fh->f_procs_per_group; i++){ while (bytes_per_process[i] > 0){ /*printf("%d: bytes_per_process[%d]: %d, bytes_remaining[%d]: %d\n", index, i, bytes_per_process[i], i, bytes_remaining[i]);*/ if (read_get_process_id(global_iov_array[sorted[current_index[i]]].process_id, fh) == i){ /* current id owns this entry!*/ if (bytes_remaining[i]){ /*Remaining bytes in the current entry of the global offset array*/ if (bytes_remaining[i] <= bytes_per_process[i]){ blocklen_per_process[i][disp_index[i] - 1] = bytes_remaining[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset + (global_iov_array[sorted[current_index[i]]].length - bytes_remaining[i]); blocklen_per_process[i] = (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int)); displs_per_process[i] = (MPI_Aint *)realloc ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint)); bytes_per_process[i] -= bytes_remaining[i]; blocklen_per_process[i][disp_index[i]] = 0; displs_per_process[i][disp_index[i]] = 0; disp_index[i] += 1; bytes_remaining[i] = 0; /* This entry has been used up, we need to move to the next entry of this process and make current_index point there*/ current_index[i] = read_find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1){ break; } continue; } else{ blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset + (global_iov_array[sorted[current_index[i]]].length - bytes_remaining[i]); bytes_remaining[i] -= bytes_per_process[i]; bytes_per_process[i] = 0; break; } } else{ if (bytes_per_process[i] < global_iov_array[sorted[current_index[i]]].length){ blocklen_per_process[i][disp_index[i] - 1] = 
bytes_per_process[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset; bytes_remaining[i] = global_iov_array[sorted[current_index[i]]].length - bytes_per_process[i]; bytes_per_process[i] = 0; break; } else { blocklen_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].length; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset; blocklen_per_process[i] = (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int)); displs_per_process[i] = (MPI_Aint *)realloc ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint)); blocklen_per_process[i][disp_index[i]] = 0; displs_per_process[i][disp_index[i]] = 0; disp_index[i] += 1; bytes_per_process[i] -= global_iov_array[sorted[current_index[i]]].length; current_index[i] = read_find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1){ break; } } } } else{ current_index[i] = read_find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1){ bytes_per_process[i] = 0; /* no more entries left to service this request*/ continue; } } } } entries_per_aggregator=0; for (i=0;i<fh->f_procs_per_group;i++){ for (j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ entries_per_aggregator++; #if DEBUG_ON printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, blocklen_per_process[i][j],j, displs_per_process[i][j], fh->f_rank); #endif } } } if (entries_per_aggregator > 0){ file_offsets_for_agg = (mca_fcoll_static_local_io_array *) malloc(entries_per_aggregator*sizeof(mca_fcoll_static_local_io_array)); if (NULL == file_offsets_for_agg) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sorted_file_offsets = (int *) malloc (entries_per_aggregator * sizeof(int)); if (NULL == sorted_file_offsets){ opal_output (1, "OUT OF 
MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } temp_index=0; global_count = 0; for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i]; j++){ if (blocklen_per_process[i][j] > 0){ file_offsets_for_agg[temp_index].length = blocklen_per_process[i][j]; global_count += blocklen_per_process[i][j]; file_offsets_for_agg[temp_index].process_id = i; file_offsets_for_agg[temp_index].offset = displs_per_process[i][j]; temp_index++; } } } } else{ continue; } read_local_heap_sort (file_offsets_for_agg, entries_per_aggregator, sorted_file_offsets); memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); memory_displacements[sorted_file_offsets[0]] = 0; for (i=1; i<entries_per_aggregator; i++){ memory_displacements[sorted_file_offsets[i]] = memory_displacements[sorted_file_offsets[i-1]] + file_offsets_for_agg[sorted_file_offsets[i-1]].length; } global_buf = (char *) malloc (global_count * sizeof(char)); if (NULL == global_buf){ opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0; i<entries_per_aggregator;i++){ printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld, disp_index :%d\n", file_offsets_for_agg[sorted_file_offsets[i]].process_id, file_offsets_for_agg[sorted_file_offsets[i]].offset, file_offsets_for_agg[sorted_file_offsets[i]].length, memory_displacements[sorted_file_offsets[i]], disp_index[i]); } #endif fh->f_io_array = (mca_io_ompio_io_array_t *) malloc (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_num_of_io_entries = 0; fh->f_io_array[0].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; fh->f_io_array[0].length = file_offsets_for_agg[sorted_file_offsets[0]].length; fh->f_io_array[0].memory_address = 
global_buf+memory_displacements[sorted_file_offsets[0]]; fh->f_num_of_io_entries++; for (i=1;i<entries_per_aggregator;i++){ if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + file_offsets_for_agg[sorted_file_offsets[i-1]].length == file_offsets_for_agg[sorted_file_offsets[i]].offset){ fh->f_io_array[fh->f_num_of_io_entries - 1].length += file_offsets_for_agg[sorted_file_offsets[i]].length; } else{ fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[i]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[i]]; fh->f_num_of_io_entries++; } } #if DEBUG_ON printf("*************************** %d\n", fh->f_num_of_io_entries); for (i=0 ; i<fh->f_num_of_io_entries ; i++) { printf(" ADDRESS: %p OFFSET: %ld LENGTH: %ld\n", fh->f_io_array[i].memory_address, (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset, fh->f_io_array[i].length); } #endif #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_read_time = MPI_Wtime(); #endif if (fh->f_num_of_io_entries) { if ( 0 > fh->f_fbtl->fbtl_preadv (fh)) { opal_output (1, "READ FAILED\n"); ret = OMPI_ERROR; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_read_time = MPI_Wtime(); read_time += end_read_time - start_read_time; #endif #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); if (my_aggregator == fh->f_rank){ for (i=0 ; i<global_count/4 ; i++) printf (" READ %d \n",((int *)global_buf)[i]); } #endif temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int)); if (NULL == temp_disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0; i<entries_per_aggregator; i++){ temp_index = file_offsets_for_agg[sorted_file_offsets[i]].process_id; displs_per_process[temp_index][temp_disp_index[temp_index]] = 
memory_displacements[sorted_file_offsets[i]]; if (temp_disp_index[temp_index] < disp_index[temp_index]){ temp_disp_index[temp_index] += 1; } else{ printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", temp_index, temp_disp_index[temp_index], temp_index, disp_index[temp_index]); } } if (NULL != temp_disp_index){ free(temp_disp_index); temp_disp_index = NULL; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif for (i=0;i<fh->f_procs_per_group; i++){ send_req[i] = MPI_REQUEST_NULL; ompi_datatype_create_hindexed(disp_index[i], blocklen_per_process[i], displs_per_process[i], MPI_BYTE, &sendtype[i]); ompi_datatype_commit(&sendtype[i]); ret = MCA_PML_CALL (isend(global_buf, 1, sendtype[i], fh->f_procs_in_group[i], 123, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, &send_req[i])); if(OMPI_SUCCESS != ret){ goto exit; } } ret = ompi_request_wait_all (fh->f_procs_per_group, send_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } /* if ( my_aggregator == fh->f_rank ) */ ret = ompi_request_wait (&recv_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif position += bytes_to_read_in_cycle; if (!recvbuf_is_contiguous) { OPAL_PTRDIFF_TYPE mem_address; size_t remaining = 0; size_t temp_position = 0; remaining = bytes_to_read_in_cycle; while (remaining && (iov_count > iov_index)){ mem_address = (OPAL_PTRDIFF_TYPE) (decoded_iov[iov_index].iov_base) + current_position; if (remaining >= (decoded_iov[iov_index].iov_len - current_position)) { memcpy ((IOVBASE_TYPE *) mem_address, receive_buf+temp_position, decoded_iov[iov_index].iov_len - current_position); remaining = remaining - (decoded_iov[iov_index].iov_len - current_position); temp_position = temp_position + (decoded_iov[iov_index].iov_len - current_position); iov_index = iov_index + 1; current_position = 0; } else{ memcpy ((IOVBASE_TYPE *) 
mem_address, receive_buf+temp_position, remaining); current_position = current_position + remaining; remaining = 0; } } if (NULL != receive_buf) { free (receive_buf); receive_buf = NULL; } } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rexch = MPI_Wtime(); read_exch += end_rexch - start_rexch; nentry.time[0] = read_time; nentry.time[1] = rcomm_time; nentry.time[2] = read_exch; if (my_aggregator == fh->f_rank) nentry.aggregator = 1; else nentry.aggregator = 0; nentry.nprocs_for_coll = static_num_io_procs; if (!mca_common_ompio_full_print_queue(fh->f_coll_read_time)){ mca_common_ompio_register_print_entry(fh->f_coll_read_time, nentry); } #endif exit: if (NULL != decoded_iov){ free(decoded_iov); decoded_iov = NULL; } if (NULL != displs){ free(displs); displs = NULL; } if (NULL != iovec_count_per_process){ free(iovec_count_per_process); iovec_count_per_process=NULL; } if (NULL != local_iov_array){ free(local_iov_array); local_iov_array=NULL; } if (NULL != global_iov_array){ free(global_iov_array); global_iov_array=NULL; } if (my_aggregator == fh->f_rank) { for(l=0;l<fh->f_procs_per_group;l++){ if (blocklen_per_process) { free(blocklen_per_process[l]); } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } } } if (NULL != bytes_per_process){ free(bytes_per_process); bytes_per_process =NULL; } if (NULL != disp_index){ free(disp_index); disp_index =NULL; } if (NULL != displs_per_process){ free(displs_per_process); displs_per_process = NULL; } if(NULL != bytes_remaining){ free(bytes_remaining); bytes_remaining = NULL; } if(NULL != current_index){ free(current_index); current_index = NULL; } if (NULL != blocklen_per_process){ free(blocklen_per_process); blocklen_per_process =NULL; } if (NULL != bytes_remaining){ free(bytes_remaining); bytes_remaining =NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements= NULL; } if (NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg 
= NULL; } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if (NULL != sendtype){ free(sendtype); sendtype=NULL; } if ( !recvbuf_is_contiguous ) { if (NULL != receive_buf){ free(receive_buf); receive_buf=NULL; } } if (NULL != global_buf) { free(global_buf); global_buf = NULL; } if (NULL != sorted) { free(sorted); sorted = NULL; } if (NULL != send_req){ free(send_req); send_req = NULL; } return ret; }
/**
 * Build the datatype describing the part of a distributed ndims-dimensional
 * array that belongs to process `rank` of a `size`-process grid.
 *
 * Each dimension is processed with block() or cyclic() depending on
 * distrib_array[i]; the result is then wrapped in a struct with explicit
 * lower/upper bound markers so that its displacement and extent are set.
 *
 * @param size           number of processes in the grid
 * @param rank           rank of the calling process in the grid
 * @param ndims          number of array dimensions; values < 1 yield an
 *                       empty datatype
 * @param gsize_array    global array size in each dimension
 * @param distrib_array  distribution kind of each dimension
 * @param darg_array     distribution argument of each dimension
 * @param psize_array    process grid size in each dimension
 * @param order          MPI_ORDER_C or Fortran ordering
 * @param oldtype        element datatype
 * @param newtype        out: the resulting datatype (only meaningful when
 *                       the function succeeds)
 *
 * @return OMPI_SUCCESS, or the error code of the step that failed
 *         (MPI_ERR_ARG for an unknown distribution,
 *         OMPI_ERR_OUT_OF_RESOURCE when a work array cannot be allocated).
 */
int32_t ompi_datatype_create_darray(int size,
                                    int rank,
                                    int ndims,
                                    int const* gsize_array,
                                    int const* distrib_array,
                                    int const* darg_array,
                                    int const* psize_array,
                                    int order,
                                    const ompi_datatype_t* oldtype,
                                    ompi_datatype_t** newtype)
{
    ompi_datatype_t *lastType;
    ptrdiff_t orig_extent, *st_offsets = NULL;
    int i, start_loop, end_loop, step;
    int *coords = NULL, rc = OMPI_SUCCESS;

    /* speedy corner case */
    if (ndims < 1) {
        /* Don't just return MPI_DATATYPE_NULL as that can't be
           MPI_TYPE_FREE()ed, and that seems bad */
        *newtype = ompi_datatype_create(0);
        ompi_datatype_add(*newtype, &ompi_mpi_datatype_null.dt, 0, 0, 0);
        return MPI_SUCCESS;
    }

    rc = ompi_datatype_type_extent(oldtype, &orig_extent);
    if (MPI_SUCCESS != rc) goto cleanup;

    /* calculate position in grid using row-major ordering */
    {
        int tmp_rank = rank, procs = size;

        coords = (int *) malloc(ndims * sizeof(int));
        if (NULL == coords) {
            rc = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        for (i = 0 ; i < ndims ; i++) {
            procs = procs / psize_array[i];
            coords[i] = tmp_rank / procs;
            tmp_rank = tmp_rank % procs;
        }
    }

    st_offsets = (ptrdiff_t *) malloc(ndims * sizeof(ptrdiff_t));
    if (NULL == st_offsets) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    /* duplicate type to here to 1) deal with constness without
       casting and 2) eliminate need to for conditional destroy below.
       Lame, yes.  But cleaner code all around. */
    rc = ompi_datatype_duplicate(oldtype, &lastType);
    if (OMPI_SUCCESS != rc) goto cleanup;

    /* figure out ordering issues */
    if (MPI_ORDER_C == order) {
        start_loop = ndims - 1 ; step = -1; end_loop = -1;
    } else {
        start_loop = 0 ; step = 1; end_loop = ndims;
    }

    /* Build up array */
    for (i = start_loop; i != end_loop; i += step) {
        int nprocs, tmp_rank;

        switch(distrib_array[i]) {
        case MPI_DISTRIBUTE_BLOCK:
            rc = block(gsize_array, i, ndims, psize_array[i], coords[i],
                       darg_array[i], order, orig_extent,
                       lastType, newtype, st_offsets+i);
            break;
        case MPI_DISTRIBUTE_CYCLIC:
            rc = cyclic(gsize_array, i, ndims, psize_array[i], coords[i],
                        darg_array[i], order, orig_extent,
                        lastType, newtype, st_offsets+i);
            break;
        case MPI_DISTRIBUTE_NONE:
            /* treat it as a block distribution on 1 process */
            if (order == MPI_ORDER_C) {
                nprocs = psize_array[i]; tmp_rank = coords[i];
            } else {
                nprocs = 1; tmp_rank = 0;
            }

            rc = block(gsize_array, i, ndims, nprocs, tmp_rank,
                       MPI_DISTRIBUTE_DFLT_DARG, order, orig_extent,
                       lastType, newtype, st_offsets+i);
            break;
        default:
            rc = MPI_ERR_ARG;
        }
        ompi_datatype_destroy(&lastType);
        /* need to destroy the old type even in error condition, so
           don't check return code from above until after cleanup. */
        if (MPI_SUCCESS != rc) goto cleanup;
        lastType = *newtype;
    }

    /* set displacement and UB correctly.  Use struct instead of
       resized for same reason as subarray */
    {
        ptrdiff_t displs[3], tmp_size;
        ompi_datatype_t *types[3];
        int blength[3] = { 1, 1, 1};

        displs[1] = st_offsets[start_loop];
        tmp_size = 1;
        for (i = start_loop + step ; i != end_loop ; i += step) {
            tmp_size *= gsize_array[i - step];
            displs[1] += tmp_size * st_offsets[i];
        }

        displs[0] = 0;
        displs[1] *= orig_extent;
        displs[2] = orig_extent;
        for (i = 0 ; i < ndims ; i++) {
            displs[2] *= gsize_array[i];
        }
        types[0] = MPI_LB; types[1] = lastType; types[2] = MPI_UB;

        rc = ompi_datatype_create_struct(3, blength, displs, types, newtype);
        ompi_datatype_destroy(&lastType);
        /* need to destroy the old type even in error condition, so
           don't check return code from above until after cleanup. */
        if (MPI_SUCCESS != rc) goto cleanup;
    }

 cleanup:
    if (NULL != st_offsets) free(st_offsets);
    if (NULL != coords) free(coords);
    /* Report the outcome of the last step instead of unconditionally
       claiming success. */
    return rc;
}
/**
 * Create the datatype describing an ndims-dimensional sub-block of a larger
 * array of oldtype elements.
 *
 * @param ndims          number of dimensions; 0 yields the NULL datatype
 * @param size_array     full array size in each dimension
 * @param subsize_array  sub-block size in each dimension
 * @param start_array    first element of the sub-block in each dimension
 * @param order          MPI_ORDER_C, or anything else for the opposite
 *                       traversal of the dimensions
 * @param oldtype        element datatype
 * @param newtype        out: the resulting datatype
 *
 * @return MPI_SUCCESS / OMPI_SUCCESS; return codes of the datatype helpers
 *         are not examined.
 */
int32_t ompi_datatype_create_subarray(int ndims,
                                      int const* size_array,
                                      int const* subsize_array,
                                      int const* start_array,
                                      int order,
                                      const ompi_datatype_t* oldtype,
                                      ompi_datatype_t** newtype)
{
    MPI_Datatype current;
    int32_t dim, stride, stop;
    MPI_Aint total_elems, offset_elems, extent;

    /**
     * If the oldtype contains the original MPI_LB and MPI_UB markers then we
     * are forced to follow the MPI standard suggestion and reset these 2
     * markers (MPI 3.0 page 96 line 37).  Otherwise we can simply resize the
     * datatype.
     */
    ompi_datatype_type_extent( oldtype, &extent );

    /* Zero dimensions: hand back the NULL datatype. */
    if( 0 == ndims ) {
        *newtype = &ompi_mpi_datatype_null.dt;
        return MPI_SUCCESS;
    }

    if( ndims < 2 ) {
        /* A single dimension is just a contiguous run of elements. */
        ompi_datatype_create_contiguous( subsize_array[0], oldtype, &current );
        total_elems  = size_array[0];
        offset_elems = start_array[0];
    } else {
        /* Pick the traversal direction of the dimensions. */
        if( MPI_ORDER_C == order ) {
            dim    = ndims - 1;
            stride = -1;
            stop   = -1;
        } else {
            dim    = 0;
            stride = 1;
            stop   = ndims;
        }

        /* The first two dimensions are folded into one vector built
         * directly on oldtype, so no duplicate of oldtype has to be created
         * just to be freed again. */
        ompi_datatype_create_vector( subsize_array[dim + stride],
                                     subsize_array[dim],
                                     size_array[dim],
                                     oldtype, newtype );
        current = *newtype;

        total_elems  = (MPI_Aint)size_array[dim] * (MPI_Aint)size_array[dim + stride];
        offset_elems = (MPI_Aint)start_array[dim]
                     + (MPI_Aint)start_array[dim + stride] * (MPI_Aint)size_array[dim];

        /* Every remaining dimension wraps the previous type in an hvector
         * whose stride is the byte size of everything built so far. */
        dim += 2 * stride;
        while( dim != stop ) {
            ompi_datatype_create_hvector( subsize_array[dim], 1,
                                          total_elems * extent,
                                          current, newtype );
            ompi_datatype_destroy( &current );

            offset_elems += total_elems * start_array[dim];
            total_elems  *= size_array[dim];
            current = *newtype;

            dim += stride;
        }
    }

    /**
     * We need to shift the content (useful data) of the datatype, so
     * we need to force the displacement to be moved. Therefore, we
     * cannot use resize as it will only set the soft lb and ub
     * markers without moving the data. Instead, we have to create a
     * new datatype, and insert the last type with the correct
     * displacement.
     */
    *newtype = ompi_datatype_create( current->super.desc.used );
    ompi_datatype_add( *newtype, current, 1,
                       offset_elems * extent, total_elems * extent );
    ompi_datatype_destroy( &current );

    return OMPI_SUCCESS;
}
int mca_fcoll_dynamic_file_read_all (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { MPI_Aint position = 0; MPI_Aint total_bytes = 0; /* total bytes to be read */ MPI_Aint bytes_to_read_in_cycle = 0; /* left to be read in a cycle*/ MPI_Aint bytes_per_cycle = 0; /* total read in each cycle by each process*/ int index = 0, ret=OMPI_SUCCESS; int cycles = 0; int i=0, j=0, l=0; int n=0; /* current position in total_bytes_per_process array */ MPI_Aint bytes_remaining = 0; /* how many bytes have been read from the current value from total_bytes_per_process */ int *sorted_file_offsets=NULL, entries_per_aggregator=0; int bytes_received = 0; int blocks = 0; /* iovec structure and count of the buffer passed in */ uint32_t iov_count = 0; struct iovec *decoded_iov = NULL; int iov_index = 0; size_t current_position = 0; struct iovec *local_iov_array=NULL, *global_iov_array=NULL; char *receive_buf = NULL; MPI_Aint *memory_displacements=NULL; /* global iovec at the readers that contain the iovecs created from file_set_view */ uint32_t total_fview_count = 0; int local_count = 0; int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL; int current_index=0, temp_index=0; int **blocklen_per_process=NULL; MPI_Aint **displs_per_process=NULL; char *global_buf = NULL; MPI_Aint global_count = 0; local_io_array *file_offsets_for_agg=NULL; /* array that contains the sorted indices of the global_iov */ int *sorted = NULL; int *displs = NULL; int dynamic_num_io_procs; size_t max_data = 0; int *bytes_per_process = NULL; MPI_Aint *total_bytes_per_process = NULL; ompi_datatype_t **sendtype = NULL; MPI_Request *send_req=NULL, *recv_req=NULL; #if TIME_BREAKDOWN double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0; double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0; double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0; print_entry nentry; #endif // if 
(opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) { // fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY; // } /************************************************************************** ** In case the data is not contigous in memory, decode it into an iovec ** **************************************************************************/ if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) { ret = fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); if (OMPI_SUCCESS != ret){ goto exit; } } else { max_data = count * datatype->super.size; } if ( MPI_STATUS_IGNORE != status ) { status->_ucount = max_data; } fh->f_get_num_aggregators ( &dynamic_num_io_procs); ret = fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh, dynamic_num_io_procs, max_data); if (OMPI_SUCCESS != ret){ goto exit; } total_bytes_per_process = (MPI_Aint*)malloc (fh->f_procs_per_group*sizeof(MPI_Aint)); if (NULL == total_bytes_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = fh->f_allgather_array (&max_data, 1, MPI_LONG, total_bytes_per_process, 1, MPI_LONG, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } for (i=0 ; i<fh->f_procs_per_group ; i++) { total_bytes += total_bytes_per_process[i]; } if (NULL != total_bytes_per_process) { free (total_bytes_per_process); total_bytes_per_process = NULL; } /********************************************************************* *** Generate the File offsets/lengths corresponding to this write *** ********************************************************************/ ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *) fh, max_data, &local_iov_array, &local_count); if (ret != OMPI_SUCCESS){ goto exit; } /* #########################################################*/ /************************************************************* *** ALLGather the File View 
information at all processes *** *************************************************************/ fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == fview_count) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = fh->f_allgather_array (&local_count, 1, MPI_INT, fview_count, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } displs = (int*)malloc (fh->f_procs_per_group*sizeof(int)); if (NULL == displs) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs[0] = 0; total_fview_count = fview_count[0]; for (i=1 ; i<fh->f_procs_per_group ; i++) { total_fview_count += fview_count[i]; displs[i] = displs[i-1] + fview_count[i-1]; } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { for (i=0 ; i<fh->f_procs_per_group ; i++) { printf ("%d: PROCESS: %d ELEMENTS: %d DISPLS: %d\n", fh->f_rank, i, fview_count[i], displs[i]); } } #endif /* allocate the global iovec */ if (0 != total_fview_count) { global_iov_array = (struct iovec*)malloc (total_fview_count * sizeof(struct iovec)); if (NULL == global_iov_array) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } ret = fh->f_allgatherv_array (local_iov_array, local_count, fh->f_iov_type, global_iov_array, fview_count, displs, fh->f_iov_type, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } /* sort it */ if (0 != total_fview_count) { sorted = (int *)malloc (total_fview_count * sizeof(int)); if (NULL == sorted) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_sort_iovec (global_iov_array, total_fview_count, sorted); } if (NULL != local_iov_array) { free (local_iov_array); local_iov_array = NULL; } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { for (i=0 ; 
i<total_fview_count ; i++) { printf("%d: OFFSET: %p LENGTH: %d\n", fh->f_rank, global_iov_array[sorted[i]].iov_base, global_iov_array[sorted[i]].iov_len); } } #endif if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*)); if (NULL == blocklen_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*)); if (NULL == displs_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0;i<fh->f_procs_per_group;i++){ blocklen_per_process[i] = NULL; displs_per_process[i] = NULL; } } /* * Calculate how many bytes are read in each cycle */ fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle); cycles = ceil((double)total_bytes/bytes_per_cycle); n = 0; bytes_remaining = 0; current_index = 0; #if TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif for (index = 0; index < cycles; index++) { /* Getting ready for next cycle Initializing and freeing buffers */ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if (NULL == sendtype){ sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *)); if (NULL == sendtype) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } for(l=0;l<fh->f_procs_per_group;l++){ disp_index[l] = 1; if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } blocklen_per_process[l] = (int *) calloc (1, sizeof(int)); if (NULL == blocklen_per_process[l]) { opal_output (1, "OUT OF MEMORY for blocklen\n"); ret = 
OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint)); if (NULL == displs_per_process[l]){ opal_output (1, "OUT OF MEMORY for displs\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if(NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements = NULL; } } if (cycles-1 == index) { bytes_to_read_in_cycle = total_bytes - bytes_per_cycle*index; } else { bytes_to_read_in_cycle = bytes_per_cycle; } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { printf ("****%d: CYCLE %d Bytes %d**********\n", fh->f_rank, index, bytes_to_write_in_cycle); } #endif /* Calculate how much data will be contributed in this cycle by each process*/ bytes_received = 0; while (bytes_to_read_in_cycle) { blocks = fview_count[0]; for (j=0 ; j<fh->f_procs_per_group ; j++) { if (sorted[current_index] < blocks) { n = j; break; } else { blocks += fview_count[j+1]; } } if (bytes_remaining) { if (bytes_remaining <= bytes_to_read_in_cycle) { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += bytes_remaining; } current_index ++; bytes_to_read_in_cycle -= bytes_remaining; bytes_remaining = 0; if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *) realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); 
blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } continue; } else { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += bytes_to_read_in_cycle; } bytes_remaining -= bytes_to_read_in_cycle; bytes_to_read_in_cycle = 0; break; } } else { if (bytes_to_read_in_cycle < (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += bytes_to_read_in_cycle; } bytes_remaining = global_iov_array[sorted[current_index]].iov_len - bytes_to_read_in_cycle; bytes_to_read_in_cycle = 0; break; } else { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = global_iov_array[sorted[current_index]].iov_len; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE) global_iov_array[sorted[current_index]].iov_base; blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *)realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += global_iov_array[sorted[current_index]].iov_len; } bytes_to_read_in_cycle -= global_iov_array[sorted[current_index]].iov_len; current_index ++; 
continue; } } } /* Calculate the displacement on where to put the data and allocate the recieve buffer (global_buf) */ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { entries_per_aggregator=0; for (i=0;i<fh->f_procs_per_group; i++){ for (j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0) entries_per_aggregator++ ; } } if (entries_per_aggregator > 0){ file_offsets_for_agg = (local_io_array *) malloc(entries_per_aggregator*sizeof(local_io_array)); if (NULL == file_offsets_for_agg) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); if (NULL == sorted_file_offsets){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /*Moving file offsets to an IO array!*/ temp_index = 0; global_count = 0; for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ file_offsets_for_agg[temp_index].length = blocklen_per_process[i][j]; global_count += blocklen_per_process[i][j]; file_offsets_for_agg[temp_index].process_id = i; file_offsets_for_agg[temp_index].offset = displs_per_process[i][j]; temp_index++; } } } } else{ continue; } read_heap_sort (file_offsets_for_agg, entries_per_aggregator, sorted_file_offsets); memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); memory_displacements[sorted_file_offsets[0]] = 0; for (i=1; i<entries_per_aggregator; i++){ memory_displacements[sorted_file_offsets[i]] = memory_displacements[sorted_file_offsets[i-1]] + file_offsets_for_agg[sorted_file_offsets[i-1]].length; } global_buf = (char *) malloc (global_count * sizeof(char)); if (NULL == global_buf){ opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_io_array = (mca_io_ompio_io_array_t *) malloc (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); 
ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_num_of_io_entries = 0; fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[0]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]]; fh->f_num_of_io_entries++; for (i=1;i<entries_per_aggregator;i++){ if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + file_offsets_for_agg[sorted_file_offsets[i-1]].length == file_offsets_for_agg[sorted_file_offsets[i]].offset){ fh->f_io_array[fh->f_num_of_io_entries - 1].length += file_offsets_for_agg[sorted_file_offsets[i]].length; } else{ fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[i]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[i]]; fh->f_num_of_io_entries++; } } #if TIME_BREAKDOWN start_read_time = MPI_Wtime(); #endif if (fh->f_num_of_io_entries) { if ( 0 > fh->f_fbtl->fbtl_preadv (fh)) { opal_output (1, "READ FAILED\n"); ret = OMPI_ERROR; goto exit; } } #if TIME_BREAKDOWN end_read_time = MPI_Wtime(); read_time += end_read_time - start_read_time; #endif /********************************************************** ******************** DONE READING ************************ *********************************************************/ temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int)); if (NULL == temp_disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0; i<entries_per_aggregator; i++){ temp_index = file_offsets_for_agg[sorted_file_offsets[i]].process_id; displs_per_process[temp_index][temp_disp_index[temp_index]] = 
memory_displacements[sorted_file_offsets[i]]; if (temp_disp_index[temp_index] < disp_index[temp_index]){ temp_disp_index[temp_index] += 1; } else{ printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", temp_index, temp_disp_index[temp_index], temp_index, disp_index[temp_index]); } } if (NULL != temp_disp_index){ free(temp_disp_index); temp_disp_index = NULL; } send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request)); if (NULL == send_req){ opal_output ( 1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif for (i=0;i<fh->f_procs_per_group;i++){ ompi_datatype_create_hindexed(disp_index[i], blocklen_per_process[i], displs_per_process[i], MPI_BYTE, &sendtype[i]); ompi_datatype_commit(&sendtype[i]); ret = MCA_PML_CALL (isend(global_buf, 1, sendtype[i], fh->f_procs_in_group[i], 123, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, &send_req[i])); if(OMPI_SUCCESS != ret){ goto exit; } } #if TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif } /********************************************************** ********* Scatter the Data from the readers ************** *********************************************************/ if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) { receive_buf = &((char*)buf)[position]; } else if (bytes_received) { /* allocate a receive buffer and copy the data that needs to be received into it in case the data is non-contigous in memory */ receive_buf = malloc (bytes_received); if (NULL == receive_buf) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } #if TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif recv_req = (MPI_Request *) malloc (sizeof (MPI_Request)); if (NULL == recv_req){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = MCA_PML_CALL(irecv(receive_buf, bytes_received, MPI_BYTE, 
fh->f_procs_in_group[fh->f_aggregator_index], 123, fh->f_comm, recv_req)); if (OMPI_SUCCESS != ret){ goto exit; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){ ret = ompi_request_wait_all (fh->f_procs_per_group, send_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } ret = ompi_request_wait (recv_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } position += bytes_received; /* If data is not contigous in memory, copy the data from the receive buffer into the buffer passed in */ if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) { OPAL_PTRDIFF_TYPE mem_address; size_t remaining = 0; size_t temp_position = 0; remaining = bytes_received; while (remaining) { mem_address = (OPAL_PTRDIFF_TYPE) (decoded_iov[iov_index].iov_base) + current_position; if (remaining >= (decoded_iov[iov_index].iov_len - current_position)) { memcpy ((IOVBASE_TYPE *) mem_address, receive_buf+temp_position, decoded_iov[iov_index].iov_len - current_position); remaining = remaining - (decoded_iov[iov_index].iov_len - current_position); temp_position = temp_position + (decoded_iov[iov_index].iov_len - current_position); iov_index = iov_index + 1; current_position = 0; } else { memcpy ((IOVBASE_TYPE *) mem_address, receive_buf+temp_position, remaining); current_position = current_position + remaining; remaining = 0; } } if (NULL != receive_buf) { free (receive_buf); receive_buf = NULL; } } #if TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (NULL != recv_req){ free(recv_req); recv_req = NULL; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){ fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } for (i = 0; i < fh->f_procs_per_group; i++) ompi_datatype_destroy(sendtype+i); if (NULL != sendtype){ free(sendtype); sendtype=NULL; } if (NULL != send_req){ free(send_req); 
send_req = NULL; } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if (NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != bytes_per_process){ free(bytes_per_process); bytes_per_process =NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements= NULL; } } } #if TIME_BREAKDOWN end_rexch = MPI_Wtime(); read_exch += end_rexch - start_rexch; nentry.time[0] = read_time; nentry.time[1] = rcomm_time; nentry.time[2] = read_exch; if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) nentry.aggregator = 1; else nentry.aggregator = 0; nentry.nprocs_for_coll = dynamic_num_io_procs; if (!fh->f_full_print_queue(READ_PRINT_QUEUE)){ fh->f_register_print_entry(READ_PRINT_QUEUE, nentry); } #endif exit: if (NULL != sorted) { free (sorted); sorted = NULL; } if (NULL != global_iov_array) { free (global_iov_array); global_iov_array = NULL; } if (NULL != fview_count) { free (fview_count); fview_count = NULL; } if (NULL != decoded_iov) { free (decoded_iov); decoded_iov = NULL; } if (NULL != local_iov_array){ free(local_iov_array); local_iov_array=NULL; } if (NULL != displs) { free (displs); displs = NULL; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if (NULL != disp_index){ free(disp_index); disp_index = NULL; } if ( NULL != blocklen_per_process){ for(l=0;l<fh->f_procs_per_group;l++){ if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } } free(blocklen_per_process); blocklen_per_process = NULL; } if (NULL != displs_per_process){ for (l=0; i<fh->f_procs_per_group; l++){ if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } } free(displs_per_process); displs_per_process = NULL; } } return ret; }
/*
 * Close an OMPIO file handle.
 *
 * When collective timing information is enabled, the pending WRITE and
 * READ timing queues are printed first.  The shared-file-pointer and
 * file-system components then close the file, rank 0 deletes it if it was
 * opened with MPI_MODE_DELETE_ON_CLOSE, the selected components are
 * unselected, and the buffers, datatypes and (when the shared file pointer
 * flag is set) the communicator attached to the handle are released.
 *
 * Returns the result of the last timing-print / close call performed;
 * OMPI_SUCCESS if none of them ran.
 */
int ompio_io_ompio_file_close (mca_io_ompio_file_t *ompio_fh)
{
    char label[256];
    int rc = OMPI_SUCCESS;
    int remove_on_close = 0;

    if (mca_io_ompio_coll_timing_info) {
        strcpy (label, "WRITE");
        if (!ompi_io_ompio_empty_print_queue(WRITE_PRINT_QUEUE)) {
            rc = ompi_io_ompio_print_time_info(WRITE_PRINT_QUEUE, label, ompio_fh);
            if (OMPI_SUCCESS != rc) {
                printf("Error in print_time_info ");
            }
        }

        strcpy (label, "READ");
        if (!ompi_io_ompio_empty_print_queue(READ_PRINT_QUEUE)) {
            rc = ompi_io_ompio_print_time_info(READ_PRINT_QUEUE, label, ompio_fh);
            if (OMPI_SUCCESS != rc) {
                printf("Error in print_time_info ");
            }
        }
    }

    if (ompio_fh->f_amode & MPI_MODE_DELETE_ON_CLOSE) {
        remove_on_close = 1;
    }

    /* Close the shared file pointer file first ... */
    if (NULL != ompio_fh->f_sharedfp) {
        rc = ompio_fh->f_sharedfp->sharedfp_file_close(ompio_fh);
    }

    /* ... then the file itself.  f_fs may be unset when file_close() is
     * reached from the file destructor after a failed file_open(). */
    if (NULL != ompio_fh->f_fs) {
        rc = ompio_fh->f_fs->fs_file_close (ompio_fh);
    }

    if (remove_on_close && 0 == ompio_fh->f_rank) {
        mca_io_ompio_file_delete (ompio_fh->f_filename, MPI_INFO_NULL);
    }

    /* Unselect every component that was selected for this file. */
    if (NULL != ompio_fh->f_fs) {
        mca_fs_base_file_unselect (ompio_fh);
    }
    if (NULL != ompio_fh->f_fbtl) {
        mca_fbtl_base_file_unselect (ompio_fh);
    }
    if (NULL != ompio_fh->f_fcoll) {
        mca_fcoll_base_file_unselect (ompio_fh);
    }
    if (NULL != ompio_fh->f_sharedfp) {
        mca_sharedfp_base_file_unselect (ompio_fh);
    }

    /* Heap buffers hanging off the handle; free(NULL) is a no-op. */
    free (ompio_fh->f_io_array);
    ompio_fh->f_io_array = NULL;

    free (ompio_fh->f_init_procs_in_group);
    ompio_fh->f_init_procs_in_group = NULL;

    free (ompio_fh->f_procs_in_group);
    ompio_fh->f_procs_in_group = NULL;

    free (ompio_fh->f_decoded_iov);
    ompio_fh->f_decoded_iov = NULL;

    free (ompio_fh->f_convertor);
    ompio_fh->f_convertor = NULL;

    free (ompio_fh->f_datarep);
    ompio_fh->f_datarep = NULL;

    /* Datatypes attached to the file view. */
    if (MPI_DATATYPE_NULL != ompio_fh->f_iov_type) {
        ompi_datatype_destroy (&ompio_fh->f_iov_type);
    }
    if (MPI_DATATYPE_NULL != ompio_fh->f_etype) {
        ompi_datatype_destroy (&ompio_fh->f_etype);
    }
    if (MPI_DATATYPE_NULL != ompio_fh->f_filetype) {
        ompi_datatype_destroy (&ompio_fh->f_filetype);
    }
    if (MPI_DATATYPE_NULL != ompio_fh->f_orig_filetype) {
        ompi_datatype_destroy (&ompio_fh->f_orig_filetype);
    }

    /* The communicator is only freed when the shared file pointer flag is set. */
    if ((MPI_COMM_NULL != ompio_fh->f_comm) &&
        (ompio_fh->f_flags & OMPIO_SHAREDFP_IS_SET)) {
        ompi_comm_free (&ompio_fh->f_comm);
    }

    return rc;
}
/**
 * Build the datatype describing a sub-block of an ndims-dimensional array
 * of oldtype elements.
 *
 * @param ndims          number of array dimensions
 * @param size_array     number of elements of the full array, per dimension
 * @param subsize_array  number of elements of the sub-block, per dimension
 * @param start_array    first element of the sub-block, per dimension
 * @param order          MPI_ORDER_C walks the dimensions from the last one
 *                       down to the first; any other value walks them from
 *                       the first one up
 * @param oldtype        element datatype
 * @param newtype        (out) resulting datatype
 *
 * @return MPI_SUCCESS for ndims == 0 (newtype is then the predefined null
 *         datatype), OMPI_SUCCESS otherwise.  Return codes of the datatype
 *         constructors called here are not inspected.
 */
int32_t ompi_datatype_create_subarray(int ndims,
                                      int const* size_array,
                                      int const* subsize_array,
                                      int const* start_array,
                                      int order,
                                      const ompi_datatype_t* oldtype,
                                      ompi_datatype_t** newtype)
{
    MPI_Datatype last_type;
    int32_t i, step, end_loop;
    MPI_Aint size, displ, extent;

    ompi_datatype_type_extent( oldtype, &extent );

    /* If the ndims is zero then return the NULL datatype */
    if( ndims < 2 ) {
        if( 0 == ndims ) {
            *newtype = &ompi_mpi_datatype_null.dt;
            return MPI_SUCCESS;
        }
        ompi_datatype_create_contiguous( subsize_array[0], oldtype, &last_type );
        size = size_array[0];
        displ = start_array[0];
        goto replace_subarray_type;
    }

    if( MPI_ORDER_C == order ) {
        i = ndims - 1;
        step = -1;
        end_loop = -1;
    } else {
        i = 0;
        step = 1;
        end_loop = ndims;
    }

    /* As we know that the ndims is at least 2 we can start by creating the
     * first two dimensions outside the loop, such that we dont have to
     * create a duplicate of the oldtype just to be able to free it.
     */
    ompi_datatype_create_vector( subsize_array[i+step], subsize_array[i], size_array[i],
                                 oldtype, newtype );

    last_type = *newtype;
    /* Widen to MPI_Aint before multiplying: the products of two int
     * dimensions can exceed INT_MAX for large arrays. */
    size = (MPI_Aint)size_array[i] * (MPI_Aint)size_array[i+step];
    displ = (MPI_Aint)start_array[i]
          + (MPI_Aint)start_array[i+step] * (MPI_Aint)size_array[i];
    for( i += 2 * step; i != end_loop; i += step ) {
        /* One hvector level per remaining dimension; its byte stride is the
         * extent of everything covered so far. */
        ompi_datatype_create_hvector( subsize_array[i], 1, size * extent,
                                      last_type, newtype );
        ompi_datatype_destroy( &last_type );
        displ += size * start_array[i];
        size *= size_array[i];
        last_type = *newtype;
    }

 replace_subarray_type:
    /**
     * We cannot use resized here. Resized will only set the soft lb and ub
     * markers without moving the real data inside. What we need is to force
     * the displacement of the data upward to the right position AND set the
     * LB and UB. A type struct is the function we need.
     */
    {
        MPI_Aint displs[3];
        MPI_Datatype types[3];
        int blength[3] = { 1, 1, 1 };

        displs[0] = 0;               types[0] = MPI_LB;
        displs[1] = displ * extent;  types[1] = last_type;
        displs[2] = size * extent;   types[2] = MPI_UB;
        ompi_datatype_create_struct( 3, blength, displs, types, newtype );
    }
    ompi_datatype_destroy( &last_type );

    return OMPI_SUCCESS;
}
static int test_upper( unsigned int length ) { double *mat1, *mat2, *inbuf; ompi_datatype_t *pdt; opal_convertor_t * pConv; char *ptr; int rc; unsigned int i, j, iov_count, split_chunk, total_length; size_t max_data; struct iovec a; TIMER_DATA_TYPE start, end; long total_time; printf( "test upper matrix\n" ); pdt = upper_matrix( length ); /*dt_dump( pdt );*/ mat1 = malloc( length * length * sizeof(double) ); init_random_upper_matrix( length, mat1 ); mat2 = calloc( length * length, sizeof(double) ); total_length = length * (length + 1) * ( sizeof(double) / 2); inbuf = (double*)malloc( total_length ); ptr = (char*)inbuf; /* copy upper matrix in the array simulating the input buffer */ for( i = 0; i < length; i++ ) { uint32_t pos = i * length + i; for( j = i; j < length; j++, pos++ ) { *inbuf = mat1[pos]; inbuf++; } } inbuf = (double*)ptr; pConv = opal_convertor_create( remote_arch, 0 ); if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( pConv, &(pdt->super), 1, mat2 ) ) { printf( "Cannot attach the datatype to a convertor\n" ); return OMPI_ERROR; } GET_TIME( start ); split_chunk = (length + 1) * sizeof(double); /* split_chunk = (total_length + 1) * sizeof(double); */ for( i = total_length; i > 0; ) { if( i <= split_chunk ) { /* equal test just to be able to set a breakpoint */ split_chunk = i; } a.iov_base = ptr; a.iov_len = split_chunk; iov_count = 1; max_data = split_chunk; opal_convertor_unpack( pConv, &a, &iov_count, &max_data ); ptr += max_data; i -= max_data; if( mat2[0] != inbuf[0] ) assert(0); } GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); printf( "complete unpacking in %ld microsec\n", total_time ); free( inbuf ); rc = check_diag_matrix( length, mat1, mat2 ); free( mat1 ); free( mat2 ); /* test the automatic destruction pf the data */ ompi_datatype_destroy( &pdt ); assert( pdt == NULL ); OBJ_RELEASE( pConv ); return rc; }
int mca_fcoll_dynamic_file_write_all (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { MPI_Aint total_bytes_written = 0; /* total bytes that have been written*/ MPI_Aint total_bytes = 0; /* total bytes to be written */ MPI_Aint bytes_to_write_in_cycle = 0; /* left to be written in a cycle*/ MPI_Aint bytes_per_cycle = 0; /* total written in each cycle by each process*/ int index = 0; int cycles = 0; int i=0, j=0, l=0; int n=0; /* current position in total_bytes_per_process array */ MPI_Aint bytes_remaining = 0; /* how many bytes have been written from the current value from total_bytes_per_process */ int bytes_sent = 0, ret =0; int blocks=0, entries_per_aggregator=0; /* iovec structure and count of the buffer passed in */ uint32_t iov_count = 0; struct iovec *decoded_iov = NULL; int iov_index = 0; char *send_buf = NULL; size_t current_position = 0; struct iovec *local_iov_array=NULL, *global_iov_array=NULL; local_io_array *file_offsets_for_agg=NULL; /* global iovec at the writers that contain the iovecs created from file_set_view */ uint32_t total_fview_count = 0; int local_count = 0, temp_pindex; int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL; int current_index = 0, temp_index=0; char *global_buf = NULL; MPI_Aint global_count = 0; /* array that contains the sorted indices of the global_iov */ int *sorted = NULL, *sorted_file_offsets=NULL; int *displs = NULL; int dynamic_num_io_procs; size_t max_data = 0, datatype_size = 0; int **blocklen_per_process=NULL; MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL; ompi_datatype_t **recvtype = NULL; MPI_Aint *total_bytes_per_process = NULL; MPI_Request *send_req=NULL, *recv_req=NULL; int recv_req_count=0; #if TIME_BREAKDOWN double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0; double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0; double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0; print_entry 
nentry; #endif // if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) { // fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY; // } /************************************************************************** ** In case the data is not contigous in memory, decode it into an iovec ** **************************************************************************/ if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) { ret = ompi_io_ompio_decode_datatype (fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); if (OMPI_SUCCESS != ret ){ goto exit; } } else { max_data = count * datatype->super.size; } if ( MPI_STATUS_IGNORE != status ) { status->_ucount = max_data; } mca_io_ompio_get_num_aggregators ( &dynamic_num_io_procs ); ret = ompi_io_ompio_set_aggregator_props (fh, dynamic_num_io_procs, max_data); if (OMPI_SUCCESS != ret){ goto exit; } total_bytes_per_process = (MPI_Aint*)malloc (fh->f_procs_per_group*sizeof(MPI_Aint)); if (NULL == total_bytes_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = ompi_io_ompio_allgather_array (&max_data, 1, MPI_LONG, total_bytes_per_process, 1, MPI_LONG, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret){ goto exit; } for (i=0 ; i<fh->f_procs_per_group ; i++) { total_bytes += total_bytes_per_process[i]; } if (NULL != total_bytes_per_process) { free (total_bytes_per_process); total_bytes_per_process = NULL; } /********************************************************************* *** Generate the File offsets/lengths corresponding to this write *** ********************************************************************/ ret = ompi_io_ompio_generate_current_file_view(fh, max_data, &local_iov_array, &local_count); if (ret != OMPI_SUCCESS){ goto exit; } #if DEBUG_ON for (i=0 ; i<local_count ; i++) { printf("%d: OFFSET: %d LENGTH: %ld\n", fh->f_rank, local_iov_array[i].iov_base, local_iov_array[i].iov_len); } #endif 
/************************************************************* *** ALLGather the File View information at all processes *** *************************************************************/ fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == fview_count) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = ompi_io_ompio_allgather_array (&local_count, 1, MPI_INT, fview_count, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret){ goto exit; } displs = (int*) malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == displs) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs[0] = 0; total_fview_count = fview_count[0]; for (i=1 ; i<fh->f_procs_per_group ; i++) { total_fview_count += fview_count[i]; displs[i] = displs[i-1] + fview_count[i-1]; } #if DEBUG_ON printf("total_fview_count : %d\n", total_fview_count); if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { for (i=0 ; i<fh->f_procs_per_group ; i++) { printf ("%d: PROCESS: %d ELEMENTS: %d DISPLS: %d\n", fh->f_rank, i, fview_count[i], displs[i]); } } #endif /* allocate the global iovec */ if (0 != total_fview_count) { global_iov_array = (struct iovec*) malloc (total_fview_count * sizeof(struct iovec)); if (NULL == global_iov_array){ opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } ret = ompi_io_ompio_allgatherv_array (local_iov_array, local_count, fh->f_iov_type, global_iov_array, fview_count, displs, fh->f_iov_type, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } /* sort it */ if (0 != total_fview_count) { sorted = (int *)malloc (total_fview_count * sizeof(int)); if (NULL == sorted) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ompi_io_ompio_sort_iovec (global_iov_array, total_fview_count, sorted); 
} if (NULL != local_iov_array){ free(local_iov_array); local_iov_array = NULL; } if (NULL != displs){ free(displs); displs=NULL; } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { uint32_t tv=0; for (tv=0 ; tv<total_fview_count ; tv++) { printf("%d: OFFSET: %lld LENGTH: %ld\n", fh->f_rank, global_iov_array[sorted[tv]].iov_base, global_iov_array[sorted[tv]].iov_len); } } #endif if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*)); if (NULL == blocklen_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*)); if (NULL == displs_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for(i=0;i<fh->f_procs_per_group;i++){ blocklen_per_process[i] = NULL; displs_per_process[i] = NULL; } } mca_io_ompio_get_bytes_per_agg ( (int *)&bytes_per_cycle ); cycles = ceil((double)total_bytes/bytes_per_cycle); n = 0; bytes_remaining = 0; current_index = 0; #if TIME_BREAKDOWN start_exch = MPI_Wtime(); #endif for (index = 0; index < cycles; index++) { /* Getting ready for next cycle Initializing and freeing buffers*/ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if (NULL == recvtype){ recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *)); if (NULL == recvtype) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } for(l=0;l<fh->f_procs_per_group;l++){ disp_index[l] = 1; if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); 
displs_per_process[l] = NULL; } blocklen_per_process[l] = (int *) calloc (1, sizeof(int)); if (NULL == blocklen_per_process[l]) { opal_output (1, "OUT OF MEMORY for blocklen\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint)); if (NULL == displs_per_process[l]){ opal_output (1, "OUT OF MEMORY for displs\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if(NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements = NULL; } } if (cycles-1 == index) { bytes_to_write_in_cycle = total_bytes - bytes_per_cycle*index; } else { bytes_to_write_in_cycle = bytes_per_cycle; } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { printf ("****%d: CYCLE %d Bytes %lld**********\n", fh->f_rank, index, bytes_to_write_in_cycle); } #endif /********************************************************** **Gather the Data from all the processes at the writers ** *********************************************************/ /* Calculate how much data will be contributed in this cycle by each process*/ bytes_sent = 0; #if DEBUG_ON printf("bytes_to_write_in_cycle: %ld, cycle : %d\n", bytes_to_write_in_cycle, index); #endif /* The blocklen and displs calculation only done at aggregators!*/ while (bytes_to_write_in_cycle) { blocks = fview_count[0]; for (j=0 ; j<fh->f_procs_per_group ; j++) { if (sorted[current_index] < blocks) { n = j; break; } else { blocks += fview_count[j+1]; } } if (bytes_remaining) { if (bytes_remaining <= bytes_to_write_in_cycle) { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + 
(global_iov_array[sorted[current_index]].iov_len - bytes_remaining); } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_remaining; } current_index ++; bytes_to_write_in_cycle -= bytes_remaining; bytes_remaining = 0; if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { /* In this cases the length is consumed so allocating for next displacement and blocklength*/ blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *) realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } continue; } else { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_to_write_in_cycle; } bytes_remaining -= bytes_to_write_in_cycle; bytes_to_write_in_cycle = 0; break; } } else { if (bytes_to_write_in_cycle < (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_to_write_in_cycle; } bytes_remaining = global_iov_array[sorted[current_index]].iov_len - bytes_to_write_in_cycle; bytes_to_write_in_cycle = 0; break; } else { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = global_iov_array[sorted[current_index]].iov_len; 
displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE) global_iov_array[sorted[current_index]].iov_base; blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *)realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; /*realloc for next blocklength and assign this displacement and check for next displs as the total length of this entry has been consumed!*/ } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += global_iov_array[sorted[current_index]].iov_len; } bytes_to_write_in_cycle -= global_iov_array[sorted[current_index]].iov_len; current_index ++; continue; } } } /* Calculate the displacement on where to put the data and allocate the recieve buffer (global_buf) */ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { entries_per_aggregator=0; for (i=0;i<fh->f_procs_per_group; i++){ for (j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0) entries_per_aggregator++ ; } } #if DEBUG_ON printf("%d: cycle: %d, bytes_sent: %d\n ",fh->f_rank,index, bytes_sent); printf("%d : Entries per aggregator : %d\n",fh->f_rank,entries_per_aggregator); #endif if (entries_per_aggregator > 0){ file_offsets_for_agg = (local_io_array *) malloc(entries_per_aggregator*sizeof(local_io_array)); if (NULL == file_offsets_for_agg) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); if (NULL == sorted_file_offsets){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /*Moving file offsets to an IO array!*/ temp_index = 0; for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ file_offsets_for_agg[temp_index].length = blocklen_per_process[i][j]; 
file_offsets_for_agg[temp_index].process_id = i; file_offsets_for_agg[temp_index].offset = displs_per_process[i][j]; temp_index++; #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, blocklen_per_process[i][j],j, displs_per_process[i][j], fh->f_rank); #endif } } } } else{ continue; } /* Sort the displacements for each aggregator*/ local_heap_sort (file_offsets_for_agg, entries_per_aggregator, sorted_file_offsets); /*create contiguous memory displacements based on blocklens on the same displs array and map it to this aggregator's actual file-displacements (this is in the io-array created above)*/ memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); memory_displacements[sorted_file_offsets[0]] = 0; for (i=1; i<entries_per_aggregator; i++){ memory_displacements[sorted_file_offsets[i]] = memory_displacements[sorted_file_offsets[i-1]] + file_offsets_for_agg[sorted_file_offsets[i-1]].length; } temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int)); if (NULL == temp_disp_index) { opal_output (1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } /*Now update the displacements array with memory offsets*/ global_count = 0; for (i=0;i<entries_per_aggregator;i++){ temp_pindex = file_offsets_for_agg[sorted_file_offsets[i]].process_id; displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] = memory_displacements[sorted_file_offsets[i]]; if (temp_disp_index[temp_pindex] < disp_index[temp_pindex]) temp_disp_index[temp_pindex] += 1; else{ printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", temp_pindex, temp_disp_index[temp_pindex], temp_pindex, disp_index[temp_pindex]); } global_count += file_offsets_for_agg[sorted_file_offsets[i]].length; } if (NULL != temp_disp_index){ free(temp_disp_index); temp_disp_index = NULL; } #if DEBUG_ON printf("************Cycle: %d, 
Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, blocklen_per_process[i][j],j, displs_per_process[i][j], fh->f_rank); } } } printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0; i<entries_per_aggregator;i++){ printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld\n", file_offsets_for_agg[sorted_file_offsets[i]].process_id, file_offsets_for_agg[sorted_file_offsets[i]].offset, file_offsets_for_agg[sorted_file_offsets[i]].length, memory_displacements[sorted_file_offsets[i]]); } printf("%d : global_count : %ld, bytes_sent : %d\n", fh->f_rank,global_count, bytes_sent); #endif #if TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif global_buf = (char *) malloc (global_count); if (NULL == global_buf){ opal_output(1, "OUT OF MEMORY"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } recv_req_count = 0; for (i=0;i<fh->f_procs_per_group; i++){ ompi_datatype_create_hindexed(disp_index[i], blocklen_per_process[i], displs_per_process[i], MPI_BYTE, &recvtype[i]); ompi_datatype_commit(&recvtype[i]); opal_datatype_type_size(&recvtype[i]->super, &datatype_size); if (datatype_size){ recv_req = (MPI_Request *)realloc ((void *)recv_req, (recv_req_count + 1)*sizeof(MPI_Request)); ret = MCA_PML_CALL(irecv(global_buf, 1, recvtype[i], fh->f_procs_in_group[i], 123, fh->f_comm, &recv_req[recv_req_count])); recv_req_count++; if (OMPI_SUCCESS != ret){ goto exit; } } } } if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) { send_buf = &((char*)buf)[total_bytes_written]; } else if (bytes_sent) { /* allocate a send buffer and copy the data that needs to be sent into it in case the data is non-contigous in memory */ OPAL_PTRDIFF_TYPE mem_address; size_t remaining = 0; size_t temp_position = 0; send_buf = malloc (bytes_sent); if (NULL == send_buf) { opal_output (1, "OUT 
OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } remaining = bytes_sent; while (remaining) { mem_address = (OPAL_PTRDIFF_TYPE) (decoded_iov[iov_index].iov_base) + current_position; if (remaining >= (decoded_iov[iov_index].iov_len - current_position)) { memcpy (send_buf+temp_position, (IOVBASE_TYPE *)mem_address, decoded_iov[iov_index].iov_len - current_position); remaining = remaining - (decoded_iov[iov_index].iov_len - current_position); temp_position = temp_position + (decoded_iov[iov_index].iov_len - current_position); iov_index = iov_index + 1; current_position = 0; } else { memcpy (send_buf+temp_position, (IOVBASE_TYPE *) mem_address, remaining); current_position = current_position + remaining; remaining = 0; } } } total_bytes_written += bytes_sent; /* Gather the sendbuf from each process in appropritate locations in aggregators*/ send_req = (MPI_Request *) malloc (sizeof(MPI_Request)); if (NULL == send_req){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } if (bytes_sent){ ret = MCA_PML_CALL(isend(send_buf, bytes_sent, MPI_BYTE, fh->f_procs_in_group[fh->f_aggregator_index], 123, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, send_req)); if ( OMPI_SUCCESS != ret ){ goto exit; } ret = ompi_request_wait(send_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { ret = ompi_request_wait_all (recv_req_count, recv_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){ printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0 ; i<global_count/4 ; i++) printf (" RECV %d \n",((int *)global_buf)[i]); } #endif if (! 
(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) { if (NULL != send_buf) { free (send_buf); send_buf = NULL; } } #if TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += (end_comm_time - start_comm_time); #endif /********************************************************** **************** DONE GATHERING OF DATA ****************** *********************************************************/ /********************************************************** ******* Create the io array, and pass it to fbtl ********* *********************************************************/ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { #if TIME_BREAKDOWN start_write_time = MPI_Wtime(); #endif fh->f_io_array = (mca_io_ompio_io_array_t *) malloc (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } fh->f_num_of_io_entries = 0; /*First entry for every aggregator*/ fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[0]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]]; fh->f_num_of_io_entries++; for (i=1;i<entries_per_aggregator;i++){ /* If the enrties are contiguous merge them, else make a new entry */ if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + file_offsets_for_agg[sorted_file_offsets[i-1]].length == file_offsets_for_agg[sorted_file_offsets[i]].offset){ fh->f_io_array[fh->f_num_of_io_entries - 1].length += file_offsets_for_agg[sorted_file_offsets[i]].length; } else { fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[i]].length; 
fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[i]]; fh->f_num_of_io_entries++; } } #if DEBUG_ON printf("*************************** %d\n", fh->f_num_of_io_entries); for (i=0 ; i<fh->f_num_of_io_entries ; i++) { printf(" ADDRESS: %p OFFSET: %ld LENGTH: %ld\n", fh->f_io_array[i].memory_address, (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset, fh->f_io_array[i].length); } #endif if (fh->f_num_of_io_entries) { if ( 0 > fh->f_fbtl->fbtl_pwritev (fh)) { opal_output (1, "WRITE FAILED\n"); ret = OMPI_ERROR; goto exit; } } #if TIME_BREAKDOWN end_write_time = MPI_Wtime(); write_time += end_write_time - start_write_time; #endif } if (NULL != send_req){ free(send_req); send_req = NULL; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } for (i =0; i< fh->f_procs_per_group; i++) ompi_datatype_destroy(recvtype+i); if (NULL != recvtype){ free(recvtype); recvtype=NULL; } if (NULL != recv_req){ free(recv_req); recv_req = NULL; } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } } } #if TIME_BREAKDOWN end_exch = MPI_Wtime(); exch_write += end_exch - start_exch; nentry.time[0] = write_time; nentry.time[1] = comm_time; nentry.time[2] = exch_write; if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) nentry.aggregator = 1; else nentry.aggregator = 0; nentry.nprocs_for_coll = dynamic_num_io_procs; if (!ompi_io_ompio_full_print_queue(WRITE_PRINT_QUEUE)){ ompi_io_ompio_register_print_entry(WRITE_PRINT_QUEUE, nentry); } #endif exit : if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } if (NULL != disp_index){ free(disp_index); disp_index = NULL; } if (NULL != recvtype){ free(recvtype); recvtype=NULL; } if (NULL != recv_req){ free(recv_req); recv_req = NULL; } if (NULL != global_buf) { free 
(global_buf); global_buf = NULL; } for(l=0;l<fh->f_procs_per_group;l++){ if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } } if (NULL != blocklen_per_process){ free(blocklen_per_process); blocklen_per_process = NULL; } if (NULL != displs_per_process){ free(displs_per_process); displs_per_process = NULL; } } if (NULL != sorted) { free (sorted); sorted = NULL; } if (NULL != global_iov_array) { free (global_iov_array); global_iov_array = NULL; } if (NULL != fview_count) { free (fview_count); fview_count = NULL; } if (NULL != decoded_iov) { free (decoded_iov); decoded_iov = NULL; } if (NULL != send_req){ free(send_req); send_req = NULL; } return OMPI_SUCCESS; }
/*
 * Install a new file view on an open file.
 *
 * The previous view (etype, filetype, original filetype, decoded iovec and
 * data representation string) is discarded, the flags are reset so that only
 * OMPIO_FILE_VIEW_IS_SET remains, and the new view is installed through
 * mca_io_ompio_set_view_internal().  When etype and filetype are the same
 * predefined datatype whose extent equals its size, a contiguous byte type
 * of MCA_IO_DEFAULT_FILE_VIEW_SIZE elements is used as the filetype instead.
 * Finally an fcoll component is (re)selected for the file.
 *
 * Returns: OMPI_SUCCESS, or OMPI_ERROR when the fcoll selection fails.
 */
int mca_io_ompio_file_set_view (ompi_file_t *fp,
                                OMPI_MPI_OFFSET_TYPE disp,
                                ompi_datatype_t *etype,
                                ompi_datatype_t *filetype,
                                char *datarep,
                                ompi_info_t *info)
{
    mca_io_ompio_data_t *io_data = (mca_io_ompio_data_t *) fp->f_io_selected_data;
    mca_io_ompio_file_t *ompio_fh = &io_data->ompio_fh;
    size_t filetype_size;
    OPAL_PTRDIFF_TYPE filetype_extent, filetype_lb;
    int use_default_view;

    /* drop everything that belongs to the previous view */
    ompi_datatype_destroy (&ompio_fh->f_etype);
    ompi_datatype_destroy (&ompio_fh->f_filetype);
    ompi_datatype_destroy (&ompio_fh->f_orig_filetype);

    free (ompio_fh->f_decoded_iov);
    ompio_fh->f_decoded_iov = NULL;

    free (ompio_fh->f_datarep);
    ompio_fh->f_datarep = NULL;

    /* all flags are cleared, only the "view is set" marker remains */
    ompio_fh->f_flags = OMPIO_FILE_VIEW_IS_SET;
    ompio_fh->f_datarep = strdup (datarep);
    ompi_datatype_duplicate (filetype, &ompio_fh->f_orig_filetype );

    opal_datatype_get_extent(&filetype->super, &filetype_lb, &filetype_extent);
    opal_datatype_type_size (&filetype->super, &filetype_size);

    use_default_view = (etype == filetype) &&
        ompi_datatype_is_predefined (filetype ) &&
        (filetype_extent == (OPAL_PTRDIFF_TYPE)filetype_size);

    if (!use_default_view) {
        mca_io_ompio_set_view_internal (ompio_fh, disp, etype, filetype, datarep, info);
    }
    else {
        /* replace the predefined filetype by a large contiguous byte type */
        ompi_datatype_t *default_filetype;

        ompi_datatype_create_contiguous(MCA_IO_DEFAULT_FILE_VIEW_SIZE,
                                        &ompi_mpi_byte.dt,
                                        &default_filetype);
        ompi_datatype_commit (&default_filetype);
        mca_io_ompio_set_view_internal (ompio_fh, disp, etype, default_filetype, datarep, info);
        ompi_datatype_destroy ( &default_filetype );
    }

    if (OMPI_SUCCESS == mca_fcoll_base_file_select (&io_data->ompio_fh, NULL)) {
        return OMPI_SUCCESS;
    }

    opal_output(1, "mca_fcoll_base_file_select() failed\n");
    return OMPI_ERROR;
}
static int unpack_ooo(void) { ompi_datatype_t * t1; ompi_datatype_t * t2; ompi_datatype_t * type[2]; ompi_datatype_t * newtype; MPI_Aint disp[2]; int len[2], rc; rc = ompi_datatype_create_vector(2, 1, 2, MPI_INT, &t1); if (OMPI_SUCCESS != rc) { fprintf(stderr, "could not create vector t1\n"); return 1; } rc = ompi_datatype_commit (&t1); if (OMPI_SUCCESS != rc) { fprintf(stderr, "could not commit vector t1\n"); return 1; } rc = ompi_datatype_create_vector(2, 1, 2, MPI_DOUBLE, &t2); if (OMPI_SUCCESS != rc) { fprintf(stderr, "could not create vector t2\n"); return 1; } rc = ompi_datatype_commit (&t2); if (OMPI_SUCCESS != rc) { fprintf(stderr, "could not commit vector t2\n"); return 1; } /* * btl=0x7f7823672580 bytes_received=992 data_offset=0 * btl=0x7f7823260420 bytes_received=1325 data_offset=992 * btl=0x7f7823672580 bytes_received=992 data_offset=2317 * btl=0x7f7823672580 bytes_received=992 data_offset=3309 * btl=0x7f7823672580 bytes_received=992 data_offset=4301 * btl=0x7f7823672580 bytes_received=992 data_offset=5293 * btl=0x7f7823672580 bytes_received=992 data_offset=6285 * btl=0x7f7823672580 bytes_received=667 data_offset=7277 */ size_t test1[9][2] = { {992, 0}, {1325, 992}, {992, 2317}, {992, 3309}, {992, 4301}, {992, 5293}, {992, 6285}, {667, 7277}, {0, -1}, }; /* * btl=0x7f80bc545580 bytes_received=992 data_offset=0 * btl=0x7f80bc545580 bytes_received=992 data_offset=2317 * btl=0x7f80bc545580 bytes_received=992 data_offset=3309 * btl=0x7f80bc545580 bytes_received=992 data_offset=4301 * btl=0x7f80bc545580 bytes_received=992 data_offset=5293 * btl=0x7f80bc545580 bytes_received=992 data_offset=6285 * btl=0x7f80bc133420 bytes_received=1325 data_offset=992 * btl=0x7f80bc545580 bytes_received=667 data_offset=7277 */ size_t test2[9][2] = { {992, 0}, {992, 2317}, {992, 3309}, {992, 4301}, {992, 5293}, {992, 6285}, {1325, 992}, {667, 7277}, {0, -1}, }; /* trimmed version of test2 */ size_t test3[9][2] = { {992, 0}, {4960, 2317}, {1325, 992}, {667, 7277}, {0, -1}, }; 
/* an other test case */ size_t test4[9][2] = { {992, 0}, {992, 2976}, {992, 1984}, {992, 992}, {3976, 3968}, {0, -1}, }; disp[0] = (long)(&foo.i[0]) - (long)&foo; disp[1] = (long)(&foo.d[0]) - (long)&foo; type[0] = t1; type[1] = t2; len[0] = 1; len[1] = 1; rc = ompi_datatype_create_struct(2, len, disp, type, &newtype); if (OMPI_SUCCESS != rc) { fprintf(stderr, "could not create struct\n"); return 1; } rc = ompi_datatype_commit (&newtype); if (OMPI_SUCCESS != rc) { fprintf(stderr, "could not create struct\n"); return 1; } pbar = (struct pfoo_t *)malloc (N * sizeof(struct pfoo_t)); if (NULL == pbar) { fprintf(stderr, "could not malloc pbar\n"); return 1; } bar = (struct foo_t *)malloc (N * sizeof(struct foo_t)); if (NULL == bar) { fprintf(stderr, "could not malloc bar\n"); return 1; } if (0 != testcase(newtype, test1)) { printf ("test1 failed\n"); return 2; } if (0 != testcase(newtype, test2)) { printf ("test2 failed\n"); return 2; } if (0 != testcase(newtype, test3)) { printf ("test3 failed\n"); return 2; } if (0 != testcase(newtype, test4)) { printf ("test4 failed\n"); return 2; } /* test the automatic destruction pf the data */ ompi_datatype_destroy( &newtype ); assert( newtype == NULL ); return rc; }
int mca_fcoll_dynamic_file_write_all (ompio_file_t *fh, const void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { MPI_Aint total_bytes_written = 0; /* total bytes that have been written*/ MPI_Aint total_bytes = 0; /* total bytes to be written */ MPI_Aint bytes_to_write_in_cycle = 0; /* left to be written in a cycle*/ MPI_Aint bytes_per_cycle = 0; /* total written in each cycle by each process*/ int index = 0; int cycles = 0; int i=0, j=0, l=0; int n=0; /* current position in total_bytes_per_process array */ MPI_Aint bytes_remaining = 0; /* how many bytes have been written from the current value from total_bytes_per_process */ int bytes_sent = 0, ret =0; int blocks=0, entries_per_aggregator=0; /* iovec structure and count of the buffer passed in */ uint32_t iov_count = 0; struct iovec *decoded_iov = NULL; int iov_index = 0; char *send_buf = NULL; size_t current_position = 0; struct iovec *local_iov_array=NULL, *global_iov_array=NULL; mca_io_ompio_local_io_array *file_offsets_for_agg=NULL; /* global iovec at the writers that contain the iovecs created from file_set_view */ uint32_t total_fview_count = 0; int local_count = 0, temp_pindex; int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL; int current_index = 0, temp_index=0; char *global_buf = NULL; MPI_Aint global_count = 0; /* array that contains the sorted indices of the global_iov */ int *sorted = NULL, *sorted_file_offsets=NULL; int *displs = NULL; int dynamic_num_io_procs; size_t max_data = 0, datatype_size = 0; int **blocklen_per_process=NULL; MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL; ompi_datatype_t **recvtype = NULL; MPI_Aint *total_bytes_per_process = NULL; MPI_Request send_req=NULL, *recv_req=NULL; int my_aggregator=-1; bool sendbuf_is_contiguous = false; size_t ftype_size; ptrdiff_t ftype_extent, lb; #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0; double comm_time = 0.0, 
start_comm_time = 0.0, end_comm_time = 0.0; double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0; mca_common_ompio_print_entry nentry; #endif opal_datatype_type_size ( &datatype->super, &ftype_size ); opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent ); /************************************************************************** ** 1. In case the data is not contigous in memory, decode it into an iovec **************************************************************************/ if ( ( ftype_extent == (ptrdiff_t) ftype_size) && opal_datatype_is_contiguous_memory_layout(&datatype->super,1) && 0 == lb ) { sendbuf_is_contiguous = true; } if (! sendbuf_is_contiguous ) { ret = mca_common_ompio_decode_datatype ((struct ompio_file_t *) fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); if (OMPI_SUCCESS != ret ){ goto exit; } } else { max_data = count * datatype->super.size; } if ( MPI_STATUS_IGNORE != status ) { status->_ucount = max_data; } dynamic_num_io_procs = fh->f_get_mca_parameter_value ( "num_aggregators", strlen ("num_aggregators")); if ( OMPI_ERR_MAX == dynamic_num_io_procs ) { ret = OMPI_ERROR; goto exit; } ret = mca_common_ompio_set_aggregator_props ((struct ompio_file_t *) fh, dynamic_num_io_procs, max_data); if (OMPI_SUCCESS != ret){ goto exit; } my_aggregator = fh->f_procs_in_group[0]; /************************************************************************** ** 2. 
Determine the total amount of data to be written **************************************************************************/ total_bytes_per_process = (MPI_Aint*)malloc (fh->f_procs_per_group*sizeof(MPI_Aint)); if (NULL == total_bytes_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif ret = ompi_fcoll_base_coll_allgather_array (&max_data, 1, MPI_LONG, total_bytes_per_process, 1, MPI_LONG, 0, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += (end_comm_time - start_comm_time); #endif for (i=0 ; i<fh->f_procs_per_group ; i++) { total_bytes += total_bytes_per_process[i]; } if (NULL != total_bytes_per_process) { free (total_bytes_per_process); total_bytes_per_process = NULL; } /********************************************************************* *** 3. Generate the local offsets/lengths array corresponding to *** this write operation ********************************************************************/ ret = fh->f_generate_current_file_view( (struct ompio_file_t *) fh, max_data, &local_iov_array, &local_count); if (ret != OMPI_SUCCESS){ goto exit; } #if DEBUG_ON for (i=0 ; i<local_count ; i++) { printf("%d: OFFSET: %d LENGTH: %ld\n", fh->f_rank, local_iov_array[i].iov_base, local_iov_array[i].iov_len); } #endif /************************************************************* *** 4. 
Allgather the offset/lengths array from all processes *************************************************************/ fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == fview_count) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif ret = ompi_fcoll_base_coll_allgather_array (&local_count, 1, MPI_INT, fview_count, 1, MPI_INT, 0, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += (end_comm_time - start_comm_time); #endif displs = (int*) malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == displs) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs[0] = 0; total_fview_count = fview_count[0]; for (i=1 ; i<fh->f_procs_per_group ; i++) { total_fview_count += fview_count[i]; displs[i] = displs[i-1] + fview_count[i-1]; } #if DEBUG_ON printf("total_fview_count : %d\n", total_fview_count); if (my_aggregator == fh->f_rank) { for (i=0 ; i<fh->f_procs_per_group ; i++) { printf ("%d: PROCESS: %d ELEMENTS: %d DISPLS: %d\n", fh->f_rank, i, fview_count[i], displs[i]); } } #endif /* allocate the global iovec */ if (0 != total_fview_count) { global_iov_array = (struct iovec*) malloc (total_fview_count * sizeof(struct iovec)); if (NULL == global_iov_array){ opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif ret = ompi_fcoll_base_coll_allgatherv_array (local_iov_array, local_count, fh->f_iov_type, global_iov_array, fview_count, displs, fh->f_iov_type, 0, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += (end_comm_time - start_comm_time); #endif 
/**************************************************************************************** *** 5. Sort the global offset/lengths list based on the offsets. *** The result of the sort operation is the 'sorted', an integer array, *** which contains the indexes of the global_iov_array based on the offset. *** For example, if global_iov_array[x].offset is followed by global_iov_array[y].offset *** in the file, and that one is followed by global_iov_array[z].offset, than *** sorted[0] = x, sorted[1]=y and sorted[2]=z; ******************************************************************************************/ if (0 != total_fview_count) { sorted = (int *)malloc (total_fview_count * sizeof(int)); if (NULL == sorted) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ompi_fcoll_base_sort_iovec (global_iov_array, total_fview_count, sorted); } if (NULL != local_iov_array){ free(local_iov_array); local_iov_array = NULL; } if (NULL != displs){ free(displs); displs=NULL; } #if DEBUG_ON if (my_aggregator == fh->f_rank) { uint32_t tv=0; for (tv=0 ; tv<total_fview_count ; tv++) { printf("%d: OFFSET: %lld LENGTH: %ld\n", fh->f_rank, global_iov_array[sorted[tv]].iov_base, global_iov_array[sorted[tv]].iov_len); } } #endif /************************************************************* *** 6. 
Determine the number of cycles required to execute this *** operation *************************************************************/ bytes_per_cycle = fh->f_bytes_per_agg; cycles = ceil((double)total_bytes/bytes_per_cycle); if (my_aggregator == fh->f_rank) { disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*)); if (NULL == blocklen_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*)); if (NULL == displs_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } recv_req = (MPI_Request *)malloc ((fh->f_procs_per_group)*sizeof(MPI_Request)); if ( NULL == recv_req ) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } global_buf = (char *) malloc (bytes_per_cycle); if (NULL == global_buf){ opal_output(1, "OUT OF MEMORY"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *)); if (NULL == recvtype) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for(l=0;l<fh->f_procs_per_group;l++){ recvtype[l] = MPI_DATATYPE_NULL; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_exch = MPI_Wtime(); #endif n = 0; bytes_remaining = 0; current_index = 0; for (index = 0; index < cycles; index++) { /********************************************************************** *** 7a. 
Getting ready for next cycle: initializing and freeing buffers **********************************************************************/ if (my_aggregator == fh->f_rank) { if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } fh->f_num_of_io_entries = 0; if (NULL != recvtype){ for (i =0; i< fh->f_procs_per_group; i++) { if ( MPI_DATATYPE_NULL != recvtype[i] ) { ompi_datatype_destroy(&recvtype[i]); recvtype[i] = MPI_DATATYPE_NULL; } } } for(l=0;l<fh->f_procs_per_group;l++){ disp_index[l] = 1; free(blocklen_per_process[l]); free(displs_per_process[l]); blocklen_per_process[l] = (int *) calloc (1, sizeof(int)); displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint)); if (NULL == displs_per_process[l] || NULL == blocklen_per_process[l]){ opal_output (1, "OUT OF MEMORY for displs\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if(NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements = NULL; } } /* (my_aggregator == fh->f_rank */ /************************************************************************** *** 7b. 
Determine the number of bytes to be actually written in this cycle **************************************************************************/ if (cycles-1 == index) { bytes_to_write_in_cycle = total_bytes - bytes_per_cycle*index; } else { bytes_to_write_in_cycle = bytes_per_cycle; } #if DEBUG_ON if (my_aggregator == fh->f_rank) { printf ("****%d: CYCLE %d Bytes %lld**********\n", fh->f_rank, index, bytes_to_write_in_cycle); } #endif /********************************************************** **Gather the Data from all the processes at the writers ** *********************************************************/ #if DEBUG_ON printf("bytes_to_write_in_cycle: %ld, cycle : %d\n", bytes_to_write_in_cycle, index); #endif /***************************************************************** *** 7c. Calculate how much data will be contributed in this cycle *** by each process *****************************************************************/ bytes_sent = 0; /* The blocklen and displs calculation only done at aggregators!*/ while (bytes_to_write_in_cycle) { /* This next block identifies which process is the holder ** of the sorted[current_index] element; */ blocks = fview_count[0]; for (j=0 ; j<fh->f_procs_per_group ; j++) { if (sorted[current_index] < blocks) { n = j; break; } else { blocks += fview_count[j+1]; } } if (bytes_remaining) { /* Finish up a partially used buffer from the previous cycle */ if (bytes_remaining <= bytes_to_write_in_cycle) { /* The data fits completely into the block */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining; displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); /* In this cases the length is consumed so allocating for next displacement and blocklength*/ blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); 
displs_per_process[n] = (MPI_Aint *) realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_remaining; } current_index ++; bytes_to_write_in_cycle -= bytes_remaining; bytes_remaining = 0; continue; } else { /* the remaining data from the previous cycle is larger than the bytes_to_write_in_cycle, so we have to segment again */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle; displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_to_write_in_cycle; } bytes_remaining -= bytes_to_write_in_cycle; bytes_to_write_in_cycle = 0; break; } } else { /* No partially used entry available, have to start a new one */ if (bytes_to_write_in_cycle < (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) { /* This entry has more data than we can sendin one cycle */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle; displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base ; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_to_write_in_cycle; } bytes_remaining = global_iov_array[sorted[current_index]].iov_len - bytes_to_write_in_cycle; bytes_to_write_in_cycle = 0; break; } else { /* Next data entry is less than bytes_to_write_in_cycle */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = global_iov_array[sorted[current_index]].iov_len; displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t) global_iov_array[sorted[current_index]].iov_base; /*realloc for next blocklength and assign this displacement and check 
for next displs as the total length of this entry has been consumed!*/ blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *)realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += global_iov_array[sorted[current_index]].iov_len; } bytes_to_write_in_cycle -= global_iov_array[sorted[current_index]].iov_len; current_index ++; continue; } } } /************************************************************************* *** 7d. Calculate the displacement on where to put the data and allocate *** the recieve buffer (global_buf) *************************************************************************/ if (my_aggregator == fh->f_rank) { entries_per_aggregator=0; for (i=0;i<fh->f_procs_per_group; i++){ for (j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0) entries_per_aggregator++ ; } } #if DEBUG_ON printf("%d: cycle: %d, bytes_sent: %d\n ",fh->f_rank,index, bytes_sent); printf("%d : Entries per aggregator : %d\n",fh->f_rank,entries_per_aggregator); #endif if (entries_per_aggregator > 0){ file_offsets_for_agg = (mca_io_ompio_local_io_array *) malloc(entries_per_aggregator*sizeof(mca_io_ompio_local_io_array)); if (NULL == file_offsets_for_agg) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); if (NULL == sorted_file_offsets){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /*Moving file offsets to an IO array!*/ temp_index = 0; for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ file_offsets_for_agg[temp_index].length = blocklen_per_process[i][j]; file_offsets_for_agg[temp_index].process_id = i; 
file_offsets_for_agg[temp_index].offset = displs_per_process[i][j]; temp_index++; #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, blocklen_per_process[i][j],j, displs_per_process[i][j], fh->f_rank); #endif } } } } else{ continue; } /* Sort the displacements for each aggregator*/ local_heap_sort (file_offsets_for_agg, entries_per_aggregator, sorted_file_offsets); /*create contiguous memory displacements based on blocklens on the same displs array and map it to this aggregator's actual file-displacements (this is in the io-array created above)*/ memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); memory_displacements[sorted_file_offsets[0]] = 0; for (i=1; i<entries_per_aggregator; i++){ memory_displacements[sorted_file_offsets[i]] = memory_displacements[sorted_file_offsets[i-1]] + file_offsets_for_agg[sorted_file_offsets[i-1]].length; } temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int)); if (NULL == temp_disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /*Now update the displacements array with memory offsets*/ global_count = 0; for (i=0;i<entries_per_aggregator;i++){ temp_pindex = file_offsets_for_agg[sorted_file_offsets[i]].process_id; displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] = memory_displacements[sorted_file_offsets[i]]; if (temp_disp_index[temp_pindex] < disp_index[temp_pindex]) temp_disp_index[temp_pindex] += 1; else{ printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", temp_pindex, temp_disp_index[temp_pindex], temp_pindex, disp_index[temp_pindex]); } global_count += file_offsets_for_agg[sorted_file_offsets[i]].length; } if (NULL != temp_disp_index){ free(temp_disp_index); temp_disp_index = NULL; } #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", 
index+1,fh->f_rank); for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, blocklen_per_process[i][j],j, displs_per_process[i][j], fh->f_rank); } } } printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0; i<entries_per_aggregator;i++){ printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld\n", file_offsets_for_agg[sorted_file_offsets[i]].process_id, file_offsets_for_agg[sorted_file_offsets[i]].offset, file_offsets_for_agg[sorted_file_offsets[i]].length, memory_displacements[sorted_file_offsets[i]]); } printf("%d : global_count : %ld, bytes_sent : %d\n", fh->f_rank,global_count, bytes_sent); #endif #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif /************************************************************************* *** 7e. Perform the actual communication *************************************************************************/ for (i=0;i<fh->f_procs_per_group; i++) { recv_req[i] = MPI_REQUEST_NULL; if ( 0 < disp_index[i] ) { ompi_datatype_create_hindexed(disp_index[i], blocklen_per_process[i], displs_per_process[i], MPI_BYTE, &recvtype[i]); ompi_datatype_commit(&recvtype[i]); opal_datatype_type_size(&recvtype[i]->super, &datatype_size); if (datatype_size){ ret = MCA_PML_CALL(irecv(global_buf, 1, recvtype[i], fh->f_procs_in_group[i], 123, fh->f_comm, &recv_req[i])); if (OMPI_SUCCESS != ret){ goto exit; } } } } } /* end if (my_aggregator == fh->f_rank ) */ if ( sendbuf_is_contiguous ) { send_buf = &((char*)buf)[total_bytes_written]; } else if (bytes_sent) { /* allocate a send buffer and copy the data that needs to be sent into it in case the data is non-contigous in memory */ ptrdiff_t mem_address; size_t remaining = 0; size_t temp_position = 0; send_buf = malloc (bytes_sent); if (NULL == send_buf) { opal_output (1, "OUT OF MEMORY\n"); ret = 
OMPI_ERR_OUT_OF_RESOURCE; goto exit; } remaining = bytes_sent; while (remaining) { mem_address = (ptrdiff_t) (decoded_iov[iov_index].iov_base) + current_position; if (remaining >= (decoded_iov[iov_index].iov_len - current_position)) { memcpy (send_buf+temp_position, (IOVBASE_TYPE *)mem_address, decoded_iov[iov_index].iov_len - current_position); remaining = remaining - (decoded_iov[iov_index].iov_len - current_position); temp_position = temp_position + (decoded_iov[iov_index].iov_len - current_position); iov_index = iov_index + 1; current_position = 0; } else { memcpy (send_buf+temp_position, (IOVBASE_TYPE *) mem_address, remaining); current_position = current_position + remaining; remaining = 0; } } } total_bytes_written += bytes_sent; /* Gather the sendbuf from each process in appropritate locations in aggregators*/ if (bytes_sent){ ret = MCA_PML_CALL(isend(send_buf, bytes_sent, MPI_BYTE, my_aggregator, 123, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, &send_req)); if ( OMPI_SUCCESS != ret ){ goto exit; } ret = ompi_request_wait(&send_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } if (my_aggregator == fh->f_rank) { ret = ompi_request_wait_all (fh->f_procs_per_group, recv_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } #if DEBUG_ON if (my_aggregator == fh->f_rank){ printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0 ; i<global_count/4 ; i++) printf (" RECV %d \n",((int *)global_buf)[i]); } #endif if (! sendbuf_is_contiguous) { if (NULL != send_buf) { free (send_buf); send_buf = NULL; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += (end_comm_time - start_comm_time); #endif /********************************************************** *** 7f. 
Create the io array, and pass it to fbtl *********************************************************/ if (my_aggregator == fh->f_rank) { #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_write_time = MPI_Wtime(); #endif fh->f_io_array = (mca_common_ompio_io_array_t *) malloc (entries_per_aggregator * sizeof (mca_common_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_num_of_io_entries = 0; /*First entry for every aggregator*/ fh->f_io_array[0].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; fh->f_io_array[0].length = file_offsets_for_agg[sorted_file_offsets[0]].length; fh->f_io_array[0].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]]; fh->f_num_of_io_entries++; for (i=1;i<entries_per_aggregator;i++){ /* If the enrties are contiguous merge them, else make a new entry */ if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + file_offsets_for_agg[sorted_file_offsets[i-1]].length == file_offsets_for_agg[sorted_file_offsets[i]].offset){ fh->f_io_array[fh->f_num_of_io_entries - 1].length += file_offsets_for_agg[sorted_file_offsets[i]].length; } else { fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[i]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[i]]; fh->f_num_of_io_entries++; } } #if DEBUG_ON printf("*************************** %d\n", fh->f_num_of_io_entries); for (i=0 ; i<fh->f_num_of_io_entries ; i++) { printf(" ADDRESS: %p OFFSET: %ld LENGTH: %ld\n", fh->f_io_array[i].memory_address, (ptrdiff_t)fh->f_io_array[i].offset, fh->f_io_array[i].length); } #endif if (fh->f_num_of_io_entries) { if ( 0 > fh->f_fbtl->fbtl_pwritev (fh)) { opal_output (1, "WRITE FAILED\n"); ret = OMPI_ERROR; goto 
exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_write_time = MPI_Wtime(); write_time += end_write_time - start_write_time; #endif } /* end if (my_aggregator == fh->f_rank) */ } /* end for (index = 0; index < cycles; index++) */ #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_exch = MPI_Wtime(); exch_write += end_exch - start_exch; nentry.time[0] = write_time; nentry.time[1] = comm_time; nentry.time[2] = exch_write; if (my_aggregator == fh->f_rank) nentry.aggregator = 1; else nentry.aggregator = 0; nentry.nprocs_for_coll = dynamic_num_io_procs; if (!mca_common_ompio_full_print_queue(fh->f_coll_write_time)){ mca_common_ompio_register_print_entry(fh->f_coll_write_time, nentry); } #endif exit : if (my_aggregator == fh->f_rank) { if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if(NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements = NULL; } if (NULL != recvtype){ for (i =0; i< fh->f_procs_per_group; i++) { if ( MPI_DATATYPE_NULL != recvtype[i] ) { ompi_datatype_destroy(&recvtype[i]); } } free(recvtype); recvtype=NULL; } if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } if (NULL != disp_index){ free(disp_index); disp_index = NULL; } if (NULL != recvtype){ free(recvtype); recvtype=NULL; } if (NULL != recv_req){ free(recv_req); recv_req = NULL; } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } for(l=0;l<fh->f_procs_per_group;l++){ if (NULL != blocklen_per_process){ free(blocklen_per_process[l]); } if (NULL != displs_per_process){ free(displs_per_process[l]); } } free(blocklen_per_process); free(displs_per_process); } if (NULL != displs){ free(displs); displs=NULL; } if (! 
sendbuf_is_contiguous) { if (NULL != send_buf) { free (send_buf); send_buf = NULL; } } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } if (NULL != sorted) { free (sorted); sorted = NULL; } if (NULL != global_iov_array) { free (global_iov_array); global_iov_array = NULL; } if (NULL != fview_count) { free (fview_count); fview_count = NULL; } if (NULL != decoded_iov) { free (decoded_iov); decoded_iov = NULL; } return OMPI_SUCCESS; }
/*
 *	allgatherv_inter
 *
 *	Function:	- allgatherv using other MPI collectives
 *	Accepts:	- same as MPI_Allgatherv()
 *	Returns:	- MPI_SUCCESS or error code
 *
 *	Steps, as implemented below:
 *	  1. gather every local process' scount at local rank 0
 *	  2. gatherv the local contributions into a temporary buffer at
 *	     local rank 0
 *	  3. the two rank-0 processes exchange their aggregated buffers;
 *	     the incoming data is received through an indexed datatype
 *	     built from rcounts/disps
 *	  4. rank 0 broadcasts the received data on the local communicator
 *
 *	The 'module' argument is not used by this implementation.
 */
int
mca_coll_inter_allgatherv_inter(const void *sbuf, int scount,
                                struct ompi_datatype_t *sdtype,
                                void *rbuf, const int *rcounts, const int *disps,
                                struct ompi_datatype_t *rdtype,
                                struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module)
{
    int i, rank, size, size_local, total=0, err;
    int *count=NULL,*displace=NULL;
    char *ptmp_free=NULL, *ptmp=NULL;
    ompi_datatype_t *ndtype = NULL;

    rank = ompi_comm_rank(comm);
    size_local = ompi_comm_size(comm->c_local_comm);
    size = ompi_comm_remote_size(comm);

    if (0 == rank) {
        count = (int *)malloc(sizeof(int) * size_local);
        displace = (int *)malloc(sizeof(int) * size_local);
        if ((NULL == count) || (NULL == displace)) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

    /* Local gather to get the scount of each process */
    err = comm->c_local_comm->c_coll->coll_gather(&scount, 1, MPI_INT,
                                                  count, 1, MPI_INT,
                                                  0, comm->c_local_comm,
                                                  comm->c_local_comm->c_coll->coll_gather_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    if(0 == rank) {
        /* displacements are the prefix sums of the gathered counts */
        displace[0] = 0;
        for (i = 1; i < size_local; i++) {
            displace[i] = displace[i-1] + count[i-1];
        }

        total = 0;
        for (i = 0; i < size_local; i++) {
            total = total + count[i];
        }

        if ( total > 0 ) {
            ptrdiff_t gap, span;

            span = opal_datatype_span(&sdtype->super, total, &gap);
            ptmp_free = (char*)malloc(span);
            if (NULL == ptmp_free) {
                err = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            /* shift by the gap reported by opal_datatype_span(); only
             * ptmp_free may be handed to free() */
            ptmp = ptmp_free - gap;
        }
    }

    err = comm->c_local_comm->c_coll->coll_gatherv(sbuf, scount, sdtype,
                                                   ptmp, count, displace,
                                                   sdtype,0, comm->c_local_comm,
                                                   comm->c_local_comm->c_coll->coll_gatherv_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    /* Datatype describing where the remote group's data lands in rbuf.
     * A failure here must not be ignored: ndtype would otherwise be passed
     * on NULL or uncommitted to the exchange and the broadcast. */
    err = ompi_datatype_create_indexed(size,rcounts,disps,rdtype,&ndtype);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }
    err = ompi_datatype_commit(&ndtype);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    if (0 == rank) {
        /* Exchange data between roots */
        err = ompi_coll_base_sendrecv_actual(ptmp, total, sdtype, 0,
                                             MCA_COLL_BASE_TAG_ALLGATHERV,
                                             rbuf, 1, ndtype, 0,
                                             MCA_COLL_BASE_TAG_ALLGATHERV,
                                             comm, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }

    /* bcast the message to all the local processes */
    err = comm->c_local_comm->c_coll->coll_bcast(rbuf, 1, ndtype,
                                                 0, comm->c_local_comm,
                                                 comm->c_local_comm->c_coll->coll_bcast_module);

  exit:
    if( NULL != ndtype ) {
        ompi_datatype_destroy(&ndtype);
    }
    /* free(NULL) is a no-op, so no guards are needed */
    free(ptmp_free);
    free(displace);
    free(count);

    return err;
}
/**
 * Close an ompio file handle and release the resources attached to it.
 *
 * In order: optionally print the collected WRITE/READ timing queues, close
 * the shared-file-pointer component (when one is attached), close the file
 * through the fs component, delete the file on rank 0 when it was opened
 * with MPI_MODE_DELETE_ON_CLOSE, unselect the fs/fbtl/fcoll components and
 * free the per-file arrays, the iovec datatype and the communicator.
 *
 * @param ompio_fh  file handle to close
 * @return the result of fs_file_close(); if that succeeded but closing the
 *         shared file pointer failed, the shared-file-pointer error is
 *         returned instead of being silently discarded.
 */
int
ompio_io_ompio_file_close (mca_io_ompio_file_t *ompio_fh)
{
    int ret = OMPI_SUCCESS;
    int sharedfp_ret = OMPI_SUCCESS;
    int delete_flag = 0;
    char name[256];

    if(mca_io_ompio_coll_timing_info) {
        /* Timing output is best effort: a failure is reported on stdout
         * and does not affect the return value of the close. */
        strcpy (name, "WRITE");
        if (!ompi_io_ompio_empty_print_queue(WRITE_PRINT_QUEUE)) {
            ret = ompi_io_ompio_print_time_info(WRITE_PRINT_QUEUE,
                                                name,
                                                ompio_fh);
            if (OMPI_SUCCESS != ret) {
                printf("Error in print_time_info ");
            }
        }
        strcpy (name, "READ");
        if (!ompi_io_ompio_empty_print_queue(READ_PRINT_QUEUE)) {
            ret = ompi_io_ompio_print_time_info(READ_PRINT_QUEUE,
                                                name,
                                                ompio_fh);
            if (OMPI_SUCCESS != ret) {
                printf("Error in print_time_info ");
            }
        }
    }

    /* remember the request before the file-system close runs */
    if ( ompio_fh->f_amode & MPI_MODE_DELETE_ON_CLOSE ) {
        delete_flag = 1;
    }

    /*close the sharedfp file*/
    if(ompio_fh->f_sharedfp != NULL) {
        sharedfp_ret = ompio_fh->f_sharedfp->sharedfp_file_close(ompio_fh);
    }

    /* the file itself is closed even if the sharedfp close failed */
    ret = ompio_fh->f_fs->fs_file_close (ompio_fh);

    if ( delete_flag && 0 == ompio_fh->f_rank ) {
        /* best effort: the result of the delete is not reported */
        mca_io_ompio_file_delete ( ompio_fh->f_filename, MPI_INFO_NULL );
    }

    mca_fs_base_file_unselect (ompio_fh);
    mca_fbtl_base_file_unselect (ompio_fh);
    mca_fcoll_base_file_unselect (ompio_fh);
    /* mca_sharedfp_base_file_unselect (ompio_fh) ; EG?*/

    if (NULL != ompio_fh->f_io_array) {
        free (ompio_fh->f_io_array);
        ompio_fh->f_io_array = NULL;
    }

    if (NULL != ompio_fh->f_init_procs_in_group) {
        free (ompio_fh->f_init_procs_in_group);
        ompio_fh->f_init_procs_in_group = NULL;
    }

    if (NULL != ompio_fh->f_procs_in_group) {
        free (ompio_fh->f_procs_in_group);
        ompio_fh->f_procs_in_group = NULL;
    }

    if (NULL != ompio_fh->f_decoded_iov) {
        free (ompio_fh->f_decoded_iov);
        ompio_fh->f_decoded_iov = NULL;
    }

    if (NULL != ompio_fh->f_convertor) {
        free (ompio_fh->f_convertor);
        ompio_fh->f_convertor = NULL;
    }

    if (NULL != ompio_fh->f_datarep) {
        free (ompio_fh->f_datarep);
        ompio_fh->f_datarep = NULL;
    }

    if (MPI_DATATYPE_NULL != ompio_fh->f_iov_type) {
        ompi_datatype_destroy (&ompio_fh->f_iov_type);
    }

    if (MPI_COMM_NULL != ompio_fh->f_comm) {
        ompi_comm_free (&ompio_fh->f_comm);
    }

    /*
    if (MPI_INFO_NULL != ompio_fh->f_info)
    {
        ompi_info_free (&ompio_fh->f_info);
    }
    */

    /* do not let a successful fs close hide a failed sharedfp close */
    if (OMPI_SUCCESS == ret) {
        ret = sharedfp_ret;
    }

    return ret;
}
int mca_fcoll_static_file_write_all (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { size_t max_data = 0, bytes_per_cycle=0; struct iovec *iov=NULL, *decoded_iov=NULL; uint32_t iov_count=0, iov_index=0; int i=0,j=0,l=0, temp_index; int ret=OMPI_SUCCESS, cycles, local_cycles, *bytes_per_process=NULL; int index, *disp_index=NULL, **blocklen_per_process=NULL; int *iovec_count_per_process=NULL, *displs=NULL; size_t total_bytes_written=0; MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL; MPI_Aint bytes_to_write_in_cycle=0, global_iov_count=0, global_count=0; local_io_array *local_iov_array =NULL, *global_iov_array=NULL; local_io_array *file_offsets_for_agg=NULL; int *sorted=NULL, *sorted_file_offsets=NULL, temp_pindex, *temp_disp_index=NULL; char *send_buf=NULL, *global_buf=NULL; int iov_size=0, current_position=0, *current_index=NULL; int *bytes_remaining=NULL, entries_per_aggregator=0; ompi_datatype_t **recvtype = NULL; MPI_Request *send_req=NULL, *recv_req=NULL; /* For creating datatype of type io_array */ int blocklen[3] = {1, 1, 1}; int static_num_io_procs=1; OPAL_PTRDIFF_TYPE d[3], base; ompi_datatype_t *types[3]; ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL; /*----------------------------------------------*/ #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0; double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0; double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0; mca_io_ompio_print_entry nentry; #endif #if DEBUG_ON MPI_Aint gc_in; #endif // if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) { // fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY; // } /* In case the data is not contigous in memory, decode it into an iovec */ if (! 
(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) { fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); } else { max_data = count * datatype->super.size; } if ( MPI_STATUS_IGNORE != status ) { status->_ucount = max_data; } fh->f_get_num_aggregators ( & static_num_io_procs ); fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *)fh, static_num_io_procs, max_data); /* io_array datatype for using in communication*/ types[0] = &ompi_mpi_long.dt; types[1] = &ompi_mpi_long.dt; types[2] = &ompi_mpi_int.dt; d[0] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0]; d[1] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].length; d[2] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].process_id; base = d[0]; for (i=0 ; i<3 ; i++) { d[i] -= base; } ompi_datatype_create_struct (3, blocklen, d, types, &io_array_type); ompi_datatype_commit (&io_array_type); /* #########################################################*/ ret = fh->f_generate_current_file_view((struct mca_io_ompio_file_t *)fh, max_data, &iov, &iov_size); if (ret != OMPI_SUCCESS){ fprintf(stderr,"Current File View Generation Error\n"); goto exit; } if (0 == iov_size){ iov_size = 1; } local_iov_array = (local_io_array *)malloc (iov_size * sizeof(local_io_array)); if ( NULL == local_iov_array){ fprintf(stderr,"local_iov_array allocation error\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (j=0; j < iov_size; j++){ local_iov_array[j].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) iov[j].iov_base; local_iov_array[j].length = (size_t)iov[j].iov_len; local_iov_array[j].process_id = fh->f_rank; } fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle); local_cycles = ceil((double)max_data/bytes_per_cycle); ret = fh->f_comm->c_coll.coll_allreduce (&local_cycles, &cycles, 1, MPI_INT, MPI_MAX, fh->f_comm, fh->f_comm->c_coll.coll_allreduce_module); if (OMPI_SUCCESS != ret){ fprintf(stderr,"local cycles allreduce!\n"); goto exit; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { 
disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int )); if (NULL == bytes_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } bytes_remaining = (int *) malloc (fh->f_procs_per_group * sizeof(int)); if (NULL == bytes_remaining){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } current_index = (int *) malloc (fh->f_procs_per_group * sizeof(int)); if (NULL == current_index){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*)); if (NULL == blocklen_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process = (MPI_Aint **) malloc (fh->f_procs_per_group * sizeof (MPI_Aint*)); if (NULL == displs_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for(i=0;i<fh->f_procs_per_group;i++){ current_index[i] = 0; bytes_remaining[i] =0; blocklen_per_process[i] = NULL; displs_per_process[i] = NULL; } } iovec_count_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int)); if (NULL == iovec_count_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs = (int *) malloc (fh->f_procs_per_group * sizeof(int)); if (NULL == displs){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = fh->f_allgather_array (&iov_size, 1, MPI_INT, iovec_count_per_process, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret){ fprintf(stderr,"iov size allgatherv array!\n"); goto exit; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { displs[0] = 0; global_iov_count = 
iovec_count_per_process[0]; for (i=1 ; i<fh->f_procs_per_group ; i++) { global_iov_count += iovec_count_per_process[i]; displs[i] = displs[i-1] + iovec_count_per_process[i-1]; } } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { global_iov_array = (local_io_array *) malloc (global_iov_count * sizeof(local_io_array)); if (NULL == global_iov_array){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } ret = fh->f_gatherv_array (local_iov_array, iov_size, io_array_type, global_iov_array, iovec_count_per_process, displs, io_array_type, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ fprintf(stderr,"global_iov_array gather error!\n"); goto exit; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if ( 0 == global_iov_count){ global_iov_count = 1; } sorted = (int *)malloc (global_iov_count * sizeof(int)); if (NULL == sorted) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } local_heap_sort (global_iov_array, global_iov_count, sorted); } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { for (gc_in=0; gc_in<global_iov_count; gc_in++){ printf("%d: Offset[%ld]: %lld, Length[%ld]: %ld\n", global_iov_array[gc_in].process_id, gc_in, global_iov_array[gc_in].offset, gc_in, global_iov_array[gc_in].length); } } #endif #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_exch = MPI_Wtime(); #endif for (index = 0; index < cycles; index++){ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if (NULL == recvtype){ recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *)); if (NULL == recvtype) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } for(l=0;l<fh->f_procs_per_group;l++){ disp_index[l] = 1; if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != 
displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } blocklen_per_process[l] = (int *) calloc (1, sizeof(int)); if (NULL == blocklen_per_process[l]) { opal_output (1, "OUT OF MEMORY for blocklen\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint)); if (NULL == displs_per_process[l]){ opal_output (1, "OUT OF MEMORY for displs\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if(NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements = NULL; } } if (local_cycles > index) { if ((index == local_cycles-1) && (max_data % bytes_per_cycle)) { bytes_to_write_in_cycle = max_data % bytes_per_cycle; } else if (max_data <= bytes_per_cycle) { bytes_to_write_in_cycle = max_data; } else { bytes_to_write_in_cycle = bytes_per_cycle; } } else { bytes_to_write_in_cycle = 0; } #if DEBUG_ON /* if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {*/ printf ("***%d: CYCLE %d Bytes %ld**********\n", fh->f_rank, index, bytes_to_write_in_cycle); /* }*/ #endif /********************************************************** **Gather the Data from all the processes at the writers ** *********************************************************/ /* gather from each process how many bytes each will be sending */ fh->f_gather_array (&bytes_to_write_in_cycle, 1, MPI_INT, bytes_per_process, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); /* For each aggregator it needs to get bytes_to_write_in_cycle from each process in group which adds up to bytes_per_cycle */ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { for (i=0;i<fh->f_procs_per_group; i++){ /* printf("bytes_per_process[%d]: %d\n", i, bytes_per_process[i]); */ #if DEBUG_ON 
printf ("%d : bytes_per_process : %d\n", fh->f_procs_in_group[i], bytes_per_process[i]); #endif while (bytes_per_process[i] > 0){ if (get_process_id(global_iov_array[sorted[current_index[i]]].process_id, fh) == i){ /* current id owns this entry!*/ /*Add and subtract length and create blocklength and displs array*/ if (bytes_remaining[i]){ /*Remaining bytes in the current entry of the global offset array*/ if (bytes_remaining[i] <= bytes_per_process[i]){ blocklen_per_process[i][disp_index[i] - 1] = bytes_remaining[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset + (global_iov_array[sorted[current_index[i]]].length - bytes_remaining[i]); blocklen_per_process[i] = (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int)); displs_per_process[i] = (MPI_Aint *)realloc ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint)); bytes_per_process[i] -= bytes_remaining[i]; blocklen_per_process[i][disp_index[i]] = 0; displs_per_process[i][disp_index[i]] = 0; bytes_remaining[i] = 0; disp_index[i] += 1; /* This entry has been used up, we need to move to the next entry of this process and make current_index point there*/ current_index[i] = find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1){ /* No more entries left, so Its all done! 
exit!*/ break; } continue; } else{ blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset + (global_iov_array[sorted[current_index[i]]].length - bytes_remaining[i]); bytes_remaining[i] -= bytes_per_process[i]; bytes_per_process[i] = 0; break; } } else{ if (bytes_per_process[i] < global_iov_array[sorted[current_index[i]]].length){ blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset; bytes_remaining[i] = global_iov_array[sorted[current_index[i]]].length - bytes_per_process[i]; bytes_per_process[i] = 0; break; } else { blocklen_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].length; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset; blocklen_per_process[i] = (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int)); displs_per_process[i] = (MPI_Aint *)realloc ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint)); blocklen_per_process[i][disp_index[i]] = 0; displs_per_process[i][disp_index[i]] = 0; disp_index[i] += 1; bytes_per_process[i] -= global_iov_array[sorted[current_index[i]]].length; current_index[i] = find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1){ break; } } } } else{ current_index[i] = find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1){ bytes_per_process[i] = 0; /* no more entries left to service this request*/ continue; } } } } entries_per_aggregator=0; for (i=0;i<fh->f_procs_per_group;i++){ for (j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ entries_per_aggregator++; #if DEBUG_ON printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, blocklen_per_process[i][j],j, 
displs_per_process[i][j], fh->f_rank); #endif } } } if (entries_per_aggregator > 0){ file_offsets_for_agg = (local_io_array *) malloc(entries_per_aggregator*sizeof(local_io_array)); if (NULL == file_offsets_for_agg) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); if (NULL == sorted_file_offsets){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } temp_index = 0; for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ file_offsets_for_agg[temp_index].length = blocklen_per_process[i][j]; file_offsets_for_agg[temp_index].process_id = i; file_offsets_for_agg[temp_index].offset = displs_per_process[i][j]; temp_index++; } } } } else{ continue; } local_heap_sort (file_offsets_for_agg, entries_per_aggregator, sorted_file_offsets); memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); memory_displacements[sorted_file_offsets[0]] = 0; for (i=1; i<entries_per_aggregator; i++){ memory_displacements[sorted_file_offsets[i]] = memory_displacements[sorted_file_offsets[i-1]] + file_offsets_for_agg[sorted_file_offsets[i-1]].length; } temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int)); if (NULL == temp_disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } global_count = 0; for (i=0;i<entries_per_aggregator;i++){ temp_pindex = file_offsets_for_agg[sorted_file_offsets[i]].process_id; displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] = memory_displacements[sorted_file_offsets[i]]; if (temp_disp_index[temp_pindex] < disp_index[temp_pindex]) temp_disp_index[temp_pindex] += 1; else{ printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", temp_pindex, temp_disp_index[temp_pindex], temp_pindex, disp_index[temp_pindex]); } global_count += 
file_offsets_for_agg[sorted_file_offsets[i]].length; } if (NULL != temp_disp_index){ free(temp_disp_index); temp_disp_index = NULL; } #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0; i<entries_per_aggregator;i++){ printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld, disp : %d\n", file_offsets_for_agg[sorted_file_offsets[i]].process_id, file_offsets_for_agg[sorted_file_offsets[i]].offset, file_offsets_for_agg[sorted_file_offsets[i]].length, memory_displacements[sorted_file_offsets[i]], disp_index[ file_offsets_for_agg[sorted_file_offsets[i]].process_id]); } #endif #if DEBUG_ON printf("%d: global_count : %ld, bytes_to_write_in_cycle : %ld, procs_per_group: %d\n", fh->f_rank, global_count, bytes_to_write_in_cycle, fh->f_procs_per_group); #endif #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif global_buf = (char *) malloc (global_count); if (NULL == global_buf){ opal_output(1, "OUT OF MEMORY"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } recv_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request)); if (NULL == recv_req){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0;i<fh->f_procs_per_group; i++){ ompi_datatype_create_hindexed(disp_index[i], blocklen_per_process[i], displs_per_process[i], MPI_BYTE, &recvtype[i]); ompi_datatype_commit(&recvtype[i]); ret = MCA_PML_CALL(irecv(global_buf, 1, recvtype[i], fh->f_procs_in_group[i], 123, fh->f_comm, &recv_req[i])); if (OMPI_SUCCESS != ret){ fprintf(stderr,"irecv Error!\n"); goto exit; } } } if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) { send_buf = &((char*)buf)[total_bytes_written]; } else if (bytes_to_write_in_cycle) { /* allocate a send buffer and copy the data that needs to be sent into it in case the data is non-contigous in memory */ OPAL_PTRDIFF_TYPE mem_address; size_t remaining = 0; size_t temp_position = 0; send_buf = malloc (bytes_to_write_in_cycle); if (NULL == 
send_buf) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } remaining = bytes_to_write_in_cycle; while (remaining) { mem_address = (OPAL_PTRDIFF_TYPE) (decoded_iov[iov_index].iov_base) + current_position; if (remaining >= (decoded_iov[iov_index].iov_len - current_position)) { memcpy (send_buf+temp_position, (IOVBASE_TYPE *)mem_address, decoded_iov[iov_index].iov_len - current_position); remaining = remaining - (decoded_iov[iov_index].iov_len - current_position); temp_position = temp_position + (decoded_iov[iov_index].iov_len - current_position); iov_index = iov_index + 1; current_position = 0; } else { memcpy (send_buf+temp_position, (IOVBASE_TYPE *)mem_address, remaining); current_position = current_position + remaining; remaining = 0; } } } total_bytes_written += bytes_to_write_in_cycle; send_req = (MPI_Request *) malloc (sizeof(MPI_Request)); if (NULL == send_req){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = MCA_PML_CALL(isend(send_buf, bytes_to_write_in_cycle, MPI_BYTE, fh->f_procs_in_group[fh->f_aggregator_index], 123, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, send_req)); if ( OMPI_SUCCESS != ret ){ fprintf(stderr,"isend error!\n"); goto exit; } ret = ompi_request_wait (send_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { ret = ompi_request_wait_all (fh->f_procs_per_group, recv_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){ for (i=0 ; i<global_count/4 ; i++) printf (" RECV %d \n",((int *)global_buf)[i]); } #endif } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += end_comm_time - start_comm_time; #endif if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { fh->f_io_array = 
(mca_io_ompio_io_array_t *) malloc (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_num_of_io_entries = 0; /*First entry for every aggregator*/ fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[0]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]]; fh->f_num_of_io_entries++; for (i=1;i<entries_per_aggregator;i++){ if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + file_offsets_for_agg[sorted_file_offsets[i-1]].length == file_offsets_for_agg[sorted_file_offsets[i]].offset){ fh->f_io_array[fh->f_num_of_io_entries - 1].length += file_offsets_for_agg[sorted_file_offsets[i]].length; } else { fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[i]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[i]]; fh->f_num_of_io_entries++; } } #if DEBUG_ON printf("*************************** %d\n", fh->f_num_of_io_entries); for (i=0 ; i<fh->f_num_of_io_entries ; i++) { printf(" ADDRESS: %p OFFSET: %ld LENGTH: %ld\n", fh->f_io_array[i].memory_address, (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset, fh->f_io_array[i].length); } #endif #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_write_time = MPI_Wtime(); #endif if (fh->f_num_of_io_entries) { if ( 0 > fh->f_fbtl->fbtl_pwritev (fh)) { opal_output (1, "WRITE FAILED\n"); ret = OMPI_ERROR; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_write_time = MPI_Wtime(); write_time += end_write_time - start_write_time; #endif } if (NULL != send_req){ 
free(send_req); send_req = NULL; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } for (i = 0; i < fh->f_procs_per_group; i++) ompi_datatype_destroy(recvtype+i); if (NULL != recvtype){ free(recvtype); recvtype=NULL; } if (NULL != recv_req){ free(recv_req); recv_req = NULL; } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_exch = MPI_Wtime(); exch_write += end_exch - start_exch; nentry.time[0] = write_time; nentry.time[1] = comm_time; nentry.time[2] = exch_write; if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) nentry.aggregator = 1; else nentry.aggregator = 0; nentry.nprocs_for_coll = static_num_io_procs; if (!fh->f_full_print_queue(WRITE_PRINT_QUEUE)){ fh->f_register_print_entry(WRITE_PRINT_QUEUE, nentry); } #endif exit: if (NULL != decoded_iov){ free(decoded_iov); decoded_iov = NULL; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if (NULL != local_iov_array){ free(local_iov_array); local_iov_array = NULL; } for(l=0;l<fh->f_procs_per_group;l++){ if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } } } if (NULL != send_buf){ free(send_buf); send_buf = NULL; } if (NULL != global_buf){ free(global_buf); global_buf = NULL; } if (NULL != recvtype){ free(recvtype); recvtype = NULL; } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if (NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements = NULL; } if (NULL != displs_per_process){ free(displs_per_process); displs_per_process = NULL; } if (NULL != blocklen_per_process){ free(blocklen_per_process); 
blocklen_per_process = NULL; } if(NULL != current_index){ free(current_index); current_index = NULL; } if(NULL != bytes_remaining){ free(bytes_remaining); bytes_remaining = NULL; } if (NULL != disp_index){ free(disp_index); disp_index = NULL; } if (NULL != sorted) { free(sorted); sorted = NULL; } return ret; }
/*
 * alltoall using the Bruck algorithm.
 *
 * The send data is first rotated locally by the caller's rank into a
 * temporary buffer.  Then, for distance = 1, 2, 4, ... < size, the blocks
 * whose index has the "distance" bit set are sent to rank + distance and
 * replaced by the blocks received from rank - distance.  A final local
 * rotation moves the blocks from the temporary buffer into rbuf.
 *
 * MPI_IN_PLACE is delegated to the basic in-place implementation.
 *
 * Returns OMPI_SUCCESS, or an error code after releasing all temporary
 * resources.  Compared to the previous version the error path no longer
 * leaks displs when the blen allocation fails, destroys a datatype that was
 * created for the failing step, and frees the temporary buffer based on the
 * pointer that was actually returned by malloc.
 */
int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
                                         struct ompi_datatype_t *sdtype,
                                         void* rbuf, int rcount,
                                         struct ompi_datatype_t *rdtype,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module)
{
    int i, k, line = -1, rank, size, err = 0, weallocated = 0;
    int sendto, recvfrom, distance, *displs = NULL, *blen = NULL;
    char *tmpbuf = NULL, *tmpbuf_free = NULL;
    ptrdiff_t rlb, slb, tlb, sext, rext, tsext;
    struct ompi_datatype_t *new_ddt = NULL;
#ifdef blahblah
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
#endif

    if (MPI_IN_PLACE == sbuf) {
        return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
                                                            comm, module);
    }

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:alltoall_intra_bruck rank %d", rank));

    err = ompi_datatype_get_extent (sdtype, &slb, &sext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    err = ompi_datatype_get_true_extent(sdtype, &tlb,  &tsext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

#ifdef blahblah
    /* try and SAVE memory by using the data segment hung off
       the communicator if possible */
    if (data->mcct_num_reqs >= size) {
        /* we have enought preallocated for displments and lengths */
        displs = (int*) data->mcct_reqs;
        blen = (int *) (displs + size);
        weallocated = 0;
    }
    else { /* allocate the buffers ourself */
#endif
        /* Mark ownership before allocating: if the second malloc fails the
         * error handler must still free the first one. */
        weallocated = 1;
        displs = (int *) malloc(size * sizeof(int));
        if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; }
        blen = (int *) malloc(size * sizeof(int));
        if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; }
#ifdef blahblah
    }
#endif

    /* tmp buffer allocation for message data */
    tmpbuf_free = (char *) malloc(tsext + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sext);
    if (tmpbuf_free == NULL) { line = __LINE__; err = -1; goto err_hndl; }
    tmpbuf = tmpbuf_free - slb;

    /* Step 1 - local rotation - shift up by rank */
    err = ompi_datatype_copy_content_same_ddt (sdtype,
                                               (int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount),
                                               tmpbuf,
                                               ((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext);
    if (err<0) {
        line = __LINE__; err = -1; goto err_hndl;
    }

    if (rank != 0) {
        err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount,
                                                   tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext,
                                                   (char*) sbuf);
        if (err<0) {
            line = __LINE__; err = -1; goto err_hndl;
        }
    }

    /* Step 2 - perform communication step */
    for (distance = 1; distance < size; distance<<=1) {

        sendto = (rank + distance) % size;
        recvfrom = (rank - distance + size) % size;
        k = 0;

        /* create indexed datatype: all blocks whose index has the
         * current distance bit set */
        for (i = 1; i < size; i++) {
            if (( i & distance) == distance) {
                displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount;
                blen[k] = scount;
                k++;
            }
        }

        /* Set indexes and displacements */
        err = ompi_datatype_create_indexed(k, blen, displs, sdtype, &new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

        /* Commit the new datatype */
        err = ompi_datatype_commit(&new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

        /* Sendreceive */
        err = ompi_coll_tuned_sendrecv ( tmpbuf, 1, new_ddt, sendto,
                                         MCA_COLL_BASE_TAG_ALLTOALL,
                                         rbuf, 1, new_ddt, recvfrom,
                                         MCA_COLL_BASE_TAG_ALLTOALL,
                                         comm, MPI_STATUS_IGNORE, rank );
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

        /* Copy back new data from recvbuf to tmpbuf */
        err = ompi_datatype_copy_content_same_ddt(new_ddt, 1,tmpbuf, (char *) rbuf);
        if (err < 0) { line = __LINE__; err = -1; goto err_hndl; }

        /* free ddt */
        err = ompi_datatype_destroy(&new_ddt);
        if (err != MPI_SUCCESS) {
            /* do not retry a destroy that has just failed */
            new_ddt = NULL;
            line = __LINE__; goto err_hndl;
        }
        new_ddt = NULL;
    } /* end of for (distance = 1... */

    /* Step 3 - local rotation - */
    for (i = 0; i < size; i++) {
        err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount,
                                                   ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext),
                                                   tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext);
        if (err < 0) { line = __LINE__; err = -1; goto err_hndl; }
    }

    /* Step 4 - clean up */
    free(tmpbuf_free);
    if (weallocated) {
        free(displs);
        free(blen);
    }
    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
                 rank));
    /* a datatype created for the step that failed */
    if (NULL != new_ddt) {
        ompi_datatype_destroy(&new_ddt);
    }
    /* tmpbuf is an offset pointer; tmpbuf_free is what malloc returned */
    free(tmpbuf_free);
    if (weallocated) {
        free(displs);
        free(blen);
    }
    return err;
}
int main(int argc, char *argv[]) { opal_init_util(&argc, &argv); ompi_datatype_init(); /* Simple contiguous data: MPI_INT32_T */ { int32_t send_data[2] = {1234, 5678}; int32_t recv_data[2] = {-1, -1}; if( verbose ) { printf("send data %08x %08x \n", send_data[0], send_data[1]); printf("data "); dump_hex(&send_data, sizeof(int32_t) * 2); printf("\n"); } (void)pack_unpack_datatype( send_data, &ompi_mpi_int32_t.dt, 2, recv_data, check_contiguous, (void*)&ompi_mpi_int32_t.dt ); if( verbose ) { printf("recv "); dump_hex(&recv_data, sizeof(int32_t) * 2); printf("\n"); printf("recv data %08x %08x \n", recv_data[0], recv_data[1]); } if( (send_data[0] != recv_data[0]) || (send_data[1] != recv_data[1]) ) { printf("Error during external32 pack/unack for contiguous types (MPI_INT32_T)\n"); exit(-1); } } /* Simple contiguous data: MPI_INT16_T */ { int16_t send_data[2] = {1234, 5678}; int16_t recv_data[2] = {-1, -1}; if( verbose ) { printf("send data %08x %08x \n", send_data[0], send_data[1]); printf("data "); dump_hex(&send_data, sizeof(int16_t) * 2); printf("\n"); } (void)pack_unpack_datatype( send_data, &ompi_mpi_int16_t.dt, 2, recv_data, check_contiguous, (void*)&ompi_mpi_int16_t.dt ); if( verbose ) { printf("recv "); dump_hex(&recv_data, sizeof(int16_t) * 2); printf("\n"); printf("recv data %08x %08x \n", recv_data[0], recv_data[1]); } if( (send_data[0] != recv_data[0]) || (send_data[1] != recv_data[1]) ) { printf("Error during external32 pack/unack for contiguous types\n"); exit(-1); } } /* Vector datatype */ printf("\n\nVector datatype\n\n"); { int count=2, blocklength=1, stride=2; int send_data[3] = {1234, 0, 5678}; int recv_data[3] = {-1, -1, -1}; ompi_datatype_t *ddt; ompi_datatype_create_vector ( count, blocklength, stride, &ompi_mpi_int.dt, &ddt ); { const int* a_i[3] = {&count, &blocklength, &stride}; ompi_datatype_t *type = &ompi_mpi_int.dt; ompi_datatype_set_args( ddt, 3, a_i, 0, NULL, 1, &type, MPI_COMBINER_VECTOR ); } ompi_datatype_commit(&ddt); if( verbose ) { 
printf("send data %08x %x08x %08x \n", send_data[0], send_data[1], send_data[2]); printf("data "); dump_hex(&send_data, sizeof(int32_t) * 3); printf("\n"); } (void)pack_unpack_datatype( send_data, ddt, 1, recv_data, check_vector, (void*)&ompi_mpi_int32_t.dt ); if( verbose ) { printf("recv "); dump_hex(&recv_data, sizeof(int32_t) * 3); printf("\n"); printf("recv data %08x %08x %08x \n", recv_data[0], recv_data[1], recv_data[2]); } ompi_datatype_destroy(&ddt); if( (send_data[0] != recv_data[0]) || (send_data[2] != recv_data[2]) ) { printf("Error during external32 pack/unack for vector types (MPI_INT32_T)\n"); exit(-1); } } ompi_datatype_finalize(); return 0; }
/*
 *	scatterv_inter
 *
 *	Function:	- scatterv operation on an inter-communicator
 *	Accepts:	- same arguments as MPI_Scatterv()
 *	Returns:	- MPI_SUCCESS or error code
 *
 *	The root (root == MPI_ROOT) sends the counts and then all the data,
 *	described by one indexed datatype, to rank 0 of the remote group.
 *	In the receiving group rank 0 receives both and redistributes the
 *	data with a scatterv on the local communicator.  Processes called
 *	with MPI_PROC_NULL do nothing.
 *
 *	All temporary buffers are released on every path, including the
 *	error paths, and allocation failures are reported as
 *	OMPI_ERR_OUT_OF_RESOURCE instead of being dereferenced.
 */
int
mca_coll_inter_scatterv_inter(void *sbuf, int *scounts,
                              int *disps, struct ompi_datatype_t *sdtype,
                              void *rbuf, int rcount,
                              struct ompi_datatype_t *rdtype, int root,
                              struct ompi_communicator_t *comm,
                              mca_coll_base_module_t *module)
{
    int i, rank, size, err, total, size_local;
    int *counts=NULL,*displace=NULL;
    char *ptmp=NULL;
    MPI_Aint incr;
    MPI_Aint extent;
    MPI_Aint lb;
    ompi_datatype_t *ndtype;

    /* Initialize */
    rank = ompi_comm_rank(comm);
    size = ompi_comm_remote_size(comm);
    size_local = ompi_comm_size(comm);

    if (MPI_PROC_NULL == root) {
        /* do nothing */
        return OMPI_SUCCESS;
    }

    if (MPI_ROOT == root) {
        /* I am the root: ship the counts, then the data, to remote rank 0 */
        err = MCA_PML_CALL(send(scounts, size, MPI_INT, 0,
                                MCA_COLL_BASE_TAG_SCATTERV,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (OMPI_SUCCESS != err) {
            return err;
        }

        ompi_datatype_create_indexed(size,scounts,disps,sdtype,&ndtype);
        ompi_datatype_commit(&ndtype);

        err = MCA_PML_CALL(send(sbuf, 1, ndtype, 0,
                                MCA_COLL_BASE_TAG_SCATTERV,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        /* destroy the datatype whether or not the send succeeded */
        ompi_datatype_destroy(&ndtype);
        return err;
    }

    /* Receiving group */
    if (0 == rank) {
        /* local root recieves the counts from the root */
        counts = (int *)malloc(sizeof(int) * size_local);
        if (NULL == counts) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        err = MCA_PML_CALL(recv(counts, size_local, MPI_INT,
                                root, MCA_COLL_BASE_TAG_SCATTERV,
                                comm, MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        /* calculate the whole buffer size and recieve it from root */
        err = ompi_datatype_get_extent(rdtype, &lb, &extent);
        if (OMPI_SUCCESS != err) {
            err = OMPI_ERROR;
            goto exit;
        }
        incr = 0;
        for (i = 0; i < size_local; i++) {
            incr = incr + extent*counts[i];
        }
        if ( incr > 0 ) {
            ptmp = (char*)malloc(incr);
            if (NULL == ptmp) {
                err = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }

        total = 0;
        for (i = 0; i < size_local; i++) {
            total = total + counts[i];
        }
        err = MCA_PML_CALL(recv(ptmp, total, rdtype,
                                root, MCA_COLL_BASE_TAG_SCATTERV,
                                comm, MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        /* set the local displacement i.e. no displacements here:
         * the data arrived packed back to back */
        displace = (int *)malloc(sizeof(int) * size_local);
        if (NULL == displace) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        displace[0] = 0;
        for (i = 1; i < size_local; i++) {
            displace[i] = displace[i-1] + counts[i-1];
        }
    }

    /* perform the scatterv locally; on ranks other than 0 the send-side
     * arguments are still NULL, exactly as before */
    err = comm->c_local_comm->c_coll.coll_scatterv(ptmp, counts, displace,
                                                   rdtype, rbuf, rcount,
                                                   rdtype, 0, comm->c_local_comm,
                                                   comm->c_local_comm->c_coll.coll_scatterv_module);

 exit:
    /* free(NULL) is a no-op, so no guards are needed */
    free(ptmp);
    free(displace);
    free(counts);

    /* All done */
    return err;
}
static int two_phase_exchange_data(mca_io_ompio_file_t *fh, void *buf, struct iovec *offset_len, int *send_size, int *start_pos, int *recv_size, int *count, int *partial_send, int *recd_from_proc, int contig_access_count, OMPI_MPI_OFFSET_TYPE min_st_offset, OMPI_MPI_OFFSET_TYPE fd_size, OMPI_MPI_OFFSET_TYPE *fd_start, OMPI_MPI_OFFSET_TYPE *fd_end, Flatlist_node *flat_buf, mca_io_ompio_access_array_t *others_req, int iter, size_t *buf_idx, MPI_Aint buftype_extent, int striping_unit, int *aggregator_list) { int i=0, j=0, k=0, tmp=0, nprocs_recv=0, nprocs_send=0; int ret = OMPI_SUCCESS; char **recv_buf = NULL; MPI_Request *requests=NULL; MPI_Datatype send_type; #if TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif ret = fh->f_comm->c_coll.coll_alltoall (send_size, 1, MPI_INT, recv_size, 1, MPI_INT, fh->f_comm, fh->f_comm->c_coll.coll_alltoall_module); if ( OMPI_SUCCESS != ret ){ goto exit; } #if DEBUG for (i=0; i<fh->f_size; i++){ printf("%d: RS[%d]: %d\n", fh->f_rank, i, recv_size[i]); } #endif nprocs_recv = 0; for (i=0; i < fh->f_size; i++) if (recv_size[i]) nprocs_recv++; nprocs_send = 0; for (i=0; i< fh->f_size; i++) if (send_size[i]) nprocs_send++; requests = (MPI_Request *) malloc((nprocs_send+nprocs_recv+1) * sizeof(MPI_Request)); if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) { j = 0; for (i=0; i < fh->f_size; i++){ if (recv_size[i]){ ret = MCA_PML_CALL(irecv(((char *) buf)+ buf_idx[i], recv_size[i], MPI_BYTE, i, fh->f_rank+i+100*iter, fh->f_comm, requests+j)); if ( OMPI_SUCCESS != ret ){ return ret; } j++; buf_idx[i] += recv_size[i]; } } } else{ recv_buf = (char **)malloc(fh->f_size * sizeof(char *)); if (NULL == recv_buf){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0; i < fh->f_size; i++) if(recv_size[i]) recv_buf[i] = (char *) malloc (recv_size[i] * sizeof(char)); j = 0; for(i=0; i<fh->f_size; i++) if (recv_size[i]) { ret = MCA_PML_CALL(irecv(recv_buf[i], recv_size[i], MPI_BYTE, i, fh->f_rank+i+100*iter, fh->f_comm, requests+j)); j++; } } j = 0; 
for (i = 0; i< fh->f_size; i++){ if (send_size[i]){ if (partial_send[i]){ k = start_pos[i] + count[i] - 1; tmp = others_req[i].lens[k]; others_req[i].lens[k] = partial_send[i]; } ompi_datatype_create_hindexed(count[i], &(others_req[i].lens[start_pos[i]]), &(others_req[i].mem_ptrs[start_pos[i]]), MPI_BYTE, &send_type); ompi_datatype_commit(&send_type); ret = MCA_PML_CALL(isend(MPI_BOTTOM, 1, send_type, i, fh->f_rank+i+100*iter, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, requests+nprocs_recv+j)); ompi_datatype_destroy(&send_type); if (partial_send[i]) others_req[i].lens[k] = tmp; j++; } } if (nprocs_recv) { ret = ompi_request_wait_all(nprocs_recv, requests, MPI_STATUS_IGNORE); if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) { two_phase_fill_user_buffer(fh, buf, flat_buf, recv_buf, offset_len, (unsigned *)recv_size, requests, recd_from_proc, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, buftype_extent, striping_unit, aggregator_list); } } ret = ompi_request_wait_all(nprocs_send, requests+nprocs_recv, MPI_STATUS_IGNORE); if (NULL != requests){ free(requests); requests = NULL; } if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)){ for (i=0; i< fh->f_size; i++){ if (recv_size[i]){ free(recv_buf[i]); } } free(recv_buf); } #if TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += (end_rcomm_time - start_rcomm_time); #endif exit: return ret; }
/*
 *	file_open_pvfs2: This is the same strategy as ROMIO's pvfs2 open
 *
 *	Function:	- opens a new file
 *	Accepts:	- same arguments as MPI_File_open()
 *	Returns:	- Success if new file handle
 *
 *	Only OMPIO_ROOT resolves the file name and calls fake_an_open(); the
 *	resulting status and object reference are then broadcast to all
 *	processes of fh->f_comm.  On success the per-file pvfs2 structure is
 *	stored in fh->f_fs_ptr; on any failure it is released here and
 *	fh->f_fs_ptr is left NULL.
 */
int
mca_fs_pvfs2_file_open (struct ompi_communicator_t *comm,
                        const char* filename,
                        int access_mode,
                        struct ompi_info_t *info,
                        mca_io_ompio_file_t *fh)
{
    int ret;
    mca_fs_pvfs2 *pvfs2_fs;
    PVFS_fs_id pvfs2_id;
    char pvfs2_path[OMPIO_MAX_NAME] = {0};
    char * ncache_timeout;
    open_status o_status = {0, {0, 0}};
    struct ompi_datatype_t *open_status_type;
    struct ompi_datatype_t *types[2] = {&ompi_mpi_int.dt, &ompi_mpi_byte.dt};
    int lens[2] = {1, sizeof(PVFS_object_ref)};
    OPAL_PTRDIFF_TYPE offsets[2];
    /* ompi_info_get() below is asked for up to MPI_MAX_INFO_VAL characters,
     * so the buffer must hold that many plus the terminating NUL */
    char char_stripe[MPI_MAX_INFO_VAL + 1];
    int flag;
    int fs_pvfs2_stripe_size = -1;
    int fs_pvfs2_stripe_width = -1;

    /* We are going to do what ROMIO does with one process resolving
     * the name and broadcasting to others */

    pvfs2_fs = (mca_fs_pvfs2 *) malloc(sizeof(mca_fs_pvfs2));
    if (NULL == pvfs2_fs) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (!mca_fs_pvfs2_IS_INITIALIZED) {
        /* disable the pvfs2 ncache */
        ncache_timeout = getenv("PVFS2_NCACHE_TIMEOUT");
        if (ncache_timeout == NULL ) {
            setenv("PVFS2_NCACHE_TIMEOUT", "0", 1);
        }
        ret = PVFS_util_init_defaults();
        if (ret < 0) {
            PVFS_perror("PVFS_util_init_defaults", ret);
            free(pvfs2_fs);
            return OMPI_ERROR;
        }
        mca_fs_pvfs2_IS_INITIALIZED = 1;
    }

    memset(&(pvfs2_fs->credentials), 0, sizeof(PVFS_credentials));
    PVFS_util_gen_credentials(&(pvfs2_fs->credentials));

    /* check for stripe size and stripe depth in the info object and
       update mca_fs_pvfs2_stripe_width and mca_fs_pvfs2_stripe_size
       before calling fake_an_open() */
    ompi_info_get (info, "stripe_size", MPI_MAX_INFO_VAL, char_stripe, &flag);
    if ( flag ) {
        sscanf ( char_stripe, "%d", &fs_pvfs2_stripe_size );
    }

    ompi_info_get (info, "stripe_width", MPI_MAX_INFO_VAL, char_stripe, &flag);
    if ( flag ) {
        sscanf ( char_stripe, "%d", &fs_pvfs2_stripe_width );
    }

    /* values that were not supplied (or could not be parsed) are still
     * negative here and fall back to the component-wide defaults */
    if (fs_pvfs2_stripe_size < 0) {
        fs_pvfs2_stripe_size = mca_fs_pvfs2_stripe_size;
    }

    if (fs_pvfs2_stripe_width < 0) {
        fs_pvfs2_stripe_width = mca_fs_pvfs2_stripe_width;
    }

    if (OMPIO_ROOT == fh->f_rank) {
        ret = PVFS_util_resolve(filename, &pvfs2_id, pvfs2_path, OMPIO_MAX_NAME);
        if (ret < 0 ) {
            PVFS_perror("PVFS_util_resolve", ret);
            o_status.error = -1;
        }
        else {
            fake_an_open (pvfs2_id,
                          pvfs2_path,
                          access_mode,
                          fs_pvfs2_stripe_width,
                          (PVFS_size)fs_pvfs2_stripe_size,
                          pvfs2_fs,
                          &o_status);
        }
        pvfs2_fs->object_ref = o_status.object_ref;
        fh->f_fs_ptr = pvfs2_fs;
    }

    /* broadcast status and (possibly valid) object reference; the struct
     * datatype uses absolute addresses, hence MPI_BOTTOM */
    offsets[0] = (MPI_Aint)(&o_status.error);
    offsets[1] = (MPI_Aint)(&o_status.object_ref);

    ompi_datatype_create_struct (2, lens, offsets, types, &open_status_type);
    ompi_datatype_commit (&open_status_type);

    ret = fh->f_comm->c_coll.coll_bcast (MPI_BOTTOM,
                                         1,
                                         open_status_type,
                                         OMPIO_ROOT,
                                         fh->f_comm,
                                         fh->f_comm->c_coll.coll_bcast_module);

    ompi_datatype_destroy (&open_status_type);

    if (OMPI_SUCCESS != ret) {
        /* without the broadcast the local o_status cannot be trusted */
        fh->f_fs_ptr = NULL;
        free(pvfs2_fs);
        return ret;
    }

    if (o_status.error != 0) {
        /* f_fs_ptr is reset to NULL, so nothing else can reach the
           pvfs2_fs structure afterwards: release it here */
        fh->f_fs_ptr = NULL;
        free(pvfs2_fs);
        return OMPI_ERROR;
    }

    pvfs2_fs->object_ref = o_status.object_ref;
    fh->f_fs_ptr = pvfs2_fs;

    /* update the internal ompio structure to store stripe
       size and stripe depth correctly.
       Hadi(to be done): For this read the stripe size and stripe depth from
       the file itself
    */
    if (fs_pvfs2_stripe_size > 0 && fs_pvfs2_stripe_width > 0) {
        fh->f_stripe_size = fs_pvfs2_stripe_size;
        fh->f_stripe_count = fs_pvfs2_stripe_width;
    }

    return OMPI_SUCCESS;
}
static int two_phase_exchage_data(mca_io_ompio_file_t *fh, void *buf, char *write_buf, struct iovec *offset_length, int *send_size,int *start_pos, int *recv_size, OMPI_MPI_OFFSET_TYPE off, OMPI_MPI_OFFSET_TYPE size, int *count, int *partial_recv, int *sent_to_proc, int contig_access_count, OMPI_MPI_OFFSET_TYPE min_st_offset, OMPI_MPI_OFFSET_TYPE fd_size, OMPI_MPI_OFFSET_TYPE *fd_start, OMPI_MPI_OFFSET_TYPE *fd_end, Flatlist_node *flat_buf, mca_io_ompio_access_array_t *others_req, int *send_buf_idx, int *curr_to_proc, int *done_to_proc, int iter, size_t *buf_idx,MPI_Aint buftype_extent, int striping_unit, int *aggregator_list, int *hole){ int *tmp_len=NULL, sum, *srt_len=NULL, nprocs_recv, nprocs_send, k,i,j; int ret=OMPI_SUCCESS; MPI_Request *requests=NULL, *send_req=NULL; ompi_datatype_t **recv_types=NULL; OMPI_MPI_OFFSET_TYPE *srt_off=NULL; char **send_buf = NULL; #if TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif ret = fh->f_comm->c_coll.coll_alltoall (recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fh->f_comm, fh->f_comm->c_coll.coll_alltoall_module); if ( OMPI_SUCCESS != ret ){ return ret; } nprocs_recv = 0; for (i=0;i<fh->f_size;i++){ if (recv_size[i]){ nprocs_recv++; } } recv_types = (ompi_datatype_t **) malloc (( nprocs_recv + 1 ) * sizeof(ompi_datatype_t *)); if ( NULL == recv_types ){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } tmp_len = (int *) malloc(fh->f_size*sizeof(int)); if ( NULL == tmp_len ) { return OMPI_ERR_OUT_OF_RESOURCE; } j = 0; for (i=0;i<fh->f_size;i++){ if (recv_size[i]) { if (partial_recv[i]) { k = start_pos[i] + count[i] - 1; tmp_len[i] = others_req[i].lens[k]; others_req[i].lens[k] = partial_recv[i]; } ompi_datatype_create_hindexed(count[i], &(others_req[i].lens[start_pos[i]]), &(others_req[i].mem_ptrs[start_pos[i]]), MPI_BYTE, recv_types+j); ompi_datatype_commit(recv_types+j); j++; } } sum = 0; for (i=0;i<fh->f_size;i++) sum += count[i]; srt_off = (OMPI_MPI_OFFSET_TYPE *) malloc((sum+1)*sizeof(OMPI_MPI_OFFSET_TYPE)); if ( NULL == 
srt_off ){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } srt_len = (int *) malloc((sum+1)*sizeof(int)); if ( NULL == srt_len ) { ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } two_phase_heap_merge(others_req, count, srt_off, srt_len, start_pos, fh->f_size,fh->f_rank, nprocs_recv, sum); for (i=0; i<fh->f_size; i++) if (partial_recv[i]) { k = start_pos[i] + count[i] - 1; others_req[i].lens[k] = tmp_len[i]; } if ( NULL != tmp_len ){ free(tmp_len); } *hole = 0; if (off != srt_off[0]){ *hole = 1; } else{ for (i=1;i<sum;i++){ if (srt_off[i] <= srt_off[0] + srt_len[0]){ int new_len = srt_off[i] + srt_len[i] - srt_off[0]; if(new_len > srt_len[0]) srt_len[0] = new_len; } else break; } if (i < sum || size != srt_len[0]) *hole = 1; } if ( NULL != srt_off ){ free(srt_off); } if ( NULL != srt_len ){ free(srt_len); } if (nprocs_recv){ if (*hole){ if (off > 0){ fh->f_io_array = (mca_io_ompio_io_array_t *)malloc (sizeof(mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } fh->f_io_array[0].offset =(IOVBASE_TYPE *)(intptr_t) off; fh->f_num_of_io_entries = 1; fh->f_io_array[0].length = size; fh->f_io_array[0].memory_address = write_buf; if (fh->f_num_of_io_entries){ if (OMPI_SUCCESS != fh->f_fbtl->fbtl_preadv (fh, NULL)) { opal_output(1, "READ FAILED\n"); return OMPI_ERROR; } } } fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } } } nprocs_send = 0; for (i=0; i <fh->f_size; i++) if (send_size[i]) nprocs_send++; #if DEBUG_ON printf("%d : nprocs_send : %d\n", fh->f_rank,nprocs_send); #endif requests = (MPI_Request *) malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request)); if ( NULL == requests ){ return OMPI_ERR_OUT_OF_RESOURCE; } j = 0; for (i=0; i<fh->f_size; i++) { if (recv_size[i]) { ret = MCA_PML_CALL(irecv(MPI_BOTTOM, 1, recv_types[j], i, fh->f_rank+i+100*iter, fh->f_comm, requests+j)); if ( OMPI_SUCCESS != ret ){ goto exit; } j++; } } send_req = 
requests + nprocs_recv; if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) { j = 0; for (i=0; i <fh->f_size; i++) if (send_size[i]) { ret = MCA_PML_CALL(isend(((char *) buf) + buf_idx[i], send_size[i], MPI_BYTE, i, fh->f_rank+i+100*iter, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, send_req+j)); if ( OMPI_SUCCESS != ret ){ goto exit; } j++; buf_idx[i] += send_size[i]; } } else if(nprocs_send && (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY))){ send_buf = (char **) malloc(fh->f_size*sizeof(char*)); if ( NULL == send_buf ){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0; i < fh->f_size; i++){ if (send_size[i]) { send_buf[i] = (char *) malloc(send_size[i]); if ( NULL == send_buf[i] ){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } } ret = two_phase_fill_send_buffer(fh, buf,flat_buf, send_buf, offset_length, send_size, send_req,sent_to_proc, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, send_buf_idx, curr_to_proc, done_to_proc, iter, buftype_extent, striping_unit, aggregator_list); if ( OMPI_SUCCESS != ret ){ goto exit; } } for (i=0; i<nprocs_recv; i++) ompi_datatype_destroy(recv_types+i); if (NULL != recv_types){ free(recv_types); recv_types = NULL; } ret = ompi_request_wait_all (nprocs_send+nprocs_recv, requests, MPI_STATUS_IGNORE); if ( NULL != requests ){ free(requests); } #if TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += (end_comm_time - start_comm_time); #endif exit: return ret; }
/**
 * Create the datatype that selects a sub-block out of an ndims-dimensional
 * array whose elements are of type oldtype.
 *
 * @param ndims          number of array dimensions
 * @param size_array     number of elements of the full array, per dimension
 * @param subsize_array  number of elements of the sub-block, per dimension
 * @param start_array    first element of the sub-block, per dimension
 * @param order          MPI_ORDER_C walks the dimensions from the last one
 *                       to the first one; any other value walks them from
 *                       the first one to the last one
 * @param oldtype        element datatype
 * @param newtype        (out) resulting datatype
 *
 * @return MPI_SUCCESS / OMPI_SUCCESS; the results of the individual
 *         datatype constructors are not checked.
 */
int32_t ompi_datatype_create_subarray(int ndims,
                                      int const* size_array,
                                      int const* subsize_array,
                                      int const* start_array,
                                      int order,
                                      const ompi_datatype_t* oldtype,
                                      ompi_datatype_t** newtype)
{
    MPI_Datatype inner;                   /* type built so far */
    int32_t dim, stride, stop;            /* dimension walk */
    MPI_Aint nelems, first, old_extent;   /* both counted in oldtype elements */

    /**
     * If the oldtype contains the original MPI_LB and MPI_UB markers then we
     * are forced to follow the MPI standard suggestion and reset these 2
     * markers (MPI 3.0 page 96 line 37). Otherwise we can simply resize the
     * datatype.
     */
    ompi_datatype_type_extent( oldtype, &old_extent );

    /* No dimension at all: hand back the NULL datatype */
    if( 0 == ndims ) {
        *newtype = &ompi_mpi_datatype_null.dt;
        return MPI_SUCCESS;
    }

    if( ndims < 2 ) {
        /* A single dimension is just a contiguous run of elements */
        ompi_datatype_create_contiguous( subsize_array[0], oldtype, &inner );
        nelems = size_array[0];
        first = start_array[0];
    } else {
        if( MPI_ORDER_C == order ) {
            dim = ndims - 1;
            stride = -1;
            stop = -1;
        } else {
            dim = 0;
            stride = 1;
            stop = ndims;
        }

        /* With at least two dimensions the two fastest-varying ones are
         * combined into one vector outside of the loop, so that no
         * duplicate of oldtype has to be created just to be freed again. */
        ompi_datatype_create_vector( subsize_array[dim + stride], subsize_array[dim],
                                     size_array[dim], oldtype, newtype );
        inner = *newtype;
        nelems = (MPI_Aint)size_array[dim] * (MPI_Aint)size_array[dim + stride];
        first = (MPI_Aint)start_array[dim]
              + (MPI_Aint)start_array[dim + stride] * (MPI_Aint)size_array[dim];

        /* Each remaining dimension wraps the previous type in an hvector
         * whose stride is the byte size of everything below it. */
        dim += 2 * stride;
        while( dim != stop ) {
            ompi_datatype_create_hvector( subsize_array[dim], 1, nelems * old_extent,
                                          inner, newtype );
            ompi_datatype_destroy( &inner );
            first += nelems * start_array[dim];
            nelems *= size_array[dim];
            inner = *newtype;
            dim += stride;
        }
    }

    /*
     * Resized will only set the soft lb and ub markers without moving the real
     * data inside. Thus, in case the original data contains the hard markers
     * (MPI_LB or MPI_UB) we must force the displacement of the data upward to
     * the right position AND set the hard markers LB and UB.
     *
     * NTH: ompi_datatype_create_resized() does not do enough for the general
     * pack/unpack functions to work correctly. Until this is fixed always use
     * ompi_datatype_create_struct(). Once this is fixed remove 1 || below. To
     * verify that the regression is fixed run the subarray test in the Open MPI
     * ibm testsuite.
     */
    if(1 || oldtype->super.flags & (OPAL_DATATYPE_FLAG_USER_LB | OPAL_DATATYPE_FLAG_USER_UB) ) {
        int blength[3] = { 1, 1, 1 };
        MPI_Aint displs[3];
        MPI_Datatype types[3];

        /* LB at 0, the data at its start offset, UB at the full array size */
        types[0] = MPI_LB;
        displs[0] = 0;
        types[1] = inner;
        displs[1] = first * old_extent;
        types[2] = MPI_UB;
        displs[2] = nelems * old_extent;
        ompi_datatype_create_struct( 3, blength, displs, types, newtype );
    } else {
        ompi_datatype_create_resized(inner, first * old_extent, nelems * old_extent, newtype);
    }

    ompi_datatype_destroy( &inner );

    return OMPI_SUCCESS;
}
/*
 * alltoall_intra_bruck
 *
 * Alltoall performed in three phases: the send data is copied into a
 * temporary buffer rotated by the caller's rank; then, for
 * distance = 1, 2, 4, ... < size, every block whose index has the
 * "distance" bit set is exchanged (send to rank+distance, receive from
 * rank-distance) through an indexed datatype; finally the blocks are
 * copied from the temporary buffer into rbuf in reverse-rotated order.
 *
 * MPI_IN_PLACE is forwarded to mca_coll_base_alltoall_intra_basic_inplace().
 *
 * Returns OMPI_SUCCESS or an error code.  The temporary buffer, the helper
 * arrays and the per-step datatype are released on every exit path.
 */
int ompi_coll_base_alltoall_intra_bruck(const void *sbuf, int scount,
                                        struct ompi_datatype_t *sdtype,
                                        void* rbuf, int rcount,
                                        struct ompi_datatype_t *rdtype,
                                        struct ompi_communicator_t *comm,
                                        mca_coll_base_module_t *module)
{
    int i, k, line = -1, rank, size, err = 0;
    int sendto, recvfrom, distance, *displs = NULL, *blen = NULL;
    char *tmpbuf = NULL, *tmpbuf_free = NULL;
    OPAL_PTRDIFF_TYPE sext, rext, span, gap;
    struct ompi_datatype_t *new_ddt = NULL;

    if (MPI_IN_PLACE == sbuf) {
        return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
                                                           comm, module);
    }

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "coll:base:alltoall_intra_bruck rank %d", rank));

    err = ompi_datatype_type_extent (sdtype, &sext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    err = ompi_datatype_type_extent (rdtype, &rext);
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    span = opal_datatype_span(&sdtype->super, (int64_t)size * scount, &gap);

    displs = (int *) malloc(size * sizeof(int));
    if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; }
    blen = (int *) malloc(size * sizeof(int));
    if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; }

    /* tmp buffer allocation for message data */
    tmpbuf_free = (char *)malloc(span);
    if (tmpbuf_free == NULL) { line = __LINE__; err = -1; goto err_hndl; }
    /* tmpbuf is only used for addressing; tmpbuf_free is what gets freed */
    tmpbuf = tmpbuf_free - gap;

    /* Step 1 - local rotation - shift up by rank */
    err = ompi_datatype_copy_content_same_ddt (sdtype,
                                               (int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount),
                                               tmpbuf,
                                               ((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext);
    if (err<0) {
        line = __LINE__; err = -1; goto err_hndl;
    }

    if (rank != 0) {
        err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount,
                                                   tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext,
                                                   (char*) sbuf);
        if (err<0) {
            line = __LINE__; err = -1; goto err_hndl;
        }
    }

    /* perform communication step */
    for (distance = 1; distance < size; distance<<=1) {

        sendto = (rank + distance) % size;
        recvfrom = (rank - distance + size) % size;
        k = 0;

        /* create indexed datatype */
        for (i = 1; i < size; i++) {
            if (( i & distance) == distance) {
                displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount;
                blen[k] = scount;
                k++;
            }
        }
        /* Set indexes and displacements */
        err = ompi_datatype_create_indexed(k, blen, displs, sdtype, &new_ddt);
        if (err != MPI_SUCCESS) {
            /* nothing was created, so there is nothing to destroy */
            new_ddt = NULL;
            line = __LINE__; goto err_hndl;
        }

        /* Commit the new datatype */
        err = ompi_datatype_commit(&new_ddt);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

        /* Sendreceive */
        err = ompi_coll_base_sendrecv ( tmpbuf, 1, new_ddt, sendto,
                                        MCA_COLL_BASE_TAG_ALLTOALL,
                                        rbuf, 1, new_ddt, recvfrom,
                                        MCA_COLL_BASE_TAG_ALLTOALL,
                                        comm, MPI_STATUS_IGNORE, rank );
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

        /* Copy back new data from recvbuf to tmpbuf */
        err = ompi_datatype_copy_content_same_ddt(new_ddt, 1,tmpbuf, (char *) rbuf);
        if (err < 0) { line = __LINE__; err = -1; goto err_hndl; }

        /* free ddt */
        err = ompi_datatype_destroy(&new_ddt);
        /* never retry the destroy from the error path */
        new_ddt = NULL;
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    } /* end of for (distance = 1... */

    /* Step 3 - local rotation - */
    for (i = 0; i < size; i++) {

        err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount,
                                                   ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext),
                                                   tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext);
        if (err < 0) { line = __LINE__; err = -1; goto err_hndl; }
    }

    /* Step 4 - clean up */
    free(tmpbuf_free);
    free(displs);
    free(blen);
    return OMPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
                 rank));
    (void)line;  // silence compiler warning
    /* a datatype still alive here was created in the step that failed */
    if (new_ddt != NULL) {
        ompi_datatype_destroy(&new_ddt);
    }
    free(tmpbuf_free);
    free(displs);
    free(blen);
    return err;
}
int mca_fcoll_dynamic_gen2_file_read_all (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { MPI_Aint position = 0; MPI_Aint total_bytes = 0; /* total bytes to be read */ MPI_Aint bytes_to_read_in_cycle = 0; /* left to be read in a cycle*/ MPI_Aint bytes_per_cycle = 0; /* total read in each cycle by each process*/ int index = 0, ret=OMPI_SUCCESS; int cycles = 0; int i=0, j=0, l=0; int n=0; /* current position in total_bytes_per_process array */ MPI_Aint bytes_remaining = 0; /* how many bytes have been read from the current value from total_bytes_per_process */ int *sorted_file_offsets=NULL, entries_per_aggregator=0; int bytes_received = 0; int blocks = 0; /* iovec structure and count of the buffer passed in */ uint32_t iov_count = 0; struct iovec *decoded_iov = NULL; int iov_index = 0; size_t current_position = 0; struct iovec *local_iov_array=NULL, *global_iov_array=NULL; char *receive_buf = NULL; MPI_Aint *memory_displacements=NULL; /* global iovec at the readers that contain the iovecs created from file_set_view */ uint32_t total_fview_count = 0; int local_count = 0; int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL; int current_index=0, temp_index=0; int **blocklen_per_process=NULL; MPI_Aint **displs_per_process=NULL; char *global_buf = NULL; MPI_Aint global_count = 0; mca_io_ompio_local_io_array *file_offsets_for_agg=NULL; /* array that contains the sorted indices of the global_iov */ int *sorted = NULL; int *displs = NULL; int dynamic_gen2_num_io_procs; size_t max_data = 0; MPI_Aint *total_bytes_per_process = NULL; ompi_datatype_t **sendtype = NULL; MPI_Request *send_req=NULL, recv_req=NULL; int my_aggregator =-1; bool recvbuf_is_contiguous=false; size_t ftype_size; OPAL_PTRDIFF_TYPE ftype_extent, lb; #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0; double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0; double 
read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0; mca_io_ompio_print_entry nentry; #endif /************************************************************************** ** 1. In case the data is not contigous in memory, decode it into an iovec **************************************************************************/ opal_datatype_type_size ( &datatype->super, &ftype_size ); opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent ); if ( (ftype_extent == (OPAL_PTRDIFF_TYPE) ftype_size) && opal_datatype_is_contiguous_memory_layout(&datatype->super,1) && 0 == lb ) { recvbuf_is_contiguous = true; } if (! recvbuf_is_contiguous ) { ret = fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); if (OMPI_SUCCESS != ret){ goto exit; } } else { max_data = count * datatype->super.size; } if ( MPI_STATUS_IGNORE != status ) { status->_ucount = max_data; } fh->f_get_num_aggregators ( &dynamic_gen2_num_io_procs); ret = fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh, dynamic_gen2_num_io_procs, max_data); if (OMPI_SUCCESS != ret){ goto exit; } my_aggregator = fh->f_procs_in_group[fh->f_aggregator_index]; /************************************************************************** ** 2. 
Determine the total amount of data to be written **************************************************************************/ total_bytes_per_process = (MPI_Aint*)malloc(fh->f_procs_per_group*sizeof(MPI_Aint)); if (NULL == total_bytes_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif ret = fcoll_base_coll_allgather_array (&max_data, 1, MPI_LONG, total_bytes_per_process, 1, MPI_LONG, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif for (i=0 ; i<fh->f_procs_per_group ; i++) { total_bytes += total_bytes_per_process[i]; } if (NULL != total_bytes_per_process) { free (total_bytes_per_process); total_bytes_per_process = NULL; } /********************************************************************* *** 3. Generate the File offsets/lengths corresponding to this write ********************************************************************/ ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *) fh, max_data, &local_iov_array, &local_count); if (ret != OMPI_SUCCESS){ goto exit; } /************************************************************* *** 4. 
Allgather the File View information at all processes *************************************************************/ fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == fview_count) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif ret = fcoll_base_coll_allgather_array (&local_count, 1, MPI_INT, fview_count, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif displs = (int*)malloc (fh->f_procs_per_group*sizeof(int)); if (NULL == displs) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs[0] = 0; total_fview_count = fview_count[0]; for (i=1 ; i<fh->f_procs_per_group ; i++) { total_fview_count += fview_count[i]; displs[i] = displs[i-1] + fview_count[i-1]; } #if DEBUG_ON if (my_aggregator == fh->f_rank) { for (i=0 ; i<fh->f_procs_per_group ; i++) { printf ("%d: PROCESS: %d ELEMENTS: %d DISPLS: %d\n", fh->f_rank, i, fview_count[i], displs[i]); } } #endif /* allocate the global iovec */ if (0 != total_fview_count) { global_iov_array = (struct iovec*)malloc (total_fview_count * sizeof(struct iovec)); if (NULL == global_iov_array) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif ret = fcoll_base_coll_allgatherv_array (local_iov_array, local_count, fh->f_iov_type, global_iov_array, fview_count, displs, fh->f_iov_type, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif 
/**************************************************************************************** *** 5. Sort the global offset/lengths list based on the offsets. *** The result of the sort operation is the 'sorted', an integer array, *** which contains the indexes of the global_iov_array based on the offset. *** For example, if global_iov_array[x].offset is followed by global_iov_array[y].offset *** in the file, and that one is followed by global_iov_array[z].offset, than *** sorted[0] = x, sorted[1]=y and sorted[2]=z; ******************************************************************************************/ if (0 != total_fview_count) { sorted = (int *)malloc (total_fview_count * sizeof(int)); if (NULL == sorted) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_sort_iovec (global_iov_array, total_fview_count, sorted); } if (NULL != local_iov_array) { free (local_iov_array); local_iov_array = NULL; } #if DEBUG_ON if (my_aggregator == fh->f_rank) { for (i=0 ; i<total_fview_count ; i++) { printf("%d: OFFSET: %p LENGTH: %d\n", fh->f_rank, global_iov_array[sorted[i]].iov_base, global_iov_array[sorted[i]].iov_len); } } #endif /************************************************************* *** 6. 
Determine the number of cycles required to execute this *** operation *************************************************************/ fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle); cycles = ceil((double)total_bytes/bytes_per_cycle); if ( my_aggregator == fh->f_rank) { disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*)); if (NULL == blocklen_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*)); if (NULL == displs_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0;i<fh->f_procs_per_group;i++){ blocklen_per_process[i] = NULL; displs_per_process[i] = NULL; } send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request)); if (NULL == send_req){ opal_output ( 1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } global_buf = (char *) malloc (bytes_per_cycle); if (NULL == global_buf){ opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *)); if (NULL == sendtype) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for(l=0;l<fh->f_procs_per_group;l++){ sendtype[l] = MPI_DATATYPE_NULL; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif n = 0; bytes_remaining = 0; current_index = 0; for (index = 0; index < cycles; index++) { /********************************************************************** *** 7a. 
Getting ready for next cycle: initializing and freeing buffers **********************************************************************/ if (my_aggregator == fh->f_rank) { if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } fh->f_num_of_io_entries = 0; if (NULL != sendtype){ for (i =0; i< fh->f_procs_per_group; i++) { if ( MPI_DATATYPE_NULL != sendtype[i] ) { ompi_datatype_destroy(&sendtype[i]); sendtype[i] = MPI_DATATYPE_NULL; } } } for(l=0;l<fh->f_procs_per_group;l++){ disp_index[l] = 1; if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } blocklen_per_process[l] = (int *) calloc (1, sizeof(int)); if (NULL == blocklen_per_process[l]) { opal_output (1, "OUT OF MEMORY for blocklen\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint)); if (NULL == displs_per_process[l]){ opal_output (1, "OUT OF MEMORY for displs\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if(NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements = NULL; } } /* (my_aggregator == fh->f_rank */ /************************************************************************** *** 7b. 
Determine the number of bytes to be actually read in this cycle **************************************************************************/ if (cycles-1 == index) { bytes_to_read_in_cycle = total_bytes - bytes_per_cycle*index; } else { bytes_to_read_in_cycle = bytes_per_cycle; } #if DEBUG_ON if (my_aggregator == fh->f_rank) { printf ("****%d: CYCLE %d Bytes %d**********\n", fh->f_rank, index, bytes_to_write_in_cycle); } #endif /***************************************************************** *** 7c. Calculate how much data will be contributed in this cycle *** by each process *****************************************************************/ bytes_received = 0; while (bytes_to_read_in_cycle) { /* This next block identifies which process is the holder ** of the sorted[current_index] element; */ blocks = fview_count[0]; for (j=0 ; j<fh->f_procs_per_group ; j++) { if (sorted[current_index] < blocks) { n = j; break; } else { blocks += fview_count[j+1]; } } if (bytes_remaining) { /* Finish up a partially used buffer from the previous cycle */ if (bytes_remaining <= bytes_to_read_in_cycle) { /* Data fits completely into the block */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *) realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += bytes_remaining; } current_index ++; bytes_to_read_in_cycle -= bytes_remaining; bytes_remaining = 0; continue; } else { /* the remaining data from the previous cycle is larger than 
the bytes_to_write_in_cycle, so we have to segment again */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += bytes_to_read_in_cycle; } bytes_remaining -= bytes_to_read_in_cycle; bytes_to_read_in_cycle = 0; break; } } else { /* No partially used entry available, have to start a new one */ if (bytes_to_read_in_cycle < (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) { /* This entry has more data than we can sendin one cycle */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += bytes_to_read_in_cycle; } bytes_remaining = global_iov_array[sorted[current_index]].iov_len - bytes_to_read_in_cycle; bytes_to_read_in_cycle = 0; break; } else { /* Next data entry is less than bytes_to_write_in_cycle */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = global_iov_array[sorted[current_index]].iov_len; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE) global_iov_array[sorted[current_index]].iov_base; blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *)realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_received += global_iov_array[sorted[current_index]].iov_len; } bytes_to_read_in_cycle -= 
global_iov_array[sorted[current_index]].iov_len; current_index ++; continue; } } } /* end while (bytes_to_read_in_cycle) */ /************************************************************************* *** 7d. Calculate the displacement on where to put the data and allocate *** the recieve buffer (global_buf) *************************************************************************/ if (my_aggregator == fh->f_rank) { entries_per_aggregator=0; for (i=0;i<fh->f_procs_per_group; i++){ for (j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0) entries_per_aggregator++ ; } } if (entries_per_aggregator > 0){ file_offsets_for_agg = (mca_io_ompio_local_io_array *) malloc(entries_per_aggregator*sizeof(mca_io_ompio_local_io_array)); if (NULL == file_offsets_for_agg) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); if (NULL == sorted_file_offsets){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /*Moving file offsets to an IO array!*/ temp_index = 0; global_count = 0; for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ file_offsets_for_agg[temp_index].length = blocklen_per_process[i][j]; global_count += blocklen_per_process[i][j]; file_offsets_for_agg[temp_index].process_id = i; file_offsets_for_agg[temp_index].offset = displs_per_process[i][j]; temp_index++; } } } } else{ continue; } /* Sort the displacements for each aggregator */ read_heap_sort (file_offsets_for_agg, entries_per_aggregator, sorted_file_offsets); memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); memory_displacements[sorted_file_offsets[0]] = 0; for (i=1; i<entries_per_aggregator; i++){ memory_displacements[sorted_file_offsets[i]] = memory_displacements[sorted_file_offsets[i-1]] + file_offsets_for_agg[sorted_file_offsets[i-1]].length; } 
/********************************************************** *** 7e. Create the io array, and pass it to fbtl *********************************************************/ fh->f_io_array = (mca_io_ompio_io_array_t *) malloc (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_num_of_io_entries = 0; fh->f_io_array[0].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; fh->f_io_array[0].length = file_offsets_for_agg[sorted_file_offsets[0]].length; fh->f_io_array[0].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]]; fh->f_num_of_io_entries++; for (i=1;i<entries_per_aggregator;i++){ if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + file_offsets_for_agg[sorted_file_offsets[i-1]].length == file_offsets_for_agg[sorted_file_offsets[i]].offset){ fh->f_io_array[fh->f_num_of_io_entries - 1].length += file_offsets_for_agg[sorted_file_offsets[i]].length; } else{ fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[i]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[i]]; fh->f_num_of_io_entries++; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_read_time = MPI_Wtime(); #endif if (fh->f_num_of_io_entries) { if ( 0 > fh->f_fbtl->fbtl_preadv (fh)) { opal_output (1, "READ FAILED\n"); ret = OMPI_ERROR; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_read_time = MPI_Wtime(); read_time += end_read_time - start_read_time; #endif /********************************************************** ******************** DONE READING ************************ *********************************************************/ temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * 
sizeof (int)); if (NULL == temp_disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0; i<entries_per_aggregator; i++){ temp_index = file_offsets_for_agg[sorted_file_offsets[i]].process_id; displs_per_process[temp_index][temp_disp_index[temp_index]] = memory_displacements[sorted_file_offsets[i]]; if (temp_disp_index[temp_index] < disp_index[temp_index]){ temp_disp_index[temp_index] += 1; } else{ printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", temp_index, temp_disp_index[temp_index], temp_index, disp_index[temp_index]); } } if (NULL != temp_disp_index){ free(temp_disp_index); temp_disp_index = NULL; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif for (i=0;i<fh->f_procs_per_group;i++){ send_req[i] = MPI_REQUEST_NULL; if ( 0 < disp_index[i] ) { ompi_datatype_create_hindexed(disp_index[i], blocklen_per_process[i], displs_per_process[i], MPI_BYTE, &sendtype[i]); ompi_datatype_commit(&sendtype[i]); ret = MCA_PML_CALL (isend(global_buf, 1, sendtype[i], fh->f_procs_in_group[i], 123, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, &send_req[i])); if(OMPI_SUCCESS != ret){ goto exit; } } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif } /********************************************************** *** 7f. 
Scatter the Data from the readers *********************************************************/ if ( recvbuf_is_contiguous ) { receive_buf = &((char*)buf)[position]; } else if (bytes_received) { /* allocate a receive buffer and copy the data that needs to be received into it in case the data is non-contigous in memory */ receive_buf = malloc (bytes_received); if (NULL == receive_buf) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif ret = MCA_PML_CALL(irecv(receive_buf, bytes_received, MPI_BYTE, my_aggregator, 123, fh->f_comm, &recv_req)); if (OMPI_SUCCESS != ret){ goto exit; } if (my_aggregator == fh->f_rank){ ret = ompi_request_wait_all (fh->f_procs_per_group, send_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } ret = ompi_request_wait (&recv_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } position += bytes_received; /* If data is not contigous in memory, copy the data from the receive buffer into the buffer passed in */ if (!recvbuf_is_contiguous ) { OPAL_PTRDIFF_TYPE mem_address; size_t remaining = 0; size_t temp_position = 0; remaining = bytes_received; while (remaining) { mem_address = (OPAL_PTRDIFF_TYPE) (decoded_iov[iov_index].iov_base) + current_position; if (remaining >= (decoded_iov[iov_index].iov_len - current_position)) { memcpy ((IOVBASE_TYPE *) mem_address, receive_buf+temp_position, decoded_iov[iov_index].iov_len - current_position); remaining = remaining - (decoded_iov[iov_index].iov_len - current_position); temp_position = temp_position + (decoded_iov[iov_index].iov_len - current_position); iov_index = iov_index + 1; current_position = 0; } else { memcpy ((IOVBASE_TYPE *) mem_address, receive_buf+temp_position, remaining); current_position = current_position + remaining; remaining = 0; } } if (NULL != receive_buf) { free (receive_buf); receive_buf = NULL; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time 
= MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif } /* end for (index=0; index < cycles; index ++) */ #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rexch = MPI_Wtime(); read_exch += end_rexch - start_rexch; nentry.time[0] = read_time; nentry.time[1] = rcomm_time; nentry.time[2] = read_exch; if (my_aggregator == fh->f_rank) nentry.aggregator = 1; else nentry.aggregator = 0; nentry.nprocs_for_coll = dynamic_gen2_num_io_procs; if (!fh->f_full_print_queue(READ_PRINT_QUEUE)){ fh->f_register_print_entry(READ_PRINT_QUEUE, nentry); } #endif exit: if (!recvbuf_is_contiguous) { if (NULL != receive_buf) { free (receive_buf); receive_buf = NULL; } } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } if (NULL != sorted) { free (sorted); sorted = NULL; } if (NULL != global_iov_array) { free (global_iov_array); global_iov_array = NULL; } if (NULL != fview_count) { free (fview_count); fview_count = NULL; } if (NULL != decoded_iov) { free (decoded_iov); decoded_iov = NULL; } if (NULL != local_iov_array){ free(local_iov_array); local_iov_array=NULL; } if (NULL != displs) { free (displs); displs = NULL; } if (my_aggregator == fh->f_rank) { if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if (NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements= NULL; } if (NULL != sendtype){ for (i = 0; i < fh->f_procs_per_group; i++) { if ( MPI_DATATYPE_NULL != sendtype[i] ) { ompi_datatype_destroy(&sendtype[i]); } } free(sendtype); sendtype=NULL; } if (NULL != disp_index){ free(disp_index); disp_index = NULL; } if ( NULL != blocklen_per_process){ for(l=0;l<fh->f_procs_per_group;l++){ if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } } free(blocklen_per_process); blocklen_per_process = NULL; } if (NULL != displs_per_process){ for (l=0; 
i<fh->f_procs_per_group; l++){ if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } } free(displs_per_process); displs_per_process = NULL; } if ( NULL != send_req ) { free ( send_req ); send_req = NULL; } } return ret; }
int mca_io_ompio_set_view_internal(mca_io_ompio_file_t *fh, OMPI_MPI_OFFSET_TYPE disp, ompi_datatype_t *etype, ompi_datatype_t *filetype, char *datarep, ompi_info_t *info) { size_t max_data = 0; int i; int num_groups = 0; contg *contg_groups; size_t ftype_size; OPAL_PTRDIFF_TYPE ftype_extent, lb, ub; ompi_datatype_t *newfiletype; if ( NULL != fh->f_etype ) { ompi_datatype_destroy (&fh->f_etype); } if ( NULL != fh->f_filetype ) { ompi_datatype_destroy (&fh->f_filetype); } if ( NULL != fh->f_orig_filetype ) { ompi_datatype_destroy (&fh->f_orig_filetype); } if (NULL != fh->f_decoded_iov) { free (fh->f_decoded_iov); fh->f_decoded_iov = NULL; } if (NULL != fh->f_datarep) { free (fh->f_datarep); fh->f_datarep = NULL; } /* Reset the flags first */ fh->f_flags = 0; fh->f_flags |= OMPIO_FILE_VIEW_IS_SET; fh->f_datarep = strdup (datarep); ompi_datatype_duplicate (filetype, &fh->f_orig_filetype ); opal_datatype_get_extent(&filetype->super, &lb, &ftype_extent); opal_datatype_type_size (&filetype->super, &ftype_size); if ( etype == filetype && ompi_datatype_is_predefined (filetype ) && ftype_extent == (OPAL_PTRDIFF_TYPE)ftype_size ){ ompi_datatype_create_contiguous(MCA_IO_DEFAULT_FILE_VIEW_SIZE, &ompi_mpi_byte.dt, &newfiletype); ompi_datatype_commit (&newfiletype); } else { newfiletype = filetype; } fh->f_iov_count = 0; fh->f_disp = disp; fh->f_offset = disp; fh->f_total_bytes = 0; ompi_io_ompio_decode_datatype (fh, newfiletype, 1, NULL, &max_data, &fh->f_decoded_iov, &fh->f_iov_count); opal_datatype_get_extent(&newfiletype->super, &lb, &fh->f_view_extent); opal_datatype_type_ub (&newfiletype->super, &ub); opal_datatype_type_size (&etype->super, &fh->f_etype_size); opal_datatype_type_size (&newfiletype->super, &fh->f_view_size); ompi_datatype_duplicate (etype, &fh->f_etype); ompi_datatype_duplicate (newfiletype, &fh->f_filetype); fh->f_cc_size = get_contiguous_chunk_size (fh); if (opal_datatype_is_contiguous_memory_layout(&etype->super,1)) { if 
(opal_datatype_is_contiguous_memory_layout(&filetype->super,1) && fh->f_view_extent == (OPAL_PTRDIFF_TYPE)fh->f_view_size ) { fh->f_flags |= OMPIO_CONTIGUOUS_FVIEW; } } contg_groups = (contg*) calloc ( 1, fh->f_size * sizeof(contg)); if (NULL == contg_groups) { opal_output (1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } for( i = 0; i < fh->f_size; i++){ contg_groups[i].procs_in_contg_group = (int*)calloc (1,fh->f_size * sizeof(int)); if(NULL == contg_groups[i].procs_in_contg_group){ int j; opal_output (1, "OUT OF MEMORY\n"); for(j=0; j<i; j++) { free(contg_groups[j].procs_in_contg_group); } free(contg_groups); return OMPI_ERR_OUT_OF_RESOURCE; } } if( OMPI_SUCCESS != mca_io_ompio_fview_based_grouping(fh, &num_groups, contg_groups)){ opal_output(1, "mca_io_ompio_fview_based_grouping() failed\n"); free(contg_groups); return OMPI_ERROR; } if( !( (fh->f_comm->c_flags & OMPI_COMM_CART) && (num_groups == 1 || num_groups == fh->f_size)) ) { mca_io_ompio_finalize_initial_grouping(fh, num_groups, contg_groups); } for( i = 0; i < fh->f_size; i++){ free(contg_groups[i].procs_in_contg_group); } free(contg_groups); if ( etype == filetype && ompi_datatype_is_predefined (filetype ) && ftype_extent == (OPAL_PTRDIFF_TYPE)ftype_size ){ ompi_datatype_destroy ( &newfiletype ); } if (OMPI_SUCCESS != mca_fcoll_base_file_select (fh, NULL)) { opal_output(1, "mca_fcoll_base_file_select() failed\n"); return OMPI_ERROR; } return OMPI_SUCCESS; }
/*
 * cyclic
 *
 * Function: - build the datatype describing this process' share of one
 *             dimension that is distributed cyclically in blocks
 * Accepts:  - gsize_array  global size of every dimension
 *           - dim          index of the dimension being processed
 *           - ndims        number of dimensions in gsize_array
 *           - nprocs       number of processes along this dimension
 *           - rank         position of this process along this dimension
 *           - darg         block size, or MPI_DISTRIBUTE_DFLT_DARG for 1
 *           - order        MPI_ORDER_FORTRAN selects the dimensions below
 *                          dim for the stride; any other value selects the
 *                          dimensions above dim
 *           - orig_extent  extent used to scale the stride and the new
 *                          upper bound
 *           - type_old     element datatype
 *           - type_new     (out) resulting datatype
 *           - st_offset    (out) rank * blksize, counted in elements of
 *                          type_old along this dimension; 0 when this
 *                          process owns nothing
 * Returns:  - OMPI_SUCCESS or the error code of the failing datatype call;
 *             on failure after the vector was created, *type_new has been
 *             destroyed
 */
static int cyclic(const int *gsize_array, int dim, int ndims, int nprocs,
                  int rank, int darg, int order, ptrdiff_t orig_extent,
                  ompi_datatype_t* type_old, ompi_datatype_t **type_new,
                  ptrdiff_t *st_offset)
{
    int blksize, i, blklens[2], st_index, end_index, local_size, rem, count, rc;
    ptrdiff_t stride, disps[2];
    ompi_datatype_t *type_tmp, *types[2];

    if (darg == MPI_DISTRIBUTE_DFLT_DARG) {
        blksize = 1;
    } else {
        blksize = darg;
    }

    /* Number of elements of this dimension that land on this process. */
    st_index = rank * blksize;
    end_index = gsize_array[dim] - 1;

    if (end_index < st_index) {
        local_size = 0;
    } else {
        local_size = ((end_index - st_index + 1)/(nprocs*blksize))*blksize;
        rem = (end_index - st_index + 1) % (nprocs*blksize);
        local_size += rem < blksize ? rem : blksize;
    }

    count = local_size / blksize;   /* number of full blocks */
    rem = local_size % blksize;     /* size of the trailing partial block */

    /* Widen before multiplying so that nprocs*blksize cannot overflow int. */
    stride = (ptrdiff_t)nprocs * blksize * orig_extent;
    if (order == MPI_ORDER_FORTRAN) {
        for (i=0; i<dim; i++) {
            stride *= gsize_array[i];
        }
    } else {
        for (i=ndims-1; i>dim; i--) {
            stride *= gsize_array[i];
        }
    }

    rc = ompi_datatype_create_hvector(count, blksize, stride, type_old, type_new);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    if (rem) {
        /* if the last block is of size less than blksize, include
           it separately using MPI_Type_struct */
        types  [0] = *type_new; types  [1] = type_old;
        disps  [0] = 0;         disps  [1] = count*stride;
        blklens[0] = 1;         blklens[1] = rem;

        rc = ompi_datatype_create_struct(2, blklens, disps, types, &type_tmp);
        /* even in error condition, need to destroy type_new, so check
           for error after destroy. */
        ompi_datatype_destroy(type_new);
        if (OMPI_SUCCESS != rc) {
            return rc;
        }
        *type_new = type_tmp;
    }

    /* need to set the UB for block-cyclic to work */
    disps[0] = 0;
    disps[1] = orig_extent;
    if (order == MPI_ORDER_FORTRAN) {
        for(i=0; i<=dim; i++) {
            disps[1] *= gsize_array[i];
        }
    } else {
        for(i=ndims-1; i>=dim; i--) {
            disps[1] *= gsize_array[i];
        }
    }
    rc = opal_datatype_resize( &(*type_new)->super, disps[0], disps[1] );
    if (OMPI_SUCCESS != rc) {
        /* Do not hand a half-built type back to the caller; this mirrors
           the struct-creation failure above, which also leaves *type_new
           destroyed. */
        ompi_datatype_destroy(type_new);
        return rc;
    }

    /* in terms of no. of elements of type oldtype in this dimension */
    if (local_size == 0) {
        *st_offset = 0;
    } else {
        *st_offset = (ptrdiff_t)rank * blksize;
    }

    return OMPI_SUCCESS;
}
/*
 *	allgatherv_intra
 *
 *	Function:	- allgatherv using other MPI collectives:
 *			  a gatherv to rank 0 followed by a bcast of the
 *			  whole receive layout
 *	Accepts:	- same as MPI_Allgatherv()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_allgatherv_intra(void *sbuf, int scount,
                                struct ompi_datatype_t *sdtype,
                                void *rbuf, int *rcounts,
                                int *disps,
                                struct ompi_datatype_t *rdtype,
                                struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module)
{
    int size, rank;
    int err;
    int send_count;
    MPI_Aint extent;
    MPI_Aint lb;
    char *send_buf = NULL;
    struct ompi_datatype_t *newtype, *send_type;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    /*
     * We don't have a root process defined. Arbitrarily assign root
     * to process with rank 0 (OMPI convention)
     */

    if (MPI_IN_PLACE == sbuf) {
        /* In-place: this process' contribution already sits where it
         * would be received, i.e. disps[rank] elements of rdtype into
         * rbuf.  The displacement array has to be used here; adding up
         * the preceding rcounts is only right for a packed layout. */
        err = ompi_datatype_get_extent(rdtype, &lb, &extent);
        if (MPI_SUCCESS != err) {
            return err;
        }
        send_type = rdtype;
        send_count = rcounts[rank];
        send_buf = (char*)rbuf + (MPI_Aint)disps[rank] * extent;
    } else {
        /* Regular case: the send side is described by (scount, sdtype),
         * which need not use the same count as (rcounts[rank], rdtype). */
        send_buf = (char*)sbuf;
        send_type = sdtype;
        send_count = scount;
    }

    err = comm->c_coll.coll_gatherv(send_buf,
                                    send_count, send_type, rbuf,
                                    rcounts, disps, rdtype, 0,
                                    comm, comm->c_coll.coll_gatherv_module);
    if (MPI_SUCCESS != err) {
        return err;
    }

    /*
     * we now have all the data in the root's rbuf. Need to
     * broadcast the data out to the other processes
     *
     * Need to define a datatype that captures the different vectors
     * from each process. MPI_TYPE_INDEXED with params
     *                    size,rcount,displs,rdtype,newtype
     * should do the trick.
     * Use underlying ddt functions to create, and commit the
     * new datatype on each process, then broadcast and destroy the
     * datatype.
     */

    err = ompi_datatype_create_indexed(size, rcounts, disps, rdtype, &newtype);
    if (MPI_SUCCESS != err) {
        return err;
    }

    err = ompi_datatype_commit(&newtype);
    if (MPI_SUCCESS != err) {
        /* the indexed type exists already; do not leak it */
        ompi_datatype_destroy(&newtype);
        return err;
    }

    err = comm->c_coll.coll_bcast(rbuf, 1, newtype, 0, comm,
                                  comm->c_coll.coll_bcast_module);

    ompi_datatype_destroy(&newtype);

    return err;
}