/* Non-blocking read at the current shared file pointer position.
** Lazily opens the shared file pointer on first use, reserves the
** requested byte range, and posts the iread at the granted offset. */
int mca_sharedfp_lockedfile_iread(mca_io_ompio_file_t *fh, void *buf, int count, ompi_datatype_t *datatype, MPI_Request * request)
{
    struct mca_sharedfp_base_data_t *sp_data = NULL;
    mca_sharedfp_base_module_t *sp_module;
    OMPI_MPI_OFFSET_TYPE granted_offset = 0;
    long bytes_requested = 0;
    size_t dtype_size;
    int ret = OMPI_SUCCESS;

    /* First shared-pointer operation on this handle: open the shared
    ** file pointer structure before anything else. */
    if ( NULL == fh->f_sharedfp_data ) {
        if ( mca_sharedfp_lockedfile_verbose ) {
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "sharedfp_lockedfile_iread: opening the shared file pointer\n");
        }
        sp_module = fh->f_sharedfp;
        ret = sp_module->sharedfp_file_open(fh->f_comm, fh->f_filename, fh->f_amode, fh->f_info, fh);
        if ( OMPI_SUCCESS != ret ) {
            opal_output(0,"sharedfp_lockedfile_iread - error opening the shared file pointer\n");
            return ret;
        }
    }

    /* Number of bytes this request consumes from the shared pointer. */
    opal_datatype_type_size ( &datatype->super, &dtype_size);
    bytes_requested = count * dtype_size;

    if ( mca_sharedfp_lockedfile_verbose ) {
        opal_output(ompi_sharedfp_base_framework.framework_output,
                    "sharedfp_lockedfile_iread - Bytes Requested is %ld\n",bytes_requested);
    }

    sp_data = fh->f_sharedfp_data;

    /* Atomically advance the shared pointer by bytes_requested and
    ** obtain the byte offset reserved for this process. */
    ret = mca_sharedfp_lockedfile_request_position(sp_data,bytes_requested,&granted_offset);
    /* Convert the byte offset into elementary-type units. */
    granted_offset /= sp_data->sharedfh->f_etype_size;

    if ( -1 != ret ) {
        if ( mca_sharedfp_lockedfile_verbose ) {
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "sharedfp_lockedfile_iread - Offset received is %lld\n",granted_offset);
        }
        /* Post the non-blocking read at the reserved offset. */
        ret = ompio_io_ompio_file_iread_at(sp_data->sharedfh,granted_offset,buf,count,datatype,request);
    }

    return ret;
}
/* Collective read: dispatch to the selected fcoll component's
** read_all implementation and, unless the caller passed
** MPI_STATUS_IGNORE, record the requested transfer size in the status. */
int mca_io_ompio_file_read_all (ompi_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t * status)
{
    mca_io_ompio_data_t *data = (mca_io_ompio_data_t *) fh->f_io_selected_data;
    int ret;

    ret = data->ompio_fh.f_fcoll->fcoll_file_read_all (&data->ompio_fh, buf, count, datatype, status);

    if ( MPI_STATUS_IGNORE != status ) {
        size_t dtype_size;
        opal_datatype_type_size (&datatype->super, &dtype_size);
        status->_ucount = count * dtype_size;
    }

    return ret;
}
/* Non-blocking shared-pointer write, "individual" strategy: each process
** appends to its own data file and records a metadata entry describing
** the write, instead of coordinating a global shared offset.
**
** Fix: the return value of mca_sharedfp_individual_insert_metadata()
** was assigned to ret and then immediately overwritten by the iwrite
** call, silently ignoring metadata-insertion failures. A lost metadata
** record makes the shared-pointer bookkeeping inconsistent, so we now
** check it and bail out before touching the data file. */
int mca_sharedfp_individual_iwrite(mca_io_ompio_file_t *fh, const void *buf, int count, ompi_datatype_t *datatype, MPI_Request * request)
{
    int ret = OMPI_SUCCESS;
    size_t numofbytes = 0;
    OMPI_MPI_OFFSET_TYPE totalbytes = 0;
    mca_sharedfp_individual_header_record *headnode = NULL;
    struct mca_sharedfp_base_data_t *sh = NULL;
    mca_sharedfp_base_module_t * shared_fp_base_module = NULL;

    /* Lazily open the shared file pointer on first use. */
    if(fh->f_sharedfp_data==NULL){
        if ( mca_sharedfp_individual_verbose ) {
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "mca_sharedfp_individual_iwrite: opening the shared file pointer\n");
        }
        shared_fp_base_module = fh->f_sharedfp;
        ret = shared_fp_base_module->sharedfp_file_open(fh->f_comm, fh->f_filename, fh->f_amode, fh->f_info, fh);
        if (ret != OMPI_SUCCESS) {
            opal_output(0,"mca_sharedfp_individual_iwrite - error opening the shared file pointer\n");
            return ret;
        }
    }

    /* Calculate the number of bytes of data that needs to be written. */
    opal_datatype_type_size ( &datatype->super, &numofbytes);
    totalbytes = count * numofbytes;

    sh = fh->f_sharedfp_data;

    headnode = (mca_sharedfp_individual_header_record*)sh->selected_module_data;
    if ( NULL == headnode) {
        opal_output (0, "sharedfp_individual_iwrite: headnode is NULL but file is open\n");
        return OMPI_ERROR;
    }

    /* Insert a metadata record into the queue; without it the data
    ** written below could never be attributed to this operation. */
    ret = mca_sharedfp_individual_insert_metadata(OMPI_FILE_WRITE_SHARED,totalbytes,sh);
    if ( OMPI_SUCCESS != ret ) {
        opal_output(0,"sharedfp_individual_iwrite: Error while inserting the metadata record \n");
        return ret;
    }

    /* Write the data into this process' individual data file. */
    ret = mca_common_ompio_file_iwrite_at ( headnode->datafilehandle, headnode->datafile_offset, buf, count, datatype, request);
    if ( OMPI_SUCCESS != ret ) {
        opal_output(0,"sharedfp_individual_iwrite: Error while iwriting the datafile \n");
        return ret;
    }

    /* Advance the per-process data file offset past this write. */
    headnode->datafile_offset = headnode->datafile_offset + totalbytes;

    return ret;
}
/* Collective write: serialize on the file handle's lock, dispatch to
** the selected fcoll component's write_all implementation, then record
** the requested transfer size in the status unless it is ignored. */
int mca_io_ompio_file_write_all (ompi_file_t *fh, const void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status)
{
    mca_common_ompio_data_t *data = (mca_common_ompio_data_t *) fh->f_io_selected_data;
    int ret;

    /* Guard concurrent access to the underlying ompio file handle. */
    OPAL_THREAD_LOCK(&fh->f_lock);
    ret = data->ompio_fh.f_fcoll->fcoll_file_write_all (&data->ompio_fh, buf, count, datatype, status);
    OPAL_THREAD_UNLOCK(&fh->f_lock);

    if ( MPI_STATUS_IGNORE != status ) {
        size_t dtype_size;
        opal_datatype_type_size (&datatype->super, &dtype_size);
        status->_ucount = count * dtype_size;
    }

    return ret;
}
/* Install a new file view on the handle: record the displacement,
** decode the filetype into an iovec, cache etype/filetype sizes and
** extents, duplicate the datatypes, and flag fully contiguous views. */
int mca_io_ompio_set_view_internal(mca_io_ompio_file_t *fh, OMPI_MPI_OFFSET_TYPE disp, ompi_datatype_t *etype, ompi_datatype_t *filetype, char *datarep, ompi_info_t *info)
{
    size_t max_data = 0;
    MPI_Aint lb, ub;

    /* Reset view-related state before decoding the new view. */
    fh->f_iov_count   = 0;
    fh->f_disp        = disp;
    fh->f_offset      = disp;
    fh->f_total_bytes = 0;

    /* Decode one instance of the filetype into fh->f_decoded_iov. */
    ompi_io_ompio_decode_datatype (fh, filetype, 1, NULL, &max_data, &fh->f_decoded_iov, &fh->f_iov_count);

    opal_datatype_get_extent(&filetype->super, &lb, &fh->f_view_extent);
    opal_datatype_type_ub (&filetype->super, &ub);
    opal_datatype_type_size (&etype->super, &fh->f_etype_size);
    opal_datatype_type_size (&filetype->super, &fh->f_view_size);

    /* Keep private copies of the datatypes for the lifetime of the view. */
    ompi_datatype_duplicate (etype, &fh->f_etype);
    ompi_datatype_duplicate (filetype, &fh->f_filetype);

    fh->f_cc_size = get_contiguous_chunk_size (fh);

    /* A view is contiguous when both datatypes have a contiguous memory
    ** layout and the filetype has no holes (extent == size). */
    if (opal_datatype_is_contiguous_memory_layout(&etype->super,1) &&
        opal_datatype_is_contiguous_memory_layout(&filetype->super,1) &&
        fh->f_view_extent == (OPAL_PTRDIFF_TYPE)fh->f_view_size ) {
        fh->f_flags |= OMPIO_CONTIGUOUS_FVIEW;
    }

    return OMPI_SUCCESS;
}
/* Blocking shared-pointer write, "individual" strategy: append to this
** process' own data file and queue a metadata record for the write.
**
** Fixes:
**  - on write failure the function returned a hard-coded -1 instead of
**    the OMPI error code produced by the write call; callers comparing
**    against OMPI error classes never saw the real reason.
**  - the return value of mca_sharedfp_individual_insert_metadata() was
**    ignored; a lost metadata record leaves the shared-pointer
**    bookkeeping inconsistent, so it is now checked.
**  - a NULL headnode on an open file silently returned success; the
**    sibling iwrite path treats this as an error, so this path now
**    does too for consistency. */
int mca_sharedfp_individual_write (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status)
{
    int ret = OMPI_SUCCESS;
    size_t numofbytes = 0;
    size_t totalbytes = 0;
    mca_sharedfp_individual_header_record *headnode = NULL;
    struct mca_sharedfp_base_data_t *sh = NULL;
    mca_sharedfp_base_module_t * shared_fp_base_module = NULL;

    /* Lazily open the shared file pointer on first use. */
    if ( NULL == fh->f_sharedfp_data ) {
        if ( mca_sharedfp_individual_verbose ) {
            opal_output(ompi_sharedfp_base_framework.framework_output,"sharedfp_individual_write: opening the shared file pointer file\n");
        }
        shared_fp_base_module = fh->f_sharedfp;
        ret = shared_fp_base_module->sharedfp_file_open(fh->f_comm, fh->f_filename, fh->f_amode, fh->f_info, fh);
        if (ret != OMPI_SUCCESS) {
            opal_output(0,"sharedfp_individual_write - error opening the shared file pointer\n");
            return ret;
        }
    }

    /* Calculate the number of bytes of data that need to be written. */
    opal_datatype_type_size ( &datatype->super, &numofbytes);
    totalbytes = count * numofbytes;

    /* Retrieve data structure for shared file pointer operations. */
    sh = fh->f_sharedfp_data;

    headnode = (mca_sharedfp_individual_header_record*)sh->selected_module_data;
    if ( NULL == headnode ) {
        opal_output (0, "sharedfp_individual_write: headnode is NULL but file is open\n");
        return OMPI_ERROR;
    }

    /* Insert a metadata record into the queue; the data written below
    ** is meaningless without it. */
    ret = mca_sharedfp_individual_insert_metadata(OMPI_FILE_WRITE_SHARED, totalbytes, sh);
    if ( OMPI_SUCCESS != ret ) {
        opal_output(0,"mca_sharedfp_individual_write: Error while inserting the metadata record \n");
        return ret;
    }

    /* Write the data into this process' individual data file. */
    ret = ompio_io_ompio_file_write_at ( headnode->datafilehandle, headnode->datafile_offset, buf, count, datatype, status);
    if ( OMPI_SUCCESS != ret ) {
        opal_output(0,"mca_sharedfp_individual_write: Error while writing the datafile \n");
        return ret;
    }

    /* Advance the per-process data file offset past this write. */
    headnode->datafile_offset = headnode->datafile_offset + totalbytes;

    return ret;
}
/* Blocking shared-pointer write, shared-memory strategy: reserve the
** requested byte range via the sm position service, then write at the
** granted offset.
**
** Fix: the verbose diagnostic printed "fset received" (garbled) instead
** of "Offset received"; corrected to match the other components. */
int mca_sharedfp_sm_write (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status)
{
    int ret = OMPI_SUCCESS;
    OMPI_MPI_OFFSET_TYPE offset = 0;
    long bytesRequested = 0;
    size_t numofBytes;
    struct mca_sharedfp_base_data_t *sh = NULL;
    mca_sharedfp_base_module_t * shared_fp_base_module = NULL;

    /* Lazily open the shared file pointer on first use. */
    if( NULL == fh->f_sharedfp_data ){
        if ( mca_sharedfp_sm_verbose ) {
            printf("sharedfp_sm_write: opening the shared file pointer\n");
        }
        shared_fp_base_module = fh->f_sharedfp;
        ret = shared_fp_base_module->sharedfp_file_open(fh->f_comm, fh->f_filename, fh->f_amode, fh->f_info, fh);
        if ( OMPI_SUCCESS != ret ) {
            opal_output(0,"sharedfp_sm_write - error opening the shared file pointer\n");
            return ret;
        }
    }

    /* Calculate the number of bytes to write. */
    opal_datatype_type_size ( &datatype->super, &numofBytes);
    bytesRequested = count * numofBytes;

    /* Retrieve the shared file data struct. */
    sh = fh->f_sharedfp_data;

    if ( mca_sharedfp_sm_verbose ) {
        printf("sharedfp_sm_write: Requested is %ld\n",bytesRequested);
    }

    /* Request the offset to write bytesRequested bytes. */
    ret = mca_sharedfp_sm_request_position(sh,bytesRequested,&offset);
    if ( -1 != ret ) {
        if ( mca_sharedfp_sm_verbose ) {
            printf("sharedfp_sm_write: Offset received is %lld\n",offset);
        }
        /* Write to the file at the granted offset. */
        ret = ompio_io_ompio_file_write_at(sh->sharedfh,offset,buf,count,datatype,status);
    }

    return ret;
}
/* Blocking shared-pointer read, addproc strategy: ask the dedicated
** extra process for the offset, then read at that offset.
**
** Fix: the error path used opal_output stream 1 while every sibling
** error path in this framework uses stream 0; unified to stream 0 so
** the message is not silently routed elsewhere. Also corrected the
** comment that said "bytes to write" in a read path. */
int mca_sharedfp_addproc_read ( mca_io_ompio_file_t *fh, void *buf, int count, MPI_Datatype datatype, MPI_Status *status)
{
    int ret = OMPI_SUCCESS;
    OMPI_MPI_OFFSET_TYPE offset = 0;
    long bytesRequested = 0;
    size_t numofBytes;
    struct mca_sharedfp_base_data_t *sh = NULL;
    mca_sharedfp_base_module_t * shared_fp_base_module = NULL;

    /* Lazily open the shared file pointer on first use. */
    if(NULL == fh->f_sharedfp_data){
        if ( mca_sharedfp_addproc_verbose ) {
            printf("sharedfp_addproc_read: opening the shared file pointer file\n");
        }
        shared_fp_base_module = fh->f_sharedfp;
        ret = shared_fp_base_module->sharedfp_file_open(fh->f_comm, fh->f_filename, fh->f_amode, fh->f_info, fh);
        if ( OMPI_SUCCESS != ret ) {
            opal_output(0,"sharedfp_addproc_read - error opening the shared file pointer\n");
            return ret;
        }
    }

    /* Calculate the number of bytes to read. */
    opal_datatype_type_size ( &datatype->super ,&numofBytes);
    bytesRequested = count * numofBytes;

    if ( mca_sharedfp_addproc_verbose ){
        printf("mca_sharedfp_addproc_read: Bytes Requested is %ld\n", bytesRequested);
    }

    /* Retrieve the shared file data struct. */
    sh = fh->f_sharedfp_data;

    /* Ask the additional process for this request's offset. */
    ret = mca_sharedfp_addproc_request_position(sh,bytesRequested,&offset);
    if( OMPI_SUCCESS == ret ){
        if ( mca_sharedfp_addproc_verbose ){
            printf("mca_sharedfp_addproc_read: Offset received is %lld\n",offset);
        }
        /* Read from the file at the granted offset. */
        ret = ompio_io_ompio_file_read_at(sh->sharedfh,offset,buf,count,datatype,status);
    }

    return ret;
}
/* Blocking read at the current shared file pointer position. Requires
** the shared pointer to have been initialized already; reserves the
** requested byte range and reads at the granted offset. */
int mca_sharedfp_lockedfile_read ( ompio_file_t *fh, void *buf, int count, MPI_Datatype datatype, MPI_Status *status)
{
    struct mca_sharedfp_base_data_t *sp_data = NULL;
    OMPI_MPI_OFFSET_TYPE granted_offset = 0;
    long bytes_requested = 0;
    size_t dtype_size;
    int ret = OMPI_SUCCESS;

    /* Unlike the iread path, this entry point does not lazily open the
    ** shared pointer; an uninitialized module is an error. */
    if ( fh->f_sharedfp_data == NULL ) {
        if ( mca_sharedfp_lockedfile_verbose ) {
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "sharedfp_lockedfile_read: module not initialized\n");
        }
        return OMPI_ERROR;
    }

    /* Number of bytes this request consumes from the shared pointer. */
    opal_datatype_type_size ( &datatype->super, &dtype_size);
    bytes_requested = count * dtype_size;

    if ( mca_sharedfp_lockedfile_verbose ) {
        opal_output(ompi_sharedfp_base_framework.framework_output,
                    "sharedfp_lockedfile_read: Bytes Requested is %ld\n",bytes_requested);
    }

    sp_data = fh->f_sharedfp_data;

    /* Atomically advance the shared pointer and obtain our byte offset. */
    ret = mca_sharedfp_lockedfile_request_position(sp_data,bytes_requested,&granted_offset);
    /* Convert the byte offset into elementary-type units. */
    granted_offset /= fh->f_etype_size;

    if (-1 != ret ) {
        if ( mca_sharedfp_lockedfile_verbose ) {
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "sharedfp_lockedfile_read: Offset received is %lld\n",granted_offset);
        }
        /* Perform the blocking read at the reserved offset. */
        ret = mca_common_ompio_file_read_at(fh,granted_offset,buf,count,datatype,status);
    }

    return ret;
}
/* Blocking shared-pointer write, addproc strategy: ask the dedicated
** extra process for the offset, then write at that offset. The shared
** pointer structure must already be initialized. */
int mca_sharedfp_addproc_write (mca_io_ompio_file_t *fh, const void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status)
{
    struct mca_sharedfp_base_data_t *sp_data = NULL;
    OMPI_MPI_OFFSET_TYPE granted_offset = 0;
    long bytes_requested = 0;
    size_t dtype_size;
    int ret = OMPI_SUCCESS;

    /* This entry point does not lazily open the shared pointer. */
    if(NULL == fh->f_sharedfp_data){
        opal_output(0, "sharedfp_addproc_write: shared file pointer structure not initialized correctly\n");
        return OMPI_ERROR;
    }

    /* Number of bytes this request consumes from the shared pointer. */
    opal_datatype_type_size ( &datatype->super, &dtype_size);
    bytes_requested = count * dtype_size;

    sp_data = fh->f_sharedfp_data;

    if ( mca_sharedfp_addproc_verbose ){
        opal_output(ompi_sharedfp_base_framework.framework_output,
                    "sharedfp_addproc_write: sharedfp_addproc_write: Bytes Requested is %ld\n", bytes_requested);
    }

    /* Ask the additional process for this request's byte offset. */
    ret = mca_sharedfp_addproc_request_position( sp_data, bytes_requested, &granted_offset);
    /* Convert the byte offset into elementary-type units. */
    granted_offset /= sp_data->sharedfh->f_etype_size;

    if ( OMPI_SUCCESS == ret ) {
        if ( mca_sharedfp_addproc_verbose ){
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "sharedfp_addproc_write: Offset received is %lld\n",granted_offset);
        }
        /* Write to the file at the granted offset. */
        ret = ompio_io_ompio_file_write_at(sp_data->sharedfh,granted_offset,buf,count,datatype,status);
    }

    return ret;
}
/* Blocking shared-pointer read, addproc strategy: ask the dedicated
** extra process for the offset, then read at that offset. The shared
** pointer structure must already be initialized. */
int mca_sharedfp_addproc_read ( mca_io_ompio_file_t *fh, void *buf, int count, MPI_Datatype datatype, MPI_Status *status)
{
    struct mca_sharedfp_base_data_t *sp_data = NULL;
    OMPI_MPI_OFFSET_TYPE granted_offset = 0;
    long bytes_requested = 0;
    size_t dtype_size;
    int ret = OMPI_SUCCESS;

    /* This entry point does not lazily open the shared pointer. */
    if(NULL == fh->f_sharedfp_data){
        opal_output(0, "sharedfp_addproc_read: shared file pointer "
                    "structure not initialized correctly\n");
        return OMPI_ERROR;
    }

    /* Number of bytes this request consumes from the shared pointer. */
    opal_datatype_type_size ( &datatype->super ,&dtype_size);
    bytes_requested = count * dtype_size;

    if ( mca_sharedfp_addproc_verbose ){
        opal_output(ompi_sharedfp_base_framework.framework_output,
                    "mca_sharedfp_addproc_read: Bytes Requested is %ld\n", bytes_requested);
    }

    sp_data = fh->f_sharedfp_data;

    /* Ask the additional process for this request's byte offset. */
    ret = mca_sharedfp_addproc_request_position(sp_data,bytes_requested,&granted_offset);
    /* Convert the byte offset into elementary-type units. */
    granted_offset /= sp_data->sharedfh->f_etype_size;

    if( OMPI_SUCCESS == ret ){
        if ( mca_sharedfp_addproc_verbose ){
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "mca_sharedfp_addproc_read: Offset received is %lld\n",granted_offset);
        }
        /* Read from the file at the granted offset. */
        ret = mca_common_ompio_file_read_at(sp_data->sharedfh,granted_offset,buf,count,datatype,status);
    }

    return ret;
}
/* Begin a split-collective ordered shared-pointer read (the sm variant
** of MPI_File_read_ordered_begin).  Protocol: every process gathers its
** byte count at rank 0; rank 0 reserves the total range from the shared
** pointer, computes an exclusive prefix sum so each rank gets a
** contiguous slot in rank order, and scatters the per-rank end offsets
** back; each rank then posts a non-blocking collective read at its slot.
** Completion is matched by the corresponding *_end call via
** fh->f_split_coll_req. */
int mca_sharedfp_sm_read_ordered_begin(mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype)
{
    int ret = OMPI_SUCCESS;
    OMPI_MPI_OFFSET_TYPE offset = 0;
    long sendBuff = 0;                      /* this rank's byte count */
    long *buff=NULL;                        /* rank 0 only: gathered counts, then prefix sums */
    long offsetBuff;                        /* this rank's scattered end offset */
    OMPI_MPI_OFFSET_TYPE offsetReceived = 0; /* base offset granted by the position service */
    long bytesRequested = 0;                /* rank 0 only: total bytes across all ranks */
    int recvcnt = 1, sendcnt = 1;
    size_t numofBytes;
    int rank, size, i;
    struct mca_sharedfp_base_data_t *sh = NULL;
    mca_sharedfp_base_module_t * shared_fp_base_module = NULL;

    /* Lazily open the shared file pointer on first use. */
    if ( NULL == fh->f_sharedfp_data){
        if ( mca_sharedfp_sm_verbose ) {
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "sharedfp_sm_read_ordered_begin: opening the shared file pointer\n");
        }
        shared_fp_base_module = fh->f_sharedfp;
        ret = shared_fp_base_module->sharedfp_file_open(fh->f_comm, fh->f_filename, fh->f_amode, fh->f_info, fh);
        if ( OMPI_SUCCESS != ret ) {
            opal_output(0,"sharedfp_sm_read_ordered_begin - error opening the shared file pointer\n");
            return ret;
        }
    }

    /* MPI allows only one outstanding split collective per file handle. */
    if ( true == fh->f_split_coll_in_use ) {
        opal_output(0,"Only one split collective I/O operation allowed per file handle at any given point in time!\n");
        return MPI_ERR_REQUEST;
    }

    /*Retrieve the new communicator*/
    sh = fh->f_sharedfp_data;

    /* Calculate the number of bytes to read*/
    opal_datatype_type_size ( &datatype->super, &numofBytes);
    sendBuff = count * numofBytes;

    /* Get the ranks in the communicator */
    rank = ompi_comm_rank ( sh->comm );
    size = ompi_comm_size ( sh->comm );

    /* Only the root needs the per-rank counts buffer. */
    if ( 0 == rank ) {
        buff = (long*)malloc(sizeof(long) * size);
        if ( NULL == buff )
            return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Gather every rank's byte count at the root. */
    ret = sh->comm->c_coll.coll_gather ( &sendBuff, sendcnt, OMPI_OFFSET_DATATYPE, buff, recvcnt, OMPI_OFFSET_DATATYPE, 0, sh->comm, sh->comm->c_coll.coll_gather_module );
    if( OMPI_SUCCESS != ret){
        goto exit;
    }

    /* All the counts are present now in buff on the root; its length is
    ** the communicator size. */
    if ( 0 == rank ) {
        /* Total the bytes requested across all ranks. */
        for (i = 0; i < size ; i ++) {
            bytesRequested += buff[i];
            if ( mca_sharedfp_sm_verbose ) {
                opal_output(ompi_sharedfp_base_framework.framework_output,
                            "mca_sharedfp_sm_read_ordered_begin: Bytes requested are %ld\n", bytesRequested);
            }
        }
        /* Request the offset to read bytesRequested bytes
        ** only the root process needs to do the request,
        ** since the root process will then tell the other
        ** processes at what offset they should read their
        ** share of the data. */
        ret = mca_sharedfp_sm_request_position(sh,bytesRequested,&offsetReceived);
        if( OMPI_SUCCESS != ret){
            goto exit;
        }
        if ( mca_sharedfp_sm_verbose ) {
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "mca_sharedfp_sm_read_ordered_begin: Offset received is %lld\n",offsetReceived);
        }
        /* Turn the counts into inclusive prefix sums based at the
        ** granted offset: buff[i] becomes the END offset of rank i. */
        buff[0] += offsetReceived;
        for (i = 1 ; i < size; i++) {
            buff[i] += buff[i-1];
        }
    }

    /* Scatter the results to the other processes*/
    ret = sh->comm->c_coll.coll_scatter ( buff, sendcnt, OMPI_OFFSET_DATATYPE, &offsetBuff, recvcnt, OMPI_OFFSET_DATATYPE, 0, sh->comm, sh->comm->c_coll.coll_scatter_module );
    if( OMPI_SUCCESS != ret){
        goto exit;
    }

    /* Each process now has its own end offset; subtracting its own byte
    ** count yields its start offset. */
    offset = offsetBuff - sendBuff;
    /* Convert the byte offset into elementary-type units. */
    offset /= sh->sharedfh->f_etype_size;

    if ( mca_sharedfp_sm_verbose ) {
        opal_output(ompi_sharedfp_base_framework.framework_output,
                    "mca_sharedfp_sm_read_ordered_begin: Offset returned is %lld\n",offset);
    }

    /* Post the non-blocking collective read; the matching *_end call
    ** waits on fh->f_split_coll_req. */
    ret = ompio_io_ompio_file_iread_at_all(sh->sharedfh,offset,buf,count,datatype, &fh->f_split_coll_req);
    fh->f_split_coll_in_use = true;

exit:
    if ( NULL != buff ) {
        free ( buff );
    }

    return ret;
}
int mca_fcoll_static_file_read_all (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { int ret = OMPI_SUCCESS, iov_size=0, *bytes_remaining=NULL; int i, j, l,cycles=0, local_cycles=0, *current_index=NULL; int index, *disp_index=NULL, *bytes_per_process=NULL, current_position=0; int **blocklen_per_process=NULL, *iovec_count_per_process=NULL; int *displs=NULL, *sorted=NULL ,entries_per_aggregator=0; int *sorted_file_offsets=NULL, temp_index=0, position=0, *temp_disp_index=NULL; MPI_Aint **displs_per_process=NULL, global_iov_count=0, global_count=0; MPI_Aint *memory_displacements=NULL; int bytes_to_read_in_cycle=0; size_t max_data=0, bytes_per_cycle=0; uint32_t iov_count=0, iov_index=0; struct iovec *decoded_iov=NULL, *iov=NULL; mca_fcoll_static_local_io_array *local_iov_array=NULL, *global_iov_array=NULL; mca_fcoll_static_local_io_array *file_offsets_for_agg=NULL; char *global_buf=NULL, *receive_buf=NULL; int blocklen[3] = {1, 1, 1}; int static_num_io_procs=1; OPAL_PTRDIFF_TYPE d[3], base; ompi_datatype_t *types[3]; ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL; ompi_datatype_t **sendtype = NULL; MPI_Request *send_req=NULL, recv_req=NULL; int my_aggregator=-1; bool recvbuf_is_contiguous=false; size_t ftype_size; OPAL_PTRDIFF_TYPE ftype_extent, lb; #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0; double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0; double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0; mca_common_ompio_print_entry nentry; #endif #if DEBUG_ON MPI_Aint gc_in; #endif opal_datatype_type_size ( &datatype->super, &ftype_size ); opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent ); /************************************************************************** ** 1. 
In case the data is not contigous in memory, decode it into an iovec **************************************************************************/ if ( ( ftype_extent == (OPAL_PTRDIFF_TYPE) ftype_size) && opal_datatype_is_contiguous_memory_layout(&datatype->super,1) && 0 == lb ) { recvbuf_is_contiguous = true; } /* In case the data is not contigous in memory, decode it into an iovec */ if (!recvbuf_is_contiguous ) { fh->f_decode_datatype ( (struct mca_io_ompio_file_t *)fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); } else { max_data = count * datatype->super.size; } if ( MPI_STATUS_IGNORE != status ) { status->_ucount = max_data; } fh->f_get_num_aggregators ( &static_num_io_procs ); fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh, static_num_io_procs, max_data); my_aggregator = fh->f_procs_in_group[fh->f_aggregator_index]; /* printf("max_data %ld\n", max_data); */ ret = fh->f_generate_current_file_view((struct mca_io_ompio_file_t *)fh, max_data, &iov, &iov_size); if (ret != OMPI_SUCCESS){ goto exit; } if ( iov_size > 0 ) { local_iov_array = (mca_fcoll_static_local_io_array *)malloc (iov_size * sizeof(mca_fcoll_static_local_io_array)); if ( NULL == local_iov_array){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (j=0; j < iov_size; j++){ local_iov_array[j].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) iov[j].iov_base; local_iov_array[j].length = (size_t)iov[j].iov_len; local_iov_array[j].process_id = fh->f_rank; } } else { /* Allocate at least one element to correctly create the derived data type */ local_iov_array = (mca_fcoll_static_local_io_array *)malloc (sizeof(mca_fcoll_static_local_io_array)); if ( NULL == local_iov_array){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } local_iov_array[0].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) 0; local_iov_array[0].length = (size_t) 0; local_iov_array[0].process_id = fh->f_rank; } d[0] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0]; d[1] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].length; d[2] = 
(OPAL_PTRDIFF_TYPE)&local_iov_array[0].process_id; base = d[0]; for (i=0 ; i<3 ; i++) { d[i] -= base; } /* io_array datatype for using in communication*/ types[0] = &ompi_mpi_long.dt; types[1] = &ompi_mpi_long.dt; types[2] = &ompi_mpi_int.dt; ompi_datatype_create_struct (3, blocklen, d, types, &io_array_type); ompi_datatype_commit (&io_array_type); /* #########################################################*/ fh->f_get_bytes_per_agg ( (int*) &bytes_per_cycle); local_cycles = ceil((double)max_data*fh->f_procs_per_group/bytes_per_cycle); #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif ret = fh->f_comm->c_coll.coll_allreduce (&local_cycles, &cycles, 1, MPI_INT, MPI_MAX, fh->f_comm, fh->f_comm->c_coll.coll_allreduce_module); if (OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (my_aggregator == fh->f_rank) { disp_index = (int *) malloc (fh->f_procs_per_group * sizeof(int)); if (NULL == disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int )); if (NULL == bytes_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } bytes_remaining = (int *) calloc (fh->f_procs_per_group, sizeof(int)); if (NULL == bytes_remaining){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } current_index = (int *) calloc (fh->f_procs_per_group, sizeof(int)); if (NULL == current_index){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*)); if (NULL == blocklen_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*)); if (NULL == displs_per_process) { opal_output 
(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } iovec_count_per_process = (int *) calloc (fh->f_procs_per_group, sizeof(int)); if (NULL == iovec_count_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs = (int *) calloc (fh->f_procs_per_group, sizeof(int)); if (NULL == displs){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif ret = fcoll_base_coll_allgather_array (&iov_size, 1, MPI_INT, iovec_count_per_process, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (my_aggregator == fh->f_rank) { displs[0] = 0; global_iov_count = iovec_count_per_process[0]; for (i=1 ; i<fh->f_procs_per_group ; i++) { global_iov_count += iovec_count_per_process[i]; displs[i] = displs[i-1] + iovec_count_per_process[i-1]; } } if ( (my_aggregator == fh->f_rank) && (global_iov_count > 0 )) { global_iov_array = (mca_fcoll_static_local_io_array *) malloc (global_iov_count * sizeof(mca_fcoll_static_local_io_array)); if (NULL == global_iov_array){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif ret = fcoll_base_coll_gatherv_array (local_iov_array, iov_size, io_array_type, global_iov_array, iovec_count_per_process, displs, io_array_type, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ fprintf(stderr,"global_iov_array gather error!\n"); goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (NULL != local_iov_array){ free(local_iov_array); local_iov_array = NULL; } if ( ( 
my_aggregator == fh->f_rank) && ( global_iov_count > 0 )) { sorted = (int *)malloc (global_iov_count * sizeof(int)); if (NULL == sorted) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } read_local_heap_sort (global_iov_array, global_iov_count, sorted); send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request)); if (NULL == send_req){ opal_output ( 1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *)); if (NULL == sendtype) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for ( i=0; i<fh->f_procs_per_group; i++ ) { sendtype[i] = MPI_DATATYPE_NULL; } if (NULL == bytes_per_process){ bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int)); if (NULL == bytes_per_process){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } } #if DEBUG_ON if (my_aggregator == fh->f_rank) { for (gc_in=0; gc_in<global_iov_count; gc_in++){ printf("%d: Offset[%ld]: %lld, Length[%ld]: %ld\n", global_iov_array[sorted[gc_in]].process_id, gc_in, global_iov_array[sorted[gc_in]].offset, gc_in, global_iov_array[sorted[gc_in]].length); } } #endif #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif for (index = 0; index < cycles; index++){ if (my_aggregator == fh->f_rank) { fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if (NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements= NULL; } if ( NULL != sendtype ) { for ( i=0; i<fh->f_procs_per_group; i++ ) { if ( MPI_DATATYPE_NULL != sendtype[i] ) { 
ompi_datatype_destroy (&sendtype[i] ); sendtype[i] = MPI_DATATYPE_NULL; } } } for(l=0;l<fh->f_procs_per_group;l++){ disp_index[l] = 1; if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } blocklen_per_process[l] = (int *) calloc (1, sizeof(int)); if (NULL == blocklen_per_process[l]) { opal_output (1, "OUT OF MEMORY for blocklen\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint)); if (NULL == displs_per_process[l]){ opal_output (1, "OUT OF MEMORY for displs\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } } if (index < local_cycles ) { if ((index == local_cycles-1) && (max_data % (bytes_per_cycle/fh->f_procs_per_group))) { bytes_to_read_in_cycle = max_data - position; } else if (max_data <= bytes_per_cycle/fh->f_procs_per_group) { bytes_to_read_in_cycle = max_data; } else { bytes_to_read_in_cycle = bytes_per_cycle/fh->f_procs_per_group; } } else { bytes_to_read_in_cycle = 0; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rexch = MPI_Wtime(); #endif fcoll_base_coll_gather_array (&bytes_to_read_in_cycle, 1, MPI_INT, bytes_per_process, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (recvbuf_is_contiguous ) { receive_buf = &((char*)buf)[position]; } else if (bytes_to_read_in_cycle) { receive_buf = (char *) malloc (bytes_to_read_in_cycle * sizeof(char)); if ( NULL == receive_buf){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif ret = MCA_PML_CALL(irecv(receive_buf, bytes_to_read_in_cycle, MPI_BYTE, my_aggregator, 123, fh->f_comm, &recv_req)); if (OMPI_SUCCESS != ret){ goto exit; } #if 
OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif if (my_aggregator == fh->f_rank) { for (i=0;i<fh->f_procs_per_group; i++){ while (bytes_per_process[i] > 0){ /*printf("%d: bytes_per_process[%d]: %d, bytes_remaining[%d]: %d\n", index, i, bytes_per_process[i], i, bytes_remaining[i]);*/ if (read_get_process_id(global_iov_array[sorted[current_index[i]]].process_id, fh) == i){ /* current id owns this entry!*/ if (bytes_remaining[i]){ /*Remaining bytes in the current entry of the global offset array*/ if (bytes_remaining[i] <= bytes_per_process[i]){ blocklen_per_process[i][disp_index[i] - 1] = bytes_remaining[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset + (global_iov_array[sorted[current_index[i]]].length - bytes_remaining[i]); blocklen_per_process[i] = (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int)); displs_per_process[i] = (MPI_Aint *)realloc ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint)); bytes_per_process[i] -= bytes_remaining[i]; blocklen_per_process[i][disp_index[i]] = 0; displs_per_process[i][disp_index[i]] = 0; disp_index[i] += 1; bytes_remaining[i] = 0; /* This entry has been used up, we need to move to the next entry of this process and make current_index point there*/ current_index[i] = read_find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1){ break; } continue; } else{ blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset + (global_iov_array[sorted[current_index[i]]].length - bytes_remaining[i]); bytes_remaining[i] -= bytes_per_process[i]; bytes_per_process[i] = 0; break; } } else{ if (bytes_per_process[i] < global_iov_array[sorted[current_index[i]]].length){ blocklen_per_process[i][disp_index[i] - 1] = 
bytes_per_process[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset; bytes_remaining[i] = global_iov_array[sorted[current_index[i]]].length - bytes_per_process[i]; bytes_per_process[i] = 0; break; } else { blocklen_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].length; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset; blocklen_per_process[i] = (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int)); displs_per_process[i] = (MPI_Aint *)realloc ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint)); blocklen_per_process[i][disp_index[i]] = 0; displs_per_process[i][disp_index[i]] = 0; disp_index[i] += 1; bytes_per_process[i] -= global_iov_array[sorted[current_index[i]]].length; current_index[i] = read_find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1){ break; } } } } else{ current_index[i] = read_find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1){ bytes_per_process[i] = 0; /* no more entries left to service this request*/ continue; } } } } entries_per_aggregator=0; for (i=0;i<fh->f_procs_per_group;i++){ for (j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ entries_per_aggregator++; #if DEBUG_ON printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, blocklen_per_process[i][j],j, displs_per_process[i][j], fh->f_rank); #endif } } } if (entries_per_aggregator > 0){ file_offsets_for_agg = (mca_fcoll_static_local_io_array *) malloc(entries_per_aggregator*sizeof(mca_fcoll_static_local_io_array)); if (NULL == file_offsets_for_agg) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sorted_file_offsets = (int *) malloc (entries_per_aggregator * sizeof(int)); if (NULL == sorted_file_offsets){ opal_output (1, "OUT OF 
MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } temp_index=0; global_count = 0; for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i]; j++){ if (blocklen_per_process[i][j] > 0){ file_offsets_for_agg[temp_index].length = blocklen_per_process[i][j]; global_count += blocklen_per_process[i][j]; file_offsets_for_agg[temp_index].process_id = i; file_offsets_for_agg[temp_index].offset = displs_per_process[i][j]; temp_index++; } } } } else{ continue; } read_local_heap_sort (file_offsets_for_agg, entries_per_aggregator, sorted_file_offsets); memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); memory_displacements[sorted_file_offsets[0]] = 0; for (i=1; i<entries_per_aggregator; i++){ memory_displacements[sorted_file_offsets[i]] = memory_displacements[sorted_file_offsets[i-1]] + file_offsets_for_agg[sorted_file_offsets[i-1]].length; } global_buf = (char *) malloc (global_count * sizeof(char)); if (NULL == global_buf){ opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0; i<entries_per_aggregator;i++){ printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld, disp_index :%d\n", file_offsets_for_agg[sorted_file_offsets[i]].process_id, file_offsets_for_agg[sorted_file_offsets[i]].offset, file_offsets_for_agg[sorted_file_offsets[i]].length, memory_displacements[sorted_file_offsets[i]], disp_index[i]); } #endif fh->f_io_array = (mca_io_ompio_io_array_t *) malloc (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_num_of_io_entries = 0; fh->f_io_array[0].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; fh->f_io_array[0].length = file_offsets_for_agg[sorted_file_offsets[0]].length; fh->f_io_array[0].memory_address = 
global_buf+memory_displacements[sorted_file_offsets[0]]; fh->f_num_of_io_entries++; for (i=1;i<entries_per_aggregator;i++){ if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + file_offsets_for_agg[sorted_file_offsets[i-1]].length == file_offsets_for_agg[sorted_file_offsets[i]].offset){ fh->f_io_array[fh->f_num_of_io_entries - 1].length += file_offsets_for_agg[sorted_file_offsets[i]].length; } else{ fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[i]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[i]]; fh->f_num_of_io_entries++; } } #if DEBUG_ON printf("*************************** %d\n", fh->f_num_of_io_entries); for (i=0 ; i<fh->f_num_of_io_entries ; i++) { printf(" ADDRESS: %p OFFSET: %ld LENGTH: %ld\n", fh->f_io_array[i].memory_address, (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset, fh->f_io_array[i].length); } #endif #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_read_time = MPI_Wtime(); #endif if (fh->f_num_of_io_entries) { if ( 0 > fh->f_fbtl->fbtl_preadv (fh)) { opal_output (1, "READ FAILED\n"); ret = OMPI_ERROR; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_read_time = MPI_Wtime(); read_time += end_read_time - start_read_time; #endif #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); if (my_aggregator == fh->f_rank){ for (i=0 ; i<global_count/4 ; i++) printf (" READ %d \n",((int *)global_buf)[i]); } #endif temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int)); if (NULL == temp_disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0; i<entries_per_aggregator; i++){ temp_index = file_offsets_for_agg[sorted_file_offsets[i]].process_id; displs_per_process[temp_index][temp_disp_index[temp_index]] = 
memory_displacements[sorted_file_offsets[i]]; if (temp_disp_index[temp_index] < disp_index[temp_index]){ temp_disp_index[temp_index] += 1; } else{ printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", temp_index, temp_disp_index[temp_index], temp_index, disp_index[temp_index]); } } if (NULL != temp_disp_index){ free(temp_disp_index); temp_disp_index = NULL; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_rcomm_time = MPI_Wtime(); #endif for (i=0;i<fh->f_procs_per_group; i++){ send_req[i] = MPI_REQUEST_NULL; ompi_datatype_create_hindexed(disp_index[i], blocklen_per_process[i], displs_per_process[i], MPI_BYTE, &sendtype[i]); ompi_datatype_commit(&sendtype[i]); ret = MCA_PML_CALL (isend(global_buf, 1, sendtype[i], fh->f_procs_in_group[i], 123, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, &send_req[i])); if(OMPI_SUCCESS != ret){ goto exit; } } ret = ompi_request_wait_all (fh->f_procs_per_group, send_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } /* if ( my_aggregator == fh->f_rank ) */ ret = ompi_request_wait (&recv_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rcomm_time = MPI_Wtime(); rcomm_time += end_rcomm_time - start_rcomm_time; #endif position += bytes_to_read_in_cycle; if (!recvbuf_is_contiguous) { OPAL_PTRDIFF_TYPE mem_address; size_t remaining = 0; size_t temp_position = 0; remaining = bytes_to_read_in_cycle; while (remaining && (iov_count > iov_index)){ mem_address = (OPAL_PTRDIFF_TYPE) (decoded_iov[iov_index].iov_base) + current_position; if (remaining >= (decoded_iov[iov_index].iov_len - current_position)) { memcpy ((IOVBASE_TYPE *) mem_address, receive_buf+temp_position, decoded_iov[iov_index].iov_len - current_position); remaining = remaining - (decoded_iov[iov_index].iov_len - current_position); temp_position = temp_position + (decoded_iov[iov_index].iov_len - current_position); iov_index = iov_index + 1; current_position = 0; } else{ memcpy ((IOVBASE_TYPE *) 
mem_address, receive_buf+temp_position, remaining); current_position = current_position + remaining; remaining = 0; } } if (NULL != receive_buf) { free (receive_buf); receive_buf = NULL; } } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_rexch = MPI_Wtime(); read_exch += end_rexch - start_rexch; nentry.time[0] = read_time; nentry.time[1] = rcomm_time; nentry.time[2] = read_exch; if (my_aggregator == fh->f_rank) nentry.aggregator = 1; else nentry.aggregator = 0; nentry.nprocs_for_coll = static_num_io_procs; if (!mca_common_ompio_full_print_queue(fh->f_coll_read_time)){ mca_common_ompio_register_print_entry(fh->f_coll_read_time, nentry); } #endif exit: if (NULL != decoded_iov){ free(decoded_iov); decoded_iov = NULL; } if (NULL != displs){ free(displs); displs = NULL; } if (NULL != iovec_count_per_process){ free(iovec_count_per_process); iovec_count_per_process=NULL; } if (NULL != local_iov_array){ free(local_iov_array); local_iov_array=NULL; } if (NULL != global_iov_array){ free(global_iov_array); global_iov_array=NULL; } if (my_aggregator == fh->f_rank) { for(l=0;l<fh->f_procs_per_group;l++){ if (blocklen_per_process) { free(blocklen_per_process[l]); } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } } } if (NULL != bytes_per_process){ free(bytes_per_process); bytes_per_process =NULL; } if (NULL != disp_index){ free(disp_index); disp_index =NULL; } if (NULL != displs_per_process){ free(displs_per_process); displs_per_process = NULL; } if(NULL != bytes_remaining){ free(bytes_remaining); bytes_remaining = NULL; } if(NULL != current_index){ free(current_index); current_index = NULL; } if (NULL != blocklen_per_process){ free(blocklen_per_process); blocklen_per_process =NULL; } if (NULL != bytes_remaining){ free(bytes_remaining); bytes_remaining =NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements= NULL; } if (NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg 
= NULL; } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if (NULL != sendtype){ free(sendtype); sendtype=NULL; } if ( !recvbuf_is_contiguous ) { if (NULL != receive_buf){ free(receive_buf); receive_buf=NULL; } } if (NULL != global_buf) { free(global_buf); global_buf = NULL; } if (NULL != sorted) { free(sorted); sorted = NULL; } if (NULL != send_req){ free(send_req); send_req = NULL; } return ret; }
/**
 * Ordered (rank-order) collective write through the "addproc" shared file
 * pointer component.
 *
 * All ranks of the shared-fp communicator contribute their byte count via a
 * gather to rank 0; rank 0 reserves one contiguous region of the shared file
 * pointer for the whole communicator, computes an inclusive prefix sum of the
 * per-rank counts, and scatters each rank its *end* offset.  Each rank then
 * subtracts its own contribution to obtain its start offset and performs a
 * collective write_at_all.  All ranks must call this routine (it contains
 * collectives).
 *
 * @param fh       ompio file handle; f_sharedfp_data must be initialized
 * @param buf      data to write
 * @param count    number of elements of 'datatype' in buf
 * @param datatype element datatype
 * @param status   status returned by the underlying write_at_all
 * @return OMPI_SUCCESS or an OMPI error code
 */
int mca_sharedfp_addproc_write_ordered (mca_io_ompio_file_t *fh,
                                        const void *buf,
                                        int count,
                                        struct ompi_datatype_t *datatype,
                                        ompi_status_public_t *status)
{
    int ret = OMPI_SUCCESS;
    OMPI_MPI_OFFSET_TYPE offset = 0, offsetReceived = 0;
    long sendBuff = 0;               /* this rank's byte contribution */
    long *buff=NULL;                 /* rank 0 only: gathered counts, then prefix sums */
    long offsetBuff;                 /* this rank's scattered (end-)offset */
    long bytesRequested = 0;         /* rank 0 only: total bytes across all ranks */
    int recvcnt = 1, sendcnt = 1;
    size_t numofBytes;
    int rank, size, i;
    struct mca_sharedfp_base_data_t *sh = NULL;

    if(NULL == fh->f_sharedfp_data){
        opal_output(0, "sharedfp_addproc_write_ordered: shared file pointer "
                    "structure not initialized correctly\n");
        return OMPI_ERROR;
    }

    /*Retrieve the shared file pointer structure*/
    sh = fh->f_sharedfp_data;

    /* Calculate the number of bytes to write*/
    opal_datatype_type_size ( &datatype->super, &numofBytes);
    sendBuff = count * numofBytes;

    /* Get the ranks in the communicator */
    rank = ompi_comm_rank ( sh->comm );
    size = ompi_comm_size ( sh->comm );

    if ( 0 == rank ) {
        /* NOTE(review): buff is declared long* but sized (and gathered into)
        ** as OMPI_MPI_OFFSET_TYPE elements — assumes
        ** sizeof(long) == sizeof(OMPI_MPI_OFFSET_TYPE); verify on LLP64. */
        buff = (long*)malloc(sizeof(OMPI_MPI_OFFSET_TYPE) * size);
        if ( NULL == buff )
            return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Gather every rank's byte count at rank 0 (buff is NULL and ignored
    ** on non-root ranks). */
    ret = sh->comm->c_coll.coll_gather ( &sendBuff, sendcnt, OMPI_OFFSET_DATATYPE,
                                         buff, recvcnt, OMPI_OFFSET_DATATYPE, 0,
                                         sh->comm,
                                         sh->comm->c_coll.coll_gather_module);
    if( OMPI_SUCCESS != ret ){
        goto exit;
    }

    /* All the counts are present now in the recvBuff.
       The size of recvBuff is sizeof_newComm */
    if ( 0 == rank ) {
        for (i = 0; i < size ; i ++) {
            bytesRequested += buff[i];
            if ( mca_sharedfp_addproc_verbose ){
                opal_output(ompi_sharedfp_base_framework.framework_output,
                            "sharedfp_addproc_write_ordered: Bytes requested are %ld\n",
                            bytesRequested);
            }
        }

        /* Request the offset to write bytesRequested bytes
        ** only the root process needs to do the request,
        ** since the root process will then tell the other
        ** processes at what offset they should write their
        ** share of the data.
        */
        ret = mca_sharedfp_addproc_request_position(sh,bytesRequested,&offsetReceived);
        if( OMPI_SUCCESS != ret ){
            goto exit;
        }
        if ( mca_sharedfp_addproc_verbose ){
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "sharedfp_addproc_write_ordered: Offset received is %lld\n",
                        offsetReceived);
        }

        /* Turn per-rank counts into inclusive prefix sums starting at the
        ** granted base offset: buff[i] becomes rank i's END offset. */
        buff[0] += offsetReceived;
        for (i = 1 ; i < size; i++) {
            buff[i] += buff[i-1];
        }
    }

    /* Scatter the results to the other processes*/
    ret = sh->comm->c_coll.coll_scatter ( buff, sendcnt, OMPI_OFFSET_DATATYPE,
                                          &offsetBuff, recvcnt, OMPI_OFFSET_DATATYPE,
                                          0, sh->comm,
                                          sh->comm->c_coll.coll_scatter_module );
    if( OMPI_SUCCESS != ret ){
        goto exit;
    }

    /*Each process now has its own individual offset in recvBUFF*/
    /* Subtract own contribution: end offset -> start offset, then convert
    ** from a byte offset to an etype-unit offset for write_at_all. */
    offset = offsetBuff - sendBuff;
    offset /= sh->sharedfh->f_etype_size;

    if ( mca_sharedfp_addproc_verbose ){
        opal_output(ompi_sharedfp_base_framework.framework_output,
                    "sharedfp_addproc_write_ordered: Offset returned is %lld\n",
                    offset);
    }

    /* write to the file */
    ret = ompio_io_ompio_file_write_at_all(sh->sharedfh,offset,buf,count,datatype,status);

exit:
    if ( NULL != buff ) {
        free ( buff );
    }

    return ret;
}
/**
 * Collective two-phase read (dynamic_gen2 fcoll component).
 *
 * Aggregator ranks build a merged, offset-sorted I/O list from every group
 * member's file view, read the data from the file in cycles of
 * bytes_per_cycle, and redistribute each cycle's data to the owning ranks
 * with isend/irecv using hindexed datatypes.  Non-contiguous receive buffers
 * are staged through a temporary buffer and copied out via the decoded iovec.
 *
 * Fixes vs. previous revision:
 *  - exit cleanup of displs_per_process looped on the wrong variable
 *    ("for (l=0; i<...; l++)"), leaking every per-process displacement array;
 *  - DEBUG_ON printf referenced the undefined 'bytes_to_write_in_cycle'
 *    (copy/paste from the write path), breaking the build with DEBUG_ON;
 *  - memory_displacements allocation was the only unchecked malloc.
 *
 * @param fh       ompio file handle (file view, group, fbtl already set up)
 * @param buf      destination buffer
 * @param count    number of 'datatype' elements to read
 * @param datatype element datatype
 * @param status   if not MPI_STATUS_IGNORE, _ucount is set to max_data
 * @return OMPI_SUCCESS or an OMPI error code
 */
int mca_fcoll_dynamic_gen2_file_read_all (mca_io_ompio_file_t *fh,
                                          void *buf,
                                          int count,
                                          struct ompi_datatype_t *datatype,
                                          ompi_status_public_t *status)
{
    MPI_Aint position = 0;
    MPI_Aint total_bytes = 0;          /* total bytes to be read */
    MPI_Aint bytes_to_read_in_cycle = 0; /* left to be read in a cycle*/
    MPI_Aint bytes_per_cycle = 0;      /* total read in each cycle by each process*/
    int index = 0, ret=OMPI_SUCCESS;
    int cycles = 0;
    int i=0, j=0, l=0;
    int n=0; /* current position in total_bytes_per_process array */
    MPI_Aint bytes_remaining = 0; /* how many bytes have been read from the current
                                     value from total_bytes_per_process */
    int *sorted_file_offsets=NULL, entries_per_aggregator=0;
    int bytes_received = 0;
    int blocks = 0;
    /* iovec structure and count of the buffer passed in */
    uint32_t iov_count = 0;
    struct iovec *decoded_iov = NULL;
    int iov_index = 0;
    size_t current_position = 0;
    struct iovec *local_iov_array=NULL, *global_iov_array=NULL;
    char *receive_buf = NULL;
    MPI_Aint *memory_displacements=NULL;
    /* global iovec at the readers that contain the iovecs created from
       file_set_view */
    uint32_t total_fview_count = 0;
    int local_count = 0;
    int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL;
    int current_index=0, temp_index=0;
    int **blocklen_per_process=NULL;
    MPI_Aint **displs_per_process=NULL;
    char *global_buf = NULL;
    MPI_Aint global_count = 0;
    mca_io_ompio_local_io_array *file_offsets_for_agg=NULL;

    /* array that contains the sorted indices of the global_iov */
    int *sorted = NULL;
    int *displs = NULL;
    int dynamic_gen2_num_io_procs;
    size_t max_data = 0;
    MPI_Aint *total_bytes_per_process = NULL;
    ompi_datatype_t **sendtype = NULL;
    MPI_Request *send_req=NULL, recv_req=NULL;
    int my_aggregator =-1;
    bool recvbuf_is_contiguous=false;
    size_t ftype_size;
    OPAL_PTRDIFF_TYPE ftype_extent, lb;

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0;
    double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0;
    double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0;
    mca_io_ompio_print_entry nentry;
#endif

    /**************************************************************************
     ** 1. In case the data is not contigous in memory, decode it into an iovec
     **************************************************************************/
    opal_datatype_type_size ( &datatype->super, &ftype_size );
    opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent );

    if ( (ftype_extent == (OPAL_PTRDIFF_TYPE) ftype_size) &&
         opal_datatype_is_contiguous_memory_layout(&datatype->super,1) &&
         0 == lb ) {
        recvbuf_is_contiguous = true;
    }

    if (! recvbuf_is_contiguous ) {
        ret = fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh,
                                     datatype,
                                     count,
                                     buf,
                                     &max_data,
                                     &decoded_iov,
                                     &iov_count);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }

    fh->f_get_num_aggregators ( &dynamic_gen2_num_io_procs);
    ret = fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh,
                                      dynamic_gen2_num_io_procs,
                                      max_data);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
    my_aggregator = fh->f_procs_in_group[fh->f_aggregator_index];

    /**************************************************************************
     ** 2. Determine the total amount of data to be written
     **************************************************************************/
    total_bytes_per_process = (MPI_Aint*)malloc(fh->f_procs_per_group*sizeof(MPI_Aint));
    if (NULL == total_bytes_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgather_array (&max_data,
                                           1,
                                           MPI_LONG,
                                           total_bytes_per_process,
                                           1,
                                           MPI_LONG,
                                           fh->f_aggregator_index,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    for (i=0 ; i<fh->f_procs_per_group ; i++) {
        total_bytes += total_bytes_per_process[i];
    }

    if (NULL != total_bytes_per_process) {
        free (total_bytes_per_process);
        total_bytes_per_process = NULL;
    }

    /*********************************************************************
     *** 3. Generate the File offsets/lengths corresponding to this write
     ********************************************************************/
    ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *) fh,
                                            max_data,
                                            &local_iov_array,
                                            &local_count);
    if (ret != OMPI_SUCCESS){
        goto exit;
    }

    /*************************************************************
     *** 4. Allgather the File View information at all processes
     *************************************************************/
    fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == fview_count) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgather_array (&local_count,
                                           1,
                                           MPI_INT,
                                           fview_count,
                                           1,
                                           MPI_INT,
                                           fh->f_aggregator_index,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    displs = (int*)malloc (fh->f_procs_per_group*sizeof(int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    displs[0] = 0;
    total_fview_count = fview_count[0];
    for (i=1 ; i<fh->f_procs_per_group ; i++) {
        total_fview_count += fview_count[i];
        displs[i] = displs[i-1] + fview_count[i-1];
    }

#if DEBUG_ON
    if (my_aggregator == fh->f_rank) {
        for (i=0 ; i<fh->f_procs_per_group ; i++) {
            printf ("%d: PROCESS: %d  ELEMENTS: %d  DISPLS: %d\n",
                    fh->f_rank,
                    i,
                    fview_count[i],
                    displs[i]);
        }
    }
#endif

    /* allocate the global iovec  */
    if (0 != total_fview_count) {
        global_iov_array = (struct iovec*)malloc (total_fview_count *
                                                  sizeof(struct iovec));
        if (NULL == global_iov_array) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgatherv_array (local_iov_array,
                                            local_count,
                                            fh->f_iov_type,
                                            global_iov_array,
                                            fview_count,
                                            displs,
                                            fh->f_iov_type,
                                            fh->f_aggregator_index,
                                            fh->f_procs_in_group,
                                            fh->f_procs_per_group,
                                            fh->f_comm);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    /****************************************************************************************
     *** 5. Sort the global offset/lengths list based on the offsets.
     *** The result of the sort operation is the 'sorted', an integer array,
     *** which contains the indexes of the global_iov_array based on the offset.
     *** For example, if global_iov_array[x].offset is followed by global_iov_array[y].offset
     *** in the file, and that one is followed by global_iov_array[z].offset, than
     *** sorted[0] = x, sorted[1]=y and sorted[2]=z;
     ******************************************************************************************/
    if (0 != total_fview_count) {
        sorted = (int *)malloc (total_fview_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        fh->f_sort_iovec (global_iov_array, total_fview_count, sorted);
    }

    if (NULL != local_iov_array) {
        free (local_iov_array);
        local_iov_array = NULL;
    }

#if DEBUG_ON
    if (my_aggregator == fh->f_rank) {
        for (i=0 ; i<total_fview_count ; i++) {
            printf("%d: OFFSET: %p   LENGTH: %d\n",
                   fh->f_rank,
                   global_iov_array[sorted[i]].iov_base,
                   global_iov_array[sorted[i]].iov_len);
        }
    }
#endif

    /*************************************************************
     *** 6. Determine the number of cycles required to execute this
     ***    operation
     *************************************************************/
    fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle);
    cycles = ceil((double)total_bytes/bytes_per_cycle);

    if ( my_aggregator == fh->f_rank) {
        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));
        if (NULL == displs_per_process){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        for (i=0;i<fh->f_procs_per_group;i++){
            blocklen_per_process[i] = NULL;
            displs_per_process[i] = NULL;
        }

        send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request));
        if (NULL == send_req){
            opal_output ( 1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        global_buf = (char *) malloc (bytes_per_cycle);
        if (NULL == global_buf){
            opal_output(1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *));
        if (NULL == sendtype) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        for(l=0;l<fh->f_procs_per_group;l++){
            sendtype[l] = MPI_DATATYPE_NULL;
        }
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif
    n = 0;
    bytes_remaining = 0;
    current_index = 0;

    for (index = 0; index < cycles; index++) {
        /**********************************************************************
         *** 7a. Getting ready for next cycle: initializing and freeing buffers
         **********************************************************************/
        if (my_aggregator == fh->f_rank) {
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            fh->f_num_of_io_entries = 0;

            if (NULL != sendtype){
                for (i =0; i< fh->f_procs_per_group; i++) {
                    if ( MPI_DATATYPE_NULL != sendtype[i] ) {
                        ompi_datatype_destroy(&sendtype[i]);
                        sendtype[i] = MPI_DATATYPE_NULL;
                    }
                }
            }

            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] = 1;

                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }

            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }
            if(NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }
            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements = NULL;
            }
        } /* (my_aggregator == fh->f_rank */

        /**************************************************************************
         ***  7b. Determine the number of bytes to be actually read in this cycle
         **************************************************************************/
        if (cycles-1 == index) {
            bytes_to_read_in_cycle = total_bytes - bytes_per_cycle*index;
        }
        else {
            bytes_to_read_in_cycle = bytes_per_cycle;
        }

#if DEBUG_ON
        if (my_aggregator == fh->f_rank) {
            /* fixed: previously referenced the undefined
               bytes_to_write_in_cycle (copy/paste from the write path) */
            printf ("****%d: CYCLE %d   Bytes %d**********\n",
                    fh->f_rank,
                    index,
                    bytes_to_read_in_cycle);
        }
#endif

        /*****************************************************************
         *** 7c. Calculate how much data will be contributed in this cycle
         ***     by each process
         *****************************************************************/
        bytes_received = 0;

        while (bytes_to_read_in_cycle) {
            /* This next block identifies which process is the holder
            ** of the sorted[current_index] element;
            */
            blocks = fview_count[0];
            for (j=0 ; j<fh->f_procs_per_group ; j++) {
                if (sorted[current_index] < blocks) {
                    n = j;
                    break;
                }
                else {
                    blocks += fview_count[j+1];
                }
            }

            if (bytes_remaining) {
                /* Finish up a partially used buffer from the previous  cycle */
                if (bytes_remaining <= bytes_to_read_in_cycle) {
                    /* Data fits completely into the block */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len - bytes_remaining);

                        blocklen_per_process[n] = (int *) realloc
                            ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *) realloc
                            ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_remaining;
                    }
                    current_index ++;
                    bytes_to_read_in_cycle -= bytes_remaining;
                    bytes_remaining = 0;
                    continue;
                }
                else {
                    /* the remaining data from the previous cycle is larger than the
                       bytes_to_write_in_cycle, so we have to segment again */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len - bytes_remaining);
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_to_read_in_cycle;
                    }
                    bytes_remaining -= bytes_to_read_in_cycle;
                    bytes_to_read_in_cycle = 0;
                    break;
                }
            }
            else {
                /* No partially used entry available, have to start a new one */
                if (bytes_to_read_in_cycle <
                    (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
                    /* This entry has more data than we can sendin one cycle */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_to_read_in_cycle;
                    }
                    bytes_remaining = global_iov_array[sorted[current_index]].iov_len -
                        bytes_to_read_in_cycle;
                    bytes_to_read_in_cycle = 0;
                    break;
                }
                else {
                    /* Next data entry is less than bytes_to_write_in_cycle */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] =
                            global_iov_array[sorted[current_index]].iov_len;
                        displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)
                            global_iov_array[sorted[current_index]].iov_base;

                        blocklen_per_process[n] =
                            (int *) realloc ((void *)blocklen_per_process[n],
                                             (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] =
                            (MPI_Aint *)realloc ((void *)displs_per_process[n],
                                                 (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += global_iov_array[sorted[current_index]].iov_len;
                    }
                    bytes_to_read_in_cycle -= global_iov_array[sorted[current_index]].iov_len;
                    current_index ++;
                    continue;
                }
            }
        } /* end while (bytes_to_read_in_cycle) */

        /*************************************************************************
         *** 7d. Calculate the displacement on where to put the data and allocate
         ***     the recieve buffer (global_buf)
         *************************************************************************/
        if (my_aggregator == fh->f_rank) {
            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group; i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0)
                        entries_per_aggregator++ ;
                }
            }
            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (mca_io_ompio_local_io_array *)
                    malloc(entries_per_aggregator*sizeof(mca_io_ompio_local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *)
                    malloc (entries_per_aggregator*sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }

                /*Moving file offsets to an IO array!*/
                temp_index = 0;
                global_count = 0;
                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i];j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            global_count += blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
                        }
                    }
                }
            }
            else{
                /* nothing to read for this aggregator in this cycle */
                continue;
            }

            /* Sort the displacements for each aggregator */
            read_heap_sort (file_offsets_for_agg, entries_per_aggregator,
                            sorted_file_offsets);

            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));
            if (NULL == memory_displacements){
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }

            /**********************************************************
             *** 7e. Create the io array, and pass it to fbtl
             *********************************************************/
            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            fh->f_num_of_io_entries = 0;
            fh->f_io_array[0].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[0].length =
                file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[0].memory_address =
                global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;

            /* merge file-contiguous entries into a single io_array entry */
            for (i=1;i<entries_per_aggregator;i++){
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else{
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_read_time = MPI_Wtime();
#endif

            if (fh->f_num_of_io_entries) {
                if ( 0 > fh->f_fbtl->fbtl_preadv (fh)) {
                    opal_output (1, "READ FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_read_time = MPI_Wtime();
            read_time += end_read_time - start_read_time;
#endif
            /**********************************************************
             ******************** DONE READING ************************
             *********************************************************/

            /* rewrite displs_per_process to hold displacements into
               global_buf for the scatter phase */
            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            for (i=0; i<entries_per_aggregator; i++){
                temp_index =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_index][temp_disp_index[temp_index]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_index] < disp_index[temp_index]){
                    temp_disp_index[temp_index] += 1;
                }
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_index, temp_disp_index[temp_index],
                           temp_index, disp_index[temp_index]);
                }
            }
            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_rcomm_time = MPI_Wtime();
#endif
            for (i=0;i<fh->f_procs_per_group;i++){
                send_req[i] = MPI_REQUEST_NULL;
                if ( 0 < disp_index[i] ) {
                    ompi_datatype_create_hindexed(disp_index[i],
                                                  blocklen_per_process[i],
                                                  displs_per_process[i],
                                                  MPI_BYTE,
                                                  &sendtype[i]);
                    ompi_datatype_commit(&sendtype[i]);
                    ret = MCA_PML_CALL (isend(global_buf,
                                              1,
                                              sendtype[i],
                                              fh->f_procs_in_group[i],
                                              123,
                                              MCA_PML_BASE_SEND_STANDARD,
                                              fh->f_comm,
                                              &send_req[i]));
                    if(OMPI_SUCCESS != ret){
                        goto exit;
                    }
                }
            }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_rcomm_time = MPI_Wtime();
            rcomm_time += end_rcomm_time - start_rcomm_time;
#endif
        }

        /**********************************************************
         *** 7f. Scatter the Data from the readers
         *********************************************************/
        if ( recvbuf_is_contiguous ) {
            receive_buf = &((char*)buf)[position];
        }
        else if (bytes_received) {
            /* allocate a receive buffer and copy the data that needs
               to be received into it in case the data is non-contigous
               in memory */
            receive_buf = malloc (bytes_received);
            if (NULL == receive_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        start_rcomm_time = MPI_Wtime();
#endif
        ret = MCA_PML_CALL(irecv(receive_buf,
                                 bytes_received,
                                 MPI_BYTE,
                                 my_aggregator,
                                 123,
                                 fh->f_comm,
                                 &recv_req));
        if (OMPI_SUCCESS != ret){
            goto exit;
        }

        if (my_aggregator == fh->f_rank){
            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         send_req,
                                         MPI_STATUS_IGNORE);
            if (OMPI_SUCCESS != ret){
                goto exit;
            }
        }

        ret = ompi_request_wait (&recv_req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }
        position += bytes_received;

        /* If data is not contigous in memory, copy the data from the
           receive buffer into the buffer passed in */
        if (!recvbuf_is_contiguous ) {
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            remaining = bytes_received;

            while (remaining) {
                mem_address = (OPAL_PTRDIFF_TYPE)
                    (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }

            if (NULL != receive_buf) {
                free (receive_buf);
                receive_buf = NULL;
            }
        }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time += end_rcomm_time - start_rcomm_time;
#endif
    } /* end for (index=0; index < cycles; index ++) */

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rexch = MPI_Wtime();
    read_exch += end_rexch - start_rexch;
    nentry.time[0] = read_time;
    nentry.time[1] = rcomm_time;
    nentry.time[2] = read_exch;
    if (my_aggregator == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = dynamic_gen2_num_io_procs;
    if (!fh->f_full_print_queue(READ_PRINT_QUEUE)){
        fh->f_register_print_entry(READ_PRINT_QUEUE,
                                   nentry);
    }
#endif

exit:
    if (!recvbuf_is_contiguous) {
        if (NULL != receive_buf) {
            free (receive_buf);
            receive_buf = NULL;
        }
    }
    if (NULL != global_buf) {
        free (global_buf);
        global_buf = NULL;
    }
    if (NULL != sorted) {
        free (sorted);
        sorted = NULL;
    }
    if (NULL != global_iov_array) {
        free (global_iov_array);
        global_iov_array = NULL;
    }
    if (NULL != fview_count) {
        free (fview_count);
        fview_count = NULL;
    }
    if (NULL != decoded_iov) {
        free (decoded_iov);
        decoded_iov = NULL;
    }
    if (NULL != local_iov_array){
        free(local_iov_array);
        local_iov_array=NULL;
    }
    if (NULL != displs) {
        free (displs);
        displs = NULL;
    }
    if (my_aggregator == fh->f_rank) {

        if (NULL != sorted_file_offsets){
            free(sorted_file_offsets);
            sorted_file_offsets = NULL;
        }
        if (NULL != file_offsets_for_agg){
            free(file_offsets_for_agg);
            file_offsets_for_agg = NULL;
        }
        if (NULL != memory_displacements){
            free(memory_displacements);
            memory_displacements= NULL;
        }
        if (NULL != sendtype){
            for (i = 0; i < fh->f_procs_per_group; i++) {
                if ( MPI_DATATYPE_NULL != sendtype[i] ) {
                    ompi_datatype_destroy(&sendtype[i]);
                }
            }
            free(sendtype);
            sendtype=NULL;
        }
        if (NULL != disp_index){
            free(disp_index);
            disp_index = NULL;
        }
        if ( NULL != blocklen_per_process){
            for(l=0;l<fh->f_procs_per_group;l++){
                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
            }
            free(blocklen_per_process);
            blocklen_per_process = NULL;
        }
        if (NULL != displs_per_process){
            /* fixed: loop condition previously tested 'i' instead of 'l',
               leaking every per-process displacement array */
            for (l=0; l<fh->f_procs_per_group; l++){
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
            }
            free(displs_per_process);
            displs_per_process = NULL;
        }
        if ( NULL != send_req ) {
            free ( send_req );
            send_req = NULL;
        }
    }
    return ret;
}
int mca_io_ompio_set_view_internal(mca_io_ompio_file_t *fh, OMPI_MPI_OFFSET_TYPE disp, ompi_datatype_t *etype, ompi_datatype_t *filetype, char *datarep, ompi_info_t *info) { size_t max_data = 0; int i; int num_groups = 0; contg *contg_groups; MPI_Aint lb,ub; fh->f_iov_count = 0; fh->f_disp = disp; fh->f_offset = disp; fh->f_total_bytes = 0; ompi_io_ompio_decode_datatype (fh, filetype, 1, NULL, &max_data, &fh->f_decoded_iov, &fh->f_iov_count); opal_datatype_get_extent(&filetype->super, &lb, &fh->f_view_extent); opal_datatype_type_ub (&filetype->super, &ub); opal_datatype_type_size (&etype->super, &fh->f_etype_size); opal_datatype_type_size (&filetype->super, &fh->f_view_size); ompi_datatype_duplicate (etype, &fh->f_etype); ompi_datatype_duplicate (filetype, &fh->f_filetype); fh->f_cc_size = get_contiguous_chunk_size (fh); if (opal_datatype_is_contiguous_memory_layout(&etype->super,1)) { if (opal_datatype_is_contiguous_memory_layout(&filetype->super,1) && fh->f_view_extent == (OPAL_PTRDIFF_TYPE)fh->f_view_size ) { fh->f_flags |= OMPIO_CONTIGUOUS_FVIEW; } } contg_groups = (contg*) calloc ( 1, fh->f_size * sizeof(contg)); if (NULL == contg_groups) { opal_output (1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } for( i = 0; i < fh->f_size; i++) { contg_groups[i].procs_in_contg_group = (int*)calloc (1,fh->f_size * sizeof(int)); if(NULL == contg_groups[i].procs_in_contg_group) { int j; opal_output (1, "OUT OF MEMORY\n"); for(j=0; j<i; j++) { free(contg_groups[j].procs_in_contg_group); } free(contg_groups); return OMPI_ERR_OUT_OF_RESOURCE; } } if( OMPI_SUCCESS != mca_io_ompio_fview_based_grouping(fh, &num_groups, contg_groups)) { opal_output(1, "mca_io_ompio_fview_based_grouping() failed\n"); free(contg_groups); return OMPI_ERROR; } if( !( (fh->f_comm->c_flags & OMPI_COMM_CART) && (num_groups == 1 || num_groups == fh->f_size)) ) { mca_io_ompio_finalize_initial_grouping(fh, num_groups, contg_groups); } for( i = 0; i < fh->f_size; i++) { 
free(contg_groups[i].procs_in_contg_group); } free(contg_groups); return OMPI_SUCCESS; }
/**
 * MPI_File_set_view entry point for the OMPIO module.
 *
 * Tears down the previously installed view (datatypes, decoded iovec,
 * datarep string), installs the new one via mca_io_ompio_set_view_internal(),
 * and re-runs the fcoll component selection since the access pattern may
 * have changed.
 *
 * For the common "contiguous byte stream" case (etype == filetype, both
 * predefined and hole-free) an internal contiguous view of
 * MCA_IO_DEFAULT_FILE_VIEW_SIZE bytes is substituted.
 *
 * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE, or an error code from
 *         the internal set-view / fcoll selection.
 */
int mca_io_ompio_file_set_view (ompi_file_t *fp,
                                OMPI_MPI_OFFSET_TYPE disp,
                                ompi_datatype_t *etype,
                                ompi_datatype_t *filetype,
                                char *datarep,
                                ompi_info_t *info)
{
    int ret = OMPI_SUCCESS;
    mca_io_ompio_data_t *data;
    mca_io_ompio_file_t *fh;
    size_t ftype_size;
    OPAL_PTRDIFF_TYPE ftype_extent, lb;

    data = (mca_io_ompio_data_t *) fp->f_io_selected_data;
    fh = &data->ompio_fh;

    /* Release the resources of the previous view. */
    ompi_datatype_destroy (&fh->f_etype);
    ompi_datatype_destroy (&fh->f_filetype);
    ompi_datatype_destroy (&fh->f_orig_filetype);

    if (NULL != fh->f_decoded_iov) {
        free (fh->f_decoded_iov);
        fh->f_decoded_iov = NULL;
    }
    if (NULL != fh->f_datarep) {
        free (fh->f_datarep);
        fh->f_datarep = NULL;
    }

    /* Reset the flags first */
    fh->f_flags = 0;
    fh->f_flags |= OMPIO_FILE_VIEW_IS_SET;

    fh->f_datarep = strdup (datarep);
    /* BUGFIX: strdup result was previously used unchecked. */
    if (NULL == fh->f_datarep) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    ompi_datatype_duplicate (filetype, &fh->f_orig_filetype);

    opal_datatype_get_extent (&filetype->super, &lb, &ftype_extent);
    opal_datatype_type_size  (&filetype->super, &ftype_size);

    if ( etype == filetype &&
         ompi_datatype_is_predefined (filetype) &&
         ftype_extent == (OPAL_PTRDIFF_TYPE)ftype_size ) {
        /* Contiguous special case: substitute a larger internal byte view
           to reduce per-call overhead. */
        ompi_datatype_t *newfiletype;
        ompi_datatype_create_contiguous(MCA_IO_DEFAULT_FILE_VIEW_SIZE,
                                        &ompi_mpi_byte.dt, &newfiletype);
        ompi_datatype_commit (&newfiletype);
        ret = mca_io_ompio_set_view_internal (fh, disp, etype, newfiletype,
                                              datarep, info);
        ompi_datatype_destroy (&newfiletype);
    }
    else {
        ret = mca_io_ompio_set_view_internal (fh, disp, etype, filetype,
                                              datarep, info);
    }
    /* BUGFIX: the return value of mca_io_ompio_set_view_internal() was
       previously discarded, so allocation/grouping failures were silently
       reported as success. */
    if (OMPI_SUCCESS != ret) {
        return ret;
    }

    if (OMPI_SUCCESS != mca_fcoll_base_file_select (&data->ompio_fh, NULL)) {
        opal_output(1, "mca_fcoll_base_file_select() failed\n");
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
/**
 * Two-phase collective write (dynamic fcoll component).
 *
 * Phase 1 (shuffle): every process exchanges its file-view iovecs; each
 * aggregator computes, per cycle of bytes_per_cycle bytes, which byte
 * ranges it receives from which process and gathers them into global_buf
 * via hindexed receive datatypes.
 * Phase 2 (write): each aggregator merges contiguous ranges into an
 * io_array and hands it to the fbtl pwritev backend.
 *
 * Interface unchanged. Fixes applied relative to the original:
 *  - all mid-function "return OMPI_ERR_OUT_OF_RESOURCE" replaced by
 *    "ret = ...; goto exit;" so the exit cleanup runs (leaks fixed),
 *  - NULL check added for the memory_displacements allocation,
 *  - exit cleanup no longer dereferences blocklen_per_process /
 *    displs_per_process when the outer arrays were never allocated,
 *  - exit cleanup also releases displs, total_bytes_per_process,
 *    local_iov_array and the per-cycle aggregator scratch arrays that
 *    leaked on error paths,
 *  - final "return OMPI_SUCCESS" changed to "return ret" so errors that
 *    jumped to exit are actually reported.
 */
int mca_fcoll_dynamic_file_write_all (mca_io_ompio_file_t *fh,
                                      void *buf,
                                      int count,
                                      struct ompi_datatype_t *datatype,
                                      ompi_status_public_t *status)
{
    MPI_Aint total_bytes_written = 0;     /* total bytes that have been written */
    MPI_Aint total_bytes = 0;             /* total bytes to be written */
    MPI_Aint bytes_to_write_in_cycle = 0; /* left to be written in a cycle */
    MPI_Aint bytes_per_cycle = 0;         /* total written in each cycle by each process */
    int index = 0;
    int cycles = 0;
    int i=0, j=0, l=0;
    int n=0;                              /* current position in total_bytes_per_process array */
    MPI_Aint bytes_remaining = 0;         /* bytes left from the current global iovec entry */
    int bytes_sent = 0, ret = 0;
    int blocks = 0, entries_per_aggregator = 0;
    /* iovec structure and count of the buffer passed in */
    uint32_t iov_count = 0;
    struct iovec *decoded_iov = NULL;
    int iov_index = 0;
    char *send_buf = NULL;
    size_t current_position = 0;
    struct iovec *local_iov_array = NULL, *global_iov_array = NULL;
    local_io_array *file_offsets_for_agg = NULL;
    /* global iovec at the writers that contain the iovecs created from file_set_view */
    uint32_t total_fview_count = 0;
    int local_count = 0, temp_pindex;
    int *fview_count = NULL, *disp_index = NULL, *temp_disp_index = NULL;
    int current_index = 0, temp_index = 0;
    char *global_buf = NULL;
    MPI_Aint global_count = 0;
    /* array that contains the sorted indices of the global_iov */
    int *sorted = NULL, *sorted_file_offsets = NULL;
    int *displs = NULL;
    int dynamic_num_io_procs;
    size_t max_data = 0, datatype_size = 0;
    int **blocklen_per_process = NULL;
    MPI_Aint **displs_per_process = NULL, *memory_displacements = NULL;
    ompi_datatype_t **recvtype = NULL;
    MPI_Aint *total_bytes_per_process = NULL;
    MPI_Request *send_req = NULL, *recv_req = NULL;
    int recv_req_count = 0;
#if TIME_BREAKDOWN
    double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
    double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0;
    double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0;
    print_entry nentry;
#endif

//    if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
//        fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY;
//    }

    /**************************************************************************
     ** In case the data is not contigous in memory, decode it into an iovec **
     **************************************************************************/
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
        ret = ompi_io_ompio_decode_datatype (fh, datatype, count, buf,
                                             &max_data, &decoded_iov,
                                             &iov_count);
        if (OMPI_SUCCESS != ret) {
            goto exit;
        }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }

    mca_io_ompio_get_num_aggregators ( &dynamic_num_io_procs );
    ret = ompi_io_ompio_set_aggregator_props (fh, dynamic_num_io_procs,
                                              max_data);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

    total_bytes_per_process = (MPI_Aint*)malloc
        (fh->f_procs_per_group * sizeof(MPI_Aint));
    if (NULL == total_bytes_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    ret = ompi_io_ompio_allgather_array (&max_data, 1, MPI_LONG,
                                         total_bytes_per_process, 1, MPI_LONG,
                                         fh->f_aggregator_index,
                                         fh->f_procs_in_group,
                                         fh->f_procs_per_group,
                                         fh->f_comm);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

    for (i=0 ; i<fh->f_procs_per_group ; i++) {
        total_bytes += total_bytes_per_process[i];
    }

    if (NULL != total_bytes_per_process) {
        free (total_bytes_per_process);
        total_bytes_per_process = NULL;
    }

    /*********************************************************************
     *** Generate the File offsets/lengths corresponding to this write ***
     ********************************************************************/
    ret = ompi_io_ompio_generate_current_file_view(fh, max_data,
                                                   &local_iov_array,
                                                   &local_count);
    if (ret != OMPI_SUCCESS) {
        goto exit;
    }

#if DEBUG_ON
    for (i=0 ; i<local_count ; i++) {
        printf("%d: OFFSET: %d LENGTH: %ld\n",
               fh->f_rank,
               local_iov_array[i].iov_base,
               local_iov_array[i].iov_len);
    }
#endif

    /*************************************************************
     *** ALLGather the File View information at all processes ***
     *************************************************************/
    fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == fview_count) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    ret = ompi_io_ompio_allgather_array (&local_count, 1, MPI_INT,
                                         fview_count, 1, MPI_INT,
                                         fh->f_aggregator_index,
                                         fh->f_procs_in_group,
                                         fh->f_procs_per_group,
                                         fh->f_comm);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

    displs = (int*) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    displs[0] = 0;
    total_fview_count = fview_count[0];
    for (i=1 ; i<fh->f_procs_per_group ; i++) {
        total_fview_count += fview_count[i];
        displs[i] = displs[i-1] + fview_count[i-1];
    }

#if DEBUG_ON
    printf("total_fview_count : %d\n", total_fview_count);
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        for (i=0 ; i<fh->f_procs_per_group ; i++) {
            printf ("%d: PROCESS: %d ELEMENTS: %d DISPLS: %d\n",
                    fh->f_rank, i, fview_count[i], displs[i]);
        }
    }
#endif

    /* allocate the global iovec */
    if (0 != total_fview_count) {
        global_iov_array = (struct iovec*) malloc (total_fview_count *
                                                   sizeof(struct iovec));
        if (NULL == global_iov_array) {
            opal_output(1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

    ret = ompi_io_ompio_allgatherv_array (local_iov_array,
                                          local_count,
                                          fh->f_iov_type,
                                          global_iov_array,
                                          fview_count,
                                          displs,
                                          fh->f_iov_type,
                                          fh->f_aggregator_index,
                                          fh->f_procs_in_group,
                                          fh->f_procs_per_group,
                                          fh->f_comm);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

    /* sort it */
    if (0 != total_fview_count) {
        sorted = (int *)malloc (total_fview_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        ompi_io_ompio_sort_iovec (global_iov_array, total_fview_count, sorted);
    }

    if (NULL != local_iov_array) {
        free(local_iov_array);
        local_iov_array = NULL;
    }
    if (NULL != displs) {
        free(displs);
        displs = NULL;
    }

#if DEBUG_ON
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        uint32_t tv=0;
        for (tv=0 ; tv<total_fview_count ; tv++) {
            printf("%d: OFFSET: %lld LENGTH: %ld\n",
                   fh->f_rank,
                   global_iov_array[sorted[tv]].iov_base,
                   global_iov_array[sorted[tv]].iov_len);
        }
    }
#endif

    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        blocklen_per_process = (int **)malloc (fh->f_procs_per_group *
                                               sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group *
                                                  sizeof (MPI_Aint*));
        if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        for (i=0 ; i<fh->f_procs_per_group ; i++) {
            blocklen_per_process[i] = NULL;
            displs_per_process[i] = NULL;
        }
    }

    mca_io_ompio_get_bytes_per_agg ( (int *)&bytes_per_cycle );
    cycles = ceil((double)total_bytes/bytes_per_cycle);
    n = 0;
    bytes_remaining = 0;
    current_index = 0;

#if TIME_BREAKDOWN
    start_exch = MPI_Wtime();
#endif

    for (index = 0; index < cycles; index++) {
        /* Getting ready for next cycle: initializing and freeing buffers */
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            if (NULL == recvtype) {
                recvtype = (ompi_datatype_t **)
                    malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *));
                if (NULL == recvtype) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
            for (l=0 ; l<fh->f_procs_per_group ; l++) {
                disp_index[l] = 1;
                if (NULL != blocklen_per_process[l]) {
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]) {
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
            if (NULL != sorted_file_offsets) {
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }
            if (NULL != file_offsets_for_agg) {
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }
            if (NULL != memory_displacements) {
                free(memory_displacements);
                memory_displacements = NULL;
            }
        }

        /* Last cycle writes only the remainder. */
        if (cycles-1 == index) {
            bytes_to_write_in_cycle = total_bytes - bytes_per_cycle*index;
        }
        else {
            bytes_to_write_in_cycle = bytes_per_cycle;
        }

#if DEBUG_ON
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            printf ("****%d: CYCLE %d Bytes %lld**********\n",
                    fh->f_rank, index, bytes_to_write_in_cycle);
        }
#endif
        /**********************************************************
         ** Gather the Data from all the processes at the writers **
         *********************************************************/

        /* Calculate how much data will be contributed in this cycle
           by each process */
        bytes_sent = 0;
#if DEBUG_ON
        printf("bytes_to_write_in_cycle: %ld, cycle : %d\n",
               bytes_to_write_in_cycle, index);
#endif
        /* The blocklen and displs calculation only done at aggregators! */
        while (bytes_to_write_in_cycle) {
            /* Find which process (n) owns the current global iovec entry. */
            blocks = fview_count[0];
            for (j=0 ; j<fh->f_procs_per_group ; j++) {
                if (sorted[current_index] < blocks) {
                    n = j;
                    break;
                }
                else {
                    blocks += fview_count[j+1];
                }
            }

            if (bytes_remaining) {
                /* partially consumed entry from a previous iteration */
                if (bytes_remaining <= bytes_to_write_in_cycle) {
                    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len - bytes_remaining);
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += bytes_remaining;
                    }
                    current_index ++;
                    bytes_to_write_in_cycle -= bytes_remaining;
                    bytes_remaining = 0;
                    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
                        /* The entry is consumed: grow the arrays for the
                           next displacement and blocklength. */
                        blocklen_per_process[n] = (int *) realloc
                            ((void *)blocklen_per_process[n],
                             (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *) realloc
                            ((void *)displs_per_process[n],
                             (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    continue;
                }
                else {
                    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len - bytes_remaining);
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += bytes_to_write_in_cycle;
                    }
                    bytes_remaining -= bytes_to_write_in_cycle;
                    bytes_to_write_in_cycle = 0;
                    break;
                }
            }
            else {
                /* fresh entry */
                if (bytes_to_write_in_cycle <
                    (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
                    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += bytes_to_write_in_cycle;
                    }
                    bytes_remaining = global_iov_array[sorted[current_index]].iov_len -
                        bytes_to_write_in_cycle;
                    bytes_to_write_in_cycle = 0;
                    break;
                }
                else {
                    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] =
                            global_iov_array[sorted[current_index]].iov_len;
                        displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)
                            global_iov_array[sorted[current_index]].iov_base;
                        /* realloc for the next blocklength/displacement, since
                           this entry has been fully consumed */
                        blocklen_per_process[n] = (int *) realloc
                            ((void *)blocklen_per_process[n],
                             (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *)realloc
                            ((void *)displs_per_process[n],
                             (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_sent += global_iov_array[sorted[current_index]].iov_len;
                    }
                    bytes_to_write_in_cycle -=
                        global_iov_array[sorted[current_index]].iov_len;
                    current_index ++;
                    continue;
                }
            }
        }

        /* Calculate the displacement on where to put the data and allocate
           the receive buffer (global_buf) */
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            entries_per_aggregator=0;
            for (i=0 ; i<fh->f_procs_per_group ; i++) {
                for (j=0 ; j<disp_index[i] ; j++) {
                    if (blocklen_per_process[i][j] > 0)
                        entries_per_aggregator++ ;
                }
            }
#if DEBUG_ON
            printf("%d: cycle: %d, bytes_sent: %d\n ",fh->f_rank,index,
                   bytes_sent);
            printf("%d : Entries per aggregator : %d\n",fh->f_rank,
                   entries_per_aggregator);
#endif
            if (entries_per_aggregator > 0) {
                file_offsets_for_agg = (local_io_array *)
                    malloc(entries_per_aggregator*sizeof(local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *)
                    malloc (entries_per_aggregator*sizeof(int));
                if (NULL == sorted_file_offsets) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                /* Moving file offsets to an IO array! */
                temp_index = 0;
                for (i=0 ; i<fh->f_procs_per_group ; i++) {
                    for (j=0 ; j<disp_index[i] ; j++) {
                        if (blocklen_per_process[i][j] > 0) {
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
#if DEBUG_ON
                            printf("************Cycle: %d, Aggregator: %d ***************\n",
                                   index+1,fh->f_rank);
                            printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                                   fh->f_procs_in_group[i],j,
                                   blocklen_per_process[i][j],j,
                                   displs_per_process[i][j],
                                   fh->f_rank);
#endif
                        }
                    }
                }
            }
            else {
                /* nothing to write in this cycle for this aggregator */
                continue;
            }

            /* Sort the displacements for each aggregator */
            local_heap_sort (file_offsets_for_agg,
                             entries_per_aggregator,
                             sorted_file_offsets);

            /* Create contiguous memory displacements based on blocklens on
               the same displs array and map it to this aggregator's actual
               file-displacements (this is in the io-array created above) */
            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));
            /* BUGFIX: this allocation was used unchecked in the original */
            if (NULL == memory_displacements) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++) {
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }

            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                /* BUGFIX: was "return OMPI_ERR_OUT_OF_RESOURCE", skipping
                   all cleanup at exit */
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            /* Now update the displacements array with memory offsets */
            global_count = 0;
            for (i=0 ; i<entries_per_aggregator ; i++) {
                temp_pindex =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_pindex] < disp_index[temp_pindex])
                    temp_disp_index[temp_pindex] += 1;
                else {
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_pindex, temp_disp_index[temp_pindex],
                           temp_pindex, disp_index[temp_pindex]);
                }
                global_count +=
                    file_offsets_for_agg[sorted_file_offsets[i]].length;
            }
            if (NULL != temp_disp_index) {
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if DEBUG_ON
            printf("************Cycle: %d, Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0 ; i<fh->f_procs_per_group ; i++) {
                for (j=0 ; j<disp_index[i] ; j++) {
                    if (blocklen_per_process[i][j] > 0) {
                        printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                               fh->f_procs_in_group[i],j,
                               blocklen_per_process[i][j],j,
                               displs_per_process[i][j],
                               fh->f_rank);
                    }
                }
            }
            printf("************Cycle: %d, Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0 ; i<entries_per_aggregator ; i++) {
                printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld\n",
                       file_offsets_for_agg[sorted_file_offsets[i]].process_id,
                       file_offsets_for_agg[sorted_file_offsets[i]].offset,
                       file_offsets_for_agg[sorted_file_offsets[i]].length,
                       memory_displacements[sorted_file_offsets[i]]);
            }
            printf("%d : global_count : %ld, bytes_sent : %d\n",
                   fh->f_rank,global_count, bytes_sent);
#endif

#if TIME_BREAKDOWN
            start_comm_time = MPI_Wtime();
#endif
            global_buf = (char *) malloc (global_count);
            if (NULL == global_buf) {
                opal_output(1, "OUT OF MEMORY");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            recv_req_count = 0;
            for (i=0 ; i<fh->f_procs_per_group ; i++) {
                ompi_datatype_create_hindexed(disp_index[i],
                                              blocklen_per_process[i],
                                              displs_per_process[i],
                                              MPI_BYTE,
                                              &recvtype[i]);
                ompi_datatype_commit(&recvtype[i]);
                opal_datatype_type_size(&recvtype[i]->super, &datatype_size);
                if (datatype_size) {
                    /* NOTE(review): realloc result is assigned straight back;
                       on failure the old block leaks and a NULL deref follows.
                       Left as-is to keep the change minimal — worth a
                       follow-up with a temporary pointer. */
                    recv_req = (MPI_Request *)realloc
                        ((void *)recv_req,
                         (recv_req_count + 1)*sizeof(MPI_Request));
                    ret = MCA_PML_CALL(irecv(global_buf,
                                             1,
                                             recvtype[i],
                                             fh->f_procs_in_group[i],
                                             123,
                                             fh->f_comm,
                                             &recv_req[recv_req_count]));
                    recv_req_count++;
                    if (OMPI_SUCCESS != ret) {
                        goto exit;
                    }
                }
            }
        }

        /* Prepare this process' contribution for the aggregator. */
        if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
            send_buf = &((char*)buf)[total_bytes_written];
        }
        else if (bytes_sent) {
            /* allocate a send buffer and copy the data that needs
               to be sent into it in case the data is non-contigous
               in memory */
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            send_buf = malloc (bytes_sent);
            if (NULL == send_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                /* BUGFIX: was "return OMPI_ERR_OUT_OF_RESOURCE" */
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            remaining = bytes_sent;
            while (remaining) {
                mem_address = (OPAL_PTRDIFF_TYPE)
                    (decoded_iov[iov_index].iov_base) + current_position;
                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *)mem_address,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *) mem_address,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }
        }
        total_bytes_written += bytes_sent;

        /* Gather the sendbuf from each process in appropriate locations
           at the aggregators */
        send_req = (MPI_Request *) malloc (sizeof(MPI_Request));
        if (NULL == send_req) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        if (bytes_sent) {
            ret = MCA_PML_CALL(isend(send_buf,
                                     bytes_sent,
                                     MPI_BYTE,
                                     fh->f_procs_in_group[fh->f_aggregator_index],
                                     123,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     fh->f_comm,
                                     send_req));
            if ( OMPI_SUCCESS != ret ) {
                goto exit;
            }
            ret = ompi_request_wait(send_req, MPI_STATUS_IGNORE);
            if (OMPI_SUCCESS != ret) {
                goto exit;
            }
        }

        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            ret = ompi_request_wait_all (recv_req_count,
                                         recv_req,
                                         MPI_STATUS_IGNORE);
            if (OMPI_SUCCESS != ret) {
                goto exit;
            }
        }

#if DEBUG_ON
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            printf("************Cycle: %d, Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0 ; i<global_count/4 ; i++)
                printf (" RECV %d \n",((int *)global_buf)[i]);
        }
#endif

        if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
            if (NULL != send_buf) {
                free (send_buf);
                send_buf = NULL;
            }
        }

#if TIME_BREAKDOWN
        end_comm_time = MPI_Wtime();
        comm_time += (end_comm_time - start_comm_time);
#endif
        /**********************************************************
         **************** DONE GATHERING OF DATA ******************
         *********************************************************/

        /**********************************************************
         ******* Create the io array, and pass it to fbtl *********
         *********************************************************/
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
#if TIME_BREAKDOWN
            start_write_time = MPI_Wtime();
#endif
            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                /* BUGFIX: was "return OMPI_ERR_OUT_OF_RESOURCE" */
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            fh->f_num_of_io_entries = 0;
            /* First entry for every aggregator */
            fh->f_io_array[fh->f_num_of_io_entries].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[fh->f_num_of_io_entries].length =
                file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;

            for (i=1 ; i<entries_per_aggregator ; i++) {
                /* If the entries are contiguous merge them,
                   else make a new entry */
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset) {
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else {
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }

#if DEBUG_ON
            printf("*************************** %d\n", fh->f_num_of_io_entries);
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
                       fh->f_io_array[i].memory_address,
                       (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }
#endif

            if (fh->f_num_of_io_entries) {
                if ( 0 > fh->f_fbtl->fbtl_pwritev (fh)) {
                    opal_output (1, "WRITE FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }
#if TIME_BREAKDOWN
            end_write_time = MPI_Wtime();
            write_time += end_write_time - start_write_time;
#endif
        }

        if (NULL != send_req) {
            free(send_req);
            send_req = NULL;
        }

        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            fh->f_num_of_io_entries = 0;
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            for (i=0 ; i<fh->f_procs_per_group ; i++)
                ompi_datatype_destroy(recvtype+i);
            if (NULL != recvtype) {
                free(recvtype);
                recvtype=NULL;
            }
            if (NULL != recv_req) {
                free(recv_req);
                recv_req = NULL;
            }
            if (NULL != global_buf) {
                free (global_buf);
                global_buf = NULL;
            }
        }
    }

#if TIME_BREAKDOWN
    end_exch = MPI_Wtime();
    exch_write += end_exch - start_exch;
    nentry.time[0] = write_time;
    nentry.time[1] = comm_time;
    nentry.time[2] = exch_write;
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = dynamic_num_io_procs;
    if (!ompi_io_ompio_full_print_queue(WRITE_PRINT_QUEUE)) {
        ompi_io_ompio_register_print_entry(WRITE_PRINT_QUEUE,
                                           nentry);
    }
#endif

exit :
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        if (NULL != fh->f_io_array) {
            free (fh->f_io_array);
            fh->f_io_array = NULL;
        }
        if (NULL != disp_index) {
            free(disp_index);
            disp_index = NULL;
        }
        if (NULL != recvtype) {
            free(recvtype);
            recvtype=NULL;
        }
        if (NULL != recv_req) {
            free(recv_req);
            recv_req = NULL;
        }
        if (NULL != global_buf) {
            free (global_buf);
            global_buf = NULL;
        }
        /* BUGFIX: per-cycle aggregator scratch arrays leaked on error
           paths and after the final cycle. */
        if (NULL != sorted_file_offsets) {
            free(sorted_file_offsets);
            sorted_file_offsets = NULL;
        }
        if (NULL != file_offsets_for_agg) {
            free(file_offsets_for_agg);
            file_offsets_for_agg = NULL;
        }
        if (NULL != memory_displacements) {
            free(memory_displacements);
            memory_displacements = NULL;
        }
        /* BUGFIX: the original dereferenced blocklen_per_process[l] /
           displs_per_process[l] even when the outer arrays were never
           allocated (NULL deref on early-error paths). */
        if (NULL != blocklen_per_process) {
            for (l=0 ; l<fh->f_procs_per_group ; l++) {
                if (NULL != blocklen_per_process[l]) {
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
            }
            free(blocklen_per_process);
            blocklen_per_process = NULL;
        }
        if (NULL != displs_per_process) {
            for (l=0 ; l<fh->f_procs_per_group ; l++) {
                if (NULL != displs_per_process[l]) {
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
            }
            free(displs_per_process);
            displs_per_process = NULL;
        }
    }
    if (NULL != sorted) {
        free (sorted);
        sorted = NULL;
    }
    if (NULL != global_iov_array) {
        free (global_iov_array);
        global_iov_array = NULL;
    }
    if (NULL != fview_count) {
        free (fview_count);
        fview_count = NULL;
    }
    if (NULL != decoded_iov) {
        free (decoded_iov);
        decoded_iov = NULL;
    }
    if (NULL != send_req) {
        free(send_req);
        send_req = NULL;
    }
    /* BUGFIX: these three leaked on error paths in the original. */
    if (NULL != total_bytes_per_process) {
        free (total_bytes_per_process);
        total_bytes_per_process = NULL;
    }
    if (NULL != local_iov_array) {
        free (local_iov_array);
        local_iov_array = NULL;
    }
    if (NULL != displs) {
        free (displs);
        displs = NULL;
    }

    /* BUGFIX: was "return OMPI_SUCCESS", which masked every error that
       reached exit via goto. */
    return ret;
}
/**
 * Ordered collective read on the shared file pointer (addproc component).
 *
 * All processes in sh->comm participate. Rank 0 gathers the byte count of
 * every rank, reserves one contiguous region of the shared file pointer for
 * the whole communicator, computes per-rank offsets via a prefix sum, and
 * scatters them back. Each rank then performs a collective read-at-all at
 * its own offset.
 *
 * @param fh        OMPIO file handle
 * @param buf       destination buffer
 * @param count     number of elements of 'datatype' to read
 * @param datatype  element datatype
 * @param status    MPI status (may be MPI_STATUS_IGNORE)
 * @return OMPI_SUCCESS or an OMPI error code.
 *
 * NOTE(review): the two collectives (gather, scatter) must be reached by
 * every rank in the same order — do not reorder or early-return between
 * them on a subset of ranks.
 */
int mca_sharedfp_addproc_read_ordered (mca_io_ompio_file_t *fh,
                                       void *buf,
                                       int count,
                                       struct ompi_datatype_t *datatype,
                                       ompi_status_public_t *status)
{
    int ret = OMPI_SUCCESS;
    OMPI_MPI_OFFSET_TYPE offset = 0, offsetReceived = 0;
    long sendBuff = 0;               /* this rank's byte count */
    long *buff=NULL;                 /* rank 0 only: gathered counts, then prefix sums */
    long offsetBuff, bytesRequested = 0;
    size_t numofBytes;
    int rank, size, i;
    struct mca_sharedfp_base_data_t *sh = NULL;
    mca_sharedfp_base_module_t * shared_fp_base_module = NULL;

    /* Lazily open the shared-file-pointer bookkeeping file on first use. */
    if(NULL == fh->f_sharedfp_data){
        if ( mca_sharedfp_addproc_verbose ) {
            printf("sharedfp_addproc_read_ordered: opening the shared file pointer file\n");
        }
        shared_fp_base_module = fh->f_sharedfp;
        ret = shared_fp_base_module->sharedfp_file_open(fh->f_comm,
                                                        fh->f_filename,
                                                        fh->f_amode,
                                                        fh->f_info,
                                                        fh);
        if ( OMPI_SUCCESS != ret ) {
            opal_output(1,"sharedfp_addproc_read_ordered - error opening the shared file pointer\n");
            return ret;
        }
    }

    /*Retrieve the new communicator*/
    sh = fh->f_sharedfp_data;

    /* Calculate the number of bytes to read*/
    opal_datatype_type_size ( &datatype->super, &numofBytes);
    sendBuff = count * numofBytes;

    /* Get the ranks in the communicator */
    rank = ompi_comm_rank ( sh->comm);
    size = ompi_comm_size ( sh->comm);

    if ( 0 == rank ) {
        buff = (long*)malloc(sizeof(long) * size);
        /* NOTE(review): rank 0 returns here while the other ranks proceed
           into the collective gather below — an OOM on rank 0 can leave the
           remaining ranks blocked. Needs a collective error propagation;
           left unchanged here. */
        if ( NULL == buff )
            return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Collect every rank's byte count at rank 0 (recv args ignored on non-roots). */
    ret = sh->comm->c_coll.coll_gather( &sendBuff, 1, OMPI_OFFSET_DATATYPE,
                                        buff, 1, OMPI_OFFSET_DATATYPE, 0,
                                        sh->comm,
                                        sh->comm->c_coll.coll_gather_module);
    if ( OMPI_SUCCESS != ret ) {
        goto exit;
    }

    /* All the counts are present now in buff on rank 0 (size entries). */
    if ( 0 == rank ) {
        for (i = 0; i < size ; i ++) {
            if ( mca_sharedfp_addproc_verbose ){
                printf("sharedfp_addproc_read_ordered: Buff is %ld\n",buff[i]);
            }
            bytesRequested += buff[i];
            if ( mca_sharedfp_addproc_verbose ){
                printf("sharedfp_addproc_read_ordered: Bytes requested are %ld\n",bytesRequested);
            }
        }

        /* Request the offset to read bytesRequested bytes
        ** only the root process needs to do the request,
        ** since the root process will then tell the other
        ** processes at what offset they should read their
        ** share of the data.
        */
        ret = mca_sharedfp_addproc_request_position(sh,bytesRequested,&offsetReceived);
        if( OMPI_SUCCESS != ret ){
            goto exit;
        }
        if ( mca_sharedfp_addproc_verbose ){
            printf("sharedfp_addproc_read_ordered: Offset received is %lld\n",offsetReceived);
        }

        /* In-place inclusive prefix sum: buff[i] becomes the END offset of
           rank i's region within [offsetReceived, offsetReceived+bytesRequested). */
        buff[0] += offsetReceived;
        for (i = 1 ; i < size; i++) {
            buff[i] += buff[i-1];
        }
    }

    /* Scatter the results to the other processes*/
    ret = sh->comm->c_coll.coll_scatter ( buff, 1, OMPI_OFFSET_DATATYPE,
                                          &offsetBuff, 1, OMPI_OFFSET_DATATYPE, 0,
                                          sh->comm,
                                          sh->comm->c_coll.coll_scatter_module );
    if ( OMPI_SUCCESS != ret ) {
        goto exit;
    }

    /* Each rank received its END offset; subtract its own count to get the
       START offset of its region. */
    offset = offsetBuff - sendBuff;
    if ( mca_sharedfp_addproc_verbose ){
        printf("sharedfp_addproc_read_ordered: Offset returned is %lld\n",offset);
    }

    /* read from the file */
    ret = ompio_io_ompio_file_read_at_all(sh->sharedfh,offset,buf,count,datatype,status);

exit:
    /* buff is non-NULL only on rank 0 */
    if ( NULL != buff ) {
        free ( buff );
    }

    return ret;
}
/**
 * Collective ("ordered") write using the individual shared file pointer
 * component.  Each process contributes @p count elements of @p datatype;
 * rank 0 computes per-process file offsets so that the data lands in the
 * file in rank order starting at the current global shared offset, then
 * the new global offset is broadcast and the write performed with
 * write_at_all to guarantee ordering.
 *
 * @param fh        ompio file handle; the shared file pointer is opened
 *                  lazily on first use
 * @param buf       data to write
 * @param count     number of datatype elements in buf
 * @param datatype  element datatype
 * @param status    status object passed through to the write call
 * @return OMPI_SUCCESS or an OMPI error code
 */
int mca_sharedfp_individual_write_ordered (mca_io_ompio_file_t *fh,
                                           void *buf,
                                           int count,
                                           struct ompi_datatype_t *datatype,
                                           ompi_status_public_t *status)
{
    int ret = OMPI_SUCCESS;
    int size = 0, rank = 0;
    int i = 0;
    size_t numofbytes = 0;
    size_t totalbytes = 0;
    OMPI_MPI_OFFSET_TYPE *offbuff = NULL;
    OMPI_MPI_OFFSET_TYPE global_offset = 0;
    OMPI_MPI_OFFSET_TYPE prev_offset = 0;
    OMPI_MPI_OFFSET_TYPE temp = 0, offset = 0;
    mca_sharedfp_individual_header_record *headnode = NULL;
    struct mca_sharedfp_base_data_t *sh = NULL;
    mca_sharedfp_base_module_t *shared_fp_base_module = NULL;

    if ( NULL == fh->f_sharedfp_data ) {
        /* Lazily open the shared file pointer on first use. */
        if ( mca_sharedfp_individual_verbose ) {
            opal_output(ompi_sharedfp_base_framework.framework_output,
                        "sharedfp_individual_write_ordered - opening the shared file pointer\n");
        }
        shared_fp_base_module = fh->f_sharedfp;
        ret = shared_fp_base_module->sharedfp_file_open(fh->f_comm,
                                                        fh->f_filename,
                                                        fh->f_amode,
                                                        fh->f_info,
                                                        fh);
        if ( OMPI_SUCCESS != ret ) {
            opal_output(0,"sharedfp_individual_write_ordered - error opening the shared file pointer\n");
            return ret;
        }
    }

    /* Retrieve the sharedfp data structures */
    sh = fh->f_sharedfp_data;
    rank = ompi_comm_rank ( sh->comm );
    size = ompi_comm_size ( sh->comm );

    /* Calculate the number of bytes of data that needs to be written */
    opal_datatype_type_size ( &datatype->super, &numofbytes);
    totalbytes = count * numofbytes;

    headnode = (mca_sharedfp_individual_header_record*)sh->selected_module_data;
    if ( NULL == headnode) {
        opal_output (0, "sharedfp_individual_write_ordered: headnode is NULL but file is open\n");
        return OMPI_ERROR;
    }

    /* Data from all the metadata is combined and written to the main file */
    ret = mca_sharedfp_individual_collaborate_data ( sh );
    if ( OMPI_SUCCESS != ret) {
        return ret;
    }

    if ( 0 == rank ) {
        offbuff = (OMPI_MPI_OFFSET_TYPE *)malloc ( sizeof(OMPI_MPI_OFFSET_TYPE) * size);
        if (NULL == offbuff ) {
            /* NOTE(review): returning here while the other ranks proceed to
             * the gather below risks a collective mismatch/hang; kept as-is
             * for interface compatibility -- confirm desired behavior. */
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }

    /* Collect the total bytes to be written from each process.
     * BUGFIX: the return code of this gather was previously ignored,
     * although every other collective in this function is checked. */
    ret = sh->comm->c_coll.coll_gather ( &totalbytes, 1, OMPI_OFFSET_DATATYPE,
                                         offbuff, 1, OMPI_OFFSET_DATATYPE, 0,
                                         sh->comm,
                                         sh->comm->c_coll.coll_gather_module );
    if ( OMPI_SUCCESS != ret ) {
        opal_output(0,"sharedfp_individual_write_ordered: Error in gathering data sizes \n");
        goto exit;
    }

    if ( 0 == rank ) {
        /* Exclusive prefix sum: offbuff[i] becomes the absolute offset at
         * which rank i starts writing (rank 0 starts at the current global
         * shared offset). */
        prev_offset = offbuff[0];
        offbuff[0] = sh->global_offset;
        for (i = 1; i < size; i++){
            temp = offbuff[i];
            offbuff[i] = offbuff[i - 1] + prev_offset;
            prev_offset = temp;
        }
        /* New end-of-data position once everybody has written.
         * BUGFIX: this assignment was previously wrapped in a loop over all
         * ranks, executing the identical statement 'size' times. */
        global_offset = offbuff[size - 1] + prev_offset;
    }

    /* Scatter the results to the other processes */
    ret = sh->comm->c_coll.coll_scatter ( offbuff, 1, OMPI_OFFSET_DATATYPE,
                                          &offset, 1, OMPI_OFFSET_DATATYPE, 0,
                                          sh->comm,
                                          sh->comm->c_coll.coll_scatter_module );
    if ( OMPI_SUCCESS != ret ) {
        opal_output(0,"sharedfp_individual_write_ordered: Error in scattering offsets \n");
        goto exit;
    }

    ret = sh->comm->c_coll.coll_bcast ( &global_offset, 1, OMPI_OFFSET_DATATYPE,
                                        0, sh->comm,
                                        sh->comm->c_coll.coll_bcast_module );
    if ( OMPI_SUCCESS != ret ) {
        opal_output(0,"sharedfp_individual_write_ordered: Error while bcasting global offset \n");
        goto exit;
    }

    sh->global_offset = global_offset;

    /* use file_write_at_all to ensure the order */
    ret = ompio_io_ompio_file_write_at_all(sh->sharedfh,offset, buf,count,datatype,status);
    if ( OMPI_SUCCESS != ret ) {
        opal_output(0,"sharedfp_individual_write_ordered: Error while writing the datafile \n");
    }

exit:
    if ( NULL != offbuff ) {
        free ( offbuff);
    }
    return ret;
}
static int two_phase_read_and_exch(mca_io_ompio_file_t *fh, void *buf, MPI_Datatype datatype, mca_io_ompio_access_array_t *others_req, struct iovec *offset_len, int contig_access_count, OMPI_MPI_OFFSET_TYPE min_st_offset, OMPI_MPI_OFFSET_TYPE fd_size, OMPI_MPI_OFFSET_TYPE *fd_start, OMPI_MPI_OFFSET_TYPE *fd_end, Flatlist_node *flat_buf, size_t *buf_idx, int striping_unit, int *aggregator_list){ int ret=OMPI_SUCCESS, i = 0, j = 0, ntimes = 0, max_ntimes = 0; int m = 0; int *curr_offlen_ptr=NULL, *count=NULL, *send_size=NULL, *recv_size=NULL; int *partial_send=NULL, *start_pos=NULL, req_len=0, flag=0; int *recd_from_proc=NULL; MPI_Aint buftype_extent=0; size_t byte_size = 0; OMPI_MPI_OFFSET_TYPE st_loc=-1, end_loc=-1, off=0, done=0, for_next_iter=0; OMPI_MPI_OFFSET_TYPE size=0, req_off=0, real_size=0, real_off=0, len=0; OMPI_MPI_OFFSET_TYPE for_curr_iter=0; char *read_buf=NULL, *tmp_buf=NULL; MPI_Datatype byte = MPI_BYTE; opal_datatype_type_size(&byte->super, &byte_size); for (i = 0; i < fh->f_size; i++){ if (others_req[i].count) { st_loc = others_req[i].offsets[0]; end_loc = others_req[i].offsets[0]; break; } } for (i=0;i<fh->f_size;i++){ for(j=0;j< others_req[i].count; j++){ st_loc = OMPIO_MIN(st_loc, others_req[i].offsets[j]); end_loc = OMPIO_MAX(end_loc, (others_req[i].offsets[j] + others_req[i].lens[j] - 1)); } } ntimes = (int)((end_loc - st_loc + mca_fcoll_two_phase_cycle_buffer_size)/ mca_fcoll_two_phase_cycle_buffer_size); if ((st_loc == -1) && (end_loc == -1)){ ntimes = 0; } fh->f_comm->c_coll.coll_allreduce (&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fh->f_comm, fh->f_comm->c_coll.coll_allreduce_module); if (ntimes){ read_buf = (char *) calloc (mca_fcoll_two_phase_cycle_buffer_size, sizeof(char)); if ( NULL == read_buf ){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } curr_offlen_ptr = (int *)calloc (fh->f_size, sizeof(int)); if (NULL == curr_offlen_ptr){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } count = (int *)calloc (fh->f_size, sizeof(int)); if (NULL 
== count){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } partial_send = (int *)calloc(fh->f_size, sizeof(int)); if ( NULL == partial_send ){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } send_size = (int *)malloc(fh->f_size * sizeof(int)); if (NULL == send_size){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } recv_size = (int *)malloc(fh->f_size * sizeof(int)); if (NULL == recv_size){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } recd_from_proc = (int *)calloc(fh->f_size,sizeof(int)); if (NULL == recd_from_proc){ ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } start_pos = (int *) calloc(fh->f_size, sizeof(int)); if ( NULL == start_pos ){ ret = OMPI_ERR_OUT_OF_RESOURCE; return ret; } done = 0; off = st_loc; for_curr_iter = for_next_iter = 0; ompi_datatype_type_extent(datatype, &buftype_extent); for (m=0; m<ntimes; m++) { size = OMPIO_MIN((unsigned)mca_fcoll_two_phase_cycle_buffer_size, end_loc-st_loc+1-done); real_off = off - for_curr_iter; real_size = size + for_curr_iter; for (i=0; i<fh->f_size; i++) count[i] = send_size[i] = 0; for_next_iter = 0; for (i=0; i<fh->f_size; i++) { if (others_req[i].count) { start_pos[i] = curr_offlen_ptr[i]; for (j=curr_offlen_ptr[i]; j<others_req[i].count; j++) { if (partial_send[i]) { /* this request may have been partially satisfied in the previous iteration. 
*/ req_off = others_req[i].offsets[j] + partial_send[i]; req_len = others_req[i].lens[j] - partial_send[i]; partial_send[i] = 0; /* modify the off-len pair to reflect this change */ others_req[i].offsets[j] = req_off; others_req[i].lens[j] = req_len; } else { req_off = others_req[i].offsets[j]; req_len = others_req[i].lens[j]; } if (req_off < real_off + real_size) { count[i]++; MPI_Address(read_buf+req_off-real_off, &(others_req[i].mem_ptrs[j])); send_size[i] += (int)(OMPIO_MIN(real_off + real_size - req_off, (OMPI_MPI_OFFSET_TYPE)req_len)); if (real_off+real_size-req_off < (OMPI_MPI_OFFSET_TYPE)req_len) { partial_send[i] = (int) (real_off + real_size - req_off); if ((j+1 < others_req[i].count) && (others_req[i].offsets[j+1] < real_off+real_size)) { /* this is the case illustrated in the figure above. */ for_next_iter = OMPIO_MAX(for_next_iter, real_off + real_size - others_req[i].offsets[j+1]); /* max because it must cover requests from different processes */ } break; } } else break; } curr_offlen_ptr[i] = j; } } flag = 0; for (i=0; i<fh->f_size; i++) if (count[i]) flag = 1; if (flag) { #if TIME_BREAKDOWN start_read_time = MPI_Wtime(); #endif len = size * byte_size; fh->f_io_array = (mca_io_ompio_io_array_t *)calloc (1,sizeof(mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } fh->f_io_array[0].offset = (IOVBASE_TYPE *)(intptr_t)off; fh->f_io_array[0].length = len; fh->f_io_array[0].memory_address = read_buf+for_curr_iter; fh->f_num_of_io_entries = 1; if (fh->f_num_of_io_entries){ if (OMPI_SUCCESS != fh->f_fbtl->fbtl_preadv (fh, NULL)) { opal_output(1, "READ FAILED\n"); return OMPI_ERROR; } } #if 0 int ii; printf("%d: len/4 : %lld\n", fh->f_rank, len/4); for (ii = 0; ii < len/4 ;ii++){ printf("%d: read_buf[%d]: %ld\n", fh->f_rank, ii, (int *)read_buf[ii]); } #endif fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } #if TIME_BREAKDOWN 
end_read_time = MPI_Wtime(); read_time += (end_read_time - start_read_time); #endif } for_curr_iter = for_next_iter; for (i=0; i< fh->f_size; i++){ recv_size[i] = 0; } two_phase_exchange_data(fh, buf, offset_len, send_size, start_pos, recv_size, count, partial_send, recd_from_proc, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, flat_buf, others_req, m, buf_idx, buftype_extent, striping_unit, aggregator_list); if (for_next_iter){ tmp_buf = (char *) calloc (for_next_iter, sizeof(char)); memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter); free(read_buf); read_buf = (char *)malloc(for_next_iter+mca_fcoll_two_phase_cycle_buffer_size); memcpy(read_buf, tmp_buf, for_next_iter); free(tmp_buf); } off += size; done += size; } for (i=0; i<fh->f_size; i++) count[i] = send_size[i] = 0; for (m=ntimes; m<max_ntimes; m++) two_phase_exchange_data(fh, buf, offset_len, send_size, start_pos, recv_size, count, partial_send, recd_from_proc, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, flat_buf, others_req, m, buf_idx, buftype_extent, striping_unit, aggregator_list); if (ntimes){ free(read_buf); read_buf = NULL; } if (NULL != curr_offlen_ptr){ free(curr_offlen_ptr); curr_offlen_ptr = NULL; } if (NULL != count){ free(count); count = NULL; } if (NULL != partial_send){ free(partial_send); partial_send = NULL; } if (NULL != send_size){ free(send_size); send_size = NULL; } if (NULL != recv_size){ free(recv_size); recv_size = NULL; } if (NULL != recd_from_proc){ free(recd_from_proc); recd_from_proc = NULL; } if (NULL != start_pos){ free(start_pos); start_pos = NULL; } exit: return ret; }
int mca_fcoll_dynamic_file_write_all (ompio_file_t *fh, const void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { MPI_Aint total_bytes_written = 0; /* total bytes that have been written*/ MPI_Aint total_bytes = 0; /* total bytes to be written */ MPI_Aint bytes_to_write_in_cycle = 0; /* left to be written in a cycle*/ MPI_Aint bytes_per_cycle = 0; /* total written in each cycle by each process*/ int index = 0; int cycles = 0; int i=0, j=0, l=0; int n=0; /* current position in total_bytes_per_process array */ MPI_Aint bytes_remaining = 0; /* how many bytes have been written from the current value from total_bytes_per_process */ int bytes_sent = 0, ret =0; int blocks=0, entries_per_aggregator=0; /* iovec structure and count of the buffer passed in */ uint32_t iov_count = 0; struct iovec *decoded_iov = NULL; int iov_index = 0; char *send_buf = NULL; size_t current_position = 0; struct iovec *local_iov_array=NULL, *global_iov_array=NULL; mca_io_ompio_local_io_array *file_offsets_for_agg=NULL; /* global iovec at the writers that contain the iovecs created from file_set_view */ uint32_t total_fview_count = 0; int local_count = 0, temp_pindex; int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL; int current_index = 0, temp_index=0; char *global_buf = NULL; MPI_Aint global_count = 0; /* array that contains the sorted indices of the global_iov */ int *sorted = NULL, *sorted_file_offsets=NULL; int *displs = NULL; int dynamic_num_io_procs; size_t max_data = 0, datatype_size = 0; int **blocklen_per_process=NULL; MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL; ompi_datatype_t **recvtype = NULL; MPI_Aint *total_bytes_per_process = NULL; MPI_Request send_req=NULL, *recv_req=NULL; int my_aggregator=-1; bool sendbuf_is_contiguous = false; size_t ftype_size; ptrdiff_t ftype_extent, lb; #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0; double comm_time = 0.0, 
start_comm_time = 0.0, end_comm_time = 0.0; double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0; mca_common_ompio_print_entry nentry; #endif opal_datatype_type_size ( &datatype->super, &ftype_size ); opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent ); /************************************************************************** ** 1. In case the data is not contigous in memory, decode it into an iovec **************************************************************************/ if ( ( ftype_extent == (ptrdiff_t) ftype_size) && opal_datatype_is_contiguous_memory_layout(&datatype->super,1) && 0 == lb ) { sendbuf_is_contiguous = true; } if (! sendbuf_is_contiguous ) { ret = mca_common_ompio_decode_datatype ((struct ompio_file_t *) fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); if (OMPI_SUCCESS != ret ){ goto exit; } } else { max_data = count * datatype->super.size; } if ( MPI_STATUS_IGNORE != status ) { status->_ucount = max_data; } dynamic_num_io_procs = fh->f_get_mca_parameter_value ( "num_aggregators", strlen ("num_aggregators")); if ( OMPI_ERR_MAX == dynamic_num_io_procs ) { ret = OMPI_ERROR; goto exit; } ret = mca_common_ompio_set_aggregator_props ((struct ompio_file_t *) fh, dynamic_num_io_procs, max_data); if (OMPI_SUCCESS != ret){ goto exit; } my_aggregator = fh->f_procs_in_group[0]; /************************************************************************** ** 2. 
Determine the total amount of data to be written **************************************************************************/ total_bytes_per_process = (MPI_Aint*)malloc (fh->f_procs_per_group*sizeof(MPI_Aint)); if (NULL == total_bytes_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif ret = ompi_fcoll_base_coll_allgather_array (&max_data, 1, MPI_LONG, total_bytes_per_process, 1, MPI_LONG, 0, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += (end_comm_time - start_comm_time); #endif for (i=0 ; i<fh->f_procs_per_group ; i++) { total_bytes += total_bytes_per_process[i]; } if (NULL != total_bytes_per_process) { free (total_bytes_per_process); total_bytes_per_process = NULL; } /********************************************************************* *** 3. Generate the local offsets/lengths array corresponding to *** this write operation ********************************************************************/ ret = fh->f_generate_current_file_view( (struct ompio_file_t *) fh, max_data, &local_iov_array, &local_count); if (ret != OMPI_SUCCESS){ goto exit; } #if DEBUG_ON for (i=0 ; i<local_count ; i++) { printf("%d: OFFSET: %d LENGTH: %ld\n", fh->f_rank, local_iov_array[i].iov_base, local_iov_array[i].iov_len); } #endif /************************************************************* *** 4. 
Allgather the offset/lengths array from all processes *************************************************************/ fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == fview_count) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif ret = ompi_fcoll_base_coll_allgather_array (&local_count, 1, MPI_INT, fview_count, 1, MPI_INT, 0, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += (end_comm_time - start_comm_time); #endif displs = (int*) malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == displs) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs[0] = 0; total_fview_count = fview_count[0]; for (i=1 ; i<fh->f_procs_per_group ; i++) { total_fview_count += fview_count[i]; displs[i] = displs[i-1] + fview_count[i-1]; } #if DEBUG_ON printf("total_fview_count : %d\n", total_fview_count); if (my_aggregator == fh->f_rank) { for (i=0 ; i<fh->f_procs_per_group ; i++) { printf ("%d: PROCESS: %d ELEMENTS: %d DISPLS: %d\n", fh->f_rank, i, fview_count[i], displs[i]); } } #endif /* allocate the global iovec */ if (0 != total_fview_count) { global_iov_array = (struct iovec*) malloc (total_fview_count * sizeof(struct iovec)); if (NULL == global_iov_array){ opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif ret = ompi_fcoll_base_coll_allgatherv_array (local_iov_array, local_count, fh->f_iov_type, global_iov_array, fview_count, displs, fh->f_iov_type, 0, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += (end_comm_time - start_comm_time); #endif 
/**************************************************************************************** *** 5. Sort the global offset/lengths list based on the offsets. *** The result of the sort operation is the 'sorted', an integer array, *** which contains the indexes of the global_iov_array based on the offset. *** For example, if global_iov_array[x].offset is followed by global_iov_array[y].offset *** in the file, and that one is followed by global_iov_array[z].offset, than *** sorted[0] = x, sorted[1]=y and sorted[2]=z; ******************************************************************************************/ if (0 != total_fview_count) { sorted = (int *)malloc (total_fview_count * sizeof(int)); if (NULL == sorted) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ompi_fcoll_base_sort_iovec (global_iov_array, total_fview_count, sorted); } if (NULL != local_iov_array){ free(local_iov_array); local_iov_array = NULL; } if (NULL != displs){ free(displs); displs=NULL; } #if DEBUG_ON if (my_aggregator == fh->f_rank) { uint32_t tv=0; for (tv=0 ; tv<total_fview_count ; tv++) { printf("%d: OFFSET: %lld LENGTH: %ld\n", fh->f_rank, global_iov_array[sorted[tv]].iov_base, global_iov_array[sorted[tv]].iov_len); } } #endif /************************************************************* *** 6. 
Determine the number of cycles required to execute this *** operation *************************************************************/ bytes_per_cycle = fh->f_bytes_per_agg; cycles = ceil((double)total_bytes/bytes_per_cycle); if (my_aggregator == fh->f_rank) { disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*)); if (NULL == blocklen_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*)); if (NULL == displs_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } recv_req = (MPI_Request *)malloc ((fh->f_procs_per_group)*sizeof(MPI_Request)); if ( NULL == recv_req ) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } global_buf = (char *) malloc (bytes_per_cycle); if (NULL == global_buf){ opal_output(1, "OUT OF MEMORY"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *)); if (NULL == recvtype) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for(l=0;l<fh->f_procs_per_group;l++){ recvtype[l] = MPI_DATATYPE_NULL; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_exch = MPI_Wtime(); #endif n = 0; bytes_remaining = 0; current_index = 0; for (index = 0; index < cycles; index++) { /********************************************************************** *** 7a. 
Getting ready for next cycle: initializing and freeing buffers **********************************************************************/ if (my_aggregator == fh->f_rank) { if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } fh->f_num_of_io_entries = 0; if (NULL != recvtype){ for (i =0; i< fh->f_procs_per_group; i++) { if ( MPI_DATATYPE_NULL != recvtype[i] ) { ompi_datatype_destroy(&recvtype[i]); recvtype[i] = MPI_DATATYPE_NULL; } } } for(l=0;l<fh->f_procs_per_group;l++){ disp_index[l] = 1; free(blocklen_per_process[l]); free(displs_per_process[l]); blocklen_per_process[l] = (int *) calloc (1, sizeof(int)); displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint)); if (NULL == displs_per_process[l] || NULL == blocklen_per_process[l]){ opal_output (1, "OUT OF MEMORY for displs\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if(NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements = NULL; } } /* (my_aggregator == fh->f_rank */ /************************************************************************** *** 7b. 
Determine the number of bytes to be actually written in this cycle **************************************************************************/ if (cycles-1 == index) { bytes_to_write_in_cycle = total_bytes - bytes_per_cycle*index; } else { bytes_to_write_in_cycle = bytes_per_cycle; } #if DEBUG_ON if (my_aggregator == fh->f_rank) { printf ("****%d: CYCLE %d Bytes %lld**********\n", fh->f_rank, index, bytes_to_write_in_cycle); } #endif /********************************************************** **Gather the Data from all the processes at the writers ** *********************************************************/ #if DEBUG_ON printf("bytes_to_write_in_cycle: %ld, cycle : %d\n", bytes_to_write_in_cycle, index); #endif /***************************************************************** *** 7c. Calculate how much data will be contributed in this cycle *** by each process *****************************************************************/ bytes_sent = 0; /* The blocklen and displs calculation only done at aggregators!*/ while (bytes_to_write_in_cycle) { /* This next block identifies which process is the holder ** of the sorted[current_index] element; */ blocks = fview_count[0]; for (j=0 ; j<fh->f_procs_per_group ; j++) { if (sorted[current_index] < blocks) { n = j; break; } else { blocks += fview_count[j+1]; } } if (bytes_remaining) { /* Finish up a partially used buffer from the previous cycle */ if (bytes_remaining <= bytes_to_write_in_cycle) { /* The data fits completely into the block */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining; displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); /* In this cases the length is consumed so allocating for next displacement and blocklength*/ blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); 
displs_per_process[n] = (MPI_Aint *) realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_remaining; } current_index ++; bytes_to_write_in_cycle -= bytes_remaining; bytes_remaining = 0; continue; } else { /* the remaining data from the previous cycle is larger than the bytes_to_write_in_cycle, so we have to segment again */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle; displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_to_write_in_cycle; } bytes_remaining -= bytes_to_write_in_cycle; bytes_to_write_in_cycle = 0; break; } } else { /* No partially used entry available, have to start a new one */ if (bytes_to_write_in_cycle < (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) { /* This entry has more data than we can sendin one cycle */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle; displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t)global_iov_array[sorted[current_index]].iov_base ; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_to_write_in_cycle; } bytes_remaining = global_iov_array[sorted[current_index]].iov_len - bytes_to_write_in_cycle; bytes_to_write_in_cycle = 0; break; } else { /* Next data entry is less than bytes_to_write_in_cycle */ if (my_aggregator == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = global_iov_array[sorted[current_index]].iov_len; displs_per_process[n][disp_index[n] - 1] = (ptrdiff_t) global_iov_array[sorted[current_index]].iov_base; /*realloc for next blocklength and assign this displacement and check 
for next displs as the total length of this entry has been consumed!*/ blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *)realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += global_iov_array[sorted[current_index]].iov_len; } bytes_to_write_in_cycle -= global_iov_array[sorted[current_index]].iov_len; current_index ++; continue; } } } /************************************************************************* *** 7d. Calculate the displacement on where to put the data and allocate *** the recieve buffer (global_buf) *************************************************************************/ if (my_aggregator == fh->f_rank) { entries_per_aggregator=0; for (i=0;i<fh->f_procs_per_group; i++){ for (j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0) entries_per_aggregator++ ; } } #if DEBUG_ON printf("%d: cycle: %d, bytes_sent: %d\n ",fh->f_rank,index, bytes_sent); printf("%d : Entries per aggregator : %d\n",fh->f_rank,entries_per_aggregator); #endif if (entries_per_aggregator > 0){ file_offsets_for_agg = (mca_io_ompio_local_io_array *) malloc(entries_per_aggregator*sizeof(mca_io_ompio_local_io_array)); if (NULL == file_offsets_for_agg) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); if (NULL == sorted_file_offsets){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /*Moving file offsets to an IO array!*/ temp_index = 0; for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ file_offsets_for_agg[temp_index].length = blocklen_per_process[i][j]; file_offsets_for_agg[temp_index].process_id = i; 
file_offsets_for_agg[temp_index].offset = displs_per_process[i][j]; temp_index++; #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, blocklen_per_process[i][j],j, displs_per_process[i][j], fh->f_rank); #endif } } } } else{ continue; } /* Sort the displacements for each aggregator*/ local_heap_sort (file_offsets_for_agg, entries_per_aggregator, sorted_file_offsets); /*create contiguous memory displacements based on blocklens on the same displs array and map it to this aggregator's actual file-displacements (this is in the io-array created above)*/ memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); memory_displacements[sorted_file_offsets[0]] = 0; for (i=1; i<entries_per_aggregator; i++){ memory_displacements[sorted_file_offsets[i]] = memory_displacements[sorted_file_offsets[i-1]] + file_offsets_for_agg[sorted_file_offsets[i-1]].length; } temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int)); if (NULL == temp_disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /*Now update the displacements array with memory offsets*/ global_count = 0; for (i=0;i<entries_per_aggregator;i++){ temp_pindex = file_offsets_for_agg[sorted_file_offsets[i]].process_id; displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] = memory_displacements[sorted_file_offsets[i]]; if (temp_disp_index[temp_pindex] < disp_index[temp_pindex]) temp_disp_index[temp_pindex] += 1; else{ printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", temp_pindex, temp_disp_index[temp_pindex], temp_pindex, disp_index[temp_pindex]); } global_count += file_offsets_for_agg[sorted_file_offsets[i]].length; } if (NULL != temp_disp_index){ free(temp_disp_index); temp_disp_index = NULL; } #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", 
index+1,fh->f_rank); for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, blocklen_per_process[i][j],j, displs_per_process[i][j], fh->f_rank); } } } printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0; i<entries_per_aggregator;i++){ printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld\n", file_offsets_for_agg[sorted_file_offsets[i]].process_id, file_offsets_for_agg[sorted_file_offsets[i]].offset, file_offsets_for_agg[sorted_file_offsets[i]].length, memory_displacements[sorted_file_offsets[i]]); } printf("%d : global_count : %ld, bytes_sent : %d\n", fh->f_rank,global_count, bytes_sent); #endif #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif /************************************************************************* *** 7e. Perform the actual communication *************************************************************************/ for (i=0;i<fh->f_procs_per_group; i++) { recv_req[i] = MPI_REQUEST_NULL; if ( 0 < disp_index[i] ) { ompi_datatype_create_hindexed(disp_index[i], blocklen_per_process[i], displs_per_process[i], MPI_BYTE, &recvtype[i]); ompi_datatype_commit(&recvtype[i]); opal_datatype_type_size(&recvtype[i]->super, &datatype_size); if (datatype_size){ ret = MCA_PML_CALL(irecv(global_buf, 1, recvtype[i], fh->f_procs_in_group[i], 123, fh->f_comm, &recv_req[i])); if (OMPI_SUCCESS != ret){ goto exit; } } } } } /* end if (my_aggregator == fh->f_rank ) */ if ( sendbuf_is_contiguous ) { send_buf = &((char*)buf)[total_bytes_written]; } else if (bytes_sent) { /* allocate a send buffer and copy the data that needs to be sent into it in case the data is non-contigous in memory */ ptrdiff_t mem_address; size_t remaining = 0; size_t temp_position = 0; send_buf = malloc (bytes_sent); if (NULL == send_buf) { opal_output (1, "OUT OF MEMORY\n"); ret = 
OMPI_ERR_OUT_OF_RESOURCE; goto exit; } remaining = bytes_sent; while (remaining) { mem_address = (ptrdiff_t) (decoded_iov[iov_index].iov_base) + current_position; if (remaining >= (decoded_iov[iov_index].iov_len - current_position)) { memcpy (send_buf+temp_position, (IOVBASE_TYPE *)mem_address, decoded_iov[iov_index].iov_len - current_position); remaining = remaining - (decoded_iov[iov_index].iov_len - current_position); temp_position = temp_position + (decoded_iov[iov_index].iov_len - current_position); iov_index = iov_index + 1; current_position = 0; } else { memcpy (send_buf+temp_position, (IOVBASE_TYPE *) mem_address, remaining); current_position = current_position + remaining; remaining = 0; } } } total_bytes_written += bytes_sent; /* Gather the sendbuf from each process in appropritate locations in aggregators*/ if (bytes_sent){ ret = MCA_PML_CALL(isend(send_buf, bytes_sent, MPI_BYTE, my_aggregator, 123, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, &send_req)); if ( OMPI_SUCCESS != ret ){ goto exit; } ret = ompi_request_wait(&send_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } if (my_aggregator == fh->f_rank) { ret = ompi_request_wait_all (fh->f_procs_per_group, recv_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } #if DEBUG_ON if (my_aggregator == fh->f_rank){ printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0 ; i<global_count/4 ; i++) printf (" RECV %d \n",((int *)global_buf)[i]); } #endif if (! sendbuf_is_contiguous) { if (NULL != send_buf) { free (send_buf); send_buf = NULL; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += (end_comm_time - start_comm_time); #endif /********************************************************** *** 7f. 
Create the io array, and pass it to fbtl *********************************************************/ if (my_aggregator == fh->f_rank) { #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_write_time = MPI_Wtime(); #endif fh->f_io_array = (mca_common_ompio_io_array_t *) malloc (entries_per_aggregator * sizeof (mca_common_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } fh->f_num_of_io_entries = 0; /*First entry for every aggregator*/ fh->f_io_array[0].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; fh->f_io_array[0].length = file_offsets_for_agg[sorted_file_offsets[0]].length; fh->f_io_array[0].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]]; fh->f_num_of_io_entries++; for (i=1;i<entries_per_aggregator;i++){ /* If the enrties are contiguous merge them, else make a new entry */ if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + file_offsets_for_agg[sorted_file_offsets[i-1]].length == file_offsets_for_agg[sorted_file_offsets[i]].offset){ fh->f_io_array[fh->f_num_of_io_entries - 1].length += file_offsets_for_agg[sorted_file_offsets[i]].length; } else { fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[i]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[i]]; fh->f_num_of_io_entries++; } } #if DEBUG_ON printf("*************************** %d\n", fh->f_num_of_io_entries); for (i=0 ; i<fh->f_num_of_io_entries ; i++) { printf(" ADDRESS: %p OFFSET: %ld LENGTH: %ld\n", fh->f_io_array[i].memory_address, (ptrdiff_t)fh->f_io_array[i].offset, fh->f_io_array[i].length); } #endif if (fh->f_num_of_io_entries) { if ( 0 > fh->f_fbtl->fbtl_pwritev (fh)) { opal_output (1, "WRITE FAILED\n"); ret = OMPI_ERROR; goto 
exit; } } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_write_time = MPI_Wtime(); write_time += end_write_time - start_write_time; #endif } /* end if (my_aggregator == fh->f_rank) */ } /* end for (index = 0; index < cycles; index++) */ #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN end_exch = MPI_Wtime(); exch_write += end_exch - start_exch; nentry.time[0] = write_time; nentry.time[1] = comm_time; nentry.time[2] = exch_write; if (my_aggregator == fh->f_rank) nentry.aggregator = 1; else nentry.aggregator = 0; nentry.nprocs_for_coll = dynamic_num_io_procs; if (!mca_common_ompio_full_print_queue(fh->f_coll_write_time)){ mca_common_ompio_register_print_entry(fh->f_coll_write_time, nentry); } #endif exit : if (my_aggregator == fh->f_rank) { if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if(NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements = NULL; } if (NULL != recvtype){ for (i =0; i< fh->f_procs_per_group; i++) { if ( MPI_DATATYPE_NULL != recvtype[i] ) { ompi_datatype_destroy(&recvtype[i]); } } free(recvtype); recvtype=NULL; } if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } if (NULL != disp_index){ free(disp_index); disp_index = NULL; } if (NULL != recvtype){ free(recvtype); recvtype=NULL; } if (NULL != recv_req){ free(recv_req); recv_req = NULL; } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } for(l=0;l<fh->f_procs_per_group;l++){ if (NULL != blocklen_per_process){ free(blocklen_per_process[l]); } if (NULL != displs_per_process){ free(displs_per_process[l]); } } free(blocklen_per_process); free(displs_per_process); } if (NULL != displs){ free(displs); displs=NULL; } if (! 
sendbuf_is_contiguous) { if (NULL != send_buf) { free (send_buf); send_buf = NULL; } } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } if (NULL != sorted) { free (sorted); sorted = NULL; } if (NULL != global_iov_array) { free (global_iov_array); global_iov_array = NULL; } if (NULL != fview_count) { free (fview_count); fview_count = NULL; } if (NULL != decoded_iov) { free (decoded_iov); decoded_iov = NULL; } return OMPI_SUCCESS; }
/*
 * Regression test for MPI Example 3.26 (struct datatype with explicit
 * MPI_LB / MPI_UB markers): builds the example struct type, a contiguous
 * type of two of them (twice, to exercise re-creation after release), and
 * an equivalent struct formulation, checking lb/ub/extent of each.
 *
 * Returns the number of failed checks (0 on success).
 *
 * Fixes vs. previous revision:
 *  - the type2 success message said "type1 correct";
 *  - the type3 mismatch report printed sz2 instead of the sz3 that was
 *    just queried from dt3.
 */
int mpich_typeub2( void )
{
    int blocklen[3], err = 0;
    size_t sz1, sz2, sz3;
    OPAL_PTRDIFF_TYPE disp[3], lb, ub, ex1, ex2, ex3;
    opal_datatype_t *types[3], *dt1, *dt2, *dt3;

    blocklen[0] = 1;
    blocklen[1] = 1;
    blocklen[2] = 1;
    disp[0] = -3;
    disp[1] = 0;
    disp[2] = 6;
    types[0] = (opal_datatype_t*)&opal_datatype_lb;
    types[1] = (opal_datatype_t*)&opal_datatype_int4;
    types[2] = (opal_datatype_t*)&opal_datatype_ub;

    opal_datatype_create_struct(3, blocklen, disp, types, &dt1);
    opal_datatype_commit( dt1 );

    opal_datatype_type_lb(dt1, &lb);
    opal_datatype_type_ub(dt1, &ub);
    opal_datatype_type_extent(dt1,&ex1);
    opal_datatype_type_size(dt1,&sz1);

    /* Values should be lb = -3, ub = 6 extent 9; size depends on implementation */
    if (lb != -3 || ub != 6 || ex1 != 9) {
        printf("Example 3.26 type1 lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex1, (int)sz1);
        err++;
    }
    else printf("Example 3.26 type1 correct\n" );

    opal_datatype_create_contiguous(2, dt1, &dt2);
    opal_datatype_type_lb(dt2, &lb);
    opal_datatype_type_ub(dt2, &ub);
    opal_datatype_type_extent(dt2,&ex2);
    opal_datatype_type_size(dt2,&sz2);

    /* Values should be lb = -3, ub = 15, extent = 18, size depends on implementation */
    if (lb != -3 || ub != 15 || ex2 != 18) {
        printf("Example 3.26 type2 lb %d ub %d extent %d size %d\n", (int)-3, (int)15, (int)18, 8);
        printf("Example 3.26 type2 lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex2, (int)sz2);
        err++;
    }
    else printf("Example 3.26 type2 correct\n" );  /* was wrongly labelled "type1" */
    OBJ_RELEASE( dt2 ); assert( dt2 == NULL );

    /* Re-create the same contiguous type after releasing it, to check that
     * dt1 survived the release of its derived type. */
    opal_datatype_create_contiguous(2,dt1,&dt2);
    opal_datatype_type_lb(dt2, &lb);
    opal_datatype_type_ub(dt2, &ub);
    opal_datatype_type_extent(dt2,&ex2);
    opal_datatype_type_size(dt2,&sz2);

    /* Values should be lb = -3, ub = 15, extent = 18, size depends on implementation */
    if (lb != -3 || ub != 15 || ex2 != 18) {
        printf("Example 3.26 type2 lb %d ub %d extent %d size %d\n", (int)-3, (int)15, (int)18, 8);
        printf("Example 3.26 type2 lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex2, (int)sz2);
        err++;
    }
    else printf( "Example 3.26 type2 correct\n" );

    types[0]=dt1;       types[1]=dt1;
    blocklen[0]=1;      blocklen[1]=1;
    disp[0]=0;          disp[1]=ex1;

    opal_datatype_create_struct(2, blocklen, disp, types, &dt3);
    opal_datatype_commit( dt3 );

    opal_datatype_type_lb(dt3, &lb);
    opal_datatype_type_ub(dt3, &ub);
    opal_datatype_type_extent(dt3,&ex3);
    opal_datatype_type_size(dt3,&sz3);

    /* Another way to express type2 */
    if (lb != -3 || ub != 15 || ex3 != 18) {
        printf("type3 lb %d ub %d extent %d size %d\n", (int)-3, (int)15, (int)18, 8);
        printf("type3 lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex3, (int)sz3);  /* was sz2 */
        err++;
    }
    else printf( "type3 correct\n" );

    OBJ_RELEASE( dt1 ); /*assert( dt1 == NULL );*/
    OBJ_RELEASE( dt2 ); /*assert( dt2 == NULL );*/
    OBJ_RELEASE( dt3 ); assert( dt3 == NULL );
    return err;
}
static int two_phase_exch_and_write(mca_io_ompio_file_t *fh, void *buf, MPI_Datatype datatype, mca_io_ompio_access_array_t *others_req, struct iovec *offset_len, int contig_access_count, OMPI_MPI_OFFSET_TYPE min_st_offset, OMPI_MPI_OFFSET_TYPE fd_size, OMPI_MPI_OFFSET_TYPE *fd_start, OMPI_MPI_OFFSET_TYPE *fd_end, Flatlist_node *flat_buf, size_t *buf_idx, int striping_unit, int *aggregator_list) { int i, j, ntimes, max_ntimes, m; int *curr_offlen_ptr=NULL, *count=NULL, *send_size=NULL, *recv_size=NULL; int *partial_recv=NULL, *start_pos=NULL, req_len, flag; int *sent_to_proc=NULL, ret = OMPI_SUCCESS; int *send_buf_idx=NULL, *curr_to_proc=NULL, *done_to_proc=NULL; OMPI_MPI_OFFSET_TYPE st_loc=-1, end_loc=-1, off, done; OMPI_MPI_OFFSET_TYPE size=0, req_off, len; MPI_Aint buftype_extent; int hole; size_t byte_size; MPI_Datatype byte = MPI_BYTE; #if DEBUG_ON int ii,jj; #endif char *write_buf=NULL; opal_datatype_type_size(&byte->super, &byte_size); for (i = 0; i < fh->f_size; i++){ if (others_req[i].count) { st_loc = others_req[i].offsets[0]; end_loc = others_req[i].offsets[0]; break; } } for (i=0;i<fh->f_size;i++){ for(j=0;j< others_req[i].count; j++){ st_loc = OMPIO_MIN(st_loc, others_req[i].offsets[j]); end_loc = OMPIO_MAX(end_loc, (others_req[i].offsets[j] + others_req[i].lens[j] - 1)); } } ntimes = (int) ((end_loc - st_loc + mca_fcoll_two_phase_cycle_buffer_size)/mca_fcoll_two_phase_cycle_buffer_size); if ((st_loc == -1) && (end_loc == -1)) { ntimes = 0; } fh->f_comm->c_coll.coll_allreduce (&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fh->f_comm, fh->f_comm->c_coll.coll_allreduce_module); if (ntimes){ write_buf = (char *) malloc (mca_fcoll_two_phase_cycle_buffer_size); if ( NULL == write_buf ){ return OMPI_ERR_OUT_OF_RESOURCE; } } curr_offlen_ptr = (int *) calloc(fh->f_size, sizeof(int)); if ( NULL == curr_offlen_ptr ){ return OMPI_ERR_OUT_OF_RESOURCE; } count = (int *) malloc(fh->f_size*sizeof(int)); if ( NULL == count ){ return OMPI_ERR_OUT_OF_RESOURCE; } 
partial_recv = (int *)calloc(fh->f_size, sizeof(int)); if ( NULL == partial_recv ){ return OMPI_ERR_OUT_OF_RESOURCE; } send_size = (int *) calloc(fh->f_size,sizeof(int)); if ( NULL == send_size ){ return OMPI_ERR_OUT_OF_RESOURCE; } recv_size = (int *) calloc(fh->f_size,sizeof(int)); if ( NULL == recv_size ){ return OMPI_ERR_OUT_OF_RESOURCE; } send_buf_idx = (int *) malloc(fh->f_size*sizeof(int)); if ( NULL == send_buf_idx ){ return OMPI_ERR_OUT_OF_RESOURCE; } sent_to_proc = (int *) calloc(fh->f_size, sizeof(int)); if ( NULL == sent_to_proc){ return OMPI_ERR_OUT_OF_RESOURCE; } curr_to_proc = (int *) malloc(fh->f_size*sizeof(int)); if ( NULL == curr_to_proc ){ return OMPI_ERR_OUT_OF_RESOURCE; } done_to_proc = (int *) malloc(fh->f_size*sizeof(int)); if ( NULL == done_to_proc ){ return OMPI_ERR_OUT_OF_RESOURCE; } start_pos = (int *) malloc(fh->f_size*sizeof(int)); if ( NULL == start_pos ){ return OMPI_ERR_OUT_OF_RESOURCE; } done = 0; off = st_loc; ompi_datatype_type_extent(datatype, &buftype_extent); for (m=0;m <ntimes; m++){ for (i=0; i< fh->f_size; i++) count[i] = recv_size[i] = 0; size = OMPIO_MIN((unsigned)mca_fcoll_two_phase_cycle_buffer_size, end_loc-st_loc+1-done); for (i=0;i<fh->f_size;i++){ if(others_req[i].count){ start_pos[i] = curr_offlen_ptr[i]; for (j=curr_offlen_ptr[i]; j<others_req[i].count; j++) { if (partial_recv[i]) { /* this request may have been partially satisfied in the previous iteration. 
*/ req_off = others_req[i].offsets[j] + partial_recv[i]; req_len = others_req[i].lens[j] - partial_recv[i]; partial_recv[i] = 0; /* modify the off-len pair to reflect this change */ others_req[i].offsets[j] = req_off; others_req[i].lens[j] = req_len; } else { req_off = others_req[i].offsets[j]; req_len = others_req[i].lens[j]; } if (req_off < off + size) { count[i]++; #if DEBUG_ON printf("%d: req_off : %lld, off : %lld, size : %lld, count[%d]: %d\n", fh->f_rank, req_off, off, size,i, count[i]); #endif MPI_Address(write_buf+req_off-off, &(others_req[i].mem_ptrs[j])); #if DEBUG_ON printf("%d : mem_ptrs : %ld\n", fh->f_rank, others_req[i].mem_ptrs[j]); #endif recv_size[i] += (int) (OMPIO_MIN(off + size - req_off, (unsigned)req_len)); if (off+size-req_off < (unsigned)req_len){ partial_recv[i] = (int)(off + size - req_off); break; } } else break; } curr_offlen_ptr[i] = j; } } ret = two_phase_exchage_data(fh, buf, write_buf, offset_len,send_size, start_pos,recv_size,off,size, count, partial_recv, sent_to_proc, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, flat_buf, others_req, send_buf_idx, curr_to_proc, done_to_proc, m, buf_idx, buftype_extent, striping_unit, aggregator_list, &hole); if ( OMPI_SUCCESS != ret ){ goto exit; } flag = 0; for (i=0; i<fh->f_size; i++) if (count[i]) flag = 1; if (flag){ #if TIME_BREAKDOWN start_write_time = MPI_Wtime(); #endif #if DEBUG_ON printf("rank : %d enters writing\n", fh->f_rank); printf("size : %ld, off : %ld\n",size, off); for (ii=0, jj=0;jj<size;jj+=4, ii++){ printf("%d : write_buf[%d]: %d\n", fh->f_rank, ii,((int *)write_buf[jj])); } #endif len = size * byte_size; fh->f_io_array = (mca_io_ompio_io_array_t *)malloc (sizeof(mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } fh->f_io_array[0].offset =(IOVBASE_TYPE *)(intptr_t) off; fh->f_io_array[0].length = len; fh->f_io_array[0].memory_address = write_buf; fh->f_num_of_io_entries = 1; #if 
DEBUG_ON for (i=0 ; i<fh->f_num_of_io_entries ; i++) { printf("%d: ADDRESS: %p OFFSET: %ld LENGTH: %d\n", fh->f_rank, fh->f_io_array[i].memory_address, fh->f_io_array[i].offset, fh->f_io_array[i].length); } #endif if (fh->f_num_of_io_entries){ if (OMPI_SUCCESS != fh->f_fbtl->fbtl_pwritev (fh, NULL)) { opal_output(1, "WRITE FAILED\n"); return OMPI_ERROR; } } #if TIME_BREAKDOWN end_write_time = MPI_Wtime(); write_time += (end_write_time - start_write_time); #endif } /***************** DONE WRITING *****************************************/ /****RESET **********************/ fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } off += size; done += size; } for (i=0; i<fh->f_size; i++) count[i] = recv_size[i] = 0; for (m=ntimes; m<max_ntimes; m++) { ret = two_phase_exchage_data(fh, buf, write_buf, offset_len,send_size, start_pos,recv_size,off,size, count, partial_recv, sent_to_proc, contig_access_count, min_st_offset, fd_size, fd_start, fd_end, flat_buf,others_req, send_buf_idx, curr_to_proc, done_to_proc, m, buf_idx, buftype_extent, striping_unit, aggregator_list, &hole); if ( OMPI_SUCCESS != ret ){ goto exit; } } exit: if (ntimes){ if ( NULL != write_buf ){ free(write_buf); } } if ( NULL != curr_offlen_ptr ){ free(curr_offlen_ptr); } if ( NULL != count ){ free(count); } if ( NULL != partial_recv ){ free(partial_recv); } if ( NULL != send_size ){ free(send_size); } if ( NULL != recv_size ){ free(recv_size); } if ( NULL != sent_to_proc ){ free(sent_to_proc); } if ( NULL != start_pos ){ free(start_pos); } if ( NULL != send_buf_idx ){ free(send_buf_idx); } if ( NULL != curr_to_proc ){ free(curr_to_proc); } if ( NULL != done_to_proc ){ free(done_to_proc); } return ret; }
/*
 * Verify lb/ub/extent handling of derived datatypes built from a struct
 * containing explicit MPI_LB / MPI_UB markers.  Covers hindexed, indexed,
 * hvector and vector constructors; struct and contiguous are exercised in
 * mpich_typeub2.  Returns the number of failed checks (0 on success).
 */
int mpich_typeub3( void )
{
    int err = 0;
    int block_counts[3] = { 1, 1, 1 };
    int int_displs[3];
    size_t sz;
    OPAL_PTRDIFF_TYPE byte_displs[3] = { -3, 0, 6 };
    OPAL_PTRDIFF_TYPE lb, ub, ex;
    opal_datatype_t *members[3];
    opal_datatype_t *base, *hidx_dt, *idx_dt, *hvec_dt, *vec_dt;

    /* Base datatype: { LB(-3), int4(0), UB(6) } */
    members[0] = (opal_datatype_t*)&opal_datatype_lb;
    members[1] = (opal_datatype_t*)&opal_datatype_int4;
    members[2] = (opal_datatype_t*)&opal_datatype_ub;
    opal_datatype_create_struct(3, block_counts, byte_displs, members, &base);
    opal_datatype_commit( base );
    /* base itself is identical to the type verified in mpich_typeub2 */

    /* hindexed: two copies of base at byte displacements -4 and 7 */
    block_counts[0] = 1;
    block_counts[1] = 1;
    byte_displs[0]  = -4;
    byte_displs[1]  = 7;
    int_displs[0]   = -4;
    int_displs[1]   = 7;
    opal_datatype_create_hindexed( 2, block_counts, byte_displs, base, &hidx_dt );
    opal_datatype_commit( hidx_dt );
    opal_datatype_type_lb( hidx_dt, &lb );
    opal_datatype_type_ub( hidx_dt, &ub );
    opal_datatype_type_extent( hidx_dt, &ex );
    opal_datatype_type_size( hidx_dt, &sz );
    if (lb != -7 || ub != 13 || ex != 20) {
        printf("hindexed lb %d ub %d extent %d size %d\n", (int)-7, (int)13, (int)20, (int)sz);
        printf("hindexed lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex, (int)sz);
        err++;
    } else {
        printf( "hindexed ok\n" );
    }

    /* indexed: same layout but displacements counted in base extents */
    opal_datatype_create_indexed( 2, block_counts, int_displs, base, &idx_dt );
    opal_datatype_commit( idx_dt );
    opal_datatype_type_lb( idx_dt, &lb );
    opal_datatype_type_ub( idx_dt, &ub );
    opal_datatype_type_extent( idx_dt, &ex );
    opal_datatype_type_size( idx_dt, &sz );
    if (lb != -39 || ub != 69 || ex != 108) {
        printf("indexed lb %d ub %d extent %d size %d\n", (int)-39, (int)69, (int)108, (int)sz);
        printf("indexed lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex, (int)sz);
        err++;
    } else {
        printf( "indexed ok\n" );
    }

    /* hvector: 2 blocks of 1 base, stride 14 bytes */
    opal_datatype_create_hvector( 2, 1, 14, base, &hvec_dt );
    opal_datatype_commit( hvec_dt );
    opal_datatype_type_lb( hvec_dt, &lb );
    opal_datatype_type_ub( hvec_dt, &ub );
    opal_datatype_type_extent( hvec_dt, &ex );
    opal_datatype_type_size( hvec_dt, &sz );
    if (lb != -3 || ub != 20 || ex != 23) {
        printf("hvector lb %d ub %d extent %d size %d\n", (int)-3, (int)20, (int)23, (int)sz);
        printf("hvector lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex, (int)sz);
        err++;
    } else {
        printf( "hvector ok\n" );
    }

    /* vector: same counts, stride 14 measured in base extents */
    opal_datatype_create_vector( 2, 1, 14, base, &vec_dt );
    opal_datatype_commit( vec_dt );
    opal_datatype_type_lb( vec_dt, &lb );
    opal_datatype_type_ub( vec_dt, &ub );
    opal_datatype_type_extent( vec_dt, &ex );
    opal_datatype_type_size( vec_dt, &sz );
    if (lb != -3 || ub != 132 || ex != 135) {
        printf("vector lb %d ub %d extent %d size %d\n", (int)-3, (int)132, (int)135, (int)sz);
        printf("vector lb %d ub %d extent %d size %d\n", (int)lb, (int)ub, (int)ex, (int)sz);
        err++;
    } else {
        printf( "vector ok\n" );
    }

    OBJ_RELEASE( base );    /*assert( base == NULL );*/
    OBJ_RELEASE( hidx_dt ); /*assert( hidx_dt == NULL );*/
    OBJ_RELEASE( idx_dt );  /*assert( idx_dt == NULL );*/
    OBJ_RELEASE( hvec_dt ); /*assert( hvec_dt == NULL );*/
    OBJ_RELEASE( vec_dt );  assert( vec_dt == NULL );
    return err;
}
int mca_io_ompio_set_view_internal(mca_io_ompio_file_t *fh, OMPI_MPI_OFFSET_TYPE disp, ompi_datatype_t *etype, ompi_datatype_t *filetype, char *datarep, ompi_info_t *info) { size_t max_data = 0; int i; int num_groups = 0; contg *contg_groups; size_t ftype_size; OPAL_PTRDIFF_TYPE ftype_extent, lb, ub; ompi_datatype_t *newfiletype; if ( NULL != fh->f_etype ) { ompi_datatype_destroy (&fh->f_etype); } if ( NULL != fh->f_filetype ) { ompi_datatype_destroy (&fh->f_filetype); } if ( NULL != fh->f_orig_filetype ) { ompi_datatype_destroy (&fh->f_orig_filetype); } if (NULL != fh->f_decoded_iov) { free (fh->f_decoded_iov); fh->f_decoded_iov = NULL; } if (NULL != fh->f_datarep) { free (fh->f_datarep); fh->f_datarep = NULL; } /* Reset the flags first */ fh->f_flags = 0; fh->f_flags |= OMPIO_FILE_VIEW_IS_SET; fh->f_datarep = strdup (datarep); ompi_datatype_duplicate (filetype, &fh->f_orig_filetype ); opal_datatype_get_extent(&filetype->super, &lb, &ftype_extent); opal_datatype_type_size (&filetype->super, &ftype_size); if ( etype == filetype && ompi_datatype_is_predefined (filetype ) && ftype_extent == (OPAL_PTRDIFF_TYPE)ftype_size ){ ompi_datatype_create_contiguous(MCA_IO_DEFAULT_FILE_VIEW_SIZE, &ompi_mpi_byte.dt, &newfiletype); ompi_datatype_commit (&newfiletype); } else { newfiletype = filetype; } fh->f_iov_count = 0; fh->f_disp = disp; fh->f_offset = disp; fh->f_total_bytes = 0; ompi_io_ompio_decode_datatype (fh, newfiletype, 1, NULL, &max_data, &fh->f_decoded_iov, &fh->f_iov_count); opal_datatype_get_extent(&newfiletype->super, &lb, &fh->f_view_extent); opal_datatype_type_ub (&newfiletype->super, &ub); opal_datatype_type_size (&etype->super, &fh->f_etype_size); opal_datatype_type_size (&newfiletype->super, &fh->f_view_size); ompi_datatype_duplicate (etype, &fh->f_etype); ompi_datatype_duplicate (newfiletype, &fh->f_filetype); fh->f_cc_size = get_contiguous_chunk_size (fh); if (opal_datatype_is_contiguous_memory_layout(&etype->super,1)) { if 
(opal_datatype_is_contiguous_memory_layout(&filetype->super,1) && fh->f_view_extent == (OPAL_PTRDIFF_TYPE)fh->f_view_size ) { fh->f_flags |= OMPIO_CONTIGUOUS_FVIEW; } } contg_groups = (contg*) calloc ( 1, fh->f_size * sizeof(contg)); if (NULL == contg_groups) { opal_output (1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } for( i = 0; i < fh->f_size; i++){ contg_groups[i].procs_in_contg_group = (int*)calloc (1,fh->f_size * sizeof(int)); if(NULL == contg_groups[i].procs_in_contg_group){ int j; opal_output (1, "OUT OF MEMORY\n"); for(j=0; j<i; j++) { free(contg_groups[j].procs_in_contg_group); } free(contg_groups); return OMPI_ERR_OUT_OF_RESOURCE; } } if( OMPI_SUCCESS != mca_io_ompio_fview_based_grouping(fh, &num_groups, contg_groups)){ opal_output(1, "mca_io_ompio_fview_based_grouping() failed\n"); free(contg_groups); return OMPI_ERROR; } if( !( (fh->f_comm->c_flags & OMPI_COMM_CART) && (num_groups == 1 || num_groups == fh->f_size)) ) { mca_io_ompio_finalize_initial_grouping(fh, num_groups, contg_groups); } for( i = 0; i < fh->f_size; i++){ free(contg_groups[i].procs_in_contg_group); } free(contg_groups); if ( etype == filetype && ompi_datatype_is_predefined (filetype ) && ftype_extent == (OPAL_PTRDIFF_TYPE)ftype_size ){ ompi_datatype_destroy ( &newfiletype ); } if (OMPI_SUCCESS != mca_fcoll_base_file_select (fh, NULL)) { opal_output(1, "mca_fcoll_base_file_select() failed\n"); return OMPI_ERROR; } return OMPI_SUCCESS; }