/*
 * Collective two-phase write.
 *
 * Phase 1: all processes exchange their file-access pattern (per-rank
 * offset/length lists) so a subset of ranks ("aggregators") learns which
 * file regions it is responsible for.  Phase 2 (two_phase_exch_and_write)
 * ships the user data to the aggregators, which then issue large
 * contiguous writes on behalf of the whole group.
 *
 * @param fh        ompio file handle (communicator, file view, hints)
 * @param buf       user buffer holding the data to be written
 * @param count     number of 'datatype' elements in buf
 * @param datatype  MPI datatype describing the memory layout of buf
 * @param status    status object to fill in (may be MPI_STATUS_IGNORE)
 *
 * @return OMPI_SUCCESS or an OMPI error code.  All temporary buffers are
 *         released on every exit path via the 'exit' label.
 */
int mca_fcoll_two_phase_file_write_all (mca_io_ompio_file_t *fh,
                                        void *buf,
                                        int count,
                                        struct ompi_datatype_t *datatype,
                                        ompi_status_public_t *status)
{
    int i, j, interleave_count = 0, striping_unit = 0;
    uint32_t iov_count = 0, ti;
    struct iovec *decoded_iov = NULL, *temp_iov = NULL;
    size_t max_data = 0, total_bytes = 0;
    long long_max_data = 0, long_total_bytes = 0;
    int domain_size = 0, *count_my_req_per_proc = NULL, count_my_req_procs;
    int count_other_req_procs, ret = OMPI_SUCCESS;
    size_t *buf_indices = NULL;
    int local_count = 0, local_size = 0, *aggregator_list = NULL;
    struct iovec *iov = NULL;
    OMPI_MPI_OFFSET_TYPE start_offset = 0, end_offset = 0, fd_size;
    OMPI_MPI_OFFSET_TYPE *start_offsets = NULL, *end_offsets = NULL;
    OMPI_MPI_OFFSET_TYPE *fd_start = NULL, *fd_end = NULL, min_st_offset;
    Flatlist_node *flat_buf = NULL;
    mca_io_ompio_access_array_t *my_req = NULL, *others_req = NULL;
    MPI_Aint send_buf_addr;
#if TIME_BREAKDOWN
    print_entry nentry;
#endif

    if (opal_datatype_is_contiguous_memory_layout (&datatype->super, 1)) {
        fh->f_flags = fh->f_flags | OMPIO_CONTIGUOUS_MEMORY;
    }

    if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
        /* Non-contiguous memory layout: decode the datatype into an iovec
         * list and rebase every entry relative to buf, so the offsets can
         * later be applied to the exchange buffers. */
        ret = ompi_io_ompio_decode_datatype (fh, datatype, count, buf,
                                             &max_data, &temp_iov, &iov_count);
        if (OMPI_SUCCESS != ret) {
            goto exit;
        }
        send_buf_addr = (OPAL_PTRDIFF_TYPE) buf;
        decoded_iov = (struct iovec *) malloc (iov_count * sizeof (struct iovec));
        if (NULL == decoded_iov) {
            /* BUG FIX: allocation result was previously used unchecked */
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        for (ti = 0; ti < iov_count; ti++) {
            decoded_iov[ti].iov_base =
                (IOVBASE_TYPE *) ((OPAL_PTRDIFF_TYPE) temp_iov[ti].iov_base -
                                  send_buf_addr);
            decoded_iov[ti].iov_len = temp_iov[ti].iov_len;
#if DEBUG_ON
            printf ("d_offset[%d]: %ld, d_len[%d]: %ld\n",
                    ti, (OPAL_PTRDIFF_TYPE) decoded_iov[ti].iov_base,
                    ti, decoded_iov[ti].iov_len);
#endif
        }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if (MPI_STATUS_IGNORE != status) {
        status->_ucount = max_data;
    }

    if (-1 == mca_fcoll_two_phase_num_io_procs) {
        ret = ompi_io_ompio_set_aggregator_props (fh,
                                                  mca_fcoll_two_phase_num_io_procs,
                                                  max_data);
        if (OMPI_SUCCESS != ret) {
            /* BUG FIX: was 'return ret', leaking decoded_iov/temp_iov */
            goto exit;
        }
        mca_fcoll_two_phase_num_io_procs =
            ceil ((float) fh->f_size / fh->f_procs_per_group);
    }
    if (mca_fcoll_two_phase_num_io_procs > fh->f_size) {
        mca_fcoll_two_phase_num_io_procs = fh->f_size;
    }

#if DEBUG_ON
    printf ("Number of aggregators : %ld\n", mca_fcoll_two_phase_num_io_procs);
#endif

    aggregator_list = (int *) malloc (mca_fcoll_two_phase_num_io_procs *
                                      sizeof (int));
    if (NULL == aggregator_list) {
        /* BUG FIX: was a bare 'return', leaking decoded_iov/temp_iov */
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    for (i = 0; i < mca_fcoll_two_phase_num_io_procs; i++) {
        aggregator_list[i] = i;
    }

    ret = ompi_io_ompio_generate_current_file_view (fh, max_data,
                                                    &iov, &local_count);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

    /* BUG FIX: the reduction previously used MPI_DOUBLE on size_t buffers,
     * which reinterprets integer bits as floating point.  Reduce through
     * long temporaries with MPI_LONG instead (as the read path does). */
    long_max_data = (long) max_data;
    ret = fh->f_comm->c_coll.coll_allreduce (&long_max_data,
                                             &long_total_bytes,
                                             1, MPI_LONG, MPI_SUM,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allreduce_module);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }
    total_bytes = (size_t) long_total_bytes;

    if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
        /* This datastructure translates between OMPIO->ROMIO; it is a little
         * hacky, but helps to re-use ROMIO's code for handling
         * non-contiguous file types. */
        flat_buf = (Flatlist_node *) malloc (sizeof (Flatlist_node));
        if (NULL == flat_buf) {
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        flat_buf->type = datatype;
        flat_buf->next = NULL;
        flat_buf->count = 0;
        flat_buf->indices = NULL;
        flat_buf->blocklens = NULL;

        /* Number of iovec entries describing ONE datatype element.
         * BUG FIX: guard the division against count == 0. */
        if (0 < count) {
            local_size = iov_count / count;
        }

        if (0 < local_size) {
            flat_buf->indices =
                (OMPI_MPI_OFFSET_TYPE *) malloc (local_size *
                                                 sizeof (OMPI_MPI_OFFSET_TYPE));
            if (NULL == flat_buf->indices) {
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            flat_buf->blocklens =
                (OMPI_MPI_OFFSET_TYPE *) malloc (local_size *
                                                 sizeof (OMPI_MPI_OFFSET_TYPE));
            if (NULL == flat_buf->blocklens) {
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }
        flat_buf->count = local_size;

        /* local_size <= iov_count, so a straight copy of the first
         * local_size entries is equivalent to the original guarded loop. */
        for (j = 0; j < local_size; j++) {
            flat_buf->indices[j] =
                (OMPI_MPI_OFFSET_TYPE)(intptr_t) decoded_iov[j].iov_base;
            flat_buf->blocklens[j] = decoded_iov[j].iov_len;
        }

#if DEBUG_ON
        printf ("flat_buf_count : %d\n", flat_buf->count);
        for (i = 0; i < flat_buf->count; i++) {
            printf ("%d: blocklen[%d] : %lld, indices[%d]: %lld \n",
                    fh->f_rank, i, flat_buf->blocklens[i], i,
                    flat_buf->indices[i]);
        }
#endif
    }

#if DEBUG_ON
    printf ("%d: fcoll:two_phase:write_all->total_bytes:%ld, local_count: %d\n",
            fh->f_rank, total_bytes, local_count);
    for (i = 0; i < local_count; i++) {
        printf ("%d: fcoll:two_phase:write_all:OFFSET:%ld,LENGTH:%ld\n",
                fh->f_rank, (size_t) iov[i].iov_base, (size_t) iov[i].iov_len);
    }
#endif

    /* BUG FIX: guard against an empty local file view before touching
     * iov[0] / iov[local_count-1]. */
    if (0 < local_count) {
        start_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) iov[0].iov_base;
        end_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) iov[local_count-1].iov_base +
            (OMPI_MPI_OFFSET_TYPE) iov[local_count-1].iov_len - 1;
    }
    else {
        start_offset = 0;
        end_offset = 0;
    }

#if DEBUG_ON
    printf ("%d: fcoll:two_phase:write_all:START OFFSET:%ld,END OFFSET:%ld\n",
            fh->f_rank, (size_t) start_offset, (size_t) end_offset);
#endif

    start_offsets = (OMPI_MPI_OFFSET_TYPE *)
        malloc (fh->f_size * sizeof (OMPI_MPI_OFFSET_TYPE));
    if (NULL == start_offsets) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    end_offsets = (OMPI_MPI_OFFSET_TYPE *)
        malloc (fh->f_size * sizeof (OMPI_MPI_OFFSET_TYPE));
    if (NULL == end_offsets) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    /* BUG FIX: the gathers previously declared MPI_LONG for
     * OMPI_MPI_OFFSET_TYPE buffers — a size mismatch wherever the offset
     * type is wider than long.  Use OMPI_OFFSET_DATATYPE, which is defined
     * to match OMPI_MPI_OFFSET_TYPE (as the read path does). */
    ret = fh->f_comm->c_coll.coll_allgather (&start_offset, 1,
                                             OMPI_OFFSET_DATATYPE,
                                             start_offsets, 1,
                                             OMPI_OFFSET_DATATYPE,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allgather_module);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }
    ret = fh->f_comm->c_coll.coll_allgather (&end_offset, 1,
                                             OMPI_OFFSET_DATATYPE,
                                             end_offsets, 1,
                                             OMPI_OFFSET_DATATYPE,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allgather_module);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

#if DEBUG_ON
    for (i = 0; i < fh->f_size; i++) {
        printf ("%d: fcoll:two_phase:write_all:start[%d]:%ld,end[%d]:%ld\n",
                fh->f_rank, i, (size_t) start_offsets[i], i,
                (size_t) end_offsets[i]);
    }
#endif

    /* Count ranks whose access range overlaps the previous rank's range;
     * a non-zero interleave_count indicates interleaved accesses. */
    for (i = 1; i < fh->f_size; i++) {
        if ((start_offsets[i] < end_offsets[i-1]) &&
            (start_offsets[i] <= end_offsets[i])) {
            interleave_count++;
        }
    }

#if DEBUG_ON
    printf ("%d: fcoll:two_phase:write_all:interleave_count:%d\n",
            fh->f_rank, interleave_count);
#endif

    /* Split the aggregate byte range into per-aggregator file domains. */
    ret = mca_fcoll_two_phase_domain_partition (fh, start_offsets, end_offsets,
                                                &min_st_offset, &fd_start,
                                                &fd_end, domain_size, &fd_size,
                                                striping_unit,
                                                mca_fcoll_two_phase_num_io_procs);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

#if DEBUG_ON
    for (i = 0; i < mca_fcoll_two_phase_num_io_procs; i++) {
        printf ("fd_start[%d] : %lld, fd_end[%d] : %lld, local_count: %d\n",
                i, fd_start[i], i, fd_end[i], local_count);
    }
#endif

    /* Work out which pieces of my data fall into which aggregator domain. */
    ret = mca_fcoll_two_phase_calc_my_requests (fh, iov, local_count,
                                                min_st_offset, fd_start,
                                                fd_end, fd_size,
                                                &count_my_req_procs,
                                                &count_my_req_per_proc,
                                                &my_req, &buf_indices,
                                                striping_unit,
                                                mca_fcoll_two_phase_num_io_procs,
                                                aggregator_list);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

    /* ... and, symmetrically, which other ranks will send data to me. */
    ret = mca_fcoll_two_phase_calc_others_requests (fh, count_my_req_procs,
                                                    count_my_req_per_proc,
                                                    my_req,
                                                    &count_other_req_procs,
                                                    &others_req);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

#if DEBUG_ON
    printf ("count_other_req_procs : %d\n", count_other_req_procs);
#endif

#if TIME_BREAKDOWN
    start_exch = MPI_Wtime ();
#endif

    /* Phase 2: exchange the data and perform the aggregated writes. */
    ret = two_phase_exch_and_write (fh, buf, datatype, others_req, iov,
                                    local_count, min_st_offset, fd_size,
                                    fd_start, fd_end, flat_buf, buf_indices,
                                    striping_unit, aggregator_list);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

#if TIME_BREAKDOWN
    end_exch = MPI_Wtime ();
    exch_write += (end_exch - start_exch);
    nentry.time[0] = write_time;
    nentry.time[1] = comm_time;
    nentry.time[2] = exch_write;
    if (is_aggregator (fh->f_rank, mca_fcoll_two_phase_num_io_procs,
                       aggregator_list)) {
        nentry.aggregator = 1;
    }
    else {
        nentry.aggregator = 0;
    }
    nentry.nprocs_for_coll = mca_fcoll_two_phase_num_io_procs;
    if (!ompi_io_ompio_full_print_queue (WRITE_PRINT_QUEUE)) {
        ompi_io_ompio_register_print_entry (WRITE_PRINT_QUEUE, nentry);
    }
#endif

exit:
    if (NULL != flat_buf) {
        if (NULL != flat_buf->blocklens) {
            free (flat_buf->blocklens);
        }
        if (NULL != flat_buf->indices) {
            free (flat_buf->indices);
        }
        free (flat_buf);
    }
    free (start_offsets);
    free (end_offsets);
    free (aggregator_list);
    /* BUG FIX: the following buffers were previously leaked on every path
     * (the read path already released them — mirror it). */
    free (decoded_iov);
    free (fd_start);
    free (fd_end);
    free (buf_indices);
    free (count_my_req_per_proc);
    free (my_req);
    free (others_req);

    return ret;
}
/*
 * Collective two-phase read.
 *
 * Phase 1 (two_phase_read_and_exch, invoked at the end): a subset of
 * ranks ("aggregators") performs large contiguous reads of their file
 * domains.  Phase 2: the aggregators redistribute the data to the ranks
 * that actually requested it.  This function computes the access pattern,
 * partitions the file into aggregator domains, and matches up requests
 * before handing off to the read/exchange engine.
 *
 * @param fh        ompio file handle (communicator, file view, hints)
 * @param buf       user buffer that receives the data
 * @param count     number of 'datatype' elements to read into buf
 * @param datatype  MPI datatype describing the memory layout of buf
 * @param status    status object to fill in (may be MPI_STATUS_IGNORE)
 *
 * @return OMPI_SUCCESS or an OMPI error code.  All temporary buffers are
 *         released on every exit path via the 'exit' label.
 */
int mca_fcoll_two_phase_file_read_all (mca_io_ompio_file_t *fh,
                                       void *buf,
                                       int count,
                                       struct ompi_datatype_t *datatype,
                                       ompi_status_public_t *status)
{
    int ret = OMPI_SUCCESS, i = 0, j = 0, interleave_count = 0, striping_unit = 0;
    MPI_Aint recv_buf_addr = 0;
    uint32_t iov_count = 0, ti = 0;
    struct iovec *decoded_iov = NULL, *temp_iov = NULL, *iov = NULL;
    size_t max_data = 0;
    long long_max_data = 0, long_total_bytes = 0;
    int domain_size = 0, *count_my_req_per_proc = NULL, count_my_req_procs = 0;
    int count_other_req_procs;
    size_t *buf_indices = NULL;
    int *aggregator_list = NULL, local_count = 0, local_size = 0;
    int two_phase_num_io_procs = 1;
    OMPI_MPI_OFFSET_TYPE start_offset = 0, end_offset = 0, fd_size = 0;
    OMPI_MPI_OFFSET_TYPE *start_offsets = NULL, *end_offsets = NULL;
    OMPI_MPI_OFFSET_TYPE *fd_start = NULL, *fd_end = NULL, min_st_offset = 0;
    Flatlist_node *flat_buf = NULL;
    mca_io_ompio_access_array_t *my_req = NULL, *others_req = NULL;
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    mca_common_ompio_print_entry nentry;
#endif

//    if (opal_datatype_is_predefined(&datatype->super)) {
//        fh->f_flags = fh->f_flags | OMPIO_CONTIGUOUS_MEMORY;
//    }

    if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
        /* Non-contiguous memory layout: decode the datatype into an iovec
         * list and rebase every entry relative to buf, so the offsets can
         * later be applied to the exchange buffers. */
        ret = fh->f_decode_datatype ((struct mca_io_ompio_file_t *) fh,
                                     datatype, count, buf, &max_data,
                                     &temp_iov, &iov_count);
        if (OMPI_SUCCESS != ret) {
            goto exit;
        }
        recv_buf_addr = (size_t)(buf);
        decoded_iov = (struct iovec *) calloc (iov_count, sizeof (struct iovec));
        if (NULL == decoded_iov) {
            /* BUG FIX: allocation result was previously used unchecked */
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        for (ti = 0; ti < iov_count; ti++) {
            decoded_iov[ti].iov_base =
                (IOVBASE_TYPE *)((OPAL_PTRDIFF_TYPE) temp_iov[ti].iov_base -
                                 recv_buf_addr);
            decoded_iov[ti].iov_len = temp_iov[ti].iov_len;
#if DEBUG
            printf ("d_offset[%d]: %ld, d_len[%d]: %ld\n",
                    ti, (OPAL_PTRDIFF_TYPE) decoded_iov[ti].iov_base,
                    ti, decoded_iov[ti].iov_len);
#endif
        }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if (MPI_STATUS_IGNORE != status) {
        status->_ucount = max_data;
    }

    fh->f_get_num_aggregators (&two_phase_num_io_procs);
    if (-1 == two_phase_num_io_procs) {
        /* No explicit hint: let the framework pick the aggregator count. */
        ret = fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh,
                                          two_phase_num_io_procs, max_data);
        if (OMPI_SUCCESS != ret) {
            goto exit;
        }
        two_phase_num_io_procs = fh->f_final_num_aggrs;
    }
    if (two_phase_num_io_procs > fh->f_size) {
        two_phase_num_io_procs = fh->f_size;
    }

    aggregator_list = (int *) calloc (two_phase_num_io_procs, sizeof (int));
    if (NULL == aggregator_list) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    /* Spread the aggregators evenly across the communicator. */
    for (i = 0; i < two_phase_num_io_procs; i++) {
        aggregator_list[i] = i * fh->f_size / two_phase_num_io_procs;
    }

    ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *) fh,
                                            max_data, &iov, &local_count);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

    /* Reduce through long temporaries: size_t has no portable MPI type. */
    long_max_data = (long) max_data;
    ret = fh->f_comm->c_coll.coll_allreduce (&long_max_data,
                                             &long_total_bytes,
                                             1, MPI_LONG, MPI_SUM,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allreduce_module);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

    if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
        /* This datastructure translates between OMPIO->ROMIO; it is a little
         * hacky, but helps to re-use ROMIO's code for handling
         * non-contiguous file types.  The flattened datatype for ompio is
         * in decoded_iov; it is translated into flat_buf here. */
        flat_buf = (Flatlist_node *) calloc (1, sizeof (Flatlist_node));
        if (NULL == flat_buf) {
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        flat_buf->type = datatype;
        flat_buf->next = NULL;
        flat_buf->count = 0;
        flat_buf->indices = NULL;
        flat_buf->blocklens = NULL;

        /* Number of iovec entries describing ONE datatype element. */
        if (0 < count) {
            local_size = OMPIO_MAX (1, iov_count / count);
        }
        else {
            local_size = 0;
        }

        if (0 < local_size) {
            flat_buf->indices =
                (OMPI_MPI_OFFSET_TYPE *) calloc (local_size,
                                                 sizeof (OMPI_MPI_OFFSET_TYPE));
            if (NULL == flat_buf->indices) {
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            flat_buf->blocklens =
                (OMPI_MPI_OFFSET_TYPE *) calloc (local_size,
                                                 sizeof (OMPI_MPI_OFFSET_TYPE));
            if (NULL == flat_buf->blocklens) {
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }
        flat_buf->count = local_size;

        for (j = 0; j < local_size; ++j) {
            flat_buf->indices[j] =
                (OMPI_MPI_OFFSET_TYPE)(intptr_t) decoded_iov[j].iov_base;
            flat_buf->blocklens[j] = decoded_iov[j].iov_len;
        }

#if DEBUG
        printf ("flat_buf count: %d\n", flat_buf->count);
        for (i = 0; i < flat_buf->count; i++) {
            printf ("%d: blocklen[%d] : %lld, indices[%d]: %lld\n",
                    fh->f_rank, i, flat_buf->blocklens[i], i,
                    flat_buf->indices[i]);
        }
#endif
    }

#if DEBUG
    printf ("%d: total_bytes:%ld, local_count: %d\n",
            fh->f_rank, long_total_bytes, local_count);
    for (i = 0; i < local_count; i++) {
        printf ("%d: fcoll:two_phase:read_all:OFFSET:%ld,LENGTH:%ld\n",
                fh->f_rank, (size_t) iov[i].iov_base, (size_t) iov[i].iov_len);
    }
#endif

    /* BUG FIX: iov[0] was previously dereferenced without the local_count
     * guard that was (inconsistently) applied only to end_offset. */
    if (0 < local_count) {
        start_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) iov[0].iov_base;
        end_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) iov[local_count-1].iov_base +
            (OMPI_MPI_OFFSET_TYPE)(intptr_t) iov[local_count-1].iov_len - 1;
    }
    else {
        start_offset = 0;
        end_offset = 0;
    }

#if DEBUG
    printf ("%d: START OFFSET:%ld, END OFFSET:%ld\n",
            fh->f_rank, (size_t) start_offset, (size_t) end_offset);
#endif

    start_offsets = (OMPI_MPI_OFFSET_TYPE *)
        calloc (fh->f_size, sizeof (OMPI_MPI_OFFSET_TYPE));
    if (NULL == start_offsets) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
    end_offsets = (OMPI_MPI_OFFSET_TYPE *)
        calloc (fh->f_size, sizeof (OMPI_MPI_OFFSET_TYPE));
    if (NULL == end_offsets) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    ret = fh->f_comm->c_coll.coll_allgather (&start_offset, 1,
                                             OMPI_OFFSET_DATATYPE,
                                             start_offsets, 1,
                                             OMPI_OFFSET_DATATYPE,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allgather_module);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }
    ret = fh->f_comm->c_coll.coll_allgather (&end_offset, 1,
                                             OMPI_OFFSET_DATATYPE,
                                             end_offsets, 1,
                                             OMPI_OFFSET_DATATYPE,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allgather_module);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

#if DEBUG
    for (i = 0; i < fh->f_size; i++) {
        printf ("%d: start[%d]:%ld,end[%d]:%ld\n",
                fh->f_rank, i, (size_t) start_offsets[i], i,
                (size_t) end_offsets[i]);
    }
#endif

    /* Count ranks whose access range overlaps the previous rank's range;
     * a non-zero interleave_count indicates interleaved accesses. */
    for (i = 1; i < fh->f_size; i++) {
        if ((start_offsets[i] < end_offsets[i-1]) &&
            (start_offsets[i] <= end_offsets[i])) {
            interleave_count++;
        }
    }

#if DEBUG
    printf ("%d: interleave_count:%d\n", fh->f_rank, interleave_count);
#endif

    /* Split the aggregate byte range into per-aggregator file domains. */
    ret = mca_fcoll_two_phase_domain_partition (fh, start_offsets, end_offsets,
                                                &min_st_offset, &fd_start,
                                                &fd_end, domain_size, &fd_size,
                                                striping_unit,
                                                two_phase_num_io_procs);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

#if DEBUG
    for (i = 0; i < two_phase_num_io_procs; i++) {
        printf ("fd_start[%d] : %lld, fd_end[%d] : %lld, local_count: %d\n",
                i, fd_start[i], i, fd_end[i], local_count);
    }
#endif

    /* Work out which pieces of my request fall into which aggregator
     * domain ... */
    ret = mca_fcoll_two_phase_calc_my_requests (fh, iov, local_count,
                                                min_st_offset, fd_start,
                                                fd_end, fd_size,
                                                &count_my_req_procs,
                                                &count_my_req_per_proc,
                                                &my_req, &buf_indices,
                                                striping_unit,
                                                two_phase_num_io_procs,
                                                aggregator_list);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

    /* ... and, symmetrically, which other ranks request data from my
     * domain. */
    ret = mca_fcoll_two_phase_calc_others_requests (fh, count_my_req_procs,
                                                    count_my_req_per_proc,
                                                    my_req,
                                                    &count_other_req_procs,
                                                    &others_req);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

#if DEBUG
    printf ("%d count_other_req_procs : %d\n",
            fh->f_rank, count_other_req_procs);
#endif

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime ();
#endif

    /* Aggregated read followed by redistribution to the requesters. */
    ret = two_phase_read_and_exch (fh, buf, datatype, others_req, iov,
                                   local_count, min_st_offset, fd_size,
                                   fd_start, fd_end, flat_buf, buf_indices,
                                   striping_unit, two_phase_num_io_procs,
                                   aggregator_list);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rexch = MPI_Wtime ();
    read_exch += (end_rexch - start_rexch);
    nentry.time[0] = read_time;
    nentry.time[1] = rcomm_time;
    nentry.time[2] = read_exch;
    if (isread_aggregator (fh->f_rank, two_phase_num_io_procs,
                           aggregator_list)) {
        nentry.aggregator = 1;
    }
    else {
        nentry.aggregator = 0;
    }
    nentry.nprocs_for_coll = two_phase_num_io_procs;
    if (!mca_common_ompio_full_print_queue (fh->f_coll_read_time)) {
        mca_common_ompio_register_print_entry (fh->f_coll_read_time, nentry);
    }
#endif

exit:
    if (NULL != flat_buf) {
        if (NULL != flat_buf->blocklens) {
            free (flat_buf->blocklens);
        }
        if (NULL != flat_buf->indices) {
            free (flat_buf->indices);
        }
        free (flat_buf);
    }
    free (start_offsets);
    free (end_offsets);
    free (aggregator_list);
    free (fd_start);
    free (decoded_iov);
    free (buf_indices);
    free (count_my_req_per_proc);
    free (my_req);
    free (others_req);
    free (fd_end);

    return ret;
}