/*
 * Adapter between the ompi_file_t level and the OMPIO level.
 *
 * Looks up the OMPIO file handle embedded in fp->f_io_selected_data and
 * forwards the request to ompi_io_ompio_generate_current_file_view().
 *
 * fp         file whose selected io data holds the OMPIO handle
 * max_data   passed through unchanged to the OMPIO routine
 * f_iov      passed through unchanged (output of the OMPIO routine)
 * iov_count  passed through unchanged (output of the OMPIO routine)
 *
 * Returns OMPI_SUCCESS, or the error code reported by the OMPIO routine
 * (a diagnostic line is printed to stdout in that case).
 */
int mca_io_ompio_generate_current_file_view (ompi_file_t *fp,
                                             size_t max_data,
                                             struct iovec **f_iov,
                                             int *iov_count)
{
    mca_io_ompio_data_t *io_data = (mca_io_ompio_data_t *) fp->f_io_selected_data;
    mca_io_ompio_file_t *ompio_handle = &io_data->ompio_fh;
    int rc;

    rc = ompi_io_ompio_generate_current_file_view (ompio_handle,
                                                   max_data,
                                                   f_iov,
                                                   iov_count);
    if (OMPI_SUCCESS == rc) {
        return OMPI_SUCCESS;
    }

    printf("Error in ompi_io_generate_current_file_view\n");
    return rc;
}
int mca_fcoll_dynamic_file_write_all (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { MPI_Aint total_bytes_written = 0; /* total bytes that have been written*/ MPI_Aint total_bytes = 0; /* total bytes to be written */ MPI_Aint bytes_to_write_in_cycle = 0; /* left to be written in a cycle*/ MPI_Aint bytes_per_cycle = 0; /* total written in each cycle by each process*/ int index = 0; int cycles = 0; int i=0, j=0, l=0; int n=0; /* current position in total_bytes_per_process array */ MPI_Aint bytes_remaining = 0; /* how many bytes have been written from the current value from total_bytes_per_process */ int bytes_sent = 0, ret =0; int blocks=0, entries_per_aggregator=0; /* iovec structure and count of the buffer passed in */ uint32_t iov_count = 0; struct iovec *decoded_iov = NULL; int iov_index = 0; char *send_buf = NULL; size_t current_position = 0; struct iovec *local_iov_array=NULL, *global_iov_array=NULL; local_io_array *file_offsets_for_agg=NULL; /* global iovec at the writers that contain the iovecs created from file_set_view */ uint32_t total_fview_count = 0; int local_count = 0, temp_pindex; int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL; int current_index = 0, temp_index=0; char *global_buf = NULL; MPI_Aint global_count = 0; /* array that contains the sorted indices of the global_iov */ int *sorted = NULL, *sorted_file_offsets=NULL; int *displs = NULL; size_t max_data = 0; int **blocklen_per_process=NULL; MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL; ompi_datatype_t **recvtype = NULL; MPI_Aint *total_bytes_per_process = NULL; MPI_Request *send_req=NULL, *recv_req=NULL; int datatype_size, recv_req_count=0; #if TIME_BREAKDOWN double start_time=0.0, end_time=0.0, start_time2=0.0, end_time2=0.0; double total=0.0 , total_io=0.0, max_io=0.0; /* max_pp=0.0;*/ double start_ptime=0.0, end_ptime=0.0, tpw=0.0; /* max_tpw=0.0;*/ double start_cio_array=0.0, 
end_cio_array=0.0, tcio_array=0.0;/* max_cio=0.0;*/ double start_sr=0.0, end_sr=0.0, tsr=0.0;/* max_sr=0.0;*/ double comm_time = 0.0, max_comm_time=0.0; double write_time = 0.0, max_write_time=0.0; #endif #if TIME_BREAKDOWN start_time = MPI_Wtime(); #endif if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) { fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY; } /************************************************************************** ** In case the data is not contigous in memory, decode it into an iovec ** **************************************************************************/ if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) { ret = ompi_io_ompio_decode_datatype (fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); if (OMPI_SUCCESS != ret ){ goto exit; } } else { max_data = count * datatype->super.size; } if ( MPI_STATUS_IGNORE != status ) { status->_ucount = max_data; } if (! (fh->f_flags & OMPIO_AGGREGATOR_IS_SET)) { ret = ompi_io_ompio_set_aggregator_props (fh, mca_fcoll_dynamic_num_io_procs, max_data); if (OMPI_SUCCESS != ret){ goto exit; } } if (-1 == mca_fcoll_dynamic_num_io_procs) { mca_fcoll_dynamic_num_io_procs = 1; } #if TIME_BREAKDOWN if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { start_time = MPI_Wtime(); } #endif total_bytes_per_process = (MPI_Aint*)malloc (fh->f_procs_per_group*sizeof(MPI_Aint)); if (NULL == total_bytes_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = ompi_io_ompio_allgather_array (&max_data, 1, MPI_LONG, total_bytes_per_process, 1, MPI_LONG, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret){ goto exit; } for (i=0 ; i<fh->f_procs_per_group ; i++) { total_bytes += total_bytes_per_process[i]; } if (NULL != total_bytes_per_process) { free (total_bytes_per_process); total_bytes_per_process = NULL; } /********************************************************************* *** Generate the File 
offsets/lengths corresponding to this write *** ********************************************************************/ ret = ompi_io_ompio_generate_current_file_view(fh, max_data, &local_iov_array, &local_count); if (ret != OMPI_SUCCESS){ goto exit; } #if DEBUG_ON for (i=0 ; i<local_count ; i++) { printf("Local offset-length pair for rank:%d \n", fh->f_rank); printf("%d: OFFSET: %p LENGTH: %lld\n", fh->f_rank, iov[i].iov_base, iov[i].iov_len); } #endif /************************************************************* *** ALLGather the File View information at all processes *** *************************************************************/ fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == fview_count) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = ompi_io_ompio_allgather_array (&local_count, 1, MPI_INT, fview_count, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret){ goto exit; } displs = (int*) malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == displs) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs[0] = 0; total_fview_count = fview_count[0]; for (i=1 ; i<fh->f_procs_per_group ; i++) { total_fview_count += fview_count[i]; displs[i] = displs[i-1] + fview_count[i-1]; } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { for (i=0 ; i<fh->f_procs_per_group ; i++) { printf ("%d: PROCESS: %d ELEMENTS: %d DISPLS: %d\n", fh->f_rank, i, fview_count[i], displs[i]); } } #endif /* allocate the global iovec */ if (0 != total_fview_count) { global_iov_array = (struct iovec*) malloc (total_fview_count * sizeof(struct iovec)); if (NULL == global_iov_array){ opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } if (fh->f_flags & OMPIO_UNIFORM_FVIEW) { ret = ompi_io_ompio_allgather_array (local_iov_array, local_count, fh->f_iov_type, 
global_iov_array, local_count, fh->f_iov_type, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } } else { ret = ompi_io_ompio_allgatherv_array (local_iov_array, local_count, fh->f_iov_type, global_iov_array, fview_count, displs, fh->f_iov_type, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret){ goto exit; } } /* sort it */ if (0 != total_fview_count) { sorted = (int *)malloc (total_fview_count * sizeof(int)); if (NULL == sorted) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ompi_io_ompio_sort_iovec (global_iov_array, total_fview_count, sorted); } if (NULL != local_iov_array){ free(local_iov_array); local_iov_array = NULL; } if (NULL != displs){ free(displs); displs=NULL; } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { uint32_t tv=0; for (tv=0 ; tv<total_fview_count ; tv++) { printf("%d: OFFSET: %lld LENGTH: %ld\n", fh->f_rank, global_iov_array[sorted[tv]].offset, global_iov_array[sorted[tv]].length); } } #endif if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int)); if (NULL == disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*)); if (NULL == blocklen_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*)); if (NULL == displs_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for(i=0;i<fh->f_procs_per_group;i++){ blocklen_per_process[i] = NULL; displs_per_process[i] = NULL; } } bytes_per_cycle = mca_fcoll_dynamic_cycle_buffer_size; cycles = ceil((double)total_bytes/bytes_per_cycle); n = 0; bytes_remaining = 0; 
current_index = 0; #if TIME_BREAKDOWN end_time = MPI_Wtime(); total = end_time-start_time; start_time2 = MPI_Wtime(); #endif for (index = 0; index < cycles; index++) { /* Getting ready for next cycle Initializing and freeing buffers*/ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if (NULL == recvtype){ recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *)); if (NULL == recvtype) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } for(l=0;l<fh->f_procs_per_group;l++){ disp_index[l] = 1; if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } blocklen_per_process[l] = (int *) calloc (1, sizeof(int)); if (NULL == blocklen_per_process[l]) { opal_output (1, "OUT OF MEMORY for blocklen\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint)); if (NULL == displs_per_process[l]){ opal_output (1, "OUT OF MEMORY for displs\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } if (NULL != sorted_file_offsets){ free(sorted_file_offsets); sorted_file_offsets = NULL; } if(NULL != file_offsets_for_agg){ free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements){ free(memory_displacements); memory_displacements = NULL; } } if (cycles-1 == index) { bytes_to_write_in_cycle = total_bytes - bytes_per_cycle*index; } else { bytes_to_write_in_cycle = bytes_per_cycle; } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { printf ("****%d: CYCLE %d Bytes %lld**********\n", fh->f_rank, index, bytes_to_write_in_cycle); } #endif /********************************************************** **Gather the Data from all the processes at the writers ** *********************************************************/ /* Calculate how much data will be contributed 
in this cycle by each process*/ bytes_sent = 0; #if DEBUG_ON printf("bytes_to_write_in_cycle: %ld, cycle : %d\n", bytes_to_write_in_cycle, index); #endif /* The blocklen and displs calculation only done at aggregators!*/ #if TIME_BREAKDOWN start_cio_array = MPI_Wtime(); #endif while (bytes_to_write_in_cycle) { blocks = fview_count[0]; for (j=0 ; j<fh->f_procs_per_group ; j++) { if (sorted[current_index] < blocks) { n = j; break; } else { blocks += fview_count[j+1]; } } if (bytes_remaining) { if (bytes_remaining <= bytes_to_write_in_cycle) { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_remaining; } current_index ++; bytes_to_write_in_cycle -= bytes_remaining; bytes_remaining = 0; if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { /* In this cases the length is consumed so allocating for next displacement and blocklength*/ blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *) realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; } continue; } else { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base + (global_iov_array[sorted[current_index]].iov_len - bytes_remaining); } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_to_write_in_cycle; } bytes_remaining -= bytes_to_write_in_cycle; 
bytes_to_write_in_cycle = 0; break; } } else { if (bytes_to_write_in_cycle < (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = bytes_to_write_in_cycle; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ; } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += bytes_to_write_in_cycle; } bytes_remaining = global_iov_array[sorted[current_index]].iov_len - bytes_to_write_in_cycle; bytes_to_write_in_cycle = 0; break; } else { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { blocklen_per_process[n][disp_index[n] - 1] = global_iov_array[sorted[current_index]].iov_len; displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE) global_iov_array[sorted[current_index]].iov_base; blocklen_per_process[n] = (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int)); displs_per_process[n] = (MPI_Aint *)realloc ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint)); blocklen_per_process[n][disp_index[n]] = 0; displs_per_process[n][disp_index[n]] = 0; disp_index[n] += 1; /*realloc for next blocklength and assign this displacement and check for next displs as the total length of this entry has been consumed!*/ } if (fh->f_procs_in_group[n] == fh->f_rank) { bytes_sent += global_iov_array[sorted[current_index]].iov_len; } bytes_to_write_in_cycle -= global_iov_array[sorted[current_index]].iov_len; current_index ++; continue; } } } #if TIME_BREAKDOWN start_sr = MPI_Wtime(); #endif /* Calculate the displacement on where to put the data and allocate the recieve buffer (global_buf) */ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { entries_per_aggregator=0; for (i=0;i<fh->f_procs_per_group; i++){ for (j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0) entries_per_aggregator++ ; } } #if DEBUG_ON printf("%d: cycle: %d, 
bytes_sent: %d\n ",fh->f_rank,index, bytes_sent); printf("%d : Entries per aggregator : %d\n",fh->f_rank,entries_per_aggregator); #endif if (entries_per_aggregator > 0){ file_offsets_for_agg = (local_io_array *) malloc(entries_per_aggregator*sizeof(local_io_array)); if (NULL == file_offsets_for_agg) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); if (NULL == sorted_file_offsets){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } /*Moving file offsets to an IO array!*/ temp_index = 0; for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen_per_process[i][j] > 0){ file_offsets_for_agg[temp_index].length = blocklen_per_process[i][j]; file_offsets_for_agg[temp_index].process_id = i; file_offsets_for_agg[temp_index].offset = displs_per_process[i][j]; temp_index++; #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, blocklen_per_process[i][j],j, displs_per_process[i][j], fh->f_rank); #endif } } } } else{ continue; } /* Sort the displacements for each aggregator*/ local_heap_sort (file_offsets_for_agg, entries_per_aggregator, sorted_file_offsets); /*create contiguous memory displacements based on blocklens on the same displs array and map it to this aggregator's actual file-displacements (this is in the io-array created above)*/ memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); memory_displacements[sorted_file_offsets[0]] = 0; for (i=1; i<entries_per_aggregator; i++){ memory_displacements[sorted_file_offsets[i]] = memory_displacements[sorted_file_offsets[i-1]] + file_offsets_for_agg[sorted_file_offsets[i-1]].length; } temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int)); if (NULL == temp_disp_index) { opal_output (1, 
"OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } /*Now update the displacements array with memory offsets*/ global_count = 0; for (i=0;i<entries_per_aggregator;i++){ temp_pindex = file_offsets_for_agg[sorted_file_offsets[i]].process_id; displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] = memory_displacements[sorted_file_offsets[i]]; if (temp_disp_index[temp_pindex] < disp_index[temp_pindex]) temp_disp_index[temp_pindex] += 1; else{ printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", temp_pindex, temp_disp_index[temp_pindex], temp_pindex, disp_index[temp_pindex]); } global_count += file_offsets_for_agg[sorted_file_offsets[i]].length; } if (NULL != temp_disp_index){ free(temp_disp_index); temp_disp_index = NULL; } #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0;i<fh->f_procs_per_group; i++){ for(j=0;j<disp_index[i];j++){ if (blocklen[i][j] > 0){ printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, blocklen_per_process[i][j],j, displs_per_process[i][j], fh->f_rank); } } } printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0; i<entries_per_aggregator;i++){ printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld\n", file_offsets_for_agg[sorted_file_offsets[i]].process_id, file_offsets_for_agg[sorted_file_offsets[i]].offset, file_offsets_for_agg[sorted_file_offsets[i]].length, memory_displacements[sorted_file_offsets[i]]); } printf("%d : global_count : %ld, bytes_sent : %d\n", fh->f_rank,global_count, bytes_sent); #endif global_buf = (char *) malloc (global_count); if (NULL == global_buf){ opal_output(1, "OUT OF MEMORY"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } recv_req_count = 0; for (i=0;i<fh->f_procs_per_group; i++){ ompi_datatype_create_hindexed(disp_index[i], blocklen_per_process[i], displs_per_process[i], MPI_BYTE, &recvtype[i]); ompi_datatype_commit(&recvtype[i]); MPI_Type_size 
(recvtype[i], &datatype_size); if (datatype_size){ recv_req = (MPI_Request *)realloc ((void *)recv_req, (recv_req_count + 1)*sizeof(MPI_Request)); ret = MCA_PML_CALL(irecv(global_buf, 1, recvtype[i], fh->f_procs_in_group[i], 123, fh->f_comm, &recv_req[recv_req_count])); recv_req_count++; if (OMPI_SUCCESS != ret){ goto exit; } } } } #if TIME_BREAKDOWN end_cio_array = MPI_Wtime(); tcio_array = end_cio_array - start_cio_array; #endif if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) { send_buf = &((char*)buf)[total_bytes_written]; } else if (bytes_sent) { /* allocate a send buffer and copy the data that needs to be sent into it in case the data is non-contigous in memory */ OPAL_PTRDIFF_TYPE mem_address; size_t remaining = 0; size_t temp_position = 0; send_buf = malloc (bytes_sent); if (NULL == send_buf) { opal_output (1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } remaining = bytes_sent; while (remaining) { mem_address = (OPAL_PTRDIFF_TYPE) (decoded_iov[iov_index].iov_base) + current_position; if (remaining >= (decoded_iov[iov_index].iov_len - current_position)) { memcpy (send_buf+temp_position, (IOVBASE_TYPE *)mem_address, decoded_iov[iov_index].iov_len - current_position); remaining = remaining - (decoded_iov[iov_index].iov_len - current_position); temp_position = temp_position + (decoded_iov[iov_index].iov_len - current_position); iov_index = iov_index + 1; current_position = 0; } else { memcpy (send_buf+temp_position, (IOVBASE_TYPE *) mem_address, remaining); current_position = current_position + remaining; remaining = 0; } } } total_bytes_written += bytes_sent; /* Gather the sendbuf from each process in appropritate locations in aggregators*/ send_req = (MPI_Request *) malloc (sizeof(MPI_Request)); if (NULL == send_req){ opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } if (bytes_sent){ ret = MCA_PML_CALL(isend(send_buf, bytes_sent, MPI_BYTE, fh->f_procs_in_group[fh->f_aggregator_index], 123, MCA_PML_BASE_SEND_STANDARD, 
fh->f_comm, send_req)); if ( OMPI_SUCCESS != ret ){ goto exit; } ret = ompi_request_wait(send_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { ret = ompi_request_wait_all (recv_req_count, recv_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret){ goto exit; } } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){ printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0 ; i<global_count/4 ; i++) printf (" RECV %d \n",((int *)global_buf)[i]); } #endif #if TIME_BREAKDOWN end_sr = MPI_Wtime(); tsr = end_sr - start_sr; comm_time += tsr; #endif if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) { if (NULL != send_buf) { free (send_buf); send_buf = NULL; } } /********************************************************** **************** DONE GATHERING OF DATA ****************** *********************************************************/ /********************************************************** ******* Create the io array, and pass it to fbtl ********* *********************************************************/ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { fh->f_io_array = (mca_io_ompio_io_array_t *) malloc (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } fh->f_num_of_io_entries = 0; /*First entry for every aggregator*/ fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)file_offsets_for_agg[sorted_file_offsets[0]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[0]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]]; fh->f_num_of_io_entries++; for (i=1;i<entries_per_aggregator;i++){ /* If the enrties are contiguous merge them, else make a new entry */ if 
(file_offsets_for_agg[sorted_file_offsets[i-1]].offset + file_offsets_for_agg[sorted_file_offsets[i-1]].length == file_offsets_for_agg[sorted_file_offsets[i]].offset){ fh->f_io_array[fh->f_num_of_io_entries - 1].length += file_offsets_for_agg[sorted_file_offsets[i]].length; } else { fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)file_offsets_for_agg[sorted_file_offsets[i]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[i]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[i]]; fh->f_num_of_io_entries++; } } #if DEBUG_ON printf("*************************** %d\n", fh->f_num_of_io_entries); for (i=0 ; i<fh->f_num_of_io_entries ; i++) { printf(" ADDRESS: %p OFFSET: %ld LENGTH: %ld\n", fh->f_io_array[i].memory_address, (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset, fh->f_io_array[i].length); } #endif #if TIME_BREAKDOWN start_ptime = MPI_Wtime(); #endif if (fh->f_num_of_io_entries) { if (OMPI_SUCCESS != fh->f_fbtl->fbtl_pwritev (fh, NULL)) { opal_output (1, "WRITE FAILED\n"); ret = OMPI_ERROR; goto exit; } } #if TIME_BREAKDOWN end_ptime = MPI_Wtime(); tpw = end_ptime - start_ptime; write_time += tpw; #endif } if (NULL != send_req){ free(send_req); send_req = NULL; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } if (NULL != recvtype){ free(recvtype); recvtype=NULL; } if (NULL != recv_req){ free(recv_req); recv_req = NULL; } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } } } #if TIME_BREAKDOWN end_time2 = MPI_Wtime(); total_io = end_time2-start_time2; fh->f_comm->c_coll.coll_allreduce (&total_io, &max_io, 1, MPI_DOUBLE, MPI_MAX, fh->f_comm, fh->f_comm->c_coll.coll_allreduce_module); fh->f_comm->c_coll.coll_allreduce (&comm_time, &max_comm_time, 1, MPI_DOUBLE, MPI_SUM, fh->f_comm, 
fh->f_comm->c_coll.coll_allreduce_module); fh->f_comm->c_coll.coll_allreduce (&write_time, &max_write_time, 1, MPI_DOUBLE, MPI_SUM, fh->f_comm, fh->f_comm->c_coll.coll_allreduce_module); if (0 == fh->f_rank){ printf ("Max Exchange and write ---- %f\n", max_io); printf ("AVG pwrite time : %f \n", max_write_time/mca_fcoll_dynamic_num_io_procs); printf ("AVG communication time : %f\n", max_comm_time/fh->f_size); } fh->f_comm->c_coll.coll_allreduce (&comm_time, &max_comm_time, 1, MPI_DOUBLE, MPI_MAX, fh->f_comm, fh->f_comm->c_coll.coll_allreduce_module); fh->f_comm->c_coll.coll_allreduce (&write_time, &max_write_time, 1, MPI_DOUBLE, MPI_MAX, fh->f_comm, fh->f_comm->c_coll.coll_allreduce_module); if (0 == fh->f_rank){ printf ("MAX pwrite time : %f \n", max_write_time); printf ("MAX communication time : %f\n", max_comm_time); } fh->f_comm->c_coll.coll_allreduce (&comm_time, &max_comm_time, 1, MPI_DOUBLE, MPI_MIN, fh->f_comm, fh->f_comm->c_coll.coll_allreduce_module); if (0 == fh->f_rank){ printf ("MIN communication time : %f\n", max_comm_time); } #endif exit : if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } if (NULL != disp_index){ free(disp_index); disp_index = NULL; } if (NULL != recvtype){ free(recvtype); recvtype=NULL; } if (NULL != recv_req){ free(recv_req); recv_req = NULL; } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } for(l=0;l<fh->f_procs_per_group;l++){ if (NULL != blocklen_per_process[l]){ free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != displs_per_process[l]){ free(displs_per_process[l]); displs_per_process[l] = NULL; } } if (NULL != blocklen_per_process){ free(blocklen_per_process); blocklen_per_process = NULL; } if (NULL != displs_per_process){ free(displs_per_process); displs_per_process = NULL; } } if (NULL != sorted) { free (sorted); sorted = NULL; } if (NULL != global_iov_array) { free (global_iov_array); 
global_iov_array = NULL; } if (NULL != fview_count) { free (fview_count); fview_count = NULL; } if (NULL != decoded_iov) { free (decoded_iov); decoded_iov = NULL; } if (NULL != send_req){ free(send_req); send_req = NULL; } return OMPI_SUCCESS; }
/*
 * Collective write for the "two_phase" fcoll component.
 *
 * Outline, as implemented below:
 *   1. If the datatype is not contiguous in memory it is decoded into an
 *      iovec whose bases are rewritten as offsets relative to buf
 *      (decoded_iov); otherwise max_data = count * datatype size.
 *   2. The first mca_fcoll_two_phase_num_io_procs ranks are listed as
 *      aggregators (aggregator_list[i] = i).
 *   3. The local file view is generated, the first/last file offset of
 *      every process is allgathered, and the file range is partitioned by
 *      mca_fcoll_two_phase_domain_partition().
 *   4. The request lists are computed (calc_my_requests /
 *      calc_others_requests) and two_phase_exch_and_write() performs the
 *      exchange and the write.
 *
 * fh        OMPIO file handle
 * buf       user buffer holding the data to write
 * count     number of elements of datatype in buf
 * datatype  element datatype
 * status    if not MPI_STATUS_IGNORE, _ucount is set to the local byte count
 *
 * Returns OMPI_SUCCESS, or the error code of the first step that failed.
 * Every path leaves through the exit label so temporaries are released.
 *
 * NOTE(review): fd_start, fd_end, my_req, others_req, buf_indices and
 * count_my_req_per_proc are produced by helpers not visible here and are
 * not released by this function; confirm who owns them.
 * NOTE(review): count == 0 with a non-contiguous datatype divides by zero
 * in "iov_count/count", and local_count == 0 makes the iov[0] /
 * iov[local_count-1] accesses invalid; confirm callers never do that.
 */
int
mca_fcoll_two_phase_file_write_all (mca_io_ompio_file_t *fh,
                                    void *buf,
                                    int count,
                                    struct ompi_datatype_t *datatype,
                                    ompi_status_public_t *status)
{
    int i, j,interleave_count=0, striping_unit=0;
    uint32_t iov_count=0,ti;
    struct iovec *decoded_iov=NULL, *temp_iov=NULL;
    size_t max_data = 0, total_bytes = 0;
    int domain_size=0, *count_my_req_per_proc=NULL, count_my_req_procs;
    int count_other_req_procs,  ret=OMPI_SUCCESS;
    size_t *buf_indices=NULL;
    int local_count = 0, local_size=0,*aggregator_list = NULL;
    struct iovec *iov = NULL;

    OMPI_MPI_OFFSET_TYPE start_offset, end_offset, fd_size;
    OMPI_MPI_OFFSET_TYPE *start_offsets=NULL, *end_offsets=NULL;
    OMPI_MPI_OFFSET_TYPE *fd_start=NULL, *fd_end=NULL, min_st_offset;
    Flatlist_node *flat_buf=NULL;
    mca_io_ompio_access_array_t *my_req=NULL, *others_req=NULL;
    MPI_Aint send_buf_addr;
#if TIME_BREAKDOWN
    print_entry nentry;
#endif

    if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
        fh->f_flags = fh->f_flags |  OMPIO_CONTIGUOUS_MEMORY;
    }

    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {

        ret =   ompi_io_ompio_decode_datatype (fh,
                                               datatype,
                                               count,
                                               buf,
                                               &max_data,
                                               &temp_iov,
                                               &iov_count);
        if (OMPI_SUCCESS != ret ){
            goto exit;
        }

        send_buf_addr = (OPAL_PTRDIFF_TYPE)buf;
        decoded_iov = (struct iovec *)malloc
            (iov_count * sizeof(struct iovec));
        /* malloc(0) may legitimately return NULL, so only a failed
           non-empty request is an error */
        if (0 != iov_count && NULL == decoded_iov) {
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        /* Store each segment as an offset relative to the start of buf */
        for (ti = 0; ti < iov_count; ti ++){
            decoded_iov[ti].iov_base = (IOVBASE_TYPE *)(
                (OPAL_PTRDIFF_TYPE)temp_iov[ti].iov_base -
                send_buf_addr);
            decoded_iov[ti].iov_len =
                temp_iov[ti].iov_len ;
#if DEBUG_ON
            printf("d_offset[%d]: %ld, d_len[%d]: %ld\n",
                   ti, (OPAL_PTRDIFF_TYPE)decoded_iov[ti].iov_base,
                   ti, decoded_iov[ti].iov_len);
#endif
        }
    }
    else{
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }

    if(-1 == mca_fcoll_two_phase_num_io_procs){
        ret = ompi_io_ompio_set_aggregator_props (fh,
                                                  mca_fcoll_two_phase_num_io_procs,
                                                  max_data);
        if ( OMPI_SUCCESS != ret){
            goto exit;
        }

        mca_fcoll_two_phase_num_io_procs =
            ceil((float)fh->f_size/fh->f_procs_per_group);
    }

    if (mca_fcoll_two_phase_num_io_procs > fh->f_size){
        mca_fcoll_two_phase_num_io_procs = fh->f_size;
    }

#if DEBUG_ON
    printf("Number of aggregators : %ld\n", mca_fcoll_two_phase_num_io_procs);
#endif

    aggregator_list = (int *) malloc (mca_fcoll_two_phase_num_io_procs *
                                      sizeof(int));
    if ( NULL == aggregator_list ) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    for (i =0; i< mca_fcoll_two_phase_num_io_procs; i++){
        aggregator_list[i] = i;
    }

    ret = ompi_io_ompio_generate_current_file_view (fh,
                                                    max_data,
                                                    &iov,
                                                    &local_count);
    if ( OMPI_SUCCESS != ret ){
        goto exit;
    }

    /* max_data/total_bytes are integer byte counts, so reduce them with an
       integer datatype (MPI_LONG, as used for the offsets below) rather
       than MPI_DOUBLE. */
    ret = fh->f_comm->c_coll.coll_allreduce (&max_data,
                                             &total_bytes,
                                             1,
                                             MPI_LONG,
                                             MPI_SUM,
                                             fh->f_comm,
                                             fh->f_comm->c_coll.coll_allreduce_module);
    if ( OMPI_SUCCESS != ret ) {
        goto exit;
    }

    if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {

        /* This datastructure translates between OMPIO->ROMIO; it is a
           little hacky, but helps to re-use romio's code for handling
           non-contiguous file-type */
        flat_buf = (Flatlist_node *)malloc(sizeof(Flatlist_node));
        if ( NULL == flat_buf ){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        flat_buf->type = datatype;
        flat_buf->next = NULL;
        flat_buf->count = 0;
        /* The exit path inspects both pointers, so they must be defined
           even when one of the allocations below fails. */
        flat_buf->indices = NULL;
        flat_buf->blocklens = NULL;

        local_size = iov_count/count;
        flat_buf->indices =
            (OMPI_MPI_OFFSET_TYPE *)malloc(local_size *
                                           sizeof(OMPI_MPI_OFFSET_TYPE));
        if ( NULL == flat_buf->indices ){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        flat_buf->blocklens =
            (OMPI_MPI_OFFSET_TYPE *)malloc(local_size *
                                           sizeof(OMPI_MPI_OFFSET_TYPE));
        if ( NULL == flat_buf->blocklens ){
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        flat_buf->count = local_size;
        i=0;j=0;
        while(j < local_size){
            flat_buf->indices[j] = (OMPI_MPI_OFFSET_TYPE)(intptr_t)decoded_iov[i].iov_base;
            flat_buf->blocklens[j] = decoded_iov[i].iov_len;

            if(i < (int)iov_count)
                i+=1;

            j+=1;
        }

#if DEBUG_ON
        printf("flat_buf_count : %d\n", flat_buf->count);
        for(i=0;i<flat_buf->count;i++){
            printf("%d: blocklen[%d] : %lld, indices[%d]: %lld \n",
                   fh->f_rank, i, flat_buf->blocklens[i], i ,flat_buf->indices[i]);
        }
#endif
    }

#if DEBUG_ON
    printf("%d: fcoll:two_phase:write_all->total_bytes:%ld, local_count: %d\n",
           fh->f_rank,total_bytes, local_count);
    for (i=0 ; i<local_count ; i++) {
        printf("%d: fcoll:two_phase:write_all:OFFSET:%ld,LENGTH:%ld\n",
               fh->f_rank,
               (size_t)iov[i].iov_base,
               (size_t)iov[i].iov_len);
    }
#endif

    /* First and last byte touched by this process' file view */
    start_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)iov[0].iov_base;
    end_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)iov[local_count-1].iov_base +
        (OMPI_MPI_OFFSET_TYPE)iov[local_count-1].iov_len - 1;

#if DEBUG_ON
    printf("%d: fcoll:two_phase:write_all:START OFFSET:%ld,END OFFSET:%ld\n",
           fh->f_rank,
           (size_t)start_offset,
           (size_t)end_offset);
#endif

    start_offsets = (OMPI_MPI_OFFSET_TYPE *)malloc
        (fh->f_size*sizeof(OMPI_MPI_OFFSET_TYPE));
    if ( NULL == start_offsets ){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    end_offsets = (OMPI_MPI_OFFSET_TYPE *)malloc
        (fh->f_size*sizeof(OMPI_MPI_OFFSET_TYPE));
    if ( NULL == end_offsets ){
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    ret = fh->f_comm->c_coll.coll_allgather(&start_offset,
                                            1,
                                            MPI_LONG,
                                            start_offsets,
                                            1,
                                            MPI_LONG,
                                            fh->f_comm,
                                            fh->f_comm->c_coll.coll_allgather_module);
    if ( OMPI_SUCCESS != ret ){
        goto exit;
    }

    ret = fh->f_comm->c_coll.coll_allgather(&end_offset,
                                            1,
                                            MPI_LONG,
                                            end_offsets,
                                            1,
                                            MPI_LONG,
                                            fh->f_comm,
                                            fh->f_comm->c_coll.coll_allgather_module);
    if ( OMPI_SUCCESS != ret ){
        goto exit;
    }

#if DEBUG_ON
    for (i=0;i<fh->f_size;i++){
        printf("%d: fcoll:two_phase:write_all:start[%d]:%ld,end[%d]:%ld\n",
               fh->f_rank,i,
               (size_t)start_offsets[i],i,
               (size_t)end_offsets[i]);
    }
#endif

    /* Count ranks whose range starts before the previous rank's range ends */
    for (i=1; i<fh->f_size; i++){
        if ((start_offsets[i] < end_offsets[i-1]) &&
            (start_offsets[i] <= end_offsets[i])){
            interleave_count++;
        }
    }

#if DEBUG_ON
    printf("%d: fcoll:two_phase:write_all:interleave_count:%d\n",
           fh->f_rank,interleave_count);
#endif

    ret = mca_fcoll_two_phase_domain_partition(fh,
                                               start_offsets,
                                               end_offsets,
                                               &min_st_offset,
                                               &fd_start,
                                               &fd_end,
                                               domain_size,
                                               &fd_size,
                                               striping_unit,
                                               mca_fcoll_two_phase_num_io_procs);
    if ( OMPI_SUCCESS != ret ){
        goto exit;
    }

#if DEBUG_ON
    for (i=0;i<mca_fcoll_two_phase_num_io_procs;i++){
        printf("fd_start[%d] : %lld, fd_end[%d] : %lld, local_count: %d\n",
               i, fd_start[i], i, fd_end[i], local_count);
    }
#endif

    ret = mca_fcoll_two_phase_calc_my_requests (fh,
                                                iov,
                                                local_count,
                                                min_st_offset,
                                                fd_start,
                                                fd_end,
                                                fd_size,
                                                &count_my_req_procs,
                                                &count_my_req_per_proc,
                                                &my_req,
                                                &buf_indices,
                                                striping_unit,
                                                mca_fcoll_two_phase_num_io_procs,
                                                aggregator_list);
    if ( OMPI_SUCCESS != ret ){
        goto exit;
    }

    ret = mca_fcoll_two_phase_calc_others_requests(fh,
                                                   count_my_req_procs,
                                                   count_my_req_per_proc,
                                                   my_req,
                                                   &count_other_req_procs,
                                                   &others_req);
    if (OMPI_SUCCESS != ret ){
        goto exit;
    }

#if DEBUG_ON
    printf("count_other_req_procs : %d\n", count_other_req_procs);
#endif

#if TIME_BREAKDOWN
    start_exch = MPI_Wtime();
#endif

    ret = two_phase_exch_and_write(fh,
                                   buf,
                                   datatype,
                                   others_req,
                                   iov,
                                   local_count,
                                   min_st_offset,
                                   fd_size,
                                   fd_start,
                                   fd_end,
                                   flat_buf,
                                   buf_indices,
                                   striping_unit,
                                   aggregator_list);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }

#if TIME_BREAKDOWN
    end_exch = MPI_Wtime();
    exch_write += (end_exch - start_exch);
    nentry.time[0] = write_time;
    nentry.time[1] = comm_time;
    nentry.time[2] = exch_write;
    if (is_aggregator(fh->f_rank,
                      mca_fcoll_two_phase_num_io_procs,
                      aggregator_list)){
        nentry.aggregator = 1;
    }
    else{
        nentry.aggregator = 0;
    }
    nentry.nprocs_for_coll = mca_fcoll_two_phase_num_io_procs;
    if (!ompi_io_ompio_full_print_queue(WRITE_PRINT_QUEUE)){
        ompi_io_ompio_register_print_entry(WRITE_PRINT_QUEUE,
                                           nentry);
    }
#endif

exit :
    if (flat_buf != NULL) {
        if (flat_buf->blocklens != NULL) {
            free (flat_buf->blocklens);
        }
        if (flat_buf->indices != NULL) {
            free (flat_buf->indices);
        }
        free (flat_buf);
    }

    if (start_offsets != NULL) {
        free(start_offsets);
    }

    if (end_offsets != NULL){
        free(end_offsets);
    }

    if (aggregator_list != NULL){
        free(aggregator_list);
    }

    /* Buffers obtained in this call that were previously never released:
       the decoded iovec (and its buf-relative copy) and the local file
       view.  free(NULL) is a no-op. */
    free (decoded_iov);
    free (temp_iov);
    free (iov);

    return ret;
}
/**
 * Collective read of the "dynamic" fcoll component.
 *
 * The per-process file views are gathered inside the aggregator group and an
 * index permutation over the combined view is produced by
 * ompi_io_ompio_sort_iovec().  The combined view is then consumed in cycles
 * of at most mca_fcoll_dynamic_cycle_buffer_size bytes.  In every cycle the
 * aggregator builds fh->f_io_array, reads through fh->f_fbtl->fbtl_preadv()
 * and scatters the bytes back to the processes of its group; processes whose
 * memory layout is not contiguous unpack the received bytes into the decoded
 * iovec of the user buffer.
 *
 * @param fh        OMPIO file handle (aggregator properties are set here if
 *                  OMPIO_AGGREGATOR_IS_SET is not yet flagged)
 * @param buf       user buffer receiving the data
 * @param count     number of datatype elements in buf
 * @param datatype  datatype describing the memory layout of buf
 * @param status    not touched by this routine
 *
 * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if an allocation fails,
 *         OMPI_ERROR if the fbtl read fails, or the error code reported by
 *         the file-view generation / allgather helpers.
 *
 * All temporary buffers are released on every return path.
 */
int
mca_fcoll_dynamic_file_read_all (mca_io_ompio_file_t *fh,
                                 void *buf,
                                 int count,
                                 struct ompi_datatype_t *datatype,
                                 ompi_status_public_t *status)
{
    MPI_Aint position = 0;
    MPI_Aint total_bytes = 0;            /* total bytes to be read */
    MPI_Aint bytes_to_read_in_cycle = 0; /* left to be read in a cycle */
    MPI_Aint bytes_per_cycle = 0;        /* total read in each cycle by each process */
    int index = 0;
    int cycles = 0;
    int i=0, j=0, x=0;
    int n=0;                             /* owner (group index) of the current view entry */
    MPI_Aint bytes_remaining = 0;        /* unread rest of the current view entry */
    int bytes_received = 0;
    int blocks = 0;
    int ret = OMPI_SUCCESS;
    int i_am_aggregator = 0;
    int io_array_allocated = 0;          /* fh->f_io_array currently owned by this call */

    /* iovec structure and count of the buffer passed in */
    uint32_t iov_count = 0;
    struct iovec *decoded_iov = NULL;
    int iov_index = 0;
    size_t current_position = 0;

    char *receive_buf = NULL;

    /* global iovec at the readers that contains the iovecs created from
       file_set_view */
    uint32_t total_fview_count = 0;
    struct iovec *global_fview = NULL;
    int local_count = 0;
    struct iovec *iov = NULL;
    int *fview_count = NULL;
    int current_index;

    char *global_buf = NULL;
    MPI_Aint global_count = 0;

    /* array that contains the sorted indices of the global_iov */
    int *sorted = NULL;
    int *displs = NULL;
    size_t max_data = 0;
    int *bytes_per_process = NULL;
    MPI_Aint bytes_left = 0;
    MPI_Aint *total_bytes_per_process = NULL;
    MPI_Aint *temp = NULL;               /* per-process fill level of global_buf in a cycle */

    if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
        fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY;
    }

    /**************************************************************************
     ** In case the data is not contiguous in memory, decode it into an iovec **
     **************************************************************************/
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
        ompi_io_ompio_decode_datatype (fh,
                                       datatype,
                                       count,
                                       buf,
                                       &max_data,
                                       &decoded_iov,
                                       &iov_count);
    }
    else {
        max_data = count * datatype->super.size;
    }

    if (! (fh->f_flags & OMPIO_AGGREGATOR_IS_SET)) {
        ompi_io_ompio_set_aggregator_props (fh,
                                            mca_fcoll_dynamic_num_io_procs,
                                            max_data);
    }
    i_am_aggregator = (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank);

    total_bytes_per_process = (MPI_Aint*)malloc
        (fh->f_procs_per_group*sizeof(MPI_Aint));
    if (NULL == total_bytes_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    ret = ompi_io_ompio_allgather_array (&max_data,
                                         1,
                                         MPI_LONG,
                                         total_bytes_per_process,
                                         1,
                                         MPI_LONG,
                                         fh->f_aggregator_index,
                                         fh->f_procs_in_group,
                                         fh->f_procs_per_group,
                                         fh->f_comm);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

    for (i=0 ; i<fh->f_procs_per_group ; i++) {
        total_bytes += total_bytes_per_process[i];
    }
    free (total_bytes_per_process);
    total_bytes_per_process = NULL;

    /*********************************************************************
     *** Generate the File offsets/lengths corresponding to this read  ***
     ********************************************************************/
    ret = ompi_io_ompio_generate_current_file_view (fh,
                                                    max_data,
                                                    &iov,
                                                    &local_count);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

    /*************************************************************
     *** ALLGather the File View information at all processes ***
     *************************************************************/
    fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == fview_count) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    ret = ompi_io_ompio_allgather_array (&local_count,
                                         1,
                                         MPI_INT,
                                         fview_count,
                                         1,
                                         MPI_INT,
                                         fh->f_aggregator_index,
                                         fh->f_procs_in_group,
                                         fh->f_procs_per_group,
                                         fh->f_comm);
    if (OMPI_SUCCESS != ret) {
        goto exit;
    }

    displs = (int*)malloc (fh->f_procs_per_group*sizeof(int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    displs[0] = 0;
    total_fview_count = fview_count[0];
    for (i=1 ; i<fh->f_procs_per_group ; i++) {
        total_fview_count += fview_count[i];
        displs[i] = displs[i-1] + fview_count[i-1];
    }

    /* allocate the global iovec */
    if (0 != total_fview_count) {
        global_fview = (struct iovec*)malloc (total_fview_count *
                                              sizeof(struct iovec));
        if (NULL == global_fview) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

    if (fh->f_flags & OMPIO_UNIFORM_FVIEW) {
        ompi_io_ompio_allgather_array (iov,
                                       local_count,
                                       fh->f_iov_type,
                                       global_fview,
                                       local_count,
                                       fh->f_iov_type,
                                       fh->f_aggregator_index,
                                       fh->f_procs_in_group,
                                       fh->f_procs_per_group,
                                       fh->f_comm);
    }
    else {
        ompi_io_ompio_allgatherv_array (iov,
                                        local_count,
                                        fh->f_iov_type,
                                        global_fview,
                                        fview_count,
                                        displs,
                                        fh->f_iov_type,
                                        fh->f_aggregator_index,
                                        fh->f_procs_in_group,
                                        fh->f_procs_per_group,
                                        fh->f_comm);
    }

    /* sort it */
    if (0 != total_fview_count) {
        sorted = (int *)malloc (total_fview_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        ompi_io_ompio_sort_iovec (global_fview, total_fview_count, sorted);
    }

    /* the local view is no longer needed once it is part of global_fview */
    free (iov);
    iov = NULL;

    if (i_am_aggregator) {
        bytes_per_process = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == bytes_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

    /*
     * Calculate how many bytes are read in each cycle
     */
    bytes_per_cycle = mca_fcoll_dynamic_cycle_buffer_size;
    cycles = ceil((double)total_bytes/bytes_per_cycle);

    n = 0;
    bytes_remaining = 0;
    current_index = 0;

    for (index = 0; index < cycles; index++) {
        int k;

        if (i_am_aggregator) {
            memset(displs, 0x0, fh->f_procs_per_group*sizeof(int));
            memset(bytes_per_process, 0x0, fh->f_procs_per_group*sizeof(int));
        }

        if (cycles-1 == index) {
            bytes_to_read_in_cycle = total_bytes - bytes_per_cycle*index;
        }
        else {
            bytes_to_read_in_cycle = bytes_per_cycle;
        }

        /* Calculate how much data will be contributed in this cycle
           by each process */
        bytes_received = 0;

        while (bytes_to_read_in_cycle) {
            /* find the group member that owns view entry sorted[current_index];
               the bound on j+1 keeps the scan inside fview_count */
            blocks = fview_count[0];
            for (j=0 ; j<fh->f_procs_per_group ; j++) {
                if (sorted[current_index] < blocks) {
                    n = j;
                    break;
                }
                if (j+1 < fh->f_procs_per_group) {
                    blocks += fview_count[j+1];
                }
            }

            if (bytes_remaining) {
                /* a previous cycle stopped in the middle of this entry */
                if (bytes_remaining <= bytes_to_read_in_cycle) {
                    if (i_am_aggregator) {
                        bytes_per_process[n] += bytes_remaining;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_remaining;
                    }
                    current_index ++;
                    bytes_to_read_in_cycle -= bytes_remaining;
                    bytes_remaining = 0;
                    continue;
                }
                else {
                    if (i_am_aggregator) {
                        bytes_per_process[n] += bytes_to_read_in_cycle;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_to_read_in_cycle;
                    }
                    bytes_remaining -= bytes_to_read_in_cycle;
                    bytes_to_read_in_cycle = 0;
                    break;
                }
            }
            else {
                if (bytes_to_read_in_cycle <
                    (MPI_Aint) global_fview[sorted[current_index]].iov_len) {
                    /* entry only partially fits into this cycle */
                    if (i_am_aggregator) {
                        bytes_per_process[n] += bytes_to_read_in_cycle;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_to_read_in_cycle;
                    }
                    bytes_remaining = global_fview[sorted[current_index]].iov_len -
                        bytes_to_read_in_cycle;
                    bytes_to_read_in_cycle = 0;
                    break;
                }
                else {
                    if (i_am_aggregator) {
                        bytes_per_process[n] +=
                            global_fview[sorted[current_index]].iov_len;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received +=
                            global_fview[sorted[current_index]].iov_len;
                    }
                    bytes_to_read_in_cycle -=
                        global_fview[sorted[current_index]].iov_len;
                    current_index ++;
                    continue;
                }
            }
        }

        /* Calculate the displacement on where to put the data and allocate
           the receive buffer (global_buf) */
        if (i_am_aggregator) {
            displs[0] = 0;
            global_count = bytes_per_process[0];
            for (i=1 ; i<fh->f_procs_per_group ; i++) {
                global_count += bytes_per_process[i];
                displs[i] = displs[i-1] + bytes_per_process[i-1];
            }

            global_buf = malloc (global_count);
            if (NULL == global_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }

        /**********************************************************
         ******* Create the io array, and pass it to fbtl *********
         *********************************************************/
        if (i_am_aggregator) {
            MPI_Aint bytes_to_read = global_count;
            int block = 1;
            k = 0;

            temp = (MPI_Aint *)malloc (sizeof(MPI_Aint) * fh->f_procs_per_group);
            if (NULL == temp) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            memset(temp, 0x0, fh->f_procs_per_group*sizeof(MPI_Aint));

            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                (OMPIO_IOVEC_INITIAL_SIZE * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            io_array_allocated = 1;

            while (bytes_to_read) {
                int start = 0;

                if (OMPIO_IOVEC_INITIAL_SIZE*block <= k) {
                    /* grow through a temporary so that the old array is not
                       lost (and can still be freed) when realloc fails */
                    mca_io_ompio_io_array_t *grown;

                    block ++;
                    grown = (mca_io_ompio_io_array_t *)realloc
                        (fh->f_io_array, OMPIO_IOVEC_INITIAL_SIZE *
                         block * sizeof(mca_io_ompio_io_array_t));
                    if (NULL == grown) {
                        opal_output(1, "OUT OF MEMORY\n");
                        ret = OMPI_ERR_OUT_OF_RESOURCE;
                        goto exit;
                    }
                    fh->f_io_array = grown;
                }

                blocks = fview_count[0];
                for (j=0 ; j<fh->f_procs_per_group ; j++) {
                    if (sorted[x] < blocks) {
                        n = j;
                        break;
                    }
                    if (j+1 < fh->f_procs_per_group) {
                        blocks += fview_count[j+1];
                    }
                }
                /* start of process n's region inside global_buf */
                for (j=0 ; j<n ; j++) {
                    start += bytes_per_process[j];
                }

                if (bytes_left) {
                    if (bytes_left <= bytes_to_read) {
                        fh->f_io_array[k].offset = (IOVBASE_TYPE *)
                            ((OPAL_PTRDIFF_TYPE)global_fview[sorted[x]].iov_base +
                             (global_fview[sorted[x]].iov_len - bytes_left));
                        fh->f_io_array[k].length = bytes_left;
                        fh->f_io_array[k].memory_address =
                            &global_buf[start+temp[n]];
                        temp[n] += fh->f_io_array[k].length;
                        bytes_to_read -= bytes_left;
                        bytes_left = 0;
                        k ++;
                        x ++;
                        continue;
                    }
                    else {
                        fh->f_io_array[k].offset = (IOVBASE_TYPE *)
                            ((OPAL_PTRDIFF_TYPE)global_fview[sorted[x]].iov_base +
                             (global_fview[sorted[x]].iov_len - bytes_left));
                        fh->f_io_array[k].length = bytes_to_read;
                        fh->f_io_array[k].memory_address =
                            &global_buf[start+temp[n]];
                        temp[n] += fh->f_io_array[k].length;
                        bytes_left -= bytes_to_read;
                        bytes_to_read = 0;
                        k ++;
                        break;
                    }
                }
                else {
                    if (bytes_to_read < (MPI_Aint) global_fview[sorted[x]].iov_len) {
                        fh->f_io_array[k].offset = global_fview[sorted[x]].iov_base;
                        fh->f_io_array[k].length = bytes_to_read;
                        fh->f_io_array[k].memory_address =
                            &global_buf[start+temp[n]];
                        bytes_left = global_fview[sorted[x]].iov_len - bytes_to_read;
                        bytes_to_read = 0;
                        k ++;
                        break;
                    }
                    else {
                        fh->f_io_array[k].offset = global_fview[sorted[x]].iov_base;
                        fh->f_io_array[k].length = global_fview[sorted[x]].iov_len;
                        fh->f_io_array[k].memory_address =
                            &global_buf[start+temp[n]];
                        temp[n] += fh->f_io_array[k].length;
                        bytes_to_read -= global_fview[sorted[x]].iov_len;
                        k ++;
                        x ++;
                        continue;
                    }
                }
            }

            fh->f_num_of_io_entries = k;

            if (fh->f_num_of_io_entries) {
                if (OMPI_SUCCESS != fh->f_fbtl->fbtl_preadv (fh, NULL)) {
                    opal_output (1, "READ FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }

            free (temp);
            temp = NULL;
        }
        /**********************************************************
         ******************** DONE READING ************************
         *********************************************************/

        /**********************************************************
         ********* Scatter the Data from the readers **************
         *********************************************************/
        if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
            receive_buf = &((char*)buf)[position];
        }
        else if (bytes_received) {
            /* allocate a receive buffer and copy the data that needs
               to be received into it in case the data is non-contiguous
               in memory */
            receive_buf = malloc (bytes_received);
            if (NULL == receive_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }

        ompi_io_ompio_scatterv_array (global_buf,
                                      bytes_per_process,
                                      displs,
                                      MPI_BYTE,
                                      receive_buf,
                                      bytes_received,
                                      MPI_BYTE,
                                      fh->f_aggregator_index,
                                      fh->f_procs_in_group,
                                      fh->f_procs_per_group,
                                      fh->f_comm);
        position += bytes_received;

        /* If data is not contiguous in memory, copy the data from the
           receive buffer into the buffer passed in */
        if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            remaining = bytes_received;

            while (remaining) {
                mem_address = (OPAL_PTRDIFF_TYPE)
                    (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }

            /* receive_buf is a private staging buffer only in this branch */
            if (NULL != receive_buf) {
                free (receive_buf);
                receive_buf = NULL;
            }
        }
        /**********************************************************
         **************** DONE SCATTERING OF DATA *****************
         *********************************************************/

        if (i_am_aggregator) {
            fh->f_num_of_io_entries = 0;
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            io_array_allocated = 0;
            if (NULL != global_buf) {
                free (global_buf);
                global_buf = NULL;
            }
        }
    }

exit:
    /* Common cleanup for the success and all error paths.  free(NULL) is a
       no-op, so buffers that were never allocated need no guard. */
    if (io_array_allocated) {
        /* only reached when a cycle was aborted half-way */
        free (fh->f_io_array);
        fh->f_io_array = NULL;
        fh->f_num_of_io_entries = 0;
    }
    free (temp);
    free (global_buf);
    free (total_bytes_per_process);
    free (iov);
    free (sorted);
    free (global_fview);
    free (fview_count);
    free (decoded_iov);
    free (bytes_per_process);
    free (displs);

    return ret;
}
int mca_fcoll_static_file_write_all (mca_io_ompio_file_t *fh, void *buf, int count, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { size_t max_data = 0, bytes_per_cycle=0; struct iovec *iov=NULL, *decoded_iov=NULL; uint32_t iov_count=0, iov_index=0; int i=0,j=0,l=0, temp_index; int ret=OMPI_SUCCESS, cycles, local_cycles, *bytes_per_process=NULL; int index, *disp_index=NULL, **blocklen_per_process=NULL; int *iovec_count_per_process=NULL, *displs=NULL; size_t total_bytes_written=0; MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL; MPI_Aint bytes_to_write_in_cycle=0, global_iov_count=0, global_count=0; local_io_array *local_iov_array =NULL, *global_iov_array=NULL; local_io_array *file_offsets_for_agg=NULL; int *sorted=NULL, *sorted_file_offsets=NULL, temp_pindex, *temp_disp_index=NULL; char *send_buf=NULL, *global_buf=NULL; int iov_size=0, current_position=0, *current_index=NULL; int *bytes_remaining=NULL, entries_per_aggregator=0; ompi_datatype_t **recvtype = NULL; MPI_Request *send_req=NULL, *recv_req=NULL; /* For creating datatype of type io_array */ int blocklen[3] = {1, 1, 1}; int static_num_io_procs=1; OPAL_PTRDIFF_TYPE d[3], base; ompi_datatype_t *types[3]; ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL; /*----------------------------------------------*/ #if TIME_BREAKDOWN double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0; double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0; double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0; print_entry nentry; #endif #if DEBUG_ON MPI_Aint gc_in; #endif // if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) { // fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY; // } /* In case the data is not contigous in memory, decode it into an iovec */ if (! 
(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) { ompi_io_ompio_decode_datatype (fh, datatype, count, buf, &max_data, &decoded_iov, &iov_count); } else { max_data = count * datatype->super.size; } if ( MPI_STATUS_IGNORE != status ) { status->_ucount = max_data; } mca_io_ompio_get_num_aggregators ( & static_num_io_procs ); ompi_io_ompio_set_aggregator_props (fh, static_num_io_procs, max_data); /* io_array datatype for using in communication*/ types[0] = &ompi_mpi_long.dt; types[1] = &ompi_mpi_long.dt; types[2] = &ompi_mpi_int.dt; d[0] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0]; d[1] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].length; d[2] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].process_id; base = d[0]; for (i=0 ; i<3 ; i++) { d[i] -= base; } ompi_datatype_create_struct (3, blocklen, d, types, &io_array_type); ompi_datatype_commit (&io_array_type); /* #########################################################*/ ret = ompi_io_ompio_generate_current_file_view(fh, max_data, &iov, &iov_size); if (ret != OMPI_SUCCESS) { fprintf(stderr,"Current File View Generation Error\n"); goto exit; } if (0 == iov_size) { iov_size = 1; } local_iov_array = (local_io_array *)malloc (iov_size * sizeof(local_io_array)); if ( NULL == local_iov_array) { fprintf(stderr,"local_iov_array allocation error\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (j=0; j < iov_size; j++) { local_iov_array[j].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t) iov[j].iov_base; local_iov_array[j].length = (size_t)iov[j].iov_len; local_iov_array[j].process_id = fh->f_rank; } mca_io_ompio_get_bytes_per_agg ( (int *) &bytes_per_cycle); local_cycles = ceil((double)max_data/bytes_per_cycle); ret = fh->f_comm->c_coll.coll_allreduce (&local_cycles, &cycles, 1, MPI_INT, MPI_MAX, fh->f_comm, fh->f_comm->c_coll.coll_allreduce_module); if (OMPI_SUCCESS != ret) { fprintf(stderr,"local cycles allreduce!\n"); goto exit; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { disp_index = (int *)malloc (fh->f_procs_per_group * 
sizeof (int)); if (NULL == disp_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int )); if (NULL == bytes_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } bytes_remaining = (int *) malloc (fh->f_procs_per_group * sizeof(int)); if (NULL == bytes_remaining) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } current_index = (int *) malloc (fh->f_procs_per_group * sizeof(int)); if (NULL == current_index) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*)); if (NULL == blocklen_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process = (MPI_Aint **) malloc (fh->f_procs_per_group * sizeof (MPI_Aint*)); if (NULL == displs_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for(i=0; i<fh->f_procs_per_group; i++) { current_index[i] = 0; bytes_remaining[i] =0; blocklen_per_process[i] = NULL; displs_per_process[i] = NULL; } } iovec_count_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int)); if (NULL == iovec_count_per_process) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs = (int *) malloc (fh->f_procs_per_group * sizeof(int)); if (NULL == displs) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = ompi_io_ompio_allgather_array (&iov_size, 1, MPI_INT, iovec_count_per_process, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if( OMPI_SUCCESS != ret) { fprintf(stderr,"iov size allgatherv array!\n"); goto exit; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { displs[0] = 0; global_iov_count = iovec_count_per_process[0]; for (i=1 ; 
i<fh->f_procs_per_group ; i++) { global_iov_count += iovec_count_per_process[i]; displs[i] = displs[i-1] + iovec_count_per_process[i-1]; } } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { global_iov_array = (local_io_array *) malloc (global_iov_count * sizeof(local_io_array)); if (NULL == global_iov_array) { opal_output (1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } } ret = ompi_io_ompio_gatherv_array (local_iov_array, iov_size, io_array_type, global_iov_array, iovec_count_per_process, displs, io_array_type, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); if (OMPI_SUCCESS != ret) { fprintf(stderr,"global_iov_array gather error!\n"); goto exit; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if ( 0 == global_iov_count) { global_iov_count = 1; } sorted = (int *)malloc (global_iov_count * sizeof(int)); if (NULL == sorted) { opal_output (1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } local_heap_sort (global_iov_array, global_iov_count, sorted); } #if DEBUG_ON if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { for (gc_in=0; gc_in<global_iov_count; gc_in++) { printf("%d: Offset[%ld]: %lld, Length[%ld]: %ld\n", global_iov_array[gc_in].process_id, gc_in, global_iov_array[gc_in].offset, gc_in, global_iov_array[gc_in].length); } } #endif #if TIME_BREAKDOWN start_exch = MPI_Wtime(); #endif for (index = 0; index < cycles; index++) { if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if (NULL == recvtype) { recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *)); if (NULL == recvtype) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } for(l=0; l<fh->f_procs_per_group; l++) { disp_index[l] = 1; if (NULL != blocklen_per_process[l]) { free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != displs_per_process[l]) { free(displs_per_process[l]); displs_per_process[l] = 
NULL; } blocklen_per_process[l] = (int *) calloc (1, sizeof(int)); if (NULL == blocklen_per_process[l]) { opal_output (1, "OUT OF MEMORY for blocklen\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint)); if (NULL == displs_per_process[l]) { opal_output (1, "OUT OF MEMORY for displs\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } } if (NULL != sorted_file_offsets) { free(sorted_file_offsets); sorted_file_offsets = NULL; } if(NULL != file_offsets_for_agg) { free(file_offsets_for_agg); file_offsets_for_agg = NULL; } if (NULL != memory_displacements) { free(memory_displacements); memory_displacements = NULL; } } if (local_cycles > index) { if ((index == local_cycles-1) && (max_data % bytes_per_cycle)) { bytes_to_write_in_cycle = max_data % bytes_per_cycle; } else if (max_data <= bytes_per_cycle) { bytes_to_write_in_cycle = max_data; } else { bytes_to_write_in_cycle = bytes_per_cycle; } } else { bytes_to_write_in_cycle = 0; } #if DEBUG_ON /* if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {*/ printf ("***%d: CYCLE %d Bytes %ld**********\n", fh->f_rank, index, bytes_to_write_in_cycle); /* }*/ #endif /********************************************************** **Gather the Data from all the processes at the writers ** *********************************************************/ /* gather from each process how many bytes each will be sending */ ompi_io_ompio_gather_array (&bytes_to_write_in_cycle, 1, MPI_INT, bytes_per_process, 1, MPI_INT, fh->f_aggregator_index, fh->f_procs_in_group, fh->f_procs_per_group, fh->f_comm); /* For each aggregator it needs to get bytes_to_write_in_cycle from each process in group which adds up to bytes_per_cycle */ if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { for (i=0; i<fh->f_procs_per_group; i++) { /* printf("bytes_per_process[%d]: %d\n", i, bytes_per_process[i]); */ #if DEBUG_ON printf ("%d : bytes_per_process : %d\n", fh->f_procs_in_group[i], 
bytes_per_process[i]); #endif while (bytes_per_process[i] > 0) { if (get_process_id(global_iov_array[sorted[current_index[i]]].process_id, fh) == i) { /* current id owns this entry!*/ /*Add and subtract length and create blocklength and displs array*/ if (bytes_remaining[i]) { /*Remaining bytes in the current entry of the global offset array*/ if (bytes_remaining[i] <= bytes_per_process[i]) { blocklen_per_process[i][disp_index[i] - 1] = bytes_remaining[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset + (global_iov_array[sorted[current_index[i]]].length - bytes_remaining[i]); blocklen_per_process[i] = (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int)); displs_per_process[i] = (MPI_Aint *)realloc ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint)); bytes_per_process[i] -= bytes_remaining[i]; blocklen_per_process[i][disp_index[i]] = 0; displs_per_process[i][disp_index[i]] = 0; bytes_remaining[i] = 0; disp_index[i] += 1; /* This entry has been used up, we need to move to the next entry of this process and make current_index point there*/ current_index[i] = find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1) { /* No more entries left, so Its all done! 
exit!*/ break; } continue; } else { blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset + (global_iov_array[sorted[current_index[i]]].length - bytes_remaining[i]); bytes_remaining[i] -= bytes_per_process[i]; bytes_per_process[i] = 0; break; } } else { if (bytes_per_process[i] < global_iov_array[sorted[current_index[i]]].length) { blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i]; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset; bytes_remaining[i] = global_iov_array[sorted[current_index[i]]].length - bytes_per_process[i]; bytes_per_process[i] = 0; break; } else { blocklen_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].length; displs_per_process[i][disp_index[i] - 1] = global_iov_array[sorted[current_index[i]]].offset; blocklen_per_process[i] = (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int)); displs_per_process[i] = (MPI_Aint *)realloc ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint)); blocklen_per_process[i][disp_index[i]] = 0; displs_per_process[i][disp_index[i]] = 0; disp_index[i] += 1; bytes_per_process[i] -= global_iov_array[sorted[current_index[i]]].length; current_index[i] = find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1) { break; } } } } else { current_index[i] = find_next_index(i, current_index[i], fh, global_iov_array, global_iov_count, sorted); if (current_index[i] == -1) { bytes_per_process[i] = 0; /* no more entries left to service this request*/ continue; } } } } entries_per_aggregator=0; for (i=0; i<fh->f_procs_per_group; i++) { for (j=0; j<disp_index[i]; j++) { if (blocklen_per_process[i][j] > 0) { entries_per_aggregator++; #if DEBUG_ON printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", fh->f_procs_in_group[i],j, 
blocklen_per_process[i][j],j, displs_per_process[i][j], fh->f_rank); #endif } } } if (entries_per_aggregator > 0) { file_offsets_for_agg = (local_io_array *) malloc(entries_per_aggregator*sizeof(local_io_array)); if (NULL == file_offsets_for_agg) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); if (NULL == sorted_file_offsets) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } temp_index = 0; for (i=0; i<fh->f_procs_per_group; i++) { for(j=0; j<disp_index[i]; j++) { if (blocklen_per_process[i][j] > 0) { file_offsets_for_agg[temp_index].length = blocklen_per_process[i][j]; file_offsets_for_agg[temp_index].process_id = i; file_offsets_for_agg[temp_index].offset = displs_per_process[i][j]; temp_index++; } } } } else { continue; } local_heap_sort (file_offsets_for_agg, entries_per_aggregator, sorted_file_offsets); memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); memory_displacements[sorted_file_offsets[0]] = 0; for (i=1; i<entries_per_aggregator; i++) { memory_displacements[sorted_file_offsets[i]] = memory_displacements[sorted_file_offsets[i-1]] + file_offsets_for_agg[sorted_file_offsets[i-1]].length; } temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int)); if (NULL == temp_disp_index) { opal_output (1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } global_count = 0; for (i=0; i<entries_per_aggregator; i++) { temp_pindex = file_offsets_for_agg[sorted_file_offsets[i]].process_id; displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] = memory_displacements[sorted_file_offsets[i]]; if (temp_disp_index[temp_pindex] < disp_index[temp_pindex]) temp_disp_index[temp_pindex] += 1; else { printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", temp_pindex, temp_disp_index[temp_pindex], temp_pindex, disp_index[temp_pindex]); } global_count += 
file_offsets_for_agg[sorted_file_offsets[i]].length; } if (NULL != temp_disp_index) { free(temp_disp_index); temp_disp_index = NULL; } #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); for (i=0; i<entries_per_aggregator; i++) { printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld, disp : %d\n", file_offsets_for_agg[sorted_file_offsets[i]].process_id, file_offsets_for_agg[sorted_file_offsets[i]].offset, file_offsets_for_agg[sorted_file_offsets[i]].length, memory_displacements[sorted_file_offsets[i]], disp_index[ file_offsets_for_agg[sorted_file_offsets[i]].process_id]); } #endif #if DEBUG_ON printf("%d: global_count : %ld, bytes_to_write_in_cycle : %ld, procs_per_group: %d\n", fh->f_rank, global_count, bytes_to_write_in_cycle, fh->f_procs_per_group); #endif #if TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif global_buf = (char *) malloc (global_count); if (NULL == global_buf) { opal_output(1, "OUT OF MEMORY"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } recv_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request)); if (NULL == recv_req) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } for (i=0; i<fh->f_procs_per_group; i++) { ompi_datatype_create_hindexed(disp_index[i], blocklen_per_process[i], displs_per_process[i], MPI_BYTE, &recvtype[i]); ompi_datatype_commit(&recvtype[i]); ret = MCA_PML_CALL(irecv(global_buf, 1, recvtype[i], fh->f_procs_in_group[i], 123, fh->f_comm, &recv_req[i])); if (OMPI_SUCCESS != ret) { fprintf(stderr,"irecv Error!\n"); goto exit; } } } if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) { send_buf = &((char*)buf)[total_bytes_written]; } else if (bytes_to_write_in_cycle) { /* allocate a send buffer and copy the data that needs to be sent into it in case the data is non-contigous in memory */ OPAL_PTRDIFF_TYPE mem_address; size_t remaining = 0; size_t temp_position = 0; send_buf = malloc (bytes_to_write_in_cycle); if (NULL == send_buf) { 
opal_output (1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } remaining = bytes_to_write_in_cycle; while (remaining) { mem_address = (OPAL_PTRDIFF_TYPE) (decoded_iov[iov_index].iov_base) + current_position; if (remaining >= (decoded_iov[iov_index].iov_len - current_position)) { memcpy (send_buf+temp_position, (IOVBASE_TYPE *)mem_address, decoded_iov[iov_index].iov_len - current_position); remaining = remaining - (decoded_iov[iov_index].iov_len - current_position); temp_position = temp_position + (decoded_iov[iov_index].iov_len - current_position); iov_index = iov_index + 1; current_position = 0; } else { memcpy (send_buf+temp_position, (IOVBASE_TYPE *)mem_address, remaining); current_position = current_position + remaining; remaining = 0; } } } total_bytes_written += bytes_to_write_in_cycle; send_req = (MPI_Request *) malloc (sizeof(MPI_Request)); if (NULL == send_req) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } ret = MCA_PML_CALL(isend(send_buf, bytes_to_write_in_cycle, MPI_BYTE, fh->f_procs_in_group[fh->f_aggregator_index], 123, MCA_PML_BASE_SEND_STANDARD, fh->f_comm, send_req)); if ( OMPI_SUCCESS != ret ) { fprintf(stderr,"isend error!\n"); goto exit; } ret = ompi_request_wait (send_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret) { goto exit; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { ret = ompi_request_wait_all (fh->f_procs_per_group, recv_req, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != ret) { goto exit; } #if DEBUG_ON printf("************Cycle: %d, Aggregator: %d ***************\n", index+1,fh->f_rank); if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { for (i=0 ; i<global_count/4 ; i++) printf (" RECV %d \n",((int *)global_buf)[i]); } #endif } #if TIME_BREAKDOWN end_comm_time = MPI_Wtime(); comm_time += end_comm_time - start_comm_time; #endif if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { fh->f_io_array = (mca_io_ompio_io_array_t *) malloc 
(entries_per_aggregator * sizeof (mca_io_ompio_io_array_t)); if (NULL == fh->f_io_array) { opal_output(1, "OUT OF MEMORY\n"); return OMPI_ERR_OUT_OF_RESOURCE; } fh->f_num_of_io_entries = 0; /*First entry for every aggregator*/ fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[0]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[0]]; fh->f_num_of_io_entries++; for (i=1; i<entries_per_aggregator; i++) { if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + file_offsets_for_agg[sorted_file_offsets[i-1]].length == file_offsets_for_agg[sorted_file_offsets[i]].offset) { fh->f_io_array[fh->f_num_of_io_entries - 1].length += file_offsets_for_agg[sorted_file_offsets[i]].length; } else { fh->f_io_array[fh->f_num_of_io_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; fh->f_io_array[fh->f_num_of_io_entries].length = file_offsets_for_agg[sorted_file_offsets[i]].length; fh->f_io_array[fh->f_num_of_io_entries].memory_address = global_buf+memory_displacements[sorted_file_offsets[i]]; fh->f_num_of_io_entries++; } } #if DEBUG_ON printf("*************************** %d\n", fh->f_num_of_io_entries); for (i=0 ; i<fh->f_num_of_io_entries ; i++) { printf(" ADDRESS: %p OFFSET: %ld LENGTH: %ld\n", fh->f_io_array[i].memory_address, (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset, fh->f_io_array[i].length); } #endif #if TIME_BREAKDOWN start_write_time = MPI_Wtime(); #endif if (fh->f_num_of_io_entries) { if ( 0 > fh->f_fbtl->fbtl_pwritev (fh)) { opal_output (1, "WRITE FAILED\n"); ret = OMPI_ERROR; goto exit; } } #if TIME_BREAKDOWN end_write_time = MPI_Wtime(); write_time += end_write_time - start_write_time; #endif } if (NULL != send_req) { free(send_req); send_req = NULL; } if 
(fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { fh->f_num_of_io_entries = 0; if (NULL != fh->f_io_array) { free (fh->f_io_array); fh->f_io_array = NULL; } for (i = 0; i < fh->f_procs_per_group; i++) ompi_datatype_destroy(recvtype+i); if (NULL != recvtype) { free(recvtype); recvtype=NULL; } if (NULL != recv_req) { free(recv_req); recv_req = NULL; } if (NULL != global_buf) { free (global_buf); global_buf = NULL; } } } #if TIME_BREAKDOWN end_exch = MPI_Wtime(); exch_write += end_exch - start_exch; nentry.time[0] = write_time; nentry.time[1] = comm_time; nentry.time[2] = exch_write; if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) nentry.aggregator = 1; else nentry.aggregator = 0; nentry.nprocs_for_coll = static_num_io_procs; if (!ompi_io_ompio_full_print_queue(WRITE_PRINT_QUEUE)) { ompi_io_ompio_register_print_entry(WRITE_PRINT_QUEUE, nentry); } #endif exit: if (NULL != decoded_iov) { free(decoded_iov); decoded_iov = NULL; } if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) { if (NULL != disp_index) { free(disp_index); disp_index = NULL; } if (NULL != local_iov_array) { free(local_iov_array); local_iov_array = NULL; } for(l=0; l<fh->f_procs_per_group; l++) { if (NULL != blocklen_per_process[l]) { free(blocklen_per_process[l]); blocklen_per_process[l] = NULL; } if (NULL != displs_per_process[l]) { free(displs_per_process[l]); displs_per_process[l] = NULL; } } if (NULL != blocklen_per_process) { free(blocklen_per_process); blocklen_per_process = NULL; } if (NULL != displs_per_process) { free(displs_per_process); displs_per_process = NULL; } if(NULL != bytes_remaining) { free(bytes_remaining); bytes_remaining = NULL; } if(NULL != current_index) { free(current_index); current_index = NULL; } } return ret; }