/*
 * ADIOI_Calc_file_realms - set up the per-aggregator file realm start
 * offsets and datatypes and store them in fd->file_realm_st_offs and
 * fd->file_realm_types.
 *
 *   fd              - file handle; its hints (cb_nodes, cb_fr_type, cb_pfr)
 *                     select how the realms are computed
 *   min_st_offset   - smallest starting offset of the access
 *   max_end_offset  - largest ending offset of the access
 *
 * Behavior:
 *   - When persistent file realms (cb_pfr) are not enabled, the pointers
 *     stored in fd are treated as unallocated and reset to NULL first.
 *   - With exactly one aggregator the single realm is rebuilt on every call
 *     as a contiguous byte range [min_st_offset, max_end_offset], reusing
 *     the arrays already stored in fd when there are any.
 *   - With several aggregators the realms are computed only when fd holds
 *     none yet.  Realms that already exist are kept as they are (previously
 *     they were overwritten with NULL in that case, which leaked the arrays
 *     and left fd without any realms).
 */
void ADIOI_Calc_file_realms (ADIO_File fd, ADIO_Offset min_st_offset,
                             ADIO_Offset max_end_offset)
{
    int nprocs_for_coll;
    int file_realm_calc_type;

    MPI_Datatype *file_realm_types = NULL;
    ADIO_Offset *file_realm_st_offs = NULL;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5004, 0, NULL);
#endif
#ifdef DEBUG
    printf ("ADIOI_Calc_file_realms\n");
#endif

    nprocs_for_coll = fd->hints->cb_nodes;
    file_realm_calc_type = fd->hints->cb_fr_type;

    /* If PFRs are disabled we know these pointers are not allocated */
    if (fd->hints->cb_pfr != ADIOI_HINT_ENABLE) {
        fd->file_realm_st_offs = NULL;
        fd->file_realm_types = NULL;
    }

    /* Start from whatever fd currently holds so that a call which does not
     * need to (re)allocate leaves the stored realms untouched when the
     * results are written back at the end. */
    file_realm_st_offs = fd->file_realm_st_offs;
    file_realm_types = fd->file_realm_types;

    if (nprocs_for_coll == 1) {
        /* if there's only one aggregator, we can reset the file
         * realms every single time */
        if (file_realm_st_offs == NULL) {
            file_realm_st_offs = (ADIO_Offset *)
                ADIOI_Malloc (sizeof(ADIO_Offset));
            file_realm_types = (MPI_Datatype *)
                ADIOI_Malloc (sizeof(MPI_Datatype));
        }
        /* NOTE(review): when the arrays are reused, the datatype previously
         * stored in file_realm_types[0] is overwritten without being freed
         * here - confirm whether a caller releases it. */
        *file_realm_st_offs = min_st_offset;
        /* NOTE(review): the count argument of MPI_Type_contiguous is an
         * int; a range larger than INT_MAX bytes would be truncated. */
        MPI_Type_contiguous ((max_end_offset - min_st_offset + 1), MPI_BYTE,
                             file_realm_types);
        MPI_Type_commit (file_realm_types);
        ADIOI_Add_contig_flattened (*file_realm_types);
    }
    else if (file_realm_st_offs == NULL) {
        file_realm_st_offs = (ADIO_Offset *)
            ADIOI_Malloc (nprocs_for_coll * sizeof(ADIO_Offset));
        file_realm_types = (MPI_Datatype *)
            ADIOI_Malloc (nprocs_for_coll * sizeof(MPI_Datatype));

        if (file_realm_calc_type == ADIOI_FR_AAR) {
            ADIOI_Calc_file_realms_aar (fd, nprocs_for_coll,
                                        fd->hints->cb_pfr,
                                        min_st_offset, max_end_offset,
                                        file_realm_st_offs, file_realm_types);
            /* flatten file realm datatype for future use - only one
             * because all are the same*/
            ADIOI_Flatten_datatype (file_realm_types[0]);
        }
        else if (file_realm_calc_type == ADIOI_FR_FSZ) {
            ADIOI_Calc_file_realms_fsize (fd, nprocs_for_coll, max_end_offset,
                                          file_realm_st_offs,
                                          file_realm_types);
            /* flatten file realm datatype for future use - only one
             * because all are the same*/
            ADIOI_Flatten_datatype (file_realm_types[0]);
        }
        else if (file_realm_calc_type == ADIOI_FR_USR_REALMS) {
            /* copy user provided realm datatypes and realm offsets in
             * hints to file descriptor.  may also want to verify that
             * the provided file realms are covering (for pfr at
             * least) and non-overlapping */
            /* NOTE(review): not implemented - the freshly allocated arrays
             * are left uninitialized in this case. */
        }
        else if (file_realm_calc_type > 0) {
            /* a positive calc type is passed on as the realm size */
            ADIOI_Calc_file_realms_user_size (fd, file_realm_calc_type,
                                              nprocs_for_coll,
                                              file_realm_st_offs,
                                              file_realm_types);
            /* flatten file realm datatype for future use - only one
             * because all are the same */
            ADIOI_Flatten_datatype (file_realm_types[0]);
        }
    }

    fd->file_realm_st_offs = file_realm_st_offs;
    fd->file_realm_types = file_realm_types;
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5005, 0, NULL);
#endif
}
/* ADIOI_Exch_file_views - Sends all the aggregators the file
 * views and file view states of the clients.  It fills in the
 * client_file_view_state_arr for the aggregators and the
 * my_mem_view_state for the client.  It also initializes the
 * agg_file_view_state for all clients, which is the view for each
 * aggregator of a client's filetype.
 *
 *   myrank                     - not referenced in this function
 *   nprocs                     - number of processes taking part in the
 *                                exchange; sizes the per-process arrays
 *   file_ptr_type              - passed through to ADIOI_init_view_state
 *   fd                         - file handle (filetype, fp_ind, disp,
 *                                hints, is_agg and comm are read)
 *   count, datatype            - memory access description
 *   off                        - stored as byte_off of the file view state
 *   my_mem_view_state_arr      - out: memory view state, written at the
 *                                rank of every aggregator
 *   agg_file_view_state_arr    - out: file view state, written at the
 *                                rank of every aggregator
 *   client_file_view_state_arr - out (aggregators only): file view state
 *                                received from each client, indexed by rank
 *
 * The flat_type_p nodes allocated here for client_file_view_state_arr are
 * not released by this function.
 */
void ADIOI_Exch_file_views(int myrank, int nprocs, int file_ptr_type,
                           ADIO_File fd, int count,
                           MPI_Datatype datatype, ADIO_Offset off,
                           view_state *my_mem_view_state_arr,
                           view_state *agg_file_view_state_arr,
                           view_state *client_file_view_state_arr)
{
    /* Convert my own fileview to an ADIOI_Flattened type and a
     * disp.  MPI_Alltoall the count of ADIOI_Flatlist nodes.
     * MPI_Isend/Irecv the block_lens, indices of ADIOI_Flatlist node
     * to/from each of the aggregators with the rest of the file view
     * state. */

    int i = -1, j = -1;
    amount_and_extra_data_t *send_count_arr = NULL;
    amount_and_extra_data_t *recv_count_arr = NULL;
    int send_req_arr_sz = 0;
    int recv_req_arr_sz = 0;
    MPI_Request *send_req_arr = NULL, *recv_req_arr = NULL;
    MPI_Status *statuses = NULL;
    ADIO_Offset disp_off_sz_ext_typesz[6];
    MPI_Aint memtype_extent, filetype_extent, lb;
    int ret = -1;

    /* parameters for datatypes */
    ADIOI_Flatlist_node *flat_mem_p = NULL, *flat_file_p = NULL;
    MPI_Count memtype_sz = -1;
    /* MPI_Type_size_x writes through an MPI_Count pointer, so the filetype
     * size is fetched into an MPI_Count and converted afterwards. */
    MPI_Count filetype_sz_cnt = -1;
    /* Must start out false: it is only ever set to 1 below, and a non-zero
     * sentinel would make the "memory is contiguous" adjustment further
     * down apply to non-contiguous types as well. */
    int memtype_is_contig = 0;
    ADIO_Offset filetype_sz = -1;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5014, 0, NULL);
#endif
    /* The memtype will be freed after the call.  The filetype will be
     * freed in the close and should have been flattened in the file
     * view. */
    MPI_Type_size_x(datatype, &memtype_sz);
    MPI_Type_get_extent(datatype, &lb, &memtype_extent);
    if (memtype_sz == memtype_extent) {
        memtype_is_contig = 1;
        flat_mem_p = ADIOI_Add_contig_flattened(datatype);
        flat_mem_p->blocklens[0] = memtype_sz*count;
    }
    else {
        flat_mem_p = ADIOI_Flatten_and_find(datatype);
    }

    MPI_Type_get_extent(fd->filetype, &lb, &filetype_extent);
    MPI_Type_size_x(fd->filetype, &filetype_sz_cnt);
    filetype_sz = (ADIO_Offset) filetype_sz_cnt;
    if (filetype_extent == filetype_sz) {
        /* contiguous filetype: describe the whole access as one block */
        flat_file_p = ADIOI_Add_contig_flattened(fd->filetype);
        flat_file_p->blocklens[0] = memtype_sz*count;
        filetype_extent = memtype_sz*count;
        filetype_sz = filetype_extent;
    }
    else {
        /* The filetype is expected to be in the flattened list already;
         * this walk does not handle a missing entry. */
        flat_file_p = ADIOI_Flatlist;
        while (flat_file_p->type != fd->filetype)
            flat_file_p = flat_file_p->next;
    }

    disp_off_sz_ext_typesz[0] = fd->fp_ind;
    disp_off_sz_ext_typesz[1] = fd->disp;
    disp_off_sz_ext_typesz[2] = off;
    disp_off_sz_ext_typesz[3] = memtype_sz*count;
    disp_off_sz_ext_typesz[4] = (ADIO_Offset) filetype_extent;
    disp_off_sz_ext_typesz[5] = (ADIO_Offset) filetype_sz;

    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        /* alltoall mode: both count arrays are indexed by rank */
        recv_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
        send_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
    } else {
        /* point-to-point mode: send_count_arr is indexed by aggregator
         * number (position in ranklist), not by rank */
        send_count_arr = ADIOI_Calloc(fd->hints->cb_nodes,
                                      sizeof(amount_and_extra_data_t));

        /* only aggregators receive data */
        if (fd->is_agg) {
            recv_count_arr = ADIOI_Calloc(nprocs,
                                          sizeof(amount_and_extra_data_t));
            recv_req_arr = ADIOI_Malloc (nprocs * sizeof(MPI_Request));
            for (i=0; i < nprocs; i++)
                MPI_Irecv (&recv_count_arr[i],
                           sizeof(amount_and_extra_data_t),
                           MPI_BYTE, i, COUNT_EXCH, fd->comm,
                           &recv_req_arr[i]);
        }

        /* only send data to aggregators */
        send_req_arr = ADIOI_Calloc (fd->hints->cb_nodes,
                                     sizeof(MPI_Request));
        for (i=0; i < fd->hints->cb_nodes; i++) {
            send_count_arr[i].count    = flat_file_p->count;
            send_count_arr[i].fp_ind   = disp_off_sz_ext_typesz[0];
            send_count_arr[i].disp     = disp_off_sz_ext_typesz[1];
            send_count_arr[i].byte_off = disp_off_sz_ext_typesz[2];
            send_count_arr[i].sz       = disp_off_sz_ext_typesz[3];
            send_count_arr[i].ext      = disp_off_sz_ext_typesz[4];
            send_count_arr[i].type_sz  = disp_off_sz_ext_typesz[5];
            MPI_Isend (&send_count_arr[i], sizeof(amount_and_extra_data_t),
                       MPI_BYTE, fd->hints->ranklist[i], COUNT_EXCH,
                       fd->comm, &send_req_arr[i]);
        }
    }

    /* Every client has to build mem and file view_states for each aggregator.
     * We initialize their values here.  and we also initialize
     * send_count_arr */

    if (memtype_is_contig) {
        /* if memory is contiguous, we now replace memtype_sz and
         * memtype_extent with the full access size */
        memtype_sz *= count;
        memtype_extent = memtype_sz;
    }

    for (i = 0; i < fd->hints->cb_nodes; i++) {
        int tmp_agg_idx = fd->hints->ranklist[i];
        view_state *mem_vs = &(my_mem_view_state_arr[tmp_agg_idx]);
        view_state *file_vs = &(agg_file_view_state_arr[tmp_agg_idx]);

        memset(mem_vs, 0, sizeof(view_state));
        mem_vs->sz          = disp_off_sz_ext_typesz[3];
        mem_vs->ext         = (ADIO_Offset) memtype_extent;
        mem_vs->type_sz     = (ADIO_Offset) memtype_sz;
        mem_vs->flat_type_p = flat_mem_p;
        ADIOI_init_view_state(file_ptr_type, 1, mem_vs, TEMP_OFF);
        ADIOI_init_view_state(file_ptr_type, 1, mem_vs, REAL_OFF);

        memset(file_vs, 0, sizeof(view_state));
        file_vs->fp_ind      = disp_off_sz_ext_typesz[0];
        file_vs->disp        = disp_off_sz_ext_typesz[1];
        file_vs->byte_off    = disp_off_sz_ext_typesz[2];
        file_vs->sz          = disp_off_sz_ext_typesz[3];
        file_vs->ext         = disp_off_sz_ext_typesz[4];
        file_vs->type_sz     = disp_off_sz_ext_typesz[5];
        file_vs->flat_type_p = flat_file_p;
        ADIOI_init_view_state(file_ptr_type, 1, file_vs, TEMP_OFF);
        ADIOI_init_view_state(file_ptr_type, 1, file_vs, REAL_OFF);

        if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
            /* entries for non-aggregator ranks stay zeroed from the
             * calloc above */
            send_count_arr[tmp_agg_idx].count    = flat_file_p->count;
            send_count_arr[tmp_agg_idx].fp_ind   = disp_off_sz_ext_typesz[0];
            send_count_arr[tmp_agg_idx].disp     = disp_off_sz_ext_typesz[1];
            send_count_arr[tmp_agg_idx].byte_off = disp_off_sz_ext_typesz[2];
            send_count_arr[tmp_agg_idx].sz       = disp_off_sz_ext_typesz[3];
            send_count_arr[tmp_agg_idx].ext      = disp_off_sz_ext_typesz[4];
            send_count_arr[tmp_agg_idx].type_sz  = disp_off_sz_ext_typesz[5];
        }
    }

#ifdef DEBUG2
    fprintf(stderr, "my own flattened memtype: ");
    ADIOI_Print_flatlist_node(flat_mem_p);
    fprintf(stderr, "my own flattened filetype: ");
    ADIOI_Print_flatlist_node(flat_file_p);
#endif

    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        ret = MPI_Alltoall(send_count_arr, sizeof(amount_and_extra_data_t),
                           MPI_BYTE,
                           recv_count_arr, sizeof(amount_and_extra_data_t),
                           MPI_BYTE, fd->comm);
        if (ret != MPI_SUCCESS) {
            fprintf(stderr, "ADIOI_Exchange_file_views: MPI_Alltoall failed "
                    "with error %d", ret);
            /* release the count arrays allocated above before bailing out */
            ADIOI_Free(send_count_arr);
            ADIOI_Free(recv_count_arr);
            return;
        }
    } else {
        statuses = (MPI_Status *) ADIOI_Malloc(1 + nprocs * sizeof(MPI_Status));
        if (fd->is_agg) {
            MPI_Waitall(nprocs, recv_req_arr, statuses);
            ADIOI_Free(recv_req_arr);
            recv_req_arr = NULL;
        }
        MPI_Waitall(fd->hints->cb_nodes, send_req_arr, statuses);
        ADIOI_Free(statuses);
        ADIOI_Free(send_req_arr);
        statuses = NULL;
        send_req_arr = NULL;
    }
#ifdef DEBUG2
    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        fprintf(stderr, "send_count_arr:");
        for (i = 0; i < nprocs; i++) {
            fprintf(stderr, "[%d]=%d ", i, send_count_arr[i].count);
        }
        fprintf(stderr, "\n");
        fprintf(stderr, "recv_count_arr:");
        for (i = 0; i < nprocs; i++) {
            fprintf(stderr, "[%d]=%d ", i, recv_count_arr[i].count);
        }
        fprintf(stderr, "\n");
    } else {
        fprintf(stderr, "send_count_arr:");
        for (i = 0; i < fd->hints->cb_nodes; i++) {
            fprintf(stderr, "[%d]=%d ", i, send_count_arr[i].count);
        }
        fprintf(stderr, "\n");
        if (fd->is_agg) {
            fprintf(stderr, "recv_count_arr:");
            for (i = 0; i < nprocs; i++) {
                fprintf(stderr, "[%d]=%d ", i, recv_count_arr[i].count);
            }
            fprintf(stderr, "\n");
        }
    }
#endif

    if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
        for (i=0; i < fd->hints->cb_nodes; i++)
            if (send_count_arr[i].count > 0)
                send_req_arr_sz++;
    }
    /* Figure out how many counts to send/recv */
    for (i = 0; i < nprocs; i++) {
        if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
            if (send_count_arr[i].count > 0)
                send_req_arr_sz++;
        }
        /* Only aggregators should recv*/
        if (fd->is_agg) {
            if (recv_count_arr[i].count > 0) {
                /* NOTE(review): a NULL result is only reported; the
                 * pointer is dereferenced right below regardless. */
                if ((client_file_view_state_arr[i].flat_type_p =
                     (ADIOI_Flatlist_node *) ADIOI_Malloc(
                         sizeof(ADIOI_Flatlist_node))) == NULL) {
                    fprintf(stderr, "ADIOI_Exchange_file_views: malloc "
                            "flat_type_p failed\n");
                }
                client_file_view_state_arr[i].flat_type_p->count =
                    recv_count_arr[i].count;
                client_file_view_state_arr[i].flat_type_p->indices =
                    (ADIO_Offset *) ADIOI_Calloc(recv_count_arr[i].count,
                                                 sizeof(ADIO_Offset));
                client_file_view_state_arr[i].flat_type_p->blocklens =
                    (ADIO_Offset *) ADIOI_Calloc(recv_count_arr[i].count,
                                                 sizeof(ADIO_Offset));

                /* Copy the extra data out of the stuff we Alltoall'd.
                 * This copies six consecutive ADIO_Offset values starting
                 * at fp_ind in both structures. */
                memcpy (&client_file_view_state_arr[i].fp_ind,
                        &recv_count_arr[i].fp_ind,
                        6*sizeof(ADIO_Offset));

                recv_req_arr_sz++;
            }
        }
    }

    /* Since ADIOI_Calloc may do other things we add the +1
     * to avoid a 0-size malloc */
    send_req_arr = (MPI_Request *) ADIOI_Calloc(2*(send_req_arr_sz)+1,
                                                sizeof(MPI_Request));

    /* two receives (indices, block lengths) per client with data */
    j = 0;
    if (recv_req_arr_sz > 0) {
        assert (fd->is_agg);
        recv_req_arr = (MPI_Request *) ADIOI_Calloc(2*(recv_req_arr_sz),
                                                    sizeof(MPI_Request));
        for (i = 0; i < nprocs; i++) {
            if (recv_count_arr[i].count > 0) {
                MPI_Irecv(client_file_view_state_arr[i].flat_type_p->indices,
                          recv_count_arr[i].count, ADIO_OFFSET, i,
                          INDICES, fd->comm, &recv_req_arr[j]);
                j++;
                MPI_Irecv(client_file_view_state_arr[i].flat_type_p->blocklens,
                          recv_count_arr[i].count, ADIO_OFFSET, i,
                          BLOCK_LENS, fd->comm, &recv_req_arr[j]);
                j++;
            }
        }
    }

    /* two sends (indices, block lengths) per destination */
    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        j = 0;
        for (i = 0; i < nprocs; i++) {
            if (send_count_arr[i].count > 0) {
                MPI_Isend(flat_file_p->indices,
                          send_count_arr[i].count, ADIO_OFFSET, i,
                          INDICES, fd->comm, &send_req_arr[j]);
                j++;
                MPI_Isend(flat_file_p->blocklens,
                          send_count_arr[i].count, ADIO_OFFSET, i,
                          BLOCK_LENS, fd->comm, &send_req_arr[j]);
                j++;
            }
        }
    } else {
        j = 0;
        for (i = 0; i < fd->hints->cb_nodes; i++) {
            if (send_count_arr[i].count > 0) {
                MPI_Isend(flat_file_p->indices,
                          send_count_arr[i].count, ADIO_OFFSET,
                          fd->hints->ranklist[i], INDICES, fd->comm,
                          &send_req_arr[j]);
                j++;
                MPI_Isend(flat_file_p->blocklens,
                          send_count_arr[i].count, ADIO_OFFSET,
                          fd->hints->ranklist[i], BLOCK_LENS, fd->comm,
                          &send_req_arr[j]);
                j++;
            }
        }
    }

    /* Since ADIOI_Malloc may do other things we add the +1
     * to avoid a 0-size malloc */
    statuses = (MPI_Status *)
        ADIOI_Malloc(1 + 2 * ADIOI_MAX(send_req_arr_sz,recv_req_arr_sz)
                     * sizeof(MPI_Status));

    if (send_req_arr_sz > 0) {
        MPI_Waitall(2 * send_req_arr_sz, send_req_arr, statuses);
    }
    if (recv_req_arr_sz > 0) {
        MPI_Waitall(2 * recv_req_arr_sz, recv_req_arr, statuses);
        ADIOI_Free(recv_req_arr);
    }
    /* The count arrays and the send request array exist even when nothing
     * was sent or received (e.g. recv_count_arr on a non-aggregator in
     * alltoall mode), so they are released unconditionally here. */
    ADIOI_Free(send_count_arr);
    ADIOI_Free(send_req_arr);
    /* recv_count_arr is NULL on non-aggregators in point-to-point mode;
     * how ADIOI_Free treats NULL is not visible here, so guard the call. */
    if (recv_count_arr != NULL) {
        ADIOI_Free(recv_count_arr);
    }
    ADIOI_Free(statuses);

    if (fd->is_agg == 1) {
        ADIOI_init_view_state(file_ptr_type, nprocs,
                              client_file_view_state_arr, TEMP_OFF);
        ADIOI_init_view_state(file_ptr_type, nprocs,
                              client_file_view_state_arr, REAL_OFF);
    }

#ifdef DEBUG
    if (fd->is_agg == 1) {
        ADIOI_Flatlist_node *fr_node_p = ADIOI_Flatlist;
        for (i = 0; i < nprocs; i++) {
            fprintf(stderr, "client_file_view_state_arr[%d]=(fp_ind=%Ld,"
                    "disp=%Ld,byte_off=%Ld,sz=%Ld,ext=%Ld\n", i,
                    client_file_view_state_arr[i].fp_ind,
                    client_file_view_state_arr[i].disp,
                    client_file_view_state_arr[i].byte_off,
                    client_file_view_state_arr[i].sz,
                    client_file_view_state_arr[i].ext);
        }

        /* stop at the end of the list so the assert below can fire
         * instead of the loop dereferencing a NULL node */
        while (fr_node_p != NULL &&
               fr_node_p->type != fd->file_realm_types[fd->my_cb_nodes_index])
            fr_node_p = fr_node_p->next;
        assert(fr_node_p != NULL);

        fprintf(stderr, "my file realm (idx=%d,st_off=%Ld) ",
                fd->my_cb_nodes_index,
                fd->file_realm_st_offs[fd->my_cb_nodes_index]);
        ADIOI_Print_flatlist_node(fr_node_p);
    }
#endif

#ifdef DEBUG2
    if (fd->is_agg == 1) {
        for (i = 0; i < nprocs; i++) {
            fprintf(stderr, "client_file_view_state_arr[%d]: ", i);
            ADIOI_Print_flatlist_node(
                client_file_view_state_arr[i].flat_type_p);
        }
    }
#endif
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5015, 0, NULL);
#endif
}