/* Convert an MPI datatype into an equivalent PVFS2 request.
 *
 * The datatype is decoded with MPI_Type_get_envelope() /
 * MPI_Type_get_contents() and rebuilt recursively with the matching
 * PVFS_Request_* constructor.
 *
 * Returns:
 *   1  - *mpi_dtype was a named (predefined) leaf type
 *   0  - a derived type was converted successfully
 *  -1  - allocation failure, PVFS error, or unsupported combiner
 */
int convert_mpi_pvfs2_dtype(MPI_Datatype *mpi_dtype, PVFS_Request *pvfs_dtype)
{
    int num_int = -1, num_addr = -1, num_dtype = -1,
        combiner = -1, i = -1, ret = -1, leaf = -1;
    int *arr_int = NULL;
    MPI_Aint *arr_addr = NULL;
    MPI_Datatype *arr_dtype = NULL;
    PVFS_Request *old_pvfs_dtype = NULL;
    PVFS_Request *old_pvfs_dtype_arr = NULL;
    int arr_count = -1;
    PVFS_size *pvfs_arr_disp = NULL;
    int *pvfs_arr_len = NULL;

    MPI_Type_get_envelope(*mpi_dtype, &num_int, &num_addr,
                          &num_dtype, &combiner);

    /* Named (predefined) types are the leaves of the recursion. */
    if (combiner == MPI_COMBINER_NAMED) {
        convert_named(mpi_dtype, pvfs_dtype, combiner);
        return 1;
    }

    /* Allocate space for the arrays necessary for
     * MPI_Type_get_contents */
    if ((arr_int = ADIOI_Malloc(sizeof(int)*num_int)) == NULL) {
        fprintf(stderr, "Failed to allocate array_int\n");
        return -1;
    }
    /* BUGFIX: this array holds MPI_Aint elements; the previous
     * sizeof(int) under-allocated wherever MPI_Aint is wider than int
     * and let MPI_Type_get_contents() overflow the buffer. */
    if ((arr_addr = ADIOI_Malloc(sizeof(MPI_Aint)*num_addr)) == NULL) {
        ADIOI_Free(arr_int);
        fprintf(stderr, "Failed to allocate array_addr\n");
        return -1;
    }
    if ((arr_dtype = ADIOI_Malloc(sizeof(MPI_Datatype)*num_dtype)) == NULL) {
        ADIOI_Free(arr_int);
        ADIOI_Free(arr_addr);
        fprintf(stderr, "Failed to allocate array_dtypes\n");
        return -1;
    }

    MPI_Type_get_contents(*mpi_dtype, num_int, num_addr, num_dtype,
                          arr_int, arr_addr, arr_dtype);

    /* If it's not a predefined datatype, it is either a
     * derived datatype or a structured datatype */
    if (combiner != MPI_COMBINER_STRUCT) {
        if ((old_pvfs_dtype = ADIOI_Malloc(sizeof(PVFS_Request))) == NULL) {
            fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                    "Failed to allocate PVFS_Request\n");
            /* BUGFIX: the old code fell through and dereferenced the
             * NULL request; bail out with cleanup instead. */
            ADIOI_Free(arr_int);
            ADIOI_Free(arr_addr);
            ADIOI_Free(arr_dtype);
            return -1;
        }
        switch (combiner) {
            case MPI_COMBINER_CONTIGUOUS:
                leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
                ret = PVFS_Request_contiguous(arr_int[0],
                                              *old_pvfs_dtype, pvfs_dtype);
                break;
            case MPI_COMBINER_VECTOR:
                leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
                ret = PVFS_Request_vector(arr_int[0], arr_int[1], arr_int[2],
                                          *old_pvfs_dtype, pvfs_dtype);
                break;
            case MPI_COMBINER_HVECTOR:
                leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
                ret = PVFS_Request_hvector(arr_int[0], arr_int[1], arr_addr[0],
                                           *old_pvfs_dtype, pvfs_dtype);
                break;
            /* Both INDEXED and HINDEXED types require PVFS_size
             * displacement arrays.  Therefore, we need to copy and
             * convert the data from MPI_Type_get_contents() into
             * a PVFS_size buffer */
            case MPI_COMBINER_INDEXED:
                leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
                if ((pvfs_arr_disp =
                     ADIOI_Malloc(arr_int[0]*sizeof(PVFS_size))) == NULL) {
                    fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                            "Failed to allocate pvfs_arr_disp\n");
                    break;      /* ret stays -1; common cleanup below */
                }
                /* contents layout: arr_int[0]=count, [1..count]=blocklens,
                 * [count+1..2*count]=displacements (in units of oldtype) */
                for (i = 0; i < arr_int[0]; i++) {
                    pvfs_arr_disp[i] = (PVFS_size) arr_int[arr_int[0]+1+i];
                }
                ret = PVFS_Request_indexed(arr_int[0], &arr_int[1],
                                           pvfs_arr_disp,
                                           *old_pvfs_dtype, pvfs_dtype);
                ADIOI_Free(pvfs_arr_disp);
                pvfs_arr_disp = NULL;
                break;
            case MPI_COMBINER_HINDEXED:
                leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
                if ((pvfs_arr_disp =
                     ADIOI_Malloc(arr_int[0]*sizeof(PVFS_size))) == NULL) {
                    fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                            "Failed to allocate pvfs_arr_disp\n");
                    break;      /* ret stays -1; common cleanup below */
                }
                for (i = 0; i < arr_int[0]; i++) {
                    pvfs_arr_disp[i] = (PVFS_size) arr_addr[i];
                }
                /* BUGFIX: pass the converted PVFS_size displacements.
                 * The old code cast &arr_addr[0] (MPI_Aint *) to
                 * int64_t *, leaving pvfs_arr_disp unused and reading
                 * garbage wherever MPI_Aint is not 64 bits. */
                ret = PVFS_Request_hindexed(arr_int[0], &arr_int[1],
                                            pvfs_arr_disp,
                                            *old_pvfs_dtype, pvfs_dtype);
                ADIOI_Free(pvfs_arr_disp);
                pvfs_arr_disp = NULL;
                break;
            case MPI_COMBINER_DUP:
                leaf = convert_mpi_pvfs2_dtype(&arr_dtype[0], old_pvfs_dtype);
                ret = PVFS_Request_contiguous(1,
                                              *old_pvfs_dtype, pvfs_dtype);
                break;
            /* The combiners below have no native PVFS2 support; ret
             * stays -1.  BUGFIX: after freeing the scratch request we
             * NULL the pointer so the common epilogue does not call
             * PVFS_Request_free()/ADIOI_Free() on freed memory. */
            case MPI_COMBINER_INDEXED_BLOCK:
                ADIOI_Free(old_pvfs_dtype);
                old_pvfs_dtype = NULL;
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "INDEXED_BLOCK is unsupported\n");
                break;
            case MPI_COMBINER_HINDEXED_BLOCK:
                ADIOI_Free(old_pvfs_dtype);
                old_pvfs_dtype = NULL;
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "HINDEXED_BLOCK is unsupported\n");
                break;
            case MPI_COMBINER_HINDEXED_INTEGER:
                ADIOI_Free(old_pvfs_dtype);
                old_pvfs_dtype = NULL;
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "HINDEXED_INTEGER is unsupported\n");
                break;
            case MPI_COMBINER_STRUCT_INTEGER:
                ADIOI_Free(old_pvfs_dtype);
                old_pvfs_dtype = NULL;
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "STRUCT_INTEGER is unsupported\n");
                break;
            case MPI_COMBINER_SUBARRAY:
                ADIOI_Free(old_pvfs_dtype);
                old_pvfs_dtype = NULL;
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "SUBARRAY is unsupported\n");
                break;
            case MPI_COMBINER_DARRAY:
                ADIOI_Free(old_pvfs_dtype);
                old_pvfs_dtype = NULL;
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "DARRAY is unsupported\n");
                break;
            case MPI_COMBINER_F90_REAL:
                ADIOI_Free(old_pvfs_dtype);
                old_pvfs_dtype = NULL;
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "F90_REAL is unsupported\n");
                break;
            case MPI_COMBINER_F90_COMPLEX:
                ADIOI_Free(old_pvfs_dtype);
                old_pvfs_dtype = NULL;
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "F90_COMPLEX is unsupported\n");
                break;
            case MPI_COMBINER_F90_INTEGER:
                ADIOI_Free(old_pvfs_dtype);
                old_pvfs_dtype = NULL;
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "F90_INTEGER is unsupported\n");
                break;
            case MPI_COMBINER_RESIZED:
                ADIOI_Free(old_pvfs_dtype);
                old_pvfs_dtype = NULL;
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "RESIZED is unsupported\n");
                break;
            default:
                break;
        }

        if (ret != 0)
            fprintf(stderr, "Error in PVFS_Request_* "
                    "for a derived datatype\n");

#ifdef DEBUG_DTYPE
        print_dtype_info(combiner, num_int, num_addr, num_dtype,
                         arr_int, arr_addr, arr_dtype);
#endif

        /* leaf == 1 means the subtype was predefined (must not be
         * freed); a DUP also keeps its handle alive. */
        if (leaf != 1 && combiner != MPI_COMBINER_DUP)
            MPI_Type_free(&arr_dtype[0]);

        ADIOI_Free(arr_int);
        ADIOI_Free(arr_addr);
        ADIOI_Free(arr_dtype);
        if (old_pvfs_dtype != NULL) {
            PVFS_Request_free(old_pvfs_dtype);
            ADIOI_Free(old_pvfs_dtype);
        }
        return ret;
    }
    else /* MPI_COMBINER_STRUCT */
    {
        MPI_Aint mpi_lb = -1, mpi_extent = -1;
        PVFS_offset pvfs_lb = -1;
        PVFS_size pvfs_extent = -1;
        int has_lb_ub = 0;

        /* A PVFS_Request_struct cannot express MPI_LB and MPI_UB
         * markers.  Therefore, we ignore all the MPI_LB and MPI_UB
         * members, get the lb and extent, and re-apply them afterwards
         * through a PVFS resized_req. */
        arr_count = 0;
        for (i = 0; i < arr_int[0]; i++) {
            if (arr_dtype[i] != MPI_LB && arr_dtype[i] != MPI_UB) {
                arr_count++;
            }
        }

        if (arr_int[0] != arr_count) {
            MPI_Type_get_extent(*mpi_dtype, &mpi_lb, &mpi_extent);
            pvfs_lb = mpi_lb;
            pvfs_extent = mpi_extent;
            if ((pvfs_arr_len = ADIOI_Malloc(arr_count*sizeof(int)))
                == NULL) {
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "Failed to allocate pvfs_arr_len\n");
                ADIOI_Free(arr_int);
                ADIOI_Free(arr_addr);
                ADIOI_Free(arr_dtype);
                return -1;
            }
            has_lb_ub = 1;
        }

        if ((old_pvfs_dtype_arr =
             ADIOI_Malloc(arr_count*sizeof(PVFS_Request))) == NULL) {
            fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                    "Failed to allocate PVFS_Requests\n");
            ADIOI_Free(arr_int);
            ADIOI_Free(arr_addr);
            ADIOI_Free(arr_dtype);
            if (pvfs_arr_len != NULL)
                ADIOI_Free(pvfs_arr_len);
            return -1;
        }
        if ((pvfs_arr_disp =
             ADIOI_Malloc(arr_count*sizeof(PVFS_size))) == NULL) {
            fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                    "Failed to allocate pvfs_arr_disp\n");
            ADIOI_Free(arr_int);
            ADIOI_Free(arr_addr);
            ADIOI_Free(arr_dtype);
            ADIOI_Free(old_pvfs_dtype_arr);
            if (pvfs_arr_len != NULL)
                ADIOI_Free(pvfs_arr_len);
            return -1;
        }

        /* Convert every non-marker member, recording its byte
         * displacement and (when resizing later) its block length. */
        arr_count = 0;
        for (i = 0; i < arr_int[0]; i++) {
            if (arr_dtype[i] != MPI_LB && arr_dtype[i] != MPI_UB) {
                leaf = convert_mpi_pvfs2_dtype(
                    &arr_dtype[i], &old_pvfs_dtype_arr[arr_count]);
                if (leaf != 1)
                    MPI_Type_free(&arr_dtype[i]);
                pvfs_arr_disp[arr_count] = (PVFS_size) arr_addr[i];
                if (has_lb_ub) {
                    /* block lengths start at arr_int[1] */
                    pvfs_arr_len[arr_count] = arr_int[i+1];
                }
                arr_count++;
            }
        }

        /* If a MPI_UB or MPI_LB did exist, we have to
         * resize the datatype */
        if (has_lb_ub) {
            PVFS_Request *tmp_pvfs_dtype = NULL;
            if ((tmp_pvfs_dtype = ADIOI_Malloc(sizeof(PVFS_Request)))
                == NULL) {
                fprintf(stderr, "convert_mpi_pvfs2_dtype: "
                        "Failed to allocate PVFS_Request\n");
                /* release the already-converted member requests */
                arr_count = 0;
                for (i = 0; i < arr_int[0]; i++) {
                    if (arr_dtype[i] != MPI_LB && arr_dtype[i] != MPI_UB)
                        PVFS_Request_free(&old_pvfs_dtype_arr[arr_count++]);
                }
                ADIOI_Free(arr_int);
                ADIOI_Free(arr_addr);
                ADIOI_Free(arr_dtype);
                ADIOI_Free(old_pvfs_dtype_arr);
                ADIOI_Free(pvfs_arr_disp);
                ADIOI_Free(pvfs_arr_len);
                return -1;
            }

            ret = PVFS_Request_struct(arr_count, pvfs_arr_len, pvfs_arr_disp,
                                      old_pvfs_dtype_arr, tmp_pvfs_dtype);
            if (ret != 0)
                fprintf(stderr, "Error in PVFS_Request_struct\n");

            arr_count = 0;
            for (i = 0; i < arr_int[0]; i++) {
                if (arr_dtype[i] != MPI_LB && arr_dtype[i] != MPI_UB) {
                    PVFS_Request_free(&old_pvfs_dtype_arr[arr_count]);
                    arr_count++;
                }
            }

#ifdef DEBUG_DTYPE
            fprintf(stderr, "STRUCT(WITHOUT %d LB or UB)(%d,[",
                    arr_int[0] - arr_count, arr_count);
            for (i = 0; i < arr_count; i++)
                fprintf(stderr, "(%d,%Ld) ",
                        pvfs_arr_len[i], pvfs_arr_disp[i]);
            fprintf(stderr, "]\n");
            fprintf(stderr, "RESIZED(LB = %Ld, EXTENT = %Ld)\n",
                    pvfs_lb, pvfs_extent);
#endif

            ret = PVFS_Request_resized(*tmp_pvfs_dtype,
                                       pvfs_lb, pvfs_extent, pvfs_dtype);
            if (ret != 0)
                fprintf(stderr, "Error in PVFS_Request_resize\n");

            PVFS_Request_free(tmp_pvfs_dtype);
            ADIOI_Free(tmp_pvfs_dtype);
        }
        else /* No MPI_LB or MPI_UB datatypes */
        {
            /* no markers were skipped, so arr_count == arr_int[0] and
             * the original blocklens (&arr_int[1]) can be used directly */
            ret = PVFS_Request_struct(arr_int[0], &arr_int[1], pvfs_arr_disp,
                                      old_pvfs_dtype_arr, pvfs_dtype);
            if (ret != 0)
                fprintf(stderr, "Error in PVFS_Request_struct\n");

            for (i = 0; i < arr_int[0]; i++) {
                if (arr_dtype[i] != MPI_LB && arr_dtype[i] != MPI_UB)
                    PVFS_Request_free(&old_pvfs_dtype_arr[i]);
            }

#ifdef DEBUG_DTYPE
            print_dtype_info(combiner, num_int, num_addr, num_dtype,
                             arr_int, arr_addr, arr_dtype);
#endif
        }

        ADIOI_Free(arr_int);
        ADIOI_Free(arr_addr);
        ADIOI_Free(arr_dtype);
        ADIOI_Free(old_pvfs_dtype_arr);
        ADIOI_Free(pvfs_arr_disp);
        /* BUGFIX: pvfs_arr_len is only allocated on the LB/UB path; the
         * old code handed a NULL pointer to ADIOI_Free here. */
        if (pvfs_arr_len != NULL)
            ADIOI_Free(pvfs_arr_len);
        return ret;
    }

    /* Not reachable: both branches above return. */
    fprintf(stderr, "convert_mpi_pvfs2_dtype: SERIOUS ERROR\n");
    return -1;
}
void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; ADIO_Offset i_offset, new_brd_size, brd_size, size; int i, j, k, err, err_flag, st_index=0; MPI_Count num, bufsize; int n_etypes_in_filetype; ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype; ADIO_Offset abs_off_in_filetype=0, new_frd_size, frd_size=0, st_frd_size; MPI_Count filetype_size, etype_size, buftype_size, partial_read; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off, req_len, sum; ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off; char *readbuf, *tmp_buf, *value; int info_flag; unsigned max_bufsize, readbuf_len; static char myname[] = "ADIOI_NFS_READSTRIDED"; ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size_x(fd->filetype, &filetype_size); if ( ! filetype_size ) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size_x(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(MPI_Count)buftype_size * (ADIO_Offset)count)); bufsize = buftype_size * count; /* get max_bufsize from the info object. */ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); max_bufsize = atoi(value); ADIOI_Free(value); if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ flat_buf = ADIOI_Flatten_and_find(datatype); off = (file_ptr_type == ADIO_INDIVIDUAL) ? 
fd->fp_ind : fd->disp + etype_size * offset; start_off = off; end_offset = off + bufsize - 1; readbuf_off = off; readbuf = (char *) ADIOI_Malloc(max_bufsize); readbuf_len = (unsigned) (MPL_MIN(max_bufsize, end_offset-readbuf_off+1)); /* if atomicity is true, lock (exclusive) the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif lseek(fd->fd_sys, readbuf_off, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); #endif err = read(fd->fd_sys, readbuf, readbuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); #endif if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len); if (err == -1) err_flag = 1; for (j=0; j<count; j++) for (i=0; i<flat_buf->count; i++) { userbuf_off = (ADIO_Offset)j*buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_READ off += flat_buf->blocklens[i]; } if (fd->atomicity) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; ADIOI_Free(readbuf); /* malloced in the buffered_read macro */ if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; } else { /* noncontiguous in file */ flat_file = ADIOI_Flatten_and_find(fd->filetype); disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { /* Wei-keng reworked type processing to be a bit more efficient */ offset = fd->fp_ind - disp; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent; offset -= (ADIO_Offset)n_filetypes * filetype_extent; /* now offset is local to this extent */ /* find the 
block where offset is located, skip blocklens[i]==0 */ for (i=0; i<flat_file->count; i++) { ADIO_Offset dist; if (flat_file->blocklens[i] == 0) continue; dist = flat_file->indices[i] + flat_file->blocklens[i] - offset; /* frd_size is from offset to the end of block i */ if (dist == 0) { i++; offset = flat_file->indices[i]; frd_size = flat_file->blocklens[i]; break; } if (dist > 0 ) { frd_size = dist; break; } } st_index = i; /* starting index in flat_file->indices[] */ offset += disp + (ADIO_Offset)n_filetypes*filetype_extent; } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = offset / n_etypes_in_filetype; etype_in_filetype = offset % n_etypes_in_filetype; size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; frd_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } start_off = offset; /* Wei-keng Liao: read request is within a single flat_file contig * block e.g. with subarray types that actually describe the whole * array */ if (buftype_is_contig && bufsize <= frd_size) { /* a count of bytes can overflow. operate on original type instead */ ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, offset, status, error_code); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte that * can be accessed in the fileview. */ fd->fp_ind = offset + bufsize; if (bufsize == frd_size) { do { st_index++; if (st_index == flat_file->count) { st_index = 0; n_filetypes++; } } while (flat_file->blocklens[st_index] == 0); fd->fp_ind = disp + flat_file->indices[st_index] + n_filetypes*filetype_extent; } } fd->fp_sys_posn = -1; /* set it to null. 
*/ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); #endif return; } /* Calculate end_offset, the last byte-offset that will be accessed. e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/ st_frd_size = frd_size; st_n_filetypes = n_filetypes; i_offset = 0; j = st_index; off = offset; frd_size = MPL_MIN(st_frd_size, bufsize); while (i_offset < bufsize) { i_offset += frd_size; end_offset = off + frd_size - 1; j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; frd_size = MPL_MIN(flat_file->blocklens[j], bufsize-i_offset); } /* if atomicity is true, lock (exclusive) the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); /* initial read into readbuf */ readbuf_off = offset; readbuf = (char *) ADIOI_Malloc(max_bufsize); readbuf_len = (int) (MPL_MIN(max_bufsize, end_offset-readbuf_off+1)); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif lseek(fd->fd_sys, offset, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, offset, SEEK_SET, readbuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); #endif err = read(fd->fd_sys, readbuf, readbuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); #endif if (!(fd->atomicity)) ADIOI_UNLOCK(fd, offset, SEEK_SET, readbuf_len); if (err == -1) err_flag = 1; if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i_offset = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; frd_size = MPL_MIN(st_frd_size, bufsize); while (i_offset < bufsize) { if (frd_size) { /* TYPE_UB and TYPE_LB can result in frd_size = 0. 
save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/ req_off = off; req_len = frd_size; userbuf_off = i_offset; ADIOI_BUFFERED_READ } i_offset += frd_size; if (off + frd_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent) off += frd_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by frd_size. */ else { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; frd_size = MPL_MIN(flat_file->blocklens[j], bufsize-i_offset); } } } else {
/*@
    MPI_File_read_ordered_begin - Begin a split collective read using
    shared file pointer

Input Parameters:
. fh - file handle (handle)
. count - number of elements in buffer (nonnegative integer)
. datatype - datatype of each buffer element (handle)

Output Parameters:
. buf - initial address of buffer (choice)

.N fortran
@*/
int MPI_File_read_ordered_begin(MPI_File fh, void *buf, int count,
                                MPI_Datatype datatype)
{
    int error_code, nprocs, myrank;
    MPI_Count datatype_size;
    int source, dest;
    ADIO_Offset shared_fp, incr;
    ADIO_File adio_fh;
    static char myname[] = "MPI_FILE_READ_ORDERED_BEGIN";
    /* xbuf: buffer actually handed to the read; e32_buf: temporary used
     * only when the file is in external32 representation */
    void *xbuf=NULL, *e32_buf=NULL;

    MPIU_THREAD_CS_ENTER(ALLFUNC,);

    adio_fh = MPIO_File_resolve(fh);

    /* --BEGIN ERROR HANDLING-- */
    /* NOTE(review): these MPIO_CHECK_* macros appear to set error_code
     * and transfer control on failure — verify their definitions before
     * assuming straight-line flow here. */
    MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code);
    MPIO_CHECK_COUNT(adio_fh, count, myname, error_code);
    MPIO_CHECK_DATATYPE(adio_fh, datatype, myname, error_code);

    /* A split collective is already outstanding on this handle:
     * beginning another one is an error. */
    if (adio_fh->split_coll_count) {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_IO,
                                          "**iosplitcoll", 0);
        error_code = MPIO_Err_return_file(adio_fh, error_code);
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    /* Mark the split collective as open; the matching _end call is
     * expected to clear it. */
    adio_fh->split_coll_count = 1;

    MPI_Type_size_x(datatype, &datatype_size);
    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_INTEGRAL_ETYPE(adio_fh, count, datatype_size, myname,
                              error_code);
    MPIO_CHECK_FS_SUPPORTS_SHARED(adio_fh, myname, error_code);
    MPIO_CHECK_COUNT_SIZE(adio_fh, count, datatype_size, myname, error_code);
    /* --END ERROR HANDLING-- */

    ADIOI_TEST_DEFERRED(adio_fh, myname, &error_code);

    MPI_Comm_size(adio_fh->comm, &nprocs);
    MPI_Comm_rank(adio_fh->comm, &myrank);

    /* Amount (in etypes) by which this rank advances the shared file
     * pointer for the ranks that follow it. */
    incr = (count*datatype_size)/adio_fh->etype_size;

    /* Use a message as a 'token' to order the operations: each rank
     * waits for rank-1 before reading/updating the shared fp, then
     * releases rank+1.  Ranks at the ends use MPI_PROC_NULL, which
     * makes the recv/send no-ops. */
    source = myrank - 1;
    dest = myrank + 1;
    if (source < 0) source = MPI_PROC_NULL;
    if (dest >= nprocs) dest = MPI_PROC_NULL;
    MPI_Recv(NULL, 0, MPI_BYTE, source, 0, adio_fh->comm, MPI_STATUS_IGNORE);

    ADIO_Get_shared_fp(adio_fh, incr, &shared_fp, &error_code);
    /* --BEGIN ERROR HANDLING-- */
    /* NOTE(review): returning here without the MPI_Send below leaves
     * the successor rank blocked in MPI_Recv — confirm whether upstream
     * intends the job to abort in this case. */
    if (error_code != MPI_SUCCESS) {
        error_code = MPIO_Err_return_file(adio_fh, error_code);
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    /* Pass the token on: our shared-fp update is complete. */
    MPI_Send(NULL, 0, MPI_BYTE, dest, 0, adio_fh->comm);

    xbuf = buf;
    if (adio_fh->is_external32) {
        MPI_Aint e32_size = 0;
        error_code = MPIU_datatype_full_size(datatype, &e32_size);
        if (error_code != MPI_SUCCESS) goto fn_exit;

        /* read raw external32 bytes into a scratch buffer; converted
         * into the user's buffer after the read below */
        e32_buf = ADIOI_Malloc(e32_size*count);
        xbuf = e32_buf;
    }

    /* Start the collective read at the shared-fp offset computed above;
     * completion status lands in split_status for the _end call. */
    ADIO_ReadStridedColl(adio_fh, xbuf, count, datatype, ADIO_EXPLICIT_OFFSET,
                         shared_fp, &adio_fh->split_status, &error_code);

    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS)
        error_code = MPIO_Err_return_file(adio_fh, error_code);
    /* --END ERROR HANDLING-- */

    /* external32 path: convert the raw bytes into native representation
     * in the user's buffer, then drop the scratch buffer. */
    if (e32_buf != NULL) {
        error_code = MPIU_read_external32_conversion_fn(xbuf, datatype,
                                                        count, e32_buf);
        ADIOI_Free(e32_buf);
    }

fn_exit:
    MPIU_THREAD_CS_EXIT(ALLFUNC,);
    return error_code;
}
void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0; int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int req_len; MPI_Count filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off; char *writebuf=NULL, *value; int st_fwr_size, st_n_filetypes, writebuf_len, write_sz; int new_bwr_size, new_fwr_size, err_flag=0, info_flag, max_bufsize; static char myname[] = "ADIOI_NFS_WRITESTRIDED"; ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size_x(fd->filetype, &filetype_size); if ( ! filetype_size ) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size_x(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; /* get max_bufsize from the info object. */ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); max_bufsize = atoi(value); ADIOI_Free(value); if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; off = (file_ptr_type == ADIO_INDIVIDUAL) ? 
fd->fp_ind : fd->disp + etype_size * offset; start_off = off; end_offset = off + bufsize - 1; writebuf_off = off; writebuf = (char *) ADIOI_Malloc(max_bufsize); writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1)); /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); for (j=0; j<count; j++) for (i=0; i<flat_buf->count; i++) { userbuf_off = j*buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_WRITE_WITHOUT_READ off += flat_buf->blocklens[i]; } /* write the buffer out finally */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif lseek(fd->fd_sys, writebuf_off, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_a, 0, NULL ); #endif err = write(fd->fd_sys, writebuf, writebuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_write_b, 0, NULL ); #endif if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); if (err == -1) err_flag = 1; if (fd->atomicity) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; if (err_flag) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", "**io %s", strerror(errno)); } else *error_code = MPI_SUCCESS; } else { /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { /* Wei-keng reworked type processing to be a bit more efficient */ offset = fd->fp_ind - disp; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent; offset -= (ADIO_Offset)n_filetypes * 
filetype_extent; /* now offset is local to this extent */ /* find the block where offset is located, skip blocklens[i]==0 */ for (i=0; i<flat_file->count; i++) { ADIO_Offset dist; if (flat_file->blocklens[i] == 0) continue; dist = flat_file->indices[i] + flat_file->blocklens[i] - offset; /* fwr_size is from offset to the end of block i */ if (dist == 0) { i++; offset = flat_file->indices[i]; fwr_size = flat_file->blocklens[i]; break; } if (dist > 0) { fwr_size = dist; break; } } st_index = i; /* starting index in flat_file->indices[] */ offset += disp + (ADIO_Offset)n_filetypes*filetype_extent; } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } start_off = offset; /* Wei-keng Liao:write request is within single flat_file contig block*/ /* this could happen, for example, with subarray types that are * actually fairly contiguous */ if (buftype_is_contig && bufsize <= fwr_size) { ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET, offset, status, error_code); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte * that can be accessed in the fileview. 
*/ fd->fp_ind = offset + bufsize; if (bufsize == fwr_size) { do { st_index++; if (st_index == flat_file->count) { st_index = 0; n_filetypes++; } } while (flat_file->blocklens[st_index] == 0); fd->fp_ind = disp + flat_file->indices[st_index] + (ADIO_Offset)n_filetypes*filetype_extent; } } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); #endif return; } /* Calculate end_offset, the last byte-offset that will be accessed. e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/ st_fwr_size = fwr_size; st_n_filetypes = n_filetypes; i = 0; j = st_index; off = offset; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i < bufsize) { i += fwr_size; end_offset = off + fwr_size - 1; j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); } /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); /* initial read for the read-modify-write */ writebuf_off = offset; writebuf = (char *) ADIOI_Malloc(max_bufsize); writebuf_len = (int)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1)); if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_a, 0, NULL ); #endif lseek(fd->fd_sys, writebuf_off, SEEK_SET); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_lseek_b, 0, NULL ); #endif #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_a, 0, NULL ); #endif err = read(fd->fd_sys, writebuf, writebuf_len); #ifdef ADIOI_MPE_LOGGING MPE_Log_event( ADIOI_MPE_read_b, 0, NULL ); #endif if (err == -1) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, 
MPI_ERR_IO, "ADIOI_NFS_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.", 0); goto fn_exit; } if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i < bufsize) { if (fwr_size) { /* TYPE_UB and TYPE_LB can result in fwr_size = 0. save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); err = write(fd->fd_sys, ((char *) buf) + i, fwr_size);*/ req_off = off; req_len = fwr_size; userbuf_off = i; ADIOI_BUFFERED_WRITE } i += fwr_size; if (off + fwr_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent) off += fwr_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by fwr_size. */ else { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); } } } else {
void ADIOI_NFS_WriteStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0; int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype; int n_filetypes, etype_in_filetype; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size, req_len; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, req_off, disp, end_offset, writebuf_off, start_off; char *writebuf, *value; int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz; int new_bwr_size, new_fwr_size, err_flag=0, info_flag, max_bufsize; #ifndef PRINT_ERR_MSG static char myname[] = "ADIOI_NFS_WRITESTRIDED"; #endif ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; bufsize = buftype_size * count; /* get max_bufsize from the info object. */ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); MPI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); max_bufsize = atoi(value); ADIOI_Free(value); if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; off = (file_ptr_type == ADIO_INDIVIDUAL) ? 
fd->fp_ind : fd->disp + etype_size * offset; start_off = off; end_offset = off + bufsize - 1; writebuf_off = off; writebuf = (char *) ADIOI_Malloc(max_bufsize); writebuf_len = (int) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1)); /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); for (j=0; j<count; j++) for (i=0; i<flat_buf->count; i++) { userbuf_off = j*buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_WRITE_WITHOUT_READ off += flat_buf->blocklens[i]; } /* write the buffer out finally */ lseek(fd->fd_sys, writebuf_off, SEEK_SET); if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); err = write(fd->fd_sys, writebuf, writebuf_len); if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); if (err == -1) err_flag = 1; if (fd->atomicity) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); ADIOI_Free(writebuf); /* malloced in the buffered_write macro */ if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; #ifdef PRINT_ERR_MSG *error_code = (err_flag) ? 
MPI_ERR_UNKNOWN : MPI_SUCCESS; #else if (err_flag) { *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, myname, "I/O Error", "%s", strerror(errno)); ADIOI_Error(fd, *error_code, myname); } else *error_code = MPI_SUCCESS; #endif } else { /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { offset = fd->fp_ind; /* in bytes */ n_filetypes = -1; flag = 0; while (!flag) { n_filetypes++; for (i=0; i<flat_file->count; i++) { if (disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] >= offset) { st_index = i; fwr_size = (int) (disp + flat_file->indices[i] + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] - offset); flag = 1; break; } } } } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = (int) (offset / n_etypes_in_filetype); etype_in_filetype = (int) (offset % n_etypes_in_filetype); size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i=0; i<flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; } start_off = offset; /* Calculate end_offset, the last byte-offset that will be accessed. 
e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/ st_fwr_size = fwr_size; st_n_filetypes = n_filetypes; i = 0; j = st_index; off = offset; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i < bufsize) { i += fwr_size; end_offset = off + fwr_size - 1; if (j < (flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); } /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); /* initial read for the read-modify-write */ writebuf_off = offset; writebuf = (char *) ADIOI_Malloc(max_bufsize); writebuf_len = (int)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1)); if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); lseek(fd->fd_sys, writebuf_off, SEEK_SET); err = read(fd->fd_sys, writebuf, writebuf_len); if (err == -1) { FPRINTF(stderr, "ADIOI_NFS_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.\n"); MPI_Abort(MPI_COMM_WORLD, 1); } if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i < bufsize) { if (fwr_size) { /* TYPE_UB and TYPE_LB can result in fwr_size = 0. save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); err = write(fd->fd_sys, ((char *) buf) + i, fwr_size);*/ req_off = off; req_len = fwr_size; userbuf_off = i; ADIOI_BUFFERED_WRITE } i += fwr_size; if (off + fwr_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + (ADIO_Offset) n_filetypes*filetype_extent) off += fwr_size; /* did not reach end of contiguous block in filetype. 
no more I/O needed. off is incremented by fwr_size. */ else { if (j < (flat_file->count - 1)) j++; else { j = 0; n_filetypes++; } off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes*filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i); } } } else {
/* ADIOI_Get_eof_offset - compute the current end-of-file position expressed
 * in etype units relative to the file view installed on fd.
 *
 * fd         - open ADIO file handle; supplies disp, etype_size and filetype
 * eof_offset - out: EOF offset in etypes under the current view
 *
 * If the EOF falls in a hole of the view, or mid-etype, the result is the
 * offset of the start of the next etype in the view (ceiling semantics).
 */
void ADIOI_Get_eof_offset(ADIO_File fd, ADIO_Offset *eof_offset)
{
    unsigned filetype_size;
    int error_code, filetype_is_contig, etype_size;
    ADIO_Offset fsize, disp, sum=0, size_in_file, n_filetypes, rem;
    int flag, i;
    ADIO_Fcntl_t *fcntl_struct;
    MPI_Aint filetype_extent;
    ADIOI_Flatlist_node *flat_file;

    /* find the eof in bytes via the fcntl interface */
    fcntl_struct = (ADIO_Fcntl_t *) ADIOI_Malloc(sizeof(ADIO_Fcntl_t));
    ADIO_Fcntl(fd, ADIO_FCNTL_GET_FSIZE, fcntl_struct, &error_code);
    fsize = fcntl_struct->fsize;
    ADIOI_Free(fcntl_struct);

    /* Find the offset in etype units corresponding to eof.
       The eof could lie in a hole in the current view, or in the
       middle of an etype. In that case the offset will be the offset
       corresponding to the start of the next etype in the current view. */

    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
    etype_size = fd->etype_size;

    if (filetype_is_contig)
        /* ceiling division in case fsize is not a multiple of etype_size */
        *eof_offset = (fsize - fd->disp + etype_size - 1)/etype_size;
    else {
        /* filetype already flattened in ADIO_Open; locate its flat list */
        flat_file = ADIOI_Flatlist;
        while (flat_file->type != fd->filetype) flat_file = flat_file->next;

        MPI_Type_size(fd->filetype, (int*)&filetype_size);
        MPI_Type_extent(fd->filetype, &filetype_extent);

        disp = fd->disp;
        n_filetypes = -1;
        flag = 0;
        /* Walk whole filetype repetitions (n_filetypes) until we find the
         * block that contains (or passes) the physical EOF. `sum`
         * accumulates view-visible bytes within the current repetition. */
        while (!flag) {
            sum = 0;
            n_filetypes++;
            for (i=0; i<flat_file->count; i++) {
                sum += flat_file->blocklens[i];
                if (disp + flat_file->indices[i] +
                    n_filetypes* ADIOI_AINT_CAST_TO_OFFSET filetype_extent +
                    flat_file->blocklens[i] >= fsize) {
                    if (disp + flat_file->indices[i] +
                        n_filetypes * ADIOI_AINT_CAST_TO_OFFSET filetype_extent
                        >= fsize)
                        /* EOF lies before this block (in a hole): the whole
                         * block is beyond EOF, so back it out of sum */
                        sum -= flat_file->blocklens[i];
                    else {
                        /* EOF lies inside this block: back out only the part
                         * of the block that is past EOF */
                        rem = (disp + flat_file->indices[i]
                               + n_filetypes* ADIOI_AINT_CAST_TO_OFFSET filetype_extent
                               + flat_file->blocklens[i] - fsize);
                        sum -= rem;
                    }
                    flag = 1;
                    break;
                }
            }
        }
        /* total view-visible bytes up to EOF = full repetitions + partial sum */
        size_in_file = n_filetypes*(ADIO_Offset)filetype_size + sum;
        *eof_offset = (size_in_file+etype_size-1)/etype_size; /* ceiling division */
    }
}
/* ADIOI_Exchange_file_views - Sends all the aggregators the file
 * views and file view states of the clients.  It fills in the
 * client_file_view_state_arr for the aggregators and the
 * my_mem_view_state for the client.  It also initializes the
 * agg_file_view_state for all clients, which is the view for each
 * aggregator of a client's filetype.
 *
 * Fix in this revision: the #ifdef DEBUG diagnostics indexed
 * fd->file_realm_types with "fd->my - cb_nodes_index", a garbled form of
 * the member name "fd->my_cb_nodes_index" (used correctly two statements
 * later).  That did not compile when DEBUG was defined; corrected below.
 */
void ADIOI_Exch_file_views(int myrank, int nprocs, int file_ptr_type,
                           ADIO_File fd, int count, MPI_Datatype datatype,
                           ADIO_Offset off,
                           view_state * my_mem_view_state_arr,
                           view_state * agg_file_view_state_arr,
                           view_state * client_file_view_state_arr)
{
    /* Convert my own fileview to an ADIOI_Flattened type and a
     * disp. MPI_Alltoall the count of ADIOI_Flatlist nodes.
     * MPI_Isend/Irecv the block_lens, indices of ADIOI_Flatlist node
     * to/from each of the aggregators with the rest of the file view
     * state. */
    int i = -1, j = -1;
    amount_and_extra_data_t *send_count_arr = NULL;
    amount_and_extra_data_t *recv_count_arr = NULL;
    int send_req_arr_sz = 0;
    int recv_req_arr_sz = 0;
    MPI_Request *send_req_arr = NULL, *recv_req_arr = NULL;
    MPI_Status *statuses = NULL;
    ADIO_Offset disp_off_sz_ext_typesz[6];
    MPI_Aint memtype_extent, filetype_extent;
    int ret = -1;
    /* parameters for datatypes */
    ADIOI_Flatlist_node *flat_mem_p = NULL, *flat_file_p = NULL;
    MPI_Count memtype_sz = -1;
    int memtype_is_contig = -1;
    ADIO_Offset filetype_sz = -1;
#ifdef AGGREGATION_PROFILE
    MPE_Log_event(5014, 0, NULL);
#endif
    /* The memtype will be freed after the call.  The filetype will be
     * freed in the close and should have been flattened in the file
     * view. */
    MPI_Type_size_x(datatype, &memtype_sz);
    MPI_Type_extent(datatype, &memtype_extent);
    if (memtype_sz == memtype_extent) {
        /* size == extent means contiguous memory type; collapse the
         * flattened representation to one block covering the whole buffer */
        memtype_is_contig = 1;
        flat_mem_p = ADIOI_Flatten_and_find(datatype);
        flat_mem_p->blocklens[0] = memtype_sz * count;
    } else {
        flat_mem_p = ADIOI_Flatten_and_find(datatype);
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size_x(fd->filetype, &filetype_sz);
    flat_file_p = ADIOI_Flatten_and_find(fd->filetype);
    if (filetype_extent == filetype_sz) {
        /* contiguous filetype: treat the whole access as one block */
        flat_file_p->blocklens[0] = memtype_sz * count;
        filetype_extent = memtype_sz * count;
        filetype_sz = filetype_extent;
    }

    /* Pack the scalar view state for exchange:
     * [0]=fp_ind [1]=disp [2]=byte offset [3]=access size
     * [4]=filetype extent [5]=filetype size */
    disp_off_sz_ext_typesz[0] = fd->fp_ind;
    disp_off_sz_ext_typesz[1] = fd->disp;
    disp_off_sz_ext_typesz[2] = off;
    disp_off_sz_ext_typesz[3] = memtype_sz * count;
    disp_off_sz_ext_typesz[4] = (ADIO_Offset) filetype_extent;
    disp_off_sz_ext_typesz[5] = (ADIO_Offset) filetype_sz;

    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        recv_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
        send_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
    } else {
        /* point-to-point exchange instead of alltoall */
        send_count_arr = ADIOI_Calloc(fd->hints->cb_nodes,
                                      sizeof(amount_and_extra_data_t));
        /* only aggregators receive data */
        if (fd->is_agg) {
            recv_count_arr = ADIOI_Calloc(nprocs,
                                          sizeof(amount_and_extra_data_t));
            recv_req_arr = ADIOI_Malloc(nprocs * sizeof(MPI_Request));
            for (i = 0; i < nprocs; i++)
                MPI_Irecv(&recv_count_arr[i], sizeof(amount_and_extra_data_t),
                          MPI_BYTE, i, COUNT_EXCH, fd->comm,
                          &recv_req_arr[i]);
        }
        /* only send data to aggregators */
        send_req_arr = ADIOI_Calloc(fd->hints->cb_nodes, sizeof(MPI_Request));
        for (i = 0; i < fd->hints->cb_nodes; i++) {
            send_count_arr[i].count = flat_file_p->count;
            send_count_arr[i].fp_ind = disp_off_sz_ext_typesz[0];
            send_count_arr[i].disp = disp_off_sz_ext_typesz[1];
            send_count_arr[i].byte_off = disp_off_sz_ext_typesz[2];
            send_count_arr[i].sz = disp_off_sz_ext_typesz[3];
            send_count_arr[i].ext = disp_off_sz_ext_typesz[4];
            send_count_arr[i].type_sz = disp_off_sz_ext_typesz[5];
            MPI_Isend(&send_count_arr[i], sizeof(amount_and_extra_data_t),
                      MPI_BYTE, fd->hints->ranklist[i], COUNT_EXCH,
                      fd->comm, &send_req_arr[i]);
        }
    }

    /* Every client has to build mem and file view_states for each aggregator.
     * We initialize their values here.  and we also initialize
     * send_count_arr */
    if (memtype_is_contig) {
        /* if memory is contigous, we now replace memtype_sz and
         * memtype_extent with the full access size */
        memtype_sz *= count;
        memtype_extent = memtype_sz;
    }

    for (i = 0; i < fd->hints->cb_nodes; i++) {
        int tmp_agg_idx = fd->hints->ranklist[i];
        memset(&(my_mem_view_state_arr[tmp_agg_idx]), 0, sizeof(view_state));
        my_mem_view_state_arr[tmp_agg_idx].sz = disp_off_sz_ext_typesz[3];
        my_mem_view_state_arr[tmp_agg_idx].ext = (ADIO_Offset) memtype_extent;
        my_mem_view_state_arr[tmp_agg_idx].type_sz = (ADIO_Offset) memtype_sz;
        my_mem_view_state_arr[tmp_agg_idx].flat_type_p = flat_mem_p;
        ADIOI_init_view_state(file_ptr_type, 1,
                              &(my_mem_view_state_arr[tmp_agg_idx]), TEMP_OFF);
        ADIOI_init_view_state(file_ptr_type, 1,
                              &(my_mem_view_state_arr[tmp_agg_idx]), REAL_OFF);

        memset(&(agg_file_view_state_arr[tmp_agg_idx]), 0, sizeof(view_state));
        agg_file_view_state_arr[tmp_agg_idx].fp_ind = disp_off_sz_ext_typesz[0];
        agg_file_view_state_arr[tmp_agg_idx].disp = disp_off_sz_ext_typesz[1];
        agg_file_view_state_arr[tmp_agg_idx].byte_off = disp_off_sz_ext_typesz[2];
        agg_file_view_state_arr[tmp_agg_idx].sz = disp_off_sz_ext_typesz[3];
        agg_file_view_state_arr[tmp_agg_idx].ext = disp_off_sz_ext_typesz[4];
        agg_file_view_state_arr[tmp_agg_idx].type_sz = disp_off_sz_ext_typesz[5];
        agg_file_view_state_arr[tmp_agg_idx].flat_type_p = flat_file_p;
        ADIOI_init_view_state(file_ptr_type, 1,
                              &(agg_file_view_state_arr[tmp_agg_idx]), TEMP_OFF);
        ADIOI_init_view_state(file_ptr_type, 1,
                              &(agg_file_view_state_arr[tmp_agg_idx]), REAL_OFF);

        if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
            send_count_arr[tmp_agg_idx].count = flat_file_p->count;
            send_count_arr[tmp_agg_idx].fp_ind = disp_off_sz_ext_typesz[0];
            send_count_arr[tmp_agg_idx].disp = disp_off_sz_ext_typesz[1];
            send_count_arr[tmp_agg_idx].byte_off = disp_off_sz_ext_typesz[2];
            send_count_arr[tmp_agg_idx].sz = disp_off_sz_ext_typesz[3];
            send_count_arr[tmp_agg_idx].ext = disp_off_sz_ext_typesz[4];
            send_count_arr[tmp_agg_idx].type_sz = disp_off_sz_ext_typesz[5];
        }
    }

#ifdef DEBUG2
    fprintf(stderr, "my own flattened memtype: ");
    ADIOI_Print_flatlist_node(flat_mem_p);
    fprintf(stderr, "my own flattened filetype: ");
    ADIOI_Print_flatlist_node(flat_file_p);
#endif

    /* Exchange the per-process count/extra-data records */
    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        ret = MPI_Alltoall(send_count_arr, sizeof(amount_and_extra_data_t),
                           MPI_BYTE,
                           recv_count_arr, sizeof(amount_and_extra_data_t),
                           MPI_BYTE, fd->comm);
        if (ret != MPI_SUCCESS) {
            fprintf(stderr, "ADIOI_Exchange_file_views: MPI_Alltoall failed "
                    "with error %d", ret);
            return;
        }
    } else {
#ifdef MPI_STATUSES_IGNORE
        statuses = MPI_STATUSES_IGNORE;
#else
        statuses = (MPI_Status *) ADIOI_Malloc(1 + nprocs * sizeof(MPI_Status));
#endif
        if (fd->is_agg) {
            MPI_Waitall(nprocs, recv_req_arr, statuses);
            ADIOI_Free(recv_req_arr);
        }
        MPI_Waitall(fd->hints->cb_nodes, send_req_arr, statuses);
#ifndef MPI_STATUSES_IGNORE
        ADIOI_Free(statuses);
#endif
        ADIOI_Free(send_req_arr);
    }
#ifdef DEBUG2
    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        fprintf(stderr, "send_count_arr:");
        for (i = 0; i < nprocs; i++) {
            fprintf(stderr, "[%d]=%d ", i, send_count_arr[i].count);
        }
        fprintf(stderr, "\n");
        fprintf(stderr, "recv_count_arr:");
        for (i = 0; i < nprocs; i++) {
            fprintf(stderr, "[%d]=%d ", i, recv_count_arr[i].count);
        }
        fprintf(stderr, "\n");
    } else {
        fprintf(stderr, "send_count_arr:");
        for (i = 0; i < fd->hints->cb_nodes; i++) {
            fprintf(stderr, "[%d]=%d ", i, send_count_arr[i].count);
        }
        fprintf(stderr, "\n");
        if (fd->is_agg) {
            fprintf(stderr, "recv_count_arr:");
            for (i = 0; i < nprocs; i++) {
                fprintf(stderr, "[%d]=%d ", i, recv_count_arr[i].count);
            }
            fprintf(stderr, "\n");
        }
    }
#endif

    if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
        for (i = 0; i < fd->hints->cb_nodes; i++)
            if (send_count_arr[i].count > 0)
                send_req_arr_sz++;
    }
    /* Figure out how many counts to send/recv */
    for (i = 0; i < nprocs; i++) {
        if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
            if (send_count_arr[i].count > 0)
                send_req_arr_sz++;
        }
        /* Only aggregators should recv */
        if (fd->is_agg) {
            if (recv_count_arr[i].count > 0) {
                if ((client_file_view_state_arr[i].flat_type_p =
                     (ADIOI_Flatlist_node *)
                     ADIOI_Malloc(sizeof(ADIOI_Flatlist_node))) == NULL) {
                    fprintf(stderr, "ADIOI_Exchange_file_views: malloc "
                            "flat_type_p failed\n");
                }
                client_file_view_state_arr[i].flat_type_p->count =
                    recv_count_arr[i].count;
                client_file_view_state_arr[i].flat_type_p->indices =
                    (ADIO_Offset *) ADIOI_Calloc(recv_count_arr[i].count,
                                                 sizeof(ADIO_Offset));
                client_file_view_state_arr[i].flat_type_p->blocklens =
                    (ADIO_Offset *) ADIOI_Calloc(recv_count_arr[i].count,
                                                 sizeof(ADIO_Offset));

                /* Copy the extra data out of the stuff we Alltoall'd
                 * (fp_ind..type_sz are 6 consecutive ADIO_Offset fields) */
                memcpy(&client_file_view_state_arr[i].fp_ind,
                       &recv_count_arr[i].fp_ind, 6 * sizeof(ADIO_Offset));

                recv_req_arr_sz++;
            }
        }
    }

    /* Since ADIOI_Calloc may do other things we add the +1
     * to avoid a 0-size malloc */
    send_req_arr = (MPI_Request *) ADIOI_Calloc(2 * (send_req_arr_sz) + 1,
                                                sizeof(MPI_Request));

    j = 0;
    if (recv_req_arr_sz > 0) {
        assert(fd->is_agg);
        recv_req_arr = (MPI_Request *) ADIOI_Calloc(2 * (recv_req_arr_sz),
                                                    sizeof(MPI_Request));
        for (i = 0; i < nprocs; i++) {
            if (recv_count_arr[i].count > 0) {
                MPI_Irecv(client_file_view_state_arr[i].flat_type_p->indices,
                          recv_count_arr[i].count, ADIO_OFFSET, i,
                          INDICES, fd->comm, &recv_req_arr[j]);
                j++;
                MPI_Irecv(client_file_view_state_arr[i].flat_type_p->blocklens,
                          recv_count_arr[i].count, ADIO_OFFSET, i,
                          BLOCK_LENS, fd->comm, &recv_req_arr[j]);
                j++;
            }
        }
    }

    if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
        j = 0;
        for (i = 0; i < nprocs; i++) {
            if (send_count_arr[i].count > 0) {
                MPI_Isend(flat_file_p->indices,
                          send_count_arr[i].count, ADIO_OFFSET, i,
                          INDICES, fd->comm, &send_req_arr[j]);
                j++;
                MPI_Isend(flat_file_p->blocklens,
                          send_count_arr[i].count, ADIO_OFFSET, i,
                          BLOCK_LENS, fd->comm, &send_req_arr[j]);
                j++;
            }
        }
    } else {
        j = 0;
        for (i = 0; i < fd->hints->cb_nodes; i++) {
            if (send_count_arr[i].count > 0) {
                MPI_Isend(flat_file_p->indices,
                          send_count_arr[i].count, ADIO_OFFSET,
                          fd->hints->ranklist[i], INDICES, fd->comm,
                          &send_req_arr[j]);
                j++;
                MPI_Isend(flat_file_p->blocklens,
                          send_count_arr[i].count, ADIO_OFFSET,
                          fd->hints->ranklist[i], BLOCK_LENS, fd->comm,
                          &send_req_arr[j]);
                j++;
            }
        }
    }

    /* Since ADIOI_Malloc may do other things we add the +1
     * to avoid a 0-size malloc */
#ifdef MPI_STATUSES_IGNORE
    statuses = MPI_STATUSES_IGNORE;
#else
    statuses = (MPI_Status *)
        ADIOI_Malloc(1 + 2 * MPL_MAX(send_req_arr_sz, recv_req_arr_sz)
                     * sizeof(MPI_Status));
#endif
    if (send_req_arr_sz > 0) {
        MPI_Waitall(2 * send_req_arr_sz, send_req_arr, statuses);
        ADIOI_Free(send_count_arr);
        ADIOI_Free(send_req_arr);
    }
    if (recv_req_arr_sz > 0) {
        MPI_Waitall(2 * recv_req_arr_sz, recv_req_arr, statuses);
        ADIOI_Free(recv_count_arr);
        ADIOI_Free(recv_req_arr);
    }
#ifndef MPI_STATUSES_IGNORE
    ADIOI_Free(statuses);
#endif
    if (fd->is_agg == 1) {
        ADIOI_init_view_state(file_ptr_type, nprocs,
                              client_file_view_state_arr, TEMP_OFF);
        ADIOI_init_view_state(file_ptr_type, nprocs,
                              client_file_view_state_arr, REAL_OFF);
    }
#ifdef DEBUG
    if (fd->is_agg == 1) {
        ADIOI_Flatlist_node *fr_node_p;
        for (i = 0; i < nprocs; i++) {
            fprintf(stderr, "client_file_view_state_arr[%d]=(fp_ind=%Ld,"
                    "disp=%Ld,byte_off=%Ld,sz=%Ld,ext=%Ld\n", i,
                    client_file_view_state_arr[i].fp_ind,
                    client_file_view_state_arr[i].disp,
                    client_file_view_state_arr[i].byte_off,
                    client_file_view_state_arr[i].sz,
                    client_file_view_state_arr[i].ext);
        }
        /* FIX: was "fd->file_realm_types[fd->my - cb_nodes_index]", which
         * referenced a nonexistent identifier and broke -DDEBUG builds */
        fr_node_p = ADIOI_Flatten_and_find(fd->file_realm_types[fd->my_cb_nodes_index]);
        assert(fr_node_p != NULL);
        fprintf(stderr, "my file realm (idx=%d,st_off=%Ld) ",
                fd->my_cb_nodes_index,
                fd->file_realm_st_offs[fd->my_cb_nodes_index]);
        ADIOI_Print_flatlist_node(fr_node_p);
    }
#endif
#ifdef DEBUG2
    if (fd->is_agg == 1) {
        for (i = 0; i < nprocs; i++) {
            fprintf(stderr, "client_file_view_state_arr[%d]: ", i);
            ADIOI_Print_flatlist_node(client_file_view_state_arr[i].flat_type_p);
        }
    }
#endif
#ifdef AGGREGATION_PROFILE
    MPE_Log_event(5015, 0, NULL);
#endif
}
/* ADIOI_LUSTRE_Open - open a file on a Lustre filesystem, applying any
 * user-requested striping hints (striping_unit, striping_factor,
 * romio_lustre_start_iodevice) at creation time and reading back the
 * actual stripe layout into fd->hints / fd->info.
 *
 * fd         - ADIO file handle; filename, access_mode, perm, info, hints
 *              are consumed; fd_sys, fd_direct, fp_ind, fp_sys_posn set
 * error_code - out: MPI_SUCCESS or an ADIOI error code on open failure
 */
void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode, amode_direct;
    int lumlen, myrank, flag, set_layout=0, err;
    struct lov_user_md *lum = NULL;
    char *value;
    ADIO_Offset str_factor = -1, str_unit=0, start_iodev=-1;
    size_t value_sz = (MPI_MAX_INFO_VAL+1)*sizeof(char);

#if defined(MPICH) || !defined(PRINT_ERR_MSG)
    static char myname[] = "ADIOI_LUSTRE_OPEN";
#endif
    MPI_Comm_rank(fd->comm, &myrank);

    if (fd->perm == ADIO_PERM_NULL) {
        /* no explicit permissions: derive 0666-minus-umask (umask is
         * read and immediately restored) */
        old_mask = umask(022);
        umask(old_mask);
        perm = old_mask ^ 0666;
    }
    else perm = fd->perm;

    /* translate ADIO access-mode flags to POSIX open(2) flags */
    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
        amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;
    amode_direct = amode | O_DIRECT;

    /* odd length here because lov_user_md contains some fixed data and
     * then a list of 'lmm_objects' representing stripe */
    lumlen = sizeof(struct lov_user_md) +
        MAX_LOV_UUID_COUNT * sizeof(struct lov_user_ost_data);
    lum = (struct lov_user_md *)ADIOI_Calloc(1,lumlen);

    value = (char *) ADIOI_Malloc(value_sz);
    /* we already validated in LUSTRE_SetInfo that these are going to be
     * the same */
    if (fd->info != MPI_INFO_NULL) {
        /* striping information */
        ADIOI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag)
            str_unit=atoll(value);

        ADIOI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag)
            str_factor=atoll(value);

        ADIOI_Info_get(fd->info, "romio_lustre_start_iodevice",
                       MPI_MAX_INFO_VAL, value, &flag);
        if (flag)
            start_iodev=atoll(value);
    }
    if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0))
        set_layout = 1;

    /* if hints were set, we need to delay creation of any lustre objects.
     * However, if we open the file with O_LOV_DELAY_CREATE and don't call
     * the follow-up ioctl, subsequent writes will fail */
    if (myrank == 0 && set_layout)
        amode = amode | O_LOV_DELAY_CREATE;

    fd->fd_sys = open(fd->filename, amode, perm);
    if (fd->fd_sys == -1)
        goto fn_exit;

    /* we can only set these hints on new files */
    /* It was strange and buggy to open the file in the hint path.  Instead,
     * we'll apply the file tunings at open time */
    if ((amode & O_CREAT) && set_layout ) {
        /* if user has specified striping info, first aggregator tries to
         * set it */
        if (myrank == fd->hints->ranklist[0] || fd->comm == MPI_COMM_SELF) {
            lum->lmm_magic = LOV_USER_MAGIC;
            lum->lmm_pattern = 0;
            /* crude check for overflow of lustre internal datatypes.
             * Silently cap to large value if user provides a value
             * larger than lustre supports */
            if (str_unit > UINT_MAX)
                lum->lmm_stripe_size = UINT_MAX;
            else
                lum->lmm_stripe_size = str_unit;
            if (str_factor > USHRT_MAX)
                lum->lmm_stripe_count = USHRT_MAX;
            else
                lum->lmm_stripe_count = str_factor;
            if (start_iodev > USHRT_MAX)
                lum->lmm_stripe_offset = USHRT_MAX;
            else
                lum->lmm_stripe_offset = start_iodev;
            err = ioctl(fd->fd_sys, LL_IOC_LOV_SETSTRIPE, lum);
            if (err == -1 && errno != EEXIST) {
                fprintf(stderr, "Failure to set stripe info %s \n",
                        strerror(errno));
                /* not a fatal error, but user might care to know */
            }
        } /* End of striping parameters validation */
    }

    /* Pascal Deveze reports that, even though we pass a
     * "GETSTRIPE" (read) flag to the ioctl, if some of the values of this
     * struct are uninitialzed, the call can give an error.  zero it out
     * in case there are other members that must be initialized and in
     * case lov_user_md struct changes in future */
    memset(lum, 0, lumlen);
    lum->lmm_magic = LOV_USER_MAGIC;
    err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *)lum);
    if (!err) {
        /* mirror the actual on-disk layout back into hints and info */
        fd->hints->striping_unit = lum->lmm_stripe_size;
        MPL_snprintf(value, value_sz, "%d", lum->lmm_stripe_size);
        ADIOI_Info_set(fd->info, "striping_unit", value);

        fd->hints->striping_factor = lum->lmm_stripe_count;
        MPL_snprintf(value, value_sz, "%d", lum->lmm_stripe_count);
        ADIOI_Info_set(fd->info, "striping_factor", value);

        fd->hints->start_iodevice = lum->lmm_stripe_offset;
        MPL_snprintf(value, value_sz, "%d", lum->lmm_stripe_offset);
        ADIOI_Info_set(fd->info, "romio_lustre_start_iodevice", value);
    }

    if (fd->access_mode & ADIO_APPEND)
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    fd->fd_direct = -1;
    if (fd->direct_write || fd->direct_read) {
        /* second descriptor for O_DIRECT I/O; 4 KB alignment/min-size */
        fd->fd_direct = open(fd->filename, amode_direct, perm);
        if (fd->fd_direct != -1) {
            fd->d_mem = fd->d_miniosz = (1<<12);
        } else {
            perror("cannot open file with O_Direct");
            fd->direct_write = fd->direct_read = 0;
        }
    }

fn_exit:
    ADIOI_Free(lum);
    ADIOI_Free(value);
    /* --BEGIN ERROR HANDLING-- */
    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
                             (fd->direct_write || fd->direct_read))) {
        *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
    }
    /* --END ERROR HANDLING-- */
    else *error_code = MPI_SUCCESS;
}
/* ADIOI_PIOFS_WriteStrided - strided write for PIOFS.
 *
 * Since PIOFS does not support file locking, can't do buffered writes
 * as on Unix; noncontiguous file accesses are split into individual
 * seek+write calls instead of read-modify-write.
 *
 * offset is in units of etype relative to the filetype.
 */
void ADIOI_PIOFS_WriteStrided(ADIO_File fd, void *buf, int count,
                       MPI_Datatype datatype, int file_ptr_type,
                       ADIO_Offset offset, ADIO_Status *status, int
                       *error_code)
{
    ADIOI_Flatlist_node *flat_buf, *flat_file;
    struct iovec *iov;
    int i, j, k, err=-1, bwr_size, fwr_size=0, st_index=0;
    int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
    int n_filetypes, etype_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    int filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent, indx;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset off, disp;
    int flag, new_bwr_size, new_fwr_size, err_flag=0;
#ifndef PRINT_ERR_MSG
    static char myname[] = "ADIOI_PIOFS_WRITESTRIDED";
#endif

    if (fd->atomicity) {
        /* no file locking on PIOFS, so atomic mode cannot be honored */
        FPRINTF(stderr, "ROMIO cannot guarantee atomicity of noncontiguous accesses in atomic mode, as PIOFS doesn't support file locking. Use nonatomic mode and its associated semantics.\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
        /* zero-size filetype: nothing to write */
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    bufsize = buftype_size * count;

    if (!buftype_is_contig && filetype_is_contig) {
        /* noncontiguous in memory, contiguous in file. use writev */

        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;

        /* There is a limit of 16 on the number of iovecs for readv/writev! */
        iov = (struct iovec *) ADIOI_Malloc(16*sizeof(struct iovec));

        if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
            off = fd->disp + etype_size * offset;
            llseek(fd->fd_sys, off, SEEK_SET);
        }
        else off = llseek(fd->fd_sys, fd->fp_ind, SEEK_SET);

        /* fill iovecs from the flattened memory type, flushing every
         * 16 entries (the writev limit noted above) */
        k = 0;
        for (j=0; j<count; j++)
            for (i=0; i<flat_buf->count; i++) {
                iov[k].iov_base = ((char *) buf) + j*buftype_extent +
                    flat_buf->indices[i];
                iov[k].iov_len = flat_buf->blocklens[i];
                /*FPRINTF(stderr, "%d %d\n", iov[k].iov_base, iov[k].iov_len);*/
                off += flat_buf->blocklens[i];

                k = (k+1)%16;
                if (!k) {
                    err = writev(fd->fd_sys, iov, 16);
                    if (err == -1) err_flag = 1;
                }
            }

        /* flush any remaining partial batch of iovecs */
        if (k) {
            err = writev(fd->fd_sys, iov, k);
            if (err == -1) err_flag = 1;
        }

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
        ADIOI_Free(iov);
        if (err_flag) {
#ifdef MPICH2
            *error_code = MPIR_Err_create_code(MPI_SUCCESS,
                MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO,
                "**io", "**io %s", strerror(errno));
#elif defined(PRINT_ERR_MSG)
            *error_code = MPI_ERR_UNKNOWN;
#else /* MPICH-1 */
            *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR,
                myname, "I/O Error", "%s", strerror(errno));
            ADIOI_Error(fd, *error_code, myname);
#endif
        }
        else *error_code = MPI_SUCCESS;
    } /* if (!buftype_is_contig && filetype_is_contig) ... */

    else {  /* noncontiguous in file */
        /* split up into several contiguous writes */

        /* find starting location in the file */

        /* filetype already flattened in ADIO_Open */
        flat_file = ADIOI_Flatlist;
        while (flat_file->type != fd->filetype) flat_file = flat_file->next;
        disp = fd->disp;

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            offset = fd->fp_ind; /* in bytes */
            n_filetypes = -1;
            flag = 0;
            /* scan filetype repetitions until the block containing the
             * individual file pointer is found */
            while (!flag) {
                n_filetypes++;
                for (i=0; i<flat_file->count; i++) {
                    if (disp + flat_file->indices[i] +
                        (ADIO_Offset) n_filetypes*filetype_extent +
                        flat_file->blocklens[i] >= offset) {
                        st_index = i;
                        fwr_size = disp + flat_file->indices[i] +
                            (ADIO_Offset) n_filetypes*filetype_extent
                            + flat_file->blocklens[i] - offset;
                        flag = 1;
                        break;
                    }
                }
            }
        }
        else {
            /* explicit offset: convert etype offset to an absolute byte
             * offset within the flattened view */
            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = (int) (offset / n_etypes_in_filetype);
            etype_in_filetype = (int) (offset % n_etypes_in_filetype);
            size_in_filetype = etype_in_filetype * etype_size;

            sum = 0;
            for (i=0; i<flat_file->count; i++) {
                sum += flat_file->blocklens[i];
                if (sum > size_in_filetype) {
                    st_index = i;
                    fwr_size = sum - size_in_filetype;
                    abs_off_in_filetype = flat_file->indices[i] +
                        size_in_filetype - (sum - flat_file->blocklens[i]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
                abs_off_in_filetype;
        }

        if (buftype_is_contig && !filetype_is_contig) {
            /* contiguous in memory, noncontiguous in file. should be the
               most common case. */

            i = 0;
            j = st_index;
            off = offset;
            fwr_size = ADIOI_MIN(fwr_size, bufsize);
            while (i < bufsize) {
                if (fwr_size) {
                    /* TYPE_UB and TYPE_LB can result in
                       fwr_size = 0. save system call in such cases */
#ifdef PROFILE
                    MPE_Log_event(11, 0, "start seek");
#endif
                    llseek(fd->fd_sys, off, SEEK_SET);
#ifdef PROFILE
                    MPE_Log_event(12, 0, "end seek");
                    MPE_Log_event(5, 0, "start write");
#endif
                    err = write(fd->fd_sys, ((char *) buf) + i, fwr_size);
#ifdef PROFILE
                    MPE_Log_event(6, 0, "end write");
#endif
                    if (err == -1) err_flag = 1;
                }
                i += fwr_size;

                if (off + fwr_size < disp + flat_file->indices[j] +
                    flat_file->blocklens[j] +
                    (ADIO_Offset) n_filetypes*filetype_extent)
                    off += fwr_size;
                /* did not reach end of contiguous block in filetype.
                   no more I/O needed. off is incremented by fwr_size. */
                else {
                    /* advance to the next nonzero block of the filetype,
                     * wrapping into the next repetition when needed */
                    if (j < (flat_file->count - 1)) j++;
                    else {
                        j = 0;
                        n_filetypes++;
                    }
                    off = disp + flat_file->indices[j] +
                        (ADIO_Offset) n_filetypes*filetype_extent;
                    fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i);
                }
            }
        }
        else {
            /* noncontiguous in memory as well as in file */

            ADIOI_Flatten_datatype(datatype);
            flat_buf = ADIOI_Flatlist;
            while (flat_buf->type != datatype) flat_buf = flat_buf->next;

            k = num = buf_count = 0;
            indx = flat_buf->indices[0];
            j = st_index;
            off = offset;
            bwr_size = flat_buf->blocklens[0];

            /* each iteration writes the overlap of the current memory
             * block (bwr_size) and file block (fwr_size), then advances
             * whichever block(s) were exhausted */
            while (num < bufsize) {
                size = ADIOI_MIN(fwr_size, bwr_size);
                if (size) {
#ifdef PROFILE
                    MPE_Log_event(11, 0, "start seek");
#endif
                    llseek(fd->fd_sys, off, SEEK_SET);
#ifdef PROFILE
                    MPE_Log_event(12, 0, "end seek");
                    MPE_Log_event(5, 0, "start write");
#endif
                    err = write(fd->fd_sys, ((char *) buf) + indx, size);
#ifdef PROFILE
                    MPE_Log_event(6, 0, "end write");
#endif
                    if (err == -1) err_flag = 1;
                }

                new_fwr_size = fwr_size;
                new_bwr_size = bwr_size;

                if (size == fwr_size) {
                    /* reached end of contiguous block in file */
                    if (j < (flat_file->count - 1)) j++;
                    else {
                        j = 0;
                        n_filetypes++;
                    }

                    off = disp + flat_file->indices[j] +
                        (ADIO_Offset) n_filetypes*filetype_extent;

                    new_fwr_size = flat_file->blocklens[j];
                    if (size != bwr_size) {
                        indx += size;
                        new_bwr_size -= size;
                    }
                }

                if (size == bwr_size) {
                    /* reached end of contiguous block in memory */

                    k = (k + 1)%flat_buf->count;
                    buf_count++;
                    indx = buftype_extent*(buf_count/flat_buf->count) +
                        flat_buf->indices[k];
                    new_bwr_size = flat_buf->blocklens[k];
                    if (size != fwr_size) {
                        off += size;
                        new_fwr_size -= size;
                    }
                }
                num += size;
                fwr_size = new_fwr_size;
                bwr_size = new_bwr_size;
            }
        }

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
        if (err_flag) {
#ifdef MPICH2
            *error_code = MPIR_Err_create_code(MPI_SUCCESS,
                MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO,
                "**io", "**io %s", strerror(errno));
#elif defined(PRINT_ERR_MSG)
            *error_code = MPI_ERR_UNKNOWN;
#else /* MPICH-1 */
            *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR,
                myname, "I/O Error", "%s", strerror(errno));
            ADIOI_Error(fd, *error_code, myname);
#endif
        }
        else *error_code = MPI_SUCCESS;
    }

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}
/* ADIOI_GPFS_SetInfo - install default and user-supplied MPI-IO hints for the
 * GPFS driver.
 *
 * If fd->info is null, create a new info object. Initialize fd->info to
 * default values. Initialize fd->hints to default values. Examine the info
 * object passed by the user. If it contains values that ROMIO understands,
 * override the default.
 *
 * fd         - open ADIO file handle; fd->info and fd->hints are updated.
 * users_info - user's MPI_Info object, or MPI_INFO_NULL.
 * error_code - set to MPI_SUCCESS, or an error by the check_and_install
 *              helpers on inconsistent hints.
 */
void ADIOI_GPFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    MPI_Info info;
    char *value;
    int flag, intval, nprocs=0, nprocs_is_valid = 0;
    static char myname[] = "ADIOI_GPFS_SETINFO";
    /* tracks whether defaults were (re)installed or cb_nodes changed, so we
     * know to regenerate the aggregator rank list below */
    int did_anything = 0;

    if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
    info = fd->info;

    /* Note that fd->hints is allocated at file open time; thus it is
     * not necessary to allocate it, or check for allocation, here. */

    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Assert((value != NULL));

    /* initialize info and hints to default values if they haven't been
     * previously initialized */
    if (!fd->hints->initialized) {
        ad_get_env_vars();
        ad_gpfs_get_env_vars();
        did_anything = 1;

        /* buffer size for collective I/O */
        ADIOI_Info_set(info, "cb_buffer_size", ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);
        fd->hints->cb_buffer_size = atoi(ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);

        /* default is to let romio automatically decide when to use
         * collective buffering */
        ADIOI_Info_set(info, "romio_cb_read", "enable");
        fd->hints->cb_read = ADIOI_HINT_ENABLE;
        ADIOI_Info_set(info, "romio_cb_write", "enable");
        fd->hints->cb_write = ADIOI_HINT_ENABLE;

        if ( fd->hints->cb_config_list != NULL )
            ADIOI_Free (fd->hints->cb_config_list);
        fd->hints->cb_config_list = NULL;

        /* number of processes that perform I/O in collective I/O */
        MPI_Comm_size(fd->comm, &nprocs);
        nprocs_is_valid = 1;
        MPL_snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs);
        ADIOI_Info_set(info, "cb_nodes", value);
        /* -1 here: actual aggregator count is decided later (possibly by a
         * platform-specific hint below) */
        fd->hints->cb_nodes = -1;

        /* hint indicating that no indep. I/O will be performed on this file */
        ADIOI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;

        /* gpfs is not implementing file realms (ADIOI_IOStridedColl),
         * initialize to disabled it. */
        /* hint instructing the use of persistent file realms */
        ADIOI_Info_set(info, "romio_cb_pfr", "disable");
        fd->hints->cb_pfr = ADIOI_HINT_DISABLE;

        /* hint guiding the assignment of persistent file realms */
        ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
        fd->hints->cb_fr_type = ADIOI_FR_AAR;

        /* hint to align file realms with a certain byte value */
        ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
        fd->hints->cb_fr_alignment = 1;

        /* hint to set a threshold percentage for a datatype's size/extent at
         * which data sieving should be done in collective I/O */
        ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
        fd->hints->cb_ds_threshold = 0;

        /* hint to switch between point-to-point or all-to-all for two-phase */
        ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
        fd->hints->cb_alltoall = ADIOI_HINT_AUTO;

        /* deferred_open derived from no_indep_rw and cb_{read,write} */
        fd->hints->deferred_open = 0;

        /* buffer size for data sieving in independent reads */
        ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);
        fd->hints->ind_rd_buffer_size = atoi(ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);

        /* buffer size for data sieving in independent writes */
        ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);
        fd->hints->ind_wr_buffer_size = atoi(ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);

        ADIOI_Info_set(info, "romio_ds_read", "automatic");
        fd->hints->ds_read = ADIOI_HINT_AUTO;
        ADIOI_Info_set(info, "romio_ds_write", "automatic");
        fd->hints->ds_write = ADIOI_HINT_AUTO;

        /* still to do: tune this a bit for a variety of file systems. there's
         * no good default value so just leave it unset */
        fd->hints->min_fdomain_size = 0;
        fd->hints->striping_unit = 0;

        fd->hints->initialized = 1;
    }

    /* add in user's info if supplied */
    if (users_info != MPI_INFO_NULL) {
        ADIOI_Info_check_and_install_int(fd, users_info, "cb_buffer_size",
                &(fd->hints->cb_buffer_size), myname, error_code);

        /* new hints for enabling/disabling coll. buffering on
         * reads/writes */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_read",
                &(fd->hints->cb_read), myname, error_code);
        if (fd->hints->cb_read == ADIOI_HINT_DISABLE) {
            /* romio_cb_read overrides no_indep_rw */
            ADIOI_Info_set(info, "romio_no_indep_rw", "false");
            fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
        }

        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_write",
                &(fd->hints->cb_write), myname, error_code);
        if (fd->hints->cb_write == ADIOI_HINT_DISABLE) {
            /* romio_cb_write overrides no_indep_rw */
            ADIOI_Info_set(info, "romio_no_indep_rw", "false");
            fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
        }

        /* Has the user indicated all I/O will be done collectively? */
        ADIOI_Info_check_and_install_true(fd, users_info, "romio_no_indep_rw",
                &(fd->hints->no_indep_rw), myname, error_code);
        if (fd->hints->no_indep_rw == 1) {
            /* if 'no_indep_rw' set, also hint that we will do
             * collective buffering: if we aren't doing independent io,
             * then we have to do collective */
            ADIOI_Info_set(info, "romio_cb_write", "enable");
            ADIOI_Info_set(info, "romio_cb_read", "enable");
            fd->hints->cb_read = 1;
            fd->hints->cb_write = 1;
        }

        /* new hints for enabling/disabling data sieving on
         * reads/writes */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_read",
                &(fd->hints->ds_read), myname, error_code);
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_write",
                &(fd->hints->ds_write), myname, error_code);

        ADIOI_Info_check_and_install_int(fd, users_info, "ind_wr_buffer_size",
                &(fd->hints->ind_wr_buffer_size), myname, error_code);
        ADIOI_Info_check_and_install_int(fd, users_info, "ind_rd_buffer_size",
                &(fd->hints->ind_rd_buffer_size), myname, error_code);

        memset( value, 0, MPI_MAX_INFO_VAL+1 );
        ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL,
                value, &flag);
        if ( flag && ((intval = atoi(value)) > 0) ) {
            ADIOI_Info_set(info, "romio_min_fdomain_size", value);
            fd->hints->min_fdomain_size = intval;
        }

        /* Now we use striping unit in common code so we should
           process hints for it. */
        ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit",
                &(fd->hints->striping_unit), myname, error_code);

#ifdef BGQPLATFORM
        memset( value, 0, MPI_MAX_INFO_VAL+1 );
        ADIOI_Info_get(users_info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME,
                MPI_MAX_INFO_VAL, value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            did_anything = 1;
            ADIOI_Info_set(info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, value);
            fd->hints->cb_nodes = intval;
        }
#endif
    }

    /* special CB aggregator assignment */
    if (did_anything) {
#ifdef BGQPLATFORM
        ADIOI_BG_gen_agg_ranklist(fd, fd->hints->cb_nodes);
#elif PEPLATFORM
        ADIOI_PE_gen_agg_ranklist(fd);
#endif
    }

    /* deferred_open won't be set by callers, but if the user doesn't
     * explicitly disable collecitve buffering (two-phase) and does hint that
     * io w/o independent io is going on, we'll set this internal hint as a
     * convenience */
    if ( ( (fd->hints->cb_read != ADIOI_HINT_DISABLE)
            && (fd->hints->cb_write != ADIOI_HINT_DISABLE)
            && fd->hints->no_indep_rw ) ) {
        fd->hints->deferred_open = 1;
    }
    else {
        /* setting romio_no_indep_rw enable and romio_cb_{read,write}
         * disable at the same time doesn't make sense. honor
         * romio_cb_{read,write} and force the no_indep_rw hint to
         * 'disable' */
        ADIOI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;
        fd->hints->deferred_open = 0;
    }

    /* BobC commented this out, but since hint processing runs on both bg and
     * bglockless, we need to keep DS writes enabled on gpfs and disabled on
     * PVFS */
    if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
        /* disable data sieving for fs that do not support file locking */
        ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
                value, &flag);
        if (flag) {
            /* get rid of this value if it is set */
            ADIOI_Info_delete(info, "ind_wr_buffer_size");
        }
        /* note: leave ind_wr_buffer_size alone; used for other cases
         * as well. -- Rob Ross, 04/22/2003 */
        ADIOI_Info_set(info, "romio_ds_write", "disable");
        fd->hints->ds_write = ADIOI_HINT_DISABLE;
    }

    ADIOI_Free(value);

    *error_code = MPI_SUCCESS;
}
/* MPIOI_File_write_all_begin - shared implementation of the split-collective
 * "begin" phase of a collective write.
 *
 * Validates the handle, count, datatype, offset and access mode, marks the
 * handle as having a split collective in flight, converts the user buffer to
 * external32 format when required, and starts the strided collective write.
 * The matching *_end call completes the operation.
 *
 * Note: the MPIO_CHECK_* macros perform error handling internally and jump to
 * the fn_exit label on failure, so that label name must be preserved.
 *
 * Returns an MPI error code (MPI_SUCCESS on success).
 */
int MPIOI_File_write_all_begin(MPI_File fh, MPI_Offset offset,
                               int file_ptr_type, const void *buf, int count,
                               MPI_Datatype datatype, char *myname)
{
    int error_code;
    MPI_Count datatype_size;
    ADIO_File adio_fh;
    void *ext32_buf = NULL;       /* temporary buffer for external32 conversion */
    const void *src_buf = NULL;   /* buffer actually handed to the write path */

    MPIU_THREAD_CS_ENTER(ALLFUNC,);

    adio_fh = MPIO_File_resolve(fh);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code);
    MPIO_CHECK_COUNT(adio_fh, count, myname, error_code);
    MPIO_CHECK_DATATYPE(adio_fh, datatype, myname, error_code);
    MPIO_CHECK_NOT_SEQUENTIAL_MODE(adio_fh, myname, error_code);

    if (file_ptr_type == ADIO_EXPLICIT_OFFSET && offset < 0) {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_ARG,
                                          "**iobadoffset", 0);
        error_code = MPIO_Err_return_file(adio_fh, error_code);
        goto fn_exit;
    }

    /* only one split collective may be outstanding per handle */
    if (adio_fh->split_coll_count) {
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                          myname, __LINE__, MPI_ERR_IO,
                                          "**iosplitcoll", 0);
        error_code = MPIO_Err_return_file(adio_fh, error_code);
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    adio_fh->split_coll_count = 1;

    MPI_Type_size_x(datatype, &datatype_size);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_INTEGRAL_ETYPE(adio_fh, count, datatype_size, myname, error_code);
    MPIO_CHECK_COUNT_SIZE(adio_fh, count, datatype_size, myname, error_code);
    /* --END ERROR HANDLING-- */

    src_buf = buf;
    if (adio_fh->is_external32) {
        /* repack the user data into external32 representation first */
        error_code = MPIU_external32_buffer_setup(buf, count, datatype,
                                                  &ext32_buf);
        if (error_code != MPI_SUCCESS)
            goto fn_exit;
        src_buf = ext32_buf;
    }

    adio_fh->split_datatype = datatype;
    ADIO_WriteStridedColl(adio_fh, src_buf, count, datatype, file_ptr_type,
                          offset, &adio_fh->split_status, &error_code);

    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS)
        error_code = MPIO_Err_return_file(adio_fh, error_code);
    /* --END ERROR HANDLING-- */

fn_exit:
    if (ext32_buf != NULL)
        ADIOI_Free(ext32_buf);
    MPIU_THREAD_CS_EXIT(ALLFUNC,);
    return error_code;
}
/*@
    MPI_File_close - Closes a file

Input Parameters:
. fh - file handle (handle)

.N fortran
@*/
int MPI_File_close(MPI_File *fh)
{
    int error_code;
    ADIO_File adio_fh;
    static char myname[] = "MPI_FILE_CLOSE";
#ifdef MPI_hpux
    int fl_xmpi;

    /* NOTE(review): *adio_fh is referenced here before adio_fh is assigned
     * (MPIO_File_resolve runs below); this path is only compiled under
     * MPI_hpux -- confirm against that platform's macro definition. */
    HPMP_IO_WSTART(fl_xmpi, BLKMPIFILECLOSE, TRDTBLOCK, *adio_fh);
#endif /* MPI_hpux */

    MPIU_THREAD_CS_ENTER(ALLFUNC,);

    adio_fh = MPIO_File_resolve(*fh);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code);
    /* --END ERROR HANDLING-- */

    /* tear down the hidden shared-file-pointer file, if this fs uses one */
    if (ADIO_Feature(adio_fh, ADIO_SHARED_FP)) {
        ADIOI_Free((adio_fh)->shared_fp_fname);
        /* POSIX semantics say a deleted file remains available until all
         * processes close the file.  But since when was NFS posix-compliant?
         */
        if (!ADIO_Feature(adio_fh, ADIO_UNLINK_AFTER_CLOSE)) {
            MPI_Barrier((adio_fh)->comm);
        }
        if ((adio_fh)->shared_fp_fd != ADIO_FILE_NULL) {
            MPI_File *fh_shared = &(adio_fh->shared_fp_fd);
            ADIO_Close((adio_fh)->shared_fp_fd, &error_code);
            MPIO_File_free(fh_shared);
            /* --BEGIN ERROR HANDLING-- */
            if (error_code != MPI_SUCCESS) goto fn_fail;
            /* --END ERROR HANDLING-- */
        }
    }

    /* Because ROMIO expects the MPI library to provide error handler management
     * routines but it doesn't ever participate in MPI_File_close, we have to
     * somehow inform the MPI library that we no longer hold a reference to any
     * user defined error handler.  We do this by setting the errhandler at this
     * point to MPI_ERRORS_RETURN. */
    error_code = PMPI_File_set_errhandler(*fh, MPI_ERRORS_RETURN);
    if (error_code != MPI_SUCCESS) goto fn_fail;

    ADIO_Close(adio_fh, &error_code);
    MPIO_File_free(fh);
    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS) goto fn_fail;
    /* --END ERROR HANDLING-- */

#ifdef MPI_hpux
    HPMP_IO_WEND(fl_xmpi);
#endif /* MPI_hpux */

fn_exit:
    MPIU_THREAD_CS_EXIT(ALLFUNC,);
    return error_code;
fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    error_code = MPIO_Err_return_file(adio_fh, error_code);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/* ADIOI_HFS_Fcntl - fcntl-style control operations for the HFS ADIO driver.
 *
 * flag selects the operation: set file view, query file size, preallocate
 * disk space, set PFS iomode, or set atomicity.  Results/errors are returned
 * through fcntl_struct and *error_code.
 *
 * Fixes vs. previous revision:
 *  - restored "&copy_etype" / "&copy_filetype" arguments that had been
 *    garbled into the HTML entity form "(c)_etype" (4 sites; code did not
 *    compile);
 *  - free the preallocation buffer on the early error-return paths in
 *    ADIO_FCNTL_SET_DISKSPACE (was leaked).
 */
void ADIOI_HFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
                     int *error_code)
{
    MPI_Datatype copy_etype, copy_filetype;
    int combiner, i, j, k, filetype_is_contig, ntimes, err;
    ADIOI_Flatlist_node *flat_file;
    ADIO_Offset curr_fsize, alloc_size, size, len, done;
    ADIO_Status status;
    char *buf;
#ifndef PRINT_ERR_MSG
    static char myname[] = "ADIOI_HFS_FCNTL";
#endif

    switch(flag) {
    case ADIO_FCNTL_SET_VIEW:
        /* free copies of old etypes and filetypes and delete flattened
           version of filetype if necessary */
        MPI_Type_get_envelope(fd->etype, &i, &j, &k, &combiner);
        if (combiner != MPI_COMBINER_NAMED) MPI_Type_free(&(fd->etype));

        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
        if (!filetype_is_contig) ADIOI_Delete_flattened(fd->filetype);

        MPI_Type_get_envelope(fd->filetype, &i, &j, &k, &combiner);
        if (combiner != MPI_COMBINER_NAMED) MPI_Type_free(&(fd->filetype));

        /* set new info */
        ADIO_SetInfo(fd, fcntl_struct->info, &err);

        /* set new etypes and filetypes: named (predefined) types are stored
           directly; derived types are duplicated so the user may free them */
        MPI_Type_get_envelope(fcntl_struct->etype, &i, &j, &k, &combiner);
        if (combiner == MPI_COMBINER_NAMED) fd->etype = fcntl_struct->etype;
        else {
            MPI_Type_contiguous(1, fcntl_struct->etype, &copy_etype);
            MPI_Type_commit(&copy_etype);
            fd->etype = copy_etype;
        }
        MPI_Type_get_envelope(fcntl_struct->filetype, &i, &j, &k, &combiner);
        if (combiner == MPI_COMBINER_NAMED)
            fd->filetype = fcntl_struct->filetype;
        else {
            MPI_Type_contiguous(1, fcntl_struct->filetype, &copy_filetype);
            MPI_Type_commit(&copy_filetype);
            fd->filetype = copy_filetype;
            ADIOI_Flatten_datatype(fd->filetype);
            /* this function will not flatten the filetype if it turns out
               to be all contiguous. */
        }

        MPI_Type_size(fd->etype, &(fd->etype_size));
        fd->disp = fcntl_struct->disp;

        /* reset MPI-IO file pointer to point to the first byte that can
           be accessed in this view. */
        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
        if (filetype_is_contig) fd->fp_ind = fcntl_struct->disp;
        else {
            flat_file = ADIOI_Flatlist;
            while (flat_file->type != fd->filetype)
                flat_file = flat_file->next;
            /* first nonempty block of the flattened filetype */
            for (i=0; i<flat_file->count; i++) {
                if (flat_file->blocklens[i]) {
                    fd->fp_ind = fcntl_struct->disp + flat_file->indices[i];
                    break;
                }
            }
        }
        *error_code = MPI_SUCCESS;
        break;

    case ADIO_FCNTL_GET_FSIZE:
        fcntl_struct->fsize = lseek64(fd->fd_sys, 0, SEEK_END);
#ifdef PRINT_ERR_MSG
        *error_code = (fcntl_struct->fsize == -1) ? MPI_ERR_UNKNOWN : MPI_SUCCESS;
#else
        if (fcntl_struct->fsize == -1) {
            *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR,
                              myname, "I/O Error", "%s", strerror(errno));
            ADIOI_Error(fd, *error_code, myname);
        }
        else *error_code = MPI_SUCCESS;
#endif
        break;

    case ADIO_FCNTL_SET_DISKSPACE:
        /* will be called by one process only */
#ifdef SPPUX
        /* SPPUX has no prealloc64. therefore, use prealloc if size <
           (2GB - 1), otherwise use long method. */
        if (fcntl_struct->diskspace <= 2147483647) {
            err = prealloc(fd->fd_sys, (off_t) fcntl_struct->diskspace);
            if (err && (errno != ENOTEMPTY)) {
#ifdef PRINT_ERR_MSG
                *error_code = MPI_ERR_UNKNOWN;
#else
                *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR,
                                  myname, "I/O Error", "%s", strerror(errno));
                ADIOI_Error(fd, *error_code, myname);
#endif
                return;
            }
        }
        if ((fcntl_struct->diskspace > 2147483647) ||
            (err && (errno == ENOTEMPTY))) {
#endif
        /* Explicitly write to allocate space.  Since there could be holes
           in the file, I need to read up to the current file size, write it
           back, and then write beyond that depending on how much
           preallocation is needed.  read/write in sizes of no more than
           ADIOI_PREALLOC_BUFSZ */
        curr_fsize = lseek64(fd->fd_sys, 0, SEEK_END);
        alloc_size = fcntl_struct->diskspace;

        size = ADIOI_MIN(curr_fsize, alloc_size);
        ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ;
        buf = (char *) ADIOI_Malloc(ADIOI_PREALLOC_BUFSZ);
        done = 0;

        /* rewrite existing data so holes become real blocks */
        for (i=0; i<ntimes; i++) {
            len = ADIOI_MIN(size-done, ADIOI_PREALLOC_BUFSZ);
            ADIO_ReadContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                            done, &status, error_code);
            if (*error_code != MPI_SUCCESS) {
#ifdef PRINT_ERR_MSG
                FPRINTF(stderr, "ADIOI_HFS_Fcntl: To preallocate disk space, ROMIO needs to read the file and write it back, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.\n");
                MPI_Abort(MPI_COMM_WORLD, 1);
#else
                *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_PREALLOC_PERM,
                                  myname, (char *) 0, (char *) 0);
                ADIOI_Error(fd, *error_code, myname);
                ADIOI_Free(buf);   /* don't leak the staging buffer */
                return;
#endif
            }
            ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                             done, &status, error_code);
            if (*error_code != MPI_SUCCESS) {
                ADIOI_Free(buf);   /* don't leak the staging buffer */
                return;
            }
            done += len;
        }

        /* extend the file with zeros past the current size if needed */
        if (alloc_size > curr_fsize) {
            memset(buf, 0, ADIOI_PREALLOC_BUFSZ);
            size = alloc_size - curr_fsize;
            ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ;
            for (i=0; i<ntimes; i++) {
                len = ADIOI_MIN(alloc_size-done, ADIOI_PREALLOC_BUFSZ);
                ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                                 done, &status, error_code);
                if (*error_code != MPI_SUCCESS) {
                    ADIOI_Free(buf);   /* don't leak the staging buffer */
                    return;
                }
                done += len;
            }
        }
        ADIOI_Free(buf);
#ifdef SPPUX
        }
#endif
        *error_code = MPI_SUCCESS;
        break;

    case ADIO_FCNTL_SET_IOMODE:
        /* for implementing PFS I/O modes. will not occur in MPI-IO
           implementation. */
        if (fd->iomode != fcntl_struct->iomode) {
            fd->iomode = fcntl_struct->iomode;
            MPI_Barrier(MPI_COMM_WORLD);
        }
        *error_code = MPI_SUCCESS;
        break;

    case ADIO_FCNTL_SET_ATOMICITY:
        /* normalize any nonzero request to 1 */
        fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1;
        *error_code = MPI_SUCCESS;
        break;

    default:
        FPRINTF(stderr, "Unknown flag passed to ADIOI_HFS_Fcntl\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
}
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status * status, int *error_code) { /* offset is in units of etype relative to the filetype. */ ADIOI_Flatlist_node *flat_buf, *flat_file; ADIO_Offset i_offset, sum, size_in_filetype; int i, j, k, st_index=0; int n_etypes_in_filetype; ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes; ADIO_Offset abs_off_in_filetype=0; int filetype_size, etype_size, buftype_size; MPI_Aint filetype_extent, buftype_extent; int buf_count, buftype_is_contig, filetype_is_contig; ADIO_Offset userbuf_off; ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off; char *writebuf; unsigned bufsize, writebuf_len, write_sz; ADIO_Status status1; ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len; int stripe_size; static char myname[] = "ADIOI_LUSTRE_WriteStrided"; if (fd->hints->ds_write == ADIOI_HINT_DISABLE) { /* if user has disabled data sieving on writes, use naive * approach instead. 
*/ ADIOI_GEN_WriteStrided_naive(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); return; } *error_code = MPI_SUCCESS; /* changed below if error */ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); MPI_Type_size(fd->filetype, &filetype_size); if (!filetype_size) { #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, 0); #endif *error_code = MPI_SUCCESS; return; } MPI_Type_extent(fd->filetype, &filetype_extent); MPI_Type_size(datatype, &buftype_size); MPI_Type_extent(datatype, &buftype_extent); etype_size = fd->etype_size; ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count)); bufsize = buftype_size * count; /* get striping info */ stripe_size = fd->hints->striping_unit; /* Different buftype to different filetype */ if (!buftype_is_contig && filetype_is_contig) { /* noncontiguous in memory, contiguous in file. */ ADIOI_Flatten_datatype(datatype); flat_buf = ADIOI_Flatlist; while (flat_buf->type != datatype) flat_buf = flat_buf->next; off = (file_ptr_type == ADIO_INDIVIDUAL) ? 
fd->fp_ind : fd->disp + (ADIO_Offset)etype_size * offset; start_off = off; end_offset = start_off + bufsize - 1; /* write stripe size buffer each time */ writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size)); writebuf_off = 0; writebuf_len = 0; /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize); for (j = 0; j < count; j++) { for (i = 0; i < flat_buf->count; i++) { userbuf_off = (ADIO_Offset)j * (ADIO_Offset)buftype_extent + flat_buf->indices[i]; req_off = off; req_len = flat_buf->blocklens[i]; ADIOI_BUFFERED_WRITE_WITHOUT_READ off += flat_buf->blocklens[i]; } } /* write the buffer out finally */ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); if (fd->atomicity) ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize); if (*error_code != MPI_SUCCESS) { ADIOI_Free(writebuf); return; } ADIOI_Free(writebuf); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off; } else { /* noncontiguous in file */ /* filetype already flattened in ADIO_Open */ flat_file = ADIOI_Flatlist; while (flat_file->type != fd->filetype) flat_file = flat_file->next; disp = fd->disp; if (file_ptr_type == ADIO_INDIVIDUAL) { /* Wei-keng reworked type processing to be a bit more efficient */ offset = fd->fp_ind - disp; n_filetypes = (offset - flat_file->indices[0]) / filetype_extent; offset -= (ADIO_Offset)n_filetypes * filetype_extent; /* now offset is local to this extent */ /* find the block where offset is located, skip blocklens[i]==0 */ for (i=0; i<flat_file->count; i++) { ADIO_Offset dist; if (flat_file->blocklens[i] == 0) continue; dist = flat_file->indices[i] + flat_file->blocklens[i] - offset; /* fwr_size is from offset to the end of block i */ if (dist == 0) { i++; offset = flat_file->indices[i]; fwr_size = flat_file->blocklens[i]; break; } if (dist > 0) { fwr_size = dist; break; } } st_index = i; /* starting index in flat_file->indices[] */ 
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent; } else { n_etypes_in_filetype = filetype_size/etype_size; n_filetypes = offset / n_etypes_in_filetype; etype_in_filetype = offset % n_etypes_in_filetype; size_in_filetype = etype_in_filetype * etype_size; sum = 0; for (i = 0; i < flat_file->count; i++) { sum += flat_file->blocklens[i]; if (sum > size_in_filetype) { st_index = i; fwr_size = sum - size_in_filetype; abs_off_in_filetype = flat_file->indices[i] + size_in_filetype - (sum - flat_file->blocklens[i]); break; } } /* abs. offset in bytes in the file */ offset = disp + (ADIO_Offset) n_filetypes *filetype_extent + abs_off_in_filetype; } start_off = offset; /* Wei-keng Liao:write request is within single flat_file * contig block*/ /* this could happen, for example, with subarray types that are * actually fairly contiguous */ if (buftype_is_contig && bufsize <= fwr_size) { req_off = start_off; req_len = bufsize; end_offset = start_off + bufsize - 1; writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size)); memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size)); writebuf_off = 0; writebuf_len = 0; userbuf_off = 0; ADIOI_BUFFERED_WRITE_WITHOUT_READ /* write the buffer out finally */ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte * that can be accessed in the fileview. */ fd->fp_ind = offset + bufsize; if (bufsize == fwr_size) { do { st_index++; if (st_index == flat_file->count) { st_index = 0; n_filetypes++; } } while (flat_file->blocklens[st_index] == 0); fd->fp_ind = disp + flat_file->indices[st_index] + (ADIO_Offset)n_filetypes*filetype_extent; } } fd->fp_sys_posn = -1; /* set it to null. */ #ifdef HAVE_STATUS_SET_BYTES MPIR_Status_set_bytes(status, datatype, bufsize); #endif ADIOI_Free(writebuf); return; } /* Calculate end_offset, the last byte-offset that will be accessed. 
e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/ st_fwr_size = fwr_size; st_n_filetypes = n_filetypes; i_offset = 0; j = st_index; off = offset; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i_offset < bufsize) { i_offset += fwr_size; end_offset = off + fwr_size - 1; j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } /* if atomicity is true, lock the region to be accessed */ if (fd->atomicity) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); writebuf_off = 0; writebuf_len = 0; writebuf = (char *) ADIOI_Malloc(stripe_size); memset(writebuf, -1, stripe_size); if (buftype_is_contig && !filetype_is_contig) { /* contiguous in memory, noncontiguous in file. should be the most common case. */ i_offset = 0; j = st_index; off = offset; n_filetypes = st_n_filetypes; fwr_size = ADIOI_MIN(st_fwr_size, bufsize); while (i_offset < bufsize) { if (fwr_size) { /* TYPE_UB and TYPE_LB can result in fwr_size = 0. save system call in such cases */ /* lseek(fd->fd_sys, off, SEEK_SET); err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/ req_off = off; req_len = fwr_size; userbuf_off = i_offset; ADIOI_BUFFERED_WRITE } i_offset += fwr_size; if (off + fwr_size < disp + flat_file->indices[j] + flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent) off += fwr_size; /* did not reach end of contiguous block in filetype. no more I/O needed. off is incremented by fwr_size. */ else { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 1 : 0; while (flat_file->blocklens[j]==0) { j = (j+1) % flat_file->count; n_filetypes += (j == 0) ? 
1 : 0; } off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent; fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } } } else {
void ADIOI_GEN_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    /* Generic hint processing for all file systems.
     *
     * if fd->info is null, create a new info object.
     * Initialize fd->info to default values.
     * Initialize fd->hints to default values.
     * Examine the info object passed by the user.  If it contains values that
     * ROMIO understands, override the default.
     *
     * Hints whose values must agree across the communicator (cb_buffer_size,
     * romio_cb_read/write, romio_no_indep_rw, cb_nodes) are verified with an
     * MPI_Bcast from rank 0: if any rank disagrees, an INFO_NOT_SAME error is
     * returned.  NOTE: these Bcasts are collective, so the early 'return'
     * statements below are only safe because every rank takes the same branch
     * when the values agree.
     *
     * error_code: set to MPI_SUCCESS on success, or an INFO_NOT_SAME error.
     */
    MPI_Info info;
    char *value;
    int flag, intval, tmp_val, nprocs, nprocs_is_valid = 0;
    static char myname[] = "ADIOI_GEN_SETINFO";

    if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
    info = fd->info;

    /* Note that fd->hints is allocated at file open time; thus it is
     * not necessary to allocate it, or check for allocation, here.
     */
    /* scratch buffer for info key values; also reused with sprintf below */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    if (value == NULL) {
        /* NEED TO HANDLE ENOMEM
         * NOTE(review): value is dereferenced unconditionally below, so an
         * allocation failure here would crash -- TODO add error path. */
    }

    /* initialize info and hints to default values if they haven't been
     * previously initialized
     */
    if (!fd->hints->initialized) {

        /* buffer size for collective I/O */
        MPI_Info_set(info, "cb_buffer_size", ADIOI_CB_BUFFER_SIZE_DFLT);
        fd->hints->cb_buffer_size = atoi(ADIOI_CB_BUFFER_SIZE_DFLT);

        /* default is to let romio automatically decide when to use
         * collective buffering
         */
        MPI_Info_set(info, "romio_cb_read", "automatic");
        fd->hints->cb_read = ADIOI_HINT_AUTO;
        MPI_Info_set(info, "romio_cb_write", "automatic");
        fd->hints->cb_write = ADIOI_HINT_AUTO;

        fd->hints->cb_config_list = NULL;

        /* number of processes that perform I/O in collective I/O */
        MPI_Comm_size(fd->comm, &nprocs);
        nprocs_is_valid = 1;
        sprintf(value, "%d", nprocs);
        MPI_Info_set(info, "cb_nodes", value);
        fd->hints->cb_nodes = nprocs;

        /* hint indicating that no indep. I/O will be performed on this file */
        MPI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;

        /* deferred_open derived from no_indep_rw and cb_{read,write} */
        fd->hints->deferred_open = 0;

        /* buffer size for data sieving in independent reads */
        MPI_Info_set(info, "ind_rd_buffer_size", ADIOI_IND_RD_BUFFER_SIZE_DFLT);
        fd->hints->ind_rd_buffer_size = atoi(ADIOI_IND_RD_BUFFER_SIZE_DFLT);

        /* buffer size for data sieving in independent writes */
        MPI_Info_set(info, "ind_wr_buffer_size", ADIOI_IND_WR_BUFFER_SIZE_DFLT);
        fd->hints->ind_wr_buffer_size = atoi(ADIOI_IND_WR_BUFFER_SIZE_DFLT);

        /* default is to let romio automatically decide when to use data
         * sieving
         */
        MPI_Info_set(info, "romio_ds_read", "automatic");
        fd->hints->ds_read = ADIOI_HINT_AUTO;
        MPI_Info_set(info, "romio_ds_write", "automatic");
        fd->hints->ds_write = ADIOI_HINT_AUTO;

        fd->hints->initialized = 1;
    }

    /* add in user's info if supplied */
    if (users_info != MPI_INFO_NULL) {
        MPI_Info_get(users_info, "cb_buffer_size", MPI_MAX_INFO_VAL,
                     value, &flag);
        if (flag && ((intval=atoi(value)) > 0)) {
            tmp_val = intval;

            /* collective consistency check: all ranks must supply the
             * same cb_buffer_size */
            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != intval) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "cb_buffer_size",
                                                   error_code);
                return;
            }
            /* --END ERROR HANDLING-- */

            MPI_Info_set(info, "cb_buffer_size", value);
            fd->hints->cb_buffer_size = intval;
        }

        /* new hints for enabling/disabling coll. buffering on
         * reads/writes
         */
        MPI_Info_get(users_info, "romio_cb_read", MPI_MAX_INFO_VAL, value,
                     &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                MPI_Info_set(info, "romio_cb_read", value);
                fd->hints->cb_read = ADIOI_HINT_ENABLE;
            }
            else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                /* romio_cb_read overrides no_indep_rw */
                MPI_Info_set(info, "romio_cb_read", value);
                MPI_Info_set(info, "romio_no_indep_rw", "false");
                fd->hints->cb_read = ADIOI_HINT_DISABLE;
                fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
            }
            else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
            {
                MPI_Info_set(info, "romio_cb_read", value);
                fd->hints->cb_read = ADIOI_HINT_AUTO;
            }

            tmp_val = fd->hints->cb_read;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != fd->hints->cb_read) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "romio_cb_read",
                                                   error_code);
                return;
            }
            /* --END ERROR HANDLING-- */
        }

        MPI_Info_get(users_info, "romio_cb_write", MPI_MAX_INFO_VAL, value,
                     &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                MPI_Info_set(info, "romio_cb_write", value);
                fd->hints->cb_write = ADIOI_HINT_ENABLE;
            }
            else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                /* romio_cb_write overrides no_indep_rw, too */
                MPI_Info_set(info, "romio_cb_write", value);
                MPI_Info_set(info, "romio_no_indep_rw", "false");
                fd->hints->cb_write = ADIOI_HINT_DISABLE;
                fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
            }
            else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
            {
                MPI_Info_set(info, "romio_cb_write", value);
                fd->hints->cb_write = ADIOI_HINT_AUTO;
            }

            tmp_val = fd->hints->cb_write;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != fd->hints->cb_write) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "romio_cb_write",
                                                   error_code);
                return;
            }
            /* --END ERROR HANDLING-- */
        }

        /* new hint for specifying no indep. read/write will be performed */
        MPI_Info_get(users_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL, value,
                     &flag);
        if (flag) {
            if (!strcmp(value, "true") || !strcmp(value, "TRUE")) {
                /* if 'no_indep_rw' set, also hint that we will do
                 * collective buffering: if we aren't doing independent io,
                 * then we have to do collective  */
                MPI_Info_set(info, "romio_no_indep_rw", value);
                MPI_Info_set(info, "romio_cb_write", "enable");
                MPI_Info_set(info, "romio_cb_read", "enable");
                fd->hints->no_indep_rw = 1;
                fd->hints->cb_read = 1;
                fd->hints->cb_write = 1;
                tmp_val = 1;
            }
            else if (!strcmp(value, "false") || !strcmp(value, "FALSE")) {
                MPI_Info_set(info, "romio_no_indep_rw", value);
                fd->hints->no_indep_rw = 0;
                tmp_val = 0;
            }
            else {
                /* default is above */
                tmp_val = 0;
            }

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != fd->hints->no_indep_rw) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "romio_no_indep_rw",
                                                   error_code);
                return;
            }
            /* --END ERROR HANDLING-- */
        }

        /* new hints for enabling/disabling data sieving on
         * reads/writes
         */
        MPI_Info_get(users_info, "romio_ds_read", MPI_MAX_INFO_VAL, value,
                     &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                MPI_Info_set(info, "romio_ds_read", value);
                fd->hints->ds_read = ADIOI_HINT_ENABLE;
            }
            else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                MPI_Info_set(info, "romio_ds_read", value);
                fd->hints->ds_read = ADIOI_HINT_DISABLE;
            }
            else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
            {
                MPI_Info_set(info, "romio_ds_read", value);
                fd->hints->ds_read = ADIOI_HINT_AUTO;
            }
            /* otherwise ignore */
        }
        MPI_Info_get(users_info, "romio_ds_write", MPI_MAX_INFO_VAL, value,
                     &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                MPI_Info_set(info, "romio_ds_write", value);
                fd->hints->ds_write = ADIOI_HINT_ENABLE;
            }
            else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                MPI_Info_set(info, "romio_ds_write", value);
                fd->hints->ds_write = ADIOI_HINT_DISABLE;
            }
            else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
            {
                MPI_Info_set(info, "romio_ds_write", value);
                fd->hints->ds_write = ADIOI_HINT_AUTO;
            }
            /* otherwise ignore */
        }

        MPI_Info_get(users_info, "cb_nodes", MPI_MAX_INFO_VAL,
                     value, &flag);
        if (flag && ((intval=atoi(value)) > 0)) {
            tmp_val = intval;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != intval) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "cb_nodes",
                                                   error_code);
                return;
            }
            /* --END ERROR HANDLING-- */

            if (!nprocs_is_valid) {
                /* if hints were already initialized, we might not
                 * have already gotten this?
                 */
                MPI_Comm_size(fd->comm, &nprocs);
                nprocs_is_valid = 1;
            }
            /* cap cb_nodes at the communicator size */
            if (intval < nprocs) {
                MPI_Info_set(info, "cb_nodes", value);
                fd->hints->cb_nodes = intval;
            }
        }

        MPI_Info_get(users_info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
                     value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            MPI_Info_set(info, "ind_wr_buffer_size", value);
            fd->hints->ind_wr_buffer_size = intval;
        }

        MPI_Info_get(users_info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL,
                     value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            MPI_Info_set(info, "ind_rd_buffer_size", value);
            fd->hints->ind_rd_buffer_size = intval;
        }

        MPI_Info_get(users_info, "cb_config_list", MPI_MAX_INFO_VAL,
                     value, &flag);
        if (flag) {
            if (fd->hints->cb_config_list == NULL) {
                /* only set cb_config_list if it isn't already set.
                 * Note that since we set it below, this ensures that
                 * the cb_config_list hint will be set at file open time
                 * either by the user or to the default
                 */
                MPI_Info_set(info, "cb_config_list", value);
                fd->hints->cb_config_list =
                    ADIOI_Malloc((strlen(value)+1) * sizeof(char));
                if (fd->hints->cb_config_list == NULL) {
                    /* NEED TO HANDLE ENOMEM
                     * NOTE(review): strcpy below would crash on NULL. */
                }
                strcpy(fd->hints->cb_config_list, value);
            }
            /* if it has been set already, we ignore it the second time.
             * otherwise we would get an error if someone used the same
             * info value with a cb_config_list value in it in a couple
             * of calls, which would be irritating. */
        }
    }

    /* handle cb_config_list default value here; avoids an extra
     * free/alloc and insures it is always set
     */
    if (fd->hints->cb_config_list == NULL) {
        MPI_Info_set(info, "cb_config_list", ADIOI_CB_CONFIG_LIST_DFLT);
        fd->hints->cb_config_list =
            ADIOI_Malloc((strlen(ADIOI_CB_CONFIG_LIST_DFLT)+1) * sizeof(char));
        if (fd->hints->cb_config_list == NULL) {
            /* NEED TO HANDLE ENOMEM */
        }
        strcpy(fd->hints->cb_config_list, ADIOI_CB_CONFIG_LIST_DFLT);
    }

    /* deferred_open won't be set by callers, but if the user doesn't
     * explicitly disable collective buffering (two-phase) and does hint that
     * io w/o independent io is going on, we'll set this internal hint as a
     * convenience */
    if ( ( (fd->hints->cb_read != ADIOI_HINT_DISABLE) \
           && (fd->hints->cb_write != ADIOI_HINT_DISABLE)\
           && fd->hints->no_indep_rw ) ) {
        fd->hints->deferred_open = 1;
    } else {
        /* setting romio_no_indep_rw enable and romio_cb_{read,write}
         * disable at the same time doesn't make sense. honor
         * romio_cb_{read,write} and force the no_indep_rw hint to
         * 'disable' */
        MPI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;
        fd->hints->deferred_open = 0;
    }

    if ((fd->file_system == ADIO_PIOFS) || (fd->file_system == ADIO_PVFS)) {
        /* no data sieving for writes in PIOFS and PVFS, because they do not
         * support file locking */
        MPI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
                     value, &flag);
        if (flag) {
            /* get rid of this value if it is set
             * NOTE(review): this deletes "ind_wr_buffer_size" even though
             * fd->hints->ind_wr_buffer_size keeps its value; the read-side
             * hint (ind_rd_buffer_size) is deliberately left alone since
             * data sieving for reads is still usable here. */
            MPI_Info_delete(info, "ind_wr_buffer_size");
        }
        /* note: leave ind_wr_buffer_size alone; used for other cases
         * as well. -- Rob Ross, 04/22/2003
         */
        MPI_Info_set(info, "romio_ds_write", "disable");
        fd->hints->ds_write = ADIOI_HINT_DISABLE;
    }

    ADIOI_Free(value);

    *error_code = MPI_SUCCESS;
}
/*@
    MPI_File_write_ordered - Collective write using shared file pointer

Input Parameters:
. fh - file handle (handle)
. buf - initial address of buffer (choice)
. count - number of elements in buffer (nonnegative integer)
. datatype - datatype of each buffer element (handle)

Output Parameters:
. status - status object (Status)

.N fortran
@*/
int MPI_File_write_ordered(MPI_File fh, ROMIO_CONST void *buf, int count,
                           MPI_Datatype datatype, MPI_Status *status)
{
    /* Ordering is implemented with a token ring: each rank receives a
     * zero-byte message from rank-1, updates the shared file pointer,
     * then sends the token on to rank+1.  The Recv -> Get_shared_fp ->
     * Send sequence below is therefore order-critical.  The actual write
     * is collective (ADIO_WriteStridedColl) at the explicit offset each
     * rank claimed from the shared pointer. */
    int error_code, nprocs, myrank;
    ADIO_Offset incr;
    MPI_Count datatype_size;
    int source, dest;
    static char myname[] = "MPI_FILE_WRITE_ORDERED";
    ADIO_Offset shared_fp;
    ADIO_File adio_fh;
    void *e32buf=NULL;
    const void *xbuf;

    MPIU_THREAD_CS_ENTER(ALLFUNC,);

    adio_fh = MPIO_File_resolve(fh);

    /* --BEGIN ERROR HANDLING-- */
    /* NOTE: the MPIO_CHECK_* macros may return/jump out of this function
     * on failure; they reference myname/error_code by name. */
    MPIO_CHECK_FILE_HANDLE(adio_fh, myname, error_code);
    MPIO_CHECK_COUNT(adio_fh, count, myname, error_code);
    MPIO_CHECK_DATATYPE(adio_fh, datatype, myname, error_code);
    /* --END ERROR HANDLING-- */

    MPI_Type_size_x(datatype, &datatype_size);

    /* --BEGIN ERROR HANDLING-- */
    MPIO_CHECK_INTEGRAL_ETYPE(adio_fh, count, datatype_size,
                              myname, error_code);
    MPIO_CHECK_FS_SUPPORTS_SHARED(adio_fh, myname, error_code);
    MPIO_CHECK_COUNT_SIZE(adio_fh, count, datatype_size,
                          myname, error_code);
    /* --END ERROR HANDLING-- */

    ADIOI_TEST_DEFERRED(adio_fh, myname, &error_code);

    MPI_Comm_size(adio_fh->comm, &nprocs);
    MPI_Comm_rank(adio_fh->comm, &myrank);

    /* advance the shared pointer by this rank's contribution, in etypes */
    incr = (count*datatype_size)/adio_fh->etype_size;

    /* Use a message as a 'token' to order the operations */
    source = myrank - 1;
    dest = myrank + 1;
    if (source < 0) source = MPI_PROC_NULL;
    if (dest >= nprocs) dest = MPI_PROC_NULL;
    MPI_Recv(NULL, 0, MPI_BYTE, source, 0, adio_fh->comm, MPI_STATUS_IGNORE);

    ADIO_Get_shared_fp(adio_fh, incr, &shared_fp, &error_code);
    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS) {
        /* NOTE(review): this error path skips the MPI_Send below, so the
         * token is never forwarded -- downstream ranks would block in
         * MPI_Recv.  TODO confirm intended behavior on shared-fp failure. */
        error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL,
                                          myname, __LINE__, MPI_ERR_INTERN,
                                          "**iosharedfailed", 0);
        error_code = MPIO_Err_return_file(adio_fh, error_code);
        goto fn_exit;
    }
    /* --END ERROR HANDLING-- */

    /* pass the token on to the next rank */
    MPI_Send(NULL, 0, MPI_BYTE, dest, 0, adio_fh->comm);

    xbuf = buf;
    if (adio_fh->is_external32) {
        /* convert the user buffer into external32 representation before
         * writing; e32buf is freed at fn_exit */
        error_code = MPIU_external32_buffer_setup(buf, count, datatype,
                                                  &e32buf);
        if (error_code != MPI_SUCCESS)
            goto fn_exit;
        xbuf = e32buf;
    }

    ADIO_WriteStridedColl(adio_fh, xbuf, count, datatype,
                          ADIO_EXPLICIT_OFFSET, shared_fp, status,
                          &error_code);

    /* --BEGIN ERROR HANDLING-- */
    if (error_code != MPI_SUCCESS)
        error_code = MPIO_Err_return_file(adio_fh, error_code);
    /* --END ERROR HANDLING-- */

fn_exit:
    if (e32buf != NULL) ADIOI_Free(e32buf);
    MPIU_THREAD_CS_EXIT(ALLFUNC,);

    /* FIXME: Check for error code from WriteStridedColl? */
    return error_code;
}
void ADIOI_PFS_Open(ADIO_File fd, int *error_code)
{
    /* Open a file on the Intel PFS file system.
     *
     * Uses the parallel _gopen() when every process in MPI_COMM_WORLD takes
     * part, plain open() otherwise.  On success, queries striping attributes
     * and records them in fd->info; honors the "pfs_svr_buf" hint.
     *
     * error_code: MPI_SUCCESS, or an MPI_ERR_IO error built from errno.
     */
    int perm, amode, old_mask, np_comm, np_total, err, flag;
    char *value;
    struct sattr attr;
    static char myname[] = "ADIOI_PFS_OPEN";

    if (fd->perm == ADIO_PERM_NULL) {
        /* query (and restore) the process umask to derive default perms */
        old_mask = umask(022);
        umask(old_mask);
        /* BUGFIX: was "old_mask ^ 0666".  XOR produces wrong modes when the
         * umask has bits outside 0666 (e.g. umask 077 gave 0611 instead of
         * 0600).  The correct derivation masks the umask bits out of 0666. */
        perm = ~old_mask & 0666;
    }
    else perm = fd->perm;

    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
        amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;

    MPI_Comm_size(MPI_COMM_WORLD, &np_total);
    MPI_Comm_size(fd->comm, &np_comm);

    /* _gopen is PFS's global (collective) open; only valid when all of
     * MPI_COMM_WORLD participates */
    if (np_total == np_comm)
        fd->fd_sys = _gopen(fd->filename, amode, M_ASYNC, perm);
    else fd->fd_sys = open(fd->filename, amode, perm);
    fd->fd_direct = -1;

    if (fd->fd_sys != -1) {
        value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));

        /* if user has asked for pfs server buffering to be turned on,
           it will be set to true in fd->info in the earlier call
           to ADIOI_PFS_SetInfo. Turn it on now, since we now have a
           valid file descriptor. */
        ADIOI_Info_get(fd->info, "pfs_svr_buf", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && (!strcmp(value, "true"))) {
            err = fcntl(fd->fd_sys, F_PFS_SVR_BUF, TRUE);
            /* couldn't enable server buffering; reflect that in the hint */
            if (err) ADIOI_Info_set(fd->info, "pfs_svr_buf", "false");
        }

        /* get file striping information and set it in info */
        err = fcntl(fd->fd_sys, F_GETSATTR, &attr);
        if (!err) {
            MPL_snprintf(value, MPI_MAX_INFO_VAL+1, "%d", attr.s_sunitsize);
            ADIOI_Info_set(fd->info, "striping_unit", value);

            MPL_snprintf(value, MPI_MAX_INFO_VAL+1, "%d", attr.s_sfactor);
            ADIOI_Info_set(fd->info, "striping_factor", value);

            MPL_snprintf(value, MPI_MAX_INFO_VAL+1, "%d", attr.s_start_sdir);
            ADIOI_Info_set(fd->info, "start_iodevice", value);
        }
        ADIOI_Free(value);

        /* in append mode, position the individual file pointer at EOF */
        if (fd->access_mode & ADIO_APPEND)
            fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
    }

    if (fd->fd_sys == -1) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io", "**io %s", strerror(errno));
    }
    else *error_code = MPI_SUCCESS;
}
/* If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
 * code is created and returned in error_code.
 */
static void ADIOI_Exch_and_write(ADIO_File fd, void *buf, MPI_Datatype
                                 datatype, int nprocs,
                                 int myrank,
                                 ADIOI_Access
                                 *others_req, ADIO_Offset *offset_list,
                                 ADIO_Offset *len_list, int contig_access_count,
                                 ADIO_Offset
                                 min_st_offset, ADIO_Offset fd_size,
                                 ADIO_Offset *fd_start, ADIO_Offset *fd_end,
                                 int *buf_idx, int *error_code)
{
/* Send data to appropriate processes and write in sizes of no more
   than coll_bufsize.
   The idea is to reduce the amount of extra memory required for
   collective I/O. If all data were written all at once, which is much
   easier, it would require temp space more than the size of user_buf,
   which is often unacceptable. For example, to write a distributed
   array to a file, where each local array is 8Mbytes, requiring
   at least another 8Mbytes of temp space is unacceptable. */

    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets*/
    ADIO_Offset size=0;
    int hole, i, j, m, ntimes, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc=-1, end_loc=-1, off, done, req_off;
    char *write_buf=NULL;
    int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size;
    int *partial_recv, *sent_to_proc, *start_pos, flag;
    int *send_buf_idx, *curr_to_proc, *done_to_proc;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf=NULL;
    MPI_Aint buftype_extent;
    int info_flag, coll_bufsize;
    char *value;
    static char myname[] = "ADIOI_EXCH_AND_WRITE";

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */

/* calculate the number of writes of size coll_bufsize
   to be done by each process and the max among all processes.
   That gives the no. of communication phases as well. */

    /* coll_bufsize comes from the "cb_buffer_size" hint */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value,
                   &info_flag);
    coll_bufsize = atoi(value);
    ADIOI_Free(value);

    /* [st_loc, end_loc] = extent of this process's file domain that is
     * actually touched by anyone's requests */
    for (i=0; i < nprocs; i++) {
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }

    for (i=0; i < nprocs; i++)
        for (j=0; j < others_req[i].count; j++) {
            st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
                                          + others_req[i].lens[j] - 1));
        }

/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/

    ntimes = (int) ((end_loc - st_loc + coll_bufsize)/coll_bufsize);

    if ((st_loc==-1) && (end_loc==-1)) {
        ntimes = 0; /* this process does no writing. */
    }

    /* every process must still participate in max_ntimes exchange rounds,
     * even those that write nothing themselves */
    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm);

    write_buf = fd->io_buf;

    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* its use is explained below. calloc initializes to 0. */

    count = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    partial_recv = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* if only a portion of the last off-len pair is recd. from a process
       in a particular iteration, the length recd. is stored here.
       calloc initializes to 0. */

    send_size = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* total size of data to be sent to each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recv_size = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration.*/

    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data sent to each proc so far. Used in
       ADIOI_Fill_send_buffer. initialized to 0 here. */

    send_buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    curr_to_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    done_to_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* Above three are used in ADIOI_Fill_send_buffer*/

    start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* used to store the starting value of curr_offlen_ptr[i] in
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
        /* look up (or create) the flattened representation of the
         * user's memory datatype */
        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
    }
    MPI_Type_extent(datatype, &buftype_extent);


/* I need to check if there are any outstanding nonblocking writes to
   the file, which could potentially interfere with the writes taking
   place in this collective write call. Since this is not likely to be
   common, let me do the simplest thing possible here: Each process
   completes all pending nonblocking operations before completing. */

    /*ADIOI_Complete_async(error_code);
    if (*error_code != MPI_SUCCESS) return;
    MPI_Barrier(fd->comm);
    */

    done = 0;
    off = st_loc;

    for (m=0; m < ntimes; m++) {
       /* go through all others_req and check which will be satisfied
          by the current write */

       /* Note that MPI guarantees that displacements in filetypes are in
          monotonically nondecreasing order and that, for writes, the
          filetypes cannot specify overlapping regions in the file. This
          simplifies implementation a bit compared to reads. */

          /* off = start offset in the file for the data to be written in
                   this iteration
             size = size of data written (bytes) corresponding to off
             req_off = off in file for a particular contiguous request
                       minus what was satisfied in previous iteration
             req_size = size corresponding to req_off */

        /* first calculate what should be communicated */

        for (i=0; i < nprocs; i++) count[i] = recv_size[i] = 0;

        size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done);

        for (i=0; i < nprocs; i++) {
            if (others_req[i].count) {
                start_pos[i] = curr_offlen_ptr[i];
                for (j=curr_offlen_ptr[i]; j<others_req[i].count; j++) {
                    if (partial_recv[i]) {
                        /* this request may have been partially
                           satisfied in the previous iteration. */
                        req_off = others_req[i].offsets[j] +
                            partial_recv[i];
                        req_len = others_req[i].lens[j] -
                            partial_recv[i];
                        partial_recv[i] = 0;
                        /* modify the off-len pair to reflect this change */
                        others_req[i].offsets[j] = req_off;
                        others_req[i].lens[j] = req_len;
                    }
                    else {
                        req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
                    }
                    if (req_off < off + size) {
                        count[i]++;
                        /* record where in write_buf this piece lands */
                        ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)write_buf)+req_off-off) == (ADIO_Offset)(MPIR_Upint)(write_buf+req_off-off));
                        MPI_Address(write_buf+req_off-off,
                                    &(others_req[i].mem_ptrs[j]));
                        ADIOI_Assert((off + size - req_off) == (int)(off + size - req_off));
                        recv_size[i] += (int)(ADIOI_MIN(off + size - req_off,
                                                        (unsigned)req_len));

                        if (off+size-req_off < (unsigned)req_len)
                        {
                            partial_recv[i] = (int) (off + size - req_off);

                            /* --BEGIN ERROR HANDLING-- */
                            if ((j+1 < others_req[i].count) &&
                                 (others_req[i].offsets[j+1] < off+size))
                            {
                                *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                                                   MPIR_ERR_RECOVERABLE,
                                                                   myname,
                                                                   __LINE__,
                                                                   MPI_ERR_ARG,
                                                                   "Filetype specifies overlapping write regions (which is illegal according to the MPI-2 specification)", 0);
                                /* allow to continue since additional
                                 * communication might have to occur
                                 */
                            }
                            /* --END ERROR HANDLING-- */
                            break;
                        }
                    }
                    else break;
                }
                curr_offlen_ptr[i] = j;
            }
        }

        /* NOTE(review): the early returns below leak the arrays allocated
         * above on an error path -- TODO confirm whether that is accepted
         * here (the process is typically about to abort on I/O error). */
        ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                              len_list, send_size, recv_size, off, size, count,
                              start_pos, partial_recv,
                              sent_to_proc, nprocs, myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req, send_buf_idx, curr_to_proc,
                              done_to_proc, &hole, m, buftype_extent, buf_idx,
                              error_code);
        if (*error_code != MPI_SUCCESS) return;

        flag = 0;
        for (i=0; i<nprocs; i++)
            if (count[i]) flag = 1;

        if (flag) {
            ADIOI_Assert(size == (int)size);
            ADIO_WriteContig(fd, write_buf, (int)size, MPI_BYTE,
                             ADIO_EXPLICIT_OFFSET, off, &status,
                             error_code);
            if (*error_code != MPI_SUCCESS) return;
        }

        off += size;
        done += size;
    }

    for (i=0; i<nprocs; i++) count[i] = recv_size[i] = 0;
    /* stay in lock-step with ranks that have more rounds to go */
    for (m=ntimes; m<max_ntimes; m++) {
        ADIOI_Assert(size == (int)size);
        /* nothing to recv, but check for send. */
        ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
                              len_list, send_size, recv_size, off,
                              (int)size, count, start_pos, partial_recv,
                              sent_to_proc, nprocs, myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req, send_buf_idx,
                              curr_to_proc, done_to_proc, &hole, m,
                              buftype_extent, buf_idx, error_code);
        if (*error_code != MPI_SUCCESS) return;
    }

    ADIOI_Free(curr_offlen_ptr);
    ADIOI_Free(count);
    ADIOI_Free(partial_recv);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(sent_to_proc);
    ADIOI_Free(start_pos);
    ADIOI_Free(send_buf_idx);
    ADIOI_Free(curr_to_proc);
    ADIOI_Free(done_to_proc);
}
/* this used to be implemented in every file system as an fcntl, but the code
 * is identical for all file systems without a real "preallocate" system call.
 * This naive approach will get the job done, but not in a terribly efficient
 * manner. */
void ADIOI_GEN_Prealloc(ADIO_File fd, ADIO_Offset diskspace, int *error_code)
{
    /* Preallocate 'diskspace' bytes for the file by reading back existing
     * data (to preserve it) and rewriting, then writing zeros beyond the
     * current EOF, in chunks of ADIOI_PREALLOC_BUFSZ.
     *
     * Will be called by one process only.
     *
     * error_code: MPI_SUCCESS on success; an **iopreallocrdwr error (or the
     * error from ADIO_WriteContig) otherwise.
     */
    ADIO_Offset curr_fsize, alloc_size, size, len, done;
    ADIO_Status status;
    int i, ntimes;
    char *buf;
    ADIO_Fcntl_t *fcntl_struct;
    static char myname[] = "ADIOI_GEN_PREALLOC";

    /* On file systems with no preallocation function, we have to
       explicitly write to allocate space. Since there could be holes in the
       file, we need to read up to the current file size, write it back,
       and then write beyond that depending on how much
       preallocation is needed.
       read/write in sizes of no more than ADIOI_PREALLOC_BUFSZ */

    /*curr_fsize = fd->fp_ind; */
    fcntl_struct = (ADIO_Fcntl_t *) ADIOI_Malloc(sizeof(ADIO_Fcntl_t));
    ADIO_Fcntl(fd, ADIO_FCNTL_GET_FSIZE, fcntl_struct, error_code);
    curr_fsize = fcntl_struct->fsize; /* don't rely on fd->fp_ind: might be
                                         working on a pre-existing file */
    alloc_size = diskspace;

    size = ADIOI_MIN(curr_fsize, alloc_size);

    ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ;
    buf = (char *) ADIOI_Malloc(ADIOI_PREALLOC_BUFSZ);
    done = 0;

    /* phase 1: read back and rewrite the existing region so holes get
     * backed by real blocks without losing data */
    for (i=0; i<ntimes; i++) {
        len = ADIOI_MIN(size-done, ADIOI_PREALLOC_BUFSZ);
        ADIO_ReadContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, done,
                        &status, error_code);
        if (*error_code != MPI_SUCCESS) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_IO,
                                               "**iopreallocrdwr",
                                               0);
            /* BUGFIX: previously returned without freeing buf and
             * fcntl_struct, leaking both on every error path */
            ADIOI_Free(fcntl_struct);
            ADIOI_Free(buf);
            return;
        }
        ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                         done, &status, error_code);
        if (*error_code != MPI_SUCCESS) {
            ADIOI_Free(fcntl_struct);
            ADIOI_Free(buf);
            return;
        }
        done += len;
    }

    /* phase 2: extend the file with zeros up to alloc_size; 'done' already
     * equals curr_fsize here when extension is needed */
    if (alloc_size > curr_fsize) {
        memset(buf, 0, ADIOI_PREALLOC_BUFSZ);
        size = alloc_size - curr_fsize;
        ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ;
        for (i=0; i<ntimes; i++) {
            len = ADIOI_MIN(alloc_size-done, ADIOI_PREALLOC_BUFSZ);
            ADIO_WriteContig(fd, buf, len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                             done, &status, error_code);
            if (*error_code != MPI_SUCCESS) {
                ADIOI_Free(fcntl_struct);
                ADIOI_Free(buf);
                return;
            }
            done += len;
        }
    }
    ADIOI_Free(fcntl_struct);
    ADIOI_Free(buf);
    *error_code = MPI_SUCCESS;
}
void ADIOI_GEN_WriteStridedColl(ADIO_File fd, const void *buf, int count,
                       MPI_Datatype datatype, int file_ptr_type,
                       ADIO_Offset offset, ADIO_Status *status, int
                       *error_code)
{
/* Uses a generalized version of the extended two-phase method described
   in "An Extended Two-Phase Method for Accessing Sections of
   Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
   Scientific Programming, (5)4:301--317, Winter 1996.
   http://www.mcs.anl.gov/home/thakur/ext2ph.ps */

    ADIOI_Access *my_req;
    /* array of nprocs access structures, one for each other process in
       whose file domain this process's request lies */

    ADIOI_Access *others_req;
    /* array of nprocs access structures, one for each other process
       whose request lies in this process's file domain. */

    int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank;
    int contig_access_count=0, interleave_count = 0, buftype_is_contig;
    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
    ADIO_Offset orig_fp, start_offset, end_offset, fd_size, min_st_offset, off;
    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
        *fd_end = NULL, *end_offsets = NULL;
    int *buf_idx = NULL;
    ADIO_Offset *len_list = NULL;
    int old_error, tmp_error;

    if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
        /* Cast away const'ness as the below function is used for read
         * and write */
        ADIOI_IOStridedColl (fd, (char *) buf, count, ADIOI_WRITE, datatype,
                             file_ptr_type, offset, status, error_code);
        return;
    }

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

/* the number of processes that actually perform I/O, nprocs_for_coll,
 * is stored in the hints off the ADIO_File structure
 */
    nprocs_for_coll = fd->hints->cb_nodes;
    orig_fp = fd->fp_ind;

    /* only check for interleaving if cb_write isn't disabled */
    if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
        /* For this process's request, calculate the list of offsets and
           lengths in the file and determine the start and end offsets. */

        /* Note: end_offset points to the last byte-offset that will be accessed.
           e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/

        ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
                              &offset_list, &len_list, &start_offset,
                              &end_offset, &contig_access_count);

        /* each process communicates its start and end offsets to other
           processes. The result is an array each of start and end offsets
           stored in order of process rank. */

        st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
        end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));

        MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
                      ADIO_OFFSET, fd->comm);
        MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
                      ADIO_OFFSET, fd->comm);

        /* are the accesses of different processes interleaved? */
        for (i=1; i<nprocs; i++)
            if ((st_offsets[i] < end_offsets[i-1]) &&
                (st_offsets[i] <= end_offsets[i]))
                interleave_count++;
        /* This is a rudimentary check for interleaving, but should suffice
           for the moment. */
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);

    if (fd->hints->cb_write == ADIOI_HINT_DISABLE ||
        (!interleave_count && (fd->hints->cb_write == ADIOI_HINT_AUTO)))
    {
        /* use independent accesses */
        if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
            /* free the offset lists computed above for the interleave
             * check; they are not needed on the independent path */
            ADIOI_Free(offset_list);
            ADIOI_Free(len_list);
            ADIOI_Free(st_offsets);
            ADIOI_Free(end_offsets);
        }

        /* restore the individual file pointer advanced by
         * ADIOI_Calc_my_off_len; the independent write recomputes it */
        fd->fp_ind = orig_fp;
        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

        if (buftype_is_contig && filetype_is_contig) {
            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
                off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset;
                ADIO_WriteContig(fd, buf, count, datatype,
                                 ADIO_EXPLICIT_OFFSET,
                                 off, status, error_code);
            }
            else ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                  0, status, error_code);
        }
        else ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
                               offset, status, error_code);

        return;
    }

/* Divide the I/O workload among "nprocs_for_coll" processes. This is
   done by (logically) dividing the file into file domains (FDs); each
   process may directly access only its own file domain. */

    ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs,
                            nprocs_for_coll, &min_st_offset,
                            &fd_start, &fd_end,
                            fd->hints->min_fdomain_size, &fd_size,
                            fd->hints->striping_unit);

/* calculate what portions of the access requests of this process are
   located in what file domains */

    ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count,
                      min_st_offset, fd_start, fd_end, fd_size,
                      nprocs, &count_my_req_procs,
                      &count_my_req_per_proc, &my_req,
                      &buf_idx);

/* based on everyone's my_req, calculate what requests of other
   processes lie in this process's file domain.
   count_others_req_procs = number of processes whose requests lie in
   this process's file domain (including this process itself)
   count_others_req_per_proc[i] indicates how many separate contiguous
   requests of proc. i lie in this process's file domain. */

    ADIOI_Calc_others_req(fd, count_my_req_procs,
                          count_my_req_per_proc, my_req,
                          nprocs, myrank,
                          &count_others_req_procs, &others_req);

    ADIOI_Free(count_my_req_per_proc);
    for (i=0; i < nprocs; i++) {
        if (my_req[i].count) {
            ADIOI_Free(my_req[i].offsets);
            ADIOI_Free(my_req[i].lens);
        }
    }
    ADIOI_Free(my_req);

/* exchange data and write in sizes of no more than coll_bufsize. */
    /* Cast away const'ness for the below function */
    ADIOI_Exch_and_write(fd, (char *) buf, datatype, nprocs, myrank,
                         others_req, offset_list,
                         len_list, contig_access_count, min_st_offset,
                         fd_size, fd_start, fd_end, buf_idx, error_code);

    /* If this collective write is followed by an independent write,
     * it's possible to have those subsequent writes on other processes
     * race ahead and sneak in before the read-modify-write completes.
     * We carry out a collective communication at the end here so no one
     * can start independent i/o before collective I/O completes.
     *
     * need to do some gymnastics with the error codes so that if something
     * went wrong, all processes report error, but if a process has a more
     * specific error code, we can still have that process report the
     * additional information */

    old_error = *error_code;
    if (*error_code != MPI_SUCCESS) *error_code = MPI_ERR_IO;

    /* optimization: if only one process performing i/o, we can perform
     * a less-expensive Bcast  */
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event( ADIOI_MPE_postwrite_a, 0, NULL );
#endif
    if (fd->hints->cb_nodes == 1)
        MPI_Bcast(error_code, 1, MPI_INT,
                  fd->hints->ranklist[0], fd->comm);
    else {
        tmp_error = *error_code;
        MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT,
                      MPI_MAX, fd->comm);
    }
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event( ADIOI_MPE_postwrite_b, 0, NULL );
#endif
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5012, 0, NULL);
#endif

    /* restore the more specific local error code, if any */
    if ( (old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO) )
        *error_code = old_error;


    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);

/* free all memory allocated for collective I/O */

    for (i=0; i<nprocs; i++) {
        if (others_req[i].count) {
            ADIOI_Free(others_req[i].offsets);
            ADIOI_Free(others_req[i].lens);
            ADIOI_Free(others_req[i].mem_ptrs);
        }
    }
    ADIOI_Free(others_req);

    ADIOI_Free(buf_idx);
    ADIOI_Free(offset_list);
    ADIOI_Free(len_list);
    ADIOI_Free(st_offsets);
    ADIOI_Free(end_offsets);
    ADIOI_Free(fd_start);
    ADIOI_Free(fd_end);

#ifdef HAVE_STATUS_SET_BYTES
    if (status) {
        MPI_Count bufsize, size;
        /* Don't set status if it isn't needed */
        MPI_Type_size_x(datatype, &size);
        bufsize = size * count;
        MPIR_Status_set_bytes(status, datatype, bufsize);
    }
/* This is a temporary way of filling in status. The right way is to
   keep track of how much data was actually written during collective I/O. */
#endif

    fd->fp_sys_posn = -1;   /* set it to null. */
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5013, 0, NULL);
#endif
}
/* Set XFS-specific hints on the file handle.
 *
 * Establishes defaults ("direct_read"/"direct_write" off), reads the
 * MPIO_DIRECT_{READ,WRITE}_CHUNK_SIZE environment variables once per
 * process (cached in file-scope xfs_direct_*_chunk_size), applies any
 * user-supplied "direct_read"/"direct_write" info keys, then delegates
 * to ADIOI_GEN_SetInfo for the generic collective/data-sieving hints.
 *
 * fd          - open ADIO file handle whose info object is updated
 * users_info  - user-provided hints, or MPI_INFO_NULL
 * error_code  - set by ADIOI_GEN_SetInfo, then forced to MPI_SUCCESS here
 */
void ADIOI_XFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    char *value, *c;
    int flag;
    /* one-shot guard so the environment is only parsed once per process
     * (NOTE(review): not thread-safe if multiple threads open files
     * concurrently — confirm callers serialize this) */
    static char xfs_initialized = 0;

    if (fd->info == MPI_INFO_NULL)
        MPI_Info_create(&(fd->info));

    /* defaults: direct I/O disabled unless the user asks for it below */
    ADIOI_Info_set(fd->info, "direct_read", "false");
    ADIOI_Info_set(fd->info, "direct_write", "false");
    fd->direct_read = fd->direct_write = 0;

    if (!xfs_initialized) {
        xfs_initialized = 1;

        /* optional chunk size for direct reads; invalid (<= 0) values are
         * reported and ignored, absent variable means 0 (no override) */
        c = getenv("MPIO_DIRECT_READ_CHUNK_SIZE");
        if (c) {
            int io;
            io = atoi(c);
            if (io <= 0) {
                fprintf(stderr,
                        "MPI: Ignoring an invalid setting for MPIO_DIRECT_READ_CHUNK_SIZE.\n"
                        "     It must be set to a positive integer value.\n");
            } else {
                xfs_direct_read_chunk_size = io;
            }
        } else {
            xfs_direct_read_chunk_size = 0;
        }

        /* same treatment for the write-side chunk size */
        c = getenv("MPIO_DIRECT_WRITE_CHUNK_SIZE");
        if (c) {
            int io;
            io = atoi(c);
            if (io <= 0) {
                fprintf(stderr,
                        "MPI: Ignoring an invalid setting for MPIO_DIRECT_WRITE_CHUNK_SIZE.\n"
                        "     It must be set to a positive integer value.\n");
            } else {
                xfs_direct_write_chunk_size = io;
            }
        } else {
            xfs_direct_write_chunk_size = 0;
        }
    }

    /* propagate the cached env-var values into the per-file hints the
     * first time this file's hints are populated */
    if (!fd->hints->initialized) {
        fd->hints->fs_hints.xfs.read_chunk_sz = xfs_direct_read_chunk_size;
        fd->hints->fs_hints.xfs.write_chunk_sz = xfs_direct_write_chunk_size;
    }

    /* has user specified values for keys "direct_read" and "direct write"? */
    if (users_info != MPI_INFO_NULL) {
        value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));

        ADIOI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, value, &flag);
        if (flag && !strcmp(value, "true")) {
            ADIOI_Info_set(fd->info, "direct_read", "true");
            fd->direct_read = 1;
        }

        ADIOI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL, value, &flag);
        if (flag && !strcmp(value, "true")) {
            ADIOI_Info_set(fd->info, "direct_write", "true");
            fd->direct_write = 1;
        }

        ADIOI_Free(value);
    }

    /* set the values for collective I/O and data sieving parameters */
    ADIOI_GEN_SetInfo(fd, users_info, error_code);

    /* Environment variables override MPI_Info hints */
    if (ADIOI_Direct_read) fd->direct_read = 1;
    if (ADIOI_Direct_write) fd->direct_write = 1;
    /* environment variables checked in ADIO_Init */

    *error_code = MPI_SUCCESS;
}
/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
 * in the case of error.
 *
 * One iteration of the two-phase write exchange: aggregators learn how
 * much each rank will send them (Alltoall of recv_size), build hindexed
 * receive types that scatter incoming bytes straight into write_buf,
 * detect holes in the coverage (forcing a read-modify-write of the file
 * region), then exchange data with nonblocking sends/receives (or
 * blocking receives in atomic mode) before the caller writes write_buf.
 *
 * partial_recv[i] - nonzero if rank i's last off-len pair is only
 *                   partially satisfied this iteration; the stored length
 *                   is temporarily patched in and restored afterwards.
 * hole (out)      - set to 1 if the received data does not fully cover
 *                   [off, off+size), i.e. a preceding read is required.
 */
static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
                 ADIOI_Flatlist_node *flat_buf, ADIO_Offset *offset_list,
                 ADIO_Offset *len_list, int *send_size, int *recv_size,
                 ADIO_Offset off, int size, int *count, int *start_pos,
                 int *partial_recv, int *sent_to_proc, int nprocs,
                 int myrank, int buftype_is_contig, int contig_access_count,
                 ADIO_Offset min_st_offset, ADIO_Offset fd_size,
                 ADIO_Offset *fd_start, ADIO_Offset *fd_end,
                 ADIOI_Access *others_req, int *send_buf_idx,
                 int *curr_to_proc, int *done_to_proc, int *hole, int iter,
                 MPI_Aint buftype_extent, int *buf_idx, int *error_code)
{
    int i, j, k, *tmp_len, nprocs_recv, nprocs_send, err;
    char **send_buf = NULL;
    MPI_Request *requests, *send_req;
    MPI_Datatype *recv_types;
    MPI_Status *statuses, status;
    int *srt_len=NULL, sum;
    ADIO_Offset *srt_off=NULL;
    static char myname[] = "ADIOI_W_EXCHANGE_DATA";

    /* exchange recv_size info so that each process knows how much to
       send to whom. */
    MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm);

    /* create derived datatypes for recv */
    nprocs_recv = 0;
    for (i=0; i<nprocs; i++) if (recv_size[i]) nprocs_recv++;

    recv_types = (MPI_Datatype *)
        ADIOI_Malloc((nprocs_recv+1)*sizeof(MPI_Datatype));
    /* +1 to avoid a 0-size malloc */

    tmp_len = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    j = 0;
    for (i=0; i<nprocs; i++) {
        if (recv_size[i]) {
            /* take care if the last off-len pair is a partial recv:
             * patch the stored length, remembering the original in
             * tmp_len so it can be restored after the heap-merge */
            if (partial_recv[i]) {
                k = start_pos[i] + count[i] - 1;
                tmp_len[i] = others_req[i].lens[k];
                others_req[i].lens[k] = partial_recv[i];
            }
            ADIOI_Type_create_hindexed_x(count[i],
                &(others_req[i].lens[start_pos[i]]),
                &(others_req[i].mem_ptrs[start_pos[i]]),
                MPI_BYTE, recv_types+j);
            /* absolute displacements; use MPI_BOTTOM in recv */
            MPI_Type_commit(recv_types+j);
            j++;
        }
    }

    /* To avoid a read-modify-write, check if there are holes in the
       data to be written. For this, merge the (sorted) offset lists
       others_req using a heap-merge. */

    sum = 0;
    for (i=0; i<nprocs; i++) sum += count[i];
    /* valgrind-detcted optimization: if there is no work on this process we do
     * not need to search for holes */
    if (sum) {
        srt_off = (ADIO_Offset *) ADIOI_Malloc(sum*sizeof(ADIO_Offset));
        srt_len = (int *) ADIOI_Malloc(sum*sizeof(int));

        ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
                         nprocs, nprocs_recv, sum);
    }

    /* for partial recvs, restore original lengths */
    for (i=0; i<nprocs; i++) if (partial_recv[i]) {
        k = start_pos[i] + count[i] - 1;
        others_req[i].lens[k] = tmp_len[i];
    }
    ADIOI_Free(tmp_len);

    /* check if there are any holes. If yes, must do read-modify-write.
     * holes can be in three places. 'middle' is what you'd expect: the
     * processes are operating on noncontigous data. But holes can also show
     * up at the beginning or end of the file domain (see John Bent ROMIO REQ
     * #835). Missing these holes would result in us writing more data than
     * recieved by everyone else. */
    *hole = 0;
    if (sum) {
        if (off != srt_off[0]) /* hole at the front */
            *hole = 1;
        else { /* coalesce the sorted offset-length pairs */
            for (i=1; i<sum; i++) {
                if (srt_off[i] <= srt_off[0] + srt_len[0]) {
                    /* ok to cast: operating on cb_buffer_size chunks */
                    int new_len = (int)srt_off[i] + srt_len[i] - (int)srt_off[0];
                    if (new_len > srt_len[0]) srt_len[0] = new_len;
                }
                else
                    break;
            }
            if (i < sum || size != srt_len[0]) /* hole in middle or end */
                *hole = 1;
        }

        ADIOI_Free(srt_off);
        ADIOI_Free(srt_len);
    }

    if (nprocs_recv) {
        if (*hole) {
            /* read-modify-write: pre-read the file region so bytes not
             * covered by received data are preserved on write-back */
            ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
                            ADIO_EXPLICIT_OFFSET, off, &status, &err);
            /* --BEGIN ERROR HANDLING-- */
            if (err != MPI_SUCCESS) {
                *error_code = MPIO_Err_create_code(err,
                                                   MPIR_ERR_RECOVERABLE, myname,
                                                   __LINE__, MPI_ERR_IO,
                                                   "**ioRMWrdwr", 0);
                return;
            }
            /* --END ERROR HANDLING-- */
        }
    }

    nprocs_send = 0;
    for (i=0; i < nprocs; i++) if (send_size[i]) nprocs_send++;

    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma: in atomic mode,
         * receives are done blocking (below), so only sends need requests */
        requests = (MPI_Request *)
            ADIOI_Malloc((nprocs_send+1)*sizeof(MPI_Request));
        send_req = requests;
    }
    else {
        requests = (MPI_Request *)
            ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request));
        /* +1 to avoid a 0-size malloc */

        /* post receives */
        j = 0;
        for (i=0; i<nprocs; i++) {
            if (recv_size[i]) {
                MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, myrank+i+100*iter,
                          fd->comm, requests+j);
                j++;
            }
        }
        send_req = requests + nprocs_recv;
    }

    /* post sends. if buftype_is_contig, data can be directly sent from
       user buf at location given by buf_idx. else use send_buf. */
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5032, 0, NULL);
#endif
    if (buftype_is_contig) {
        j = 0;
        for (i=0; i < nprocs; i++)
            if (send_size[i]) {
                MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
                          MPI_BYTE, i,  myrank+i+100*iter, fd->comm,
                          send_req+j);
                j++;
                buf_idx[i] += send_size[i];
            }
    }
    else if (nprocs_send) {
        /* buftype is not contig: stage outgoing data into per-rank
         * send buffers; ADIOI_Fill_send_buffer also issues the sends */
        send_buf = (char **) ADIOI_Malloc(nprocs*sizeof(char*));
        for (i=0; i < nprocs; i++)
            if (send_size[i])
                send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);

        ADIOI_Fill_send_buffer(fd, buf, flat_buf, send_buf,
                               offset_list, len_list, send_size,
                               send_req,
                               sent_to_proc, nprocs, myrank,
                               contig_access_count,
                               min_st_offset, fd_size, fd_start, fd_end,
                               send_buf_idx, curr_to_proc, done_to_proc, iter,
                               buftype_extent);
        /* the send is done in ADIOI_Fill_send_buffer */
    }

    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma: blocking receives
         * in rank order to serialize conflicting writes */
        j = 0;
        for (i=0; i<nprocs; i++) {
            MPI_Status wkl_status;
            if (recv_size[i]) {
                MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i, myrank+i+100*iter,
                         fd->comm, &wkl_status);
                j++;
            }
        }
    }

    for (i=0; i<nprocs_recv; i++) MPI_Type_free(recv_types+i);
    ADIOI_Free(recv_types);

    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send+1) * \
                                               sizeof(MPI_Status));
        /* +1 to avoid a 0-size malloc */
    }
    else {
        statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send+nprocs_recv+1) * \
                                               sizeof(MPI_Status));
        /* +1 to avoid a 0-size malloc */
    }

#ifdef NEEDS_MPI_TEST
    i = 0;
    if (fd->atomicity) {
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        while (!i) MPI_Testall(nprocs_send, send_req, &i, statuses);
    }
    else {
        while (!i) MPI_Testall(nprocs_send+nprocs_recv, requests, &i, statuses);
    }
#else
    if (fd->atomicity)
        /* bug fix from Wei-keng Liao and Kenin Coloma */
        MPI_Waitall(nprocs_send, send_req, statuses);
    else
        MPI_Waitall(nprocs_send+nprocs_recv, requests, statuses);
#endif

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5033, 0, NULL);
#endif
    ADIOI_Free(statuses);
    ADIOI_Free(requests);
    if (!buftype_is_contig && nprocs_send) {
        for (i=0; i < nprocs; i++)
            if (send_size[i]) ADIOI_Free(send_buf[i]);
        ADIOI_Free(send_buf);
    }
}
/* Release the node storage owned by a heap.
 *
 * heap - heap whose 'nodes' array was allocated by the heap code;
 *        the struct itself is owned (and freed, if heap-allocated)
 *        by the caller.
 */
void ADIOI_Heap_free(heap_t *heap)
{
    ADIOI_Free(heap->nodes);
    /* The heap struct remains visible to the caller; clear the pointer
     * so an accidental reuse or second free fails safely instead of
     * touching freed memory. */
    heap->nodes = NULL;
}
static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
                 datatype, int nprocs,
                 int myrank, ADIOI_Access
                 *others_req, ADIO_Offset *offset_list,
                 ADIO_Offset *len_list, int contig_access_count, ADIO_Offset
                 min_st_offset, ADIO_Offset fd_size,
                 ADIO_Offset *fd_start, ADIO_Offset *fd_end,
                 int *buf_idx, int *error_code)
{
/* Read in sizes of no more than coll_bufsize, an info parameter.
   Send data to appropriate processes.
   Place recd. data in user buf.
   The idea is to reduce the amount of extra memory required for
   collective I/O. If all data were read all at once, which is much
   easier, it would require temp space more than the size of user_buf,
   which is often unacceptable. For example, to read a distributed
   array from a file, where each local array is 8Mbytes, requiring
   at least another 8Mbytes of temp space is unacceptable. */

    int i, j, m, ntimes, max_ntimes, buftype_is_contig;
    ADIO_Offset st_loc=-1, end_loc=-1, off, done, real_off, req_off;
    char *read_buf = NULL, *tmp_buf;
    int *curr_offlen_ptr, *count, *send_size, *recv_size;
    int *partial_send, *recd_from_proc, *start_pos;
    /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets*/
    ADIO_Offset real_size, size, for_curr_iter, for_next_iter;
    int req_len, flag, rank;
    MPI_Status status;
    ADIOI_Flatlist_node *flat_buf=NULL;
    MPI_Aint buftype_extent;
    int coll_bufsize;

    *error_code = MPI_SUCCESS;  /* changed below if error */
    /* only I/O errors are currently reported */

    /* calculate the number of reads of size coll_bufsize
       to be done by each process and the max among all processes.
       That gives the no. of communication phases as well.
       coll_bufsize is obtained from the hints object. */

    coll_bufsize = fd->hints->cb_buffer_size;

    /* grab some initial values for st_loc and end_loc */
    for (i=0; i < nprocs; i++) {
        if (others_req[i].count) {
            st_loc = others_req[i].offsets[0];
            end_loc = others_req[i].offsets[0];
            break;
        }
    }

    /* now find the real values */
    for (i=0; i < nprocs; i++)
        for (j=0; j<others_req[i].count; j++) {
            st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
            end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j]
                                          + others_req[i].lens[j] - 1));
        }

    /* calculate ntimes, the number of times this process must perform I/O
     * operations in order to complete all the requests it has received.
     * the need for multiple I/O operations comes from the restriction that
     * we only use coll_bufsize bytes of memory for internal buffering.
     */
    if ((st_loc==-1) && (end_loc==-1)) {
        /* this process does no I/O. */
        ntimes = 0;
    }
    else {
        /* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/
        ntimes = (int) ((end_loc - st_loc + coll_bufsize)/coll_bufsize);
    }

    /* max_ntimes bounds the number of exchange phases every rank must
     * participate in, even ranks with no local I/O */
    MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm);

    read_buf = fd->io_buf;  /* Allocated at open time */

    curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* its use is explained below. calloc initializes to 0. */

    count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* to store count of how many off-len pairs per proc are satisfied
       in an iteration. */

    partial_send = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* if only a portion of the last off-len pair is sent to a process
       in a particular iteration, the length sent is stored here.
       calloc initializes to 0. */

    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be sent to each proc. in an iteration */

    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
    /* total size of data to be recd. from each proc. in an iteration.
       Of size nprocs so that I can use MPI_Alltoall later. */

    recd_from_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
    /* amount of data recd. so far from each proc. Used in
       ADIOI_Fill_user_buffer. initialized to 0 here. */

    start_pos = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* used to store the starting value of curr_offlen_ptr[i] in
       this iteration */

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    if (!buftype_is_contig) {
        /* look up (or build) the flattened representation of the
         * user's datatype for the scatter into the user buffer */
        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;
    }
    MPI_Type_extent(datatype, &buftype_extent);

    done = 0;
    off = st_loc;
    for_curr_iter = for_next_iter = 0;

    MPI_Comm_rank(fd->comm, &rank);

    for (m=0; m<ntimes; m++) {
        /* read buf of size coll_bufsize (or less) */
        /* go through all others_req and check if any are satisfied
           by the current read */

        /* since MPI guarantees that displacements in filetypes are in
           monotonically nondecreasing order, I can maintain a pointer
           (curr_offlen_ptr) to
           current off-len pair for each process in others_req and scan
           further only from there. There is still a problem of filetypes
           such as:  (1, 2, 3 are not process nos. They are just numbers for
           three chunks of data, specified by a filetype.)

           1  -------!--
           2    -----!----
           3       --!-----

           where ! indicates where the current read_size limitation cuts
           through the filetype.  I resolve this by reading up to !, but
           filling the communication buffer only for 1. I copy the portion
           left over for 2 into a tmp_buf for use in the next
           iteration. i.e., 2 and 3 will be satisfied in the next
           iteration. This simplifies filling in the user's buf at the
           other end, as only one off-len pair with incomplete data
           will be sent. I also don't need to send the individual
           offsets and lens along with the data, as the data is being
           sent in a particular order. */

        /* off = start offset in the file for the data actually read in
                 this iteration
           size = size of data read corresponding to off
           real_off = off minus whatever data was retained in memory from
                 previous iteration for cases like 2, 3 illustrated above
           real_size = size plus the extra corresponding to real_off
           req_off = off in file for a particular contiguous request
                     minus what was satisfied in previous iteration
           req_size = size corresponding to req_off */

        size = ADIOI_MIN((unsigned)coll_bufsize, end_loc-st_loc+1-done);
        real_off = off - for_curr_iter;
        real_size = size + for_curr_iter;

        for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
        for_next_iter = 0;

        for (i=0; i<nprocs; i++) {
#ifdef RDCOLL_DEBUG
            DBG_FPRINTF(stderr, "rank %d, i %d, others_count %d\n",
                        rank, i, others_req[i].count);
#endif
            if (others_req[i].count) {
                start_pos[i] = curr_offlen_ptr[i];
                for (j=curr_offlen_ptr[i]; j<others_req[i].count; j++) {
                    if (partial_send[i]) {
                        /* this request may have been partially
                           satisfied in the previous iteration. */
                        req_off = others_req[i].offsets[j] +
                            partial_send[i];
                        req_len = others_req[i].lens[j] -
                            partial_send[i];
                        partial_send[i] = 0;
                        /* modify the off-len pair to reflect this change */
                        others_req[i].offsets[j] = req_off;
                        others_req[i].lens[j] = req_len;
                    }
                    else {
                        req_off = others_req[i].offsets[j];
                        req_len = others_req[i].lens[j];
                    }
                    if (req_off < real_off + real_size) {
                        count[i]++;
                        ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+req_off-real_off) == (ADIO_Offset)(MPIR_Upint)(read_buf+req_off-real_off));
                        MPI_Address(read_buf+req_off-real_off,
                                    &(others_req[i].mem_ptrs[j]));
                        ADIOI_Assert((real_off + real_size - req_off) == (int)(real_off + real_size - req_off));
                        send_size[i] += (int)(ADIOI_MIN(real_off + real_size - req_off,
                                                        (ADIO_Offset)(unsigned)req_len));

                        if (real_off+real_size-req_off < (ADIO_Offset)(unsigned)req_len) {
                            partial_send[i] = (int) (real_off + real_size -
                                                     req_off);
                            if ((j+1 < others_req[i].count) &&
                                (others_req[i].offsets[j+1] <
                                 real_off+real_size)) {
                                /* this is the case illustrated in the
                                   figure above. */
                                for_next_iter = ADIOI_MAX(for_next_iter,
                                    real_off + real_size -
                                    others_req[i].offsets[j+1]);
                                /* max because it must cover requests
                                   from different processes */
                            }
                            break;
                        }
                    }
                    else break;
                }
                curr_offlen_ptr[i] = j;
            }
        }

        flag = 0;
        for (i=0; i<nprocs; i++)
            if (count[i]) flag = 1;

        if (flag) {
            ADIOI_Assert(size == (int)size);
            ADIO_ReadContig(fd, read_buf+for_curr_iter, (int)size, MPI_BYTE,
                            ADIO_EXPLICIT_OFFSET, off, &status, error_code);
            if (*error_code != MPI_SUCCESS) return;
        }

        for_curr_iter = for_next_iter;

        ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
                              send_size, recv_size, count,
                              start_pos, partial_send, recd_from_proc, nprocs,
                              myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req,
                              m, buftype_extent, buf_idx);

        if (for_next_iter) {
            /* save the trailing bytes that belong to the next iteration;
             * grow the shared io_buf so the retained prefix plus a full
             * coll_bufsize read will fit */
            tmp_buf = (char *) ADIOI_Malloc(for_next_iter);
            ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
            ADIOI_Assert((for_next_iter+coll_bufsize) == (size_t)(for_next_iter+coll_bufsize));
            memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
            ADIOI_Free(fd->io_buf);
            fd->io_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
            memcpy(fd->io_buf, tmp_buf, for_next_iter);
            read_buf = fd->io_buf;
            ADIOI_Free(tmp_buf);
        }

        off += size;
        done += size;
    }

    /* this rank is done with its own reads; keep participating in the
     * exchange phases until the slowest rank has finished */
    for (i=0; i<nprocs; i++) count[i] = send_size[i] = 0;
    for (m=ntimes; m<max_ntimes; m++)
        /* nothing to send, but check for recv. */
        ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
                              send_size, recv_size, count,
                              start_pos, partial_send, recd_from_proc, nprocs,
                              myrank,
                              buftype_is_contig, contig_access_count,
                              min_st_offset, fd_size, fd_start, fd_end,
                              others_req, m,
                              buftype_extent, buf_idx);

    ADIOI_Free(curr_offlen_ptr);
    ADIOI_Free(count);
    ADIOI_Free(partial_send);
    ADIOI_Free(send_size);
    ADIOI_Free(recv_size);
    ADIOI_Free(recd_from_proc);
    ADIOI_Free(start_pos);
}
/* Set BeeGFS-specific hints on the file handle.
 *
 * On first hint setup, reads user "striping_unit"/"striping_factor" hints,
 * verifies they are identical across all ranks (aborting otherwise), and —
 * if the file is being created with both values given — has rank 0 create
 * the file with those striping hints via the BeeGFS ioctl on the parent
 * directory.  Finishes by delegating to ADIOI_GEN_SetInfo.
 */
void ADIOI_BEEGFS_SetInfo( ADIO_File fd, MPI_Info users_info, int *error_code ) {
    char *value, *pathname, *dname, *slash;
    int flag, stripe_val[2], numtargets = 0, chunksize = 0;
    struct BeegfsIoctl_MkFileWithStripeHints_Arg createFileArg;
    int err, myrank, fd_pdir, perm, old_mask;
    static char myname[] = "ADIOI_BEEGFS_SETINFO";

    /* set error code to success */
    *error_code = MPI_SUCCESS;

    value = ( char * )ADIOI_Malloc( ( MPI_MAX_INFO_VAL + 1 ) * sizeof( char ) );

    MPI_Comm_rank( fd->comm, &myrank );

    /* set hints */
    if( ( fd->info ) == MPI_INFO_NULL ) {
        MPI_Info_create( &( fd->info ) );

        ADIOI_Info_set( fd->info, "striping_unit", "0" );
        ADIOI_Info_set( fd->info, "striping_factor", "0" );

        /* set users infos */
        if( users_info != MPI_INFO_NULL ) {
            /* striping information */
            ADIOI_Info_get( users_info, "striping_unit", MPI_MAX_INFO_VAL, value, &flag );
            if( flag )
                chunksize = atoi( value );

            ADIOI_Info_get( users_info, "striping_factor", MPI_MAX_INFO_VAL, value, &flag );
            if( flag )
                numtargets = atoi( value );

            /* check stripe info consistency: every rank must present the
             * same values as rank 0 */
            if( myrank == 0 ) {
                stripe_val[0] = numtargets;
                stripe_val[1] = chunksize;
            }
            MPI_Bcast( stripe_val, 2, MPI_INT, 0, fd->comm );

            if( stripe_val[0] != numtargets || stripe_val[1] != chunksize ) {
                FPRINTF( stderr, "ADIOI_BEEGFS_SetInfo: All keys"
                                 "-striping_factor:striping_unit "
                                 "need to be identical across all processes\n" );
                MPI_Abort( MPI_COMM_WORLD, 1 );
            }

            /* if user has specified striping info, process 0 tries to set it */
            if( myrank == 0 && ( fd->access_mode & ADIO_CREATE ) && numtargets && chunksize ) {
                /* open the parent dir to get/set striping info */
                pathname = ADIOI_Strdup( fd->filename );
                dname = strrchr( pathname, '/' );
                if( dname != NULL ) {
                    *dname = '\0'; /* replace / with nul-character */
                    fd_pdir = open( pathname, O_RDONLY );
                    if( fd_pdir == -1 ) {
                        /* NOTE(review): on open failure fd_pdir stays -1 but
                         * execution continues to the ioctl/close below —
                         * presumably those then fail with EBADF; confirm
                         * whether an early bail-out is intended */
                        FPRINTF( stderr, "Error opening %s: %s\n", pathname, strerror( errno ) );
                    }
                } else {
                    /* current dir relative path */
                    fd_pdir = open( ".", O_RDONLY );
                    if( fd_pdir == -1 ) {
                        FPRINTF( stderr, "Error opening .: %s\n", strerror( errno ) );
                    }
                }
                ADIOI_Free( pathname );

                if( fd->perm == ADIO_PERM_NULL ) {
                    /* derive default permissions from the process umask
                     * (read it, then immediately restore it).
                     * NOTE(review): 'old_mask ^ 0666' equals '~old_mask & 0666'
                     * only while the umask has no bits outside 0666 — verify
                     * against umasks containing execute bits (e.g. 0077) */
                    old_mask = umask( 022 );
                    umask( old_mask );
                    perm = old_mask ^ 0666;
                }
                else perm = fd->perm;

                /* set create hints depending on e10 hints previously set */
                slash = strrchr( fd->filename, '/' );
                if( slash != NULL )
                    slash += 1;
                else
                    slash = fd->filename;

                createFileArg.filename = slash;
                createFileArg.mode = perm;
                createFileArg.numtargets = numtargets;
                createFileArg.chunksize = chunksize;

                /* create the hint file */
                err = ioctl( fd_pdir, BEEGFS_IOC_MKFILE_STRIPEHINTS, &createFileArg );

                if( err ) {
                    FPRINTF( stderr, "BEEGFS_IOC_MKFILE_STRIPEHINTS: %s. ", strerror( errno ) );
                    if( errno == EEXIST ) {
                        /* ignore user striping and use current file info
                         * NOTE(review): the comment says EEXIST is ignored,
                         * yet the failure message below is printed only in
                         * the EEXIST case — confirm the condition is not
                         * meant to be 'errno != EEXIST' */
                        FPRINTF( stderr, "[rank:%d] Failure to set stripe info for %s!\n",
                                 myrank, fd->filename );
                    }
                }

                /* close the parent dir file descriptor */
                close( fd_pdir );
            } /* End of striping parameters validation */
        }

        MPI_Barrier( fd->comm );
    }

    /* set rest of the MPI hints (including E10 hints) */
    ADIOI_GEN_SetInfo( fd, users_info, error_code );

    ADIOI_Free( value );
}
/* Collective strided read using the generalized extended two-phase method.
 *
 * Uses a generalized version of the extended two-phase method described
 * in "An Extended Two-Phase Method for Accessing Sections of
 * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
 * Scientific Programming, (5)4:301--317, Winter 1996.
 * http://www.mcs.anl.gov/home/thakur/ext2ph.ps
 *
 * Fix vs. previous revision: guard the HAVE_STATUS_SET_BYTES section with
 * 'if (status)', matching the collective write path, so a null/ignored
 * status object is not dereferenced.
 */
void ADIOI_GEN_ReadStridedColl(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status *status, int
                               *error_code)
{
    ADIOI_Access *my_req;
    /* array of nprocs structures, one for each other process in
       whose file domain this process's request lies */

    ADIOI_Access *others_req;
    /* array of nprocs structures, one for each other process
       whose request lies in this process's file domain. */

    int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank;
    int contig_access_count=0, interleave_count = 0, buftype_is_contig;
    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
    ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off;
    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
        *fd_end = NULL, *end_offsets = NULL;
    ADIO_Offset *len_list = NULL;
    int *buf_idx = NULL;

#ifdef HAVE_STATUS_SET_BYTES
    MPI_Count bufsize, size;
#endif

    /* persistent-file-realm mode takes a different code path entirely */
    if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
        ADIOI_IOStridedColl (fd, buf, count, ADIOI_READ, datatype,
                             file_ptr_type, offset, status, error_code);
        return;
    }

    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

    /* number of aggregators, cb_nodes, is stored in the hints */
    nprocs_for_coll = fd->hints->cb_nodes;
    orig_fp = fd->fp_ind;

    /* only check for interleaving if cb_read isn't disabled */
    if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
        /* For this process's request, calculate the list of offsets and
           lengths in the file and determine the start and end offsets.
           Note: end_offset points to the last byte-offset that will be
           accessed. e.g., if start_offset=0 and 100 bytes to be read,
           end_offset=99 */
        ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
                              &offset_list, &len_list, &start_offset,
                              &end_offset, &contig_access_count);

#ifdef RDCOLL_DEBUG
        for (i=0; i<contig_access_count; i++) {
            DBG_FPRINTF(stderr, "rank %d  off %lld  len %lld\n",
                        myrank, offset_list[i], len_list[i]);
        }
#endif

        /* each process communicates its start and end offsets to other
           processes. The result is an array each of start and end offsets
           stored in order of process rank. */
        st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
        end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));

        MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
                      ADIO_OFFSET, fd->comm);
        MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
                      ADIO_OFFSET, fd->comm);

        /* are the accesses of different processes interleaved? */
        for (i=1; i<nprocs; i++)
            if ((st_offsets[i] < end_offsets[i-1]) &&
                (st_offsets[i] <= end_offsets[i]))
                interleave_count++;
        /* This is a rudimentary check for interleaving, but should
           suffice for the moment. */
    }

    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);

    if (fd->hints->cb_read == ADIOI_HINT_DISABLE
        || (!interleave_count && (fd->hints->cb_read == ADIOI_HINT_AUTO)))
    {
        /* don't do aggregation */
        if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
            ADIOI_Free(offset_list);
            ADIOI_Free(len_list);
            ADIOI_Free(st_offsets);
            ADIOI_Free(end_offsets);
        }

        fd->fp_ind = orig_fp;
        ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

        if (buftype_is_contig && filetype_is_contig) {
            if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
                off = fd->disp + (fd->etype_size) * offset;
                ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET,
                                off, status, error_code);
            }
            else ADIO_ReadContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
                                 0, status, error_code);
        }
        else ADIO_ReadStrided(fd, buf, count, datatype, file_ptr_type,
                              offset, status, error_code);

        return;
    }

    /* We're going to perform aggregation of I/O.  Here we call
     * ADIOI_Calc_file_domains() to determine what processes will handle I/O
     * to what regions.  We pass nprocs_for_coll into this function; it is
     * used to determine how many processes will perform I/O, which is also
     * the number of regions into which the range of bytes must be divided.
     * These regions are called "file domains", or FDs.
     *
     * When this function returns, fd_start, fd_end, fd_size, and
     * min_st_offset will be filled in.  fd_start holds the starting byte
     * location for each file domain.  fd_end holds the ending byte location.
     * min_st_offset holds the minimum byte location that will be accessed.
     *
     * Both fd_start[] and fd_end[] are indexed by an aggregator number; this
     * needs to be mapped to an actual rank in the communicator later.
     */
    ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs,
                            nprocs_for_coll, &min_st_offset,
                            &fd_start, &fd_end,
                            fd->hints->min_fdomain_size, &fd_size,
                            fd->hints->striping_unit);

    /* calculate where the portions of the access requests of this process
     * are located in terms of the file domains.  this could be on the same
     * process or on other processes.  this function fills in:
     * count_my_req_procs - number of processes (including this one) for which
     *     this process has requests in their file domain
     * count_my_req_per_proc - count of requests for each process, indexed
     *     by rank of the process
     * my_req[] - array of data structures describing the requests to be
     *     performed by each process (including self).  indexed by rank.
     * buf_idx[] - array of locations into which data can be directly moved;
     *     this is only valid for contiguous buffer case
     */
    ADIOI_Calc_my_req(fd, offset_list, len_list, contig_access_count,
                      min_st_offset, fd_start, fd_end, fd_size,
                      nprocs, &count_my_req_procs,
                      &count_my_req_per_proc, &my_req,
                      &buf_idx);

    /* perform a collective communication in order to distribute the
     * data calculated above.  fills in the following:
     * count_others_req_procs - number of processes (including this
     *     one) which have requests in this process's file domain.
     * count_others_req_per_proc[] - number of separate contiguous
     *     requests from proc i lie in this process's file domain.
     */
    ADIOI_Calc_others_req(fd, count_my_req_procs,
                          count_my_req_per_proc, my_req,
                          nprocs, myrank, &count_others_req_procs,
                          &others_req);

    /* my_req[] and count_my_req_per_proc aren't needed at this point, so
     * let's free the memory
     */
    ADIOI_Free(count_my_req_per_proc);
    for (i=0; i<nprocs; i++) {
        if (my_req[i].count) {
            ADIOI_Free(my_req[i].offsets);
            ADIOI_Free(my_req[i].lens);
        }
    }
    ADIOI_Free(my_req);

    /* read data in sizes of no more than ADIOI_Coll_bufsize,
     * communicate, and fill user buf.
     */
    ADIOI_Read_and_exch(fd, buf, datatype, nprocs, myrank,
                        others_req, offset_list,
                        len_list, contig_access_count, min_st_offset,
                        fd_size, fd_start, fd_end, buf_idx, error_code);

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);

    /* free all memory allocated for collective I/O */
    for (i=0; i<nprocs; i++) {
        if (others_req[i].count) {
            ADIOI_Free(others_req[i].offsets);
            ADIOI_Free(others_req[i].lens);
            ADIOI_Free(others_req[i].mem_ptrs);
        }
    }
    ADIOI_Free(others_req);

    ADIOI_Free(buf_idx);
    ADIOI_Free(offset_list);
    ADIOI_Free(len_list);
    ADIOI_Free(st_offsets);
    ADIOI_Free(end_offsets);
    ADIOI_Free(fd_start);
    ADIOI_Free(fd_end);

#ifdef HAVE_STATUS_SET_BYTES
    if (status) {
        /* don't set status if it isn't needed; same guard as the
         * collective write path */
        MPI_Type_size_x(datatype, &size);
        bufsize = size * count;
        MPIR_Status_set_bytes(status, datatype, bufsize);
    }
    /* This is a temporary way of filling in status.  The right way is to
       keep track of how much data was actually read and placed in buf
       during collective I/O. */
#endif

    fd->fp_sys_posn = -1;  /* set it to null. */
}
/* * ADIOI_Sync_thread_start - start the synchronisation routine */ void *ADIOI_Sync_thread_start(void *ptr) { ADIOI_Sync_thread_t t = (ADIOI_Sync_thread_t)ptr; ADIOI_Atomic_queue_t q = (ADIOI_Atomic_queue_t)t->sub_; ADIOI_Sync_req_t r; size_t wr_count; MPI_Count datatype_size; char *buf; ADIO_Offset bytes_xfered, len, buf_size, offset, off; int type, count, fflags, error_code; ADIO_Request *req; MPI_Datatype datatype; /* get sync buffer size */ t->fd_; buf_size = t->fd_->hints->ind_wr_buffer_size; buf = (char *)ADIOI_Malloc(buf_size); for(;;) { /* get a new sync request */ #ifndef _USE_PTHREAD_MUTEX_ if ((r = ADIOI_Atomic_queue_front(q)) == NULL) continue; #else r = ADIOI_Atomic_queue_front(q); #endif /* pop sync request */ ADIOI_Atomic_queue_pop(q); /* get request type */ ADIOI_Sync_req_get_key(r, ADIOI_SYNC_TYPE, &type); /* check for shutdown type */ if (type == ADIOI_THREAD_SHUTDOWN) { break; } /* if sync type get all the fields */ ADIOI_Sync_req_get_key(r, ADIOI_SYNC_ALL, &offset, &datatype, &count, &req, &error_code, &fflags); /* init I/O req */ MPI_Type_size_x(datatype, &datatype_size); len = (ADIO_Offset)datatype_size * (ADIO_Offset)count; bytes_xfered = 0; off = offset; /* satisfy sync req */ while (bytes_xfered < len) { wr_count = (size_t)ADIOI_MIN(buf_size, len - bytes_xfered); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_thread_read_a, 0, NULL); #endif /* read data from cache file */ pread(t->fd_->cache_fd->fd_sys, buf, wr_count, offset); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_thread_read_b, 0, NULL); MPE_Log_event(ADIOI_MPE_thread_write_a, 0, NULL); #endif /* write data to global file */ pwrite(t->fd_->fd_sys, buf, wr_count, offset); #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_thread_write_b, 0, NULL); #endif /* update offset */ bytes_xfered += (ADIO_Offset)wr_count; offset += (ADIO_Offset)wr_count; } /* unlock extent locked in ADIO_WriteContig() */ if (t->fd_->hints->e10_cache_coherent == ADIOI_HINT_ENABLE) ADIOI_UNLOCK(t->fd_, off, 
SEEK_SET, len); /* ---Begin Error Handling--- */ /* --- End Error Handling --- */ /* complete Grequest */ MPI_Grequest_complete(*req); } ADIOI_Free(buf); pthread_exit(NULL); }
/* ADIOI_R_Exchange_data - communication phase of the two-phase collective
 * read.
 *
 * Every process learns how much it will receive from each peer via an
 * Alltoall of send_size, posts nonblocking receives for its share of the
 * data (directly into `buf` at buf_idx[] when the user buffer type is
 * contiguous, otherwise into per-source scratch buffers), and each
 * aggregator sends out the file data it holds using a hindexed datatype
 * built from others_req[] (absolute memory addresses, sent from
 * MPI_BOTTOM).  For noncontiguous user buffers the received data is then
 * scattered into `buf` by ADIOI_Fill_user_buffer().
 *
 * partial_send[i] > 0 means the last off-len pair destined for process i
 * is only partially sent this iteration; the length is temporarily
 * patched in others_req[i].lens and restored after the Isend.
 *
 * `iter` is the round number; it is folded into the message tag
 * (myrank+i+100*iter) to keep rounds from matching each other's traffic.
 */
static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
		   *flat_buf, ADIO_Offset *offset_list, ADIO_Offset
                   *len_list, int *send_size, int *recv_size,
		   int *count, int *start_pos, int *partial_send,
		   int *recd_from_proc, int nprocs,
		   int myrank, int
		   buftype_is_contig, int contig_access_count,
		   ADIO_Offset min_st_offset, ADIO_Offset fd_size,
		   ADIO_Offset *fd_start, ADIO_Offset *fd_end,
		   ADIOI_Access *others_req,
                   int iter, MPI_Aint buftype_extent, int *buf_idx)
{
    int i, j, k=0, tmp=0, nprocs_recv, nprocs_send;
    char **recv_buf = NULL;
    MPI_Request *requests;
    MPI_Datatype send_type;
    MPI_Status *statuses;

    /* exchange send_size info so that each process knows how much to
       receive from whom and how much memory to allocate. */
    MPI_Alltoall(send_size, 1, MPI_INT, recv_size, 1, MPI_INT, fd->comm);

    nprocs_recv = 0;
    for (i=0; i < nprocs; i++) if (recv_size[i]) nprocs_recv++;

    nprocs_send = 0;
    for (i=0; i<nprocs; i++) if (send_size[i]) nprocs_send++;

    /* one request slot per recv followed by one per send */
    requests = (MPI_Request *)
	ADIOI_Malloc((nprocs_send+nprocs_recv+1)*sizeof(MPI_Request));
    /* +1 to avoid a 0-size malloc */

    /* post recvs. if buftype_is_contig, data can be directly recd. into
       user buf at location given by buf_idx. else use recv_buf. */
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5032, 0, NULL);
#endif

    if (buftype_is_contig) {
	j = 0;
	for (i=0; i < nprocs; i++)
	    if (recv_size[i]) {
		MPI_Irecv(((char *) buf) + buf_idx[i], recv_size[i],
		  MPI_BYTE, i, myrank+i+100*iter, fd->comm, requests+j);
		j++;
		/* advance this source's landing offset for the next round */
		buf_idx[i] += recv_size[i];
	    }
    }
    else {
	/* allocate memory for recv_buf and post receives */
	recv_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char*));
	for (i=0; i < nprocs; i++)
	    if (recv_size[i]) recv_buf[i] =
		(char *) ADIOI_Malloc(recv_size[i]);

	j = 0;
	for (i=0; i < nprocs; i++)
	    if (recv_size[i]) {
		MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i,
			  myrank+i+100*iter, fd->comm, requests+j);
		j++;
#ifdef RDCOLL_DEBUG
		DBG_FPRINTF(stderr, "node %d, recv_size %d, tag %d \n",
			myrank, recv_size[i], myrank+i+100*iter);
#endif
	    }
    }

    /* create derived datatypes and send data */
    j = 0;
    for (i=0; i<nprocs; i++) {
	if (send_size[i]) {
	    /* take care if the last off-len pair is a partial send */
	    if (partial_send[i]) {
		k = start_pos[i] + count[i] - 1;
		tmp = others_req[i].lens[k];
		others_req[i].lens[k] = partial_send[i];
	    }
	    ADIOI_Type_create_hindexed_x(count[i],
		    &(others_req[i].lens[start_pos[i]]),
		    &(others_req[i].mem_ptrs[start_pos[i]]),
		    MPI_BYTE, &send_type);
	    /* absolute displacement; use MPI_BOTTOM in send */
	    MPI_Type_commit(&send_type);
	    MPI_Isend(MPI_BOTTOM, 1, send_type, i, myrank+i+100*iter,
		      fd->comm, requests+nprocs_recv+j);
	    /* freeing here is safe: MPI keeps the datatype alive until
	       the pending Isend that uses it completes */
	    MPI_Type_free(&send_type);
	    /* restore the patched length */
	    if (partial_send[i]) others_req[i].lens[k] = tmp;
	    j++;
	}
    }

    statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send+nprocs_recv+1) *
				     sizeof(MPI_Status));
    /* +1 to avoid a 0-size malloc */

    /* wait on the receives */
    if (nprocs_recv) {
#ifdef NEEDS_MPI_TEST
	/* poll instead of blocking on implementations that need Test
	   calls to make progress */
	j = 0;
	while (!j) MPI_Testall(nprocs_recv, requests, &j, statuses);
#else
	MPI_Waitall(nprocs_recv, requests, statuses);
#endif
	/* if noncontiguous, to the copies from the recv buffers */
	if (!buftype_is_contig)
	    ADIOI_Fill_user_buffer(fd, buf, flat_buf, recv_buf,
		offset_list, len_list, (unsigned*)recv_size,
		requests, statuses, recd_from_proc,
		nprocs, contig_access_count,
		min_st_offset, fd_size, fd_start, fd_end,
		buftype_extent);
    }

    /* wait on the sends*/
    MPI_Waitall(nprocs_send, requests+nprocs_recv, statuses+nprocs_recv);

    ADIOI_Free(statuses);
    ADIOI_Free(requests);

    if (!buftype_is_contig) {
	for (i=0; i < nprocs; i++)
	    if (recv_size[i]) ADIOI_Free(recv_buf[i]);
	ADIOI_Free(recv_buf);
    }
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5033, 0, NULL);
#endif
}
/* ADIOI_PIOFS_Open - open a file on the PIOFS filesystem.
 *
 * Translates the ADIO access mode into open(2) flags, opens the file,
 * seeks to 0 to initiate 64-bit offset use, queries PIOFS striping
 * parameters via piofsioctl() and publishes them in fd->info
 * ("striping_unit", "striping_factor", "start_iodevice"), honors
 * ADIO_APPEND by seeking to EOF, and sets *error_code.
 *
 * Fixes:
 *  - default permission computation used "old_mask ^ 0666", which
 *    produces wrong bits whenever the umask contains bits outside 0666
 *    (e.g. execute bits in a 0077 or 0177 umask); use ~old_mask & 0666.
 *  - llseek() was issued before checking that open() succeeded; it is
 *    now only performed on a valid descriptor.
 */
void ADIOI_PIOFS_Open(ADIO_File fd, int *error_code)
{
    int amode, perm, old_mask, err;
    piofs_fstat_t piofs_fstat;
    char *value;
#ifndef PRINT_ERR_MSG
    static char myname[] = "ADIOI_PIOFS_OPEN";
#endif

    if (fd->perm == ADIO_PERM_NULL) {
	/* query the process umask without changing it */
	old_mask = umask(022);
	umask(old_mask);
	/* default to rw-rw-rw- minus the umask bits */
	perm = ~old_mask & 0666;
    }
    else perm = fd->perm;

    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
	amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
	amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
	amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
	amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
	amode = amode | O_EXCL;

#ifdef PROFILE
    MPE_Log_event(1, 0, "start open");
#endif
    fd->fd_sys = open(fd->filename, amode, perm);
#ifdef PROFILE
    MPE_Log_event(2, 0, "end open");
#endif

    if (fd->fd_sys != -1) {
	/* required to initiate use of 64-bit offset */
	llseek(fd->fd_sys, 0, SEEK_SET);

	value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));

	/* get file striping information and set it in info */
	err = piofsioctl(fd->fd_sys, PIOFS_FSTAT, &piofs_fstat);

	if (!err) {
	    sprintf(value, "%d", piofs_fstat.st_bsu);
	    MPI_Info_set(fd->info, "striping_unit", value);

	    sprintf(value, "%d", piofs_fstat.st_cells);
	    MPI_Info_set(fd->info, "striping_factor", value);

	    sprintf(value, "%d", piofs_fstat.st_base_node);
	    MPI_Info_set(fd->info, "start_iodevice", value);
	}
	ADIOI_Free(value);

	if (fd->access_mode & ADIO_APPEND)
	    fd->fp_ind = fd->fp_sys_posn = llseek(fd->fd_sys, 0, SEEK_END);
    }

#ifdef PRINT_ERR_MSG
    *error_code = (fd->fd_sys == -1) ? MPI_ERR_UNKNOWN : MPI_SUCCESS;
#else
    if (fd->fd_sys == -1) {
	*error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR,
			      myname, "I/O Error", "%s", strerror(errno));
	ADIOI_Error(ADIO_FILE_NULL, *error_code, myname);
    }
    else *error_code = MPI_SUCCESS;
#endif
}
/* ADIOI_P2PContigReadAggregation - point-to-point two-phase collective
 * read for contiguous per-rank requests.
 *
 * Each rank's request is the byte range [st_offsets[myrank],
 * end_offsets[myrank]]; aggregator j owns the file domain
 * [fd_start[j], fd_end[j]].  The read proceeds in rounds of
 * cb_buffer_size bytes per aggregator: used aggregators read their
 * round's slice of the file into the collective buffer (fd->io_buf) and
 * Isend pieces to the ranks whose requests overlap it, while every rank
 * posts Irecvs directly into its user buffer for the pieces it expects
 * from each overlapping aggregator this round.
 *
 * When ROMIO_GPFS pthread I/O is enabled and there is more than one
 * round, io_buf is split into two halves and a helper thread prefetches
 * the next round's file slice into one half while the current half is
 * being distributed (double buffering).
 *
 * NOTE(review): `buf` is declared const but is the *destination* of the
 * MPI_Irecv calls below (cast away via (char*)buf) — the const on this
 * read path appears to be only a signature convention; confirm.
 */
void ADIOI_P2PContigReadAggregation(ADIO_File fd,
	const void *buf,
	int *error_code,
	ADIO_Offset *st_offsets,
	ADIO_Offset *end_offsets,
	ADIO_Offset *fd_start,
	ADIO_Offset* fd_end)
{
    *error_code = MPI_SUCCESS; /* initialize to success */

#ifdef ROMIO_GPFS
    double startTimeBase,endTimeBase;
#endif

    MPI_Status status;
    pthread_t io_thread;
    void *thread_ret;
    ADIOI_IO_ThreadFuncData io_thread_args;

#ifdef ROMIO_GPFS
    startTimeBase = MPI_Wtime();
#endif

    int nprocs,myrank;
    MPI_Comm_size(fd->comm, &nprocs);
    MPI_Comm_rank(fd->comm, &myrank);

    /* this rank's requested byte range */
    ADIO_Offset myOffsetStart = st_offsets[myrank], myOffsetEnd = end_offsets[myrank];

    int myAggRank = -1; /* if I am an aggregor this is my index into fd->hints->ranklist */
    int iAmUsedAgg = 0; /* aggregator with a non-empty file domain this call */

    int naggs = fd->hints->cb_nodes;
    int coll_bufsize = fd->hints->cb_buffer_size;
#ifdef ROMIO_GPFS
    if (gpfsmpio_pthreadio == 1)
	/* share buffer between working threads */
	coll_bufsize = coll_bufsize/2;
#endif

    /* am I an aggregator, and is my file domain non-empty? */
    int j;
    for (j=0;j<naggs;j++) {
	if (fd->hints->ranklist[j] == myrank) {
	    myAggRank = j;
	    if (fd_end[j] > fd_start[j]) {
		iAmUsedAgg = 1;
	    }
	}
    }

    /* for my offset range determine how much data and from whom I need
     * to get it.  For source ag targets, also determine the source file
     * domain offsets locally to reduce communication overhead */
    int *sourceAggsForMyData = (int *)ADIOI_Malloc(naggs * sizeof(int));
    ADIO_Offset *sourceAggsForMyDataFDStart = (ADIO_Offset *)ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
    ADIO_Offset *sourceAggsForMyDataFDEnd = (ADIO_Offset *)ADIOI_Malloc(naggs * sizeof(ADIO_Offset));
    int numSourceAggs = 0;

    /* collect the aggregators whose file domains overlap my request */
    int i;
    for (i=0;i<naggs;i++) {
	if ( ((myOffsetStart >= fd_start[i]) && (myOffsetStart <= fd_end[i])) ||
		((myOffsetEnd >= fd_start[i]) && (myOffsetEnd <= fd_end[i]))) {
	    sourceAggsForMyData[numSourceAggs] = fd->hints->ranklist[i];
	    sourceAggsForMyDataFDStart[numSourceAggs] = fd_start[i];
	    sourceAggsForMyDataFDEnd[numSourceAggs] = fd_end[i];
	    numSourceAggs++;
	}
    }

    /* these 3 arrays track info on the procs that are fed from an
     * aggregtor - to sacrifice some performance at setup to save on
     * memory instead of using max size of nprocs for the arrays could
     * determine exact size first and then allocate that size */
    int *targetProcsForMyData=NULL;
    int *remainingDataAmountToSendPerProc=NULL;
    ADIO_Offset *remainingDataOffsetToSendPerProc=NULL;

    int numTargetProcs = 0;

    if (iAmUsedAgg) {
	/* for the used aggregators figure out how much data I need from
	 * what procs */

	/* count numTargetProcs so we know how large to make the arrays */
	for (i=0;i<nprocs;i++)
	    if ( ((st_offsets[i] >= fd_start[myAggRank]) &&
			(st_offsets[i] <= fd_end[myAggRank])) ||
		    ((end_offsets[i] >= fd_start[myAggRank]) &&
		     (end_offsets[i] <= fd_end[myAggRank])) )
		numTargetProcs++;

	targetProcsForMyData = (int *)ADIOI_Malloc(numTargetProcs * sizeof(int));
	remainingDataAmountToSendPerProc = (int *)ADIOI_Malloc(numTargetProcs * sizeof(int));
	remainingDataOffsetToSendPerProc = (ADIO_Offset *)ADIOI_Malloc(numTargetProcs * sizeof(ADIO_Offset));

	/* everybody has the st_offsets and end_offsets for all ranks so
	 * if I am a used aggregator go thru them and figure out which
	 * ranks have data that falls into my file domain assigned to me */
	numTargetProcs = 0;
	for (i=0;i<nprocs;i++) {
	    if ( ((st_offsets[i] >= fd_start[myAggRank]) &&
			(st_offsets[i] <= fd_end[myAggRank])) ||
		    ((end_offsets[i] >= fd_start[myAggRank]) &&
		     (end_offsets[i] <= fd_end[myAggRank]))) {
		targetProcsForMyData[numTargetProcs] = i;
		/* three overlap cases: fully inside my domain, starts
		 * inside and runs past, or starts before and ends inside */
		if ( ((st_offsets[i] >= fd_start[myAggRank]) &&
			    (st_offsets[i] <= fd_end[myAggRank])) &&
			((end_offsets[i] >= fd_start[myAggRank]) &&
			 (end_offsets[i] <= fd_end[myAggRank]))) {
		    remainingDataAmountToSendPerProc[numTargetProcs] =
			(end_offsets[i] - st_offsets[i])+1;
		    remainingDataOffsetToSendPerProc[numTargetProcs] =
			st_offsets[i];
		}
		else if ((st_offsets[i] >= fd_start[myAggRank]) &&
			(st_offsets[i] <= fd_end[myAggRank])) {
		    /* starts in this fd and goes past it */
		    remainingDataAmountToSendPerProc[numTargetProcs] =
			(fd_end[myAggRank] - st_offsets[i]) +1;
		    remainingDataOffsetToSendPerProc[numTargetProcs] =
			st_offsets[i];
		}
		else { /* starts in fd before this and ends in it */
		    remainingDataAmountToSendPerProc[numTargetProcs] =
			(end_offsets[i] - fd_start[myAggRank]) +1;
		    remainingDataOffsetToSendPerProc[numTargetProcs] =
			fd_start[myAggRank];
		}
		numTargetProcs++;
	    }
	}
    }

    MPI_Request *mpiRecvDataFromSourceAggsRequest = (MPI_Request *)
	ADIOI_Malloc(numSourceAggs * sizeof(MPI_Request));
    MPI_Request *mpiSendDataToTargetProcRequest = (MPI_Request *)
	ADIOI_Malloc(numTargetProcs * sizeof(MPI_Request));
    MPI_Status mpiWaitAnyStatusFromSourceProcs,mpiIsendStatusForData;

    /* use the two-phase buffer allocated in the file_open - no app
     * should ever be both reading and writing at the same time */
    char *read_buf0 = fd->io_buf;
    char *read_buf1 = fd->io_buf + coll_bufsize;
    /* if threaded i/o selected, we'll do a kind of double buffering */
    char *read_buf = read_buf0;

    /* compute number of rounds */
    ADIO_Offset numberOfRounds = (ADIO_Offset)((((ADIO_Offset)(end_offsets[nprocs-1]-st_offsets[0]))/((ADIO_Offset)((ADIO_Offset)coll_bufsize*(ADIO_Offset)naggs)))) + 1;

    ADIO_Offset currentRoundFDStart = 0, nextRoundFDStart = 0;
    ADIO_Offset currentRoundFDEnd = 0, nextRoundFDEnd = 0;

    if (iAmUsedAgg) {
	currentRoundFDStart = fd_start[myAggRank];
	nextRoundFDStart = fd_start[myAggRank];
    }

    /* per-round scratch arrays (sized for the worst case) */
    int *dataSizeSentThisRoundPerProc = (int *)ADIOI_Malloc(numTargetProcs * sizeof(int));
    int *sourceAggIndexesForMyDataThisRound = (int *)ADIOI_Malloc(numSourceAggs * sizeof(int));
    int *recvBufferOffsetsThisRound = (int *)ADIOI_Malloc(numSourceAggs * sizeof(int));
    int *bufferAmountsToGetThisRound = (int *)ADIOI_Malloc(numSourceAggs * sizeof(int));
    *error_code = MPI_SUCCESS;

    int currentReadBuf = 0;
    int useIOBuffer = 0;
#ifdef ROMIO_GPFS
    if (gpfsmpio_pthreadio && (numberOfRounds>1)) {
	useIOBuffer = 1;
	io_thread = pthread_self();
    }
#endif

#ifdef ROMIO_GPFS
    endTimeBase = MPI_Wtime();
    gpfsmpio_prof_cw[GPFSMPIO_CIO_T_MYREQ] += (endTimeBase-startTimeBase);
#endif

    /* each iteration of this loop reads a coll_bufsize portion of the
     * file domain */
    int roundIter;
    for (roundIter=0;roundIter<numberOfRounds;roundIter++) {

	int irecv,isend;

	/* determine what offsets define the portion of the file domain
	   the agg is reading this round */
	if (iAmUsedAgg) {
	    currentRoundFDStart = nextRoundFDStart;

	    if (!useIOBuffer || (roundIter == 0)) {
		/* synchronous read of this round's slice (always done
		 * for round 0, and for every round when not threading) */
		int amountDataToReadThisRound;
		if ((fd_end[myAggRank] - currentRoundFDStart) < coll_bufsize) {
		    currentRoundFDEnd = fd_end[myAggRank];
		    amountDataToReadThisRound = ((currentRoundFDEnd-currentRoundFDStart)+1);
		}
		else {
		    currentRoundFDEnd = currentRoundFDStart + coll_bufsize - 1;
		    amountDataToReadThisRound = coll_bufsize;
		}

		/* read currentRoundFDEnd bytes */
		ADIO_ReadContig(fd, read_buf,amountDataToReadThisRound,
			MPI_BYTE, ADIO_EXPLICIT_OFFSET, currentRoundFDStart,
			&status, error_code);
		currentReadBuf = 1;

#ifdef ROMIO_GPFS
		endTimeBase = MPI_Wtime();
#endif
	    }

	    if (useIOBuffer) { /* use the thread reader for the next round */
		/* switch back and forth between the read buffers so that
		 * the data aggregation code is diseminating 1 buffer
		 * while the thread is reading into the other */

		if (roundIter > 0)
		    currentRoundFDEnd = nextRoundFDEnd;

		if (roundIter < (numberOfRounds-1)) {
		    nextRoundFDStart += coll_bufsize;
		    int amountDataToReadNextRound;
		    if ((fd_end[myAggRank] - nextRoundFDStart) < coll_bufsize) {
			nextRoundFDEnd = fd_end[myAggRank];
			amountDataToReadNextRound = ((nextRoundFDEnd-nextRoundFDStart)+1);
		    }
		    else {
			nextRoundFDEnd = nextRoundFDStart + coll_bufsize - 1;
			amountDataToReadNextRound = coll_bufsize;
		    }

		    /* join the previous prefetch thread before reusing
		     * io_thread / io_thread_args */
		    if(!pthread_equal(io_thread, pthread_self())) {
			pthread_join(io_thread, &thread_ret);
			*error_code = *(int *)thread_ret;
			if (*error_code != MPI_SUCCESS) return;
			io_thread = pthread_self();
		    }
		    io_thread_args.fd = fd;
		    /* do a little pointer shuffling: background I/O works
		     * from one buffer while two-phase machinery fills up
		     * another */
		    if (currentReadBuf == 0) {
			io_thread_args.buf = read_buf0;
			currentReadBuf = 1;
			read_buf = read_buf1;
		    }
		    else {
			io_thread_args.buf = read_buf1;
			currentReadBuf = 0;
			read_buf = read_buf0;
		    }
		    io_thread_args.io_kind = ADIOI_READ;
		    io_thread_args.size = amountDataToReadNextRound;
		    io_thread_args.offset = nextRoundFDStart;
		    io_thread_args.status = &status;
		    io_thread_args.error_code = *error_code;
		    /* on thread-create failure, fall back to no thread;
		     * io_thread == self signals "no outstanding thread" */
		    if ( (pthread_create(&io_thread, NULL,
				    ADIOI_IO_Thread_Func, &(io_thread_args))) != 0)
			io_thread = pthread_self();
		}
		else { /* last round */
		    if(!pthread_equal(io_thread, pthread_self())) {
			pthread_join(io_thread, &thread_ret);
			*error_code = *(int *)thread_ret;
			if (*error_code != MPI_SUCCESS) return;
			io_thread = pthread_self();
		    }
		    if (currentReadBuf == 0) {
			read_buf = read_buf1;
		    }
		    else {
			read_buf = read_buf0;
		    }
		}
	    } /* useIOBuffer */
	} /* IAmUsedAgg */

	/* determine what source aggs I need to get data from this round
	 * and recv only from them */
	int numSourceAggsThisRound = 0;
	for (i=0;i<numSourceAggs;i++) {
	    if ( ((myOffsetStart >= sourceAggsForMyDataFDStart[i]) &&
			(myOffsetStart <= sourceAggsForMyDataFDEnd[i])) ||
		    ((myOffsetEnd >= sourceAggsForMyDataFDStart[i]) &&
		     (myOffsetEnd <= sourceAggsForMyDataFDEnd[i])) ) {
		/* we know that we need to get data from this source agg
		 * at some point, now need to figure out how much this
		 * round */

		/* here are the offsets currently being sent by the
		 * aggregator during this round */
		ADIO_Offset currentRoundFDStartForMySourceAgg =
		    (ADIO_Offset)((ADIO_Offset)sourceAggsForMyDataFDStart[i] +
			    (ADIO_Offset)((ADIO_Offset)roundIter*(ADIO_Offset)coll_bufsize));
		ADIO_Offset currentRoundFDEndForMySourceAgg =
		    (ADIO_Offset)((ADIO_Offset)sourceAggsForMyDataFDStart[i] +
			    (ADIO_Offset)((ADIO_Offset)(roundIter+1)*(ADIO_Offset)coll_bufsize) - (ADIO_Offset)1);
		if (currentRoundFDEndForMySourceAgg > sourceAggsForMyDataFDEnd[i])
		    currentRoundFDEndForMySourceAgg = sourceAggsForMyDataFDEnd[i];

#ifdef p2pcontigtrace
		printf("roundIter %d source iter %d sourceAggsForMyData is %d myOffsetStart is %ld myOffsetEnd is %ld sourceAggsForMyDataFDStart is %ld sourceAggsForMyDataFDEnd is %ld currentRoundFDStartForMySourceAgg is %ld currentRoundFDEndForMySourceAgg is %ld\n",roundIter,i,sourceAggsForMyData[i],myOffsetStart,myOffsetEnd,sourceAggsForMyDataFDStart[i],sourceAggsForMyDataFDEnd[i],currentRoundFDStartForMySourceAgg,currentRoundFDEndForMySourceAgg);
#endif

		/* get the portion of my data that is within
		 * currentRoundFDStartForMySourceAgg to
		 * currentRoundFDEndForMySourceAgg */
		/* find the offset into the recv buffer and the amount of
		 * data to get */
		int recvBufferOffset = 0;
		int bufferAmountToGet = 0;

		/* three overlap cases: my range starts inside the round
		 * slice (recv offset 0), ends inside it, or spans it */
		if ((myOffsetStart >= currentRoundFDStartForMySourceAgg) &&
			(myOffsetStart <= currentRoundFDEndForMySourceAgg)) {
		    if (myOffsetEnd > currentRoundFDEndForMySourceAgg)
			bufferAmountToGet = (currentRoundFDEndForMySourceAgg - myOffsetStart) +1;
		    else
			bufferAmountToGet = (myOffsetEnd - myOffsetStart) +1;
		}
		else if ((myOffsetEnd >= currentRoundFDStartForMySourceAgg) &&
			(myOffsetEnd <= currentRoundFDEndForMySourceAgg)) {
		    recvBufferOffset = (int) (currentRoundFDStartForMySourceAgg - myOffsetStart);
		    if (myOffsetEnd > currentRoundFDEndForMySourceAgg)
			bufferAmountToGet = (currentRoundFDEndForMySourceAgg - currentRoundFDStartForMySourceAgg) +1;
		    else
			bufferAmountToGet = (myOffsetEnd - currentRoundFDStartForMySourceAgg) +1;
		}
		else if ((myOffsetStart <= currentRoundFDStartForMySourceAgg) &&
			(myOffsetEnd >= currentRoundFDEndForMySourceAgg)) {
		    recvBufferOffset = (int) (currentRoundFDStartForMySourceAgg - myOffsetStart);
		    bufferAmountToGet = (currentRoundFDEndForMySourceAgg - currentRoundFDStartForMySourceAgg) +1;
		}

		if (bufferAmountToGet > 0) { /* we have data to get this round */
		    sourceAggIndexesForMyDataThisRound[numSourceAggsThisRound] = i;
		    recvBufferOffsetsThisRound[numSourceAggsThisRound] = recvBufferOffset;
		    bufferAmountsToGetThisRound[numSourceAggsThisRound] = bufferAmountToGet;
#ifdef p2pcontigtrace
		    printf("bufferAmountToGet is %d recvBufferOffset is %d\n",bufferAmountToGet,recvBufferOffset);
#endif
		    numSourceAggsThisRound++;
		}
	    }
	}

	/* the aggs determine the amount of data they will be sending to
	 * their source procs */
	for (i=0;i<numTargetProcs;i++) {
	    if ((remainingDataOffsetToSendPerProc[i] >= currentRoundFDStart) &&
		    (remainingDataOffsetToSendPerProc[i] <= currentRoundFDEnd)) {
		if ((remainingDataOffsetToSendPerProc[i] +
			    remainingDataAmountToSendPerProc[i]) <= currentRoundFDEnd)
		    dataSizeSentThisRoundPerProc[i] = remainingDataAmountToSendPerProc[i];
		else
		    dataSizeSentThisRoundPerProc[i] =
			(currentRoundFDEnd - remainingDataOffsetToSendPerProc[i]) +1;
	    }
	    else if (((remainingDataOffsetToSendPerProc[i]+
			    remainingDataAmountToSendPerProc[i]) >= currentRoundFDStart) &&
		    ((remainingDataOffsetToSendPerProc[i]+
		      remainingDataAmountToSendPerProc[i]) <= currentRoundFDEnd)) {
		if ((remainingDataOffsetToSendPerProc[i]) >= currentRoundFDStart)
		    dataSizeSentThisRoundPerProc[i] = remainingDataAmountToSendPerProc[i];
		else
		    /* NOTE(review): this expression can be non-positive
		     * in this branch (offset < currentRoundFDStart); the
		     * expected size would seem to be
		     * (offset + amount - currentRoundFDStart) — compare
		     * with the write-side counterpart and confirm. */
		    dataSizeSentThisRoundPerProc[i] =
			(remainingDataOffsetToSendPerProc[i]-currentRoundFDStart) +1;
	    }
	    else
		dataSizeSentThisRoundPerProc[i] = 0;
	}

	/* the target procs get the data from the source aggs */
	for (i = 0; i < numSourceAggsThisRound; i++) {
	    MPI_Irecv(&((char*)buf)[recvBufferOffsetsThisRound[i]],
		    bufferAmountsToGetThisRound[i],MPI_BYTE,
		    sourceAggsForMyData[sourceAggIndexesForMyDataThisRound[i]],0,fd->comm,
		    &mpiRecvDataFromSourceAggsRequest[i]);
	}

	/* the source aggs send the data to the target procs */
	int numTargetProcsSentThisRound = 0;
	for (i=0;i<numTargetProcs;i++) {
	    /* offset of proc i's slice inside read_buf is the sum of the
	     * preceding procs' slice sizes this round */
	    int currentWBOffset = 0;
	    for (j=0;j<i;j++)
		currentWBOffset += dataSizeSentThisRoundPerProc[j];

	    /* only send to target procs that will recv > 0 count data */
	    if (dataSizeSentThisRoundPerProc[i] > 0) {
		MPI_Isend(&((char*)read_buf)[currentWBOffset],
			dataSizeSentThisRoundPerProc[i],
			MPI_BYTE,targetProcsForMyData[i],0,
			fd->comm,&mpiSendDataToTargetProcRequest[numTargetProcsSentThisRound]);
		numTargetProcsSentThisRound++;
		remainingDataAmountToSendPerProc[i] -= dataSizeSentThisRoundPerProc[i];
		remainingDataOffsetToSendPerProc[i] += dataSizeSentThisRoundPerProc[i];
	    }
	}

	/* wait for the target procs to get their data */
	for (i = 0; i < numSourceAggsThisRound; i++) {
	    MPI_Waitany(numSourceAggsThisRound,mpiRecvDataFromSourceAggsRequest,
		    &irecv,&mpiWaitAnyStatusFromSourceProcs);
	}

	nextRoundFDStart = currentRoundFDStart + coll_bufsize;

	/* clean up the MPI_Isend MPI_Requests */
	for (i=0;i<numTargetProcsSentThisRound;i++) {
	    MPI_Waitany(numTargetProcsSentThisRound,mpiSendDataToTargetProcRequest,
		    &isend,&mpiIsendStatusForData);
	}

	/* need to sync up the source aggs which did the isend with the
	   target procs which did the irecvs to give the target procs
	   time to get the data before overwriting with next round
	   readcontig */
	MPI_Barrier(fd->comm);

    } /* for-loop roundIter */

    if (useIOBuffer) { /* thread reader cleanup */
	if ( !pthread_equal(io_thread, pthread_self()) ) {
	    pthread_join(io_thread, &thread_ret);
	    *error_code = *(int *)thread_ret;
	}
    }

    if (iAmUsedAgg) {
	ADIOI_Free(targetProcsForMyData);
	ADIOI_Free(remainingDataAmountToSendPerProc);
	ADIOI_Free(remainingDataOffsetToSendPerProc);
    }

    ADIOI_Free(sourceAggsForMyData);
    ADIOI_Free(sourceAggsForMyDataFDStart);
    ADIOI_Free(sourceAggsForMyDataFDEnd);
    ADIOI_Free(mpiRecvDataFromSourceAggsRequest);
    ADIOI_Free(mpiSendDataToTargetProcRequest);
    ADIOI_Free(dataSizeSentThisRoundPerProc);
    ADIOI_Free(sourceAggIndexesForMyDataThisRound);
    ADIOI_Free(recvBufferOffsetsThisRound);
    ADIOI_Free(bufferAmountsToGetThisRound);

    /* TODO: is Barrier here needed? */
    MPI_Barrier(fd->comm);

    return;
}