/* ADIOI_ZOIDFS_Open:
 *  one process opens (or creates) the file, then broadcasts the result to the
 *  remaining processors.
 *
 *  ADIO_Open used to perform an optimization when MPI_MODE_CREATE (and before
 *  that, MPI_MODE_EXCL) was set.  Because ZoidFS handles file lookup and
 *  creation more scalably than traditional file systems, ADIO_Open now skips
 *  any special handling when CREATE is set. */
void ADIOI_ZOIDFS_Open(ADIO_File fd, int *error_code)
{
    int rank;
    static char myname[] = "ADIOI_ZOIDFS_OPEN";
    ADIOI_ZOIDFS_object *zoidfs_obj_ptr;

    /* since one process is doing the open, that means one process is also
     * doing the error checking.  define a struct for both the object
     * reference and the error code to broadcast to all the processors */
    open_status o_status;
    MPI_Datatype open_status_type;
    MPI_Datatype types[2] = {MPI_INT, MPI_BYTE};
    int lens[2] = {1, sizeof(ADIOI_ZOIDFS_object)};
    MPI_Aint offsets[2];

    memset(&o_status, 0, sizeof(o_status));
    zoidfs_obj_ptr = (ADIOI_ZOIDFS_object *)
        ADIOI_Malloc(sizeof(ADIOI_ZOIDFS_object));
    /* --BEGIN ERROR HANDLING-- */
    if (zoidfs_obj_ptr == NULL) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_UNKNOWN,
                                           "Error allocating memory", 0);
        return;
    }
    /* --END ERROR HANDLING-- */

    MPI_Comm_rank(fd->comm, &rank);

    ADIOI_ZOIDFS_Init(rank, error_code);
    if (*error_code != MPI_SUCCESS) {
        /* ADIOI_ZOIDFS_INIT handles creating error codes on its own */
        ADIOI_Free(zoidfs_obj_ptr);
        return;
    }

    /* one process resolves name and will later bcast to others */
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    if (rank == fd->hints->ranklist[0] && fd->fs_ptr == NULL) {
        fake_an_open(fd->filename, fd->access_mode,
                     fd->hints->striping_factor,
                     fd->hints->striping_unit,
                     zoidfs_obj_ptr, &o_status);
        /* store credentials and object reference in fd */
        *zoidfs_obj_ptr = o_status.handle;
        fd->fs_ptr = zoidfs_obj_ptr;
    }
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif

    /* broadcast status and (possibly valid) object reference */
    MPI_Get_address(&o_status.error, &offsets[0]);
    MPI_Get_address(&o_status.handle, &offsets[1]);

    MPI_Type_struct(2, lens, offsets, types, &open_status_type);
    MPI_Type_commit(&open_status_type);

    /* Assertion: if we hit this Bcast, then all processes collectively
     * called this open.
     *
     * That's because deferred open never happens with this fs. */
    MPI_Bcast(MPI_BOTTOM, 1, open_status_type, fd->hints->ranklist[0],
              fd->comm);
    MPI_Type_free(&open_status_type);

    /* --BEGIN ERROR HANDLING-- */
    if (o_status.error != ZFS_OK) {
        ADIOI_Free(zoidfs_obj_ptr);
        fd->fs_ptr = NULL;
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           ADIOI_ZOIDFS_error_convert(o_status.error),
                                           "Unknown error", 0);
        /* TODO: FIX STRING */
        return;
    }
    /* --END ERROR HANDLING-- */

    *zoidfs_obj_ptr = o_status.handle;
    fd->fs_ptr = zoidfs_obj_ptr;

    *error_code = MPI_SUCCESS;
    return;
}
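
/* The broadcast above packs the error code and the opaque file handle into a
 * single message: the datatype is built from absolute addresses, so the
 * buffer argument to MPI_Bcast is MPI_BOTTOM.  A minimal, self-contained
 * sketch of that technique follows; the payload type fake_handle_t and the
 * helper name bcast_open_status are illustrative stand-ins, not part of the
 * ZoidFS or ROMIO code. */
#include <mpi.h>

typedef struct {
    char bytes[32];             /* stand-in for an opaque fs handle */
} fake_handle_t;

static void bcast_open_status(MPI_Comm comm, int root,
                              int *error, fake_handle_t *handle)
{
    MPI_Datatype types[2] = {MPI_INT, MPI_BYTE};
    int lens[2] = {1, sizeof(fake_handle_t)};
    MPI_Aint offsets[2];
    MPI_Datatype status_type;

    /* absolute displacements: each rank records where its own copies live */
    MPI_Get_address(error, &offsets[0]);
    MPI_Get_address(handle, &offsets[1]);

    MPI_Type_create_struct(2, lens, offsets, types, &status_type);
    MPI_Type_commit(&status_type);

    /* with absolute displacements the data are addressed relative to
     * MPI_BOTTOM, so error and handle need not be contiguous in memory */
    MPI_Bcast(MPI_BOTTOM, 1, status_type, root, comm);

    MPI_Type_free(&status_type);
}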
/*
 *  file_open_pvfs2: This is the same strategy as ROMIO's pvfs2 open
 *
 *  Function:   - opens a new file
 *  Accepts:    - same arguments as MPI_File_open()
 *  Returns:    - Success if new file handle
 */
int
mca_fs_pvfs2_file_open (struct ompi_communicator_t *comm,
                        const char* filename,
                        int access_mode,
                        struct ompi_info_t *info,
                        mca_io_ompio_file_t *fh)
{
    int ret;
    mca_fs_pvfs2 *pvfs2_fs;
    PVFS_fs_id pvfs2_id;
    char pvfs2_path[OMPIO_MAX_NAME] = {0};
    char *ncache_timeout;
    open_status o_status = {0, {0, 0}};
    struct ompi_datatype_t *open_status_type;
    struct ompi_datatype_t *types[2] = {&ompi_mpi_int.dt, &ompi_mpi_byte.dt};
    int lens[2] = {1, sizeof(PVFS_object_ref)};
    OPAL_PTRDIFF_TYPE offsets[2];
    char char_stripe[MPI_MAX_INFO_VAL];  /* was MPI_MAX_INFO_KEY; the buffer
                                            holds an info *value* and
                                            ompi_info_get below is allowed to
                                            copy up to MPI_MAX_INFO_VAL bytes */
    int flag;
    int fs_pvfs2_stripe_size = -1;
    int fs_pvfs2_stripe_width = -1;

    /* We are going to do what ROMIO does with one process resolving
     * the name and broadcasting to others */

    pvfs2_fs = (mca_fs_pvfs2 *) malloc(sizeof(mca_fs_pvfs2));
    if (NULL == pvfs2_fs) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (!mca_fs_pvfs2_IS_INITIALIZED) {
        /* disable the pvfs2 ncache */
        ncache_timeout = getenv("PVFS2_NCACHE_TIMEOUT");
        if (ncache_timeout == NULL) {
            setenv("PVFS2_NCACHE_TIMEOUT", "0", 1);
        }
        ret = PVFS_util_init_defaults();
        if (ret < 0) {
            PVFS_perror("PVFS_util_init_defaults", ret);
            free(pvfs2_fs);     /* don't leak the allocation made above */
            return OMPI_ERROR;
        }
        mca_fs_pvfs2_IS_INITIALIZED = 1;
    }

    memset(&(pvfs2_fs->credentials), 0, sizeof(PVFS_credentials));
    PVFS_util_gen_credentials(&(pvfs2_fs->credentials));

    /* check for stripe size and stripe depth in the info object and
       update mca_fs_pvfs2_stripe_width and mca_fs_pvfs2_stripe_size
       before calling fake_an_open() */

    ompi_info_get (info, "stripe_size", MPI_MAX_INFO_VAL, char_stripe, &flag);
    if ( flag ) {
        sscanf ( char_stripe, "%d", &fs_pvfs2_stripe_size );
    }

    ompi_info_get (info, "stripe_width", MPI_MAX_INFO_VAL, char_stripe, &flag);
    if ( flag ) {
        sscanf ( char_stripe, "%d", &fs_pvfs2_stripe_width );
    }

    if (fs_pvfs2_stripe_size < 0) {
        fs_pvfs2_stripe_size = mca_fs_pvfs2_stripe_size;
    }

    if (fs_pvfs2_stripe_width < 0) {
        fs_pvfs2_stripe_width = mca_fs_pvfs2_stripe_width;
    }

    if (OMPIO_ROOT == fh->f_rank) {
        ret = PVFS_util_resolve(filename, &pvfs2_id, pvfs2_path,
                                OMPIO_MAX_NAME);
        if (ret < 0) {
            PVFS_perror("PVFS_util_resolve", ret);
            o_status.error = -1;
        }
        else {
            fake_an_open (pvfs2_id,
                          pvfs2_path,
                          access_mode,
                          fs_pvfs2_stripe_width,
                          (PVFS_size)fs_pvfs2_stripe_size,
                          pvfs2_fs,
                          &o_status);
        }
        pvfs2_fs->object_ref = o_status.object_ref;
        fh->f_fs_ptr = pvfs2_fs;
    }

    /* broadcast status and (possibly valid) object reference */
    offsets[0] = (MPI_Aint)(&o_status.error);
    offsets[1] = (MPI_Aint)(&o_status.object_ref);

    ompi_datatype_create_struct (2, lens, offsets, types, &open_status_type);
    ompi_datatype_commit (&open_status_type);

    fh->f_comm->c_coll.coll_bcast (MPI_BOTTOM,
                                   1,
                                   open_status_type,
                                   OMPIO_ROOT,
                                   fh->f_comm,
                                   fh->f_comm->c_coll.coll_bcast_module);

    ompi_datatype_destroy (&open_status_type);

    if (o_status.error != 0) {
        /* No need to free the pvfs2_fs structure, since it will
           be deallocated in file_close in case of an error */
        fh->f_fs_ptr = NULL;
        return OMPI_ERROR;
    }

    pvfs2_fs->object_ref = o_status.object_ref;
    fh->f_fs_ptr = pvfs2_fs;

    /* update the internal ompio structure to store stripe size and
       stripe depth correctly.
       TODO (Hadi): read the stripe size and stripe depth from the
       file itself instead of relying on the hints */

    if (fs_pvfs2_stripe_size > 0 && fs_pvfs2_stripe_width > 0) {
        fh->f_stripe_size  = fs_pvfs2_stripe_size;
        fh->f_stripe_count = fs_pvfs2_stripe_width;
    }

    return OMPI_SUCCESS;
}
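
/* The function above reads the "stripe_size" and "stripe_width" keys from the
 * info object before creating the file.  A minimal caller-side sketch of how
 * an application could pass those hints through MPI_File_open follows; the
 * file path is a placeholder for a location inside a PVFS2 mount, and whether
 * the hints take effect depends on the file system the path resolves to. */
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Info info;
    MPI_File fh;

    MPI_Init(&argc, &argv);

    MPI_Info_create(&info);
    MPI_Info_set(info, "stripe_size", "1048576");  /* 1 MiB per stripe       */
    MPI_Info_set(info, "stripe_width", "4");       /* stripe over 4 servers  */

    /* placeholder path inside a PVFS2 mount point */
    MPI_File_open(MPI_COMM_WORLD, "/mnt/pvfs2/datafile",
                  MPI_MODE_CREATE | MPI_MODE_WRONLY, info, &fh);

    MPI_File_close(&fh);
    MPI_Info_free(&info);
    MPI_Finalize();
    return 0;
}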
/* ADIOI_PVFS2_Open:
 *  one process opens (or creates) the file, then broadcasts the result to the
 *  remaining processors.
 *
 *  ADIO_Open used to perform an optimization when MPI_MODE_CREATE (and before
 *  that, MPI_MODE_EXCL) was set.  Because PVFS2 handles file lookup and
 *  creation more scalably than other file systems, ADIO_Open now skips any
 *  special handling when CREATE is set. */
void ADIOI_PVFS2_Open(ADIO_File fd, int *error_code)
{
    int rank, ret;
    PVFS_fs_id cur_fs;
    static char myname[] = "ADIOI_PVFS2_OPEN";
    char pvfs_path[PVFS_NAME_MAX] = {0};
    ADIOI_PVFS2_fs *pvfs2_fs;

    /* since one process is doing the open, that means one process is also
     * doing the error checking.  define a struct for both the object
     * reference and the error code to broadcast to all the processors */
    open_status o_status = {0, {0, 0}};
    MPI_Datatype open_status_type;
    MPI_Datatype types[2] = {MPI_INT, MPI_BYTE};
    int lens[2] = {1, sizeof(PVFS_object_ref)};
    MPI_Aint offsets[2];

    pvfs2_fs = (ADIOI_PVFS2_fs *) ADIOI_Malloc(sizeof(ADIOI_PVFS2_fs));

    /* --BEGIN ERROR HANDLING-- */
    if (pvfs2_fs == NULL) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_UNKNOWN,
                                           "Error allocating memory", 0);
        return;
    }
    /* --END ERROR HANDLING-- */

    MPI_Comm_rank(fd->comm, &rank);

    ADIOI_PVFS2_Init(error_code);
    if (*error_code != MPI_SUCCESS) {
        /* ADIOI_PVFS2_INIT handles creating error codes on its own */
        ADIOI_Free(pvfs2_fs);   /* don't leak the allocation made above */
        return;
    }

    /* currently everyone gets their own credentials */
    ADIOI_PVFS2_makecredentials(&(pvfs2_fs->credentials));

    /* one process resolves name and will later bcast to others */
    if (rank == fd->hints->ranklist[0] && fd->fs_ptr == NULL) {
        /* given the filename, figure out which pvfs filesystem it is on */
        ret = PVFS_util_resolve(fd->filename, &cur_fs,
                                pvfs_path, PVFS_NAME_MAX);
        if (ret < 0) {
            PVFS_perror("PVFS_util_resolve", ret);
            /* TODO: pick a good error for this */
            o_status.error = -1;
        } else {
            fake_an_open(cur_fs, pvfs_path,
                         fd->access_mode,
                         fd->hints->striping_factor,
                         fd->hints->striping_unit,
                         pvfs2_fs, &o_status);
        }

        /* store credentials and object reference in fd */
        pvfs2_fs->object_ref = o_status.object_ref;
        fd->fs_ptr = pvfs2_fs;
    }

    /* broadcast status and (possibly valid) object reference */
    MPI_Address(&o_status.error, &offsets[0]);
    MPI_Address(&o_status.object_ref, &offsets[1]);

    MPI_Type_struct(2, lens, offsets, types, &open_status_type);
    MPI_Type_commit(&open_status_type);

    /* Assertion: if we hit this Bcast, then all processes collectively
     * called this open.
     *
     * That's because deferred open never happens with PVFS2. */
    MPI_Bcast(MPI_BOTTOM, 1, open_status_type, fd->hints->ranklist[0],
              fd->comm);
    MPI_Type_free(&open_status_type);

    /* --BEGIN ERROR HANDLING-- */
    if (o_status.error != 0) {
        ADIOI_Free(pvfs2_fs);
        fd->fs_ptr = NULL;      /* rank 0 may have cached the freed pointer */
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           ADIOI_PVFS2_error_convert(o_status.error),
                                           "Unknown error", 0);
        /* TODO: FIX STRING */
        return;
    }
    /* --END ERROR HANDLING-- */

    pvfs2_fs->object_ref = o_status.object_ref;
    fd->fs_ptr = pvfs2_fs;

    *error_code = MPI_SUCCESS;
    return;
}
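
/* Portability note: MPI_Address and MPI_Type_struct, used above, were
 * deprecated in MPI-2.0 and removed from the standard in MPI-3.0 (many
 * implementations still ship them for compatibility).  The ZoidFS variant of
 * this routine already uses MPI_Get_address; converting this function is a
 * drop-in change:
 *
 *     MPI_Get_address(&o_status.error, &offsets[0]);
 *     MPI_Get_address(&o_status.object_ref, &offsets[1]);
 *     MPI_Type_create_struct(2, lens, offsets, types, &open_status_type);
 */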