MPI_File ADIO_Open(MPI_Comm orig_comm, MPI_Comm comm, const char *filename, int file_system, ADIOI_Fns *ops, int access_mode, ADIO_Offset disp, MPI_Datatype etype, MPI_Datatype filetype, MPI_Info info, int perm, int *error_code) { MPI_File mpi_fh; ADIO_File fd; int err, rank, procs; static char myname[] = "ADIO_OPEN"; int max_error_code; MPI_Info dupinfo; int syshints_processed, can_skip; char *p; *error_code = MPI_SUCCESS; /* obtain MPI_File handle */ mpi_fh = MPIO_File_create(sizeof(struct ADIOI_FileD)); if (mpi_fh == MPI_FILE_NULL) { fd = MPI_FILE_NULL; *error_code = MPIO_Err_create_code(*error_code, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_OTHER, "**nomem2",0); goto fn_exit; } fd = MPIO_File_resolve(mpi_fh); fd->cookie = ADIOI_FILE_COOKIE; fd->fp_ind = disp; fd->fp_sys_posn = 0; fd->comm = comm; /* dup'ed in MPI_File_open */ fd->filename = ADIOI_Strdup(filename); fd->file_system = file_system; fd->fs_ptr = NULL; fd->fns = ops; fd->disp = disp; fd->split_coll_count = 0; fd->shared_fp_fd = ADIO_FILE_NULL; fd->atomicity = 0; fd->etype = etype; /* MPI_BYTE by default */ fd->filetype = filetype; /* MPI_BYTE by default */ fd->etype_size = 1; /* default etype is MPI_BYTE */ fd->file_realm_st_offs = NULL; fd->file_realm_types = NULL; fd->perm = perm; fd->async_count = 0; fd->fortran_handle = -1; fd->err_handler = ADIOI_DFLT_ERR_HANDLER; fd->io_buf_window = MPI_WIN_NULL; fd->io_buf_put_amounts_window = MPI_WIN_NULL; MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &procs); /* create and initialize info object */ fd->hints = (ADIOI_Hints *)ADIOI_Calloc(1, sizeof(struct ADIOI_Hints_struct)); if (fd->hints == NULL) { *error_code = MPIO_Err_create_code(*error_code, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_OTHER, "**nomem2",0); goto fn_exit; } fd->hints->cb_config_list = NULL; fd->hints->ranklist = NULL; fd->hints->initialized = 0; fd->info = MPI_INFO_NULL; /* move system-wide hint processing *back* into open, but this time the * hintfile reader will do a scalable read-and-broadcast. The global * ADIOI_syshints will get initialized at first open. subsequent open * calls will just use result from first open. * * We have two goals here: * 1: avoid processing the hintfile multiple times * 2: have all processes participate in hintfile processing (so we can read-and-broadcast) * * a code might do an "initialize from 0", so we can only skip hint * processing once everyone has particpiated in hint processing */ if (ADIOI_syshints == MPI_INFO_NULL) syshints_processed = 0; else syshints_processed = 1; MPI_Allreduce(&syshints_processed, &can_skip, 1, MPI_INT, MPI_MIN, fd->comm); if (!can_skip) { if (ADIOI_syshints == MPI_INFO_NULL) MPI_Info_create(&ADIOI_syshints); ADIOI_process_system_hints(fd, ADIOI_syshints); } ADIOI_incorporate_system_hints(info, ADIOI_syshints, &dupinfo); ADIO_SetInfo(fd, dupinfo, &err); if (dupinfo != MPI_INFO_NULL) { *error_code = MPI_Info_free(&dupinfo); if (*error_code != MPI_SUCCESS) goto fn_exit; } ADIOI_Info_set(fd->info, "romio_filesystem_type", fd->fns->fsname); /* Instead of repeatedly allocating this buffer in collective read/write, * allocating up-front might make memory management on small platforms * (e.g. Blue Gene) more efficent */ fd->io_buf = ADIOI_Malloc(fd->hints->cb_buffer_size); /* deferred open: * we can only do this optimization if 'fd->hints->deferred_open' is set * (which means the user hinted 'no_indep_rw' and collective buffering). * Furthermore, we only do this if our collective read/write routines use * our generic function, and not an fs-specific routine (we can defer opens * only if we use our aggreagation code). */ if (fd->hints->deferred_open && !(uses_generic_read(fd) \ && uses_generic_write(fd))) { fd->hints->deferred_open = 0; } if (ADIO_Feature(fd, ADIO_SCALABLE_OPEN)) /* disable deferred open on these fs so that scalable broadcast * will always use the propper communicator */ fd->hints->deferred_open = 0; /* on BlueGene, the cb_config_list is built when hints are processed. No * one else does that right now */ if (fd->hints->ranklist == NULL) { build_cb_config_list(fd, orig_comm, comm, rank, procs, error_code); if (*error_code != MPI_SUCCESS) goto fn_exit; } fd->is_open = 0; fd->my_cb_nodes_index = -2; fd->is_agg = is_aggregator(rank, fd); /* deferred open used to split the communicator to create an "aggregator * communicator", but we only used it as a way to indicate that deferred * open happened. fd->is_open and fd->is_agg are sufficient */ /* actual opens start here */ /* generic open: one process opens to create the file, all others open */ /* nfs open: everybody opens or else you'll end up with "file not found" * due to stupid nfs consistency semantics */ /* scalable open: one process opens and broadcasts results to everyone */ ADIOI_OpenColl(fd, rank, access_mode, error_code); /* for debugging, it can be helpful to see the hints selected. Some file * systes set up the hints in the open call (e.g. lustre) */ p = getenv("ROMIO_PRINT_HINTS"); if (rank == 0 && p != NULL ) { ADIOI_Info_print_keyvals(fd->info); } fn_exit: MPI_Allreduce(error_code, &max_error_code, 1, MPI_INT, MPI_MAX, comm); if (max_error_code != MPI_SUCCESS) { /* If the file was successfully opened, close it */ if (*error_code == MPI_SUCCESS) { /* in the deferred open case, only those who have actually opened the file should close it */ if (fd->hints->deferred_open) { if (fd->is_agg) { (*(fd->fns->ADIOI_xxx_Close))(fd, error_code); } } else { (*(fd->fns->ADIOI_xxx_Close))(fd, error_code); } } ADIOI_Free(fd->filename); ADIOI_Free(fd->hints->ranklist); ADIOI_Free(fd->hints->cb_config_list); ADIOI_Free(fd->hints); if (fd->info != MPI_INFO_NULL) MPI_Info_free(&(fd->info)); ADIOI_Free(fd->io_buf); ADIOI_Free(fd); fd = ADIO_FILE_NULL; if (*error_code == MPI_SUCCESS) { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**oremote_fail", 0); } } return fd; }
void ADIO_Init(int *argc, char ***argv, int *error_code) { #if defined(ROMIO_XFS) || defined(ROMIO_LUSTRE) char *c; #endif ADIOI_UNREFERENCED_ARG(argc); ADIOI_UNREFERENCED_ARG(argv); /* initialize the linked list containing flattened datatypes */ ADIOI_Flatlist = (ADIOI_Flatlist_node *) ADIOI_Malloc(sizeof(ADIOI_Flatlist_node)); ADIOI_Flatlist->type = MPI_DATATYPE_NULL; ADIOI_Flatlist->next = NULL; ADIOI_Flatlist->blocklens = NULL; ADIOI_Flatlist->indices = NULL; #if defined(ROMIO_XFS) || defined(ROMIO_LUSTRE) c = getenv("MPIO_DIRECT_READ"); if (c && (!strcmp(c, "true") || !strcmp(c, "TRUE"))) ADIOI_Direct_read = 1; else ADIOI_Direct_read = 0; c = getenv("MPIO_DIRECT_WRITE"); if (c && (!strcmp(c, "true") || !strcmp(c, "TRUE"))) ADIOI_Direct_write = 1; else ADIOI_Direct_write = 0; #endif /* Assume system-wide hints won't change between runs: move hint processing * from ADIO_Open to here */ /* FIXME should be checking error code from MPI_Info_create here */ MPI_Info_create(&ADIOI_syshints); ADIOI_process_system_hints(ADIOI_syshints); #ifdef ADIOI_MPE_LOGGING { MPE_Log_get_state_eventIDs( &ADIOI_MPE_open_a, &ADIOI_MPE_open_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_read_a, &ADIOI_MPE_read_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_write_a, &ADIOI_MPE_write_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_lseek_a, &ADIOI_MPE_lseek_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_close_a, &ADIOI_MPE_close_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_writelock_a, &ADIOI_MPE_writelock_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_readlock_a, &ADIOI_MPE_readlock_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_unlock_a, &ADIOI_MPE_unlock_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_postwrite_a, &ADIOI_MPE_postwrite_b ); MPE_Log_get_state_eventIDs( &ADIOI_MPE_openinternal_a, &ADIOI_MPE_openinternal_b); MPE_Log_get_state_eventIDs( &ADIOI_MPE_stat_a, &ADIOI_MPE_stat_b); MPE_Log_get_state_eventIDs( &ADIOI_MPE_iread_a, &ADIOI_MPE_iread_b); MPE_Log_get_state_eventIDs( &ADIOI_MPE_iwrite_a, &ADIOI_MPE_iwrite_b); int comm_world_rank; MPI_Comm_rank( MPI_COMM_WORLD, &comm_world_rank ); if ( comm_world_rank == 0 ) { MPE_Describe_state( ADIOI_MPE_open_a, ADIOI_MPE_open_b, "open", "orange" ); MPE_Describe_state( ADIOI_MPE_read_a, ADIOI_MPE_read_b, "read", "green" ); MPE_Describe_state( ADIOI_MPE_write_a, ADIOI_MPE_write_b, "write", "blue" ); MPE_Describe_state( ADIOI_MPE_lseek_a, ADIOI_MPE_lseek_b, "lseek", "red" ); MPE_Describe_state( ADIOI_MPE_close_a, ADIOI_MPE_close_b, "close", "grey" ); MPE_Describe_state( ADIOI_MPE_writelock_a, ADIOI_MPE_writelock_b, "writelock", "plum" ); MPE_Describe_state( ADIOI_MPE_readlock_a, ADIOI_MPE_readlock_b, "readlock", "magenta" ); MPE_Describe_state( ADIOI_MPE_unlock_a, ADIOI_MPE_unlock_b, "unlock", "purple" ); MPE_Describe_state( ADIOI_MPE_postwrite_a, ADIOI_MPE_postwrite_b, "postwrite", "ivory" ); MPE_Describe_state( ADIOI_MPE_openinternal_a, ADIOI_MPE_openinternal_b, "open system", "blue"); MPE_Describe_state( ADIOI_MPE_stat_a, ADIOI_MPE_stat_b, "stat", "purple"); MPE_Describe_state( ADIOI_MPE_iread_a, ADIOI_MPE_iread_b, "iread", "purple"); MPE_Describe_state( ADIOI_MPE_iwrite_a, ADIOI_MPE_iwrite_b, "iwrite", "purple"); } } #endif *error_code = MPI_SUCCESS; MPI_Op_create(my_consensus, 1, &ADIO_same_amode); }