static void gpfs_free_all_locks(int fd) { int rc; struct { gpfsFcntlHeader_t header; gpfsFreeRange_t release; } release_all; release_all.header.totalLength = sizeof(release_all); release_all.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION; release_all.header.fcntlReserved = 0; release_all.release.structLen = sizeof(release_all.release); release_all.release.structType = GPFS_FREE_RANGE; release_all.release.start = 0; release_all.release.length = 0; rc = gpfs_fcntl(fd, &release_all); if (rc != 0) { DBGV_FPRINTF(stderr,"GPFS fcntl release failed with rc=%d, errno=%d\n", rc,errno); } }
/*
 * scaleable_stat - collect block size and file system type on one rank and
 * share them with the other ranks.
 *
 * The rank equal to fd->hints->ranklist[0] calls stat64() and statfs() on
 * fd->filename (falling back to statfs() on the parent directory when the
 * file itself cannot be queried).  Unless fd->comm is MPI_COMM_SELF, the two
 * values are then broadcast over fd->agg_comm when it is not MPI_COMM_NULL,
 * otherwise over fd->comm.
 *
 * Results:
 *   - ((ADIOI_BG_fs*)fd->fs_ptr)->blksize receives the block size.
 *   - ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr is set to
 *     ADIOI_BG_FSYNC_AGGREGATION_ENABLED when the magic number is
 *     GPFS_SUPER_MAGIC or bglocklessmpio_f_type; the rank equal to
 *     fd->hints->ranklist[0] additionally gets ADIOI_BG_FSYNC_AGGREGATOR.
 *     For any other file system the field is left untouched.
 *
 * No error is reported to the caller; failures are only traced through
 * DBGV_FPRINTF and replaced by the fallback values described below.
 *
 * NOTE(review): 'rank' is obtained from fd->comm, while the broadcast root
 * fd->hints->ranklist[0] may be interpreted in fd->agg_comm; confirm that
 * the rank numbering agrees between the two communicators.
 */
static void scaleable_stat(ADIO_File fd)
{
    struct stat64 bg_stat;
    struct statfs bg_statfs;
    int rank, rc;
    char *dir;
    /* buf[0] = block size, buf[1] = file system magic number.
     * Both start from defined fallbacks (1M block size, bogus magic) so
     * that a failed stat64(), or a rank that neither stats nor receives a
     * broadcast, never reads an indeterminate value further down. */
    long buf[2] = { 1048576, -1 };

    MPI_Comm_rank(fd->comm, &rank);

    if (rank == fd->hints->ranklist[0]) {
        /* Get the (real) underlying file system block size */
        rc = stat64(fd->filename, &bg_stat);
        if (rc >= 0) {
            buf[0] = bg_stat.st_blksize;
            DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
                         fd->filename,(long)bg_stat.st_blksize);
        } else {
            /* buf[0] keeps its fallback value */
            DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
                         fd->filename,rc,errno);
        }

        /* Get the (real) underlying file system type so we can
         * plan our fsync scaling strategy */
        rc = statfs(fd->filename,&bg_statfs);
        if (rc >= 0) {
            DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#lX\n",
                         fd->filename,(unsigned long)bg_statfs.f_type);
            buf[1] = bg_statfs.f_type;
        } else {
            DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",
                         fd->filename,rc,errno);

            /* the file itself could not be queried: try its parent directory */
            ADIO_FileSysType_parentdir(fd->filename, &dir);
            rc = statfs(dir,&bg_statfs);
            if (rc >= 0) {
                DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#lX\n",dir,(unsigned long)bg_statfs.f_type);
                buf[1] = bg_statfs.f_type;
            } else {
                /* Hmm.  Guess we'll assume the worst-case, that it's not GPFS
                 * or BGLOCKLESSMPIO_F_TYPE (default PVFS2) below */
                buf[1] = -1; /* bogus magic number */
                DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",dir,rc,errno);
            }
            free(dir);
        }
    }

    /* now we can broadcast the stat/statfs data to everyone else */
    if (fd->comm != MPI_COMM_SELF) { /* if indep open, there's no one to talk to */
        if (fd->agg_comm != MPI_COMM_NULL) {
            /* deferred open: only a subset of processes participate */
            MPI_Bcast(buf, 2, MPI_LONG, fd->hints->ranklist[0], fd->agg_comm);
        } else {
            MPI_Bcast(buf, 2, MPI_LONG, fd->hints->ranklist[0], fd->comm);
        }
    }
    bg_stat.st_blksize = buf[0];
    bg_statfs.f_type = buf[1];

    /* data from stat64 */
    /* store the blksize in the file system specific storage */
    ((ADIOI_BG_fs*)fd->fs_ptr)->blksize = bg_stat.st_blksize;

    /* data from statfs */
    if ((bg_statfs.f_type == GPFS_SUPER_MAGIC) ||
        (bg_statfs.f_type == bglocklessmpio_f_type)) {
        ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr =
            ADIOI_BG_FSYNC_AGGREGATION_ENABLED;

        /* Only one rank is an "fsync aggregator" because only one
         * fsync is needed; every other rank has aggregation enabled but
         * is not an aggregator. */
        if (rank == fd->hints->ranklist[0]) {
            ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr |=
                ADIOI_BG_FSYNC_AGGREGATOR;
            DBG_FPRINTF(stderr,"fsync aggregator %d\n",rank);
        }
    }
    /* Other filesystems default to no fsync aggregation */
}
/*
 * ADIOI_GPFS_Open - open fd->filename and fill in the system-level fields
 * of 'fd'.
 *
 * fd:         ADIO file handle; filename, perm, access_mode, comm and hints
 *             are read.  On return fd_sys holds the descriptor returned by
 *             open() (or -1), fd_direct is -1, and null_fd is a descriptor
 *             for "/dev/null" when gpfsmpio_devnullio == 1, otherwise -1.
 *             On a successful open, blksize is set to 1M and then, on the
 *             rank equal to fd->hints->ranklist[0] (or when fd->comm is
 *             MPI_COMM_SELF), replaced by st_blksize from stat64().
 *             With ADIO_APPEND, fp_ind and fp_sys_posn are set to the
 *             result of seeking to the end of the file.
 * error_code: receives MPI_SUCCESS, or the code produced by
 *             ADIOI_Err_create_code() from the errno of the failed open().
 *
 * When HAVE_GPFS_FCNTL_H is defined and the environment variable
 * ROMIO_GPFS_FREE_LOCKS is set, gpfs_free_all_locks() is called on the new
 * descriptor.
 */
void ADIOI_GPFS_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode, rank, rc;
    int open_errno;
    static char myname[] = "ADIOI_GPFS_OPEN";

    /* set internal variables for tuning environment variables */
    ad_gpfs_get_env_vars();

    if (fd->perm == ADIO_PERM_NULL) {
        /* umask() can only be read by setting it: set, then restore */
        old_mask = umask(022);
        umask(old_mask);
        perm = old_mask ^ 0666;
    } else {
        perm = fd->perm;
    }

    amode = 0;
    if (fd->access_mode & ADIO_CREATE) {
        amode = amode | O_CREAT;
    }
    if (fd->access_mode & ADIO_RDONLY) {
        amode = amode | O_RDONLY;
    }
    if (fd->access_mode & ADIO_WRONLY) {
        amode = amode | O_WRONLY;
    }
    if (fd->access_mode & ADIO_RDWR) {
        amode = amode | O_RDWR;
    }
    if (fd->access_mode & ADIO_EXCL) {
        amode = amode | O_EXCL;
    }

#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    fd->fd_sys = open(fd->filename, amode, perm);
    /* Capture errno right away: the logging calls and the "/dev/null"
     * open below may overwrite it before the error code is built. */
    open_errno = errno;
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif
    DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,open_errno);
    fd->fd_direct = -1;

    if (gpfsmpio_devnullio == 1) {
        fd->null_fd = open("/dev/null", O_RDWR);
    } else {
        fd->null_fd = -1;
    }

    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND)) {
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
    }

    if (fd->fd_sys != -1) {
        fd->blksize = 1048576; /* default to 1M */

#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
#endif
        /* in this fs-specific routine, we might not be called over entire
         * communicator (deferred open).  Collect statistics on one process.
         * ADIOI_GEN_Opencoll (common-code caller) will take care of the
         * broadcast */

        MPI_Comm_rank(fd->comm, &rank);
        if ((rank == fd->hints->ranklist[0]) || (fd->comm == MPI_COMM_SELF)) {
            struct stat64 gpfs_statbuf;

            /* Get the (real) underlying file system block size */
            rc = stat64(fd->filename, &gpfs_statbuf);
            if (rc >= 0) {
                fd->blksize = gpfs_statbuf.st_blksize;
                DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
                             fd->filename,(long)gpfs_statbuf.st_blksize);
            } else {
                /* keep the 1M default set above */
                DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
                             fd->filename,rc,errno);
            }
        }
        /* all other ranks have incorrect fd->blocksize, but ADIOI_GEN_Opencoll
         * will take care of that in both standard and deferred-open case */

#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
#endif

#ifdef HAVE_GPFS_FCNTL_H
        /* in parallel workload, might be helpful to immediately release block
         * tokens.  Or, system call overhead will outweigh any benefits... */
        if (getenv("ROMIO_GPFS_FREE_LOCKS") != NULL) {
            gpfs_free_all_locks(fd->fd_sys);
        }
#endif
    }

    if (fd->fd_sys == -1) {
        /* report the errno of the failed open(), not of any later call */
        *error_code = ADIOI_Err_create_code(myname, fd->filename, open_errno);
    } else {
        *error_code = MPI_SUCCESS;
    }
}