int main (int argc, char **argv) { const char *target; plfs_error_t ret; Plfs_fd *fd; int nr; if (argc > 1) { target = argv[1]; } else { return -1; } plfs_handle_version_arg(argc, argv[1]); fd = NULL; ret = plfs_open(&fd, target, O_RDONLY, getpid(), 0777, NULL); if (ret != PLFS_SUCCESS) { fprintf(stderr, "Couldn't open path %s\n", target); exit(1); } ret = fd->optimize_access(); if (ret != PLFS_SUCCESS) { fprintf(stderr, "optimize_access: flattenIndex of %s failed (%s)\n", target, strplfserr(ret)); } else { printf("Successfully flattened index of %s\n",target); } (void) plfs_close(fd, getpid(), getuid(), O_RDONLY, NULL, &nr); exit(( ret == PLFS_SUCCESS) ? 0 : 1); }
/*
 * Access: check that `path` exists on `store` and that it can actually be
 * opened in the mode implied by `mask` (R_OK/W_OK/X_OK/F_OK).
 *
 * return PLFS_SUCCESS or PLFS_E*
 */
plfs_error_t
Access( const string& path, IOStore *store, int mask )
{
    // there used to be some concern here that the accessfile might not
    // exist yet but the way containers are made ensures that an accessfile
    // will exist if the container exists
    // doing just Access is insufficient when plfs daemon run as root
    // root can access everything.  so, we must also try the open
    mode_t open_mode = 0;
    plfs_error_t ret;
    // must start out NULL: the handle is tested after Open() and we cannot
    // rely on a failed Open() having stored anything into it
    IOSHandle *fh = NULL;

    mlog(FOP_DAPI, "%s Check existence of %s", __FUNCTION__, path.c_str());
    ret = store->Access( path.c_str(), F_OK );
    if ( ret == PLFS_SUCCESS ) {
        // at this point, we know the file exists
        if(checkMask(mask,W_OK|R_OK)) {
            open_mode = O_RDWR;
        } else if(checkMask(mask,R_OK)||checkMask(mask,X_OK)) {
            open_mode = O_RDONLY;
        } else if(checkMask(mask,W_OK)) {
            open_mode = O_WRONLY;
        } else if(checkMask(mask,F_OK)) {
            return PLFS_SUCCESS;   // we already know this
        }
        mlog(FOP_DCOMMON, "The file exists attempting open");
        ret = store->Open(path.c_str(),open_mode,&fh);
        mlog(FOP_DCOMMON, "Open %s: %s",path.c_str(),
             ret==PLFS_SUCCESS?"Success":strplfserr(ret));
        if (fh != NULL) {
            // the open was only a probe; we do not keep the handle
            store->Close(fh);
        } // else, ret was set already
    }
    return ret;
}
/**
 * ByteRangeIndex::flush_writebuf: push every buffered HostEntry out to
 * the index dropping behind iwritefh and empty the buffer.  the caller
 * is expected to hold the BRI lock already.
 *
 * the buffer is emptied even when the write fails; the failure is
 * logged and reported through the return value.
 *
 * @return PLFS_SUCCESS or an error code
 */
plfs_error_t
ByteRangeIndex::flush_writebuf()
{
    const size_t nentries = this->writebuf.size();

    /* nothing buffered, or no dropping to write to (the latter should
     * not happen, it is only a sanity check) */
    if (nentries == 0 || this->iwritefh == NULL) {
        return(PLFS_SUCCESS);
    }

    /* vector storage is contiguous, so one write covers all entries */
    void *data = &(this->writebuf.front());
    size_t nbytes = nentries * sizeof(HostEntry);
    ssize_t written;

    plfs_error_t status = Util::Writen(data, nbytes, this->iwritefh,
                                       &written);
    if (status != PLFS_SUCCESS) {
        mlog(IDX_DRARE, "%s: failed to write fh %p: %s", __FUNCTION__,
             this->iwritefh, strplfserr(status));
    }

    this->writebuf.clear();
    return(status);
}
int main (int argc, char **argv) { const char *target; struct plfs_physpathinfo ppi; struct plfs_pathback pb; plfs_error_t perr; Index *index; if (argc > 1) { target = argv[1]; } else { return -1; } plfs_handle_version_arg(argc, argv[1]); plfs_error_t ret = PLFS_SUCCESS; ret = plfs_resolvepath(target, &ppi); if (ret != PLFS_SUCCESS) { fprintf(stderr, "Couldn't resolve path %s\n", target); exit(1); } /* * XXXCDC: clearly we assume containerfs here and we totally * bypass the logicalfs layer. maybe "compress_metadata" should * be a logicalfs operation? */ pb.bpath = ppi.canbpath; pb.back = ppi.canback; index = new Index(pb.bpath, pb.back); perr = Container::populateIndex(pb.bpath, pb.back, index, false, false, 0); if (perr != PLFS_SUCCESS) { fprintf(stderr, "populateIndex of %s failed (%s)\n", target, strplfserr(perr)); exit(1); } perr = Container::flattenIndex(pb.bpath, pb.back, index); delete index; if (perr != PLFS_SUCCESS) { fprintf(stderr, "flattenIndex of %s failed (%s)\n", target, strplfserr(perr)); exit(1); } else { printf("Successfully flattened index of %s\n",target); } exit(0); }
/*
 * ADIOI_PLFS_WriteContig: write `count` elements of `datatype` from `buf`
 * through plfs_write() on fd->fs_ptr.
 *
 * The write offset is `offset` for ADIO_EXPLICIT_OFFSET and fd->fp_ind
 * otherwise.  On success *error_code is MPI_SUCCESS and, for
 * ADIO_INDIVIDUAL, fd->fp_ind is advanced by the number of bytes written;
 * on failure *error_code holds an MPI_ERR_IO error code.
 */
void ADIOI_PLFS_WriteContig(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Status *status,
                            int *error_code)
{
    /* --BEGIN CRAY MODIFICATION-- */
    plfs_error_t rc = PLFS_TBD;
    int elem_size, my_rank;
    ssize_t nwritten;
    ADIO_Offset nbytes;
    /* --END CRAY MODIFICATION-- */
    ADIO_Offset write_off;
    static char myname[] = "ADIOI_PLFS_WRITECONTIG";
#ifdef ROMIO_CRAY
    MPIIO_TIMER_START(WSYSIO);
#endif /* ROMIO_CRAY */

    MPI_Type_size(datatype, &elem_size);
    /* --BEGIN CRAY MODIFICATION-- */
    nbytes = (ADIO_Offset)elem_size * (ADIO_Offset)count;
    /* --END CRAY MODIFICATION-- */
    MPI_Comm_rank( fd->comm, &my_rank );

    /*
     * for the romio/test/large_file we always get an offset of 0
     * maybe we need to increment fd->fp_ind ourselves?
     */
    if (file_ptr_type == ADIO_INDIVIDUAL) {
        write_off = fd->fp_ind;
    } else if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
        write_off = offset;
    } else {
        write_off = fd->fp_ind;
    }

    plfs_debug( "%s: offset %ld len %ld rank %d\n",
                myname, (long)write_off, (long)nbytes, my_rank );
    rc = plfs_write( fd->fs_ptr, buf, nbytes, write_off, my_rank,
                     &nwritten );

    if (rc == PLFS_SUCCESS ) {
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, (int)nwritten);
#endif
        if (file_ptr_type == ADIO_INDIVIDUAL) {
            fd->fp_ind += nwritten;
        }
        *error_code = MPI_SUCCESS;
    } else {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", strplfserr(rc));
    }
#ifdef ROMIO_CRAY
    MPIIO_TIMER_END(WSYSIO);
#endif /* ROMIO_CRAY */
}
/*
 * FileOp::op: public wrapper around do_op().  Runs the operation, maps
 * any result listed in `ignores` to PLFS_SUCCESS, and logs the outcome.
 *
 * return PLFS_SUCCESS or PLFS_E*
 */
plfs_error_t
FileOp::op(const char *path, unsigned char type, IOStore *store)
{
    plfs_error_t result = this->do_op(path,type,store);

    // results the caller asked us to ignore count as success
    const bool ignored =
        (this->ignores.find(result) != this->ignores.end());
    if (ignored) {
        result = PLFS_SUCCESS;
    }

    const char *text = "AOK";
    if (result != PLFS_SUCCESS) {
        text = strplfserr(result);
    }
    mlog(FOP_DAPI, "FileOp:%s on %s: %d (%s)",name(),path,result,text);
    return(result);
}
// helper routine for plfs_dump_config // changes ret to new error or leaves it alone plfs_error_t plfs_check_dir(string type, const char *prefix, IOStore *store, string bpath, plfs_error_t previous_ret, bool make_dir) { const char *directory = bpath.c_str(); plfs_error_t rv; if(Util::isDirectory(directory, store)) { return(previous_ret); } if (!make_dir) { printf("Error: Required %s directory %s%s not found/not a directory\n", type.c_str(), prefix, directory); return(PLFS_ENOENT); } rv = mkdir_dash_p(bpath, false, store); if (rv != PLFS_SUCCESS) { printf("Attempt to create directory %s%s failed (%s)\n", prefix, directory, strplfserr(rv)); return(rv); } return(previous_ret); }
/**
 * ByteRangeIndex::merge_dropping: merge HostEntry records from
 * dropping file into map/chunks.
 *
 * XXX: if it fails, it may leave idxout/cmapout in a partially
 * modified state.  do we need better error recovery?  failure is
 * pretty unlikely (the most likely thing to happen is to run out
 * memory, but that is going to kill our process with a C++
 * exception).
 *
 * @param idxout entries are merged in here
 * @param cmapout new ChunkFiles are appended here
 * @param eof_trk passed through to insert_entry() for every record
 * @param bbytes passed through to insert_entry() for every record
 * @param dropbpath bpath to index dropping file
 * @param dropback backend that dropping lives on
 * @return PLFS_SUCCESS or error code
 */
plfs_error_t
ByteRangeIndex::merge_dropping(map<off_t,ContainerEntry> &idxout,
                               vector<ChunkFile> &cmapout,
                               off_t *eof_trk, off_t *bbytes,
                               string dropbpath,
                               struct plfs_backend *dropback)
{
    plfs_error_t rv = PLFS_SUCCESS;
    plfs_error_t rv2;
    IOSHandle *xfh = NULL;
    off_t len = 0;        /* initialised: it is logged if Size() fails */
    void *ibuf = NULL;

    /* get the dropping data */
    rv = dropback->store->Open(dropbpath.c_str(), O_RDONLY, &xfh);
    if (rv != PLFS_SUCCESS) {
        mlog(IDX_DRARE, "%s: WTF open: %s", __FUNCTION__, strplfserr(rv));
        return(rv);
    }

    rv = xfh->Size(&len);
    if (rv == PLFS_SUCCESS) {
        if (len > 0) {
            rv = xfh->GetDataBuf(&ibuf, len);
        } else {
            /* zero length dropping (can happen after a ftruncate) */
            ibuf = NULL;
        }
    }
    if (rv != PLFS_SUCCESS) {
        mlog(IDX_DRARE, "%s WTF Size/GetDataBuf(%s,%ld): %s", __FUNCTION__,
             dropbpath.c_str(), (long)len, strplfserr(rv));
        dropback->store->Close(xfh);
        return(rv);
    }

    /*
     * known_chunks: maps PID from HostIndex to slot number in cmap vector
     */
    map<pid_t,pid_t> known_chunks;
    HostEntry *h_index = (HostEntry *)ibuf;   /* dropping file data! */
    size_t entries = len / sizeof(HostEntry); /* ignore partials (unlikely) */
    int new_id;

    mlog(IDX_DCOMMON, "merge_droppings: %s has %lu entries",
         dropbpath.c_str(), (unsigned long)entries);

    for (size_t i = 0 ; rv == PLFS_SUCCESS && i < entries ; i++) {
        HostEntry h_entry = h_index[i];   /* input (assume alignment ok?) */
        ContainerEntry c_entry;           /* build this and add it */

        /* check for new pid from this file we haven't seen yet */
        if (known_chunks.find(h_entry.id) == known_chunks.end()) {
            ChunkFile cf;
            rv2 = indexpath2chunkpath(dropbpath, h_entry.id, cf.bpath);
            if (rv2 != PLFS_SUCCESS) {
                mlog(IDX_ERR, "merge_droppings: i2c error %s (%s)",
                     dropbpath.c_str(), strplfserr(rv2));
                continue;   /* just skip it, shouldn't ever happen */
            }
            cf.backend = dropback;
            cf.fh = NULL;
            new_id = cmapout.size();
            cmapout.push_back( cf );
            known_chunks[h_entry.id] = new_id;
            mlog(IDX_DCOMMON, "Inserting chunk %s (id=%lu)",
                 cf.bpath.c_str(), (unsigned long)new_id);
        }

        /* ok, setup the ContainerEntry for adding ... */
        c_entry.logical_offset  = h_entry.logical_offset;
        c_entry.length          = h_entry.length;
        c_entry.id              = known_chunks[h_entry.id];  /* slot# */
        c_entry.original_chunk  = h_entry.id;  /* save old pid for rewrites */
        c_entry.physical_offset = h_entry.physical_offset;
        c_entry.begin_timestamp = h_entry.begin_timestamp;
        c_entry.end_timestamp   = h_entry.end_timestamp;

        /* now add it! */
        rv = ByteRangeIndex::insert_entry(idxout, eof_trk, bbytes, &c_entry);
        if (rv != PLFS_SUCCESS) {
            mlog(IDX_DRARE, "Inserting chunk failed: %s", strplfserr(rv));
        }
    }

    mlog(IDX_DAPI, "After %s now are %lu chunks, %lu ents", __FUNCTION__,
         (unsigned long)cmapout.size(), (unsigned long)idxout.size());

    /*
     * cleanup failures are only logged; they do not change the return
     * value.  log rv2 (the cleanup result), not rv.
     */
    rv2 = (ibuf == NULL)? PLFS_SUCCESS : xfh->ReleaseDataBuf(ibuf, len);
    if (rv2 != PLFS_SUCCESS) {
        mlog(IDX_DRARE, "%s WTF ReleaseDataBuf failed: %s", __FUNCTION__,
             strplfserr(rv2));
    }
    rv2 = dropback->store->Close(xfh);
    if (rv2 != PLFS_SUCCESS) {
        mlog(IDX_DRARE, "%s WTF dropping Close failed: %s", __FUNCTION__,
             strplfserr(rv2));
    }

    return(rv);
}
/**
 * ByteRangeIndex::index_droppings_getattrsize: refine the size and
 * timestamp information in stbuf for an open file by looking at the
 * container's index droppings.  our own index may or may not be the
 * one that is open; some info has already been pulled from the meta
 * directory and this tries to improve on it.
 *
 * @param ppip path info for the file we are getting the size of
 * @param stbuf where save the result
 * @param openset list of hosts with the file open (from metadir)
 * @param metaset list of hosts we got metadir info on (from metadir)
 * @return PLFS_SUCCESS or error code
 */
plfs_error_t
ByteRangeIndex::index_droppings_getattrsize(struct plfs_physpathinfo *ppip,
                                            struct stat *stbuf,
                                            set<string> *openset,
                                            set<string> *metaset)
{
    vector<plfs_pathback> droppings;
    vector<plfs_pathback>::iterator cur;
    plfs_error_t status;

    mlog(IDX_DAPI, "BRI::dropget: %s", ppip->canbpath.c_str());

    /* gather every index dropping in the container */
    status = ByteRangeIndex::collectIndices(ppip->canbpath, ppip->canback,
                                            droppings, true);
    mlog(IDX_DCOMMON, "BRI::dropget: %s collected=%d",
         ppip->canbpath.c_str(), (int)droppings.size());

    /* visit each dropping, stopping at the first scan error */
    for (cur = droppings.begin() ;
         status == PLFS_SUCCESS && cur != droppings.end() ; cur++) {

        plfs_pathback drop = *cur;
        string hostname;
        size_t pos;

        /*
         * the filename part looks like INDEXPREFIX SEC.USEC.HOST.PID;
         * peel the pieces off to leave just HOST.  any parse failure
         * "shouldn't happen" and simply skips the dropping.
         */
        pos = drop.bpath.rfind("/");
        if (pos == string::npos) {
            continue;
        }
        pos = drop.bpath.find(INDEXPREFIX, pos);
        if (pos == string::npos) {
            continue;
        }
        hostname = drop.bpath.substr(pos+sizeof(INDEXPREFIX)-1,
                                     string::npos);

        pos = hostname.find(".");
        if (pos == string::npos) {
            continue;
        }
        hostname.erase(0, pos + 1);                /* drop 'SEC.' */

        pos = hostname.find(".");
        if (pos == string::npos) {
            continue;
        }
        hostname.erase(0, pos + 1);                /* drop 'USEC.' */

        pos = hostname.rfind(".");
        if (pos == string::npos) {
            continue;
        }
        hostname.erase(pos, hostname.size());      /* drop '.PID' */

        /*
         * a host that no longer has the file open and for which we
         * already hold valid metadata cannot tell us anything newer,
         * so its dropping does not need to be read.
         */
        const bool host_is_open =
            (openset->find(hostname) != openset->end());
        const bool host_has_meta =
            (metaset->find(hostname) != metaset->end());
        if (!host_is_open && host_has_meta) {
            continue;
        }

        /*
         * pick up any timestamps on the dropping that are newer than
         * the ones currently in stbuf.
         */
        struct stat drop_st;
        if (drop.back->store->Lstat(drop.bpath.c_str(), &drop_st)
                != PLFS_SUCCESS) {
            continue;    /* went away before we could stat it */
        }
        stbuf->st_ctime = max(drop_st.st_ctime, stbuf->st_ctime);
        stbuf->st_atime = max(drop_st.st_atime, stbuf->st_atime);
        stbuf->st_mtime = max(drop_st.st_mtime, stbuf->st_mtime);

        /*
         * finally scan the dropping contents for offset/size info
         */
        off_t scanned_eof, scanned_bytes;
        status = ByteRangeIndex::scan_idropping(drop.bpath, drop.back,
                                                &scanned_eof,
                                                &scanned_bytes);
        if (status == PLFS_SUCCESS) {
            stbuf->st_blocks += Container::bytesToBlocks(scanned_bytes);
            stbuf->st_size = max(stbuf->st_size, scanned_eof);
        }
    }

    mlog(IDX_DAPI, "BRI::dropget: %s => %s", ppip->canbpath.c_str(),
         strplfserr(status));
    return(status);
}
/* this is so we can find the time that each bin will be for the graphs * it uses MPI_reduce to find the maximum and minimum times of all the * given IO entries. This also finds the final time that each proc wrote until * to be used in the per processor graphs if requested. Finally calculates the * sum of the end times for the standard deviation and average calculations*/ int getMaxMinTimes(int numIndexFiles, int size, char* mount, double* minMax, double* endSum, double* endTimes, int* pids) { double sendMinMax[2]; double *sendEndTimes = (double *)calloc(numIndexFiles, sizeof(double)); int rank, i; plfs_error_t retv; char buffer[4096]; char name[4096]; FILE* tmp; int id; long long offset, length, tail; int pid; char io; double beg, end; sendMinMax[0] = DBL_MAX; sendMinMax[1] = DBL_MIN; MPI_Comm_rank(MPI_COMM_WORLD, &rank); double sendEndSum = 0; double fileEnd = DBL_MIN; for (i = 0; i<numIndexFiles; i++) { if (i % size == rank) { pid = pids[i]; /* get the pid from the constant list */ /* Create a temporary file */ sprintf(name, "tempFile%d.txt", rank); tmp = fopen(name, "w+"); if (tmp == NULL) { printf("ERROR: Could not create temp file\n"); return -1; } retv = container_dump_index(tmp, mount, 0, 1, pid); fclose(tmp); if (retv == PLFS_SUCCESS) { tmp = fopen(name, "r"); if (tmp == NULL) { printf("ERROR: Could not open temp file\n"); return -1; } while( fgets(buffer, sizeof(buffer), tmp) != NULL) { if (buffer[0] != '#') { int items = sscanf(buffer, "%d %c %lld %lld %lf %lf %lld", &id, &io, &offset, &length, &beg, &end, &tail); if (items != 7) { printf("ERROR: sscanf failed. 
Buffer: %s\n", buffer); return -1; } if (fileEnd < end) { fileEnd = end; } if (sendMinMax[0] > beg) { sendMinMax[0] = beg; } if (sendMinMax[1] < end) { sendMinMax[1] = end; } } } fclose(tmp); } else { printf("ERROR: Container_dump_index did not succeed: %s\n", strplfserr(retv)); return -1; } sendEndSum += fileEnd; /* this is the max for this pid */ sendEndTimes[i] = fileEnd; fileEnd = DBL_MIN; } else { continue; } } /* use MPI to get the global max and min */ double min, max; double localMin = sendMinMax[0]; double localMax = sendMinMax[1]; MPI_Barrier(MPI_COMM_WORLD); MPI_Allreduce(&localMin, &min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); MPI_Allreduce(&localMax, &max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); MPI_Allreduce(&sendEndSum, endSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); MPI_Allreduce(sendEndTimes, endTimes, numIndexFiles, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); minMax[0] = min; minMax[1] = max; unlink(name); free(sendEndTimes); return 0; }
/*
 * mca_fbtl_plfs_preadv: satisfy the read requests listed in
 * fh->f_io_array through plfs_read().
 *
 * Consecutive entries whose file offsets are contiguous are grouped into
 * one iovec run.  Each run is read with a single plfs_read() into a
 * temporary buffer and then scattered to the entries' memory addresses.
 *
 * Returns the total number of bytes read, or an OMPI error code
 * (OMPI_ERROR / OMPI_ERR_OUT_OF_RESOURCE) on failure.  All temporary
 * memory is released on every return path.
 */
ssize_t
mca_fbtl_plfs_preadv (mca_io_ompio_file_t *fh )
{
    Plfs_fd *pfd = NULL;
    plfs_error_t plfs_ret;
    pfd = fh->f_fs_ptr;
    ssize_t total_bytes_read=0;
    int i, block=1;
    struct iovec *iov = NULL;
    int iov_count = 0;
    OMPI_MPI_OFFSET_TYPE iov_offset = 0;

    if (NULL == fh->f_io_array) {
        return OMPI_ERROR;
    }

    iov = (struct iovec *) malloc
          (OMPIO_IOVEC_INITIAL_SIZE * sizeof (struct iovec));
    if (NULL == iov) {
        opal_output(1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
        /* start a new run with the current entry */
        if (0 == iov_count) {
            iov[iov_count].iov_base = fh->f_io_array[i].memory_address;
            iov[iov_count].iov_len = fh->f_io_array[i].length;
            iov_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset;
            iov_count ++;
        }

        /* make sure there is room for one more element */
        if (OMPIO_IOVEC_INITIAL_SIZE*block <= iov_count) {
            /* keep the old pointer until realloc has succeeded so that
             * it can still be freed on failure */
            struct iovec *grown = (struct iovec *)realloc
                (iov, OMPIO_IOVEC_INITIAL_SIZE * (block + 1) *
                 sizeof(struct iovec));
            if (NULL == grown) {
                opal_output(1, "OUT OF MEMORY\n");
                free (iov);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
            iov = grown;
            block ++;
        }

        /* if the next entry continues right where this one ends in the
         * file, append it to the run and keep collecting */
        if (fh->f_num_of_io_entries != i+1) {
            if (((OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset +
                 (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].length) ==
                (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i+1].offset) {
                iov[iov_count].iov_base =
                    fh->f_io_array[i+1].memory_address;
                iov[iov_count].iov_len = fh->f_io_array[i+1].length;
                iov_count ++;
                continue;
            }
        }

        /* Find the total number of bytes to be read. */
        size_t bytes = 0;
        for (int k = 0; k < iov_count; ++k) {
            bytes += iov[k].iov_len;
        }

        /* nothing to transfer for this run (also avoids relying on what
         * malloc(0) returns) */
        if (0 == bytes) {
            iov_count = 0;
            continue;
        }

        /* Allocate a temporary buffer to hold the data */
        char *buffer = (char *) malloc (bytes);
        if (buffer == NULL) {
            free (iov);
            return OMPI_ERROR;
        }

        /* Read the data */
        ssize_t bytes_read;
        plfs_ret = plfs_read( pfd, buffer, bytes, iov_offset, &bytes_read );
        if (PLFS_SUCCESS != plfs_ret) {
            opal_output(0, "fbtl_plfs_preadv: Error in plfs_read:\n%s\n",
                        strplfserr(plfs_ret));
            free (buffer);
            free (iov);
            return OMPI_ERROR;
        }
        if (bytes_read < 0) {
            free (buffer);
            free (iov);
            return OMPI_ERROR;
        }
        total_bytes_read += bytes_read;

        /* Copy the data from BUFFER into the memory specified by IOV.
         * Walk the buffer with a separate cursor: the pointer handed to
         * free() must be the one malloc() returned. */
        const char *cursor = buffer;
        size_t remaining = (size_t)bytes_read;
        for (int k = 0; k < iov_count; ++k) {
            size_t copy = MIN (iov[k].iov_len, remaining);
            (void) memcpy ((void *) iov[k].iov_base, (const void *) cursor,
                           copy);
            cursor += copy;
            remaining -= copy;
            if (remaining == 0) {
                break;
            }
        }

        iov_count = 0;
        free (buffer);
    }

    free (iov);
    return total_bytes_read;
}
/* utilizes MPI-IO in order to write out the data required to graph * the per processor graphs. Writes to a given filesystem so that it * can be read by python */ int writeProcessorData(int numIndexFiles, int size, char* mount, char* offsetMPI, double average, double stdev, double* endTimes, char* timeMPI, double start, bool above, bool below, int numStdDev, int* pids) { int i, rank, pid, id; plfs_error_t retv; char name[4096]; char buffer[4096]; FILE * tmp; long long offset, length, tail; long long writeID; char io; double beg, end; MPI_File offsetsFile; MPI_File timeFile; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_File_open(MPI_COMM_WORLD, offsetMPI, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &offsetsFile); MPI_File_open(MPI_COMM_WORLD, timeMPI, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &timeFile); int writesOffset = 0; /* the number of offset writes this rank has done */ int writesTime = 0; /* the number of time writes the rank has done */ double topCutoff = 0; double bottomCutoff = 0; if (above) { topCutoff = average + numStdDev*stdev; /* any proc that ends above this will be drawn */ } if (below) { bottomCutoff = average - numStdDev*stdev; /* any proc that ends before this will be drawn */ } /* how big the writes are to the file. 
used to get the file offsets*/ int offsetSize = 3*sizeof(long long); int timeSize = 3*sizeof(double); for (i = 0; i < numIndexFiles; i++) { if (i % size == rank) { pid = pids[i]; /* get the pid from the constant list */ /* Create a temporary file */ sprintf(name, "tempFile%d.txt", rank); tmp = fopen(name, "w+"); if (tmp == NULL) { printf("ERROR:Could not open temp file\n"); return -1; } retv = container_dump_index(tmp, mount, 0, 1, pid); fclose(tmp); if (retv == PLFS_SUCCESS) { tmp = fopen(name, "r"); if (tmp == NULL) { printf("ERROR:Could not open temp file\n"); return -1; } while( fgets(buffer, sizeof(buffer), tmp) != NULL) { if(buffer[0] != '#') { int items = sscanf(buffer, "%d %c %lld %lld %lf %lf %lld", &id, &io, &offset, &length, &beg, &end, &tail); if (items != 7) { printf("ERROR: sscanf failed. Buffer: %s\n", buffer); return -1; } /* this id was arbitrary so I will change * it to match the index id */ writeID = pid; long long mpiSendBuffer[3] = {writeID, offset, tail}; MPI_File_write_at(offsetsFile, rank*offsetSize + writesOffset*size*offsetSize, &mpiSendBuffer, 3, MPI_LONG_LONG, MPI_STATUS_IGNORE); writesOffset ++; /* see if we need to write this time out */ if ((numIndexFiles <= 16) || ((above) && (endTimes[i] > topCutoff)) || ((below) && (endTimes[i] < bottomCutoff))) { double newId = pid; double mpiSendBuffer2[3] = {newId, beg, end}; /* we want to only graph the time of the job*/ beg -= start; end -= start; MPI_File_write_at(timeFile, rank*timeSize + writesTime*size*timeSize, &mpiSendBuffer2, 3, MPI_DOUBLE, MPI_STATUS_IGNORE); writesTime ++; } } } fclose(tmp); } else { printf("ERROR: Container_dump_index did not succeed: %s\n", strplfserr(retv)); return -1; } } } unlink(name); MPI_File_close(&offsetsFile); MPI_File_close(&timeFile); return 0; }
/* Gains all the information needed for the graphs and finds the * standard deviation of the end times*/ int parseData(int numIndexFiles, int size, char* mount, double binSize, int numBins, double* bandwidths, long long* iosTime, long long* iosFin, long long* writeCount, double min, double average, double* stdev, int* pids) { double* sendBandwidths = (double *)calloc(numBins, sizeof(double)); long long* sendIOsTime = (long long *)calloc(numBins, sizeof(long long)); long long* sendIOsFin = (long long *)calloc(numBins, sizeof(long long)); /* 51 is the number of bins in the write histogram */ long long* sendWriteCount = (long long *)calloc(51, sizeof(long long)); double sendSumDiffSquare = 0; double sumDiffSquare = 0; if ((sendBandwidths == NULL) | (sendIOsTime == NULL) | (sendIOsFin == NULL) | (sendWriteCount == NULL)) { printf("Could not allocate enough memory\n"); return -1; } char buffer[4096]; char name[4096]; int rank, i, pid; plfs_error_t retv; FILE* tmp; int id; long long length, tail, offset; char io; double beg, end, delta, averageBan; int startBin, binsSpanned; MPI_Comm_rank(MPI_COMM_WORLD, &rank); double fileEnd = -1*DBL_MAX; for (i = 0; i<numIndexFiles; i++) { if (i % size == rank) { /*Create a temporary file */ pid = pids[i]; /* get pid from the constant list*/ sprintf(name, "tempFile%d.txt", rank); tmp = fopen(name, "w+"); if (tmp == NULL) { printf("ERROR:Could not open temp file\n"); return -1; } retv = container_dump_index(tmp, mount, 0, 1, pid); fclose(tmp); if (retv == PLFS_SUCCESS) { tmp = fopen(name, "r"); if (tmp == NULL) { printf("ERROR:Could not open temp file\n"); return -1; } while( fgets(buffer, sizeof(buffer), tmp) != NULL) { if (buffer[0] != '#') { int items = sscanf(buffer, "%d %c %lld %lld %lf %lf %lld", &id, &io, &offset, &length, &beg, &end, &tail); if (items != 7) { printf("ERROR: sscanf failed. 
Buffer: %s\n", buffer); return -1; } /* this id was arbitrary so I will change it to * match the index id, which is the pid */ id = pid; delta = end - beg; /* this calculates how many bins the time spans * for the ios and the bandwidths so that each * time bin gets everything from the current write */ binsSpanned = ceil((delta/binSize)); /* calculates the average of the bandwidth * for the current write in terms of bins covered*/ averageBan = (length/(delta*1024*1024) * (delta/binSize - floor(delta/binSize))); startBin = floor((beg-min)/binSize); for (int j =0; j<binsSpanned; j++) { /* writing at these bins */ sendBandwidths[startBin+j] += averageBan; if (j < (binsSpanned - 1)) sendIOsTime[startBin+j]++; } // here we increment all further bins for a running sum int i = 0; while (startBin + binsSpanned + i < numBins) { sendIOsFin[startBin+binsSpanned+i]++; i++; } int writeIndex = powersOfTwo(length); /* The last bin is for all that is above 1 PiB * which is 51 as writeCounts is of length 51 */ if (writeIndex >= 51) { sendWriteCount[50]++; } else { sendWriteCount[writeIndex]++; } if (fileEnd < end) { fileEnd = end; } } } fclose(tmp); } else { printf("ERROR: Container_dump_index did not succeed: %s\n", strplfserr(retv)); return -1; } /* get the difference of the end of the pid to the average and * use to get the standard deviation */ double diffAvg = fileEnd - average; double squareDiff = diffAvg * diffAvg; sendSumDiffSquare += squareDiff; fileEnd = DBL_MIN; } else { continue; } } /* use MPI Reduce to get the sums of each of the arrays */ MPI_Barrier(MPI_COMM_WORLD); MPI_Allreduce(sendBandwidths, bandwidths, numBins, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); MPI_Allreduce(sendIOsTime, iosTime, numBins, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); MPI_Allreduce(sendIOsFin, iosFin, numBins, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); MPI_Allreduce(sendWriteCount, writeCount, 51, 
MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); MPI_Allreduce(&sendSumDiffSquare, &sumDiffSquare, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { /* calculate standard deviation */ *stdev = sqrt(sumDiffSquare/numIndexFiles); } free(sendBandwidths); free(sendIOsTime); free(sendIOsFin); free(sendWriteCount); unlink(name); return 0; }
/*
 *	file_open_plfs
 *
 *	Function:	- opens a new file
 *	Accepts:	- same arguments as MPI_File_open()
 *	Returns:	- Success if new file handle
 *
 *	A relative filename is resolved against the current working
 *	directory; an absolute filename is used as given.  Rank 0 opens
 *	the file first (it alone may pass O_CREAT) and broadcasts the
 *	result; the other ranks open the file only if rank 0 succeeded.
 */
int
mca_fs_plfs_file_open (struct ompi_communicator_t *comm,
                       const char* filename,
                       int access_mode,
                       struct ompi_info_t *info,
                       mca_io_ompio_file_t *fh)
{
    int rank;
    int amode;
    int old_mask, perm;
    int pathlen;
    plfs_error_t plfs_ret;
    Plfs_fd *pfd = NULL;
    char cwd[1024];
    char wpath[2048];
    size_t len = sizeof(int);
    char key[] = "num_hostdirs";

    rank = ompi_comm_rank ( comm );

    /*
     * build the full path in a separate buffer: passing the destination
     * buffer to sprintf as one of its own sources is undefined, and the
     * result must be bounded by the buffer size.
     */
    if ('/' == filename[0]) {
        pathlen = snprintf( wpath, sizeof(wpath), "%s", filename );
    } else {
        if (NULL == getcwd( cwd, sizeof(cwd) )) {
            opal_output(0, "fs_plfs_file_open: getcwd failed\n");
            return OMPI_ERROR;
        }
        pathlen = snprintf( wpath, sizeof(wpath), "%s/%s", cwd, filename );
    }
    if (pathlen < 0 || (size_t)pathlen >= sizeof(wpath)) {
        opal_output(0, "fs_plfs_file_open: path too long: %s\n", filename);
        return OMPI_ERROR;
    }

    if (OMPIO_PERM_NULL == fh->f_perm) {
        /* umask() can only be read by setting it, so set and restore */
        old_mask = umask(022);
        umask(old_mask);
        /* clear the masked bits; an XOR would instead SET any umask bit
         * that lies outside 0666 (e.g. the execute bits of umask 077) */
        perm = 0666 & ~old_mask;
    } else {
        perm = fh->f_perm;
    }

    amode = 0;
    if (access_mode & MPI_MODE_RDONLY) {
        amode = amode | O_RDONLY;
    }
    if (access_mode & MPI_MODE_WRONLY) {
        amode = amode | O_WRONLY;
    }
    if (access_mode & MPI_MODE_RDWR) {
        amode = amode | O_RDWR;
    }
    if (access_mode & MPI_MODE_EXCL) {
        if( is_plfs_path(wpath) == 1 ) { /* the file already exists */
            return OMPI_ERROR;
        }
    }

    if (0 == rank) {
        /* MODE_CREATE and MODE_EXCL can only be set by one process */
        if (access_mode & MPI_MODE_CREATE) {
            amode = amode | O_CREAT;
        }
        plfs_ret = plfs_open( &pfd, wpath, amode, 0, perm, NULL );
        fh->f_fs_ptr = pfd;
    }

    /* everybody learns whether rank 0 managed to open the file */
    comm->c_coll.coll_bcast ( &plfs_ret, 1, MPI_INT, 0, comm,
                              comm->c_coll.coll_bcast_module);
    if ( PLFS_SUCCESS != plfs_ret ) {
        return OMPI_ERROR;
    }

    if (0 != rank) {
        plfs_ret = plfs_open( &pfd, wpath, amode, 0, perm, NULL );
        if (PLFS_SUCCESS != plfs_ret) {
            opal_output(0, "fs_plfs_file_open: Error in plfs_open:\n%s\n",
                        strplfserr(plfs_ret));
            return OMPI_ERROR;
        } else {
            fh->f_fs_ptr = pfd;
        }
    }

    if (mca_fs_plfs_num_hostdir > 0) {
        plfs_ret = plfs_setxattr( pfd, &mca_fs_plfs_num_hostdir, key, len );
        if (PLFS_SUCCESS != plfs_ret) {
            opal_output(0, "fs_plfs_file_open: Error in plfs_setxattr:\n%s\n",
                        strplfserr(plfs_ret));
            return OMPI_ERROR;
        }
    }
    return OMPI_SUCCESS;
}
/*
 * nextdropping: iterator over the droppings of a container.  Each call
 * produces the next dropping whose name passes dropping_filter, walking
 * every HOSTDIRPREFIX subdirectory of the container in turn.
 *
 * The caller keeps the iteration state in *candir / *subdir (both NULL
 * before the first call), *hostdirpath and *dropback.
 *
 * On PLFS_SUCCESS, *dropping is 1 when *droppingpath / *dropback name
 * the next dropping, and 0 when the walk is finished (the container
 * directory has then been closed and *candir reset to NULL).  On error
 * *dropping is -1 and the error code is returned.
 */
static plfs_error_t
nextdropping(const string& canbpath, struct plfs_backend *canback,
             string *droppingpath, struct plfs_backend **dropback,
             const char *dropping_filter,
             IOSDirHandle **candir, IOSDirHandle **subdir,
             string *hostdirpath, int *dropping)
{
    struct dirent entry;
    string candidate, linktarget;
    plfs_error_t status;

    *dropping = -1;
    mlog(IDX_DAPI, "nextdropping: %slooking in %s",
         (*candir != NULL) ? "still " : "", canbpath.c_str());

    /* first call: the container directory is not open yet */
    if (*candir == NULL) {
        plfs_error_t openret = canback->store->Opendir(canbpath.c_str(),
                                                       candir);
        if (openret != PLFS_SUCCESS) {
            return openret;
        }
    }

    /* keep going until some subdir yields an entry or we run out */
    for (;;) {

        if (*subdir == NULL) {
            /* need a fresh subdir from the container directory */
            if (getnextent(*candir, HOSTDIRPREFIX, &entry) == NULL) {
                /* no subdirs left: iteration complete */
                canback->store->Closedir(*candir);
                *candir = NULL;
                *dropping = 0;
                return PLFS_SUCCESS;
            }

            /* the subdir may really be a metalink; resolve it */
            candidate = canbpath + "/" + entry.d_name;
            *dropback = canback;     /* unless resolveMetalink says so */
            status = Container::resolveMetalink(candidate, canback, NULL,
                                                linktarget, dropback);
            if (status == PLFS_SUCCESS) {
                /* metalink: resolveMetalink also updated dropback */
                *hostdirpath = linktarget;
            } else {
                /* plain directory */
                *hostdirpath = candidate;
            }

            status = (*dropback)->store->Opendir(hostdirpath->c_str(),
                                                 subdir);
            if (status != PLFS_SUCCESS) {
                mlog(IDX_DRARE, "opendir %s: %s", hostdirpath->c_str(),
                     strplfserr(status));
                return status;
            }
            mlog(IDX_DCOMMON, "%s opened dir %s", __FUNCTION__,
                 hostdirpath->c_str());
        }

        /* both directories are open now; look for a matching entry */
        if (getnextent(*subdir, dropping_filter, &entry) != NULL) {
            break;
        }

        /* this subdir is exhausted: close it and move to the next one */
        (*dropback)->store->Closedir(*subdir);
        *dropback = NULL;        /* just to be safe */
        *subdir = NULL;          /* signals we are ready for next one */
    }

    /* hand the entry we found back to the caller */
    droppingpath->assign(*hostdirpath + "/" + entry.d_name);
    *dropping = 1;
    return PLFS_SUCCESS;
}
/**
 * ByteRangeIndex::trunc_edit_nz: edit droppings to shrink container
 * to a non-zero size.  if we are truncating an open file and we
 * edit the current index dropping, then update the filehandle for
 * it since the offsets may have changed.
 *
 * processing stops at the first dropping that cannot be read or
 * rewritten and that error is returned; the directory handles used
 * for the walk are closed on every exit path.
 *
 * @param ppip pathinfo for container
 * @param nzo the non-zero offset
 * @param openidrop filename of currently open index dropping
 * @return PLFS_SUCCESS or error code
 */
plfs_error_t
ByteRangeIndex::trunc_edit_nz(struct plfs_physpathinfo *ppip, off_t nzo,
                              string openidrop)
{
    plfs_error_t ret = PLFS_SUCCESS;
    size_t slashoff;
    string openifn, indexfile;
    struct plfs_backend *indexback = NULL;
    IOSDirHandle *candir, *subdir;
    string hostdirpath;
    int dropping;

    mlog(IDX_DAPI, "%s on %s to %ld", __FUNCTION__,
         ppip->canbpath.c_str(), (long)nzo);

    /*
     * isolate the filename in openidrop.  in theory we should be
     * able to just compare the whole thing, but PLFS sometimes
     * puts in extra "/" chars in filenames and that could mess
     * us up: e.g.  /m/plfs/dir/file  vs  /m/plfs/dir//file
     * should be the same, but the "//" will fool strcmps.
     */
    if (openidrop.size() == 0) {
        openifn = "";
    } else {
        slashoff = openidrop.rfind("/");
        if (slashoff == string::npos) {
            openifn = openidrop;
        } else {
            openifn = openidrop.substr(slashoff + 1, string::npos);
        }
    }

    /*
     * this code goes through each index dropping and rewrites it
     * preserving only entries that contain data prior to truncate
     * offset...
     */
    candir = subdir = NULL;
    while ((ret = nextdropping(ppip->canbpath, ppip->canback,
                               &indexfile, &indexback,
                               INDEXPREFIX, &candir, &subdir,
                               &hostdirpath, &dropping)) == PLFS_SUCCESS) {
        if (dropping != 1) {
            break;
        }

        /* read dropping file into a tmp index map */
        map<off_t,ContainerEntry> tmpidx;
        vector<ChunkFile> tmpcnk;
        off_t eof, bytes;
        IOSHandle *fh;

        eof = bytes = 0;
        ret = ByteRangeIndex::merge_dropping(tmpidx, tmpcnk, &eof,
                                             &bytes, indexfile, indexback);
        if (ret != PLFS_SUCCESS) {
            mlog(IDX_CRIT, "Failed to read index file %s: %s",
                 indexfile.c_str(), strplfserr( ret ));
            break;
        }

        /* we have to rewrite only if it had data past our new eof (nzo) */
        if (eof > nzo) {
            mlog(IDX_DCOMMON, "%s %s at %ld", __FUNCTION__,
                 indexfile.c_str(), (long)nzo);

            ByteRangeIndex::trunc_map(tmpidx, nzo);

            /*
             * XXX: copied from old code.  should this write to a tmp
             * file and then rename?
             */
            ret = indexback->store->Open(indexfile.c_str(),
                                         O_TRUNC|O_WRONLY, &fh);
            if ( ret != PLFS_SUCCESS ) {
                mlog(IDX_CRIT, "Couldn't overwrite index file %s: %s",
                     indexfile.c_str(), strplfserr( ret ));
                break;    /* leave through the cleanup code below */
            }

            ret = ByteRangeIndex::trunc_writemap(tmpidx, fh);

            /*
             * if we just rewrote our currently open index, swap our
             * iwritefh to be the rewritten fh and let the old one get
             * closed off (below).   we do this because we've edited
             * the open write index dropping (made it smaller) and
             * the old filehandle is now pointing to a bad spot in
             * the file.  (done even if the rewrite failed: the file
             * was already truncated by the Open above.)
             */
            if (openifn.size() > 0 && indexback == this->iwriteback &&
                filematch(indexfile, openifn)) {
                IOSHandle *tmp;
                tmp = this->iwritefh;
                this->iwritefh = fh;
                fh = tmp;
            }

            /* XXX: do we care about return value from Close? */
            indexback->store->Close(fh);

            /*
             * report a failed rewrite instead of letting the next
             * nextdropping() call overwrite ret.
             */
            if (ret != PLFS_SUCCESS) {
                mlog(IDX_CRIT, "Couldn't rewrite index file %s: %s",
                     indexfile.c_str(), strplfserr( ret ));
                break;
            }
        }
    }

    /*
     * if we stopped before nextdropping() finished its walk, the
     * directory handles it opened are still ours to close.  the subdir
     * was opened on the backend nextdropping() reported in indexback.
     */
    if (subdir != NULL && indexback != NULL) {
        indexback->store->Closedir(subdir);
        subdir = NULL;
    }
    if (candir != NULL) {
        ppip->canback->store->Closedir(candir);
        candir = NULL;
    }

    mlog(IDX_DAPI, "%s on %s to %ld ret: %d",
         __FUNCTION__, ppip->canbpath.c_str(), (long)nzo, ret);
    return(ret);
}