int main (int argc, char **argv) {
    const char *target;
    plfs_error_t ret;
    Plfs_fd *fd;
    int nr;

    if (argc > 1) {
        target = argv[1];
    } else {
        return -1;
    }

    plfs_handle_version_arg(argc, argv[1]);

    fd = NULL;
    ret = plfs_open(&fd, target, O_RDONLY, getpid(), 0777, NULL);
    if (ret != PLFS_SUCCESS) {
        fprintf(stderr, "Couldn't open path %s\n", target);
        exit(1);
    }

    ret = fd->optimize_access();

    if (ret != PLFS_SUCCESS) {
        fprintf(stderr, "optimize_access: flattenIndex of %s failed (%s)\n",
                target, strplfserr(ret));
    } else {
        printf("Successfully flattened index of %s\n",target);
    }

    (void) plfs_close(fd, getpid(), getuid(), O_RDONLY, NULL, &nr);

    exit(( ret == PLFS_SUCCESS) ?  0  : 1);
}
Esempio n. 2
0
/**
 * Access: check that a path exists on a backing store and, unless only
 * existence was asked for, that it can be opened with a mode derived
 * from the requested mask.
 *
 * @param path  path to check on the store
 * @param store backing store used for the Access/Open/Close calls
 * @param mask  combination of F_OK, R_OK, W_OK, X_OK
 * @return PLFS_SUCCESS or PLFS_E*
 */
plfs_error_t Access( const string& path, IOStore *store, int mask )
{
    // there used to be some concern here that the accessfile might not
    // exist yet but the way containers are made ensures that an accessfile
    // will exist if the container exists
    // doing just Access is insufficient when plfs daemon run as root
    // root can access everything.  so, we must also try the open
    mode_t open_mode = 0;
    plfs_error_t ret;
    // must start out NULL: it is tested after Open(), and a failed Open()
    // is not guaranteed to have written to it
    IOSHandle *fh = NULL;
    mlog(FOP_DAPI, "%s Check existence of %s", __FUNCTION__, path.c_str());

    ret = store->Access( path.c_str(), F_OK );
    if ( ret == PLFS_SUCCESS ) {
        // at this point, we know the file exists
        if(checkMask(mask,W_OK|R_OK)) {
            open_mode = O_RDWR;
        } else if(checkMask(mask,R_OK)||checkMask(mask,X_OK)) {
            open_mode = O_RDONLY;
        } else if(checkMask(mask,W_OK)) {
            open_mode = O_WRONLY;
        } else if(checkMask(mask,F_OK)) {
            return PLFS_SUCCESS;   // we already know this
        }
        mlog(FOP_DCOMMON, "The file exists attempting open");
        ret = store->Open(path.c_str(),open_mode,&fh);
        mlog(FOP_DCOMMON, "Open %s: %s",path.c_str(),ret==PLFS_SUCCESS?"Success":strplfserr(ret));
        if (fh != NULL) {
            // the open was only a permission probe; release the handle
            store->Close(fh);
        } // else, ret was set already
    }
    return ret;
}
Esempio n. 3
0
/**
 * ByteRangeIndex::flush_writebuf: write any buffered HostEntry records
 * to the index write handle and empty the buffer.  the BRI should
 * already be locked by the caller.
 *
 * nothing is written (and the buffer is left untouched) when the buffer
 * is empty or when there is no write handle.
 *
 * @return PLFS_SUCCESS or the error code from the write
 */
plfs_error_t
ByteRangeIndex::flush_writebuf() {
    size_t nentries = this->writebuf.size();

    /* the handle check is only for sanity, it should be non-null */
    if (nentries == 0 || this->iwritefh == NULL) {
        return(PLFS_SUCCESS);
    }

    /* c++ vectors are contiguous, so the whole buffer goes out at once */
    void *data = &(this->writebuf.front());
    size_t nbytes = nentries * sizeof(HostEntry);
    ssize_t written;

    plfs_error_t status = Util::Writen(data, nbytes, this->iwritefh,
                                       &written);
    if (status != PLFS_SUCCESS) {
        mlog(IDX_DRARE, "%s: failed to write fh %p: %s",
             __FUNCTION__, this->iwritefh, strplfserr(status));
    }

    /* the buffer is emptied whether or not the write succeeded */
    this->writebuf.clear();

    return(status);
}
Esempio n. 4
0
int main (int argc, char **argv) {
    const char *target;
    struct plfs_physpathinfo ppi;
    struct plfs_pathback pb;
    plfs_error_t perr;
    Index *index;

    if (argc > 1) {
        target = argv[1];
    } else {
        return -1;
    }
    plfs_handle_version_arg(argc, argv[1]);
    plfs_error_t ret = PLFS_SUCCESS;
    ret = plfs_resolvepath(target, &ppi);
    if (ret != PLFS_SUCCESS) {
        fprintf(stderr, "Couldn't resolve path %s\n", target);
        exit(1);
    }
    
    /*
     * XXXCDC: clearly we assume containerfs here and we totally
     * bypass the logicalfs layer.  maybe "compress_metadata" should
     * be a logicalfs operation?
     */
    pb.bpath = ppi.canbpath;
    pb.back = ppi.canback;

    index = new Index(pb.bpath, pb.back);
    perr = Container::populateIndex(pb.bpath, pb.back, index, false, false, 0);
    if (perr != PLFS_SUCCESS) {
        fprintf(stderr, "populateIndex of %s failed (%s)\n",
                target, strplfserr(perr));
        exit(1);
    }
    perr = Container::flattenIndex(pb.bpath, pb.back, index);
    delete index;
    
    if (perr != PLFS_SUCCESS) {
        fprintf(stderr, "flattenIndex of %s failed (%s)\n",
                target, strplfserr(perr));
        exit(1);
    } else {
        printf("Successfully flattened index of %s\n",target);
    }
    exit(0);
}
Esempio n. 5
0
/*
 * ADIOI_PLFS_WriteContig: write `count` elements of `datatype` from
 * `buf` to the PLFS file behind `fd` with a single plfs_write() call.
 *
 * The write offset is `offset` when file_ptr_type is
 * ADIO_EXPLICIT_OFFSET and the individual file pointer (fd->fp_ind)
 * otherwise.  On success *error_code is MPI_SUCCESS and, for
 * ADIO_INDIVIDUAL, fd->fp_ind is advanced by the bytes written; on
 * failure *error_code carries an MPI_ERR_IO error code.
 */
void ADIOI_PLFS_WriteContig(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Status *status,
                            int *error_code)
{
    static char myname[] = "ADIOI_PLFS_WRITECONTIG";
    /* --BEGIN CRAY MODIFICATION-- */
    plfs_error_t err = PLFS_TBD;
    int datatype_size, rank;
    ssize_t bytes_written;
    ADIO_Offset len;
    /* --END CRAY MODIFICATION-- */
    ADIO_Offset myoff;
#ifdef ROMIO_CRAY
MPIIO_TIMER_START(WSYSIO);
#endif /* ROMIO_CRAY */
    MPI_Type_size(datatype, &datatype_size);
    /* --BEGIN CRAY MODIFICATION-- */
    /* widen both operands before multiplying to get the byte count */
    len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
    /* --END CRAY MODIFICATION-- */
    MPI_Comm_rank( fd->comm, &rank );
    // for the romio/test/large_file we always get an offset of 0
    // maybe we need to increment fd->fp_ind ourselves?
    // only an explicit offset overrides the individual file pointer
    myoff = (file_ptr_type == ADIO_EXPLICIT_OFFSET) ? offset : fd->fp_ind;
    plfs_debug( "%s: offset %ld len %ld rank %d\n",
                myname, (long)myoff, (long)len, rank );
    err = plfs_write( fd->fs_ptr, buf, len, myoff, rank, &bytes_written );
    if (err == PLFS_SUCCESS ) {
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, (int)bytes_written);
#endif
        if (file_ptr_type == ADIO_INDIVIDUAL) {
            fd->fp_ind += bytes_written;
        }
        *error_code = MPI_SUCCESS;
    } else {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", strplfserr(err));
    }
#ifdef ROMIO_CRAY
MPIIO_TIMER_END(WSYSIO);
#endif /* ROMIO_CRAY */
}
Esempio n. 6
0
/*
 * FileOp::op: run do_op() on the path, turn any error code listed in
 * this->ignores into PLFS_SUCCESS, and log the outcome.
 *
 * return PLFS_SUCCESS or PLFS_E*
 */
plfs_error_t
FileOp::op(const char *path, unsigned char type, IOStore *store)
{
    // this wrapper exists to filter ignorable errors and emit a debug message
    plfs_error_t result = this->do_op(path,type,store);

    const bool ignorable = (this->ignores.find(result) != this->ignores.end());
    if (ignorable) {
        result = PLFS_SUCCESS;
    }

    mlog(FOP_DAPI, "FileOp:%s on %s: %d (%s)",name(),path,result,
         (result == PLFS_SUCCESS) ? "AOK" : strplfserr(result));
    return(result);
}
Esempio n. 7
0
// helper routine for plfs_dump_config
// verifies that bpath is a directory on the store, optionally creating it.
// returns previous_ret unchanged when the directory exists (or was
// created), PLFS_ENOENT when it is missing and make_dir is false, or the
// mkdir error when creation fails
plfs_error_t
plfs_check_dir(string type, const char *prefix, IOStore *store, string bpath,
               plfs_error_t previous_ret, bool make_dir)
{
    const char *dirname = bpath.c_str();

    if (!Util::isDirectory(dirname, store)) {
        if (!make_dir) {
            printf("Error: Required %s directory %s%s not found/not a directory\n",
                   type.c_str(), prefix, dirname);
            return(PLFS_ENOENT);
        }

        // missing but we are allowed to create it
        plfs_error_t mk_ret = mkdir_dash_p(bpath, false, store);
        if (mk_ret != PLFS_SUCCESS) {
            printf("Attempt to create directory %s%s failed (%s)\n",
                   prefix, dirname, strplfserr(mk_ret));
            return(mk_ret);
        }
    }

    // directory is present: leave the caller's running status alone
    return(previous_ret);
}
Esempio n. 8
0
/**
 * ByteRangeIndex::merge_dropping: merge HostEntry records from
 * dropping file into map/chunks.
 *
 * XXX: if it fails, it may leave idxout/cmapout in a partially
 * modified state.  do we need better error recovery?  failure is
 * pretty unlikely (the most likely thing to happen is to run out
 * memory, but that is going to kill our process with a C++
 * exception).
 *
 * @param idxout entries are merged in here
 * @param cmapout new ChunkFiles are appended here
 * @param eof_trk passed through to insert_entry() for every record
 *        (presumably the running EOF tracker -- confirm with insert_entry)
 * @param bbytes passed through to insert_entry() for every record
 *        (presumably the running byte total -- confirm with insert_entry)
 * @param dropbpath bpath to index dropping file
 * @param dropback backend that dropping lives on
 * @return PLFS_SUCCESS or error code (failures to release the data
 *         buffer or to close the dropping are logged but not returned)
 */
plfs_error_t
ByteRangeIndex::merge_dropping(map<off_t,ContainerEntry> &idxout,
                               vector<ChunkFile> &cmapout,
                               off_t *eof_trk, off_t *bbytes,
                               string dropbpath,
                               struct plfs_backend *dropback) {

    plfs_error_t rv = PLFS_SUCCESS;
    plfs_error_t rv2;
    IOSHandle *xfh = NULL;
    off_t len = 0;       /* zeroed so the error log never prints garbage */
    void *ibuf = NULL;

    /* get the dropping data */
    rv = dropback->store->Open(dropbpath.c_str(), O_RDONLY, &xfh);
    if (rv != PLFS_SUCCESS) {
        mlog(IDX_DRARE, "%s: WTF open: %s", __FUNCTION__, strplfserr(rv));
        return(rv);
    }
    rv = xfh->Size(&len);
    if (rv == PLFS_SUCCESS) {
        if (len > 0) {
            rv = xfh->GetDataBuf(&ibuf, len);
        } else {
            /* zero length dropping (can happen after a ftruncate) */
            ibuf = NULL;
        }
    }
    if (rv != PLFS_SUCCESS) {
        mlog(IDX_DRARE, "%s WTF Size/GetDataBuf(%s,%ld): %s", __FUNCTION__,
             dropbpath.c_str(), (long)len, strplfserr(rv));
        dropback->store->Close(xfh);
        return(rv);
    }

    /*
     * known_chunks: maps PID from HostIndex to slot number in cmap vector
     */
    map<pid_t,pid_t> known_chunks;
    map<pid_t,pid_t>::iterator known_chunks_itr;
    HostEntry *h_index = (HostEntry *)ibuf;  /* dropping file data! */
    size_t entries = len / sizeof(HostEntry); /* ignore partials (unlikely) */
    int new_id;

    mlog(IDX_DCOMMON, "merge_droppings: %s has %lu entries",
         dropbpath.c_str(), (unsigned long)entries);

    /* the loop stops at the first insert_entry() failure */
    for (size_t i = 0 ; rv == PLFS_SUCCESS && i < entries ; i++) {
        HostEntry h_entry = h_index[i];  /* input (assume alignment ok?) */
        ContainerEntry c_entry;          /* build this and add it */

        /* check for new pid from this file we haven't seen yet */
        if (known_chunks.find(h_entry.id) == known_chunks.end()) {
            ChunkFile cf;
            rv2 = indexpath2chunkpath(dropbpath, h_entry.id, cf.bpath);
            if (rv2 != PLFS_SUCCESS) {
                mlog(IDX_ERR, "merge_droppings: i2c error %s (%s)",
                     dropbpath.c_str(), strplfserr(rv2));
                continue;   /* just skip it, shouldn't ever happen */
            }
            cf.backend = dropback;
            cf.fh = NULL;
            new_id = cmapout.size();
            cmapout.push_back( cf );
            known_chunks[h_entry.id] = new_id;
            mlog(IDX_DCOMMON, "Inserting chunk %s (id=%lu)", cf.bpath.c_str(),
                 (unsigned long)new_id);
        }

        /* ok, setup the ContainerEntry for adding ... */
        c_entry.logical_offset    = h_entry.logical_offset;
        c_entry.length            = h_entry.length;
        c_entry.id                = known_chunks[h_entry.id]; /* slot# */
        c_entry.original_chunk    = h_entry.id; /* save old pid for rewrites */
        c_entry.physical_offset   = h_entry.physical_offset;
        c_entry.begin_timestamp   = h_entry.begin_timestamp;
        c_entry.end_timestamp     = h_entry.end_timestamp;

        /* now add it! */
        rv = ByteRangeIndex::insert_entry(idxout, eof_trk, bbytes, &c_entry);
        if (rv != PLFS_SUCCESS) {
            mlog(IDX_DRARE, "Inserting chunk failed: %s", strplfserr(rv));
        }
    }

    mlog(IDX_DAPI, "After %s now are %lu chunks, %lu ents",
         __FUNCTION__, (unsigned long)cmapout.size(),
         (unsigned long)idxout.size());

    /*
     * cleanup failures are reported with their own code (rv2); rv keeps
     * the result of the merge itself and is what the caller gets back.
     */
    rv2 = (ibuf == NULL)? PLFS_SUCCESS : xfh->ReleaseDataBuf(ibuf, len);
    if (rv2 != PLFS_SUCCESS) {
        mlog(IDX_DRARE, "%s WTF ReleaseDataBuf failed: %s", __FUNCTION__,
             strplfserr(rv2));
    }
    rv2 = dropback->store->Close(xfh);
    if (rv2 != PLFS_SUCCESS) {
        mlog(IDX_DRARE, "%s WTF dropping Close failed: %s", __FUNCTION__,
             strplfserr(rv2));
    }

    return(rv);
}
Esempio n. 9
0
/**
 * ByteRangeIndex::index_droppings_getattrsize: the file is open and
 * we want the best size information we can get.  our index may or may
 * not be the one that is open.  some info has already been pulled out
 * of the meta directory; here we walk the index droppings to see if
 * the timestamps, block count and size in stbuf can be improved.
 *
 * @param ppip path info for the file we are getting the size of
 * @param stbuf where to save the result (updated in place)
 * @param openset list of hosts with the file open (from metadir)
 * @param metaset list of hosts we got metadir info on (from metadir)
 * @return PLFS_SUCCESS or error code
 */
plfs_error_t
ByteRangeIndex::index_droppings_getattrsize(struct plfs_physpathinfo *ppip,
                                            struct stat *stbuf,
                                            set<string> *openset,
                                            set<string> *metaset) {
    vector<plfs_pathback> droppings;
    vector<plfs_pathback>::iterator it;

    mlog(IDX_DAPI, "BRI::dropget: %s", ppip->canbpath.c_str());

    /* build the list of every index dropping in the container */
    plfs_error_t status = ByteRangeIndex::collectIndices(ppip->canbpath,
                                                         ppip->canback,
                                                         droppings, true);
    mlog(IDX_DCOMMON, "BRI::dropget: %s collected=%d", ppip->canbpath.c_str(),
         (int)droppings.size());

    /* visit each dropping; stop at the first scan failure */
    for (it = droppings.begin() ;
         status == PLFS_SUCCESS && it != droppings.end() ; ++it) {
        plfs_pathback drop = *it;
        string hostname;
        size_t pos;

        /*
         * pull the hostname out of the dropping filename: take what
         * follows INDEXPREFIX in the last path component, then strip
         * the leading 'SEC.' and 'USEC.' fields and the trailing '.PID'.
         * a name that does not match is skipped (shouldn't happen).
         */
        pos = drop.bpath.rfind("/");
        if (pos == string::npos) {
            continue;
        }
        pos = drop.bpath.find(INDEXPREFIX, pos);
        if (pos == string::npos) {
            continue;
        }
        hostname = drop.bpath.substr(pos + sizeof(INDEXPREFIX) - 1,
                                     string::npos);
        pos = hostname.find(".");
        if (pos == string::npos) {
            continue;
        }
        hostname.erase(0, pos + 1);         /* drop 'SEC.' */
        pos = hostname.find(".");
        if (pos == string::npos) {
            continue;
        }
        hostname.erase(0, pos + 1);         /* drop 'USEC.' */
        pos = hostname.rfind(".");
        if (pos == string::npos) {
            continue;
        }
        hostname.erase(pos);                /* drop '.PID' */

        /*
         * a dropping can be skipped when its host does not have the
         * file open and valid metadata from that host was already
         * found.  otherwise the dropping may hold newer info than the
         * metadata, so it is read.
         */
        bool host_has_open = (openset->find(hostname) != openset->end());
        bool host_has_meta = (metaset->find(hostname) != metaset->end());
        if (!host_has_open && host_has_meta) {
            continue;
        }

        /*
         * stat the dropping and keep whichever timestamps are more
         * recent: its own or the ones already in stbuf.
         */
        struct stat drop_st;
        if (drop.back->store->Lstat(drop.bpath.c_str(),
                                    &drop_st) != PLFS_SUCCESS) {
            continue;  /* went away before we could stat it */
        }
        stbuf->st_ctime = max(drop_st.st_ctime, stbuf->st_ctime);
        stbuf->st_atime = max(drop_st.st_atime, stbuf->st_atime);
        stbuf->st_mtime = max(drop_st.st_mtime, stbuf->st_mtime);

        /*
         * finally scan the dropping contents to update offset/size info
         */
        off_t scan_eof, scan_bytes;
        status = ByteRangeIndex::scan_idropping(drop.bpath, drop.back,
                                                &scan_eof, &scan_bytes);
        if (status == PLFS_SUCCESS) {
            stbuf->st_blocks += Container::bytesToBlocks(scan_bytes);
            stbuf->st_size = max(stbuf->st_size, scan_eof);
        }
    }

    mlog(IDX_DAPI, "BRI::dropget: %s => %s", ppip->canbpath.c_str(),
         strplfserr(status));
    return(status);
}
Esempio n. 10
0
/* this is so we can find the time that each bin will be for the graphs
 * it uses MPI_Allreduce to find the maximum and minimum times of all the
 * given IO entries. This also finds the final time that each proc wrote until
 * to be used in the per processor graphs if requested. Finally calculates the
 * sum of the end times for the standard deviation and average calculations
 *
 * numIndexFiles: number of entries in pids / endTimes
 * size:          divisor used to deal index files out to ranks
 *                (file i is handled by the rank where i % size == rank)
 * mount:         passed to container_dump_index
 * minMax:        out, [0] = global minimum begin time, [1] = global
 *                maximum end time
 * endSum:        out, sum over all ranks of the per-file end times
 * endTimes:      out, per-file end time, numIndexFiles entries
 * pids:          pid for each index file
 *
 * returns 0 on success, -1 on failure.  NOTE: a rank that fails returns
 * before taking part in the collective calls below. */
int
getMaxMinTimes(int numIndexFiles, int size, char* mount, double* minMax,
         double* endSum, double* endTimes, int* pids)
{
    double sendMinMax[2];
    double *sendEndTimes;
    int rank, i;
    int rc = 0;
    int tmpCreated = 0; /* set once the temp file exists, guards unlink */
    plfs_error_t retv;
    char buffer[4096];
    char name[4096];
    FILE* tmp;
    int id;
    long long offset, length, tail;
    int pid;
    char io;
    double beg, end;
    double sendEndSum = 0;
    double fileEnd;
    double min, max;
    double localMin, localMax;

    sendEndTimes = (double *)calloc(numIndexFiles, sizeof(double));
    /* calloc(0, ...) may legitimately return NULL */
    if (sendEndTimes == NULL && numIndexFiles > 0) {
        printf("Could not allocate enough memory\n");
        return -1;
    }

    /* NOTE(review): DBL_MIN is the smallest positive double, not the most
     * negative one, so the maximum tracking below relies on the end
     * times being positive. */
    sendMinMax[0] = DBL_MAX;
    sendMinMax[1] = DBL_MIN;
    fileEnd = DBL_MIN;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* the temp file name only depends on the rank: build it once so that
     * it is always initialized, even if this rank handles no index file */
    snprintf(name, sizeof(name), "tempFile%d.txt", rank);

    for (i = 0; i<numIndexFiles; i++) {
        if (i % size != rank) {
            continue;
        }
        pid = pids[i]; /* get the pid from the constant list */
        /* Create a temporary file */
        tmp = fopen(name, "w+");
        if (tmp == NULL) {
            printf("ERROR: Could not create temp file\n");
            rc = -1;
            break;
        }
        tmpCreated = 1;
        retv = container_dump_index(tmp, mount, 0, 1, pid);
        fclose(tmp);
        if (retv != PLFS_SUCCESS) {
            printf("ERROR: Container_dump_index did not succeed: %s\n",
                    strplfserr(retv));
            rc = -1;
            break;
        }
        tmp = fopen(name, "r");
        if (tmp == NULL) {
            printf("ERROR: Could not open temp file\n");
            rc = -1;
            break;
        }
        while( fgets(buffer, sizeof(buffer), tmp) != NULL)
        {
            int items;
            if (buffer[0] == '#') {
                continue;   /* lines starting with '#' carry no entry */
            }
            items = sscanf(buffer,
                "%d %c %lld %lld %lf %lf %lld",
                &id, &io, &offset, &length,
                &beg, &end, &tail);
            if (items != 7) {
                printf("ERROR: sscanf failed. Buffer: %s\n",
                       buffer);
                rc = -1;
                break;
            }
            if (fileEnd < end) {
                fileEnd = end;
            }
            if (sendMinMax[0] > beg) {
                sendMinMax[0] = beg;
            }
            if (sendMinMax[1] < end) {
                sendMinMax[1] = end;
            }
        }
        fclose(tmp);
        if (rc != 0) {
            break;
        }
        sendEndSum += fileEnd; /* this is the max for this pid */
        sendEndTimes[i] = fileEnd;
        fileEnd = DBL_MIN;
    }

    if (rc != 0) {
        /* release everything this call acquired before bailing out */
        if (tmpCreated) {
            unlink(name);
        }
        free(sendEndTimes);
        return -1;
    }

    /* use MPI to get the global max and min  */
    localMin = sendMinMax[0];
    localMax = sendMinMax[1];
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Allreduce(&localMin, &min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Allreduce(&localMax, &max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Allreduce(&sendEndSum, endSum, 1, MPI_DOUBLE, MPI_SUM,
                MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
    /* each slot is filled by exactly one rank (zero elsewhere), so the
     * sum assembles the complete per-file array on every rank */
    MPI_Allreduce(sendEndTimes, endTimes, numIndexFiles, MPI_DOUBLE, MPI_SUM,
                MPI_COMM_WORLD);
    minMax[0] = min;
    minMax[1] = max;
    if (tmpCreated) {
        unlink(name);
    }
    free(sendEndTimes);
    return 0;
}
Esempio n. 11
0
/*
 * mca_fbtl_plfs_preadv: read the regions described by fh->f_io_array
 * from the PLFS file behind fh.
 *
 * Consecutive entries whose file offsets are contiguous are gathered
 * into one iovec run.  Each run is read with a single plfs_read() into
 * a temporary buffer and then scattered to the entries' memory
 * addresses.
 *
 * Returns the total number of bytes read, or an OMPI error code
 * (OMPI_ERROR / OMPI_ERR_OUT_OF_RESOURCE) on failure.
 */
ssize_t  mca_fbtl_plfs_preadv (mca_io_ompio_file_t *fh )
{

    Plfs_fd *pfd = NULL;
    plfs_error_t plfs_ret;
    pfd = fh->f_fs_ptr;
    ssize_t total_bytes_read=0;

    int i, j, block=1;
    struct iovec *iov = NULL;
    struct iovec *grown = NULL;
    int iov_count = 0;
    OMPI_MPI_OFFSET_TYPE iov_offset = 0;

    if (NULL == fh->f_io_array) {
        return OMPI_ERROR;
    }

    iov = (struct iovec *) malloc
        (OMPIO_IOVEC_INITIAL_SIZE * sizeof (struct iovec));
    if (NULL == iov) {
        opal_output(1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
        /* start a new run with the current entry */
        if (0 == iov_count) {
            iov[iov_count].iov_base = fh->f_io_array[i].memory_address;
            iov[iov_count].iov_len = fh->f_io_array[i].length;
            iov_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset;
            iov_count ++;
        }

        /* make room before another entry can be appended */
        if (OMPIO_IOVEC_INITIAL_SIZE*block <= iov_count) {
            block ++;
            /* keep the old pointer until realloc is known to have
             * succeeded, otherwise the array leaks on failure */
            grown = (struct iovec *)realloc
                (iov, OMPIO_IOVEC_INITIAL_SIZE * block *
                 sizeof(struct iovec));
            if (NULL == grown) {
                opal_output(1, "OUT OF MEMORY\n");
                free (iov);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
            iov = grown;
        }

        /* if the next entry continues right where this one ends in the
         * file, append it to the run and keep gathering */
        if (fh->f_num_of_io_entries != i+1) {
            if (((OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset +
                 (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].length) ==
                (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i+1].offset) {
                iov[iov_count].iov_base =
                    fh->f_io_array[i+1].memory_address;
                iov[iov_count].iov_len = fh->f_io_array[i+1].length;
                iov_count ++;
                continue;
            }
        }

        // Find the total number of bytes to be read.
        size_t bytes = 0;
        for (j = 0; j < iov_count; ++j) {
            bytes += iov[j].iov_len;
        }

        // Allocate a temporary buffer to hold the data
        char *buffer;
        buffer = (char *) malloc (bytes);
        if (buffer == NULL) {
            free (iov);
            return OMPI_ERROR;
        }

        // Read the data
        ssize_t bytes_read;
        plfs_ret = plfs_read( pfd, buffer, bytes, iov_offset, &bytes_read );
        if (PLFS_SUCCESS != plfs_ret) {
            opal_output(0, "fbtl_plfs_preadv: Error in plfs_read:\n%s\n", strplfserr(plfs_ret));
            free (buffer);
            free (iov);
            return OMPI_ERROR;
        }

        if (bytes_read < 0) {
            free (buffer);
            free (iov);
            return OMPI_ERROR;
        }
        total_bytes_read += bytes_read;

        // Copy the data from BUFFER into the memory specified by IOV.
        // A separate cursor walks the buffer so that the pointer handed
        // to free() below is the one malloc() returned.
        const char *cursor = buffer;
        bytes = bytes_read;
        for (j = 0; j < iov_count; ++j) {
            size_t copy = MIN (iov[j].iov_len, bytes);
            (void) memcpy ((void *) iov[j].iov_base, (const void *) cursor, copy);
            cursor += copy;
            bytes -= copy;
            if (bytes == 0) {
                break;
            }
        }
        iov_count = 0;
        free (buffer);
        buffer = NULL;
    }

    free (iov);
    iov = NULL;

    return total_bytes_read;
}
Esempio n. 12
0
/* utilizes MPI-IO in order to write out the data required to graph
 * the per processor graphs. Writes to a given filesystem so that it
 * can be read by python */
int
writeProcessorData(int numIndexFiles, int size, char* mount,
            char* offsetMPI, double average, double stdev, 
            double* endTimes, char* timeMPI, double start, 
            bool above, bool below, int numStdDev, int* pids)
{
    int i, rank, pid, id; 
    plfs_error_t retv; 
    char name[4096];
    char buffer[4096]; 
    FILE * tmp; 
    long long offset, length, tail;
    long long writeID; 
    char io; 
    double beg, end; 
    MPI_File offsetsFile; 
    MPI_File timeFile; 
    MPI_Comm_rank(MPI_COMM_WORLD, &rank); 
    MPI_File_open(MPI_COMM_WORLD, offsetMPI, 
        MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &offsetsFile); 
    MPI_File_open(MPI_COMM_WORLD, timeMPI, 
        MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &timeFile); 
    int writesOffset = 0; /* the number of offset writes this rank has done */
    int writesTime = 0;  /* the number of time writes the rank has done */
    double topCutoff = 0; 
    double bottomCutoff = 0; 
    if (above) {
        topCutoff = average + numStdDev*stdev; 
        /* any proc that ends above this will be drawn */
    }
    if (below) {
        bottomCutoff = average - numStdDev*stdev; 
        /* any proc that ends before this will be drawn */
    }
    /* how big the writes are to the file. used to get the file offsets*/
    int offsetSize = 3*sizeof(long long); 
    int timeSize = 3*sizeof(double); 
    for (i = 0; i < numIndexFiles; i++) {
        if (i % size == rank) { 
            pid = pids[i];  /* get the pid from the constant list */
            /* Create a temporary file */
            sprintf(name, "tempFile%d.txt", rank); 
            tmp = fopen(name, "w+"); 
            if (tmp == NULL) {
                printf("ERROR:Could not open temp file\n"); 
                return -1; 
            }
            retv = container_dump_index(tmp, mount, 0, 1, pid); 
            fclose(tmp); 
            if (retv == PLFS_SUCCESS) {
                tmp = fopen(name, "r"); 
                if (tmp == NULL) {
                    printf("ERROR:Could not open temp file\n"); 
                    return -1; 
                }
                while( fgets(buffer, sizeof(buffer), tmp) != NULL)
                {
                    if(buffer[0] != '#')
                    {
                        int items = sscanf(buffer, 
                            "%d %c %lld %lld %lf %lf %lld", 
                            &id, &io, &offset, &length, 
                            &beg, &end, &tail);
                        if (items != 7) {
                            printf("ERROR: sscanf failed. Buffer: %s\n", 
                                   buffer);
                            return -1;
                        } 
                        /* this id was arbitrary so I will change
                        * it to match the index id */
                        writeID = pid; 
                        long long mpiSendBuffer[3] = {writeID, offset, tail};
                        MPI_File_write_at(offsetsFile, 
                                    rank*offsetSize + 
                                    writesOffset*size*offsetSize, 
                                    &mpiSendBuffer, 3, MPI_LONG_LONG, 
                                    MPI_STATUS_IGNORE); 
                        writesOffset ++; 
                        /* see if we need to write this time out */
                        if ((numIndexFiles <= 16) ||
                            ((above) && (endTimes[i] > topCutoff)) || 
                            ((below) && (endTimes[i] < bottomCutoff))) {
                            double newId = pid;
                            double mpiSendBuffer2[3] = {newId, beg, end};
                            /* we want to only graph the time of the job*/
                            beg -= start; 
                            end -= start; 
                            MPI_File_write_at(timeFile, rank*timeSize +
                                    writesTime*size*timeSize, 
                                    &mpiSendBuffer2, 3, MPI_DOUBLE, 
                                    MPI_STATUS_IGNORE); 
                            writesTime ++; 
                        }
                    }
                }
                fclose(tmp); 
            }
            else {
                printf("ERROR: Container_dump_index did not succeed: %s\n",
                        strplfserr(retv));
                return -1; 
            }
        }
    }
    unlink(name); 
    MPI_File_close(&offsetsFile); 
    MPI_File_close(&timeFile); 
    return 0; 
}
Esempio n. 13
0
/* Gains all the information needed for the graphs and finds the 
 * standard deviation of the end times*/
int
parseData(int numIndexFiles, int size, char* mount, double binSize,
        int numBins, double* bandwidths, long long* iosTime, long long* iosFin, 
        long long* writeCount, double min, double average, double* stdev, 
        int* pids)
{
    double* sendBandwidths = (double *)calloc(numBins, sizeof(double)); 
    long long* sendIOsTime = (long long *)calloc(numBins, sizeof(long long)); 
    long long* sendIOsFin = (long long *)calloc(numBins, sizeof(long long));  
    /* 51 is the number of bins in the write histogram */
    long long* sendWriteCount = (long long *)calloc(51, sizeof(long long)); 
    double sendSumDiffSquare = 0; 
    double sumDiffSquare = 0; 
    if ((sendBandwidths == NULL) | (sendIOsTime == NULL) | (sendIOsFin == NULL) 
            | (sendWriteCount == NULL)) 
    {
        printf("Could not allocate enough memory\n"); 
        return -1; 
    }
    char buffer[4096];
    char name[4096];
    int rank, i, pid; 
    plfs_error_t retv;  
    FILE* tmp; 
    int id; 
    long long length, tail, offset; 
    char io; 
    double beg, end, delta, averageBan; 
    int startBin, binsSpanned; 
    MPI_Comm_rank(MPI_COMM_WORLD, &rank); 
    double fileEnd = -1*DBL_MAX; 
    for (i = 0; i<numIndexFiles; i++) {
        if (i % size == rank) {
            /*Create a temporary file */
            pid = pids[i]; /* get pid from the constant list*/
            sprintf(name, "tempFile%d.txt", rank); 
            tmp = fopen(name, "w+"); 
            if (tmp == NULL) {
                printf("ERROR:Could not open temp file\n"); 
                return -1; 
            }
            retv = container_dump_index(tmp, mount, 0, 1, pid); 
            fclose(tmp); 
            if (retv == PLFS_SUCCESS) {
                tmp = fopen(name, "r"); 
                if (tmp == NULL) {
                    printf("ERROR:Could not open temp file\n"); 
                    return -1; 
                }
                while( fgets(buffer, sizeof(buffer), tmp) != NULL)
                {
                    if (buffer[0] != '#') 
                    {
                        int items = sscanf(buffer, 
                            "%d %c %lld %lld %lf %lf %lld",
                            &id, &io, &offset, &length, 
                            &beg, &end, &tail);
                        if (items != 7) {
                            printf("ERROR: sscanf failed. Buffer: %s\n",
                                   buffer);
                            return -1;
                        }
                        /* this id was arbitrary so I will change it to
                        * match the index id, which is the pid */
                        id = pid;
                        delta = end - beg; 
                        /* this calculates how many bins the time spans
                         * for the ios and the bandwidths so that each
                         * time bin gets everything from the current write */
                        binsSpanned = ceil((delta/binSize));
                        /* calculates the average of the bandwidth 
                         * for the current write in terms of bins covered*/
                        averageBan = (length/(delta*1024*1024) *
                                      (delta/binSize - floor(delta/binSize)));
                        startBin = floor((beg-min)/binSize); 
                        for (int j =0; j<binsSpanned; j++) 
                        {
                            /* writing at these bins */
                            sendBandwidths[startBin+j] += averageBan; 
                            if (j < (binsSpanned - 1))
                                sendIOsTime[startBin+j]++; 
                        }
                        // here we increment all further bins for a running sum
                        int i = 0;
                        while (startBin + binsSpanned + i < numBins) 
                        {
                            sendIOsFin[startBin+binsSpanned+i]++;
                            i++;
                        }
                        int writeIndex = powersOfTwo(length); 
                        /* The last bin is for all that is above 1 PiB 
                         * which is 51 as writeCounts is of length 51 */
                        if (writeIndex >= 51)
                        {
                            sendWriteCount[50]++; 
                        }
                        else {
                            sendWriteCount[writeIndex]++; 
                        }
                        if (fileEnd < end) 
                        {
                            fileEnd = end; 
                        }
                    }
                }
                fclose(tmp); 
            }
            else {
                printf("ERROR: Container_dump_index did not succeed: %s\n", 
                        strplfserr(retv)); 
                return -1; 
            }
            /* get the difference of the end of the pid to the average and
             * use to get the standard deviation */
            double diffAvg = fileEnd - average; 
            double squareDiff = diffAvg * diffAvg;
            sendSumDiffSquare += squareDiff;
            fileEnd = DBL_MIN; 
        }
        else {
            continue; 
        }
    }

    /* use MPI Reduce to get the sums of each of the arrays */
    MPI_Barrier(MPI_COMM_WORLD); 
    MPI_Allreduce(sendBandwidths, bandwidths, numBins, MPI_DOUBLE, 
                MPI_SUM, MPI_COMM_WORLD); 
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Allreduce(sendIOsTime, iosTime, numBins, MPI_LONG_LONG_INT, MPI_SUM, 
                MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Allreduce(sendIOsFin, iosFin, numBins, MPI_LONG_LONG_INT, 
                MPI_SUM, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Allreduce(sendWriteCount, writeCount, 51, MPI_LONG_LONG_INT, 
                MPI_SUM, MPI_COMM_WORLD); 
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Allreduce(&sendSumDiffSquare, &sumDiffSquare, 1, MPI_DOUBLE, 
                MPI_SUM, MPI_COMM_WORLD); 
    if (rank == 0) {
        /* calculate standard deviation */
        *stdev = sqrt(sumDiffSquare/numIndexFiles); 
    }
    free(sendBandwidths); 
    free(sendIOsTime); 
    free(sendIOsFin); 
    free(sendWriteCount); 
    unlink(name); 
    return 0;
}
Esempio n. 14
0
/*
 *	file_open_plfs
 *
 *	Function:	- opens a new file
 *	Accepts:	- same arguments as MPI_File_open()
 *	Returns:	- OMPI_SUCCESS if the file was opened, OMPI_ERROR otherwise
 *
 *	The PLFS path is built as "<cwd>/<filename>".  Rank 0 opens the
 *	file first (it is the only rank that may add O_CREAT), broadcasts
 *	its plfs_open() result, and the remaining ranks open the file only
 *	if rank 0 succeeded.  On success the PLFS handle is stored in
 *	fh->f_fs_ptr.  If mca_fs_plfs_num_hostdir is positive it is set on
 *	the opened handle under the "num_hostdirs" key via plfs_setxattr().
 *
 *	NOTE(review): filename is always appended to the current working
 *	directory, even if it is already absolute -- confirm that callers
 *	only pass relative names.
 *	NOTE(review): the error returns taken before the broadcast assume
 *	that every rank fails the same way (same cwd / same filename);
 *	otherwise the remaining ranks would wait in the broadcast.
 */
int
mca_fs_plfs_file_open (struct ompi_communicator_t *comm,
                     const char* filename,
                     int access_mode,
                     struct ompi_info_t *info,
                     mca_io_ompio_file_t *fh)
{
    int rank;
    int amode;
    int old_mask, perm;
    int n;
    plfs_error_t plfs_ret;
    Plfs_fd *pfd = NULL;
    char cwd[1024];
    char wpath[1024];
    size_t len = sizeof(int);
    char key[] = "num_hostdirs";

    rank = ompi_comm_rank ( comm );

    /* Build "<cwd>/<filename>" in a separate buffer: passing the
     * destination buffer as a source argument to sprintf() is undefined
     * behaviour, and the result must be bounded by sizeof(wpath). */
    if (NULL == getcwd( cwd, sizeof(cwd) )) {
        opal_output(0, "fs_plfs_file_open: Error in getcwd\n");
        return OMPI_ERROR;
    }
    n = snprintf( wpath, sizeof(wpath), "%s/%s", cwd, filename );
    if (n < 0 || (size_t) n >= sizeof(wpath)) {
        opal_output(0, "fs_plfs_file_open: path too long: %s/%s\n",
                    cwd, filename);
        return OMPI_ERROR;
    }

    if (OMPIO_PERM_NULL == fh->f_perm) {
        /* No explicit permissions requested: derive them from the
         * process umask.  umask() can only be read by setting it, so
         * set a temporary value and restore the old one right away. */
        old_mask = umask(022);
        umask(old_mask);
        /* Clear the masked bits from rw-rw-rw-.  (An XOR would turn ON
         * any umask bit that lies outside 0666, e.g. execute bits.) */
        perm = 0666 & ~old_mask;
    }
    else {
        perm = fh->f_perm;
    }

    /* Translate the MPI access mode into open(2)-style flags. */
    amode = 0;

    if (access_mode & MPI_MODE_RDONLY) {
        amode = amode | O_RDONLY;
    }
    if (access_mode & MPI_MODE_WRONLY) {
        amode = amode | O_WRONLY;
    }
    if (access_mode & MPI_MODE_RDWR) {
        amode = amode | O_RDWR;
    }
    if (access_mode & MPI_MODE_EXCL) {
        if ( is_plfs_path(wpath) == 1 ) { /* the file already exists */
            return OMPI_ERROR;
        }
    }

    if (0 == rank) {
        /* MODE_CREATE and MODE_EXCL can only be set by one process */
        if (access_mode & MPI_MODE_CREATE) {
            amode = amode | O_CREAT;
        }

        plfs_ret = plfs_open( &pfd, wpath, amode, 0, perm, NULL );
        fh->f_fs_ptr = pfd;
    }

    /* Everybody learns whether rank 0 managed to open/create the file. */
    comm->c_coll.coll_bcast ( &plfs_ret, 1, MPI_INT, 0, comm, comm->c_coll.coll_bcast_module);
    if ( PLFS_SUCCESS != plfs_ret ) {
        return OMPI_ERROR;
    }

    if (0 != rank) {
        /* The file exists now; open it without O_CREAT. */
        plfs_ret = plfs_open( &pfd, wpath, amode, 0, perm, NULL );
        if (PLFS_SUCCESS != plfs_ret) {
            opal_output(0, "fs_plfs_file_open: Error in plfs_open:\n%s\n", strplfserr(plfs_ret));
            return OMPI_ERROR;
        }
        else {
            fh->f_fs_ptr = pfd;
        }
    }

    if (mca_fs_plfs_num_hostdir > 0) {
        plfs_ret = plfs_setxattr( pfd, &mca_fs_plfs_num_hostdir, key, len );
        if (PLFS_SUCCESS != plfs_ret) {
            opal_output(0, "fs_plfs_file_open: Error in plfs_setxattr:\n%s\n", strplfserr(plfs_ret));
            return OMPI_ERROR;
        }
    }
    return OMPI_SUCCESS;
}
Esempio n. 15
0
/*
 * nextdropping: step to the next dropping inside a container.
 *
 * The caller keeps the iteration state in *candir / *subdir (both NULL
 * before the first call) and calls this function repeatedly.  Entries of
 * the container directory are selected with HOSTDIRPREFIX and entries of
 * each sub-directory with dropping_filter (both via getnextent()).
 *
 * Results:
 *   - PLFS_SUCCESS with *dropping == 1: *droppingpath holds
 *     "<hostdirpath>/<entry name>", *dropback the backend whose store
 *     opened the sub-directory, and *subdir stays open for the next call.
 *   - PLFS_SUCCESS with *dropping == 0: the container directory is
 *     exhausted; it has been closed and *candir reset to NULL.
 *   - any other return value: an Opendir call failed; *dropping is -1.
 *     If it was the sub-directory open that failed, *candir is not
 *     closed here.
 */
static plfs_error_t
nextdropping(const string& canbpath, struct plfs_backend *canback,
             string *droppingpath, struct plfs_backend **dropback,
             const char *dropping_filter,
             IOSDirHandle **candir, IOSDirHandle **subdir,
             string *hostdirpath, int *dropping) {
    struct dirent entry;
    string candidate, resolved;
    plfs_error_t status;

    *dropping = -1;

    const char *still = (*candir != NULL) ? "still " : "";
    mlog(IDX_DAPI, "nextdropping: %slooking in %s", still, canbpath.c_str());

    /* a NULL *candir marks the first call: open the container directory */
    if (*candir == NULL) {
        status = canback->store->Opendir(canbpath.c_str(), candir);
        if (status != PLFS_SUCCESS) {
            return status;
        }
    }

    /* loop until some sub-directory yields a matching entry */
    for (;;) {

        /* make sure a sub-directory is open */
        if (*subdir == NULL) {

            if (getnextent(*candir, HOSTDIRPREFIX, &entry) == NULL) {
                /* container directory exhausted: close it and report done */
                canback->store->Closedir(*candir);
                *candir = NULL;
                *dropping = 0;
                return PLFS_SUCCESS;
            }

            /* the entry may be a metalink; start from the canonical backend */
            candidate = canbpath + "/" + entry.d_name;
            *dropback = canback;
            status = Container::resolveMetalink(candidate, canback, NULL,
                                                resolved, dropback);
            /* resolved path on success, the plain path otherwise */
            *hostdirpath = (status == PLFS_SUCCESS) ? resolved : candidate;

            status = (*dropback)->store->Opendir(hostdirpath->c_str(),
                                                 subdir);
            if (status != PLFS_SUCCESS) {
                mlog(IDX_DRARE, "opendir %s: %s", hostdirpath->c_str(),
                     strplfserr(status));
                return status;
            }
            mlog(IDX_DCOMMON, "%s opened dir %s", __FUNCTION__,
                 hostdirpath->c_str());
        }

        /* got a matching entry in the open sub-directory? then we are done */
        if (getnextent(*subdir, dropping_filter, &entry) != NULL) {
            break;
        }

        /* end of this sub-directory: close it and move on to the next one */
        (*dropback)->store->Closedir(*subdir);
        *dropback = NULL;
        *subdir = NULL;
    }

    /* hand the full path of the entry back to the caller */
    *droppingpath = *hostdirpath + "/" + entry.d_name;
    *dropping = 1;
    return PLFS_SUCCESS;
}
Esempio n. 16
0
/**
 * ByteRangeIndex::trunc_edit_nz: edit droppings to shrink container
 * to a non-zero size.  if we are truncating an open file and we
 * edit the current index dropping, then update the filehandle for
 * it since the offsets may have changed.
 *
 * every index dropping returned by nextdropping() is read into a
 * temporary map; droppings whose eof lies beyond nzo are truncated
 * in memory and written back over the original file.  processing
 * stops at the first error (directory walk, read, open or rewrite)
 * and that error is returned.  any directory handles that are still
 * open when the walk stops are closed before returning.
 *
 * @param ppip pathinfo for container
 * @param nzo the non-zero offset
 * @param openidrop filename of currently open index dropping
 * @return PLFS_SUCCESS or error code
 */
plfs_error_t
ByteRangeIndex::trunc_edit_nz(struct plfs_physpathinfo *ppip, off_t nzo,
                              string openidrop) {
    plfs_error_t ret = PLFS_SUCCESS;
    size_t slashoff;
    string openifn, indexfile;
    struct plfs_backend *indexback = NULL;
    IOSDirHandle *candir, *subdir;
    string hostdirpath;
    int dropping;

    mlog(IDX_DAPI, "%s on %s to %ld", __FUNCTION__, ppip->canbpath.c_str(),
         (long)nzo);
 
    /*
     * isolate the filename in openidrop.  in theory we should be
     * able to just compare the whole thing, but PLFS sometimes
     * puts in extra "/" chars in filenames and that could mess
     * us up: e.g.  /m/plfs/dir/file  vs /m/plfs/dir//file
     * should be the same, but the "//" will fool strcmps.
     */
    if (openidrop.size() == 0) {
        openifn = "";
    } else {
        slashoff = openidrop.rfind("/");
        if (slashoff == string::npos) {
            openifn = openidrop;
        } else {
            openifn = openidrop.substr(slashoff + 1, string::npos);
        }
    }

    /*
     * this code goes through each index dropping and rewrites it
     * preserving only entries that contain data prior to truncate
     * offset...
     */
    candir = subdir = NULL;
    while ((ret = nextdropping(ppip->canbpath, ppip->canback,
                               &indexfile, &indexback,
                               INDEXPREFIX, &candir, &subdir,
                               &hostdirpath, &dropping)) == PLFS_SUCCESS) {
        if (dropping != 1) {
            break;                   /* no more droppings, we are done */
        }

        /* read dropping file into a tmp index map */
        map<off_t,ContainerEntry> tmpidx;
        vector<ChunkFile> tmpcnk;
        off_t eof, bytes;
        IOSHandle *fh;

        eof = bytes = 0;
        ret = ByteRangeIndex::merge_dropping(tmpidx, tmpcnk, &eof,
                                             &bytes, indexfile, indexback);
        
        if (ret != PLFS_SUCCESS) {
            mlog(IDX_CRIT, "Failed to read index file %s: %s",
                 indexfile.c_str(), strplfserr( ret ));
            break;
        }
            
        /* we have to rewrite only if it had data past our new eof (nzo) */
        if (eof > nzo) {
            mlog(IDX_DCOMMON, "%s %s at %ld", __FUNCTION__, indexfile.c_str(),
                 (long)nzo);

            ByteRangeIndex::trunc_map(tmpidx, nzo);

            /*
             * XXX: copied from old code.  should this write to a tmp
             * file and then rename?
             */
            ret = indexback->store->Open(indexfile.c_str(),
                                         O_TRUNC|O_WRONLY, &fh);
            if ( ret != PLFS_SUCCESS ) {
                mlog(IDX_CRIT, "Couldn't overwrite index file %s: %s",
                     indexfile.c_str(), strplfserr( ret ));
                /* leave through the common exit so the open directory
                 * handles get closed */
                break;
            }

            ret = ByteRangeIndex::trunc_writemap(tmpidx, fh);

            /*
             * if we just rewrote our currently open index, swap our
             * iwritefh to be the rewritten fh and let the old one get
             * closed off (below).  we do this because we've edited
             * the open write index dropping (made it smaller) and
             * the old filehandle is now pointing to a bad spot in
             * the file.  this is done even if the rewrite failed:
             * the O_TRUNC open above has already changed the file.
             */
            if (openifn.size() > 0 && indexback == this->iwriteback &&
                filematch(indexfile, openifn)) {
                IOSHandle *tmp;
                tmp = this->iwritefh;
                this->iwritefh = fh;
                fh = tmp;
            }

            /* XXX: do we care about return value from Close? */
            indexback->store->Close(fh);

            /*
             * report a failed rewrite instead of letting the next
             * nextdropping() call overwrite ret and hide it.
             */
            if (ret != PLFS_SUCCESS) {
                mlog(IDX_CRIT, "Couldn't rewrite index file %s: %s",
                     indexfile.c_str(), strplfserr( ret ));
                break;
            }
        }
    }

    /*
     * if we stopped in the middle of the walk, the directory handles
     * used by nextdropping are still open: close them.  (when the walk
     * completes normally nextdropping has already closed them and reset
     * the pointers to NULL.)  subdir is only non-NULL after nextdropping
     * opened it through indexback's store.
     * NOTE(review): assumes a failed Opendir leaves the handle NULL.
     */
    if (subdir != NULL && indexback != NULL) {
        indexback->store->Closedir(subdir);
        subdir = NULL;
    }
    if (candir != NULL) {
        ppip->canback->store->Closedir(candir);
        candir = NULL;
    }

    mlog(IDX_DAPI, "%s on %s to %ld ret: %d",
         __FUNCTION__, ppip->canbpath.c_str(), (long)nzo, ret);
    return(ret);
}