Exemplo n.º 1
0
/*-------------------------------------------------------------------------*/
/**
 * Writes the protected variable identified by varID into the POSIX
 * checkpoint file whose FILE* handle is stored in FTI_Exec->iCPInfo.fh.
 *
 * The file position of a variable is the sum of count*eleSize of all
 * datasets that precede it in FTI_Data.
 *
 * @param varID     Id of the dataset to write.
 * @param FTI_Conf  Configuration (not used by this function).
 * @param FTI_Exec  Execution metadata; supplies nbVar and the file handle,
 *                  and receives iCPInfo.result = FTI_SCES on success.
 * @param FTI_Topo  Topology (not used by this function).
 * @param FTI_Ckpt  Checkpoint metadata (not used by this function).
 * @param FTI_Data  Array of FTI_Exec->nbVar protected datasets.
 * @return FTI_SCES on success, FTI_NSCS on any seek/write/stream error.
 */
int FTI_WritePosixVar(int varID, FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
        FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
        FTIT_dataset* FTI_Data)
{
    FILE *fd;
    int res;
    char str[FTI_BUFS];
    long offset = 0;
    int i;

    // The handle is stored as raw bytes inside the execution structure.
    memcpy( &fd, FTI_Exec->iCPInfo.fh, sizeof(FTI_PO_FH) );

    // write data into ckpt file
    for (i = 0; i < FTI_Exec->nbVar; i++) {
        if( FTI_Data[i].id == varID ) {
            clearerr(fd);
            // NOTE(review): the fseek and ferror failure paths leave the stream
            // open, unlike the write failures below — confirm the caller closes it.
            if ( fseek( fd, offset, SEEK_SET ) == -1 ) {
                FTI_Print("Error on fseek in writeposixvar", FTI_EROR );
                return FTI_NSCS;
            }
            if ( !(FTI_Data[i].isDevicePtr) ){
                // Fill the message buffer before printing; it used to be
                // passed to FTI_Print uninitialised.
                snprintf(str, FTI_BUFS, "Dataset #%d Writing CPU Data.", FTI_Data[i].id);
                FTI_Print(str,FTI_INFO);
                if (( res = FTI_Try(write_posix(FTI_Data[i].ptr, FTI_Data[i].size, fd),"Storing Data to Checkpoint file")) != FTI_SCES){
                    snprintf(str, FTI_BUFS, "Dataset #%d could not be written.", FTI_Data[i].id);
                    FTI_Print(str, FTI_EROR);
                    fclose(fd);
                    return FTI_NSCS;
                }
            }
#ifdef GPUSUPPORT
            // if data are stored to the GPU move them from device
            // memory to cpu memory and store them.
            else {
                snprintf(str, FTI_BUFS, "Dataset #%d Writing GPU Data.", FTI_Data[i].id);
                FTI_Print(str,FTI_INFO);
                if ((res = FTI_Try(
                                FTI_TransferDeviceMemToFileAsync(&FTI_Data[i],  write_posix, fd),
                                "moving data from GPU to storage")) != FTI_SCES) {
                    snprintf(str, FTI_BUFS, "Dataset #%d could not be written.", FTI_Data[i].id);
                    FTI_Print(str, FTI_EROR);
                    fclose(fd);
                    return FTI_NSCS;
                }
            }
#endif
            if (ferror(fd)) {
                return FTI_NSCS;
            }
        }
        // Advance past this dataset whether or not it was the one written.
        offset += FTI_Data[i].count*FTI_Data[i].eleSize;
    }

    FTI_Exec->iCPInfo.result = FTI_SCES;

    return FTI_SCES;

}
Exemplo n.º 2
0
/*-------------------------------------------------------------------------*/
/**
 * Writes the protected variable identified by varID through the MPI-IO
 * file handle stored in FTI_Exec->iCPInfo.fh.
 *
 * The write offset starts at FTI_Exec->iCPInfo.offset and is advanced by
 * the size of every dataset that precedes the requested one in FTI_Data.
 *
 * @param varID     Id of the dataset to write.
 * @param FTI_Conf  Configuration; forwarded to the writer through write_info.
 * @param FTI_Exec  Execution metadata; supplies nbVar, the file handle and the
 *                  base offset, and receives iCPInfo.result = FTI_SCES on success.
 * @param FTI_Topo  Topology (not used by this function).
 * @param FTI_Ckpt  Checkpoint metadata (not used by this function).
 * @param FTI_Data  Array of FTI_Exec->nbVar protected datasets.
 * @return FTI_SCES on success, otherwise the failing writer's result code
 *         (the MPI file is closed before returning).
 */
int FTI_WriteMpiVar(int varID, FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
        FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
        FTIT_dataset* FTI_Data)
{
    char str[FTI_BUFS];
    WriteMPIInfo_t write_info;
    int res;
    int i;

    // The handle is stored as raw bytes inside the execution structure.
    memcpy( &write_info.pfh, FTI_Exec->iCPInfo.fh, sizeof(FTI_MI_FH) );

    write_info.offset = FTI_Exec->iCPInfo.offset;
    write_info.FTI_Conf = FTI_Conf;

    for (i = 0; i < FTI_Exec->nbVar; i++) {
        if ( FTI_Data[i].id == varID ) {
            if ( !(FTI_Data[i].isDevicePtr) ){
                // Fill the message buffer before printing; it used to be
                // passed to FTI_Print uninitialised.
                snprintf(str, FTI_BUFS, "Dataset #%d Writing CPU Data.", FTI_Data[i].id);
                FTI_Print(str,FTI_INFO);
                // The result must go through FTI_Try: the previous comma
                // expression compared the message literal with FTI_SCES, so
                // every write was treated as a failure.
                if (( res = FTI_Try(write_mpi(FTI_Data[i].ptr, FTI_Data[i].size, &write_info),
                                "Storing Data to checkpoint file")) != FTI_SCES){
                    snprintf(str, FTI_BUFS, "Dataset #%d could not be written.", FTI_Data[i].id);
                    FTI_Print(str, FTI_EROR);
                    MPI_File_close(&write_info.pfh);
                    return res;
                }
            }
#ifdef GPUSUPPORT
            // Download data from the GPU if necessary:
            // the data are stored on the GPU side.
            else {
                snprintf(str, FTI_BUFS, "Dataset #%d Writing GPU Data.", FTI_Data[i].id);
                FTI_Print(str,FTI_INFO);
                if ((res = FTI_Try(
                                FTI_TransferDeviceMemToFileAsync(&FTI_Data[i], write_mpi, &write_info),
                                "moving data from GPU to storage")) != FTI_SCES) {
                    snprintf(str, FTI_BUFS, "Dataset #%d could not be written.", FTI_Data[i].id);
                    FTI_Print(str, FTI_EROR);
                    MPI_File_close(&write_info.pfh);
                    return res;
                }
            }
#endif
        }
        // Advance past this dataset whether or not it was the one written.
        write_info.offset += FTI_Data[i].size;
    }

    FTI_Exec->iCPInfo.result = FTI_SCES;

    return FTI_SCES;

}
Exemplo n.º 3
0
/*-------------------------------------------------------------------------*/
/**
 * Writes all protected datasets into this rank's checkpoint file and then
 * creates the checkpoint metadata.
 *
 * The file is named "Ckpt<ckptID>-Rank<myRank>.fti" and is placed in the
 * global temporary directory when level 4 is inline and the current level
 * is 4, otherwise in the local temporary directory.
 *
 * @param FTI_Data  Array of FTI_Exec.nbVar protected datasets.
 * @return FTI_NSCS if the file cannot be named, opened, written, flushed or
 *         closed; otherwise the result of FTI_CreateMetadata via FTI_Try.
 */
int FTI_WriteCkpt(FTIT_dataset* FTI_Data) {
    int i, res, len;
    FILE *fd;
    double tt = MPI_Wtime();
    char fn[FTI_BUFS], str[FTI_BUFS];
    int globalTmp = (FTI_Ckpt[4].isInline && FTI_Exec.ckptLvel == 4) ? 1 : 0;

    snprintf(FTI_Exec.ckptFile, FTI_BUFS, "Ckpt%d-Rank%d.fti", FTI_Exec.ckptID, FTI_Topo.myRank);
    if (globalTmp)
    {
        len = snprintf(fn, FTI_BUFS, "%s/%s", FTI_Conf.gTmpDir, FTI_Exec.ckptFile);
        // Result deliberately ignored: the directory may already exist, and a
        // real failure surfaces through fopen below.
        mkdir(FTI_Conf.gTmpDir, 0777);
    } else {
        len = snprintf(fn, FTI_BUFS, "%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile);
        mkdir(FTI_Conf.lTmpDir, 0777);
    }
    if (len < 0 || len >= FTI_BUFS)
    { // A truncated path would silently target the wrong file.
        FTI_Print("FTI checkpoint file name is too long.", FTI_EROR);
        return FTI_NSCS;
    }
    fd = fopen(fn, "wb");
    if (fd == NULL)
    {
        FTI_Print("FTI checkpoint file could not be opened.", FTI_EROR);
        return FTI_NSCS;
    }
    for(i = 0; i < FTI_Exec.nbVar; i++)
    {
        if (fwrite(FTI_Data[i].ptr, FTI_Data[i].eleSize, FTI_Data[i].count, fd) != FTI_Data[i].count)
        {
            snprintf(str, FTI_BUFS, "Dataset #%d could not be written.", FTI_Data[i].id);
            FTI_Print(str, FTI_EROR);
            fclose(fd); // do not leak the stream on the error path
            return FTI_NSCS;
        }
    }
    if (fflush(fd) != 0)
    {
        FTI_Print("FTI checkpoint file could not be flushed.", FTI_EROR);
        fclose(fd); // do not leak the stream on the error path
        return FTI_NSCS;
    }
    if (fclose(fd) != 0)
    {
        FTI_Print("FTI checkpoint file could not be closed.", FTI_EROR);
        return FTI_NSCS;
    }
    snprintf(str, FTI_BUFS, "Time writing checkpoint file : %f seconds.", MPI_Wtime()-tt);
    FTI_Print(str, FTI_DBUG);
    res = FTI_Try(FTI_CreateMetadata(globalTmp), "create metadata.");
    return res;
}
Exemplo n.º 4
0
/*-------------------------------------------------------------------------*/
/**
 * Head-process loop body: receives one integer message from every
 * application process of the node, derives the checkpoint level from the
 * received values, post-processes the checkpoint and answers every
 * application process with the outcome.
 *
 * A received value v is counted in slot (v - FTI_BASE). A level in 1..6 is
 * selected when all application processes sent it; any message in slot 6
 * forces level 6. Level 5 is the request to finalize.
 *
 * @return FTI_ENDW when finalization was requested, FTI_SCES otherwise
 *         (the post-processing result is sent to the application processes,
 *         not returned).
 */
int FTI_Listen() {
    MPI_Status status;
    char str[FTI_BUFS];
    int flags[7] = {0};
    const int nbFlags = (int)(sizeof flags / sizeof flags[0]);
    int i, buf, res;

    FTI_Print("Head listening...", FTI_DBUG);
    for(i = 0; i < FTI_Topo.nbApprocs; i++)
    { // Iterate on the application processes in the node
        MPI_Recv(&buf, 1, MPI_INT, FTI_Topo.body[i], FTI_Conf.tag, FTI_Exec.globalComm, &status);
        snprintf(str, FTI_BUFS, "The head received a %d message", buf);
        FTI_Print(str, FTI_DBUG);
        fflush(stdout);

        int slot = buf - FTI_BASE;
        if (slot < 0 || slot >= nbFlags)
        { // A value outside the known range must not index the counters.
            snprintf(str, FTI_BUFS, "The head ignored the unexpected message %d", buf);
            FTI_Print(str, FTI_DBUG);
            continue;
        }
        flags[slot] = flags[slot] + 1;
    }
    for (i = 1; i < nbFlags; i++)
    {
        if (flags[i] == FTI_Topo.nbApprocs)
        { // Determining checkpoint level
            FTI_Exec.ckptLvel = i;
        }
    }
    if (flags[6] > 0)
    {
        FTI_Exec.ckptLvel = 6;
    }
    if (FTI_Exec.ckptLvel == 5)
    { // If we were asked to finalize
        return FTI_ENDW;
    }
    res = FTI_Try(FTI_PostCkpt(1, 0, FTI_Topo.nbApprocs), "postprocess the checkpoint.");
    if (res == FTI_SCES)
    {
        FTI_Exec.wasLastOffline = 1;
        FTI_Exec.lastCkptLvel = FTI_Exec.ckptLvel;
        res = FTI_Exec.ckptLvel;
    }
    for(i = 0; i < FTI_Topo.nbApprocs; i++)
    { // Send msg. to avoid checkpoint collision
        MPI_Send(&res, 1, MPI_INT, FTI_Topo.body[i], FTI_Conf.tag, FTI_Exec.globalComm);
    }
    return FTI_SCES;
}
Exemplo n.º 5
0
/*-------------------------------------------------------------------------*/
/**
 * Finalizes an incremental checkpoint in FTI-FF format: creates the FTI-FF
 * metadata, writes it through the file descriptor stored in
 * FTI_Exec->iCPInfo.fh, syncs the data to storage and closes the descriptor.
 *
 * @param FTI_Conf  Configuration; forwarded to FTIFF_CreateMetadata.
 * @param FTI_Exec  Execution metadata; supplies the iCP status and descriptor.
 * @param FTI_Topo  Topology; forwarded to FTIFF_CreateMetadata.
 * @param FTI_Ckpt  Checkpoint metadata (not used by this function).
 * @param FTI_Data  Protected datasets; forwarded to FTIFF_CreateMetadata.
 * @return FTI_SCES on success; FTI_NSCS if the incremental checkpoint already
 *         failed, the metadata could not be created, or the file could not
 *         be synced or closed.
 */
int FTI_FinalizeFtiffICP(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
        FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
        FTIT_dataset* FTI_Data)
{
    if ( FTI_Exec->iCPInfo.status == FTI_ICP_FAIL ) {
        return FTI_NSCS;
    }

    // The descriptor is stored as raw bytes inside the execution structure.
    int fd;
    memcpy( &fd, FTI_Exec->iCPInfo.fh, sizeof(FTI_FF_FH) );

    if ( FTI_Try( FTIFF_CreateMetadata( FTI_Exec, FTI_Topo, FTI_Data, FTI_Conf ), "Create FTI-FF meta data" ) != FTI_SCES ) {
        close( fd ); // do not leak the descriptor on the error path
        return FTI_NSCS;
    }

    // NOTE(review): the result of FTIFF_writeMetaDataFTIFF is not inspected
    // because its return contract is not visible here — confirm and check it.
    FTIFF_writeMetaDataFTIFF( FTI_Exec, fd );

    // Always attempt the close, even when the sync failed, but report either
    // failure: an unsynced checkpoint must not be announced as successful.
    int syncFailed = ( fdatasync( fd ) != 0 );
    int closeFailed = ( close( fd ) != 0 );
    if ( syncFailed || closeFailed ) {
        return FTI_NSCS;
    }

    return FTI_SCES;

}
Exemplo n.º 6
0
Arquivo: hdf5.c Projeto: leobago/fti
/*-------------------------------------------------------------------------*/
/**
 * Reads one protected variable back from the HDF5 dataset named
 * FTI_DataVar->name inside the variable's HDF5 group.
 *
 * Host-resident data is read in a single H5Dread into FTI_DataVar->ptr.
 * With GPUSUPPORT, device-resident data is read block by block into the host
 * staging buffers and copied to FTI_DataVar->devicePtr on two alternating
 * CUDA streams.
 *
 * All HDF5 handles and temporary buffers opened here are released on every
 * path that this function controls.
 *
 * @param FTI_DataVar  Dataset descriptor of the variable to read.
 * @return FTI_SCES on success, FTI_NSCS on failure.
 */
int FTI_ReadHDF5Var(FTIT_dataset *FTI_DataVar)
{
    char str[FTI_BUFS];
    int res;

    hid_t dataset = H5Dopen(FTI_DataVar->h5group->h5groupID, FTI_DataVar->name, H5P_DEFAULT);
    if (dataset < 0) {
        snprintf(str, FTI_BUFS, "Dataset #%d could not be read", FTI_DataVar->id);
        FTI_Print(str, FTI_EROR);
        return FTI_NSCS;
    }
    hid_t dataspace = H5Dget_space(dataset);
    if (dataspace < 0) {
        H5Dclose(dataset);
        snprintf(str, FTI_BUFS, "Dataset #%d could not be read", FTI_DataVar->id);
        FTI_Print(str, FTI_EROR);
        return FTI_NSCS;
    }

    // If the data live on the CPU side, read them straight from the file
    // and return.
#ifdef GPUSUPPORT
    if ( !FTI_DataVar->isDevicePtr ){
#endif
        int failed = 0;

        res = H5Dread(dataset,FTI_DataVar->type->h5datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, FTI_DataVar->ptr);
        if (res < 0) {
            failed = 1;
        }
        // Both handles are closed regardless of earlier failures so that
        // nothing leaks; any failure is reported once below.
        if (H5Dclose(dataset) < 0) {
            failed = 1;
        }
        if (H5Sclose(dataspace) < 0) {
            failed = 1;
        }
        if (failed) {
            snprintf(str, FTI_BUFS, "Dataset #%d could not be read", FTI_DataVar->id);
            FTI_Print(str, FTI_EROR);
            return FTI_NSCS;
        }
        return FTI_SCES;
#ifdef GPUSUPPORT
    }

    // This code is only executed in the GPU case.

    hsize_t dimLength[32];
    hsize_t *count = NULL;
    hsize_t *offset = NULL;
    hsize_t seperator;
    hsize_t totalBytes = FTI_DataVar->size;
    size_t fetchBytes;
    size_t hostBufSize;
    unsigned char *basePtr = NULL;
    unsigned char *dPtr = NULL;
    cudaStream_t streams[2];
    int streamsCreated = 0;
    int closeFailed = 0;
    int status = FTI_NSCS;
    int id = 0;
    int prevId = 1;
    int j;

    // dimLength has a fixed capacity; refuse ranks that would overflow it.
    if (FTI_DataVar->rank > (int)(sizeof dimLength / sizeof dimLength[0])) {
        snprintf(str, FTI_BUFS, "Dataset #%d could not be read", FTI_DataVar->id);
        FTI_Print(str, FTI_EROR);
        goto cleanup;
    }
    for (j = 0; j < FTI_DataVar->rank; j++) {
        dimLength[j] = FTI_DataVar->dimLength[j];
    }

    count = (hsize_t*) malloc (sizeof(hsize_t)*FTI_DataVar->rank);
    offset= (hsize_t*) calloc (FTI_DataVar->rank,sizeof(hsize_t));

    if ( !count|| !offset){
        snprintf(str, FTI_BUFS, "Could Not allocate count and offset regions");
        FTI_Print(str, FTI_EROR);
        goto cleanup;
    }

    // Calculate how many dimensions can be processed per request and how
    // big the host-GPU communication buffer has to be.
    hostBufSize = FTI_getHostBuffSize();
    fetchBytes = FTI_calculateCountDim(FTI_DataVar->eleSize, hostBufSize ,count, FTI_DataVar->rank, dimLength, &seperator);

    // If the buffer is smaller than the minimum amount
    // then a bigger one has to be allocated.
    if (hostBufSize < fetchBytes){
        if ( FTI_Try( FTI_DestroyDevices(), "Deleting host buffers" ) != FTI_SCES){
            snprintf(str, FTI_BUFS, "Dataset #%d could not be read", FTI_DataVar->id);
            FTI_Print(str, FTI_EROR);
            goto cleanup;
        }

        if ( FTI_Try (FTI_InitDevices( fetchBytes ), "Allocating host buffers")!= FTI_SCES) {
            snprintf(str, FTI_BUFS, "Dataset #%d could not be read", FTI_DataVar->id);
            FTI_Print(str, FTI_EROR);
            goto cleanup;
        }
    }

    // NOTE(review): CUDA_ERROR_CHECK is presumably a macro that returns from
    // this function on failure; if so, those exits bypass the cleanup label
    // below — confirm against the macro definition.
    // Create the streams for the asynchronous data movement.
    CUDA_ERROR_CHECK(cudaStreamCreate(&(streams[0])));
    CUDA_ERROR_CHECK(cudaStreamCreate(&(streams[1])));
    streamsCreated = 1;
    dPtr = FTI_DataVar->devicePtr;

    // NOTE(review): the loop assumes FTI_DataVar->size is a non-zero multiple
    // of fetchBytes; otherwise the unsigned subtraction wraps — confirm that
    // FTI_calculateCountDim guarantees this.
    // Loop until all data are processed.
    while( totalBytes  ){
        basePtr = FTI_getHostBuffer(id);
        // Read file
        res = FTI_ReadElements( dataspace, FTI_DataVar->type->h5datatype, dataset, count, offset, FTI_DataVar->rank , basePtr);
        // Check the read before queueing the copy so that a failed read is
        // never transferred to the device.
        if (res != FTI_SCES ) {
            snprintf(str, FTI_BUFS, "Dataset #%d could not be read", FTI_DataVar->id);
            FTI_Print(str, FTI_EROR);
            goto cleanup;
        }
        CUDA_ERROR_CHECK(cudaMemcpyAsync( dPtr , basePtr, fetchBytes, cudaMemcpyHostToDevice, streams[id]));
        // Increase accordingly the file offset
        FTI_AdvanceOffset(seperator, offset,count, dimLength, FTI_DataVar->rank);
        // Wait for the copy issued in the previous iteration so its host
        // buffer can be reused next.
        CUDA_ERROR_CHECK(cudaStreamSynchronize(streams[prevId]));
        prevId = id;
        id = (id + 1)%2;
        dPtr = dPtr + fetchBytes;
        totalBytes -= fetchBytes;
    }
    CUDA_ERROR_CHECK(cudaStreamSynchronize(streams[prevId]));
    CUDA_ERROR_CHECK(cudaStreamDestroy(streams[0]));
    CUDA_ERROR_CHECK(cudaStreamDestroy(streams[1]));
    streamsCreated = 0;
    status = FTI_SCES;

cleanup:
    if (streamsCreated) {
        // Error path only: release the streams, ignoring their result since
        // the call is already failing.
        cudaStreamDestroy(streams[0]);
        cudaStreamDestroy(streams[1]);
    }
    free(offset);
    free(count);
    if (H5Dclose(dataset) < 0) {
        closeFailed = 1;
    }
    if (H5Sclose(dataspace) < 0) {
        closeFailed = 1;
    }
    if (status == FTI_SCES && closeFailed) {
        snprintf(str, FTI_BUFS, "Dataset #%d could not be read", FTI_DataVar->id);
        FTI_Print(str, FTI_EROR);
        status = FTI_NSCS;
    }
    return status;
#endif
}
Exemplo n.º 7
0
Arquivo: hdf5.c Projeto: leobago/fti
/**
 * Creates the HDF5 dataset for one protected variable and writes its data.
 *
 * The dataset is created in the variable's HDF5 group with a creation
 * property list that requests the Fletcher32 checksum filter and a single
 * chunk spanning the whole dataset. Host-resident data is written with one
 * H5Dwrite; with GPUSUPPORT, device-resident data is fetched block by block
 * through the prefetcher and written piecewise.
 *
 * All HDF5 handles and temporary buffers created here are released on every
 * path.
 *
 * @param FTI_DataVar  Dataset descriptor of the variable to write.
 * @return FTI_SCES on success, FTI_NSCS on failure.
 */
int FTI_WriteHDF5Var(FTIT_dataset *FTI_DataVar)
{
    int j;
    hsize_t dimLength[32];
    char str[FTI_BUFS];
    int res;
    hid_t dcpl;

    // dimLength has a fixed capacity; refuse ranks that would overflow it.
    if (FTI_DataVar->rank > (int)(sizeof dimLength / sizeof dimLength[0])) {
        snprintf(str, FTI_BUFS, "Dataset #%d could not be written", FTI_DataVar->id);
        FTI_Print(str, FTI_EROR);
        return FTI_NSCS;
    }
    for (j = 0; j < FTI_DataVar->rank; j++) {
        dimLength[j] = FTI_DataVar->dimLength[j];
    }

    dcpl = H5Pcreate (H5P_DATASET_CREATE);
    // Failures of the two property setters are reported but not fatal here,
    // matching the previous behaviour: H5Dcreate2 below decides whether the
    // resulting property list is usable.
    res = H5Pset_fletcher32 (dcpl);
    if (res < 0) {
        snprintf(str, FTI_BUFS, "Dataset #%d: checksum filter could not be enabled", FTI_DataVar->id);
        FTI_Print(str, FTI_EROR);
    }
    res = H5Pset_chunk (dcpl, FTI_DataVar->rank, dimLength);
    if (res < 0) {
        snprintf(str, FTI_BUFS, "Dataset #%d: chunk layout could not be set", FTI_DataVar->id);
        FTI_Print(str, FTI_EROR);
    }

    hid_t dataspace = H5Screate_simple( FTI_DataVar->rank, dimLength, NULL);
    hid_t dataset = -1;
    if (dcpl >= 0 && dataspace >= 0) {
        dataset = H5Dcreate2 ( FTI_DataVar->h5group->h5groupID, FTI_DataVar->name,FTI_DataVar->type->h5datatype, dataspace,  H5P_DEFAULT, dcpl , H5P_DEFAULT);
    }
    if (dataset < 0) {
        // Release whatever was created before reporting the failure.
        if (dataspace >= 0) {
            H5Sclose(dataspace);
        }
        if (dcpl >= 0) {
            H5Pclose(dcpl);
        }
        snprintf(str, FTI_BUFS, "Dataset #%d could not be written", FTI_DataVar->id);
        FTI_Print(str, FTI_EROR);
        return FTI_NSCS;
    }

    // If the data live on the CPU side, store them to the file and return.
#ifdef GPUSUPPORT
    if ( !FTI_DataVar->isDevicePtr ){
#endif
        int failed = 0;

        res = H5Dwrite(dataset,FTI_DataVar->type->h5datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, FTI_DataVar->ptr);
        if (res < 0) {
            failed = 1;
        }
        // All handles are closed regardless of earlier failures so that
        // nothing leaks; any failure is reported once below.
        if (H5Pclose (dcpl) < 0) {
            failed = 1;
        }
        if (H5Dclose(dataset) < 0) {
            failed = 1;
        }
        if (H5Sclose(dataspace) < 0) {
            failed = 1;
        }
        if (failed) {
            snprintf(str, FTI_BUFS, "Dataset #%d could not be written", FTI_DataVar->id);
            FTI_Print(str, FTI_EROR);
            return FTI_NSCS;
        }
        return FTI_SCES;
#ifdef GPUSUPPORT
    }

    // This code is only executed in the GPU case.

    hsize_t *count = (hsize_t*) malloc (sizeof(hsize_t)*FTI_DataVar->rank);
    hsize_t *offset= (hsize_t*) calloc (FTI_DataVar->rank,sizeof(hsize_t));
    hsize_t seperator;
    hsize_t fetchBytes;
    FTIT_data_prefetch prefetcher;
    size_t bytesToWrite;
    unsigned char *basePtr = NULL;
    int closeFailed = 0;
    int status = FTI_NSCS;

    if ( !count|| !offset){
        snprintf(str, FTI_BUFS, "Could Not allocate count and offset regions");
        FTI_Print(str, FTI_EROR);
        goto cleanup;
    }

    fetchBytes = FTI_getHostBuffSize();
    fetchBytes = FTI_calculateCountDim(FTI_DataVar->eleSize, fetchBytes ,count, FTI_DataVar->rank, dimLength, &seperator);
    // hsize_t is unsigned; cast so the argument matches the conversion.
    snprintf(str, FTI_BUFS, "GPU-Device Message: I Will Fetch %llu Bytes Per Stream Request", (unsigned long long)fetchBytes);
    FTI_Print(str,FTI_DBUG);

    prefetcher.fetchSize = fetchBytes;
    prefetcher.totalBytesToFetch = FTI_DataVar->size;
    prefetcher.isDevice = FTI_DataVar->isDevicePtr;
    prefetcher.dptr = FTI_DataVar->devicePtr;
    FTI_InitPrefetcher(&prefetcher);

    if ( FTI_Try(FTI_getPrefetchedData(&prefetcher, &bytesToWrite, &basePtr), "Fetch next memory block from GPU to write to HDF5") !=  FTI_SCES){
        goto cleanup;
    }

    // The loop ends when the prefetcher hands back a NULL block.
    while( basePtr  ){
        res = FTI_WriteElements( dataspace, FTI_DataVar->type->h5datatype, dataset, count, offset, FTI_DataVar->rank , basePtr);
        if (res != FTI_SCES ) {
            snprintf(str, FTI_BUFS, "Dataset #%d could not be written", FTI_DataVar->id);
            FTI_Print(str, FTI_EROR);
            goto cleanup;
        }
        FTI_AdvanceOffset(seperator, offset,count, dimLength, FTI_DataVar->rank);

        if ( FTI_Try(FTI_getPrefetchedData(&prefetcher, &bytesToWrite, &basePtr),
                    "Fetch next memory block from GPU to write to HDF5") !=  FTI_SCES){
            goto cleanup;
        }

    }
    status = FTI_SCES;

cleanup:
    free(offset);
    free(count);
    if (H5Pclose(dcpl) < 0) {
        closeFailed = 1;
    }
    if (H5Dclose(dataset) < 0) {
        closeFailed = 1;
    }
    if (H5Sclose(dataspace) < 0) {
        closeFailed = 1;
    }
    if (status == FTI_SCES && closeFailed) {
        snprintf(str, FTI_BUFS, "Dataset #%d could not be written", FTI_DataVar->id);
        FTI_Print(str, FTI_EROR);
        status = FTI_NSCS;
    }
    return status;
#endif
}
Exemplo n.º 8
0
/*-------------------------------------------------------------------------*/
/**
 * Writes the protected variable identified by varID into the HDF5
 * checkpoint file whose handle is stored in FTI_Exec->iCPInfo.fh.
 *
 * If the variable's HDF5 datatype handle is still negative on entry, the
 * type is committed to its group after FTI_CreateComplexType has run, under
 * the structure's name or "Type<id>" when the type has no structure. On any
 * failure the children groups of the root group and the file are closed.
 *
 * @param varID     Id of the dataset to write.
 * @param FTI_Conf  Configuration (not used by this function).
 * @param FTI_Exec  Execution metadata; supplies nbVar, the file handle, the
 *                  HDF5 groups and types, and receives iCPInfo.result =
 *                  FTI_SCES on success.
 * @param FTI_Topo  Topology (not used by this function).
 * @param FTI_Ckpt  Checkpoint metadata (not used by this function).
 * @param FTI_Data  Array of FTI_Exec->nbVar protected datasets.
 * @return FTI_SCES on success; FTI_NSCS if the incremental checkpoint already
 *         failed, the datatype could not be committed or the data could not
 *         be written.
 */
int FTI_WriteHdf5Var(int varID, FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
        FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
        FTIT_dataset* FTI_Data)
{

    if ( FTI_Exec->iCPInfo.status == FTI_ICP_FAIL ) {
        return FTI_NSCS;
    }

    char str[FTI_BUFS];

    // The handle is stored as raw bytes inside the execution structure.
    hid_t file_id;
    memcpy( &file_id, FTI_Exec->iCPInfo.fh, sizeof(FTI_H5_FH) );

    FTIT_H5Group* rootGroup = FTI_Exec->H5groups[0];

    // write data into ckpt file
    int i;
    int j;
    for (i = 0; i < FTI_Exec->nbVar; i++) {
        if( FTI_Data[i].id != varID ) {
            continue;
        }

        // Remember whether the type was uncommitted before it gets created.
        int toCommit = (FTI_Data[i].type->h5datatype < 0) ? 1 : 0;

        snprintf(str, FTI_BUFS, "Calling CreateComplexType [%d] with hid_t %ld", FTI_Data[i].type->id, (long)FTI_Data[i].type->h5datatype);
        FTI_Print(str, FTI_DBUG);
        FTI_CreateComplexType(FTI_Data[i].type, FTI_Exec->FTI_Type);

        if (toCommit == 1) {
            char name[FTI_BUFS];
            if (FTI_Data[i].type->structure == NULL) {
                // this is the array of bytes with no name
                snprintf(name, FTI_BUFS, "Type%d", FTI_Data[i].type->id);
            } else {
                // snprintf always terminates the copy, unlike strncpy when
                // the source fills the whole buffer.
                snprintf(name, FTI_BUFS, "%s", FTI_Data[i].type->structure->name);
            }
            herr_t res = H5Tcommit(FTI_Data[i].type->h5group->h5groupID, name, FTI_Data[i].type->h5datatype, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
            if (res < 0) {
                snprintf(str, FTI_BUFS, "Datatype #%d could not be commited", FTI_Data[i].id);
                FTI_Print(str, FTI_EROR);
                goto fail;
            }
        }

        if ( FTI_Try(FTI_WriteHDF5Var(&FTI_Data[i]) , "Writing data to HDF5 filesystem") != FTI_SCES){
            snprintf(str, FTI_BUFS, "Dataset #%d could not be written", FTI_Data[i].id);
            FTI_Print(str, FTI_EROR);
            goto fail;
        }
    }

    FTI_Exec->iCPInfo.result = FTI_SCES;
    return FTI_SCES;

fail:
    // Shared error exit: close every child of the root group, then the file.
    for (j = 0; j < rootGroup->childrenNo; j++) {
        FTI_CloseGroup(FTI_Exec->H5groups[rootGroup->childrenID[j]], FTI_Exec->H5groups);
    }
    H5Fclose(file_id);
    return FTI_NSCS;

}