예제 #1
0
파일: Convert.cpp 프로젝트: Griffan/bamUtil
int Convert::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String outFile = "";
    String refFile = "";
    bool lshift = false;
    bool noeof = false;
    bool params = false;

    bool useBases = false;
    bool useEquals = false;
    bool useOrigSeq = false;

    bool recover = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_STRINGPARAMETER("out", &outFile)
        LONG_STRINGPARAMETER("refFile", &refFile)
        LONG_PARAMETER("lshift", &lshift)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("recover", &recover)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("SequenceConversion")
            EXCLUSIVE_PARAMETER("useBases", &useBases)
            EXCLUSIVE_PARAMETER("useEquals", &useEquals)
            EXCLUSIVE_PARAMETER("useOrigSeq", &useOrigSeq)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }
    
    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        printUsage(std::cerr);
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    if(outFile == "")
    {
        printUsage(std::cerr);
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--out is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Check to see if the ref file was specified.
    // Open the reference.
    GenomeSequence* refPtr = NULL;
    if(refFile != "")
    {
        refPtr = new GenomeSequence(refFile);
    }

    SamRecord::SequenceTranslation translation;
    if((useBases) && (refPtr != NULL))
    {
        translation = SamRecord::BASES;
    }
    else if((useEquals) && (refPtr != NULL))
    {
        translation = SamRecord::EQUAL;
    }
    else
    {
        useOrigSeq = true;
        translation = SamRecord::NONE;
    }
    
    if(params)
    {
        inputParameters.Status();
    }

    // Open the input file for reading.
    SamFile samIn;
    if(recover) samIn.setAttemptRecovery(true);
    samIn.OpenForRead(inFile);

    // Open the output file for writing.
    SamFile samOut;
    samOut.OpenForWrite(outFile);
    samOut.SetWriteSequenceTranslation(translation);
    samOut.SetReference(refPtr);

    // Read the sam header.
    SamFileHeader samHeader;
    samIn.ReadHeader(samHeader);

    // Write the sam header.
    samOut.WriteHeader(samHeader);

    SamRecord samRecord;

    // Set returnStatus to success.  It will be changed
    // to the failure reason if any of the writes fail.
    SamStatus::Status returnStatus = SamStatus::SUCCESS;

    while(1) {
        try {
            // Keep reading records until ReadRecord returns false.
            while(samIn.ReadRecord(samHeader, samRecord))
            {
                // left shift if necessary.
                if(lshift)
                {
                    samRecord.shiftIndelsLeft();
                }

                // Successfully read a record from the file, so write it.
                if(!samOut.WriteRecord(samHeader, samRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOut.GetStatusMessage());
                    returnStatus = samOut.GetStatus();
                }
            }
            break;
        } catch (std::runtime_error e) {
            std::cerr << "Caught runtime error: " << e.what() << "\n";
            if(!recover) {
                std::cerr << "Corrupted BAM file detected - consider using --recover option.\n";
                break;
            }
            std::cerr << "Attempting to resync at next good BGZF block and BAM record.\n";
            // XXX need to resync SamFile stream here
            bool rc = samIn.attemptRecoverySync(checkSignature, SIGNATURE_LENGTH);
            if(rc) {
                std::cerr << "Successful resync - some data lost.\n";
                continue;    // succeeded
            }
            std::cerr << "Failed to re-sync on data stream.\n";
            break;              // failed to resync
        }
    }

    std::cerr << std::endl << "Number of records read = " << 
        samIn.GetCurrentRecordCount() << std::endl;
    std::cerr << "Number of records written = " << 
        samOut.GetCurrentRecordCount() << std::endl;

    if(refPtr != NULL)
    {
        delete(refPtr);
    }

    // Since the reads were successful, return the status based
    // on the status of the writes.  If any failed, return
    // their failure status.
    return(returnStatus);
}
예제 #2
0
int Bam2FastQ::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    bool readName = false;
    String refFile = "";
    String firstOut = "";
    String secondOut = "";
    String unpairedOut = "";

    bool interleave = false;
    bool noeof = false;
    bool gzip = false;
    bool params = false;

    myOutBase = "";
    myNumMateFailures = 0;
    myNumPairs = 0;
    myNumUnpaired = 0;
    mySplitRG = false;
    myQField = "";
    myNumQualTagErrors = 0;
    myReverseComp = true;
    myRNPlus = false;
    myFirstRNExt = DEFAULT_FIRST_EXT;
    mySecondRNExt = DEFAULT_SECOND_EXT;
    myCompression = InputFile::DEFAULT;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_PARAMETER("readName", &readName)
        LONG_PARAMETER("splitRG", &mySplitRG)
        LONG_STRINGPARAMETER("qualField", &myQField)
        LONG_PARAMETER("merge", &interleave)
        LONG_STRINGPARAMETER("refFile", &refFile)
        LONG_STRINGPARAMETER("firstRNExt", &myFirstRNExt)
        LONG_STRINGPARAMETER("secondRNExt", &mySecondRNExt)
        LONG_PARAMETER("rnPlus", &myRNPlus)
        LONG_PARAMETER("noReverseComp", &myReverseComp)
        LONG_PARAMETER("gzip", &gzip)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("Optional OutputFile Names")
        LONG_STRINGPARAMETER("outBase", &myOutBase)
        LONG_STRINGPARAMETER("firstOut", &firstOut)
        LONG_STRINGPARAMETER("secondOut", &secondOut)
        LONG_STRINGPARAMETER("unpairedOut", &unpairedOut)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    if(gzip)
    {
        myCompression = InputFile::GZIP;
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Cannot specify both interleaved & secondOut since secondOut would be N/A.
    if(interleave && !secondOut.IsEmpty())
    {
        usage();
        inputParameters.Status();
        std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n";
        return(-1);
    }

    // Cannot specify both interleaved & secondOut since secondOut would be N/A.
    if(interleave && !secondOut.IsEmpty())
    {
        usage();
        inputParameters.Status();
        std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n";
        return(-1);
    }

    // Cannot specify both splitRG & firstOut/secondOut/unpairedOut
    // since it needs a different file for each RG.
    if(mySplitRG && (!firstOut.IsEmpty() || 
                   !secondOut.IsEmpty() || !unpairedOut.IsEmpty()))
    {
        usage();
        inputParameters.Status();
        std::cerr << "ERROR: Cannot specify --splitRG & --firstOut/--secondOut/--unpairedOut.\n";
        std::cerr << "Use --outBase instead.\n";
        return(-1);
    }
    // Cannot specify splitRG & output to stdout.
    if(mySplitRG && (myOutBase[0] == '-'))
    {
        usage();
        inputParameters.Status();
        std::cerr << "ERROR: Cannot specify --splitRG & write to stdout.\n";
        return(-1);
    }

    // Check to see if the out file was specified, if not, generate it from
    // the input filename.
    if(myOutBase == "")
    {
        // Just remove the extension from the input filename.
        int extStart = inFile.FastFindLastChar('.');
        if(extStart <= 0)
        {
            myOutBase = inFile;
        }
        else
        {
            myOutBase = inFile.Left(extStart);
        }
    }

    if(mySplitRG)
    {
        std::string fqList = myOutBase.c_str();
        fqList += ".list";
        myFqList = ifopen(fqList.c_str(), "w");
        ifprintf(myFqList, "MERGE_NAME\tFASTQ1\tFASTQ2\tRG\n");
    }

    // Check to see if the first/second/single-ended were specified and
    // if not, set them.
    myFirstFileNameExt = "_1.fastq";
    mySecondFileNameExt = "_2.fastq";
    myUnpairedFileNameExt = ".fastq";
    if(interleave)
    {
        myFirstFileNameExt = "_interleaved.fastq";
        myFirstFileNameExt = "_interleaved.fastq";
    }
    getFileName(firstOut, myFirstFileNameExt);
    getFileName(secondOut, mySecondFileNameExt);
    getFileName(unpairedOut, myUnpairedFileNameExt);

    if(params)
    {
        inputParameters.Status();
    }

    // Open the files for reading/writing.
    // Open prior to opening the output files,
    // so if there is an error, the outputs don't get created.
    SamFile samIn;
    samIn.OpenForRead(inFile, &mySamHeader);
    // Skip non-primary reads.
    samIn.SetReadFlags(0, 0x0100);

    // Open the output files if not splitting RG
    if(!mySplitRG)
    {
        myUnpairedFile = ifopen(unpairedOut, "w", myCompression);

        // Only open the first file if it is different than an already opened file.
        if(firstOut != unpairedOut)
        {
            myFirstFile = ifopen(firstOut, "w", myCompression);
        }
        else
        {
            myFirstFile = myUnpairedFile;
        }

        // If it is interleaved or the 2nd file is not a new name, set it appropriately.
        if(interleave || secondOut == firstOut)
        {
            mySecondFile = myFirstFile;
        }
        else if(secondOut == unpairedOut)
        {
            mySecondFile = myUnpairedFile;
        }
        else
        {
            mySecondFile = ifopen(secondOut, "w", myCompression);
        }
    
        if(myUnpairedFile == NULL)
        {
            std::cerr << "Failed to open " << unpairedOut
                      << " so can't convert bam2FastQ.\n";
            return(-1);
        }
        if(myFirstFile == NULL)
        {
            std::cerr << "Failed to open " << firstOut
                      << " so can't convert bam2FastQ.\n";
            return(-1);
        }
        if(mySecondFile == NULL)
        {
            std::cerr << "Failed to open " << secondOut
                      << " so can't convert bam2FastQ.\n";
            return(-1);
        }
    }

    if((readName) || (strcmp(mySamHeader.getSortOrder(), "queryname") == 0))
    {
        readName = true;
    }
    else
    {
        // defaulting to coordinate sorted.
        samIn.setSortedValidation(SamFile::COORDINATE);
    }

    // Setup the '=' translation if the reference was specified.
    if(!refFile.IsEmpty())
    {
        GenomeSequence* refPtr = new GenomeSequence(refFile);
        samIn.SetReadSequenceTranslation(SamRecord::BASES);
        samIn.SetReference(refPtr);
    }

    SamRecord* recordPtr;
    int16_t samFlag;

    SamStatus::Status returnStatus = SamStatus::SUCCESS;
    while(returnStatus == SamStatus::SUCCESS)
    {
        recordPtr = myPool.getRecord();
        if(recordPtr == NULL)
        {
            // Failed to allocate a new record.
            throw(std::runtime_error("Failed to allocate a new SAM/BAM record"));
        }
        if(!samIn.ReadRecord(mySamHeader, *recordPtr))
        {
            // Failed to read a record.
            returnStatus = samIn.GetStatus();
            continue;
        }

        // Have a record.  Check to see if it is a pair or unpaired read.
        samFlag = recordPtr->getFlag();
        if(SamFlag::isPaired(samFlag))
        {
            if(readName)
            {
                handlePairedRN(*recordPtr);
            }
            else
            {
                handlePairedCoord(*recordPtr);
            }
        }
        else
        {
            ++myNumUnpaired;
            writeFastQ(*recordPtr, myUnpairedFile,
                       myUnpairedFileNameExt);
        }
    }

    // Flush All
    cleanUpMateMap(0, true);

    if(returnStatus == SamStatus::NO_MORE_RECS)
    {
        returnStatus = SamStatus::SUCCESS;
    }

    samIn.Close();
    closeFiles();
    
    // Output the results
    std::cerr << "\nFound " << myNumPairs << " read pairs.\n";
    std::cerr << "Found " << myNumUnpaired << " unpaired reads.\n";
    if(myNumMateFailures != 0)
    {
        std::cerr << "Failed to find mates for " << myNumMateFailures
                  << " reads, so they were written as unpaired\n"
                  << "  (not included in either of the above counts).\n";
    }
    if(myNumQualTagErrors != 0)
    {
        std::cerr << myNumQualTagErrors << " records did not have tag "
                  << myQField.c_str() << " or it was invalid, so the quality field was used for those records.\n";
    }

    return(returnStatus);
}