Beispiel #1
0
int ClipOverlap::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String outFile = "";
    String storeOrig = "";
    bool readName = false;
    bool noRNValidate = false;
    bool stats = false;
    int poolSize = DEFAULT_POOL_SIZE;
    bool unmapped = false;
    bool noeof = false;
    bool params = false;
    String excludeFlags = "0xF0C";

    // TODO, cleanup legacy parameters
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_STRINGPARAMETER("out", &outFile)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_STRINGPARAMETER("storeOrig", &storeOrig)
        LONG_PARAMETER("readName", &readName)
        LONG_PARAMETER ("noRNValidate", &noRNValidate)
        LONG_PARAMETER ("stats", &stats)
        LONG_PARAMETER ("overlapsOnly", &myOverlapsOnly)
        LONG_STRINGPARAMETER ("excludeFlags", &excludeFlags)
        LONG_PARAMETER("unmapped", &unmapped)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("Coordinate Processing Optional Parameters")
        LONG_INTPARAMETER("poolSize", &poolSize)
        LONG_PARAMETER("poolSkipOverlap", &myPoolSkipOverlap)
        LONG_PHONEHOME(VERSION)
        BEGIN_LEGACY_PARAMETERS()
        LONG_PARAMETER ("clipsOnly", &myOverlapsOnly)
        LONG_PARAMETER("poolSkipClip", &myPoolSkipOverlap)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        printUsage(std::cerr);
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Check to see if the out file was specified, if not, report an error.
    if(outFile == "")
    {
        printUsage(std::cerr);
        inputParameters.Status();
        // Out file was not specified but it is mandatory.
        std::cerr << "--out is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    if((storeOrig.Length() != 0) && (storeOrig.Length() != 2))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "--storeOrig tag name must be 2 characters.\n";
        return(-1);
    }

    myOverlapHandler = new OverlapClipLowerBaseQual();
    if(myOverlapHandler == NULL)
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "Failed to allocate the overlap handler\n";
        return(-1);
    }

    if(unmapped)
    {
        myOverlapHandler->markAsUnmapped();
    }

    // Setup the overlap handler.
    myOverlapHandler->keepStats(stats);
    if(storeOrig.Length() != 0)
    {
        myOverlapHandler->storeOrigCigar(storeOrig);
    }

    myIntExcludeFlags = excludeFlags.AsInteger();

    if(params)
    {
        inputParameters.Status();
    }

    // For each step process the file.
    // Open the files & read/write the sam header.
    SamStatus::Status runStatus = SamStatus::SUCCESS;
    for(int i = 1; i <= myOverlapHandler->numSteps(); i++)
    {
        // Open the file for reading.
        mySamHeader.resetHeader();
        SamFile samIn(inFile, SamFile::READ, &mySamHeader);
        SamFile* samOutPtr = NULL;
        // Check if writing, if so, open the output file.
        if(i == myOverlapHandler->numSteps())
        {
            samOutPtr = new SamFile(outFile, SamFile::WRITE, &mySamHeader);
        }

        if(readName)
        {
            if(!noRNValidate)
            {
                samIn.setSortedValidation(SamFile::QUERY_NAME);
            }
            runStatus = handleSortedByReadName(samIn, samOutPtr);
        }
        else
        {
            // Coordinate sorted, so work with the pools.
            samIn.setSortedValidation(SamFile::COORDINATE);
            myPool.setMaxAllocatedRecs(poolSize);

            // Reset the number of failures
            myNumMateFailures = 0;
            myNumPoolFail = 0;
            myNumPoolFailNoHandle = 0;
            myNumPoolFailHandled = 0;
            myNumOutOfOrder = 0;

            // Run by coordinate
            if(samOutPtr != NULL)
            {
                // Setup the output buffer for writing.
                SamCoordOutput outputBuffer(myPool);
                outputBuffer.setOutputFile(samOutPtr, &mySamHeader);
                runStatus = handleSortedByCoord(samIn, &outputBuffer);

                // Cleanup the output buffer.
                if(!outputBuffer.flushAll())
                {
                    std::cerr << "ERROR: Failed to flush the output buffer\n";
                    runStatus = SamStatus::FAIL_IO;
                }
            }
            else
            {
                runStatus = handleSortedByCoord(samIn, NULL);
            }
        }

        if(runStatus != SamStatus::SUCCESS)
        {
            break;
        }
        // Close the input file, it will be reopened if there are 
        // multiple steps.
        samIn.Close();
        if(samOutPtr != NULL)
        {
            samOutPtr->Close();
            delete samOutPtr;
            samOutPtr = NULL;
        }
    }

    // Done processing.
    // Print Stats
    myOverlapHandler->printStats();

    if(myNumMateFailures != 0)
    {
        std::cerr << "WARNING: did not find expected overlapping mates for "
                  << myNumMateFailures << " records." << std::endl;
    }
    if(myNumPoolFail != 0)
    {
        // Had to skip clipping some records due to running out of
        // memory and not being able to wait for the mate.
        std::cerr << "WARNING: " << myNumPoolFail 
                  << " record pool failures\n";
        if(myNumPoolFailNoHandle != 0)
        {
            std::cerr << "Due to hitting the max record poolSize, skipped handling " 
                      << myNumPoolFailNoHandle << " records." << std::endl;
        }
        if(myNumPoolFailHandled != 0)
        {
            std::cerr << "Due to hitting the max record poolSize, default handled " 
                      << myNumPoolFailHandled << " records." << std::endl;
        }
        if(myNumOutOfOrder != 0)
        {
            std::cerr << "WARNING: Resulting File out of Order by " 
                      << myNumOutOfOrder << " records.\n";
        }
    }

    if(runStatus == SamStatus::SUCCESS)
    {
        if(myNumPoolFail == 0)
        {
            std::cerr << "Completed ClipOverlap Successfully.\n";
        }
        else
        {
            runStatus = SamStatus::NO_MORE_RECS;
            std::cerr << "Completed ClipOverlap with WARNINGS.\n";
        }
    }
    else
    {
        std::cerr << "Failed to complete ClipOverlap.\n";
    }
    return(runStatus);
}
Beispiel #2
0
int Dedup_LowMem::execute(int argc, char** argv)
{
    /* --------------------------------
     * process the arguments
     * -------------------------------*/
    String inFile, outFile, logFile;
    myDoRecab = false;
    bool removeFlag = false;
    bool verboseFlag = false;
    myForceFlag = false;
    myNumMissingMate = 0;
    myMinQual = DEFAULT_MIN_QUAL;
    String excludeFlags = "0xB04";
    uint16_t intExcludeFlags = 0;
    bool noeof = false;
    bool params = false;

    LongParamContainer parameters;
    parameters.addGroup("Required Parameters");
    parameters.addString("in", &inFile);
    parameters.addString("out", &outFile);
    parameters.addGroup("Optional Parameters");
    parameters.addInt("minQual", & myMinQual);
    parameters.addString("log", &logFile);
    parameters.addBool("oneChrom", &myOneChrom);
    parameters.addBool("recab", &myDoRecab);
    parameters.addBool("rmDups", &removeFlag);
    parameters.addBool("force", &myForceFlag);
    parameters.addString("excludeFlags", &excludeFlags);
    parameters.addBool("verbose", &verboseFlag);
    parameters.addBool("noeof", &noeof);
    parameters.addBool("params", &params);
    parameters.addPhoneHome(VERSION);
    myRecab.addRecabSpecificParameters(parameters);

    ParameterList inputParameters;
    inputParameters.Add(new LongParameters ("Input Parameters",
                                            parameters.getLongParameterList()));

    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    if(inFile.IsEmpty())
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "Specify an input file" << std::endl;
        return EXIT_FAILURE;
    }

    if(outFile.IsEmpty())
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "Specify an output file" << std::endl;
        return EXIT_FAILURE;
    }

    intExcludeFlags = excludeFlags.AsInteger();

    if(myForceFlag && SamFlag::isDuplicate(intExcludeFlags))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "Cannot specify --force and Duplicate in the excludeFlags.  Since --force indicates to override"
                  << " previous duplicate setting and the excludeFlags says to skip those, you can't do both.\n";
        return EXIT_FAILURE;
    }

    if(!SamFlag::isSecondary(intExcludeFlags))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "ERROR: Secondary reads must be excluded, edit --excludeFlags to include 0x0100\n";
        return EXIT_FAILURE;
    }

    if(!(intExcludeFlags & SamFlag::SUPPLEMENTARY_ALIGNMENT))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "ERROR: Supplementary reads must be excluded, edit --excludeFlags to include 0x0800\n";
        return EXIT_FAILURE;
    }

    if(logFile.IsEmpty())
    {
        logFile = outFile + ".log";
    }

    if(myDoRecab)
    {
        int status = myRecab.processRecabParam();
        if(status != 0)
        {
            inputParameters.Status();
            return(status);
        }
    }

    if(params)
    {
        inputParameters.Status();
    }

    Logger::gLogger = new Logger(logFile.c_str(), verboseFlag);

    /* -------------------------------------------------------------------
     * The arguments are processed.  Prepare the input BAM file,
     * instantiate dedup_LowMem, and construct the read group library map
     * ------------------------------------------------------------------*/

    SamFile samIn;

    samIn.OpenForRead(inFile.c_str());
    // If the file isn't sorted it will throw an exception.
    samIn.setSortedValidation(SamFile::COORDINATE);

    SamFileHeader header;
    samIn.ReadHeader(header);

    buildReadGroupLibraryMap(header);

    lastReference = -1;
    lastCoordinate = -1;

    // for keeping some basic statistics
    uint32_t recordCount = 0;
    uint32_t pairedCount = 0;
    uint32_t properPairCount = 0;
    uint32_t unmappedCount = 0;
    uint32_t reverseCount = 0;
    uint32_t qualCheckFailCount = 0;
    uint32_t secondaryCount = 0;
    uint32_t supplementaryCount = 0;
    uint32_t excludedCount = 0;

    // Now we start reading records
    SamRecord* recordPtr;
    SamStatus::Status returnStatus = SamStatus::SUCCESS;
    while(returnStatus == SamStatus::SUCCESS)
    {
        recordPtr = mySamPool.getRecord();
        if(recordPtr == NULL)
        {
            std::cerr << "Failed to allocate enough records\n";
            return(-1);
        }
        if(!samIn.ReadRecord(header, *recordPtr))
        {
            returnStatus = samIn.GetStatus();
            continue;
        }
        // Take note of properties of this record
        int flag = recordPtr->getFlag();
        if(SamFlag::isPaired(flag))     ++pairedCount;
        if(SamFlag::isProperPair(flag)) ++properPairCount;
        if(SamFlag::isReverse(flag))    ++reverseCount;
        if(SamFlag::isQCFailure(flag))  ++qualCheckFailCount;
        if(SamFlag::isSecondary(flag))  ++secondaryCount;
        if(flag & SamFlag::SUPPLEMENTARY_ALIGNMENT)  ++supplementaryCount;
        if(!SamFlag::isMapped(flag))    ++unmappedCount;

        // put the record in the appropriate maps:
        //   single reads go in myFragmentMap
        //   paired reads go in myPairedMap
        recordCount = samIn.GetCurrentRecordCount();

        // if we have moved to a new position, look back at previous reads for duplicates
        if (hasPositionChanged(*recordPtr))
        {
            cleanupPriorReads(recordPtr);
        }

        // Determine if this read should be checked for duplicates.
        if((!SamFlag::isMapped(flag)) || ((flag & intExcludeFlags) != 0))
        {
            ++excludedCount;

            // No deduping done on this record, but still build the recab table.
            if(myDoRecab)
            {
                myRecab.processReadBuildTable(*recordPtr);
            }
            // Nothing more to do with this record, so
            // release the pointer.
            mySamPool.releaseRecord(recordPtr);
        }
        else
        {
            if(SamFlag::isDuplicate(flag) && !myForceFlag)
            {
                // Error: Marked duplicates, and duplicates aren't excluded.
                Logger::gLogger->error("There are records already duplicate marked.");
                Logger::gLogger->error("Use -f to clear the duplicate flag and start the dedup_LowMem procedure over");
            }

            checkDups(*recordPtr, recordCount);
            mySamPool.releaseRecord(recordPtr);
        }
        // let the user know we're not napping
        if (verboseFlag && (recordCount % 100000 == 0))
        {
            Logger::gLogger->writeLog("recordCount=%u singleKeyMap=%u pairedKeyMap=%u, dictSize=%u",
                                      recordCount, myFragmentMap.size(),
                                      myPairedMap.size(),
                                      myMateMap.size());
        }
    }

    // we're finished reading record so clean up the duplicate search and
    //  close the input file
    cleanupPriorReads(NULL);
    samIn.Close();

    // print some statistics
    Logger::gLogger->writeLog("--------------------------------------------------------------------------");
    Logger::gLogger->writeLog("SUMMARY STATISTICS OF THE READS");
    Logger::gLogger->writeLog("Total number of reads: %u",recordCount);
    Logger::gLogger->writeLog("Total number of paired-end reads: %u",
                              pairedCount);
    Logger::gLogger->writeLog("Total number of properly paired reads: %u",
                              properPairCount);
    Logger::gLogger->writeLog("Total number of unmapped reads: %u",
                              unmappedCount);
    Logger::gLogger->writeLog("Total number of reverse strand mapped reads: %u",
                              reverseCount);
    Logger::gLogger->writeLog("Total number of QC-failed reads: %u",
                              qualCheckFailCount);
    Logger::gLogger->writeLog("Total number of secondary reads: %u",
                              secondaryCount);
    Logger::gLogger->writeLog("Total number of supplementary reads: %u",
                              supplementaryCount);
    Logger::gLogger->writeLog("Size of singleKeyMap (must be zero): %u",
                              myFragmentMap.size());
    Logger::gLogger->writeLog("Size of pairedKeyMap (must be zero): %u",
                              myPairedMap.size());
    Logger::gLogger->writeLog("Total number of missing mates: %u",
                              myNumMissingMate);
    Logger::gLogger->writeLog("Total number of reads excluded from duplicate checking: %u",
                              excludedCount);
    Logger::gLogger->writeLog("--------------------------------------------------------------------------");
    Logger::gLogger->writeLog("Sorting the indices of %d duplicated records",
                              myDupList.size());

    // sort the indices of duplicate records
    std::sort(myDupList.begin(), myDupList.end(),
              std::less<uint32_t> ());

    // get ready to write the output file by making a second pass
    // through the input file
    samIn.OpenForRead(inFile.c_str());
    samIn.ReadHeader(header);

    SamFile samOut;
    samOut.OpenForWrite(outFile.c_str());
    samOut.WriteHeader(header);

    // If we are recalibrating, output the model information.
    if(myDoRecab)
    {
        myRecab.modelFitPrediction(outFile);
    }

    // an iterator to run through the duplicate indices
    int currentDupIndex = 0;
    bool moreDups = !myDupList.empty();

    // let the user know what we're doing
    Logger::gLogger->writeLog("\nWriting %s", outFile.c_str());

    // count the duplicate records as a check
    uint32_t singleDuplicates(0), pairedDuplicates(0);

    // start reading records and writing them out
    SamRecord record;
    while(samIn.ReadRecord(header, record))
    {
        uint32_t currentIndex = samIn.GetCurrentRecordCount();

        bool foundDup = moreDups &&
                        (currentIndex == myDupList[currentDupIndex]);

        // modify the duplicate flag and write out the record,
        // if it's appropriate
        int flag = record.getFlag();
        if (foundDup)
        {
            // this record is a duplicate, so mark it.
            record.setFlag( flag | 0x400 );
            currentDupIndex++;
            // increment duplicate counters to verify we found them all
            if ( ( ( flag & 0x0001 ) == 0 ) || ( flag & 0x0008 ) )
            {   // unpaired or mate unmapped
                singleDuplicates++;
            }
            else
            {
                pairedDuplicates++;
            }
            // recalibrate if necessary.
            if(myDoRecab)
            {
                myRecab.processReadApplyTable(record);
            }

            // write the record if we are not removing duplicates
            if (!removeFlag ) samOut.WriteRecord(header, record);
        }
        else
        {
            if(myForceFlag)
            {
                // this is not a duplicate we've identified but we want to
                // remove any duplicate marking
                record.setFlag( flag & 0xfffffbff ); // unmark duplicate
            }
            // Not a duplicate, so recalibrate if necessary.
            if(myDoRecab)
            {
                myRecab.processReadApplyTable(record);
            }
            samOut.WriteRecord(header, record);
        }

        // Let the user know we're still here
        if (verboseFlag && (currentIndex % 100000 == 0)) {
            Logger::gLogger->writeLog("recordCount=%u", currentIndex);
        }
    }

    // We're done.  Close the files and print triumphant messages.
    samIn.Close();
    samOut.Close();

    Logger::gLogger->writeLog("Successfully %s %u unpaired and %u paired duplicate reads",
                              removeFlag ? "removed" : "marked" ,
                              singleDuplicates,
                              pairedDuplicates/2);
    Logger::gLogger->writeLog("\nDedup_LowMem complete!");
    return 0;
}
Beispiel #3
0
void testAsInteger()
{
    // Test AsInteger with ints & negative ints.
    String intString = "123";
    String negIntString = "-123";
    assert(intString.AsInteger() == 123);
    assert(negIntString.AsInteger() == -123);

    // Run the same tests with AsInteger that returns a bool and takes
    // in a long to set.
    long retValue;
    assert(intString.AsInteger(retValue));
    assert(retValue == 123);
    assert(negIntString.AsInteger(retValue));
    assert(retValue == -123);


    // Strings that are not integers
    // For AsInteger, it returns just the starting integer portion.
    // For AsInteger that returns a bool and a long set, it returns false
    // and sets the long to the starting int.
    String nonIntString = "abd";
    assert(nonIntString.AsInteger() == 0);
    assert(!nonIntString.AsInteger(retValue));

    nonIntString = "12ab33";
    assert(nonIntString.AsInteger() == 12);
    assert(!nonIntString.AsInteger(retValue));
    assert(retValue == 12);
    nonIntString = "as12ab3a4sd";
    assert(nonIntString.AsInteger() == 0);
    assert(!nonIntString.AsInteger(retValue));
    assert(retValue == 0);
    // Negatives are only recognized as the first characer.
    nonIntString = "-12ab3a4sd";
    assert(nonIntString.AsInteger() == -12);
    assert(!nonIntString.AsInteger(retValue));
    assert(retValue == -12);
    nonIntString = "-as12ab3a4sd";
    assert(nonIntString.AsInteger() == 0);
    assert(!nonIntString.AsInteger(retValue));
    assert(retValue == 0);
    nonIntString = "as-12ab3a4sd";
    assert(nonIntString.AsInteger() == 0);
    assert(!nonIntString.AsInteger(retValue));
    assert(retValue == 0);
    nonIntString = "as12-ab3a4sd";
    assert(nonIntString.AsInteger() == 0);
    assert(!nonIntString.AsInteger(retValue));
    assert(retValue == 0);
}