Example #1
0
// Dump the specified Bam Index file.
int DumpIndex::execute(int argc, char **argv)
{
    // Extract command line arguments.
    static const int UNSPECIFIED_INT = -1;
    String indexFile = "";
    int refID = UNSPECIFIED_INT;
    bool summary = false;
    bool params = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
        LONG_INTPARAMETER("refID", &refID)
        LONG_PARAMETER("summary", &summary)
        LONG_PARAMETER("params", &params)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // Check to see if the index file was specified, if not, report an error.
    if(indexFile == "")
    {
        usage();
        inputParameters.Status();
        // mandatory argument was not specified.
        std::cerr << "Missing mandatory argument: --bamIndex" << std::endl;
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Read the index.
    BamIndex bamIndex;
    SamStatus status;
    status = bamIndex.readIndex(indexFile);

    if(status != SamStatus::SUCCESS)
    {
        // Failed to read the index, return.
        fprintf(stderr, "%s\n", status.getStatusMessage());
        return(status.getStatus());
    }

    // Print the index file.
    bamIndex.printIndex(refID, summary);

    return(status.getStatus());
}
Example #2
0
int Bam2FastQ::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    bool readName = false;
    String refFile = "";
    String firstOut = "";
    String secondOut = "";
    String unpairedOut = "";

    bool interleave = false;
    bool noeof = false;
    bool gzip = false;
    bool params = false;

    myOutBase = "";
    myNumMateFailures = 0;
    myNumPairs = 0;
    myNumUnpaired = 0;
    mySplitRG = false;
    myQField = "";
    myNumQualTagErrors = 0;
    myReverseComp = true;
    myRNPlus = false;
    myFirstRNExt = DEFAULT_FIRST_EXT;
    mySecondRNExt = DEFAULT_SECOND_EXT;
    myCompression = InputFile::DEFAULT;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_PARAMETER("readName", &readName)
        LONG_PARAMETER("splitRG", &mySplitRG)
        LONG_STRINGPARAMETER("qualField", &myQField)
        LONG_PARAMETER("merge", &interleave)
        LONG_STRINGPARAMETER("refFile", &refFile)
        LONG_STRINGPARAMETER("firstRNExt", &myFirstRNExt)
        LONG_STRINGPARAMETER("secondRNExt", &mySecondRNExt)
        LONG_PARAMETER("rnPlus", &myRNPlus)
        LONG_PARAMETER("noReverseComp", &myReverseComp)
        LONG_PARAMETER("gzip", &gzip)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("Optional OutputFile Names")
        LONG_STRINGPARAMETER("outBase", &myOutBase)
        LONG_STRINGPARAMETER("firstOut", &firstOut)
        LONG_STRINGPARAMETER("secondOut", &secondOut)
        LONG_STRINGPARAMETER("unpairedOut", &unpairedOut)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    if(gzip)
    {
        myCompression = InputFile::GZIP;
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Cannot specify both interleaved & secondOut since secondOut would be N/A.
    if(interleave && !secondOut.IsEmpty())
    {
        usage();
        inputParameters.Status();
        std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n";
        return(-1);
    }

    // Cannot specify both interleaved & secondOut since secondOut would be N/A.
    if(interleave && !secondOut.IsEmpty())
    {
        usage();
        inputParameters.Status();
        std::cerr << "ERROR: Cannot specify --merge & --secondOut.\n";
        return(-1);
    }

    // Cannot specify both splitRG & firstOut/secondOut/unpairedOut
    // since it needs a different file for each RG.
    if(mySplitRG && (!firstOut.IsEmpty() || 
                   !secondOut.IsEmpty() || !unpairedOut.IsEmpty()))
    {
        usage();
        inputParameters.Status();
        std::cerr << "ERROR: Cannot specify --splitRG & --firstOut/--secondOut/--unpairedOut.\n";
        std::cerr << "Use --outBase instead.\n";
        return(-1);
    }
    // Cannot specify splitRG & output to stdout.
    if(mySplitRG && (myOutBase[0] == '-'))
    {
        usage();
        inputParameters.Status();
        std::cerr << "ERROR: Cannot specify --splitRG & write to stdout.\n";
        return(-1);
    }

    // Check to see if the out file was specified, if not, generate it from
    // the input filename.
    if(myOutBase == "")
    {
        // Just remove the extension from the input filename.
        int extStart = inFile.FastFindLastChar('.');
        if(extStart <= 0)
        {
            myOutBase = inFile;
        }
        else
        {
            myOutBase = inFile.Left(extStart);
        }
    }

    if(mySplitRG)
    {
        std::string fqList = myOutBase.c_str();
        fqList += ".list";
        myFqList = ifopen(fqList.c_str(), "w");
        ifprintf(myFqList, "MERGE_NAME\tFASTQ1\tFASTQ2\tRG\n");
    }

    // Check to see if the first/second/single-ended were specified and
    // if not, set them.
    myFirstFileNameExt = "_1.fastq";
    mySecondFileNameExt = "_2.fastq";
    myUnpairedFileNameExt = ".fastq";
    if(interleave)
    {
        myFirstFileNameExt = "_interleaved.fastq";
        myFirstFileNameExt = "_interleaved.fastq";
    }
    getFileName(firstOut, myFirstFileNameExt);
    getFileName(secondOut, mySecondFileNameExt);
    getFileName(unpairedOut, myUnpairedFileNameExt);

    if(params)
    {
        inputParameters.Status();
    }

    // Open the files for reading/writing.
    // Open prior to opening the output files,
    // so if there is an error, the outputs don't get created.
    SamFile samIn;
    samIn.OpenForRead(inFile, &mySamHeader);
    // Skip non-primary reads.
    samIn.SetReadFlags(0, 0x0100);

    // Open the output files if not splitting RG
    if(!mySplitRG)
    {
        myUnpairedFile = ifopen(unpairedOut, "w", myCompression);

        // Only open the first file if it is different than an already opened file.
        if(firstOut != unpairedOut)
        {
            myFirstFile = ifopen(firstOut, "w", myCompression);
        }
        else
        {
            myFirstFile = myUnpairedFile;
        }

        // If it is interleaved or the 2nd file is not a new name, set it appropriately.
        if(interleave || secondOut == firstOut)
        {
            mySecondFile = myFirstFile;
        }
        else if(secondOut == unpairedOut)
        {
            mySecondFile = myUnpairedFile;
        }
        else
        {
            mySecondFile = ifopen(secondOut, "w", myCompression);
        }
    
        if(myUnpairedFile == NULL)
        {
            std::cerr << "Failed to open " << unpairedOut
                      << " so can't convert bam2FastQ.\n";
            return(-1);
        }
        if(myFirstFile == NULL)
        {
            std::cerr << "Failed to open " << firstOut
                      << " so can't convert bam2FastQ.\n";
            return(-1);
        }
        if(mySecondFile == NULL)
        {
            std::cerr << "Failed to open " << secondOut
                      << " so can't convert bam2FastQ.\n";
            return(-1);
        }
    }

    if((readName) || (strcmp(mySamHeader.getSortOrder(), "queryname") == 0))
    {
        readName = true;
    }
    else
    {
        // defaulting to coordinate sorted.
        samIn.setSortedValidation(SamFile::COORDINATE);
    }

    // Setup the '=' translation if the reference was specified.
    if(!refFile.IsEmpty())
    {
        GenomeSequence* refPtr = new GenomeSequence(refFile);
        samIn.SetReadSequenceTranslation(SamRecord::BASES);
        samIn.SetReference(refPtr);
    }

    SamRecord* recordPtr;
    int16_t samFlag;

    SamStatus::Status returnStatus = SamStatus::SUCCESS;
    while(returnStatus == SamStatus::SUCCESS)
    {
        recordPtr = myPool.getRecord();
        if(recordPtr == NULL)
        {
            // Failed to allocate a new record.
            throw(std::runtime_error("Failed to allocate a new SAM/BAM record"));
        }
        if(!samIn.ReadRecord(mySamHeader, *recordPtr))
        {
            // Failed to read a record.
            returnStatus = samIn.GetStatus();
            continue;
        }

        // Have a record.  Check to see if it is a pair or unpaired read.
        samFlag = recordPtr->getFlag();
        if(SamFlag::isPaired(samFlag))
        {
            if(readName)
            {
                handlePairedRN(*recordPtr);
            }
            else
            {
                handlePairedCoord(*recordPtr);
            }
        }
        else
        {
            ++myNumUnpaired;
            writeFastQ(*recordPtr, myUnpairedFile,
                       myUnpairedFileNameExt);
        }
    }

    // Flush All
    cleanUpMateMap(0, true);

    if(returnStatus == SamStatus::NO_MORE_RECS)
    {
        returnStatus = SamStatus::SUCCESS;
    }

    samIn.Close();
    closeFiles();
    
    // Output the results
    std::cerr << "\nFound " << myNumPairs << " read pairs.\n";
    std::cerr << "Found " << myNumUnpaired << " unpaired reads.\n";
    if(myNumMateFailures != 0)
    {
        std::cerr << "Failed to find mates for " << myNumMateFailures
                  << " reads, so they were written as unpaired\n"
                  << "  (not included in either of the above counts).\n";
    }
    if(myNumQualTagErrors != 0)
    {
        std::cerr << myNumQualTagErrors << " records did not have tag "
                  << myQField.c_str() << " or it was invalid, so the quality field was used for those records.\n";
    }

    return(returnStatus);
}
Example #3
0
// Run the clipOverlap command.
//
// Parses the command line (parameters start at argv[2]), configures the
// overlap handler, then processes the input file once per handler step.
// The output file is only opened on the final step.
//
// Returns -1 on an argument error.  Otherwise returns the processing
// status: SUCCESS when everything completed cleanly, NO_MORE_RECS when
// processing completed but there were record pool failures, or the failing
// status if a step did not complete.
int ClipOverlap::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String outFile = "";
    String storeOrig = "";
    bool readName = false;
    bool noRNValidate = false;
    bool stats = false;
    int poolSize = DEFAULT_POOL_SIZE;
    bool unmapped = false;
    bool noeof = false;
    bool params = false;
    String excludeFlags = "0xF0C";

    // TODO, cleanup legacy parameters
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_STRINGPARAMETER("out", &outFile)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_STRINGPARAMETER("storeOrig", &storeOrig)
        LONG_PARAMETER("readName", &readName)
        LONG_PARAMETER ("noRNValidate", &noRNValidate)
        LONG_PARAMETER ("stats", &stats)
        LONG_PARAMETER ("overlapsOnly", &myOverlapsOnly)
        LONG_STRINGPARAMETER ("excludeFlags", &excludeFlags)
        LONG_PARAMETER("unmapped", &unmapped)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("Coordinate Processing Optional Parameters")
        LONG_INTPARAMETER("poolSize", &poolSize)
        LONG_PARAMETER("poolSkipOverlap", &myPoolSkipOverlap)
        LONG_PHONEHOME(VERSION)
        BEGIN_LEGACY_PARAMETERS()
        LONG_PARAMETER ("clipsOnly", &myOverlapsOnly)
        LONG_PARAMETER("poolSkipClip", &myPoolSkipOverlap)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        printUsage(std::cerr);
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Check to see if the out file was specified, if not, report an error.
    if(outFile == "")
    {
        printUsage(std::cerr);
        inputParameters.Status();
        // Out file was not specified but it is mandatory.
        std::cerr << "--out is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // If given, the tag used to store the original cigar must be 2 characters.
    if((storeOrig.Length() != 0) && (storeOrig.Length() != 2))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "--storeOrig tag name must be 2 characters.\n";
        return(-1);
    }

    // A plain new-expression throws on allocation failure rather than
    // returning NULL, so no NULL check is needed here.
    myOverlapHandler = new OverlapClipLowerBaseQual();

    if(unmapped)
    {
        myOverlapHandler->markAsUnmapped();
    }

    // Setup the overlap handler.
    myOverlapHandler->keepStats(stats);
    if(storeOrig.Length() != 0)
    {
        myOverlapHandler->storeOrigCigar(storeOrig);
    }

    myIntExcludeFlags = excludeFlags.AsInteger();

    if(params)
    {
        inputParameters.Status();
    }

    // For each step process the file.
    // Open the files & read/write the sam header.
    SamStatus::Status runStatus = SamStatus::SUCCESS;
    for(int i = 1; i <= myOverlapHandler->numSteps(); i++)
    {
        // Open the file for reading.
        mySamHeader.resetHeader();
        SamFile samIn(inFile, SamFile::READ, &mySamHeader);
        SamFile* samOutPtr = NULL;
        // Only the last step writes, so only then open the output file.
        if(i == myOverlapHandler->numSteps())
        {
            samOutPtr = new SamFile(outFile, SamFile::WRITE, &mySamHeader);
        }

        if(readName)
        {
            if(!noRNValidate)
            {
                samIn.setSortedValidation(SamFile::QUERY_NAME);
            }
            runStatus = handleSortedByReadName(samIn, samOutPtr);
        }
        else
        {
            // Coordinate sorted, so work with the pools.
            samIn.setSortedValidation(SamFile::COORDINATE);
            myPool.setMaxAllocatedRecs(poolSize);

            // Reset the number of failures
            myNumMateFailures = 0;
            myNumPoolFail = 0;
            myNumPoolFailNoHandle = 0;
            myNumPoolFailHandled = 0;
            myNumOutOfOrder = 0;

            // Run by coordinate
            if(samOutPtr != NULL)
            {
                // Setup the output buffer for writing.
                SamCoordOutput outputBuffer(myPool);
                outputBuffer.setOutputFile(samOutPtr, &mySamHeader);
                runStatus = handleSortedByCoord(samIn, &outputBuffer);

                // Cleanup the output buffer.
                if(!outputBuffer.flushAll())
                {
                    std::cerr << "ERROR: Failed to flush the output buffer\n";
                    runStatus = SamStatus::FAIL_IO;
                }
            }
            else
            {
                runStatus = handleSortedByCoord(samIn, NULL);
            }
        }

        // Close the input file, it will be reopened if there are 
        // multiple steps.  Release the output file here as well, before
        // checking the status, so it is closed and freed even when this
        // step failed.
        samIn.Close();
        if(samOutPtr != NULL)
        {
            samOutPtr->Close();
            delete samOutPtr;
            samOutPtr = NULL;
        }

        if(runStatus != SamStatus::SUCCESS)
        {
            // This step failed, so skip any remaining steps.
            break;
        }
    }

    // Done processing.
    // Print Stats
    myOverlapHandler->printStats();

    if(myNumMateFailures != 0)
    {
        std::cerr << "WARNING: did not find expected overlapping mates for "
                  << myNumMateFailures << " records." << std::endl;
    }
    if(myNumPoolFail != 0)
    {
        // Had to skip clipping some records due to running out of
        // memory and not being able to wait for the mate.
        std::cerr << "WARNING: " << myNumPoolFail 
                  << " record pool failures\n";
        if(myNumPoolFailNoHandle != 0)
        {
            std::cerr << "Due to hitting the max record poolSize, skipped handling " 
                      << myNumPoolFailNoHandle << " records." << std::endl;
        }
        if(myNumPoolFailHandled != 0)
        {
            std::cerr << "Due to hitting the max record poolSize, default handled " 
                      << myNumPoolFailHandled << " records." << std::endl;
        }
        if(myNumOutOfOrder != 0)
        {
            std::cerr << "WARNING: Resulting File out of Order by " 
                      << myNumOutOfOrder << " records.\n";
        }
    }

    if(runStatus == SamStatus::SUCCESS)
    {
        if(myNumPoolFail == 0)
        {
            std::cerr << "Completed ClipOverlap Successfully.\n";
        }
        else
        {
            // Completed, but report a non-SUCCESS status because of the
            // pool failures.
            runStatus = SamStatus::NO_MORE_RECS;
            std::cerr << "Completed ClipOverlap with WARNINGS.\n";
        }
    }
    else
    {
        std::cerr << "Failed to complete ClipOverlap.\n";
    }
    return(runStatus);
}
Example #4
0
// Split a VCF file into one output VCF file per chromosome.
//
// Output files are named "<obase>.chr<CHROM>.vcf" (the "chr" prefix is only
// added when the chromosome name does not already start with it), with
// ".gz" appended unless --uncompress was given.
//
// Returns -1 if --in or --obase is missing, otherwise 0.
int VcfSplit::execute(int argc, char **argv)
{
    String refFile = "";
    String inputVcf = "";
    String outputVcfBase = "";
    String refName = "";
    bool uncompress = false;
    bool params = false;
    bool noeof = false;
    
    // Read in the parameters.    
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inputVcf)
        LONG_STRINGPARAMETER("obase", &outputVcfBase)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_PARAMETER("uncompress", &uncompress)
        LONG_STRINGPARAMETER("refName", &refName)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    inputParameters.Read(argc-1, &(argv[1]));
    
    // Check that all files were specified.
    if(inputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in\", a required parameter.\n\n";
        return(-1);
    }
    if(outputVcfBase == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--obase\", a required parameter.\n\n";
        return(-1);
    }
    // The chromosome-specific part of each name follows "<obase>.".
    outputVcfBase += ".";

    if(params)
    {
        inputParameters.Status();
    }

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    VcfFileReader inFile;
    // One writer per chromosome name seen so far; the writers are owned by
    // this function and released below.
    std::map<std::string, VcfFileWriter*> outFiles;
    VcfHeader header;
    
    // Open the file.
    inFile.open(inputVcf, header);

    if(refName != "")
    {
        inFile.setReadSection(refName.c_str());
    }

    VcfRecord record;
    int numRecords = 0;

    std::string prevChr = "";
    std::string chr = "";
    VcfFileWriter* outFilePtr = 0;
    std::string outName = "";
    while(inFile.readRecord(record))
    {
        ++numRecords;

        chr = record.getChromStr();

        // Only look up the writer when the chromosome changes; consecutive
        // records on the same chromosome reuse the current writer.
        if((outFilePtr == 0) || (chr != prevChr))
        {
            outFilePtr = outFiles[chr];
            if(outFilePtr == 0)
            {
                // chr not in outFile list, so create & open its writer.
                outFilePtr = new VcfFileWriter();
                outFiles[chr] = outFilePtr;
                outName = outputVcfBase.c_str();
                if(chr.substr(0,3) != "chr")
                {
                    outName += "chr";
                }
                outName += chr + ".vcf";
                if(uncompress)
                {
                    outFilePtr->open(outName.c_str(), header, InputFile::DEFAULT);
                }
                else
                {
                    outName += ".gz";
                    outFilePtr->open(outName.c_str(), header);
                }
            }
            // Remember which chromosome the current writer belongs to.
            prevChr = chr;
        }
        outFilePtr->writeRecord(record);
    }
 
    inFile.close();   

    // Close and free every writer that was created.
    for (std::map<std::string,VcfFileWriter*>::iterator it = outFiles.begin();
         it != outFiles.end(); ++it)
    {
        if(it->second != 0)
        {
            it->second->close();
            delete it->second;
            it->second = 0;
        }
    }
  

    std::cerr << "NumRecords: " << numRecords << "\n";
    return(0);
}
Example #5
0
int VcfMac::execute(int argc, char **argv)
{
    String inputVcf = "";
    int minAC = -1;
    String sampleSubset = "";
    String filterList = "";
    bool params = false;

    IntervalTree<int> regions;
    std::vector<int> intersection;
    
    // Read in the parameters.    
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inputVcf)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_STRINGPARAMETER("sampleSubset", &sampleSubset)
        LONG_INTPARAMETER("minAC", &minAC)
        LONG_STRINGPARAMETER("filterList", &filterList)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    inputParameters.Read(argc-1, &(argv[1]));
    
    // Check that all files were specified.
    if(inputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in\", a required parameter.\n\n";
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the two input files.
    VcfFileReader inFile;
    VcfHeader header;
    VcfRecord record;

    // Open the file
    if(sampleSubset.IsEmpty())
    {
        inFile.open(inputVcf, header);        
    }
    else
    {
        inFile.open(inputVcf, header, sampleSubset, NULL, NULL);
    }
    
    // Add the discard rule for minor allele count.
    if(minAC >= 0)
    {
        inFile.addDiscardMinMinorAlleleCount(minAC, NULL);
    }
    
    if(!filterList.IsEmpty())
    {
        // Open the filter list.
        IFILE regionFile = ifopen(filterList, "r");
        String regionLine;
        StringArray regionColumn;
        int start;
        int end;
        int intervalVal = 1;
        if(regionFile == NULL)
        {
            std::cerr << "Failed to open " << filterList 
                      << ", so keeping all positions\n";
            filterList.Clear();
        }
        else
        {
            while( regionFile->isOpen() && !regionFile->ifeof())
            {
                // Read the next interval
                regionLine.Clear();
                regionLine.ReadLine(regionFile);
                if(regionLine.IsEmpty())
                {
                    // Nothing on this line, continue to the next.
                    continue;
                }
                regionColumn.ReplaceColumns(regionLine, ' ');
                if(regionColumn.Length() != 2)
                {
                    std::cerr << "Improperly formatted region line: " 
                              << regionLine << "; skipping to the next line.\n";
                    continue;
                }
                // Convert the columns to integers.
                if(!regionColumn[0].AsInteger(start))
                {
                    // The start position (1st column) is not an integer.
                    std::cerr << "Improperly formatted region line, start position "
                              << "(1st column) is not an integer: "
                              << regionColumn[0]
                              << "; Skipping to the next line.\n";
                    continue;
                }
                if(!regionColumn[1].AsInteger(end))
                {
                    // The start position (1st column) is not an integer.
                    std::cerr << "Improperly formatted region line, end position "
                              << "(2nd column) is not an integer: "
                              << regionColumn[1]
                              << "; Skipping to the next line.\n";
                    continue;
                }
                // Add 1-based inclusive intervals.
                regions.add(start,end, intervalVal);
            }
        }
    }


    int numReadRecords = 0;

    while( inFile.readRecord(record))
    {
        if(!filterList.IsEmpty())
        {
            // Check if the region should be kept.
            intersection.clear();
            regions.get_intersecting_intervals(record.get1BasedPosition(), intersection);
            
            if(intersection.empty())
            {
                // not in the interval, so continue to the next record.
                continue;
            }
        }

        ++numReadRecords;

        // Loop through the number of possible alternates.
        unsigned int numAlts = record.getNumAlts();
        int minAlleleCount = -1;
        int curAlleleCount = 0;
        int totalAlleleCount = 0;
        for(unsigned int i = 0; i <= numAlts; i++)
        {
            curAlleleCount = record.getAlleleCount(i);
            if((minAlleleCount == -1) ||
               (curAlleleCount < minAlleleCount))
            {
                minAlleleCount = curAlleleCount;
            }
            totalAlleleCount += curAlleleCount;
        }
        if(totalAlleleCount != 0)
        {
            double maf = (double)minAlleleCount/totalAlleleCount;
            std::cout << record.getIDStr()
                      << "\t" << minAlleleCount
                      << "\t" << maf << "\n";
        }
    }
    
    inFile.close();

    //    std::cerr << "\n\t# Records: " << numReadRecords << "\n";

    // return success.
    return(0);
}
Example #6
0
int Dedup_LowMem::execute(int argc, char** argv)
{
    /* --------------------------------
     * process the arguments
     * -------------------------------*/
    String inFile, outFile, logFile;
    myDoRecab = false;
    bool removeFlag = false;
    bool verboseFlag = false;
    myForceFlag = false;
    myNumMissingMate = 0;
    myMinQual = DEFAULT_MIN_QUAL;
    String excludeFlags = "0xB04";
    uint16_t intExcludeFlags = 0;
    bool noeof = false;
    bool params = false;

    LongParamContainer parameters;
    parameters.addGroup("Required Parameters");
    parameters.addString("in", &inFile);
    parameters.addString("out", &outFile);
    parameters.addGroup("Optional Parameters");
    parameters.addInt("minQual", & myMinQual);
    parameters.addString("log", &logFile);
    parameters.addBool("oneChrom", &myOneChrom);
    parameters.addBool("recab", &myDoRecab);
    parameters.addBool("rmDups", &removeFlag);
    parameters.addBool("force", &myForceFlag);
    parameters.addString("excludeFlags", &excludeFlags);
    parameters.addBool("verbose", &verboseFlag);
    parameters.addBool("noeof", &noeof);
    parameters.addBool("params", &params);
    parameters.addPhoneHome(VERSION);
    myRecab.addRecabSpecificParameters(parameters);

    ParameterList inputParameters;
    inputParameters.Add(new LongParameters ("Input Parameters",
                                            parameters.getLongParameterList()));

    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    if(inFile.IsEmpty())
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "Specify an input file" << std::endl;
        return EXIT_FAILURE;
    }

    if(outFile.IsEmpty())
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "Specify an output file" << std::endl;
        return EXIT_FAILURE;
    }

    intExcludeFlags = excludeFlags.AsInteger();

    if(myForceFlag && SamFlag::isDuplicate(intExcludeFlags))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "Cannot specify --force and Duplicate in the excludeFlags.  Since --force indicates to override"
                  << " previous duplicate setting and the excludeFlags says to skip those, you can't do both.\n";
        return EXIT_FAILURE;
    }

    if(!SamFlag::isSecondary(intExcludeFlags))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "ERROR: Secondary reads must be excluded, edit --excludeFlags to include 0x0100\n";
        return EXIT_FAILURE;
    }

    if(!(intExcludeFlags & SamFlag::SUPPLEMENTARY_ALIGNMENT))
    {
        printUsage(std::cerr);
        inputParameters.Status();
        std::cerr << "ERROR: Supplementary reads must be excluded, edit --excludeFlags to include 0x0800\n";
        return EXIT_FAILURE;
    }

    if(logFile.IsEmpty())
    {
        logFile = outFile + ".log";
    }

    if(myDoRecab)
    {
        int status = myRecab.processRecabParam();
        if(status != 0)
        {
            inputParameters.Status();
            return(status);
        }
    }

    if(params)
    {
        inputParameters.Status();
    }

    Logger::gLogger = new Logger(logFile.c_str(), verboseFlag);

    /* -------------------------------------------------------------------
     * The arguments are processed.  Prepare the input BAM file,
     * instantiate dedup_LowMem, and construct the read group library map
     * ------------------------------------------------------------------*/

    SamFile samIn;

    samIn.OpenForRead(inFile.c_str());
    // If the file isn't sorted it will throw an exception.
    samIn.setSortedValidation(SamFile::COORDINATE);

    SamFileHeader header;
    samIn.ReadHeader(header);

    buildReadGroupLibraryMap(header);

    lastReference = -1;
    lastCoordinate = -1;

    // for keeping some basic statistics
    uint32_t recordCount = 0;
    uint32_t pairedCount = 0;
    uint32_t properPairCount = 0;
    uint32_t unmappedCount = 0;
    uint32_t reverseCount = 0;
    uint32_t qualCheckFailCount = 0;
    uint32_t secondaryCount = 0;
    uint32_t supplementaryCount = 0;
    uint32_t excludedCount = 0;

    // Now we start reading records
    SamRecord* recordPtr;
    SamStatus::Status returnStatus = SamStatus::SUCCESS;
    while(returnStatus == SamStatus::SUCCESS)
    {
        recordPtr = mySamPool.getRecord();
        if(recordPtr == NULL)
        {
            std::cerr << "Failed to allocate enough records\n";
            return(-1);
        }
        if(!samIn.ReadRecord(header, *recordPtr))
        {
            returnStatus = samIn.GetStatus();
            continue;
        }
        // Take note of properties of this record
        int flag = recordPtr->getFlag();
        if(SamFlag::isPaired(flag))     ++pairedCount;
        if(SamFlag::isProperPair(flag)) ++properPairCount;
        if(SamFlag::isReverse(flag))    ++reverseCount;
        if(SamFlag::isQCFailure(flag))  ++qualCheckFailCount;
        if(SamFlag::isSecondary(flag))  ++secondaryCount;
        if(flag & SamFlag::SUPPLEMENTARY_ALIGNMENT)  ++supplementaryCount;
        if(!SamFlag::isMapped(flag))    ++unmappedCount;

        // put the record in the appropriate maps:
        //   single reads go in myFragmentMap
        //   paired reads go in myPairedMap
        recordCount = samIn.GetCurrentRecordCount();

        // if we have moved to a new position, look back at previous reads for duplicates
        if (hasPositionChanged(*recordPtr))
        {
            cleanupPriorReads(recordPtr);
        }

        // Determine if this read should be checked for duplicates.
        if((!SamFlag::isMapped(flag)) || ((flag & intExcludeFlags) != 0))
        {
            ++excludedCount;

            // No deduping done on this record, but still build the recab table.
            if(myDoRecab)
            {
                myRecab.processReadBuildTable(*recordPtr);
            }
            // Nothing more to do with this record, so
            // release the pointer.
            mySamPool.releaseRecord(recordPtr);
        }
        else
        {
            if(SamFlag::isDuplicate(flag) && !myForceFlag)
            {
                // Error: Marked duplicates, and duplicates aren't excluded.
                Logger::gLogger->error("There are records already duplicate marked.");
                Logger::gLogger->error("Use -f to clear the duplicate flag and start the dedup_LowMem procedure over");
            }

            checkDups(*recordPtr, recordCount);
            mySamPool.releaseRecord(recordPtr);
        }
        // let the user know we're not napping
        if (verboseFlag && (recordCount % 100000 == 0))
        {
            Logger::gLogger->writeLog("recordCount=%u singleKeyMap=%u pairedKeyMap=%u, dictSize=%u",
                                      recordCount, myFragmentMap.size(),
                                      myPairedMap.size(),
                                      myMateMap.size());
        }
    }

    // we're finished reading record so clean up the duplicate search and
    //  close the input file
    cleanupPriorReads(NULL);
    samIn.Close();

    // print some statistics
    Logger::gLogger->writeLog("--------------------------------------------------------------------------");
    Logger::gLogger->writeLog("SUMMARY STATISTICS OF THE READS");
    Logger::gLogger->writeLog("Total number of reads: %u",recordCount);
    Logger::gLogger->writeLog("Total number of paired-end reads: %u",
                              pairedCount);
    Logger::gLogger->writeLog("Total number of properly paired reads: %u",
                              properPairCount);
    Logger::gLogger->writeLog("Total number of unmapped reads: %u",
                              unmappedCount);
    Logger::gLogger->writeLog("Total number of reverse strand mapped reads: %u",
                              reverseCount);
    Logger::gLogger->writeLog("Total number of QC-failed reads: %u",
                              qualCheckFailCount);
    Logger::gLogger->writeLog("Total number of secondary reads: %u",
                              secondaryCount);
    Logger::gLogger->writeLog("Total number of supplementary reads: %u",
                              supplementaryCount);
    Logger::gLogger->writeLog("Size of singleKeyMap (must be zero): %u",
                              myFragmentMap.size());
    Logger::gLogger->writeLog("Size of pairedKeyMap (must be zero): %u",
                              myPairedMap.size());
    Logger::gLogger->writeLog("Total number of missing mates: %u",
                              myNumMissingMate);
    Logger::gLogger->writeLog("Total number of reads excluded from duplicate checking: %u",
                              excludedCount);
    Logger::gLogger->writeLog("--------------------------------------------------------------------------");
    Logger::gLogger->writeLog("Sorting the indices of %d duplicated records",
                              myDupList.size());

    // sort the indices of duplicate records
    std::sort(myDupList.begin(), myDupList.end(),
              std::less<uint32_t> ());

    // get ready to write the output file by making a second pass
    // through the input file
    samIn.OpenForRead(inFile.c_str());
    samIn.ReadHeader(header);

    SamFile samOut;
    samOut.OpenForWrite(outFile.c_str());
    samOut.WriteHeader(header);

    // If we are recalibrating, output the model information.
    if(myDoRecab)
    {
        myRecab.modelFitPrediction(outFile);
    }

    // an iterator to run through the duplicate indices
    int currentDupIndex = 0;
    bool moreDups = !myDupList.empty();

    // let the user know what we're doing
    Logger::gLogger->writeLog("\nWriting %s", outFile.c_str());

    // count the duplicate records as a check
    uint32_t singleDuplicates(0), pairedDuplicates(0);

    // start reading records and writing them out
    SamRecord record;
    while(samIn.ReadRecord(header, record))
    {
        uint32_t currentIndex = samIn.GetCurrentRecordCount();

        bool foundDup = moreDups &&
                        (currentIndex == myDupList[currentDupIndex]);

        // modify the duplicate flag and write out the record,
        // if it's appropriate
        int flag = record.getFlag();
        if (foundDup)
        {
            // this record is a duplicate, so mark it.
            record.setFlag( flag | 0x400 );
            currentDupIndex++;
            // increment duplicate counters to verify we found them all
            if ( ( ( flag & 0x0001 ) == 0 ) || ( flag & 0x0008 ) )
            {   // unpaired or mate unmapped
                singleDuplicates++;
            }
            else
            {
                pairedDuplicates++;
            }
            // recalibrate if necessary.
            if(myDoRecab)
            {
                myRecab.processReadApplyTable(record);
            }

            // write the record if we are not removing duplicates
            if (!removeFlag ) samOut.WriteRecord(header, record);
        }
        else
        {
            if(myForceFlag)
            {
                // this is not a duplicate we've identified but we want to
                // remove any duplicate marking
                record.setFlag( flag & 0xfffffbff ); // unmark duplicate
            }
            // Not a duplicate, so recalibrate if necessary.
            if(myDoRecab)
            {
                myRecab.processReadApplyTable(record);
            }
            samOut.WriteRecord(header, record);
        }

        // Let the user know we're still here
        if (verboseFlag && (currentIndex % 100000 == 0)) {
            Logger::gLogger->writeLog("recordCount=%u", currentIndex);
        }
    }

    // We're done.  Close the files and print triumphant messages.
    samIn.Close();
    samOut.Close();

    Logger::gLogger->writeLog("Successfully %s %u unpaired and %u paired duplicate reads",
                              removeFlag ? "removed" : "marked" ,
                              singleDuplicates,
                              pairedDuplicates/2);
    Logger::gLogger->writeLog("\nDedup_LowMem complete!");
    return 0;
}
int ReadReference::execute(int argc, char **argv)
{
    static const int UNSPECIFIED_INT = -1;
    String refFile = "";
    String refName = "";
    int start = UNSPECIFIED_INT;
    int numBases = UNSPECIFIED_INT;
    int end = UNSPECIFIED_INT;
    bool params = false;
    
    // Read in the parameters.    
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("refFile", &refFile)
        LONG_STRINGPARAMETER("refName", &refName)
        LONG_INTPARAMETER("start", &start)
        LONG_INTPARAMETER("end", &end)
        LONG_INTPARAMETER("numBases", &numBases)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);
    
    if((refName == "") || (start == UNSPECIFIED_INT) || 
       ((end == UNSPECIFIED_INT) && (numBases == UNSPECIFIED_INT)))
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing Required Parameter\n\n";
        return(-1);
    }
    if((end != UNSPECIFIED_INT) && (numBases != UNSPECIFIED_INT))
    {
        usage();
        inputParameters.Status();
        std::cerr << "Only --end or --numBases can be specified\n\n";
        return(-1);
    }
    else if(numBases != UNSPECIFIED_INT)
    {
        end = start + numBases;
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the reference.
    GenomeSequence reference(refFile);

    uint32_t refStart = 
        reference.getGenomePosition(refName.c_str());

    if(refStart == INVALID_GENOME_INDEX)
    {
        std::cerr << "Reference Name: " << refName.c_str()
                  << " not found in the reference file\n"; 
        return(-1);
    }

    std::string refString;
    
    reference.getString(refString, refStart + start, end - start);
    std::cout << refString << std::endl;
    
    return(0);
}
Example #8
0
// Convert a SAM/BAM file: copy every record from --in to --out.
//
// Optional behavior selected on the command line (read from argv[2] on):
//   --refFile                 reference used for sequence translation
//   --lshift                  call shiftIndelsLeft() on each record
//   --noeof                   do not require the BGZF EOF block
//   --recover                 try to resync and continue after a runtime
//                             error while reading
//   --useBases / --useEquals  sequence translation on write (only honored
//                             when a reference file was given)
//   --useOrigSeq              write the sequence untranslated (default)
//   --params                  print the parameter settings
//
// Returns SamStatus::SUCCESS (0) when everything was copied, -1 when a
// mandatory argument is missing, the status of the last failed write, or
// SamStatus::FAIL_IO when a runtime error interrupted the copy and could
// not be recovered from.
int Convert::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String outFile = "";
    String refFile = "";
    bool lshift = false;
    bool noeof = false;
    bool params = false;

    bool useBases = false;
    bool useEquals = false;
    bool useOrigSeq = false;

    bool recover = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_STRINGPARAMETER("out", &outFile)
        LONG_STRINGPARAMETER("refFile", &refFile)
        LONG_PARAMETER("lshift", &lshift)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("recover", &recover)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("SequenceConversion")
            EXCLUSIVE_PARAMETER("useBases", &useBases)
            EXCLUSIVE_PARAMETER("useEquals", &useEquals)
            EXCLUSIVE_PARAMETER("useOrigSeq", &useOrigSeq)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }
    
    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        printUsage(std::cerr);
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    if(outFile == "")
    {
        printUsage(std::cerr);
        inputParameters.Status();
        // Out file was not specified but it is mandatory.
        std::cerr << "--out is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Check to see if the ref file was specified.
    // Open the reference.
    GenomeSequence* refPtr = NULL;
    if(refFile != "")
    {
        refPtr = new GenomeSequence(refFile);
    }

    // Translating the sequence needs a reference; without one the
    // sequence is written as it was read.
    SamRecord::SequenceTranslation translation;
    if((useBases) && (refPtr != NULL))
    {
        translation = SamRecord::BASES;
    }
    else if((useEquals) && (refPtr != NULL))
    {
        translation = SamRecord::EQUAL;
    }
    else
    {
        useOrigSeq = true;
        translation = SamRecord::NONE;
    }
    
    if(params)
    {
        inputParameters.Status();
    }

    // Open the input file for reading.
    SamFile samIn;
    if(recover) samIn.setAttemptRecovery(true);
    samIn.OpenForRead(inFile);

    // Open the output file for writing.
    SamFile samOut;
    samOut.OpenForWrite(outFile);
    samOut.SetWriteSequenceTranslation(translation);
    samOut.SetReference(refPtr);

    // Read the sam header.
    SamFileHeader samHeader;
    samIn.ReadHeader(samHeader);

    // Write the sam header.
    samOut.WriteHeader(samHeader);

    SamRecord samRecord;

    // Set returnStatus to success.  It will be changed
    // to the failure reason if any of the writes fail or if
    // the copy is cut short by an unrecoverable error.
    SamStatus::Status returnStatus = SamStatus::SUCCESS;

    // The outer loop only repeats after a successful resync; every other
    // path leaves it with a break.
    while(1) {
        try {
            // Keep reading records until ReadRecord returns false.
            while(samIn.ReadRecord(samHeader, samRecord))
            {
                // left shift if necessary.
                if(lshift)
                {
                    samRecord.shiftIndelsLeft();
                }

                // Successfully read a record from the file, so write it.
                if(!samOut.WriteRecord(samHeader, samRecord))
                {
                    // Failed to write a record.
                    fprintf(stderr, "%s\n", samOut.GetStatusMessage());
                    returnStatus = samOut.GetStatus();
                }
            }
            break;
        } catch (const std::runtime_error& e) {
            // Caught by const reference so the exception is neither
            // copied nor sliced.
            std::cerr << "Caught runtime error: " << e.what() << "\n";
            if(!recover) {
                std::cerr << "Corrupted BAM file detected - consider using --recover option.\n";
                // The copy is incomplete, so do not report success.
                if(returnStatus == SamStatus::SUCCESS)
                {
                    returnStatus = SamStatus::FAIL_IO;
                }
                break;
            }
            std::cerr << "Attempting to resync at next good BGZF block and BAM record.\n";
            // XXX need to resync SamFile stream here
            bool rc = samIn.attemptRecoverySync(checkSignature, SIGNATURE_LENGTH);
            if(rc) {
                std::cerr << "Successful resync - some data lost.\n";
                continue;    // succeeded
            }
            std::cerr << "Failed to re-sync on data stream.\n";
            // The rest of the input is unreachable, so do not report success.
            if(returnStatus == SamStatus::SUCCESS)
            {
                returnStatus = SamStatus::FAIL_IO;
            }
            break;              // failed to resync
        }
    }

    std::cerr << std::endl << "Number of records read = " << 
        samIn.GetCurrentRecordCount() << std::endl;
    std::cerr << "Number of records written = " << 
        samOut.GetCurrentRecordCount() << std::endl;

    if(refPtr != NULL)
    {
        delete(refPtr);
    }

    // Return the status accumulated above: success, the status of
    // the last failed write, or the failure recorded when the copy
    // was interrupted.
    return(returnStatus);
}
Example #9
0
// main function of verifyBamID
int execute(int argc, char** argv) {
  printf("verifyBamID %s -- verify identity and purity of sequence data\n"
	 "(c) 2010-2014 Hyun Min Kang, Goo Jun, and Goncalo Abecasis\n\n", VERSION);

  VerifyBamIDArgs args;
  ParameterList pl;

  BEGIN_LONG_PARAMETERS(longParameters)
    LONG_PARAMETER_GROUP("Input Files")
    LONG_STRINGPARAMETER("vcf",&args.sVcfFile)
    LONG_STRINGPARAMETER("bam",&args.sBamFile)
    LONG_STRINGPARAMETER("subset",&args.sSubsetInds)
    LONG_STRINGPARAMETER("smID",&args.sSMID)

    LONG_PARAMETER_GROUP("VCF analysis options")
    LONG_DOUBLEPARAMETER("genoError",&args.genoError)
    LONG_DOUBLEPARAMETER("minAF",&args.minAF)
    LONG_DOUBLEPARAMETER("minCallRate",&args.minCallRate)

    LONG_PARAMETER_GROUP("Individuals to compare with chip data")
    EXCLUSIVE_PARAMETER("site",&args.bSiteOnly)
    EXCLUSIVE_PARAMETER("self",&args.bSelfOnly)
    EXCLUSIVE_PARAMETER("best",&args.bFindBest)

    LONG_PARAMETER_GROUP("Chip-free optimization options")
    EXCLUSIVE_PARAMETER("free-none",&args.bFreeNone)
    EXCLUSIVE_PARAMETER("free-mix",&args.bFreeMixOnly)
    EXCLUSIVE_PARAMETER("free-refBias",&args.bFreeRefBiasOnly)
    EXCLUSIVE_PARAMETER("free-full",&args.bFreeFull)

    LONG_PARAMETER_GROUP("With-chip optimization options")
    EXCLUSIVE_PARAMETER("chip-none",&args.bChipNone)
    EXCLUSIVE_PARAMETER("chip-mix",&args.bChipMixOnly)
    EXCLUSIVE_PARAMETER("chip-refBias",&args.bChipRefBiasOnly)
    EXCLUSIVE_PARAMETER("chip-full",&args.bChipFull)

    LONG_PARAMETER_GROUP("BAM analysis options")
    LONG_PARAMETER("ignoreRG",&args.bIgnoreRG)
    LONG_PARAMETER("ignoreOverlapPair",&args.bIgnoreOverlapPair)
    LONG_PARAMETER("noEOF",&args.bNoEOF)
    LONG_PARAMETER("precise",&args.bPrecise)
    LONG_INTPARAMETER("minMapQ",&args.minMapQ)
    LONG_INTPARAMETER("maxDepth",&args.maxDepth)
    LONG_INTPARAMETER("minQ",&args.minQ)
    LONG_INTPARAMETER("maxQ",&args.maxQ)
    LONG_DOUBLEPARAMETER("grid",&args.grid)

    LONG_PARAMETER_GROUP("Modeling Reference Bias")
    LONG_DOUBLEPARAMETER("refRef",&args.pRefRef)
    LONG_DOUBLEPARAMETER("refHet",&args.pRefHet)
    LONG_DOUBLEPARAMETER("refAlt",&args.pRefAlt)

    LONG_PARAMETER_GROUP("Output options")
    LONG_STRINGPARAMETER("out",&args.sOutFile)
    LONG_PARAMETER("verbose",&args.bVerbose)
    LONG_PHONEHOME(VERSION)
  END_LONG_PARAMETERS();

  pl.Add(new LongParameters("Available Options",longParameters));
  pl.Read(argc, argv);
  pl.Status();

  // check the validity of input files
  if ( args.sVcfFile.IsEmpty() ) {
    error("--vcf [vcf file] required");
  }

  if ( args.sBamFile.IsEmpty() ) {
    error("--bam [bam file] is required");
  }

  if ( args.sOutFile.IsEmpty() ) {
    error("--out [output prefix] is required");
  }
  Logger::gLogger = new Logger((args.sOutFile + ".log").c_str(), args.bVerbose);

  if ( ! ( args.bSiteOnly || args.bSelfOnly || args.bFindBest ) ) {
    warning("--self option was autotomatically turned on by default. Specify --best option if you wanted to check across all possible samples in the VCF");
    args.bSelfOnly = true;
  }

  if ( ( args.maxDepth > 20 ) && ( !args.bPrecise ) ) {
    warning("--precise option is not turned on at --maxDepth %d : may be prone to precision errors",args.maxDepth);
  }

  if ( ( args.bChipRefBiasOnly ) && ( !args.bSelfOnly ) ) {
    error("--self must be set for --chip-refBias to work. Skipping..");
  }

  // check timestamp
  time_t t;
  time(&t);
  Logger::gLogger->writeLog("Analysis started on %s",ctime(&t));

  // load arguments
  VerifyBamID vbid(&args);

  // load input VCF and BAM files
  Logger::gLogger->writeLog("Opening Input Files");
  vbid.loadFiles(args.sBamFile.c_str(), args.sVcfFile.c_str());

  // Check which genotype-free method is used
  if ( args.bFreeNone ) {  // if no genotype-free mode is tested. skip it
    // do nothing for genotype-free estimation
    Logger::gLogger->writeLog("Skipping chip-free estimation of sample mixture");
  }
  else if ( args.bFreeMixOnly ) { // only mixture is estimated.
    // genotype-free method
    Logger::gLogger->writeLog("Performing chip-free estimation of sample mixture at fixed reference bias parameters (%lf, %lf, %lf)",args.pRefRef,args.pRefHet,args.pRefAlt);

    // scan across multiple readgroups
    for(int rg=-1; rg < vbid.nRGs - (int)args.bIgnoreRG; ++rg) {
      VerifyBamID::mixLLK mix(&vbid);
      mix.OptimizeLLK(rg);
      Logger::gLogger->writeLog("Optimal per-sample fMix = %lf, LLK0 = %lf, LLK1 = %lf\n",mix.fMix,mix.llk0,mix.llk1);
      vbid.mixOut.llk0s[rg+1] = mix.llk0;
      vbid.mixOut.llk1s[rg+1] = mix.llk1;
      vbid.mixOut.fMixs[rg+1] = mix.fMix;
    }

    //vbid.mixRefHet = 0.5;
    //vbid.mixRefAlt = 0.00;
  }
  else if ( args.bFreeRefBiasOnly ) {
    Logger::gLogger->writeLog("Performing chip-free estimation of reference-bias without sample mixture");
    for(int rg=-1; rg < vbid.nRGs - (int)args.bIgnoreRG; ++rg) {
      VerifyBamID::refBiasMixLLKFunc myFunc(&vbid, rg);
      AmoebaMinimizer myMinimizer;
      Vector startingPoint(2);
      startingPoint[0] = 0;      // pRefHet = 0.5
      startingPoint[1] = -4.595; // pRefAlt = 0.01
      myMinimizer.func = &myFunc;
      myMinimizer.Reset(2);
      myMinimizer.point = startingPoint;
      myMinimizer.Minimize(1e-6);
      double pRefHet = VerifyBamID::invLogit(myMinimizer.point[0]);
      double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[1]);
      Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf at readGroup %d",pRefHet,pRefAlt,myMinimizer.fmin,rg);
      //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt);

      vbid.mixOut.llk0s[rg+1] = myFunc.llk0;
      vbid.mixOut.llk1s[rg+1] = myFunc.llk1;
      vbid.mixOut.refHets[rg+1] = myFunc.pRefHet;
      vbid.mixOut.refAlts[rg+1] = myFunc.pRefAlt;
    }
  }
  else if ( args.bFreeFull ) {
    Logger::gLogger->writeLog("Performing chip-free estimation of reference-bias and sample mixture together");
    for(int rg = -1; rg < vbid.nRGs - args.bIgnoreRG; ++rg) {
      VerifyBamID::fullMixLLKFunc myFunc(&vbid, rg);
      AmoebaMinimizer myMinimizer;
      Vector startingPoint(3);
      startingPoint[0] = -3.91;  // start with fMix = 0.01
      startingPoint[1] = 0;      // pRefHet = 0.5
      startingPoint[2] = -4.595; // pRefAlt = 0.01
      myMinimizer.func = &myFunc;
      myMinimizer.Reset(3);
      myMinimizer.point = startingPoint;
      myMinimizer.Minimize(1e-6);
      double fMix = VerifyBamID::invLogit(myMinimizer.point[0]);
      if ( fMix > 0.5 ) 
	fMix = 1.-fMix;
      double pRefHet = VerifyBamID::invLogit(myMinimizer.point[1]);
      double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[2]);
      Logger::gLogger->writeLog("Optimal per-sample fMix = %lf\n",fMix);
      Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin);
      //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt);

      vbid.mixOut.llk0s[rg+1] = myFunc.llk0;
      vbid.mixOut.llk1s[rg+1] = myFunc.llk1;
      vbid.mixOut.fMixs[rg+1] = myFunc.fMix;
      vbid.mixOut.refHets[rg+1] = myFunc.pRefHet;
      vbid.mixOut.refAlts[rg+1] = myFunc.pRefAlt;
    }
  }
  Logger::gLogger->writeLog("calculating depth distribution");  
  vbid.calculateDepthDistribution(args.maxDepth, vbid.mixOut);

  Logger::gLogger->writeLog("finished calculating depth distribution");  

  std::vector<int> bestInds(vbid.nRGs+1,-1);
  std::vector<int> selfInds(vbid.nRGs+1,-1);

  if ( args.bChipNone ) {
    // do nothing
    Logger::gLogger->writeLog("Skipping with-chip estimation of sample mixture");
  }
  else if ( args.bChipMixOnly ) {
    Logger::gLogger->writeLog("Performing with-chip estimation of sample mixture at fixed reference bias parameter (%lf, %lf, %lf)",args.pRefRef,args.pRefHet,args.pRefAlt);
    
    for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) {
      double maxIBD = -1;
      VerifyBamID::ibdLLK ibd(&vbid);
      for(int i=0; i < (int)vbid.pGenotypes->indids.size(); ++i) {
	double fIBD = ibd.OptimizeLLK(i, rg);
	Logger::gLogger->writeLog("Comparing with individual %s.. Optimal fIBD = %lf, LLK0 = %lf, LLK1 = %lf for readgroup %d",vbid.pGenotypes->indids[i].c_str(),fIBD, ibd.llk0, ibd.llk1, rg);
	if ( maxIBD < fIBD ) {
	  bestInds[rg+1] = i;
	  vbid.bestOut.llk0s[rg+1] = ibd.llk0;
	  vbid.bestOut.llk1s[rg+1] = ibd.llk1;
	  vbid.bestOut.fMixs[rg+1] = 1-ibd.fIBD;
	  maxIBD = ibd.fIBD;
	}

	if ( ( (rg < 0) && (vbid.pPile->sBamSMID == vbid.pGenotypes->indids[i] ) ) || ( ( rg >= 0 ) && ( vbid.pPile->vsSMIDs[rg] == vbid.pGenotypes->indids[i]) ) ) {
	  selfInds[rg+1] = i;
	  vbid.selfOut.llk0s[rg+1] = ibd.llk0;
	  vbid.selfOut.llk1s[rg+1] = ibd.llk1;
	  vbid.selfOut.fMixs[rg+1] = 1-ibd.fIBD;
	}
      }

      if ( bestInds[rg+1] >= 0 ) {
	Logger::gLogger->writeLog("Best Matching Individual is %s with IBD = %lf",vbid.pGenotypes->indids[bestInds[rg+1]].c_str(),maxIBD);
	vbid.calculateDepthByGenotype(bestInds[rg+1],rg,vbid.bestOut);
      }

      if ( selfInds[rg+1] >= 0 ) {
	Logger::gLogger->writeLog("Self Individual is %s with IBD = %lf",vbid.pGenotypes->indids[selfInds[rg+1]].c_str(),vbid.selfOut.fMixs[rg+1]);
	vbid.calculateDepthByGenotype(selfInds[rg+1],rg,vbid.selfOut);
      }
    }
  }
  else if ( args.bChipRefBiasOnly ) {
    Logger::gLogger->writeLog("Performing with-chip estimation of reference-bias without sample mixture");
    if ( args.bSelfOnly ) {
      for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) {
	VerifyBamID::refBiasIbdLLKFunc myFunc(&vbid, rg);
	AmoebaMinimizer myMinimizer;
	Vector startingPoint(2);
	startingPoint[0] = 0;      // pRefHet = 0.5
	startingPoint[1] = -4.595; // pRefAlt = 0.01
	myMinimizer.func = &myFunc;
	myMinimizer.Reset(2);
	myMinimizer.point = startingPoint;
	myMinimizer.Minimize(1e-6);
	double pRefHet = VerifyBamID::invLogit(myMinimizer.point[0]);
	double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[1]);
	Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin);
	//vbid.setRefBiasParams(1.0, pRefHet, pRefAlt);

	vbid.selfOut.llk0s[rg+1] = myFunc.llk0;
	vbid.selfOut.llk1s[rg+1] = myFunc.llk1;
	vbid.selfOut.refHets[rg+1] = myFunc.pRefHet;
	vbid.selfOut.refAlts[rg+1] = myFunc.pRefAlt;
	vbid.calculateDepthByGenotype(0,rg,vbid.selfOut);
      }
    }
    else {
      Logger::gLogger->warning("--self must be set for --chip-refBias to work. Skipping..");
    }
  }
  else if ( args.bChipFull ) {
    Logger::gLogger->writeLog("Performing with-chip estimation of reference-bias and sample mixture together");
    for(int rg=-1; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) {
      double maxIBD = -1;

      for(int i=0; i < (int)vbid.pGenotypes->indids.size(); ++i) {
	VerifyBamID::fullIbdLLKFunc myFunc(&vbid,i,rg);
	AmoebaMinimizer myMinimizer;
	Vector startingPoint(3);
	startingPoint[0] = 3.91;  // start with fIBD = 0.99
	startingPoint[1] = 0;      // pRefHet = 0.5
	startingPoint[2] = -4.595; // pRefAlt = 0.01
	myMinimizer.func = &myFunc;

	myFunc.indIdx = i;
	myMinimizer.Reset(3);
	myMinimizer.point = startingPoint;
	myMinimizer.Minimize(1e-6);
	double fIBD = VerifyBamID::invLogit(myMinimizer.point[0]);
	double pRefHet = VerifyBamID::invLogit(myMinimizer.point[1]);
	double pRefAlt = VerifyBamID::invLogit(myMinimizer.point[2]);

	Logger::gLogger->writeLog("Comparing with individual %s.. Optimal fIBD = %lf, LLK0 = %lf, LLK1 = %lf for readgroup %d",vbid.pGenotypes->indids[i].c_str(), fIBD, myFunc.llk0, myFunc.llk1, rg);
	//Logger::gLogger->writeLog("Optimal per-sample fIBD = %lf, ",fIBD);
	Logger::gLogger->writeLog("Reference Bias Estimated as ( Pr[refBase|HET] = %lf, Pr[refBase|ALT] = %lf ) with LLK = %lf",pRefHet,pRefAlt,myMinimizer.fmin);
	if ( maxIBD < fIBD ) {
	  bestInds[rg+1] = i;
	  maxIBD = fIBD;
	  vbid.bestOut.llk0s[rg+1] = myFunc.llk0;
	  vbid.bestOut.llk1s[rg+1] = myFunc.llk1;
	  vbid.bestOut.fMixs[rg+1] = 1.-myFunc.fIBD;
	  vbid.bestOut.refHets[rg+1] = myFunc.pRefHet;
	  vbid.bestOut.refAlts[rg+1] = myFunc.pRefAlt;
	}

	if ( ( (rg < 0) && (vbid.pPile->sBamSMID == vbid.pGenotypes->indids[i] ) ) || ( ( rg >= 0 ) && ( vbid.pPile->vsSMIDs[rg] == vbid.pGenotypes->indids[i]) ) ) {
	  selfInds[rg+1] = i;
	  vbid.selfOut.llk0s[rg+1] = myFunc.llk0;
	  vbid.selfOut.llk1s[rg+1] = myFunc.llk1;
	  vbid.selfOut.fMixs[rg+1] = 1.-myFunc.fIBD;
	  vbid.selfOut.refHets[rg+1] = myFunc.pRefHet;
	  vbid.selfOut.refAlts[rg+1] = myFunc.pRefAlt;
	  vbid.calculateDepthByGenotype(i, rg, vbid.selfOut);
	}
      }
      //vbid.setRefBiasParams(1.0, pRefHet, pRefAlt);
      if ( bestInds[rg+1] >= 0 ) {
	Logger::gLogger->writeLog("Best Matching Individual is %s with IBD = %lf",vbid.pGenotypes->indids[bestInds[rg+1]].c_str(),maxIBD);
	vbid.calculateDepthByGenotype(bestInds[rg+1], rg, vbid.bestOut);
      }

      if ( selfInds[rg+1] >= 0 ) {
	Logger::gLogger->writeLog("Self Individual is %s with IBD = %lf",vbid.pGenotypes->indids[selfInds[rg+1]].c_str(),vbid.selfOut.fMixs[rg+1]);
	vbid.calculateDepthByGenotype(selfInds[rg+1],rg,vbid.selfOut);
      }
    }
  }

  // PRINT OUTPUT FILE - ".selfSM"
  // [SEQ_ID]  : SAMPLE ID in the sequence file
  // [CHIP_ID] : SAMPLE ID in the chip file (NA if not available)
  // [#SNPS] : Number of markers evaluated
  // [#READS]   : Number of reads evaluated
  // [AVG_DP]   : Mean depth
  // [FREEMIX]  : Chip-free estimated alpha (% MIX in 0-1 scale), NA if unavailable
  // [FREELK1]  : Chip-free log-likelihood at estimated alpha
  // [FREELK0]  : Chip-free log-likelihood at 0% contamination
  // [CHIPIBD]  : With-chip estimated alpha (% MIX in 0-1 scale)
  // [CHIPLK1]  : With-chip log-likelihood at estimated alpha
  // [CHIPLK0]  : With-chip log-likelihood at 0% contamination
  // [DPREF]    : Depth at reference site in the chip
  // [RDPHET]   : Relative depth at HET site in the chip
  // [RDPALT]   : Relative depth at HOMALT site in the chip
  // [FREE_RF]  : Pr(Ref|Ref) site estimated without chip data
  // [FREE_RH]  : Pr(Ref|Het) site estimated without chip data
  // [FREE_RA]  : Pr(Ref|Alt) site estimated without chip data
  // [CHIP_RF]  : Pr(Ref|Ref) site estimated with chip data
  // [CHIP_RH]  : Pr(Ref|Het) site estimated with chip data
  // [CHIP_RA]  : Pr(Ref|Alt) site estimated with chip data
  // [DPREF]    : Depth at reference alleles
  // [RDPHET]   : Relative depth at heterozygous alleles
  // [RDPALT]   : Relative depth at hom-alt alleles

  String selfSMFN = args.sOutFile + ".selfSM";
  String bestSMFN = args.sOutFile + ".bestSM";
  String selfRGFN = args.sOutFile + ".selfRG";
  String bestRGFN = args.sOutFile + ".bestRG";
  String dpSMFN = args.sOutFile + ".depthSM";
  String dpRGFN = args.sOutFile + ".depthRG";

  IFILE selfSMF = ifopen(selfSMFN,"wb");
  IFILE bestSMF = (args.bFindBest ? ifopen(bestSMFN,"wb") : NULL);
  IFILE selfRGF = (args.bIgnoreRG ? NULL : ifopen(selfRGFN,"wb"));
  IFILE bestRGF = (args.bFindBest && !args.bIgnoreRG) ? ifopen(bestRGFN,"wb") : NULL;

  IFILE dpSMF = ifopen(dpSMFN,"wb");
  IFILE dpRGF = (args.bIgnoreRG ? NULL : ifopen(dpRGFN,"wb"));
  if ( selfSMF == NULL ) {
    Logger::gLogger->error("Cannot write to %s",selfSMF);
  }
  if ( args.bFindBest && ( bestSMF == NULL ) ) {
    Logger::gLogger->error("Cannot write to %s",bestSMF);
  }
  if ( dpSMF == NULL ) {
    Logger::gLogger->error("Cannot write to %s",dpSMF);
  }

  ifprintf(dpSMF,"#RG\tDEPTH\t#SNPs\t%%SNPs\t%%CUMUL\n");
  int nCumMarkers = 0;
  for(int i=args.maxDepth; i >= 0; --i) {
    nCumMarkers += vbid.mixOut.depths[i];
    ifprintf(dpSMF,"ALL\t%d\t%d\t%.5lf\t%.5lf\n",i, vbid.mixOut.depths[i],(double) vbid.mixOut.depths[i]/(double)vbid.nMarkers,(double)nCumMarkers/(double)vbid.nMarkers);
  }
  ifclose(dpSMF);


  if ( dpRGF != NULL ) {
    ifprintf(dpRGF,"#RG\tDEPTH\t#SNPs\t%%SNPs\t%%CUMUL\n");
    for(int rg=0; rg < (vbid.nRGs - (int)args.bIgnoreRG); ++rg) {
      const char* rgID = vbid.pPile->vsRGIDs[rg].c_str();

      int nMarkers = 0;
      for(int i=args.maxDepth; i >= 0; --i) {
	nMarkers += vbid.mixOut.depths[(rg+1)*(args.maxDepth+1) + i];
      }

      nCumMarkers = 0;
      for(int i=args.maxDepth; i >= 0; --i) {
	int d = vbid.mixOut.depths[(rg+1)*(args.maxDepth+1) + i];
	nCumMarkers += d;
	ifprintf(dpRGF,"%s\t%d\t%d\t%.5lf\t%.5lf\n",rgID,i,d,(double)d/(double)vbid.nMarkers,(double)nCumMarkers/(double)nMarkers);
      }
    }
    ifclose(dpRGF);
  }

  const char* headers[] = {"#SEQ_ID","RG","CHIP_ID","#SNPS","#READS","AVG_DP","FREEMIX","FREELK1","FREELK0","FREE_RH","FREE_RA","CHIPMIX","CHIPLK1","CHIPLK0","CHIP_RH","CHIP_RA","DPREF","RDPHET","RDPALT"};
  int nheaders = sizeof(headers)/sizeof(headers[0]);

  for(int i=0; i < nheaders; ++i) { ifprintf(selfSMF,"%s%s",i>0 ? "\t" : "",headers[i]); }
  ifprintf(selfSMF,"\n");
  ifprintf(selfSMF,"%s\tALL",vbid.pPile->sBamSMID.c_str());
  ifprintf(selfSMF,"\t%s",selfInds[0] >= 0 ? vbid.pGenotypes->indids[selfInds[0]].c_str() : "NA");
  ifprintf(selfSMF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[0],(double)vbid.mixOut.numReads[0]/(double)vbid.nMarkers);
  if ( args.bFreeNone ) { ifprintf(selfSMF,"\tNA\tNA\tNA\tNA\tNA"); }
  else if ( args.bFreeMixOnly ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0]); }
  else if ( args.bFreeRefBiasOnly ) { ifprintf(selfSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); }
  else if ( args.bFreeFull ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); }
  else { error("Invalid option in handling bFree"); }

  if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(selfSMF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); }
  else if ( args.bChipMixOnly ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.fMixs[0],vbid.selfOut.llk1s[0],vbid.selfOut.llk0s[0],(double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); }
  else if ( args.bChipMixOnly ) { ifprintf(selfSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.llk1s[0], vbid.selfOut.llk0s[0], vbid.selfOut.refHets[0], vbid.selfOut.refAlts[0], (double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); }
  else if ( args.bChipFull ) { ifprintf(selfSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.selfOut.fMixs[0], vbid.selfOut.llk1s[0], vbid.selfOut.llk0s[0], vbid.selfOut.refHets[0], vbid.selfOut.refAlts[0], (double)vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[1], (double)vbid.selfOut.numReads[2]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[2], (double)vbid.selfOut.numReads[3]*vbid.selfOut.numGenos[1]/vbid.selfOut.numReads[1]/vbid.selfOut.numGenos[3]); }
  else { error("Invalid option in handling bChip"); }
  ifprintf(selfSMF,"\n");
  ifclose(selfSMF);

  if ( bestSMF != NULL ) {
    for(int i=0; i < nheaders; ++i) { ifprintf(bestSMF,"%s%s",i>0 ? "\t" : "",headers[i]); }
    ifprintf(bestSMF,"\n");
    ifprintf(bestSMF,"%s\tALL",vbid.pPile->sBamSMID.c_str());
    ifprintf(bestSMF,"\t%s",bestInds[0] >= 0 ? vbid.pGenotypes->indids[bestInds[0]].c_str() : "NA");
    ifprintf(bestSMF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[0],(double)vbid.mixOut.numReads[0]/(double)vbid.nMarkers);
    if ( args.bFreeNone ) { ifprintf(bestSMF,"\tNA\tNA\tNA\tNA\tNA"); }
    else if ( args.bFreeMixOnly ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0]); }
    else if ( args.bFreeRefBiasOnly ) { ifprintf(bestSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); }
    else if ( args.bFreeFull ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[0],vbid.mixOut.llk1s[0],vbid.mixOut.llk0s[0],vbid.mixOut.refHets[0],vbid.mixOut.refAlts[0]); }
    else { error("Invalid option in handling bFree"); }
    
    if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(bestSMF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); }
    else if ( args.bChipMixOnly ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.fMixs[0],vbid.bestOut.llk1s[0],vbid.bestOut.llk0s[0],(double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); }
    else if ( args.bChipMixOnly ) { ifprintf(bestSMF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.llk1s[0], vbid.bestOut.llk0s[0], vbid.bestOut.refHets[0], vbid.bestOut.refAlts[0], (double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); }
    else if ( args.bChipFull ) { ifprintf(bestSMF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.bestOut.fMixs[0], vbid.bestOut.llk1s[0], vbid.bestOut.llk0s[0], vbid.bestOut.refHets[0], vbid.bestOut.refAlts[0], (double)vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[1], (double)vbid.bestOut.numReads[2]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[2], (double)vbid.bestOut.numReads[3]*vbid.bestOut.numGenos[1]/vbid.bestOut.numReads[1]/vbid.bestOut.numGenos[3]); }
    else { error("Invalid option in handling bChip"); }
    ifprintf(bestSMF,"\n");
    ifclose(bestSMF);
  }

  if ( selfRGF != NULL ) {
    for(int i=0; i < nheaders; ++i) { ifprintf(selfRGF,"%s%s",i>0 ? "\t" : "",headers[i]); }
    ifprintf(selfRGF,"\n");
    for(int rg=0; rg < vbid.nRGs; ++rg) {
      ifprintf(selfRGF,"%s\t%s",vbid.pPile->sBamSMID.c_str(),vbid.pPile->vsRGIDs[rg].c_str());
      ifprintf(selfRGF,"\t%s",bestInds[rg] >= 0 ? vbid.pGenotypes->indids[bestInds[rg]].c_str() : "NA");
      ifprintf(selfRGF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[(rg+1)*4],(double)vbid.mixOut.numReads[(rg+1)*4]/(double)vbid.mixOut.numGenos[(rg+1)*4]);
      if ( args.bFreeNone ) { ifprintf(selfRGF,"\tNA\tNA\tNA\tNA\tNA"); }
      else if ( args.bFreeMixOnly ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1]); }
      else if ( args.bFreeRefBiasOnly ) { ifprintf(selfRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); }
      else if ( args.bFreeFull ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); }
      else { error("Invalid option in handling bFree"); }
      
      if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(selfRGF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); }
      else if ( args.bChipMixOnly ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.fMixs[rg+1], vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); }
      else if ( args.bChipMixOnly ) { ifprintf(selfRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], vbid.selfOut.refHets[rg+1], vbid.selfOut.refAlts[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); }
      else if ( args.bChipFull ) { ifprintf(selfRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.selfOut.fMixs[rg+1], vbid.selfOut.llk1s[rg+1], vbid.selfOut.llk0s[rg+1], vbid.selfOut.refHets[rg+1], vbid.selfOut.refAlts[rg+1], (double)vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+1], (double)vbid.selfOut.numReads[(rg+1)*4+2]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+2], (double)vbid.selfOut.numReads[(rg+1)*4+3]*vbid.selfOut.numGenos[(rg+1)*4+1]/vbid.selfOut.numReads[(rg+1)*4+1]/vbid.selfOut.numGenos[(rg+1)*4+3]); }
      else { error("Invalid option in handling bChip"); }
      ifprintf(selfRGF,"\n");
    }
    ifclose(selfRGF);
  }

  if ( bestRGF != NULL ) {
    for(int i=0; i < nheaders; ++i) { ifprintf(bestRGF,"%s%s",i>0 ? "\t" : "",headers[i]); }
    ifprintf(bestRGF,"\n");
    for(int rg=0; rg < vbid.nRGs; ++rg) {
      ifprintf(bestRGF,"%s\t%s",vbid.pPile->sBamSMID.c_str(),vbid.pPile->vsRGIDs[rg].c_str());
      ifprintf(bestRGF,"\t%s",bestInds[rg] >= 0 ? vbid.pGenotypes->indids[bestInds[rg]].c_str() : "NA");
      ifprintf(bestRGF,"\t%d\t%d\t%.2lf",vbid.nMarkers,vbid.mixOut.numReads[(rg+1)*4],(double)vbid.mixOut.numReads[(rg+1)*4]/(double)vbid.mixOut.numGenos[(rg+1)*4]);
      if ( args.bFreeNone ) { ifprintf(bestRGF,"\tNA\tNA\tNA\tNA\tNA"); }
      else if ( args.bFreeMixOnly ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1]); }
      else if ( args.bFreeRefBiasOnly ) { ifprintf(bestRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); }
      else if ( args.bFreeFull ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf",vbid.mixOut.fMixs[rg+1],vbid.mixOut.llk1s[rg+1],vbid.mixOut.llk0s[rg+1],vbid.mixOut.refHets[rg+1],vbid.mixOut.refAlts[rg+1]); }
      else { error("Invalid option in handling bFree"); }
      
      if ( args.bChipNone || bestInds[0] < 0 ) { ifprintf(bestRGF,"\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"); }
      else if ( args.bChipMixOnly ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\tNA\tNA\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.fMixs[rg+1], vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); }
      else if ( args.bChipMixOnly ) { ifprintf(bestRGF,"\tNA\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf",vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], vbid.bestOut.refHets[rg+1], vbid.bestOut.refAlts[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); }
      else if ( args.bChipFull ) { ifprintf(bestRGF,"\t%.5lf\t%.2lf\t%.2lf\t%.5lf\t%.5lf\t%.3lf\t%.4lf\t%.4lf", vbid.bestOut.fMixs[rg+1], vbid.bestOut.llk1s[rg+1], vbid.bestOut.llk0s[rg+1], vbid.bestOut.refHets[rg+1], vbid.bestOut.refAlts[rg+1], (double)vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+1], (double)vbid.bestOut.numReads[(rg+1)*4+2]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+2], (double)vbid.bestOut.numReads[(rg+1)*4+3]*vbid.bestOut.numGenos[(rg+1)*4+1]/vbid.bestOut.numReads[(rg+1)*4+1]/vbid.bestOut.numGenos[(rg+1)*4+3]); }
      else { error("Invalid option in handling bChip"); }
      ifprintf(bestRGF,"\n");
    }
    ifclose(bestRGF);
  }
  
  time(&t);
  Logger::gLogger->writeLog("Analysis finished on %s",ctime(&t));

  return 0;
}
Example #10
0
// PedWipe entry point.
//
// Loads a pedigree (data file + pedigree file), reads an errors file whose
// lines are "<family> <person> <marker> ...", and sets both alleles of each
// listed genotype to 0.  Optionally prints per-marker / per-family /
// per-person tallies of the wiped genotypes, then writes the edited
// pedigree out as wiped.* files.
//
// Command line switches (registered below):
//   -d  data file      (default merlin.dat)
//   -p  pedigree file  (default merlin.ped)
//   -e  errors file    (default merlin.err)
//   -t  show tallies
//
// Returns 0.
int main(int argc, char * argv[])
   {
   printf("PedWipe - (c) 2000 Goncalo Abecasis\n"
          "Automatically wipe out genotypes from a pedigree file\n\n");

   String pedfile("merlin.ped");
   String datafile("merlin.dat");
   String errorfile("merlin.err");

   bool showTallies = false;

   ParameterList pl;

   pl.Add(new StringParameter('d', "Data File", datafile));
   pl.Add(new StringParameter('p', "Pedigree File", pedfile));
   pl.Add(new StringParameter('e', "Errors File", errorfile));
   pl.Add(new SwitchParameter('t', "Show Tallies", showTallies));

   pl.Read(argc, argv);
   pl.Status();

   Pedigree ped;

   ped.Prepare(datafile);
   ped.Load(pedfile);

   StringArray errors, tokens;
   errors.Read(errorfile);

   // Number of genotypes actually wiped, plus tallies keyed by marker name,
   // family id and "family.person".
   int count = 0;
   StringIntMap perMarker, perFamily, perPerson;

   // The loop starts at index 1, so the first line of the errors file is
   // never processed.
   for (int i = 1; i < errors.Length(); i++)
      {
      tokens.Clear();
      tokens.AddTokens(errors[i]);

      // Need at least family, person and marker columns.
      if (tokens.Length() < 3) continue;

      Person * person = ped.FindPerson(tokens[0], tokens[1]);

      int markerid = ped.LookupMarker(tokens[2]);

      if (person == NULL)
         {
         printf("Person %s.%s not found ... \n",
                (const char *) tokens[0], (const char *) tokens[1]);
         continue;
         }

      if (markerid == -1)
         {
         printf("Marker %s not found ... \n",
             (const char *) tokens[2]);
         continue;
         }

      printf("Person %s.%s, marker %s wiped.\n",
         (const char *) tokens[0], (const char *) tokens[1],
         (const char *) tokens[2]);

      // Wipe the genotype by zeroing both alleles.
      person->markers[markerid].one = 0;
      person->markers[markerid].two = 0;

      perPerson.IncrementCount(tokens[0] + "." + tokens[1]);
      perFamily.IncrementCount(tokens[0]);
      perMarker.IncrementCount(tokens[2]);

      count++;
      }

   if (perMarker.Length() == 0)
      {
      // Report the errors file that was actually read; it may have been
      // overridden with -e and is not necessarily merlin.err.
      printf("No errors found in %s\n", (const char *) errorfile);
      }
   else if (showTallies && count)
      {
      printf("\nSummary of Errors\n");
      printf("=================\n\n");

      QuickIndex index;

      printf("Per Marker:  (average = %.2f)\n"
             "-----------------------------\n",
            (double) count / (double) ped.markerCount);
      index.IndexCounts(perMarker);
      index.Reverse();
      for (int i = 0; i < perMarker.Length(); i++)
         printf(" %3d errors for marker %s\n",
                perMarker.GetCount(index[i]),
                (const char *) perMarker[index[i]]);

      printf("\nPer Family: (average = %.2f)\n"
             "----------------------------\n",
            (double) count / (double) ped.familyCount);
      index.IndexCounts(perFamily);
      index.Reverse();
      for (int i = 0; i < perFamily.Length(); i++)
         printf(" %3d errors for family %s\n",
                perFamily.GetCount(index[i]), (const char *) perFamily[index[i]]);

      printf("\nPer Person: (average = %.2f)\n"
             "----------------------------\n",
            (double) count / (double) ped.count);
      index.IndexCounts(perPerson);
      index.Reverse();
      for (int i = 0; i < perPerson.Length(); i++)
         printf(" %3d errors for person %s\n",
               perPerson.GetCount(index[i]), (const char *) perPerson[index[i]]);
      }

   printf("\nWriting out edited files [wiped.*] ...\n\n");

   // Map and frequency files are only written when marker information is
   // present in the pedigree.
   if (ped.markerInfoCount)
      {
      ped.WriteMapFile("wiped.map");
      ped.WriteFreqFile("wiped.freq");
      }

   ped.WriteDataFile("wiped.dat");
   ped.WritePedigreeFile("wiped.ped");

   return 0;
   }
Example #11
0
// Generate statistics on a SAM/BAM file.
//
// Parses the command line (argv[0] is skipped), opens the input file and
// then, depending on the options, produces:
//   --basic             : the statistics collected by SamFile
//   --qual / --phred    : a histogram of base qualities (ASCII value / Phred)
//   --pBaseQC/--cBaseQC : per-position base QC statistics via a pileup
//                         (percentages / counts); mutually exclusive
//   --baseSum           : a summary of the base QC statistics
// Reads may be restricted with --unmapped or --regionList (both need a BAM
// index; <in>.bai is used when --bamIndex is not given), --requiredFlags,
// --excludeFlags and --maxNumReads.  Positions listed in --dbsnp are handed
// to the pileup as a PosList.
//
// Returns -1 on a command line error, the SamFile status when the file or
// its header cannot be read, and otherwise the final SamFile status with
// NO_MORE_RECS mapped to SUCCESS.
int Stats::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String indexFile = "";
    bool basic = false;
    bool noeof = false;
    bool params = false;
    bool qual = false;
    bool phred = false;
    int maxNumReads = -1;
    bool unmapped = false;
    String pBaseQC = "";
    String cBaseQC = "";
    String regionList = "";
    int excludeFlags = 0;
    int requiredFlags = 0;
    bool withinRegion = false;
    int minMapQual = 0;
    String dbsnp = "";
    PosList *dbsnpListPtr = NULL;
    bool baseSum = false;
    int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER_GROUP("Types of Statistics")
        LONG_PARAMETER("basic", &basic)
        LONG_PARAMETER("qual", &qual)
        LONG_PARAMETER("phred", &phred)
        LONG_STRINGPARAMETER("pBaseQC", &pBaseQC)
        LONG_STRINGPARAMETER("cBaseQC", &cBaseQC)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_INTPARAMETER("maxNumReads", &maxNumReads)
        LONG_PARAMETER("unmapped", &unmapped)
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
        LONG_STRINGPARAMETER("regionList", &regionList)
        LONG_INTPARAMETER("excludeFlags", &excludeFlags)
        LONG_INTPARAMETER("requiredFlags", &requiredFlags)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters")
        LONG_PARAMETER("withinRegion", &withinRegion)
        LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters")
        LONG_PARAMETER("baseSum", &baseSum)
        LONG_INTPARAMETER("bufferSize", &bufferSize)
        LONG_INTPARAMETER("minMapQual", &minMapQual)
        LONG_STRINGPARAMETER("dbsnp", &dbsnp)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument for stats, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Use the index file if unmapped or regionList is not empty.
    bool useIndex = (unmapped|| (!regionList.IsEmpty()));

    // IndexFile is required, so check to see if it has been set.
    if(useIndex && (indexFile == ""))
    {
        // Index file was not specified, so default it to the in file
        // + ".bai"
        indexFile = inFile + ".bai";
    }
    ////////////////////////////////////////
    // Setup in case pileup is used.
    Pileup<PileupElementBaseQCStats> pileup(bufferSize);
    // Initialize start/end positions.
    myStartPos = 0;
    myEndPos = -1;
    
    // Open the output qc file if applicable.
    IFILE baseQCPtr = NULL;
    if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty())
    {
        usage();
        inputParameters.Status();
        // Cannot specify both types of baseQC.
        std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl;
        return(-1);
    }
    else if(!pBaseQC.IsEmpty())
    {
        baseQCPtr = ifopen(pBaseQC, "w");
        PileupElementBaseQCStats::setPercentStats(true);
    }
    else if(!cBaseQC.IsEmpty())
    {
        baseQCPtr = ifopen(cBaseQC, "w");
        PileupElementBaseQCStats::setPercentStats(false);
    }

    if(baseQCPtr != NULL)
    {
        PileupElementBaseQCStats::setOutputFile(baseQCPtr);
        PileupElementBaseQCStats::printHeader();
    }
    if((baseQCPtr != NULL) || baseSum)
    {
        PileupElementBaseQCStats::setMapQualFilter(minMapQual);
        PileupElementBaseQCStats::setBaseSum(baseSum);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the file for reading.
    SamFile samIn;
    if(!samIn.OpenForRead(inFile))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    samIn.SetReadFlags(requiredFlags, excludeFlags);

    // Set whether or not basic statistics should be generated.
    samIn.GenerateStatistics(basic);

    // Read the sam header.
    SamFileHeader samHeader;
    if(!samIn.ReadHeader(samHeader))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    // Open the bam index file for reading if we are
    // doing unmapped reads or reading by region
    // (also set the read section for unmapped reads).
    if(useIndex)
    {
        samIn.ReadBamIndex(indexFile);

        if(unmapped)
        {
            samIn.SetReadSection(-1);
        }

        if(!regionList.IsEmpty())
        {
            myRegionList = ifopen(regionList, "r");
        }
    }

    //////////////////////////
    // Read dbsnp if specified and doing baseQC
    // NOTE(review): dbsnpListPtr is never deleted in this function; confirm
    // that nothing still refers to it before adding a delete.
    if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty()))
    {
        // Read the dbsnp file.
        IFILE fdbSnp;
        fdbSnp = ifopen(dbsnp,"r");
        // Size the position list: one more than the longest reference.
        const SamReferenceInfo& refInfo = samHeader.getReferenceInfo();
        int maxRefLen = 0;
        for(int i = 0; i < refInfo.getNumEntries(); i++)
        {
            int refLen = refInfo.getReferenceLength(i);
            if(refLen >= maxRefLen)
            {
                maxRefLen = refLen + 1;
            }
        }
        
        dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen);

        if(fdbSnp==NULL)
        {
            std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n";
        }
        else if(dbsnpListPtr == NULL)
        {
            std::cerr << "Failed to init the memory allocation for the dbsnpList.\n";
        }
        else
        {
            // Read the dbsnp file.
            StringArray tokens;
            String buffer;
            int position = 0;
            int refID = 0;

            // Loop til the end of the file.
            while (!ifeof(fdbSnp))
            {
                // Start every line from a clean state.  AddTokens appends,
                // so clearing here (rather than only at the bottom of the
                // loop) keeps a skipped line from leaving stale tokens that
                // would shift the columns of every following line.
                tokens.Clear();
                buffer.Clear();

                // Read the next line.
                buffer.ReadLine(fdbSnp);
                // Skip blank lines and comment lines.
                if (buffer.IsEmpty() || buffer[0] == '#') continue;
                // If it does not have at least 2 columns, 
                // continue to the next line.
                tokens.AddTokens(buffer);
                if(tokens.Length() < 2) continue;

                if(!tokens[1].AsInteger(position))
                {
                    std::cerr << "Improperly formatted region line, start position "
                              << "(2nd column) is not an integer: "
                              << tokens[1]
                              << "; Skipping to the next line.\n";         
                    continue;
                }

                // Look up the reference name.
                refID = samHeader.getReferenceID(tokens[0]);
                if(refID != SamReferenceInfo::NO_REF_ID)
                {
                    // Reference id was found, so add it to the dbsnp
                    dbsnpListPtr->addPosition(refID, position);
                }
            }
        }
        ifclose(fdbSnp);
    }

    // Read the sam records.
    SamRecord samRecord;

    int numReads = 0;

    //////////////////////
    // Setup in case doing a quality count.
    // Quality histogram, indexed by the ASCII value of the quality character.
    const int MAX_QUAL = 126;
    const int START_QUAL = 33;
    uint64_t qualCount[MAX_QUAL+1];
    for(int i = 0; i <= MAX_QUAL; i++)
    {
        qualCount[i] = 0;
    }
    
    // Phred histogram, indexed by (ASCII value - PHRED_DIFF).
    const int START_PHRED = 0;
    const int PHRED_DIFF = START_QUAL - START_PHRED;
    const int MAX_PHRED = MAX_QUAL - PHRED_DIFF;
    uint64_t phredCount[MAX_PHRED+1];
    for(int i = 0; i <= MAX_PHRED; i++)
    {
        phredCount[i] = 0;
    }
    
    int refPos = 0;
    Cigar* cigarPtr = NULL;
    char cigarChar = '?';
    // Exclude clips from the qual/phred counts if unmapped reads are excluded.
    bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED;

    //////////////////////////////////
    // When not reading by sections, getNextSection returns true
    // the first time, then false the next time.
    while(getNextSection(samIn))
    {
        // Keep reading records from the file until SamFile::ReadRecord
        // indicates to stop (returns false).
        while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord))
        {
            // Another record was read, so increment the number of reads.
            ++numReads;
            // See if the quality histogram should be generated.
            if(qual || phred)
            {
                // Get the quality string.  It is deliberately NOT named
                // 'qual' so that the --qual flag stays visible below.
                const char* qualStr = samRecord.getQuality();
                // Check for no quality ('*').
                if((qualStr[0] == '*') && (qualStr[1] == 0))
                {
                    // This record does not have a quality string, so no 
                    // quality processing is necessary.
                }
                else
                {
                    int index = 0;
                    cigarPtr = samRecord.getCigarInfo();
                    cigarChar = '?';
                    refPos = samRecord.get0BasedPosition();
                    if(!qualExcludeClips && (cigarPtr != NULL))
                    {
                        // Offset the reference position by any soft clips
                        // by subtracting the queryIndex of this start position.
                        // refPos is now the start position of the clips.
                        refPos -= cigarPtr->getQueryIndex(0);
                    }

                    while(qualStr[index] != 0)
                    {
                        // Skip this quality if it is clipped and we are skipping clips.
                        if(cigarPtr != NULL)
                        {
                            cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index);
                        }
                        if(qualExcludeClips && Cigar::isClip(cigarChar))
                        {
                            // Skip a clipped quality.  refPos is left alone:
                            // when clips are excluded it was not offset to
                            // the start of the clips above.
                            ++index;
                            continue;
                        }

                        if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos))
                        {
                            // We have hit the end of the region, stop processing this
                            // quality string.
                            break;
                        }

                        if(withinRegion && (refPos < myStartPos))
                        {
                            // This position is not in the target.
                            ++index;
                            // Update the position if this is found in the reference or a clip.
                            if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                            {
                                ++refPos;
                            }
                            continue;
                        }

                        // Check for valid quality.
                        if((qualStr[index] < START_QUAL) || (qualStr[index] > MAX_QUAL))
                        {
                            // Report in the scale(s) the user asked for.
                            if(qual)
                            {
                                std::cerr << "Invalid Quality found: " << qualStr[index] 
                                          << ".  Must be between "
                                          << START_QUAL << " and " << MAX_QUAL << ".\n";
                            }
                            if(phred)
                            {
                                std::cerr << "Invalid Phred Quality found: " << qualStr[index] - PHRED_DIFF
                                          << ".  Must be between "
                                          << START_PHRED << " and " << MAX_PHRED << ".\n";
                            }
                            // Skip an invalid quality.
                            ++index;
                            // Update the position if this is found in the reference or a clip.
                            if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                            {
                                ++refPos;
                            }
                            continue;
                        }
                        
                        // Increment the count for this quality.
                        ++(qualCount[(int)(qualStr[index])]);
                        ++(phredCount[(int)(qualStr[index]) - PHRED_DIFF]);
                        // Update the position if this is found in the reference or a clip.
                        if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                        {
                            ++refPos;
                        }
                        ++index;
                    }
                }
            }

            // Check the next thing to do for the read.
            if((baseQCPtr != NULL) || baseSum)
            {
                // Pileup the bases for this read.
                pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr);
            }
        }

        // Done with a section, move on to the next one.

        // New section, so flush the pileup.
        pileup.flushPileup();
    }

    // Flush the rest of the pileup.
    if((baseQCPtr != NULL) || baseSum)
    {
        // Pileup the bases.
        pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr);
        PileupElementBaseQCStats::printSummary();
        ifclose(baseQCPtr);
    }

    std::cerr << "Number of records read = " << 
        samIn.GetCurrentRecordCount() << std::endl;

    if(basic)
    {
        std::cerr << std::endl;
        samIn.PrintStatistics();
    }

    // Print the quality stats.
    if(qual)
    {
        std::cerr << std::endl;
        std::cerr << "Quality\tCount\n";
        for(int i = START_QUAL; i <= MAX_QUAL; i++)
        {
            std::cerr << i << "\t" << qualCount[i] << std::endl;
        }
    }
    // Print the phred quality stats.
    if(phred)
    {
        std::cerr << std::endl;
        std::cerr << "Phred\tCount\n";
        for(int i = START_PHRED; i <= MAX_PHRED; i++)
        {
            std::cerr << i << "\t" << phredCount[i] << std::endl;
        }
    }

    SamStatus::Status status = samIn.GetStatus();
    if(status == SamStatus::NO_MORE_RECS)
    {
        // A status of NO_MORE_RECS means that all reads were successful.
        status = SamStatus::SUCCESS;
    }

    return(status);
}
Example #12
0
// Entry point of the FastQ validator.
//
// Parses the command line, configures a FastQFile validator from the
// options and validates the file given by --file.
//
// Returns -1 if --file is missing or if --interleaved and
// --disableSeqIDCheck are both specified, otherwise the FastQStatus::Status
// returned by FastQFile::validateFastQFile.
int main(int argc, char ** argv)
{   
   ParameterList inputParameters;
   String filename;
   int minReadLength = 10;
   int printableErrors = 20;
   int maxErrors = -1;
   BaseAsciiMap::SPACE_TYPE myBaseType = BaseAsciiMap::UNKNOWN;
   
   // Read the parameters from the command line.
   bool baseSpace = false;
   bool colorSpace = false;
   bool autoDetect = false;
   bool ignoreErrors = false;
   bool baseComposition = false;
   bool avgQual = false;
   bool quiet = false;
   bool noeof = false;
   bool params = false;
   bool disableSeqIDCheck = false;
   bool interleaved = false;

   // The legacy section registers the old option names against the same
   // variables as the current option names.
   BEGIN_LONG_PARAMETERS(longParameterList)
      LONG_STRINGPARAMETER("file", &filename)
      LONG_PARAMETER("baseComposition", &baseComposition)
      LONG_PARAMETER("avgQual", &avgQual)
      LONG_PARAMETER("disableSeqIDCheck", &disableSeqIDCheck)
      LONG_PARAMETER("interleaved", &interleaved)
      LONG_PARAMETER("noeof", &noeof)
      LONG_PARAMETER("quiet", &quiet)
      LONG_PARAMETER("params", &params)
      LONG_INTPARAMETER("minReadLen", &minReadLength)
      LONG_INTPARAMETER("maxErrors", &maxErrors)
      LONG_PARAMETER_GROUP("Space Type")
         EXCLUSIVE_PARAMETER("baseSpace", &baseSpace)
         EXCLUSIVE_PARAMETER("colorSpace", &colorSpace)
         EXCLUSIVE_PARAMETER("auto", &autoDetect)
      LONG_PARAMETER_GROUP("Errors")
         EXCLUSIVE_PARAMETER("ignoreErrors", &ignoreErrors)
         LONG_SMARTINTPARAMETER("printableErrors", &printableErrors)
   BEGIN_LEGACY_PARAMETERS()
      LONG_PARAMETER("printBaseComp", &baseComposition)
      LONG_PARAMETER("disableAllMessages", &quiet)
      LONG_INTPARAMETER("quitAfterErrorNum", &maxErrors)
      LONG_PARAMETER_GROUP("Space Type")
         EXCLUSIVE_PARAMETER("baseSpace", &baseSpace)
         EXCLUSIVE_PARAMETER("colorSpace", &colorSpace)
         EXCLUSIVE_PARAMETER("autoDetect", &autoDetect)
      LONG_PARAMETER_GROUP("Errors")
         EXCLUSIVE_PARAMETER("ignoreAllErrors", &ignoreErrors)
         LONG_SMARTINTPARAMETER("maxReportedErrors", &printableErrors)
   END_LONG_PARAMETERS();
   
   inputParameters.Add(new LongParameters ("Input Parameters", longParameterList));

   inputParameters.Read(argc, argv);

   if(ignoreErrors)
   {
      // Ignore all errors, so set printableErrors to 0.
      printableErrors = 0;
   }

   // Set the base type based on the passed in parameters.
   if(baseSpace)
   {
      // Base Space
      myBaseType = BaseAsciiMap::BASE_SPACE;
   }
   else if(colorSpace)
   {
      // Color Space
      myBaseType = BaseAsciiMap::COLOR_SPACE;
   }
   else
   {
      // Neither space was explicitly requested, so fall back to
      // auto detection.
      myBaseType = BaseAsciiMap::UNKNOWN;
      autoDetect = true;
   }

   // If no eof block is required for a bgzf file, set the bgzf file type to 
   // not look for it.
   if(noeof)
   {
       // Set that the eof block is not required.
       BgzfFileType::setRequireEofBlock(false);
   }

   // Do not print status if set to quiet.
   if((!quiet) && params)
   {
      inputParameters.Status();
   }

   if(filename == "")
   {
      if(quiet)
      {
         // Quiet mode: fail without printing the usage.
         return(-1);
      }
      // No filename was specified so print a usage description.
      std::cout << "ERROR: No filename specified.  See below for usage help.";
      std::cout << std::endl << std::endl;

      std::cout << "  Required Parameters:" << std::endl;
      std::cout << "\t--file  :  FastQ filename with path to be processed.\n";
      std::cout << std::endl;

      std::cout << "  Optional Parameters:" << std::endl;
      std::cout << "\t--minReadLen         : Minimum allowed read length (Defaults to 10).\n";
      std::cout << "\t--maxErrors          : Number of errors to allow before quitting\n";
      std::cout << "\t                       reading/validating the file.\n";
      std::cout << "\t                       -1 (default) indicates to not quit until\n";
      std::cout << "\t                       the entire file is read.\n";
      std::cout << "\t                       0 indicates not to read/validate anything\n";
      std::cout << "\t--printableErrors    : Maximum number of errors to print before\n";
      std::cout << "\t                       suppressing them (Defaults to 20).\n";
      std::cout << "\t                       Different than maxErrors since \n";
      std::cout << "\t                       printableErrors will continue reading and\n";
      std::cout << "\t                       validating the file until the end, but\n";
      std::cout << "\t                       just doesn't print the errors.\n";
      std::cout << "\t--ignoreErrors       : Ignore all errors (same as printableErrors = 0)\n";
      std::cout << "\t                       overwrites the printableErrors option.\n";
      std::cout << "\t--baseComposition    : Print the Base Composition Statistics.\n";
      std::cout << "\t--avgQual            : Print the average phred quality per cycle & overall average quality.\n";
      std::cout << "\t--disableSeqIDCheck  : Disable the unique sequence identifier check.\n";
      std::cout << "\t                       Use this option to save memory since the sequence id\n";
      std::cout << "\t                       check uses a lot of memory.\n";
      std::cout << "\t--noeof              : Disable checking that the eof block is present in gzipped files.\n";
      std::cout << "\t--interleaved        : Validate consecutive reads have the same sequence identifier\n";
      std::cout << "\t                       (only allowed difference is 1/2, but not required) and validate\n";
      std::cout << "\t                       that otherwise reads have unique sequence identifiers.\n";
      std::cout << "\t                       Cannot be used if '--disableSeqIDCheck' is specified.\n";
      std::cout << "\t--params             : Print the parameter settings.\n";
      std::cout << "\t--quiet              : Suppresses the display of errors and summary statistics.\n";
      std::cout << "\t                       Does not affect the printing of Base Composition Statistics.\n";

      std::cout << "\n  Optional Space Options for Raw Sequence (Last one specified is used):\n";
      std::cout << "\t--auto       : Determine baseSpace/colorSpace from the Raw Sequence in the file (Default).\n";
      std::cout << "\t--baseSpace  : ACTGN only\n";
      std::cout << "\t--colorSpace : 0123. only\n";
      std::cout << std::endl;

      std::cout << "  Usage:" << std::endl;
      std::cout << "\t./fastQValidator --file <fileName> [--minReadLen <minReadLen>] [--maxErrors <numErrors>] [--printableErrors <printableErrors>|--ignoreErrors] [--baseComposition] [--avgQual] [--disableSeqIDCheck] [--interleaved] [--noeof] [--quiet] [--baseSpace|--colorSpace|--auto] [--params]\n\n";
      std::cout << "  Examples:" << std::endl;
      std::cout << "\t../fastQValidator --file testFile.txt\n";
      std::cout << "\t../fastQValidator --file testFile.txt --minReadLen 10 --baseSpace --printableErrors 100\n";
      std::cout << "\t./fastQValidator --file test/testFile.txt --minReadLen 10 --colorSpace --ignoreErrors\n";
      std::cout << std::endl;
      return (-1);
   }
   
   FastQFile validator(minReadLength, printableErrors);
   
   if(quiet)
   {
      validator.disableMessages();
   }

   if(disableSeqIDCheck)
   {
       validator.disableSeqIDCheck();
   }

   if(interleaved)
   {
       validator.interleaved();
   }

   // The two options are mutually exclusive (see the --interleaved help
   // text above), so reject the combination.
   if(interleaved && disableSeqIDCheck)
   {
       if(!quiet)
       {
           std::cout << "ERROR: --interleaved and --disableSeqIDCheck cannot both be specified.\n";
       }
       return(-1);
   }

   validator.setMaxErrors(maxErrors);

   FastQStatus::Status status = validator.validateFastQFile(filename, baseComposition, myBaseType, avgQual);

   if(!quiet)
   {
      std::cout << "Returning: " << status << " : " << FastQStatus::getStatusString(status)
                << std::endl;
   }

   return(status);
}
Example #13
0
// Dump the contents of the GLF file specified by --in to stdout.
//
// Prints the header text, then for every reference section its name and
// length followed by each record in that section.  The position printed
// for a record is the running sum of the record offsets since the start
// of its section.
//
// Returns -1 if --in was not specified, otherwise GlfStatus::SUCCESS.
int Dump::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    bool params = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER_GROUP("Optional Other Parameters")
        LONG_PARAMETER("params", &params)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    // Skip the first argument when parsing.
    inputParameters.Read(argc-1, &(argv[1]));

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        // mandatory argument was not specified.
        inputParameters.Status();
        std::cerr << "Missing mandatory argument: --in" << std::endl;
        return(-1);
    }
    if(params)
    {
        inputParameters.Status();
    }

    GlfFile glfIn;
    GlfHeader glfHeader;

    // Open the file for reading.   
    glfIn.openForRead(inFile);

    // Read the glf header.
    glfIn.readHeader(glfHeader);

    // Output the glf header.
    std::string headerText = "";
    glfHeader.getHeaderTextString(headerText);
    std::cout << "GlfHeader:\n";
    std::cout << headerText << std::endl;

    // Walk through each reference section in the file.
    GlfRefSection refSection;
    while(glfIn.getNextRefSection(refSection))
    {
        std::string refName;
        refSection.getName(refName);
        std::cout << "\tRefName = " << refName 
                  << "; RefLen = " << refSection.getRefLen() << "\n";

        // Print every record of this section.  The position restarts
        // at 0 for each section and is advanced by each record's offset.
        GlfRecord record;
        int pos = 0;
        while(glfIn.getNextRecord(record))
        {
            // Print the position.
            pos += record.getOffset();
            std::cout << "position: " << pos << "\n\t";
            record.print();
        }
    }

    // Nothing in this function records a failure, so success is
    // always reported once the dump loop is done.
    return(GlfStatus::SUCCESS);
}
Example #14
0
int GapInfo::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String outFile = "";
    String refFile = "";
    bool detailed = false;
    bool checkFirst = false;
    bool checkStrand = false;
    bool noeof = false;
    bool params = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_STRINGPARAMETER("out", &outFile)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_STRINGPARAMETER("refFile", &refFile)
        LONG_PARAMETER("detailed", &detailed)
        LONG_PARAMETER_GROUP("Optional Detailed Parameters")
        LONG_PARAMETER("checkFirst", &checkFirst)
        LONG_PARAMETER("checkStrand", &checkStrand)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Check to see if the out file was specified, if not, report an error.
    if(outFile == "")
    {
        usage();
        inputParameters.Status();
        // Out file was not specified but it is mandatory.
        std::cerr << "--out is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    return(processFile(inFile.c_str(), outFile.c_str(),
                       refFile, detailed, 
                       checkFirst, checkStrand));
}
Example #15
0
// Write the records of the --in SAM/BAM file that fall in the requested
// region(s) to the --out file.
//
// A region can be given by --refName or --refID (optionally with
// --start/--end) or by a --bed file; these three are mutually exclusive.
// Records can additionally be filtered by --readName and have their indels
// shifted left with --lshift.
//
// Returns -1 on a command line error (missing --in/--out, conflicting
// region options, or a bed file that cannot be opened), otherwise
// SamStatus::SUCCESS.
int WriteRegion::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String outFile = "";
    String indexFile = "";
    String readName = "";
    String bed = "";
    // Reset the region tracking members before parsing.
    myStart = UNSPECIFIED_INT;
    myEnd = UNSPECIFIED_INT;
    myPrevStart = UNSPECIFIED_INT;
    myPrevEnd = UNSPECIFIED_INT;
    myRefID = UNSET_REF;
    myRefName.Clear();
    myPrevRefName.Clear();
    myBedRefID = SamReferenceInfo::NO_REF_ID;
    bool lshift = false;
    bool noeof = false;
    bool params = false;
    myWithinReg = false;
    myWroteReg = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_STRINGPARAMETER("out", &outFile)
        LONG_PARAMETER_GROUP("Optional Region Parameters")        
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
        LONG_STRINGPARAMETER("refName", &myRefName)
        LONG_INTPARAMETER("refID", &myRefID)
        LONG_INTPARAMETER("start", &myStart)
        LONG_INTPARAMETER("end", &myEnd)
        LONG_STRINGPARAMETER("bed", &bed)
        LONG_PARAMETER("withinReg", &myWithinReg)
        LONG_STRINGPARAMETER("readName", &readName)
        LONG_PARAMETER_GROUP("Optional Other Parameters")
        LONG_PARAMETER("lshift", &lshift)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    // Skip the first argument when parsing.
    inputParameters.Read(argc-1, &(argv[1]));

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        // mandatory argument was not specified.
        inputParameters.Status();
        std::cerr << "Missing mandatory argument: --in" << std::endl;
        return(-1);
    }
    if(outFile == "")
    {
        usage();
        // mandatory argument was not specified.
        inputParameters.Status();
        std::cerr << "Missing mandatory argument: --out" << std::endl;
        return(-1);
    }
    
    if(indexFile == "")
    {
        // Index file was not specified, so set it to the in file
        // + ".bai"
        indexFile = inFile + ".bai";
    }

    // Only one way of specifying the region is allowed.
    if(myRefID != UNSET_REF && myRefName.Length() != 0)
    {
        std::cerr << "Can't specify both refID and refName" << std::endl;
        inputParameters.Status();
        return(-1);
    }
    if(myRefID != UNSET_REF && bed.Length() != 0)
    {
        std::cerr << "Can't specify both refID and bed" << std::endl;
        inputParameters.Status();
        return(-1);
    }
    if(myRefName.Length() != 0 && bed.Length() != 0)
    {
        std::cerr << "Can't specify both refName and bed" << std::endl;
        inputParameters.Status();
        return(-1);
    }

    if(!bed.IsEmpty())
    {
        myBedFile = ifopen(bed, "r");
        if(myBedFile == NULL)
        {
            // Without this check a bed file that cannot be opened would
            // be treated the same as no bed file having been specified.
            std::cerr << "Failed to open the bed file: " << bed << std::endl;
            return(-1);
        }
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the file for reading.   
    mySamIn.OpenForRead(inFile);

    // Open the output file for writing.
    SamFile samOut;
    samOut.OpenForWrite(outFile);

    // Open the bam index file for reading if a region was specified.
    if((myRefName.Length() != 0) || (myRefID != UNSET_REF) || (myBedFile != NULL))
    {
        mySamIn.ReadBamIndex(indexFile);
    }

    // Read & write the sam header.
    mySamIn.ReadHeader(mySamHeader);
    samOut.WriteHeader(mySamHeader);

    // Read the sam records.
    SamRecord samRecord;
    // Number of records written, accumulated over all sections.
    int numSectionRecords = 0;

    // Status returned to the caller.  Nothing in this function
    // currently changes it from success.
    SamStatus::Status returnStatus = SamStatus::SUCCESS;
        
    while(getNextSection())
    {
        // Keep reading records until they aren't anymore.
        while(mySamIn.ReadRecord(mySamHeader, samRecord))
        {
            if(!readName.IsEmpty())
            {
                // Check for readname.
                if(strcmp(samRecord.getReadName(), readName.c_str()) != 0)
                {
                    // not a matching read name, so continue to the next record.
                    continue;
                }
            }
            
            // Check to see if the read has already been processed.
            if(myPrevEnd != UNSPECIFIED_INT)
            {
                // Because we already know that the bed was sorted, 
                // we know that the previous section started before
                // this one, so if the previous end is greater than
                // this record's end position we know that it
                // was already written in the previous section.
                // Note: can't be equal to the previous end since
                // the end range was exclusive, while
                // get0BasedAlignmentEnd is inclusive.
                // myPrevEnd is reset by getNextSection when a new
                // chromosome is hit.
                if(samRecord.get0BasedAlignmentEnd() < myPrevEnd)
                {
                    // This record was already written.
                    continue;
                }
            }

            // Shift left if applicable.
            if(lshift)
            {
                samRecord.shiftIndelsLeft();
            }

            // Successfully read a record from the file, so write it.
            samOut.WriteRecord(mySamHeader, samRecord);
            ++numSectionRecords;
        }
        myWroteReg = true;
    }

    if(myBedFile != NULL)
    {
        ifclose(myBedFile);
        // Do not leave a pointer to the closed file behind.
        myBedFile = NULL;
    }
    std::cerr << "Wrote " << outFile << " with " << numSectionRecords
              << " records.\n";
    return(returnStatus);
}
Example #16
0
// Validate the SAM/BAM file specified by --in.
//
// Every record is read and checked with SamValidator::isValid.  Read
// failures and validation failures are counted, optionally printed
// (--verbose, limited by --printableErrors) and summarized on stderr.
// Reading stops early once --maxErrors error records have been seen; a
// negative maxErrors (the default) means there is no limit.
//
// Returns -1 if --in is missing, the open/header-read failure status if
// the file cannot be opened or its header cannot be read, otherwise the
// status of the first failure encountered (SamStatus::INVALID for a
// validation failure) or SamStatus::SUCCESS if there was none.
int Validate::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    int maxErrors = -1;
    int printableErrors = 100;
    bool so_flag = false;
    bool so_coord = false;
    bool so_query = false;
    bool noeof = false;
    bool disableStatistics = false;
    bool verbose = false;
    bool params = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER("noeof", &noeof)
        LONG_INTPARAMETER("maxErrors", &maxErrors)
        LONG_PARAMETER("verbose", &verbose)
        LONG_INTPARAMETER("printableErrors", &printableErrors)
        LONG_PARAMETER("disableStatistics", &disableStatistics)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("SortOrder")
        EXCLUSIVE_PARAMETER("so_flag", &so_flag)
        EXCLUSIVE_PARAMETER("so_coord", &so_coord)
        EXCLUSIVE_PARAMETER("so_query", &so_query)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    // Skip the first argument when parsing.
    inputParameters.Read(argc-1, &(argv[1]));

    // Determine the sort type for validation based on the parameters.
    SamFile::SortedType sortType = SamFile::UNSORTED;
    if(so_flag)
    {
        sortType = SamFile::FLAG;
    } 
    else if(so_coord)
    {
        sortType = SamFile::COORDINATE;
    }
    else if(so_query)
    {
        sortType = SamFile::QUERY_NAME;
    }
   
    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument for validate, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Since we want to accumulate multiple errors, use RETURN rather
    // than throwing exceptions.
    SamFile samIn(ErrorHandler::RETURN);
    // Open the file for reading.   
    if(!samIn.OpenForRead(inFile))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    // Set the sorting validation type.
    samIn.setSortedValidation(sortType);

    // Set that statistics should be generated.
    samIn.GenerateStatistics(!disableStatistics);

    // Read the sam header.
    SamFileHeader samHeader;
    if(!samIn.ReadHeader(samHeader))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    // Read the sam records.
    SamRecord samRecord(ErrorHandler::RETURN);

    // Track the status.  Holds the first failure that is encountered.
    SamStatus::Status status = SamStatus::SUCCESS;

    // Keep reading records until the end of the file is reached.
    int numValidRecords = 0;
    int numInvalidRecords = 0;
    int numErrorRecords = 0;
    int numRecords = 0;
    int numReportedErrors = 0;
    int totalErrorRecords = 0;

    std::map<SamStatus::Status, uint64_t> errorStats;
    std::map<SamValidationError::Type, uint64_t> invalidStats;

    SamValidationErrors invalidSamErrors;

    // A negative maxErrors means there is no limit on the number of errors.
    const bool unlimitedErrors = (maxErrors < 0);

    // Keep reading records from the file until SamFile::ReadRecord
    // indicates to stop (returns false with a status that is not
    // continuable) or the maximum number of errors has been reached.
    while( ( unlimitedErrors || (totalErrorRecords < maxErrors) ) &&
           ( (samIn.ReadRecord(samHeader, samRecord)) || (SamStatus::isContinuableStatus(samIn.GetStatus())) ) )
    {
        ++numRecords;
        if(samIn.GetStatus() == SamStatus::SUCCESS)
        {
            // Successfully set the record, so check to see if it is valid.
            // Clear any errors in the list.
            invalidSamErrors.clear();
            if(!SamValidator::isValid(samHeader, samRecord, invalidSamErrors))
            {
                // The record is not valid.
                ++numInvalidRecords;
                ++totalErrorRecords;
                if(verbose && (numReportedErrors < printableErrors))
                {
                    std::cerr << "Record " << numRecords << std::endl
                              << invalidSamErrors << std::endl;
                    ++numReportedErrors;
                }
                // Update the statistics for all validation errors found in this record.
                invalidSamErrors.resetErrorIter();
                const SamValidationError* errorPtr = invalidSamErrors.getNextError();
                while(errorPtr != NULL)
                {
                    ++invalidStats[errorPtr->getType()];
                    errorPtr = invalidSamErrors.getNextError();
                }

                // If the status is not yet set, set it.
                if(status == SamStatus::SUCCESS)
                {
                    status = SamStatus::INVALID;
                }
            }
            else
            {
                // Valid record, so increment the counter.
                ++numValidRecords;
            }
        }
        else
        {
            // Error reading the record.
            ++numErrorRecords;
            ++totalErrorRecords;
            if(verbose && (numReportedErrors < printableErrors))
            {
                // report error.
                std::cerr << "Record " << numRecords << std::endl
                          << samIn.GetStatusMessage() << std::endl
                          << std::endl;
                ++numReportedErrors;
            }
            // Increment the statistics
            ++errorStats[samIn.GetStatus()];

            // If the status is not yet set, set it.
            if(status == SamStatus::SUCCESS)
            {
                status = samIn.GetStatus();
            }
        }
    }

    // A read failure that ended the loop has not been counted yet, since
    // the loop body is not entered for it.  The unlimited (negative
    // maxErrors) case has to be checked explicitly: totalErrorRecords is
    // never less than a negative maxErrors, so without it the failure
    // would be dropped and success could be returned.
    if( (samIn.GetStatus() != SamStatus::NO_MORE_RECS) &&
        ( unlimitedErrors || (totalErrorRecords < maxErrors) ) )
    {
        // The last read call had a failure, so report it.
        // If the number of errors is >= maxErrors we don't
        // want to print any more failures.
        ++numErrorRecords;
        ++totalErrorRecords;
        if(numReportedErrors < printableErrors)
        {
            std::cerr << "Record " << numRecords << ": ";
            std::cerr << std::endl << samIn.GetStatusMessage() << std::endl;
        }

        // Increment the statistics
        ++errorStats[samIn.GetStatus()];

        if(status == SamStatus::SUCCESS)
        {
            status = samIn.GetStatus();
        }
    }

    if(totalErrorRecords == maxErrors)
    {
        if(maxErrors == 0)
        {
            std::cerr << "WARNING file was not read at all due to maxErrors setting, but returning Success.\n";
        }
        else
        {
            // Print a note that the entire file was not read.
            std::cerr << "File was not completely read due to the number of errors.\n";
            std::cerr << "Statistics only reflect the part of the file that was read.\n";
        }
    }

    fprintf(stderr, "\nNumber of records read = %d\n", numRecords);
    fprintf(stderr, "Number of valid records = %d\n", numValidRecords);

    std::cerr << std::endl;
    if(numRecords != numValidRecords)
    {
        std::cerr << "Error Counts:\n";

        // Loop through the non-validation errors.
        std::map<SamStatus::Status, uint64_t>::iterator statusIter;
        for(statusIter = errorStats.begin(); statusIter != errorStats.end(); statusIter++)
        {
            std::cerr << "\t" << SamStatus::getStatusString(statusIter->first) << ": "
                      << statusIter->second << std::endl;
        }

        // Loop through the validation errors.
        std::map<SamValidationError::Type, uint64_t>::iterator invalidIter;
        for(invalidIter = invalidStats.begin(); invalidIter != invalidStats.end(); invalidIter++)
        {
            std::cerr << "\t" << SamValidationError::getTypeString(invalidIter->first) << ": "
                      << invalidIter->second << std::endl;
        }

        std::cerr << std::endl;
    }
    samIn.PrintStatistics();

    fprintf(stderr, "Returning: %d (%s)\n", status, SamStatus::getStatusString(status));
    return(status);
}
Example #17
0
int main(int argc, char ** argv)
{
	// Parameter Options

    String refHaps = "";
	String haps = "", snps = "",removeSam="";
	String outfile = "Minimac3.Output";
	String format = "GT,DS";
	String recFile = "", errFile = "",chr="",golden="";
	int cpus=1,start=0, end=0, window=0, max_indiv = 0, max_marker = 0, rounds=5, states=200;
    vector<bool> formatVector(3,false);
    #ifdef _OPENMP
    cpus=5;
    #endif

	bool log = false, duplicates=false, unphasedOutput=false, phased = false,passOnly = false, doseOutput = false, vcfOutput = true, gzip = true, nobgzip = false, rsid=false;
	bool processReference=false,updateModel=false, typedOnly=false, help = false, params = false;
    String MyChromosome="";


	ParameterList inputParameters;
	PhoneHome::allThinning = 50;

	BEGIN_LONG_PARAMETERS(longParameterList)
		LONG_PARAMETER_GROUP("Reference Haplotypes")
		LONG_STRINGPARAMETER("refHaps", &refHaps)
		LONG_PARAMETER("passOnly", &passOnly)
		LONG_PARAMETER("rsid", &rsid)
		LONG_PARAMETER_GROUP("Target Haplotypes")
		LONG_STRINGPARAMETER("haps", &haps)
//		LONG_STRINGPARAMETER("snps", &snps)
		LONG_PARAMETER_GROUP("Output Parameters")
		LONG_STRINGPARAMETER("prefix", &outfile)
		LONG_PARAMETER("processReference", &processReference)
		LONG_PARAMETER("updateModel", &updateModel)
		LONG_PARAMETER("nobgzip", &nobgzip)
		LONG_PARAMETER("vcfOutput", &vcfOutput)
		LONG_PARAMETER("doseOutput", &doseOutput)
		LONG_PARAMETER("hapOutput", &phased)
		LONG_STRINGPARAMETER("format", &format)
		LONG_PARAMETER("allTypedSites", &typedOnly)
		LONG_PARAMETER_GROUP("Subset Parameters")
		LONG_STRINGPARAMETER("chr", &chr)
		LONG_INTPARAMETER("start", &start)
		LONG_INTPARAMETER("end", &end)
		LONG_INTPARAMETER("window", &window)
		//LONG_INTPARAMETER("block", &max_block)
		LONG_PARAMETER_GROUP("Starting Parameters")
		LONG_STRINGPARAMETER("rec", &recFile)
		LONG_STRINGPARAMETER("err", &errFile)
		LONG_PARAMETER_GROUP("Estimation Parameters")
		LONG_INTPARAMETER("rounds", &rounds)
		LONG_INTPARAMETER("states", &states)
		LONG_PARAMETER_GROUP("Other Parameters")
		LONG_PARAMETER("log", &log)
		LONG_PARAMETER("help", &help)
		LONG_INTPARAMETER("cpus", &cpus)
		LONG_PARAMETER("params", &params)
		LONG_PHONEHOME(VERSION)
		BEGIN_LEGACY_PARAMETERS()
		LONG_STRINGPARAMETER("MyChromosome", &MyChromosome)
//		LONG_INTPARAMETER("transFactor", &transFactor)
//		LONG_INTPARAMETER("cisFactor", &cisFactor)
		LONG_INTPARAMETER("sample", &max_indiv)
		LONG_INTPARAMETER("marker", &max_marker)
		LONG_PARAMETER("duplicates", &duplicates)
		LONG_PARAMETER("unphasedOutput", &unphasedOutput)
		END_LONG_PARAMETERS();


	inputParameters.Add(new LongParameters(" Command Line Options: ",longParameterList));

    String compStatus;
	inputParameters.Read(argc, &(argv[0]));

    FILE *LogFile=NULL;
    if(log)
        LogFile=freopen(outfile+".logfile","w",stdout);
    dup2(fileno(stdout), fileno(stderr));


    Minimac3Version();
	if (help)
	{
		helpFile();
		return(-1);
	}

	inputParameters.Status();

    #ifdef _OPENMP
        omp_set_num_threads(cpus);
    #else
        cpus=1;
    #endif


    if(nobgzip)
        gzip=false;

    cout<<endl<<endl;
	if (refHaps == "")
	{
		cout<< " Missing \"--refHaps\", a required parameter.\n";
		cout<< " Type \"--help\" for more help.\n\n";
		compStatus="Command.Line.Error";
		PhoneHome::completionStatus(compStatus.c_str());
		return(-1);
	}


    if(processReference)
    {

        cout<<" NOTE: Since \"--processReference\" is ON, all options under \"Target Haplotypes\" \n";
        cout<<"       and \"Starting Parameters\" will be ignored !!!\n";
        cout<<"       Program will only estimate parameters and create M3VCF file.\n";
        cout<<"       No imputation will be performed, hence other parameters are unnecessary !!!"<<endl<<endl;

        cout<<" NOTE: If \"--processReference\" is ON, Parameter Estimation will be done by default ! \n";
        cout<<"       Use \"--rounds 0\" to AVOID Parameter Estimation !!!\n"<<endl<<endl;

        if(updateModel)
        {

            cout<<" Handle \"--updateModel\" does NOT work with handle \"--processReference\" !!! \n";
            cout<<" Type \"--help\" for more help.\n\n";
            compStatus="Command.Line.Error";
            PhoneHome::completionStatus(compStatus.c_str());
            return(-1);
		}

    }

    if(updateModel)
    {

        cout<<" NOTE: Handle \"--updateModel\" works only on M3VCF files ! \n";
        cout<<"       Program will NOT run if \"--refHaps\" is a VCF file !!!\n"<<endl;

        if(rounds<=0)
        {
            cout << " Invalid input for \"--rounds\" = "<<rounds<<"\n";;
            cout << " Value must be POSITIVE if \"--updateModel\" is ON !!! \n\n";
            compStatus="Command.Line.Error";
            PhoneHome::completionStatus(compStatus.c_str());
            return(-1);
        }
        if(states<=0)
        {
            cout << " Invalid input for \"--states\" = "<<states<<"\n";;
            cout << " Value must be POSITIVE if \"--updateModel\" is ON !!! \n\n";
            compStatus="Command.Line.Error";
            PhoneHome::completionStatus(compStatus.c_str());
            return(-1);
        }
    }

    if(rounds<0)
    {
        cout << " Invalid input for \"--rounds\" = "<<rounds<<"\n";;
        cout << " Value must be non-negative !!! \n\n";
        compStatus="Command.Line.Error";
		PhoneHome::completionStatus(compStatus.c_str());
		return(-1);
    }
    if(states<0)
    {
        cout << " Invalid input for \"--states\" = "<<states<<"\n";;
        cout << " Value must be non-negative !!! \n\n";
        compStatus="Command.Line.Error";
		PhoneHome::completionStatus(compStatus.c_str());
		return(-1);
    }


    if(window<0)
    {
        cout << " Invalid input for \"--window\" = "<<window<<"\n";;
        cout << " Value must be non-negative !!! \n\n";
        compStatus="Command.Line.Error";
		PhoneHome::completionStatus(compStatus.c_str());
		return(-1);
    }

    if(start<0)
    {
        cout << " Invalid input for \"--start\" = "<<start<<"\n";;
        cout << " Value must be non-negative !!! \n\n";
        compStatus="Command.Line.Error";
		PhoneHome::completionStatus(compStatus.c_str());
		return(-1);
    }
    if(end<0)
    {
        cout << " Invalid input for \"--end\" = "<<end<<"\n";;
        cout << " Value must be non-negative !!! \n\n";
        compStatus="Command.Line.Error";
		PhoneHome::completionStatus(compStatus.c_str());
		return(-1);
    }
    if(start>0 && end> 0 && start>=end)
    {
        cout << " Invalid Input !!!\n Value of \"--start\" must be less than value of \"--end\"."<<endl;
        cout << " User Input \"--start\" = "<<start<<" and \"--end\" = " <<end<<" \n\n";
        compStatus="Command.Line.Error";
		PhoneHome::completionStatus(compStatus.c_str());
		return(-1);
    }
    if(start>0)
    {
        if(chr=="")
        {
            cout << "\n Missing \"--chr\", a required parameter if using \"--start\" parameter.\n";
            cout << " Try --help for more information.\n\n";
            compStatus="Command.Line.Error";
            PhoneHome::completionStatus(compStatus.c_str());
            return(-1);
        }
        if(end==0)
        {
            cout << "\n Non-zero value of \"--end\" required parameter if using \"--start\" parameter.\n";
            cout << " Try --help for more information.\n\n";
            compStatus="Command.Line.Error";
            PhoneHome::completionStatus(compStatus.c_str());
            return(-1);
        }
        if(window==0)
        {
            window=500000;
            cout<<" NOTE: Default \"--window\" parameter to be used = 500000\n";
        }
    }
    if(end>0)
    {
        if(chr=="")
        {
            cout << "\n Missing \"--chr\", a required parameter if using \"--end\" parameter.\n";
            cout << " Try --help for more information.\n\n";
            compStatus="Command.Line.Error";
            PhoneHome::completionStatus(compStatus.c_str());
            return(-1);
        }
        if(start==0)
        {
            cout << "\n Non-zero value of \"--start\" required parameter if using \"--end\" parameter.\n";
            cout << " Try --help for more information.\n\n";
            compStatus="Command.Line.Error";
            PhoneHome::completionStatus(compStatus.c_str());
            return(-1);
        }
        if(window==0)
        {
            window=500000;
            cout<<" NOTE: Default \"--window\" parameter to be used = 500000\n";
        }
    }
    if(window>0)
    {

        if(chr=="")
        {
            cout << "\n Missing \"--chr\", a required parameter if using \"--end\" parameter.\n";
            cout << " Try --help for more information.\n\n";
            compStatus="Command.Line.Error";
            PhoneHome::completionStatus(compStatus.c_str());
            return(-1);
        }
        if(start==0 && end==0)
        {
            cout << "\n Missing \"--start\" or  \"--end\", a required parameter if using \"--window\" parameter.\n";
            cout << " Try --help for more information.\n\n";
            compStatus="Command.Line.Error";
            PhoneHome::completionStatus(compStatus.c_str());
            return(-1);
        }
    }
    else
    {
        if(start>0 || end>0)
         {
            cout<<" NOTE: No \"--window\" parameter provided  !!! \n";
            cout<<"       No buffer region will be used on either side of the chunk"<<endl<<endl;
        }
    }


    if(!processReference)
    {

        if (haps == "")
        {
            cout <<" Missing \"--haps\", a required parameter (for imputation).\n";
            cout <<" OR use \"--processReference\" to just process the reference panel.\n";
            cout<< " Type \"--help\" for more help.\n\n";
            compStatus="Command.Line.Error";
            PhoneHome::completionStatus(compStatus.c_str());
            return(-1);
        }

        string formatPiece,formatTemp=format.c_str();
        char *end_str1;

        for(char * pch = strtok_r ((char*)formatTemp.c_str(),",", &end_str1);
            pch!=NULL;
            pch = strtok_r (NULL, ",", &end_str1))
        {

            formatPiece=(string)pch;
            if(formatPiece.compare("GT")==0)
                formatVector[0]=true;
            else if(formatPiece.compare("DS")==0)
                formatVector[1]=true;
            else if(formatPiece.compare("GP")==0)
                formatVector[2]=true;
            else
            {
                cout << " Cannot identify handle for \"--format\" parameter : "<<formatPiece<<endl;
                cout << " Available handles GT, DS and GP (for genotype, dosage and posterior probability). \n\n";
                cout << " Type \"--help\" for more help.\n\n";
                compStatus="Command.Line.Error";
                PhoneHome::completionStatus(compStatus.c_str());
                return(-1);
            }
        }
    }

    HaplotypeSet target,reference;
    target.MyChromosome=(string)MyChromosome;
    reference.MyChromosome=(string)MyChromosome;
    reference.CPU=cpus;
    reference.Duplicates=duplicates;



	if(!processReference)
    {
        cout<<" ------------------------------------------------------------------------------"<<endl;
        cout<<"                       PRELIMINARY GWAS/TARGET FILE CHECK                      "<<endl;
        cout<<" ------------------------------------------------------------------------------"<<endl;


	    if (!target.BasicCheckForTargetHaplotypes(haps))
        {
            cout << "\n Program Exiting ... \n\n";
            compStatus="Target.Panel.Load.Error";
            PhoneHome::completionStatus(compStatus.c_str());
            return(-1);
        }

        cout<<" ------------------------------------------------------------------------------"<<endl;
        cout<<"                       PRELIMINARY REFERENCE FILE CHECK                        "<<endl;
        cout<<" ------------------------------------------------------------------------------"<<endl<<endl;


        cout << " Performing basic file check on Reference haplotype file ..." << endl;

        cout << "\n Checking File ..." << endl;

        if(reference.DetectReferenceFileType(refHaps).compare("NA")==0)
        {
            cout << "\n Program could NOT open file : " << refHaps << endl;
            cout << "\n Program Exiting ... \n\n";
            compStatus="Reference.Panel.Load.Error";
            PhoneHome::completionStatus(compStatus.c_str());
            return(-1);
        }
        cout << " File Exists ..." << endl;
        cout << "\n Checking File Format ..." << endl;

        if(reference.DetectReferenceFileType(refHaps).compare("Invalid")==0)
        {
            cout << "\n Reference File provided by \"--refHaps\" must be a VCF/M3VCF file !!! \n";
            cout << " Please check the following file : "<<refHaps<<endl;
            cout << "\n Program Exiting ... \n\n";
            compStatus="Reference.Panel.Load.Error";
            PhoneHome::completionStatus(compStatus.c_str());
            return(-1);
        }
        else if(reference.DetectReferenceFileType(refHaps).compare("m3vcf")==0)
        {
            cout<<"\n Reference File Format = M3VCF (Minimac3 VCF File) "<<endl;

            cout <<"\n NOTE: For M3VCF files, if parameter estimates are available in the file, \n";
               cout<<"       they will be used by default (RECOMMENDED !). If the user has reasons\n";
               cout<<"       to believe that updating the parameters would increase accuracy, they\n";
             cout<<"       should use handle \"--updateModel\" (not required in typical GWAS studies).\n";
               cout<<"       If estimates are NOT available in file, it will estimate by default."<<endl;


            if(rounds==0)
            {

                cout <<"\n NOTE: User has specified \"--rounds\" = 0 !!!\n";
                cout<<"       Please verify that the M3VCF file has parameter estimates in it.\n";
                cout<<"       Otherwise program will use default estimates leading to possibly inaccurate estimates."<<endl;


            }
            else
            {
                if(!updateModel)
                {
                 cout <<"\n NOTE: For M3VCF files, if estimates are available in file \n";
                    cout<<"       value of \"--rounds\" will be ignored unless user has\n";
                    cout<<"       \"--updateModel\" ON (since, otherwise estimates are\n";
                    cout<<"       not going to be updated and value of \"--rounds\" would\n";
                    cout<<"       not make sense) !!!"<<endl;

                }
            }


        }
        else if(reference.DetectReferenceFileType(refHaps).compare("vcf")==0)
        {
            cout<<"\n Reference File Format = VCF (Variant Call Format)"<<endl;

            cout <<"\n NOTE: For VCF files, parameter estimation will be done by default (unless \"--rounds\" = 0)."<< endl;

            if(updateModel && recFile=="" && errFile=="")
            {
                cout << "\n Handle \"--updateModel\" does NOT work for VCF reference file.\n";
                cout << " This works only for M3VCF files or when \"--rec\" or \"--err\" is provided.\n";
                cout << " For VCF files, parameter estimation will be done by default (unless \"--rounds\" = 0).\n";
                cout << " Please turn OFF \"--updateModel\" or use M3VCF file.\n";
                cout << " Try --help for more information.\n\n";
                compStatus="Command.Line.Error";
                PhoneHome::completionStatus(compStatus.c_str());
                return(-1);
            }

            if(rounds==0)
            {

                cout <<"\n NOTE: User has specified \"--rounds\" = 0 !!!\n";
                cout<<"       No parameter estimation will be done on VCF file.\n";
                cout<<"       Program will use default estimates leading to possibly inaccurate estimates."<<endl;
            }
        }
        cout<<endl;

    }

	int start_time = time(0);
	int time_prev = start_time;

    cout<<" ------------------------------------------------------------------------------"<<endl;
    cout<<"                           REFERENCE HAPLOTYPE PANEL                           "<<endl;
    cout<<" ------------------------------------------------------------------------------"<<endl;



    reference.updateCoeffs(transFactor,cisFactor);


	if (!reference.FasterLoadHaplotypes(refHaps, max_indiv, max_marker,chr,start,end,window,rsid,processReference,passOnly))
	{
		cout << "\n Program Exiting ... \n\n";
		compStatus="Reference.Panel.Load.Error";
        PhoneHome::completionStatus(compStatus.c_str());
        return(-1);

	}


    cout<<endl;
	if(processReference)
    {
	    Imputation thisDataFast(reference, reference, outfile, errFile, recFile, phased
                             , gzip, rsid, rounds, states, vcfOutput, doseOutput
                             , typedOnly,formatVector);
        thisDataFast.createEstimates(reference, reference, reference.optEndPoints, true);
	}

	int time_load = time(0) - time_prev;
	time_prev = time(0);



    cout << "\n Time taken to load reference haplotype set = " << time_load << " seconds."<<endl<<endl;


	if(!processReference)
    {
        cout<<" ------------------------------------------------------------------------------"<<endl;
        cout<<"                          TARGET/GWAS HAPLOTYPE PANEL                         "<<endl;
        cout<<" ------------------------------------------------------------------------------"<<endl;


	    if (!target.LoadTargetHaplotypes(haps, snps, reference.markerName,reference,typedOnly,passOnly))
        {

            cout << "\n Program Exiting ... \n\n";
            compStatus="Target.Panel.Load.Error";
            PhoneHome::completionStatus(compStatus.c_str());
            return(-1);
        }
        time_load = time(0) - time_prev;
        time_prev = time(0);
        cout << "\n Time taken to load target haplotype set = " << time_load << " seconds. "<<endl<<endl;
    }



    if(!processReference)
	{
	    Imputation thisDataFast(target, reference, outfile, errFile, recFile, phased
                             , gzip, rsid, rounds, states, vcfOutput, doseOutput
                             , typedOnly,formatVector,updateModel,unphasedOutput);

        thisDataFast.performImputation(target, reference, golden);
	}


    cout<<" ------------------------------------------------------------------------------"<<endl;
    cout<<"                                END OF PROGRAM                                 "<<endl;
    cout<<" ------------------------------------------------------------------------------"<<endl;

    time_load = time(0) - time_prev;
	int time_tot = time(0) - start_time;

    cout << "\n Program Successfully Implemented... \n ";


	printf("\n Total Run completed in %d hours, %d mins, %d seconds.\n",
		time_tot / 3600, (time_tot % 3600) / 60, time_tot % 60);

    cout<<"\n Thank You for using Minimac3 !!! "<<endl<<endl;

    if(log)
        fclose (LogFile);


    compStatus="Success";
    PhoneHome::completionStatus(compStatus.c_str());

	return 0;

}
Example #18
0
/**
 * MiniMac command line entry point.
 *
 * Steps performed, in order:
 *   1. Parse the command line options into local settings.
 *   2. Read the reference marker list and reference haplotypes.
 *   3. Read the target (framework) marker list and cross-reference it
 *      against the reference marker positions.
 *   4. Load the target haplotypes and compare allele frequencies.
 *   5. Optionally refine the error / recombination parameters over
 *      `rounds` rounds and save them.
 *   6. Write a draft .info file, impute every target haplotype, and write
 *      the dosage, optional phased output and final .info files.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments, handed to the parameter list reader
 * @return 0 when the run reaches the end of the function
 *
 * NOTE(review): error() is defined elsewhere; it presumably terminates the
 * program -- confirm, since the code after each call assumes the failing
 * condition did not occur.
 */
int main(int argc, char ** argv)
   {
   // Make stdout unbuffered so progress messages are visible immediately
   setbuf(stdout, NULL);

   time_t start = time(NULL);

   printf("MiniMac - Imputation into phased haplotypes\n"
          "(c) 2011 Goncalo Abecasis\n");
#ifdef __VERSION__
   printf("VERSION 5.0\n");
#else
   printf("UNDOCUMENTED RELEASE\n");
#endif

   // Defaults used when the corresponding option is not given
   int rounds = 5, states = 200, cpus = 0;
   bool em = false, gzip = false, phased = false;

   String referenceHaplotypes, referenceSnps;
   String haplotypes, snps;
   String prefix("minimac");
   // The --start / --stop options are commented out below, so these two
   // strings are only ever changed if ClipReference() modifies them.
   String firstMarker, lastMarker;

   String recombinationRates, errorRates;

   BEGIN_LONG_PARAMETERS(longParameters)
      LONG_PARAMETER_GROUP("Reference Haplotypes")
         LONG_STRINGPARAMETER("refHaps", &referenceHaplotypes)
         LONG_STRINGPARAMETER("refSnps", &referenceSnps)
      LONG_PARAMETER_GROUP("Target Haplotypes")
         LONG_STRINGPARAMETER("haps", &haplotypes)
         LONG_STRINGPARAMETER("snps", &snps)
      LONG_PARAMETER_GROUP("Starting Parameters")
         LONG_STRINGPARAMETER("rec", &recombinationRates)
         LONG_STRINGPARAMETER("erate", &errorRates)
      LONG_PARAMETER_GROUP("Parameter Fitting")
         LONG_INTPARAMETER("rounds", &rounds)
         LONG_INTPARAMETER("states", &states)
         LONG_PARAMETER("em", &em)
      LONG_PARAMETER_GROUP("Output Files")
         LONG_STRINGPARAMETER("prefix", &prefix)
         LONG_PARAMETER("phased", &phased)
         LONG_PARAMETER("gzip", &gzip)
//    LONG_PARAMETER_GROUP("Clipping Window")
//      LONG_STRINGPARAMETER("start", &firstMarker)
//      LONG_STRINGPARAMETER("stop", &lastMarker)
#ifdef _OPENMP
      LONG_PARAMETER_GROUP("Multi-Threading")
         LONG_INTPARAMETER("cpus", &cpus)
#endif
   END_LONG_PARAMETERS();

   ParameterList pl;

   pl.Add(new LongParameters("Command Line Options", longParameters));
   pl.Read(argc, argv);
   pl.Status();

#ifdef _OPENMP
   // Only override the OpenMP default when the user asked for a thread count
   if (cpus > 0)
      omp_set_num_threads(cpus);
#endif

   // Read marker list
   printf("Reading Reference Marker List ...\n");

   StringArray refMarkerList;
   refMarkerList.Read(referenceSnps);

   // Index markers: marker name -> position in the reference marker list
   StringIntHash referenceHash;
   for (int i = 0; i < refMarkerList.Length(); i++)
      referenceHash.Add(refMarkerList[i].Trim(), i);

   printf("  %d Markers in Reference Haplotypes...\n\n", refMarkerList.Length());

   // Load reference haplotypes
   printf("Loading reference haplotypes ...\n");
   HaplotypeSet reference;

   reference.markerCount = refMarkerList.Length();
   reference.LoadHaplotypes(referenceHaplotypes);

   printf("  %d Reference Haplotypes Loaded ...\n\n", reference.count);

   // Read framework marker list
   printf("Reading Framework Marker List ...\n");
   StringArray markerList;
   markerList.Read(snps);

   ClipReference(reference, refMarkerList, referenceHash, markerList,
                 firstMarker, lastMarker);

   // Crossref Marker Names to Reference Panel Positions
   IntArray markerIndex;
   markerIndex.Dimension(markerList.Length());

   int matches = 0;

   for (int i = 0; i < markerList.Length(); i++)
      {
      markerIndex[i] = referenceHash.Integer(markerList[i].Trim());

      // A negative index is treated as "marker not present in the reference"
      if (markerIndex[i] >= 0) matches++;
      }

   printf("  %d Markers in Framework Haplotypes Overlap Reference ...\n", matches);

   if (matches == 0)
      error("No markers overlap between target and reference\n"
            "Please check correct reference is being used and markers are named consistently");

   printf("  %d Other Markers in Framework Haplotypes Discarded ...\n\n", markerList.Length() - matches);

   // Check for flips in reference vs. target haplotypes.
   // `previous` holds the reference position of the last matched target
   // marker. It must be refreshed for EVERY matched marker: when it was only
   // assigned inside the flip branch it stayed at -1 forever, so no order
   // change could ever be detected.
   int flips = 0;
   int previous = -1;
   for (int i = 0; i < markerIndex.Length(); i++)
      if (markerIndex[i] >= 0)
         {
         if (markerIndex[i] < previous)
            {
            // The current marker has the smaller reference position, so it is
            // the one that precedes in the reference but follows in the target.
            if (flips++ < 10)
               printf("  -> Marker %s precedes %s in reference, but follows it in target\n",
                     (const char *) markerList[i],
                     (const char *) refMarkerList[previous]);
            }
         previous = markerIndex[i];
         }
   if (flips > 10)
      printf("  -> %d Additional Marker Order Changes Not Listed\n", flips - 10);
   if (flips)
      printf("  %d Marker Pairs Change Order in Target vs Framework Haplotypes\n", flips);

   // Load target haplotypes
   printf("Loading target haplotypes ...\n");
   HaplotypeSet target;

   target.markerCount = markerList.Length();
   target.LoadHaplotypes(haplotypes, true);

   reference.CalculateFrequencies();
   target.CalculateFrequencies();
   target.CompareFrequencies(reference, markerIndex, markerList);

   printf("  %d Target Haplotypes Loaded ...\n\n", target.count);

   // Output window: the whole reference unless a first / last marker is set
   int startIndex = firstMarker.IsEmpty() ? 0 : referenceHash.Integer(firstMarker);
   int stopIndex = lastMarker.IsEmpty() ? reference.markerCount - 1 : referenceHash.Integer(lastMarker);

   if (startIndex < 0 || stopIndex < 0)
      error("Clipping requested, but no position available for one of the endpoints");

   printf("Setting up Markov Model...\n\n");

   // Setup Markov Model
   MarkovParameters mp;

   mp.Allocate(reference.markerCount);

   if (rounds > 0)
      printf("Initializing Model Parameters (using %s and up to %d haplotypes)\n",
             em ? "E-M" : "MCMC", states);

   // Simple initial estimates of error and recombination rate
   for (int i = 0; i < reference.markerCount; i++)
      mp.E[i] = 0.01;

   // One recombination rate per interval between adjacent markers
   for (int i = 0; i < reference.markerCount - 1; i++)
      mp.R[i] = 0.001;

   if (mp.ReadErrorRates(errorRates))
      printf("  Updated error rates using data in %s ...\n", (const char *) errorRates);

   if (mp.ReadCrossoverRates(recombinationRates))
      printf("  Updated recombination rates using %s ...\n", (const char *) recombinationRates);

   // Parameter estimation loop
   for (int round = 0; round < rounds; round++)
      {
      printf("  Round %d of Parameter Refinement ...\n", round + 1);

      // Use at most `states` reference haplotypes per round
      int iterations = states < reference.count ? states : reference.count;

      // Snapshot of the parameters at the start of the round; every
      // per-haplotype model in this round starts from this copy while the
      // results are accumulated into mp.
      MarkovModel original;
      original.CopyParameters(mp);

      #pragma omp parallel for
      for (int i = 0; i < iterations; i++)
         {
         MarkovModel mm;

         mm.Allocate(reference.markerCount, reference.count - 1);
         mm.CopyParameters(original);

         // Reference leave one out (loo) panel: pointers to every reference
         // haplotype except haplotype i (the array does not own the rows)
         char ** reference_loo = new char * [reference.count - 1];
         for (int in = 0, out = 0; in < reference.count; in++)
            if (in != i)
               reference_loo[out++] = reference.haplotypes[in];

         mm.WalkLeft(reference.haplotypes[i], reference_loo, reference.freq);

         if (em)
            mm.CountExpected(reference.haplotypes[i], reference_loo, reference.freq);
         else
            {
            // ProfileModel is run by one thread at a time
            #pragma omp critical
            { mm.ProfileModel(reference.haplotypes[i], reference_loo, reference.freq); }
            }

         delete [] reference_loo;

         // Accumulation into the shared parameters is serialized
         #pragma omp critical
         mp += mm;
         }

      // Target haplotypes only contribute from the halfway round onwards
      if (round >= rounds / 2)
         {
         int iterations = states < target.count ? states : target.count;

         #pragma omp parallel for
         for (int i = 0; i < iterations; i++)
            {
            MarkovModel mm;

            mm.Allocate(reference.markerCount, reference.count);
            mm.CopyParameters(original);

            // Padded version of target haplotype, including missing sites
            // (positions without a target marker are left as 0)
            char * padded = new char [reference.markerCount];
            for (int k = 0; k < reference.markerCount; k++)
               padded[k] = 0;

            // Copy current haplotype into padded vector
            for (int j = 0; j < target.markerCount; j++)
               if (markerIndex[j] >= 0)
                  padded[markerIndex[j]] = target.haplotypes[i][j];

            mm.WalkLeft(padded, reference.haplotypes, reference.freq);

            if (em)
               mm.CountExpected(padded, reference.haplotypes, reference.freq);
            else
               {
               #pragma omp critical
               { mm.ProfileModel(padded, reference.haplotypes, reference.freq); }
               }

            delete [] padded;

            #pragma omp critical
            mp += mm;
            }
         }

      mp.UpdateModel();

      // Per-round summary: sum of recombination rates over all intervals
      double crossovers = 0;
      for (int i = 0; i < reference.markerCount - 1; i++)
         crossovers += mp.R[i];

      // Per-round summary: error rate weighted by marker heterozygosity
      double errors = 0;
      for (int i = 0; i < reference.markerCount; i++)
         {
         double heterozygosity = 1.0 - square(reference.freq[1][i])
                                     - square(reference.freq[2][i])
                                     - square(reference.freq[3][i])
                                     - square(reference.freq[4][i]);

         errors += mp.E[i] * heterozygosity;
         }
      // The tiny constant avoids a division by zero when there are no markers
      errors /= reference.markerCount + 1e-30;

      printf("      %.0f mosaic crossovers expected per haplotype\n", crossovers);
      printf("      %.1f%% of crossovers are due to reference flips\n", mp.empiricalFlipRate * 100.);
      printf("      %.3g errors in mosaic expected per marker\n", errors);
      }

   if (rounds > 0)
      {
      printf("  Saving estimated parameters for future use ...\n");
      mp.WriteParameters(refMarkerList, prefix, gzip);
      }

   printf("\n");

   // List the major allele at each location
   reference.ListMajorAlleles();

   printf("Generating Draft .info File ...\n\n");

   // Output some basic information
   IFILE info = ifopen(prefix + ".info.draft", "wt");

   ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tGenotyped\n");

   // Walk the reference positions (i) and the target marker cursor (j) in
   // step. Positions before startIndex are not written but still advance j.
   for (int i = 0, j = 0; i <= stopIndex; i++)
      {
      // Skip target markers that can no longer match position i: entries that
      // are absent from the reference (negative index) or already behind i.
      // Without this a single unmatched marker would stall the cursor and all
      // later genotyped markers would be reported as "-".
      while (j < markerIndex.Length() && markerIndex[j] < i)
         j++;

      bool genotyped = j < markerIndex.Length() && i == markerIndex[j];
      if (genotyped)
         j++;

      if (i >= startIndex)
         ifprintf(info, "%s\t%s\t%s\t%.4f\t%s\n",
            (const char *) refMarkerList[i],
            reference.MajorAlleleLabel(i), reference.MinorAlleleLabel(i),
            reference.freq[reference.major[i]][i],
            genotyped ? "Genotyped" : "-");
      }

   ifclose(info);

   printf("Imputing Genotypes ...\n");

   IFILE dosages = ifopen(prefix + ".dose" + (gzip ? ".gz" : ""), "wt");
   // Only opened (and only used) when phased output was requested
   IFILE hapdose, haps;

   if (phased)
      {
      hapdose = ifopen(prefix + ".hapDose" + (gzip ? ".gz" : ""), "wt");
      haps = ifopen(prefix + ".haps" + (gzip ? ".gz" : ""), "wt");
      }

   ImputationStatistics stats(reference.markerCount);

   // Impute each haplotype
   #pragma omp parallel for
   for (int i = 0; i < target.count; i++)
      {
      // Haplotypes sharing a label with their predecessor are handled by the
      // do/while below in the iteration of the first haplotype of that label
      if (i != 0 && target.labels[i] == target.labels[i-1])
         continue;

      MarkovModel mm;

      mm.Allocate(reference.markerCount, reference.count);
      mm.ClearImputedDose();
      mm.CopyParameters(mp);

      // Padded version of target haplotype, including missing sites
      char * padded = new char [reference.markerCount];
      for (int j = 0; j < reference.markerCount; j++)
         padded[j] = 0;

      // k runs over the consecutive haplotypes that carry the label of i
      int k = i;

      do {
         printf("  Processing Haplotype %d of %d ...\n", k + 1, target.count);

         // Copy current haplotype into padded vector
         for (int j = 0; j < target.markerCount; j++)
            if (markerIndex[j] >= 0)
               padded[markerIndex[j]] = target.haplotypes[k][j];

         mm.WalkLeft(padded, reference.haplotypes, reference.freq);
         mm.Impute(reference.major, padded, reference.haplotypes, reference.freq);

         // Shared statistics are updated by one thread at a time
         #pragma omp critical
         { stats.Update(mm.imputedHap, mm.leaveOneOut, padded, reference.major); }

         // Output files are shared between threads; write one record at a time
         #pragma omp critical
         if (phased)
            {
            ifprintf(hapdose, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
            ifprintf(haps, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1);
            for (int j = startIndex; j <= stopIndex; j++)
               {
               ifprintf(hapdose, "\t%.3f", mm.imputedHap[j]);
               // A space is emitted before every 8th allele to group the output
               ifprintf(haps, "%s%c", j % 8 == 0 ? " " : "", mm.imputedAlleles[j]);
               }
            ifprintf(hapdose, "\n");
            ifprintf(haps, "\n");
            }

         k++;
      } while (k < target.count && target.labels[k] == target.labels[i]);

      printf("    Outputting Individual %s ...\n", (const char *) target.labels[i]);

      #pragma omp critical
         {
         ifprintf(dosages, "%s\tDOSE", (const char *) target.labels[i]);
         for (int j = startIndex; j <= stopIndex; j++)
            ifprintf(dosages, "\t%.3f", mm.imputedDose[j]);
         ifprintf(dosages, "\n");
         }

      delete [] padded;
      }

   ifclose(dosages);

   if (phased)
      {
      ifclose(hapdose);
      ifclose(haps);
      }

   // Output some basic information
   info = ifopen(prefix + ".info" + (gzip ? ".gz" : ""), "wt");

   ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tMAF\tAvgCall\tRsq\tGenotyped\tLooRsq\tEmpR\tEmpRsq\tDose1\tDose2\n");

   // Padded version of target haplotype, including missing sites
   // (reused here as a per-position "was genotyped" flag)
   char * padded = new char [reference.markerCount];
   for (int k = 0; k < reference.markerCount; k++)
      padded[k] = 0;

   // Mark genotyped SNPs in padded vector
   for (int j = 0; j < target.markerCount; j++)
      if (markerIndex[j] >= 0)
          padded[markerIndex[j]] = 1;

   for (int i = startIndex; i <= stopIndex; i++)
      {
      ifprintf(info, "%s\t%s\t%s\t%.5f\t%.5f\t%.5f\t%.5f\t",
            (const char *) refMarkerList[i],
            reference.MajorAlleleLabel(i),
            reference.MinorAlleleLabel(i),
            stats.AlleleFrequency(i),
            // Minor allele frequency: fold frequencies above 0.5
            stats.AlleleFrequency(i) > 0.5 ? 1.0 - stats.AlleleFrequency(i) : stats.AlleleFrequency(i),
            stats.AverageCallScore(i),
            stats.Rsq(i));

      // Leave-one-out columns are only available for genotyped markers
      if (padded[i])
         ifprintf(info, "Genotyped\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n",
                  stats.LooRsq(i), stats.EmpiricalR(i), stats.EmpiricalRsq(i),
                  stats.LooMajorDose(i), stats.LooMinorDose(i));
      else
         ifprintf(info, "-\t-\t-\t-\t-\t-\n");
      }

   ifclose(info);

   delete [] padded;

   time_t stop = time(NULL);
   int seconds = stop - start;

   printf("\nRun completed in %d hours, %d mins, %d seconds on %s\n\n",
          seconds / 3600, (seconds % 3600) / 60, seconds % 60,
          ctime(&stop));

   return 0;
   }
Example #19
0
// Copy the records of a VCF file from --in to --out, optionally restricted
// to the section named by --refName.
//
// Command line options parsed here:
//   --in          input VCF file (required)
//   --out         output VCF file (required)
//   --uncompress  open the output with InputFile::DEFAULT rather than the
//                 writer's default mode
//   --refName     if non-empty, passed to setReadSection() on the reader
//   --noeof       do not require the BGZF EOF block
//   --params      print the parameter settings
//
// Returns 0 on success, or -1 if a required parameter is missing or any
// record could not be written.
int VcfConvert::execute(int argc, char **argv)
{
    String inputVcf = "";
    String outputVcf = "";
    String refName = "";
    bool uncompress = false;
    bool params = false;
    bool noeof = false;
    
    // Read in the parameters.    
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inputVcf)
        LONG_STRINGPARAMETER("out", &outputVcf)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_PARAMETER("uncompress", &uncompress)
        LONG_STRINGPARAMETER("refName", &refName)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    // Skip the first argument when parsing.
    inputParameters.Read(argc-1, &(argv[1]));
    
    // Check that all files were specified.
    if(inputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in\", a required parameter.\n\n";
        return(-1);
    }
    if(outputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--out\", a required parameter.\n\n";
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    VcfFileReader inFile;
    VcfFileWriter outFile;
    VcfHeader header;
    
    // Open the input file; this also fills in the header.
    inFile.open(inputVcf, header);

    // Restrict reading to the requested section, if one was given.
    if(refName != "")
    {
        inFile.setReadSection(refName.c_str());
    }

    // Open the output file, which is handed the header read from the input.
    if(uncompress)
    {
        outFile.open(outputVcf, header, InputFile::DEFAULT);
    }
    else
    {
        outFile.open(outputVcf, header);
    }

    VcfRecord record;
    int numRecords = 0;
    int returnVal = 0;

    // Copy every record that the reader returns.
    while(inFile.readRecord(record))
    {
        ++numRecords;

        // Do not silently drop a failed write: report it and make sure the
        // exit status reflects the failure.
        if(!outFile.writeRecord(record))
        {
            std::cerr << "Failed writing a vcf record.\n";
            returnVal = -1;
        }
    }
 
    inFile.close();   
    outFile.close();   

    // numRecords is the number of records read from the input.
    std::cerr << "NumRecords: " << numRecords << "\n";
    return(returnVal);
}
Example #20
0
int Recab::execute(int argc, char *argv[])
{
    bool verboseFlag = false;

    String inFile,outFile,logFile;

    bool noeof = false;
    bool params = false;

    SamFile samIn,samOut;

    ParameterList inputParameters;

    LongParamContainer parameters;

    parameters.addGroup("Required Generic Parameters");
    parameters.addString("in", &inFile);
    parameters.addString("out", &outFile);
    parameters.addGroup("Optional Generic Parameters");
    parameters.addString("log", &logFile);
    parameters.addBool("verbose", &verboseFlag);
    parameters.addBool("noeof", &noeof);
    parameters.addBool("params", &params);
    parameters.addPhoneHome(VERSION);
    addRecabSpecificParameters(parameters);
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            parameters.getLongParameterList()));
    
    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);
    
    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    if(inFile.IsEmpty())
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing required --in parameter" << std::endl;
        return EXIT_FAILURE;
    }

    if(outFile.IsEmpty())
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing required --out parameter" << std::endl;
        return EXIT_FAILURE;
    }

    int status = processRecabParam();
    if(status != 0)
    {
        inputParameters.Status();
        return(status);
    }

    if ( logFile.IsEmpty() )
    {
        logFile = outFile + ".log";
    }
  
    if(params)
    {
        inputParameters.Status();
    }
    
    Logger::gLogger = new Logger(logFile.c_str(), verboseFlag);

    ////////////////
    //////  Errormodel
    Logger::gLogger->writeLog("Initialize errormodel structure...");

    ////////////////////////////////////////
    // SAM/BAM file open
    ////////////////////////////////////////
    ////////////////////////////////////////

    // Iterate SAM records
    if(!samIn.OpenForRead(inFile.c_str()))
    {
        Logger::gLogger->error("Failed to open SAM/BAM file %s",inFile.c_str() );
        return EXIT_FAILURE;
    }

    Logger::gLogger->writeLog("Start iterating SAM/BAM file %s",inFile.c_str());

    time_t now = time(0);
    tm* localtm = localtime(&now);

    Logger::gLogger->writeLog("Start: %s", asctime(localtm));
    SamRecord samRecord;
    SamFileHeader samHeader;
    samIn.ReadHeader(samHeader);

    srand (time(NULL));

    int numRecs = 0;
    while(samIn.ReadRecord(samHeader, samRecord) == true)
    {
        processReadBuildTable(samRecord);

        //Status info
        numRecs++;
        if(verboseFlag)
        {
            if(numRecs%10000000==0)
                Logger::gLogger->writeLog("%ld records processed", numRecs);
        }
    }

    now = time(0);
    localtm = localtime(&now);
    Logger::gLogger->writeLog("End: %s", asctime(localtm));

    if((outFile[0] == '-') && (logFile[0] != '-'))
    {
        // Since outFile is to stdout, and logfile isn't, pass logfile name 
        modelFitPrediction(logFile);
    }
    else
    {
        modelFitPrediction(outFile);
    }

    Logger::gLogger->writeLog("Writing recalibrated file %s",outFile.c_str());

    ////////////////////////
    ////////////////////////
    //// Write file
    samIn.OpenForRead(inFile.c_str());
    samOut.OpenForWrite(outFile.c_str());
    samIn.ReadHeader(samHeader);
    samOut.WriteHeader(samHeader);
    
    while(samIn.ReadRecord(samHeader, samRecord) == true)
    {
        // Recalibrate.
        processReadApplyTable(samRecord);
        samOut.WriteRecord(samHeader, samRecord);
    }

    Logger::gLogger->writeLog("Total # Reads recab table not applied to: %ld", myNumApplySkipped);
    Logger::gLogger->writeLog("Total # Reads recab table applied to: %ld", myNumApplyReads);
    Logger::gLogger->writeLog("Recalibration successfully finished");
    return EXIT_SUCCESS;
}
Example #21
0
int Revert::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String outFile = "";
    bool cigar = false;
    bool qual = false;
    bool noeof = false;
    bool params = false;
    bool rmBQ = false;
    String rmTags = "";
    myKeepTags = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_STRINGPARAMETER("out", &outFile)
        LONG_PARAMETER("cigar", &cigar)
        LONG_PARAMETER("qual", &qual)
        LONG_PARAMETER("keepTags", &myKeepTags)
        LONG_PARAMETER("rmBQ", &rmBQ)
        LONG_STRINGPARAMETER("rmTags", &rmTags)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    // parameters start at index 2 rather than 1.
    inputParameters.Read(argc, argv, 2);

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }
    
    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    if(outFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--out is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the input file for reading.
    SamFile samIn;
    samIn.OpenForRead(inFile);

    // Open the output file for writing.
    SamFile samOut;
    samOut.OpenForWrite(outFile);

    // Read the sam header.
    SamFileHeader samHeader;
    samIn.ReadHeader(samHeader);

    // Write the sam header.
    samOut.WriteHeader(samHeader);

    SamRecord samRecord;

    // Set returnStatus to success.  It will be changed to the
    // failure reason if any of the writes or updates fail.
    SamStatus::Status returnStatus = SamStatus::SUCCESS;

    // Keep reading records until ReadRecord returns false.
    while(samIn.ReadRecord(samHeader, samRecord))
    {
        // Update the cigar & position.
        if(cigar)
        {
            if(!updateCigar(samRecord))
            {
                // Failed to update the cigar & position.
                fprintf(stderr, "%s\n", samIn.GetStatusMessage());
                returnStatus = samIn.GetStatus();
            }
        }
        if(qual)
        {
            if(!updateQual(samRecord))
            {
                // Failed to update the quality.
                fprintf(stderr, "%s\n", samIn.GetStatusMessage());
                returnStatus = samIn.GetStatus();
            }
        }

        if(rmBQ)
        {
            if(!removeBQ(samRecord))
            {
                // Failed to remove BQ.
                fprintf(stderr, "%s\n", samIn.GetStatusMessage());
                returnStatus = samIn.GetStatus();
            }
        }

        if(rmTags != "")
        {
            if(!samRecord.rmTags(rmTags.c_str()))
            {
                // Failed to remove the specified tags.
                fprintf(stderr, "%s\n", samIn.GetStatusMessage());
                returnStatus = samIn.GetStatus();
            }
        }

        // Successfully read a record from the file, so write it.
        if(!samOut.WriteRecord(samHeader, samRecord))
        {
            // Failed to write a record.
            fprintf(stderr, "%s\n", samOut.GetStatusMessage());
            returnStatus = samOut.GetStatus();
        }
    }

    std::cerr << std::endl << "Number of records read = " << 
        samIn.GetCurrentRecordCount() << std::endl;
    std::cerr << "Number of records written = " << 
        samOut.GetCurrentRecordCount() << std::endl;

    // Since the reads were successful, return the status based
    // on the status of the writes.  If any failed, return
    // their failure status.
    return(returnStatus);
}
Example #22
0
// Count one genotype call in a comparison category: always increments
// 'total', and additionally increments the matching per-genotype counter
// for "0/0", "0/1"/"1/0" or "1/1".  Any other genotype string only
// contributes to 'total'.
static void tallyConsensusGenotype(const std::string& genotype,
                                   uint64_t& total,
                                   uint64_t& count00,
                                   uint64_t& count01,
                                   uint64_t& count11)
{
    ++total;
    if(genotype == "0/0")
    {
        ++count00;
    }
    else if((genotype == "0/1") || (genotype == "1/0"))
    {
        ++count01;
    }
    else if(genotype == "1/1")
    {
        ++count11;
    }
}

// Write a majority-vote version of --in1 to --out, using --in2 and --in3.
//
// Only samples whose names are found in all three headers are kept.  For
// each --in1 record, the same position is looked up in --in2 and --in3 with
// findPos(); records missing from either file, or whose ref/alt strings
// differ between the files, are skipped.  For each remaining sample the
// "GT" value of --in1 is kept when it agrees with at least one other file,
// replaced by the --in2 value when only files 2 and 3 agree, and set to
// "./." when no two files agree.  Summary counts are printed to stderr.
//
// Command line options parsed here:
//   --in1, --in2, --in3  input VCF files (required)
//   --out                output VCF file (required)
//   --uncompress         open the output with InputFile::DEFAULT rather
//                        than the writer's default mode
//   --params             print the parameter settings
//
// Returns 0 on completion, or -1 if a required parameter is missing.
int VcfConsensus::execute(int argc, char ** argv)
{
    String vcfName1;
    String vcfName2;
    String vcfName3;
    String outputFileName;
    bool uncompress = false;
    bool params = false;

    // Read in the parameters.    
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in1", &vcfName1)
        LONG_STRINGPARAMETER("in2", &vcfName2)
        LONG_STRINGPARAMETER("in3", &vcfName3)
        LONG_STRINGPARAMETER("out", &outputFileName)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_PARAMETER("uncompress", &uncompress)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
       END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    inputParameters.Read(argc-1, &(argv[1]));
    
    // Name of the genotype field that is compared and updated.
    std::string gtField = "GT";

    VcfFileReader vcf1;
    VcfFileReader vcf2;
    VcfFileReader vcf3;
    VcfHeader header1;
    VcfHeader header2;
    VcfHeader header3;
    VcfRecord record1;
    VcfRecord record2;
    VcfRecord record3;
    VcfRecordGenotype* genotypeInfoPtr1 = NULL;
    VcfRecordGenotype* genotypeInfoPtr2 = NULL;
    VcfRecordGenotype* genotypeInfoPtr3 = NULL;
    
    // Warning counters; only the first myMaxErrors of each kind are printed.
    unsigned int numMissing2 = 0;
    unsigned int numMissing3 = 0;
    unsigned int numMismatchRefAlt = 0;
    unsigned int numMissingGT1 = 0;
    const unsigned int myMaxErrors = 4;

    // Check that the required parameters were set.
    if(vcfName1 == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in1\", a required parameter.\n\n";
        return(-1);
    }
    if(vcfName2 == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in2\", a required parameter.\n\n";
        return(-1);
    }
    if(vcfName3 == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in3\", a required parameter.\n\n";
        return(-1);
    }
    if(outputFileName == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--out\", a required parameter.\n\n";
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    
    // Open the files.
    vcf1.open(vcfName1, header1);
    vcf2.open(vcfName2, header2);
    vcf3.open(vcfName3, header3);

    // Setup the sample name maps: for each kept sample of header1 (in
    // order), the index of the same sample name in header2 / header3.
    int numSamples = header1.getNumSamples();
    std::vector<int> sample2Indices;
    std::vector<int> sample3Indices;
    std::vector<int> removeIndices;
    int numSamplesSkipped1 = 0;
    int numSamplesSkipped2 = 0;
    int numSamplesSkipped3 = 0;
    for(int i = 0; i < numSamples; i++)
    {
        int sm2Index = header2.getSampleIndex(header1.getSampleName(i));
        int sm3Index = header3.getSampleIndex(header1.getSampleName(i));
        // Keep the sample only if its name was found in both vcf2 and vcf3.
        if((sm2Index != -1) && (sm3Index != -1))
        {
            sample2Indices.push_back(sm2Index);
            sample3Indices.push_back(sm3Index);
        }
        else
        {
            // Sample not found in all three vcfs.
            removeIndices.push_back(i);
            ++numSamplesSkipped1;
        }
    }
    // Remove samples not found in all 3 vcfs from header1.
    // Remove them in reverse order so they are removed from the end of the header first.
    VcfSubsetSamples subset1;
    subset1.init(header1, true);
    for(int i = (removeIndices.size() - 1); i >= 0; i--)
    {
        subset1.addExcludeSample(header1.getSampleName(removeIndices[i]));
        header1.removeSample(removeIndices[i]);
    }

    // Set numSamples to the new number of samples in header1.
    numSamples = header1.getNumSamples();

    // Calculate the number of samples skipped for files 2 & 3.
    numSamplesSkipped2 = header2.getNumSamples() - sample2Indices.size();
    numSamplesSkipped3 = header3.getNumSamples() - sample3Indices.size();

    if(numSamplesSkipped1 > 0)
    {
        std::cerr << "Skipping " << numSamplesSkipped1 << " samples from --in1\n";
    }
    if(numSamplesSkipped2 > 0)
    {
        std::cerr << "Skipping " << numSamplesSkipped2 << " samples from --in2\n";
    }
    if(numSamplesSkipped3 > 0)
    {
        std::cerr << "Skipping " << numSamplesSkipped3 << " samples from --in3\n";
    }

    VcfFileWriter outputVcf;
    // Open and write the header
    if(uncompress)
    {
        outputVcf.open(outputFileName, header1, InputFile::DEFAULT);
    }
    else
    {
        outputVcf.open(outputFileName, header1);
    }

    const char* chrom1 = NULL;
    int pos1 = UNSET_POS;

    // Read the first record from vcf2 & vcf3.
    vcf2.readRecord(record2);
    vcf3.readRecord(record3);

    bool newChrom = true;
    // Chromosome of the previous vcf1 record.  Deliberately NOT static, so
    // that every call starts with no previous chromosome instead of
    // inheriting the last one seen by an earlier call.
    std::string prevChrom = "";

    // Per-category totals, plus their breakdown by genotype.
    uint64_t numAllMatch = 0;
    uint64_t num1Match2Only = 0;
    uint64_t num1Match3Only = 0;
    uint64_t num2Match3Only = 0;
    uint64_t numNoMatches = 0;

    uint64_t numAllMatch00 = 0;
    uint64_t num1Match2Only00 = 0;
    uint64_t num1Match3Only00 = 0;
    uint64_t num2Match3Only00 = 0;

    uint64_t numAllMatch01 = 0;
    uint64_t num1Match2Only01 = 0;
    uint64_t num1Match3Only01 = 0;
    uint64_t num2Match3Only01 = 0;

    uint64_t numAllMatch11 = 0;
    uint64_t num1Match2Only11 = 0;
    uint64_t num1Match3Only11 = 0;
    uint64_t num2Match3Only11 = 0;

    // Loop through vcf1.
    while(vcf1.readRecord(record1, &subset1))
    {
        chrom1 = record1.getChromStr();
        pos1 = record1.get1BasedPosition();

        // Track whether this record starts a new chromosome.
        if(strcmp(chrom1, prevChrom.c_str()) == 0)
        {
            newChrom = false;
        }
        else
        {
            prevChrom = chrom1;
            newChrom = true;
        }

        // Both lookups are always performed, even if the first one fails.
        bool found = true;
        if(!findPos(newChrom, chrom1, pos1, record2, vcf2))
        {
            // Failed to find the position, continue to the next position
            if(++numMissing2 <= myMaxErrors)
            {
                std::cerr << "Failed to find " << chrom1 << ":" << pos1 
                          << " in " << vcfName2 << ", so skipping this pos\n";
            }
            found = false;
        }
        
        if(!findPos(newChrom, chrom1, pos1, record3, vcf3))
        {
            // Failed to find the position, continue to the next position
            if(++numMissing3 <= myMaxErrors)
            {
                std::cerr << "Failed to find " << chrom1 << ":" << pos1 
                          << " in " << vcfName3 << ", so skipping this pos\n";
            }
            found = false;
        }

        if(found == false)
        {
            continue;
        }

        // Found the position in all files.
        
        // Validate that the reference & alternate alleles are the same.
        const char* ref1 = record1.getRefStr();
        const char* alt1 = record1.getAltStr();
        if((strcmp(ref1, record2.getRefStr()) != 0) ||
           (strcmp(ref1, record3.getRefStr()) != 0) ||
           (strcmp(alt1, record2.getAltStr()) != 0) ||
           (strcmp(alt1, record3.getAltStr()) != 0))
        {
            if(++numMismatchRefAlt <= myMaxErrors)
            {
                std::cerr << "Mismatching ref/alt found at " << chrom1 << ":" << pos1 << ", so skipping this pos\n";
            }
            continue;
        }

        // Get the genotype information for each.
        genotypeInfoPtr1 = &(record1.getGenotypeInfo());
        genotypeInfoPtr2 = &(record2.getGenotypeInfo());
        genotypeInfoPtr3 = &(record3.getGenotypeInfo());

        // Loop through all the samples in vcf1.
        // Get the Genotype Information.
        for(int i = 0; i < numSamples; i++)
        {
            const std::string* genotypeVal1 = genotypeInfoPtr1->getString(gtField, i);
            const std::string* genotypeVal2 = genotypeInfoPtr2->getString(gtField, sample2Indices[i]);
            const std::string* genotypeVal3 = genotypeInfoPtr3->getString(gtField, sample3Indices[i]);
            // Need to make sure the field was found.
            if(genotypeVal1 == NULL)
            {
                // GT not found in the first record, so just continue.
                if(++numMissingGT1 <= myMaxErrors)
                {
                    std::cerr << "Missing GT for " << header1.getSampleName(i) << " in " << vcfName1 << "\n";
                }
                continue;
            }

            // NOTE(review): genotypeVal2/genotypeVal3 are not NULL-checked
            // here; this presumably relies on isSame() (defined elsewhere)
            // never reporting a match for a NULL pointer - confirm.
            if(isSame(genotypeVal1, genotypeVal2))
            {
                // genotypeVal1 is majority, so make no change.
                if(isSame(genotypeVal1, genotypeVal3))
                {
                    tallyConsensusGenotype(*genotypeVal1, numAllMatch,
                                           numAllMatch00, numAllMatch01,
                                           numAllMatch11);
                }
                else
                {
                    tallyConsensusGenotype(*genotypeVal1, num1Match2Only,
                                           num1Match2Only00, num1Match2Only01,
                                           num1Match2Only11);
                }
            }
            else if(isSame(genotypeVal1, genotypeVal3))
            {
                // genotypeVal1 is majority, so make no change.
                tallyConsensusGenotype(*genotypeVal1, num1Match3Only,
                                       num1Match3Only00, num1Match3Only01,
                                       num1Match3Only11);
            }
            else if(isSame(genotypeVal2, genotypeVal3))
            {
                // genotypeVal2 is majority, so change genotypeVal1.
                genotypeInfoPtr1->setString(gtField, i, *genotypeVal2);
                tallyConsensusGenotype(*genotypeVal2, num2Match3Only,
                                       num2Match3Only00, num2Match3Only01,
                                       num2Match3Only11);
            }
            else
            {
                // None are the same so set to "./."
                genotypeInfoPtr1->setString(gtField, i, "./.");
                ++numNoMatches;
            }
        } // loop back to vcf1 samples.

        // Write this record.
        outputVcf.writeRecord(record1);
    } // loop back to next vcf1 record.

    // Report how many warnings of each kind were not printed.
    std::cerr << "\n";
    if(numMissing2 > myMaxErrors)
    {
        std::cerr << "Suppressed "
                  << numMissing2 - myMaxErrors
                  << " errors about skipped positions because they were not in "
                  << vcfName2
                  << "\n";
    }

    if(numMissing3 > myMaxErrors)
    {
        std::cerr << "Suppressed "
                  << numMissing3 - myMaxErrors
                  << " errors about skipped positions because they were not in "
                  << vcfName3
                  << "\n";
    }

    if(numMismatchRefAlt > myMaxErrors)
    {
        std::cerr << "Suppressed "
                  << numMismatchRefAlt - myMaxErrors
                  << " errors about mismatched ref/alt positions\n";
    }

    if(numMissingGT1 > myMaxErrors)
    {
        std::cerr << "Suppressed "
                  << numMissingGT1 - myMaxErrors
                  << " errors about missing GT for "
                  << vcfName1
                  << "\n";
    }
    std::cerr << "\n";
    // Output the stats.
    std::cerr << "File1 = " << vcfName1 << std::endl;
    std::cerr << "File2 = " << vcfName2 << std::endl;
    std::cerr << "File3 = " << vcfName3 << std::endl;
    std::cerr << "\nType\tTotal\t0/0\t0/1|1/0\t1/1\n";
    std::cerr << "AllMatched" 
              << "\t" << numAllMatch
              << "\t" << numAllMatch00 
              << "\t" << numAllMatch01 
              << "\t" << numAllMatch11 << std::endl;
    std::cerr << "1matched2"
              << "\t" << num1Match2Only 
              << "\t" << num1Match2Only00 
              << "\t" << num1Match2Only01 
              << "\t" << num1Match2Only11 << std::endl;
    std::cerr << "1matched3"
              << "\t" << num1Match3Only 
              << "\t" << num1Match3Only00 
              << "\t" << num1Match3Only01 
              << "\t" << num1Match3Only11 << std::endl;
    std::cerr << "2matched3"
              << "\t" << num2Match3Only
              << "\t" << num2Match3Only00 
              << "\t" << num2Match3Only01 
              << "\t" << num2Match3Only11 << std::endl;
    std::cerr << "NoneMatched\t" << numNoMatches << std::endl;

    return(0);
}
Example #23
0
// Copy the records of --in to --out, dropping every record for which
// hasAllGenotypeAlleles() or allPhased() is false.  Only the GT genotype
// field is stored/written, and the INFO field of each kept record is
// cleared before it is written.
//
// Command line options parsed here:
//   --in          input VCF file (required)
//   --out         output VCF file (required)
//   --uncompress  open the output with InputFile::DEFAULT rather than the
//                 writer's default mode
//   --params      print the parameter settings
//
// Returns 0 on success, or -1 if a required parameter is missing or any
// record could not be written.
int VcfCleaner::execute(int argc, char **argv)
{
    String inputVcf = "";
    String outputVcf = "";
    bool uncompress = false;
    bool params = false;
    
    // Read in the parameters.    
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inputVcf)
        LONG_STRINGPARAMETER("out", &outputVcf)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_PARAMETER("uncompress", &uncompress)
        LONG_PARAMETER("params", &params)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    // Skip the first argument when parsing.
    inputParameters.Read(argc-1, &(argv[1]));
    
    // Check that all files were specified.
    if(inputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in\", a required parameter.\n\n";
        return(-1);
    }
    if(outputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--out\", a required parameter.\n\n";
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    VcfFileReader inFile;
    VcfFileWriter outFile;
    VcfHeader header;
    VcfRecord record;

    // Open the input (which fills in the header) and then the output.
    inFile.open(inputVcf, header);
    if(uncompress)
    {
        outFile.open(outputVcf, header, InputFile::DEFAULT);
    }
    else
    {
        outFile.open(outputVcf, header);
    }

    int numReadRecords = 0;
    int numWrittenRecords = 0;
    int returnVal = 0;

    // Set to only store/write the GT field.
    VcfRecordGenotype::addStoreField("GT");
    while(inFile.readRecord(record))
    {
        ++numReadRecords;
        // Check if any samples are missing GT or if any are not phased.
        if(!record.hasAllGenotypeAlleles() || !record.allPhased())
        {
            // Missing a GT or not phased, so continue without writing.
            continue;
        }
        
        // Clear the INFO field.
        record.getInfo().clear();
        // Write the record.
        if(!outFile.writeRecord(record))
        {
            // Write error: report it, but do not count the record as
            // written.
            std::cerr << "Failed writing a vcf record.\n";
            returnVal = -1;
            continue;
        }
        ++numWrittenRecords;
    }
 
    inFile.close();   
    outFile.close();   

    std::cerr << "NumReadRecords: " << numReadRecords
              << "; NumWrittenRecords: " << numWrittenRecords << "\n";
    return(returnVal);
}
Example #24
0
// Print the reference information found in the header of the SAM/BAM file
// given by --in and, when --printRecordRefs is set, the reference id/name
// of each consecutive run of records together with the run length.
//
// Command line options parsed here:
//   --in               input SAM/BAM file (required)
//   --noeof            do not require the BGZF EOF block
//   --printRecordRefs  also summarize the references seen in the records
//   --params           print the parameter settings
//
// Returns -1 if --in is missing, otherwise SamStatus::SUCCESS.
int DumpRefInfo::execute(int argc, char **argv)
{
    // Values filled in from the command line.
    String inFile = "";
    bool noeof = false;
    bool printRecordRefs = false;
    bool params = false;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("printRecordRefs", &printRecordRefs)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();

    inputParameters.Add(new LongParameters ("Input Parameters",
                                            longParameterList));

    // The options begin at argv[2], not argv[1].
    inputParameters.Read(argc, argv, 2);

    // When requested, tell the bgzf file type not to look for the eof block.
    if(noeof)
    {
        BgzfFileType::setRequireEofBlock(false);
    }

    // The input file is mandatory.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "--in is a mandatory argument, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the input and read its header.
    SamFile samIn;
    samIn.OpenForRead(inFile);

    SamFileHeader samHeader;
    samIn.ReadHeader(samHeader);

    // List every reference entry of the header on stdout.
    const SamReferenceInfo& refInfo = samHeader.getReferenceInfo();
    int numReferences = refInfo.getNumEntries();

    for(int refIndex = 0; refIndex < numReferences; refIndex++)
    {
        std::cout << "Reference Index " << refIndex
                  << "; Name: " << refInfo.getReferenceName(refIndex)
                  << std::endl;
    }
    if(numReferences == 0)
    {
        // There is no reference info.
        std::cerr << "The header contains no reference information.\n";
    }

    // Without --printRecordRefs there is nothing more to do.
    if(!printRecordRefs)
    {
        return(SamStatus::SUCCESS);
    }

    // Sentinel id meaning that no record has been read yet.
    const int NO_RECORD_YET = -2;

    SamRecord samRecord;

    // Reference of the run of records currently being counted.
    std::string runName = "";
    int runID = NO_RECORD_YET;
    int runLength = 0;

    // Keep reading records until ReadRecord returns false.
    while(samIn.ReadRecord(samHeader, samRecord))
    {
        const char* recName = samRecord.getReferenceName();
        int recID = samRecord.getReferenceID();

        const bool sameRun =
            (strcmp(recName, runName.c_str()) == 0) && (recID == runID);
        if(sameRun)
        {
            ++runLength;
            continue;
        }

        // The reference changed: report the finished run, if there is one.
        if(runID != NO_RECORD_YET)
        {
            std::cout << "\tRef ID: " << runID
                      << "\tRef Name: " << runName
                      << "\tNumRecs: " << runLength
                      << std::endl;
        }

        // Start a new run containing just this record.
        runID = recID;
        runName = recName;
        runLength = 1;
    }

    // Report the final run.
    if(runID != NO_RECORD_YET)
    {
        std::cout << "\tRef ID: " << runID
                  << "\tRef Name: " << runName
                  << "\tNumRecs: " << runLength
                  << std::endl;
    }

    return(SamStatus::SUCCESS);
}