Beispiel #1
0
// close the BAM files
void BamMultiReader::Close(void) {
  
    // close all BAM readers and clean up pointers
    vector<pair<BamReader*, BamAlignment*> >::iterator readerIter = readers.begin();
    vector<pair<BamReader*, BamAlignment*> >::iterator readerEnd  = readers.end();
    for ( ; readerIter != readerEnd; ++readerIter) {
      
        BamReader* reader = (*readerIter).first;
        BamAlignment* alignment = (*readerIter).second;
        
        // close the reader
        if ( reader) reader->Close();  
        
        // delete reader pointer
        delete reader;
        reader = 0;

        // delete alignment pointer
        delete alignment;
        alignment = 0;
    }

    // clear out the container
    readers.clear();
}
Beispiel #2
0
void BedIntersectPE::IntersectBamPE(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs   = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    // track the previous and current sequence
    // names so that we can identify blocks of
    // alignments for a given read ID.
    string prevName, currName;
    prevName = currName = "";

    vector<BamAlignment> alignments;        // vector of BAM alignments for a given ID in a BAM file.
    alignments.reserve(100);

    _bedA->bedType = 10;                    // it's a full BEDPE given it's BAM

    // rip through the BAM file and convert each mapped entry to BEDPE
    BamAlignment bam1, bam2;
    while (reader.GetNextAlignment(bam1)) {
        // the alignment must be paired
        if (bam1.IsPaired() == true) {
            // grab the second alignment for the pair.
            reader.GetNextAlignment(bam2);

            // require that the alignments are from the same query
            if (bam1.Name == bam2.Name) {
                ProcessBamBlock(bam1, bam2, refs, writer);
            }
            else {
                cerr << "*****ERROR: -bedpe requires BAM to be sorted or grouped by query name. " << endl;
                exit(1);
            }
        }
    }
    // close up
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
Beispiel #3
0
/*
Description:
	Load all the bam into memory at one time if no parameters set, otherwise load the needed part of the bam.
	Save the parsed info into vector. 
*/
bool BamParse::parseAlignment(int chrom1, int chrom1_begin, int chrom2, int chrom2_end)
{
	BamReader reader;
    if ( !reader.Open(filename) ) {
        cerr << "Bamtools ERROR: could not open input BAM file: " << filename << endl;
        return false;
    }
		
	//check whether need to set a region.
	if(chrom1>-1 && chrom1_begin>-1 && chrom2>-1 && chrom2_end>-1)
	{
		this->loadIndex(reader);
		BamRegion br(chrom1,chrom1_begin,chrom2,chrom2_end);
		bool is_set=reader.SetRegion(br);
		if(is_set==false)
		{
			return false;//cannot set the region.
		}
	}

	//process input data
    BamAlignment al;   
	while ( reader.GetNextAlignment(al) )
	{
		if(al.Position<0) continue;

		BamAlignmentRecord* bar=new BamAlignmentRecord();
		setAlignmentRecord(al,bar);
		bam_aln_records.push_back(bar);
	}

	reader.Close();
	return true;
}
Beispiel #4
0
int main (int argc, char *argv[]) {
    
     string bamfiletopen = string(argv[1]);
     BamReader reader;

     if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
     }

    BamAlignment al;
    while ( reader.GetNextAlignment(al) ) {
	string reconstructedReference = reconstructRef(&al);
	cout<<al.QueryBases<<endl;
	cout<<reconstructedReference<<endl;

	
	pair< string, vector<int> >  reconP = reconstructRefWithPos(&al);
	for(unsigned int i=0;i<reconP.first.size();i++){
	    cout<<reconP.first[i]<<"\t"<<reconP.second[i]<<endl;
	}

    }
    reader.Close();

    return 0;
}
Beispiel #5
0
bool RevertTool::RevertToolPrivate::Run(void) {
  
    // opens the BAM file without checking for indexes
    BamReader reader;
    if ( !reader.Open(m_settings->InputFilename) ) {
        cerr << "Could not open input BAM file... quitting." << endl;
        return false;
    }

    // get BAM file metadata
    const string& headerText = reader.GetHeaderText();
    const RefVector& references = reader.GetReferenceData();
    
    // open writer
    BamWriter writer;
    bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression );
    if ( !writer.Open(m_settings->OutputFilename, headerText, references, writeUncompressed) ) {
        cerr << "Could not open " << m_settings->OutputFilename << " for writing." << endl;
        return false;
    }

    // plow through file, reverting alignments
    BamAlignment al;
    while ( reader.GetNextAlignment(al) ) {
        RevertAlignment(al);
        writer.SaveAlignment(al);
    }
    
    // clean and exit
    reader.Close();
    writer.Close();
    return true; 
}
Beispiel #6
0
/**
 * Main work method.  Reads the BAM file once and collects sorted information about
 * the 5' ends of both ends of each read (or just one end in the case of pairs).
 * Then makes a pass through those determining duplicates before re-reading the
 * input file and writing it out with duplication flags set correctly.
 */
int MarkDuplicates::runInternal() {
    
    ogeNameThread("am_MarkDuplicates");

    if(verbose)
        cerr << "Reading input file and constructing read end information." << endl;
    buildSortedReadEndLists();
    
    generateDuplicateIndexes();
    
    if(verbose)
        cerr << "Marking " << numDuplicateIndices << " records as duplicates." << endl;
    
    BamReader in;

    in.Open(getBufferFileName());

    // Now copy over the file while marking all the necessary indexes as duplicates
    long recordInFileIndex = 0;
    
    long written = 0;
    while (true) {
        BamAlignment * prec = in.GetNextAlignment();
        if(!prec)
            break;
        
        if (prec->IsPrimaryAlignment()) {
            if (duplicateIndexes.count(recordInFileIndex) == 1)
                prec->SetIsDuplicate(true);
            else
                prec->SetIsDuplicate(false);
        }
        recordInFileIndex++;
        
        if (removeDuplicates && prec->IsDuplicate()) {
            // do nothing
        }
        else {
            putOutputAlignment(prec);
            if (verbose && read_count && ++written % 100000 == 0) {
                cerr << "\rWritten " << written << " records (" << written * 100 / read_count <<"%)." << std::flush;
            }
        }
    }

    if (verbose && read_count)
        cerr << "\rWritten " << written << " records (" << written * 100 / read_count <<"%)." << endl;

    in.Close();

    remove(getBufferFileName().c_str());
    
    return 0;
}
Beispiel #7
0
void BedCoverage::CollectCoverageBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedCovFileIntoMap();

    // open the BAM file
    BamReader reader;
    reader.Open(bamFile);

    // get header & reference information
    string header = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // convert each aligned BAM entry to BED
    // and compute coverage on B
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        if (bam.IsMapped()) {
            // treat the BAM alignment as a single "block"
            if (_obeySplits == false) {
                // construct a new BED entry from the current BAM alignment.
                BED a;
                a.chrom  = refs.at(bam.RefID).RefName;
                a.start  = bam.Position;
                a.end    = bam.GetEndPosition(false, false);
                a.strand = "+";
                if (bam.IsReverseStrand()) a.strand = "-";

                _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly);
            }
            // split the BAM alignment into discrete blocks and
            // look for overlaps only within each block.
            else {
                // vec to store the discrete BED "blocks" from a
                bedVector bedBlocks;
                // since we are counting coverage, we do want to split blocks when a
                // deletion (D) CIGAR op is encountered (hence the true for the last parm)
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, false, true);
                // use countSplitHits to avoid over-counting each split chunk
                // as distinct read coverage.
                _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly);
            }
        }
    }
    // report the coverage (summary or histogram) for BED B.
    if (_countsOnly == true)
        ReportCounts();
    else 
        ReportCoverage();
    // close the BAM file
    reader.Close();
}
Beispiel #8
0
bool CoverageTool::CoverageToolPrivate::Run(void) {  
  
    // if output filename given
    ofstream outFile;
    if ( m_settings->HasOutputFile ) {
      
        // open output file stream
        outFile.open(m_settings->OutputFilename.c_str());
        if ( !outFile ) {
            cerr << "bamtools coverage ERROR: could not open " << m_settings->OutputFilename
                 << " for output" << endl;
            return false; 
        }
        
        // set m_out to file's streambuf
        m_out.rdbuf(outFile.rdbuf()); 
    } 
    
    //open our BAM reader
    BamReader reader;
    if ( !reader.Open(m_settings->InputBamFilename) ) {
        cerr << "bamtools coverage ERROR: could not open input BAM file: " << m_settings->InputBamFilename << endl;
        return false;
    }

    // retrieve references
    m_references = reader.GetReferenceData();
    
    // set up our output 'visitor'
    CoverageVisitor* cv = new CoverageVisitor(m_references, &m_out);
    
    // set up pileup engine with 'visitor'
    PileupEngine pileup;
    pileup.AddVisitor(cv);
    
    // process input data
    BamAlignment al;    
    while ( reader.GetNextAlignment(al) ) 
        pileup.AddAlignment(al);
    
    // clean up 
    reader.Close();
    if ( m_settings->HasOutputFile )
        outFile.close();
    delete cv;
    cv = 0;
    
    // return success
    return true;
}
Beispiel #9
0
int main (int argc, char *argv[]) {

     if( (argc== 1) ||
    	(argc== 2 && string(argv[1]) == "-h") ||
    	(argc== 2 && string(argv[1]) == "-help") ||
    	(argc== 2 && string(argv[1]) == "--help") ){
	 cout<<"Usage:setAsUnpaired [in bam] [outbam]"<<endl<<"this program takes flags all paired sequences as singles"<<endl;
    	return 1;
    }

     string bamfiletopen = string(argv[1]);
     string bamFileOUT   = string(argv[2]);

     BamReader reader;
     BamWriter writer;

     if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
     }
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();
    if ( !writer.Open(bamFileOUT,header,references) ) {
    	cerr << "Could not open output BAM file "<<bamFileOUT << endl;
    	return 1;
    }

    BamAlignment al;
 
    while ( reader.GetNextAlignment(al) ) {
	if(al.IsMapped()){
	    cerr << "Cannot yet handle mapped reads " << endl;
	    return 1;
	}

	
	al.SetIsPaired (false);
	
	writer.SaveAlignment(al);    

    } //while al

    reader.Close();
    writer.Close();

    return 0;
}
Beispiel #10
0
int IndexTool::Run(int argc, char* argv[]) {
  
    // parse command line arguments
    Options::Parse(argc, argv, 1);
    
    // open our BAM reader
    BamReader reader;
    reader.Open(m_settings->InputBamFilename);
    
    // create index for BAM file
    bool useDefaultIndex = !m_settings->IsUsingBamtoolsIndex;
    reader.CreateIndex(useDefaultIndex);
    
    // clean & exit
    reader.Close();
    return 0;
}
Beispiel #11
0
int main (int argc, char *argv[]) {

     if( (argc== 1) ||
    	(argc== 2 && string(argv[1]) == "-h") ||
    	(argc== 2 && string(argv[1]) == "-help") ||
    	(argc== 2 && string(argv[1]) == "--help") ){
	 cout<<"Usage:editDist [in bam]"<<endl<<"this program returns the NM field of all aligned reads"<<endl;
	 return 1;
     }

     string bamfiletopen = string(argv[1]);
     // cout<<bamfiletopen<<endl;
     BamReader reader;
     // cout<<"ok"<<endl;
     if ( !reader.Open(bamfiletopen) ) {
	 cerr << "Could not open input BAM files." << endl;
	 return 1;
     }

     BamAlignment al;
     // cout<<"ok"<<endl;
     while ( reader.GetNextAlignment(al) ) {
	 // cout<<al.Name<<endl;
	 if(!al.IsMapped())
	     continue;

	 if(al.HasTag("NM") ){
	     int editDist;
	     if(al.GetTag("NM",editDist) ){
		 cout<<editDist<<endl;
	     }else{
		 cerr<<"Cannot retrieve NM field for "<<al.Name<<endl;
		 return 1;
	     }
	 }else{
	     cerr<<"Warning: read "<<al.Name<<" is aligned but has no NM field"<<endl;
	 }

		    

     } //while al

     reader.Close();

     return 0;
}
Beispiel #12
0
int main (int argc, char** argv)
{

    // Print Commandline
    string ss(argv[0]);   // convert Char to String
    string commandline = "##Print Command line " + ss;

    int c;

    FastaReference* reference = NULL;
    int minbaseQ        = 10;   //default
    int windowlen       = 40;  //by default
    string regionstr;
    string RegionFile;
    string bamfile;
    bool STdin          = false;
    bool has_region     = false;
    bool has_regionFile = false;
    bool has_bamfile    = false;
    bool has_ref        = false;
    int ploidy         = 2;
    bool SetLowComplexityRegionSWGapExt = false;
    bool SetLowComplexityRegion = false;
   

    if (argc < 2)
    {
        printSummary(argv);
        exit(1);
    }

    while (true)
    {
        static struct option long_options[] =
        {
            {"help", no_argument, 0, 'h'},
            {"ploidy", required_argument, 0, 'p'},
            {"window-size", required_argument, 0, 'w'},
            {"reference", required_argument, 0, 'f'},
            {"min-base-quality", required_argument, 0,'q'},
            {"Region", required_argument, 0, 'R'},
            {"STdin", no_argument, 0, 's'},
            {"bam", required_argument, 0, 'b'},
            {"Repeat-Extgap", no_argument, 0, 'E'},
            {"LowCompex", no_argument, 0, 'l'},
            {0, 0, 0, 0}
        };

        int option_index = 0;

        c = getopt_long (argc, argv, "hslEf:q:w:s:r:R:p:b:", long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c)
        {
            case 'f':
                reference = new FastaReference(optarg); // will exit on open failure
                commandline = commandline + " -f " + optarg;
                has_ref = true;
                break;

            case 'b':
                has_bamfile = true;
                bamfile = optarg;
                commandline = commandline + " -b " + optarg;
                break;

            case 'r':
                regionstr = optarg;
                has_region = true;
                commandline = commandline + " -r " + optarg;
                break;

             case 'R':
                RegionFile = optarg;
                has_regionFile = true;
                commandline = commandline + " -R " + optarg;
                break;

            case 's':
                STdin = true;
                commandline = commandline + " -s ";
                break;
                
            case 'q':
                minbaseQ = atoi(optarg);
                commandline = commandline + " -q " + optarg;
                break;

            case 'w':
                windowlen = atoi(optarg);
                commandline = commandline + " -w " + optarg;
                break;

            case 'p':
                ploidy = atoi(optarg);
                commandline = commandline + " -p " + optarg;
                break;

            case 'E':
                SetLowComplexityRegionSWGapExt = true;
                commandline = commandline + " -E ";
                break;

            case 'l':
                SetLowComplexityRegion = true;
                commandline = commandline + " -l ";
                break;

            case 'h':
                printSummary(argv);
                commandline = commandline + " -h ";
                exit(0);
                break;

            case '?':
                printSummary(argv);
                exit(1);
                break;

              default:
                abort();
                break;
        }
    }

    //// Open Error log files
    ofstream cerrlog("bonsaiReport.txt");
    streambuf *cerrsave = std::cerr.rdbuf();

    // Redirect stream buffers
    if (cerrlog.is_open())
        cerr.rdbuf(cerrlog.rdbuf());

    cerr << commandline << endl;
    

    //Check for Reference Fasta sequence
    if (!has_ref)
    {
        cerr << "no FASTA reference provided, cannot realign" << endl;
        exit(1);
    }

    ////Check for reader
    BamReader reader;
    if (STdin == true)
    {
        if (!reader.Open("stdin"))
        {
            cerr << "could not open stdin bam for reading" << endl;
            cerr << reader.GetErrorString() << endl;
            reader.Close();
            printSummary(argv);
        }
    }
    else
    {
        if (has_bamfile == true)
        {
            if (!reader.Open(bamfile))
            {
                cerr << "ERROR: could not open bam files from stdin ... Aborting" << endl;
                cerr << reader.GetErrorString() << endl;
                reader.Close();
                printSummary(argv);
            }

            if ( !reader.LocateIndex() )
                reader.CreateIndex();
        }
        else
        {
            cerr << "--bam flag is set but no bamfile is provided... Aborting" << endl;
            reader.Close();
            printSummary(argv);
        }
    }

    //// Check Region Tags
    if ( (has_regionFile == true) && (has_region == true) )
    {
        cerr << "ERROR: You provide both region and has provide a Set Region List... Aborting" << endl;
        exit(1);
    }

    //// store the names of all the reference sequences in the BAM file
    vector<RefData> referencedata = reader.GetReferenceData();
   
    //// Store Region LIST
    vector<BamRegion> regionlist;
    if (has_region == true)
    {
        BamRegion region;
        ParseRegionString(regionstr, reader, region);
        regionlist.push_back(region);
    }
    else if (has_regionFile == true)
    {
        ifstream RG(RegionFile.c_str(), ios_base::in);
        string line;
        while(getline(RG,line))
        {
            BamRegion region;
            ParseRegionString(line, reader, region);
            regionlist.push_back(region);
        }
        RG.close();
    }
    else if ( (has_regionFile == false) && (has_region == false) )
    {
        for (int i= 0; i < (int)referencedata.size(); i++)
        {
            string regionstr = referencedata.at(i).RefName;
            BamRegion region;
            ParseRegionString(regionstr, reader, region);
            if (!reader.SetRegion(region)) // Bam region will get [0,101) = 0 to 100 => [closed, half-opened)
            {
                cerr << "ERROR: set region " << regionstr << " failed. Check that REGION describes a valid range... Aborting" << endl;
                reader.Close();
                exit(1);
            }
            else
                regionlist.push_back(region);
        }
    }

    //// 
    BamWriter writer;
    if (!writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData()))
    {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }

    //// Smallest start position and Largest end position for Req Seq
    vector<RefData>::iterator refdataIter = referencedata.begin();
    vector<BamRegion>::iterator regionListIter = regionlist.begin();
   

    // CLASS
    RealignFunctionsClass RealignFunction;

    map<int, string> RefIDRedName;
    vector<SalRealignInfo> AlGroups;
    multimap<int, BamAlignment> SortRealignedAlignmentsMultimap;

    int refid               = 0;
    BamAlignment alignment;
    bool IsNextAlignment = reader.GetNextAlignment(alignment);
    //cerr << "   " << alignment.Name << " Chr  " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl;

    int windowrealigned     = 0;
    int TotalWindowDetected = 0;
    int TotalReadsAligned   = 0;
    int TotalWindow         = 0;
    int TotalReads          = 0;

    while (refdataIter != referencedata.end() )
    {
        string refname = refdataIter->RefName;
        RefIDRedName[refid] = refname;
        int reflength = refdataIter->RefLength;
        int winstartpos, winendpos;
        int AllowableBasesInWindow = 1;
        bool nextChrName = false;

        cerr << "##HeaderINFO: RefID = " << refdataIter->RefName << "\t" << "RefLen = " << reflength << endl;
        
        while (nextChrName == false )
        {
            vector<int> minmaxRefSeqPos;
            bool IsPassDetectorNoRealignment = false;
            minmaxRefSeqPos.push_back(-1);
            minmaxRefSeqPos.push_back(0);
            //cerr << " region: " << (*regionListIter).LeftRefID << " : " << (*regionListIter).LeftPosition << " .. " << (*regionListIter).RightPosition << endl;
            if ((refid == (int)referencedata.size() - 1) && ((*regionListIter).LeftRefID == refid) && ((has_region==true) || (has_regionFile==true)) )
            {
                ////
                if ( (has_region == true) || (has_regionFile == true) )
                {                    
                    winstartpos = (*regionListIter).LeftPosition;
                    winendpos   = winstartpos + windowlen - 1;
                    reflength = (*regionListIter).RightPosition;
                    if (reflength < winendpos)
                        reflength = winendpos;
                                       
                    // Get Next Alignment First
                    if ( (refid == alignment.RefID) && (winstartpos == (*regionListIter).LeftPosition) && (IsNextAlignment == false) )
                        IsNextAlignment = reader.GetNextAlignment(alignment);
                }
                else if (has_region == false)
                {
                    winstartpos = 0;
                    winendpos   = winstartpos + windowlen - 1;

                    // Get Next Alignment First
                    if ( (refid == alignment.RefID) && (winstartpos == 0) && (IsNextAlignment == false) )
                        IsNextAlignment = reader.GetNextAlignment(alignment);
                }
                //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos;
                //cerr << "   " << alignment.Name << " Chr  " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl;

                ////
                while ((winstartpos < reflength))
                {
                    //// Check window end position
                    if (winendpos > reflength)
                        winendpos = reflength;

                    // Reinitialized
                    unsigned int NewReadMappedcount = 0;
                
                    //// Save and Erase alignments that are outside of window (Deque?)
                    if (!AlGroups.empty())
                    {
                        minmaxRefSeqPos.at(0) = -1;
                        minmaxRefSeqPos.at(1) = 0;

                        //cerr << "#Start: Keep alignments with start position exceed the right end of the window/Region " << endl;
                        vector<SalRealignInfo>::iterator Iter = AlGroups.begin();

                        while (Iter != AlGroups.end())
                        {
                            // Erase alignment s
                            if ((*Iter).al.GetEndPosition() < winstartpos)
                            {
                                //cerr << "  ToWrite: " << (*Iter).second.size() << " ; " << (*Iter).al.Name << " ; " << (*Iter).al.Position << " < " << winstartpos << " : " << (*Iter).al.GetEndPosition() << endl;
                                SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > ((*Iter).al.Position, (*Iter).al));
                                AlGroups.erase(Iter);

                                //cerr << "  ToWrite: DONE " << endl;
                            } 
                            else
                            {
                                string referenceSequence = reference->getSubSequence(RefIDRedName[(*Iter).al.RefID], (*Iter).al.Position, 2*(*Iter).al.Length);
                            
                                if ((*Iter).HasRealign == true )
                                {
                                    (*Iter).currentReadPosition = 0;
                                    (*Iter).currentGenomeSeqPosition = 0;
                                    (*Iter).currentAlPosition = (*Iter).al.Position;
                                    (*Iter).cigarindex = 0;
                                }

                                (*Iter).CigarSoftclippingLength = 0;
                                SalRealignInfo talr = (*Iter);
                                //cerr << "  ToKEEP: " << (*Iter).al.Name << " ; " << (*Iter).al.Position << " < " << winstartpos << " : " << (*Iter).al.GetEndPosition() << endl;
                                RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, talr, Iter, (*Iter).al, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, false);
                           
                                ++Iter; //Increment iterator
                            }
                        }
                    }
                

                    // Write Sorted Alignments that are outside of window
                    //cerr << "SortRealignedAlignmentsMultimap: " << SortRealignedAlignmentsMultimap.size() << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << endl;
                    if (!SortRealignedAlignmentsMultimap.empty()) // && (winWrite < winstartpos ) )
                    {
                        //cerr << "#Start: Write alignments and delete alignments with start position exceed the right end of the window/Region " << endl;
                        multimap<int, BamAlignment>::iterator sraIter = SortRealignedAlignmentsMultimap.begin();

                        while (sraIter != SortRealignedAlignmentsMultimap.end()) 
                        {
                            //cerr << " (*sraIter).first= " <<  (*sraIter).first << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << " winstartpos - ((windowlen - 1)*0.9)= " << winstartpos - ((windowlen - 1)*0.9) << endl;
                            if (((float) (*sraIter).first < floor((float) (winstartpos - ((windowlen - 1)*0.9)))) && ((minmaxRefSeqPos.at(0) > 0) && ((*sraIter).first < minmaxRefSeqPos.at(0)))) {
                                //writer.SaveAlignment((*sraIter).second);  // Why sometimes, it doesn't work ?????
                                if (!writer.SaveAlignment((*sraIter).second))
                                    cerr << writer.GetErrorString() << endl;

                                SortRealignedAlignmentsMultimap.erase(sraIter++);
                            } else {
                                ++sraIter;
                            }
                    }
                    //cerr << "#Done: Write alignments and delete alignments with start position exceed the right end of the window/Region " << endl;
                    }

                    //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos;
                    //cerr << "   " << alignment.Name << " Chr  " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl;
                    //cerr <<  ": " << alignment.RefID << " :" << RefIDRedName[alignment.RefID] << " : " << RefIDRedName[alignment.RefID] << endl;

                    //cerr << "Start: Gather aligmenets that lie (fully or partially) within the window frame and group INDELs if there are ... " << endl;
                    // Gather Reads within a window frame
                  
                    while ((IsNextAlignment) && (refid == alignment.RefID)) // Neeed more conditions
                    {
                        if (SetLowComplexityRegion == true) 
                        {
                            string sequenceInWindow = reference->getSubSequence(RefIDRedName[alignment.RefID], winstartpos, (winendpos-winstartpos+1) );

                            if (IsWindowInRepeatRegion(sequenceInWindow) == true)
                            {
                                if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 0)
                                {
                                    TotalReads++;
                                    if (alignment.IsMapped())
                                    {
                                        string referenceSequence = reference->getSubSequence(RefIDRedName[alignment.RefID], alignment.Position, 2*alignment.Length);
 
                                        vector<SalRealignInfo>::iterator tIter;
                                        SalRealignInfo alr;
                                        alr.al = alignment;
                                        alr.currentReadPosition = 0;
                                        alr.currentGenomeSeqPosition = 0;
                                        alr.currentAlPosition = alignment.Position;
                                        alr.cigarindex = 0;
                                        alr.HasRealign = false;
                                        alr.CigarSoftclippingLength = 0;

                                        string str = "ZZZZZZZZZZZZZZZZZ";
                                        if (alignment.Name.find(str) != string::npos) {
                                            stringstream cigar;
                                            for (vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin(); cigarIter != alignment.CigarData.end(); ++cigarIter)
                                                cigar << cigarIter->Length << cigarIter->Type;

                                            string cigarstr = cigar.str();
                                            cerr << "   TRACKING: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " cigar: " << cigarstr << endl;
                                        }

                                        RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, alr, tIter, alignment, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, true);
                                        NewReadMappedcount++;
                                    } 
                                    else
                                    {
                                        SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                        cerr << "UNmapped : " << alignment.Name << endl;
                                    }
                                } 
                                else if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 1)
                                {
                                    SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                }
                                else
                                    break;
                            } else {
                                if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) < 2)
                                    SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                else
                                    break;
                            }
                        }
                        else // (SetLowComplexityRegion == false)
                        {
                            if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 0)
                            {
                                TotalReads++;
                                if (alignment.IsMapped())
                                {
                                    string referenceSequence = reference->getSubSequence(RefIDRedName[alignment.RefID], alignment.Position, 2 * alignment.Length);

                                    vector<SalRealignInfo>::iterator tIter;
                                    SalRealignInfo alr;
                                    alr.al = alignment;
                                    alr.currentReadPosition = 0;
                                    alr.currentGenomeSeqPosition = 0;
                                    alr.currentAlPosition = alignment.Position;
                                    alr.cigarindex = 0;
                                    alr.HasRealign = false;
                                    alr.CigarSoftclippingLength = 0;

                                    string str = "ZZZZZZZZZZZZZZZZZ";
                                    if (alignment.Name.find(str) != string::npos)
                                    {
                                        stringstream cigar;
                                        for (vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin(); cigarIter != alignment.CigarData.end(); ++cigarIter)
                                            cigar << cigarIter->Length << cigarIter->Type;

                                        string cigarstr = cigar.str();
                                        cerr << "   TRACKING: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " cigar: " << cigarstr << endl;
                                    }

                                    RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, alr, tIter, alignment, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, true);

                                    //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos;
                                    //cerr << "   INDEL: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " Length: " << alignment.Length << " CIGARstr: " << cigarstr << endl;
                                    NewReadMappedcount++;
                                } 
                                else
                                {
                                    SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                    cerr << "UNmapped : " << alignment.Name << endl;
                                }
                            }
                            else if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 1) {
                                SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                            }
                            else
                                break;
                        }

                        ////Get next alignment
                        IsNextAlignment = reader.GetNextAlignment(alignment);
                    }

                   //cerr << "Done: Gather aligmenets that lie (fully or partially) within the window frame and group INDELs if there are ... " << endl;

                    //// Detector Corner
                    bool ToRealign = MeetIndelDetectorThresholdv(AlGroups);
                    cerr << "MeetIndelDetectorThresholdv(AlGroups).size()= " << AlGroups.size() << endl;
                    
                    // **************
                    if (ToRealign)
                    {
                        //cerr << "  ToRealign: " << refdataIter->RefName << "\t" << reflength << "\t" << winstartpos << "\t" << winendpos << "\t" << AlGroups.size() << endl;
                        //cerr << "             minmaxRefSeqPos.at(1)= " << minmaxRefSeqPos.at(1) << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << endl;

                        ////// Perform Realign routines
                        int TotalAlR = 0; // Total number of alignments to be realigned
                        int NumAlR = 0; // Now many alignments are aligned
                        TotalWindowDetected++;

                        cerr << "#Start: Meet Threshold, Realigning ... " << endl;

                        if (minmaxRefSeqPos.at(1) < winendpos)
                            minmaxRefSeqPos.at(1) = winendpos;

                        if (minmaxRefSeqPos.at(0) > winstartpos)
                            minmaxRefSeqPos.at(0) = winstartpos;

                        bool IsToRealign = RealignFunction.PruningByNaiveSelectionProcedureAndConstructHaplotypes2(winstartpos, winendpos, refid, refname, minmaxRefSeqPos, reference);

                        if (IsToRealign == true)
                        {
                            RealignFunction.SelectHaplotypeCandidates_SmithWatermanBSv(AlGroups, minmaxRefSeqPos, SetLowComplexityRegionSWGapExt);

                            minmaxRefSeqPos.at(0) = -1;
                            minmaxRefSeqPos.at(1) = 0;

                            int nextwinstartpos = winendpos + 1;
                            int nextwinendpos = winstartpos + windowlen - 1;
                            if (nextwinendpos > reflength)
                                nextwinendpos = reflength;

                            //cerr <<  "   Before Realign : " << SortRealignedAlignmentsMultimap.size() << endl;
                            RealignFunction.AdjustCigarsWRTChosenMultipleHaplotypesAndPrepareAlignmentsTobeWrittenOut(AlGroups, SortRealignedAlignmentsMultimap, reference, RefIDRedName, minmaxRefSeqPos, nextwinstartpos, nextwinendpos, minbaseQ, TotalAlR, NumAlR, ploidy);
                            IsPassDetectorNoRealignment = false; // Set flag to false to deactivate write functions

                            //cerr <<  "   After Realign : " << SortRealignedAlignmentsMultimap.size() << endl;

                            TotalReadsAligned += NumAlR;

                            if (NumAlR > 0) // Realignment done
                                windowrealigned++;
                        } else


                        cerr << "#Done: Meet Threshold, Realigning ... " << endl;
                    }


                    if (NewReadMappedcount > 0)
                        TotalWindow++;

                    RealignFunction.Clear();

                    //// Move the window frame
                    winstartpos = winendpos + 1;
                    winendpos = winstartpos + windowlen - 1;
                }

                //// Save and Erase remaining alignments that are outside of window (Deque?)
                if ((!AlGroups.empty())) {
                    cerr << "#Start: Write Remaining alignments and delete all alignments" << endl;

                    for (vector<SalRealignInfo>::iterator Iter = AlGroups.begin(); Iter != AlGroups.end(); ++Iter) {
                        //cerr << "    Remain alignment start: " << (*Iter).al.Name << " " << Iter->al.Position  << " < " << winstartpos << "  " << winendpos << endl;
                        SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > ((*Iter).al.Position, (*Iter).al));
                    }

                    cerr << "#Done: Write Remaining alignments and delete all alignments" << endl;
                }

                AlGroups.clear();


                // Write Sorted remaining Alignments that are outside of window
                if (!SortRealignedAlignmentsMultimap.empty())
                {
                    for (multimap<int, BamAlignment>::iterator sraIter = SortRealignedAlignmentsMultimap.begin(); sraIter != SortRealignedAlignmentsMultimap.end(); ++sraIter)
                    {
                        //writer.SaveAlignment((*sraIter).second);
                        if (!writer.SaveAlignment((*sraIter).second))
                            cerr << writer.GetErrorString() << endl;
                    }
                    SortRealignedAlignmentsMultimap.clear();
                }

            }

            ++regionListIter;
            if ((*regionListIter).LeftRefID > refid)
                nextChrName = true;
        }

        //// If End of the chromosome position
        //// increament iterator
        ++refdataIter;
        ++refid;
    }


    reader.Close();
    writer.Close();

    cerr << "##-Completed- " << endl;
    cerr << " Total Reads processed =  " << TotalReads << endl;
    cerr << " Total Reads Aligned =    " << TotalReadsAligned << endl;
    cerr << " Total Window processed = " << TotalWindow << endl;
    cerr << " Total Window Detected =  " << TotalWindowDetected << endl;
    cerr << " Total Windows Aligned =  " << windowrealigned << endl;


    // Restore cerr's stream buffer before terminating
    if (cerrlog.is_open())
        cerr.rdbuf(cerrsave);

    commandline.clear();
    return 0;
}
Beispiel #13
0
//{{{bool sort_inter_chrom_bam(string in_file_name,
bool sort_inter_chrom_bam(string in_file_name,
						  string out_file_name)
{
    // open input BAM file
    BamReader reader;
    if ( !reader.Open(in_file_name) ) {
        cerr << "sort ERROR: could not open " << 
			in_file_name << " for reading... Aborting." << endl;
        return false;
    }

    SamHeader header = reader.GetHeader();
    if ( !header.HasVersion() )
        header.Version = Constants::SAM_CURRENT_VERSION;

    string header_text = header.ToString();
    RefVector ref = reader.GetReferenceData();

    // set up alignments buffer
    BamAlignment al;
    vector<BamAlignment> buffer;
    buffer.reserve( (size_t)(SORT_DEFAULT_MAX_BUFFER_COUNT*1.1) );
    bool bufferFull = false;

	
    int buff_count = 0;
    // iterate through file
    while ( reader.GetNextAlignment(al)) {

        // check buffer's usage
        bufferFull = ( buffer.size() >= SORT_DEFAULT_MAX_BUFFER_COUNT );

        // store alignments until buffer is "full"
        if ( !bufferFull )
            buffer.push_back(al);
        // if buffer is "full"
        else {
            // so create a sorted temp file with current buffer contents
            // then push "al" into fresh buffer
            create_sorted_temp_file(buffer,
                                    out_file_name,
                                    buff_count,
                                    header_text,
                                    ref);
                                    ++buff_count;
            buffer.push_back(al);
        }
    }

    // handle any leftover buffer contents
    if ( !buffer.empty() ) {
        create_sorted_temp_file(buffer,
                                out_file_name,
                                buff_count,
                                header_text,
                                ref);

        ++buff_count;
    }

    reader.Close();

    return merge_sorted_files(out_file_name, buff_count, header_text, ref);

/*
	for (int i = 0; i < buff_count; ++i) {
    	stringstream temp_name;
    	temp_name << out_file_name << i;
	}
*/
}
Beispiel #14
0
void BedIntersectPE::IntersectBamPE(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs   = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    // track the previous and current sequence
    // names so that we can identify blocks of
    // alignments for a given read ID.
    string prevName, currName;
    prevName = currName = "";

    vector<BamAlignment> alignments;        // vector of BAM alignments for a given ID in a BAM file.
    alignments.reserve(100);

    _bedA->bedType = 10;                    // it's a full BEDPE given it's BAM

    // rip through the BAM file and convert each mapped entry to BEDPE
    BamAlignment bam1, bam2;
    while (reader.GetNextAlignment(bam1)) {
        reader.GetNextAlignment(bam2);        
        if (bam1.Name != bam2.Name) {
            while (bam1.Name != bam2.Name)
            {
                if (bam1.IsPaired()) 
                {
                    cerr << "*****WARNING: Query " << bam1.Name
                         << " is marked as paired, but it's mate does not occur"
                         << " next to it in your BAM file.  Skipping. " << endl;
                }
                bam1 = bam2;
                reader.GetNextAlignment(bam2);
            }
        }
        else if (bam1.IsPaired() && bam1.IsPaired()) {
            ProcessBamBlock(bam1, bam2, refs, writer);
        }
    }
    // close up
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
Beispiel #15
0
int main (int argc, char *argv[]) {


    bool produceUnCompressedBAM=false;
    bool verbose=false;
    bool ancientDNA=false;
    bool keepOrig=false;

    string adapter_F=options_adapter_F_BAM;
    string adapter_S=options_adapter_S_BAM;
    string adapter_chimera=options_adapter_chimera_BAM;
    string key="";
    bool allowMissing=false;
    int trimCutoff=1;

    bool allowAligned=false;
    bool printLog=false;
    string logFileName;

    BamReader reader;
    BamWriter writer;

    string bamFile;
    string bamFileOUT="";

    string key1;
    string key2;
    
    bool useDist=false;
    double location=-1.0;
    double scale   =-1.0;

    bool fastqFormat=false;
    string fastqfile1   = "";
    string fastqfile2   = "";
    string fastqoutfile = "";
    bool singleEndModeFQ=true;

    const string usage=string(string(argv[0])+
			      
			      " [options] BAMfile"+"\n"+
			      "\nThis program takes an unaligned BAM where mates are consecutive\nor fastq files and trims and merges reads\n"+

			      "\n\tYou can specify a unaligned bam file or one or two fastq :\n"+			      
			      "\t\t"+"-fq1" +"\t\t"+"First fastq"+"\n"+
			      "\t\t"+"-fq2" +"\t\t"+"Second  fastq file (for paired-end)"+"\n"+
			      "\t\t"+"-fqo" +"\t\t"+"Output fastq prefix"+"\n\n"+
			      //"\t"+"-p , --PIPE"+"\n\t\t"+"Read BAM from and write it to PIPE"+"\n"+
			      "\t"+"-o , --outfile" +"\t\t"+"Output (BAM format)."+"\n"+


			      "\t"+"-u            " +"\t\t"+"Produce uncompressed bam (good for pipe)"+"\n"+

			      //	"\t"+" , --outprefix" +"\n\t\t"+"Prefix for output files (default '"+outprefix+"')."+"\n"+
			      //"\t"+" , --SAM" +"\n\t\t"+"Output SAM not BAM."+"\n"+
			      "\t"+"--aligned" +"\t\t"+"Allow reads to be aligned (default "+boolStringify(allowAligned)+")"+"\n"+
			      "\t"+"-v , --verbose" +"\t\t"+"Turn all messages on (default "+boolStringify(verbose)+")"+"\n"+
			      "\t"+"--log [log file]" +"\t"+"Print a tally of merged reads to this log file (default only to stderr)"+"\n"+
			      
			      "\n\t"+"Paired End merging/Single Read trimming  options"+"\n"+
			      "\t\t"+"You can specify either:"+"\n"+
			      "\t\t\t"+"--ancientdna"+"\t\t\t"+"ancient DNA (default "+boolStringify(ancientDNA)+")"+"\n"+
			      "\t\t"+"            "+"\t\t\t\t"+"this allows for partial overlap"+"\n"+
			      "\n\t\t"+"or if you know your size length distribution:"+"\n"+
			      "\t\t\t"+"--loc"+"\t\t\t\t"+"Location for lognormal dist. (default none)"+"\n"+
			      "\t\t\t"+"--scale"+"\t\t\t\t"+"Scale for lognormal dist. (default none)"+"\n"+
			      //			      "\t\t\t\t\t\t\tGood for merging ancient DNA reads into a single sequence\n\n"
			      "\n\t\t"+"--keepOrig"+"\t\t\t\t"+"Write original reads if they are trimmed or merged  (default "+boolStringify(keepOrig)+")"+"\n"+
			      "\t\t\t\t\t\t\tSuch reads will be marked as PCR duplicates\n\n"
			      "\t\t"+"-f , --adapterFirstRead" +"\t\t\t"+"Adapter that is observed after the forward read (def. Multiplex: "+options_adapter_F_BAM.substr(0,30)+")"+"\n"+
			      "\t\t"+"-s , --adapterSecondRead" +"\t\t"+"Adapter that is observed after the reverse read (def. Multiplex: "+options_adapter_S_BAM.substr(0,30)+")"+"\n"+
			      "\t\t"+"-c , --FirstReadChimeraFilter" +"\t\t"+"If the forward read looks like this sequence, the cluster is filtered out.\n\t\t\t\t\t\t\tProvide several sequences separated by comma (def. Multiplex: "+options_adapter_chimera_BAM.substr(0,30)+")"+"\n"+
			      "\t\t"+"-k , --key"+"\t\t\t\t"+"Key sequence with which each sequence starts. Comma separate for forward and reverse reads. (default '"+key+"')"+"\n"+
			      "\t\t"+"-i , --allowMissing"+"\t\t\t"+"Allow one base in one key to be missing or wrong. (default "+boolStringify(allowMissing)+")"+"\n"+
			      "\t\t"+"-t , --trimCutoff"+"\t\t\t"+"Lowest number of adapter bases to be observed for single Read trimming (default "+stringify(trimCutoff)+")");

    if( (argc== 1) ||
    	(argc== 2 && string(argv[1]) == "-h") ||
    	(argc== 2 && string(argv[1]) == "-help") ||
    	(argc== 2 && string(argv[1]) == "--help") ){
    	cout<<"Usage:"<<endl;
    	cout<<""<<endl;
    	cout<<usage<<endl;
    	return 1;
    }

    

    for(int i=1;i<(argc-1);i++){ //all but the last arg

	if(strcmp(argv[i],"-fq1") == 0 ){
	    fastqfile1=string(argv[i+1]);
	    fastqFormat=true;
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-fq2") == 0 ){
	    fastqfile2=string(argv[i+1]);
	    fastqFormat=true;
	    singleEndModeFQ=false;
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-fqo") == 0 ){
	    fastqoutfile=string(argv[i+1]);
	    fastqFormat=true;
	    i++;
	    continue;
	}




	if(strcmp(argv[i],"--log") == 0 ){
	    logFileName =string(argv[i+1]);
	    printLog=true;
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-p") == 0 || strcmp(argv[i],"--PIPE") == 0 ){
	    cerr<<"This version no longer works with pipe, exiting"<<endl;
	    return 1;	    
	}

	if(strcmp(argv[i],"-u") == 0  ){
	    produceUnCompressedBAM=true;
	    continue;
	}

	if(strcmp(argv[i],"--aligned") == 0  ){
	    allowAligned=true;
	    continue;
	}



	if(strcmp(argv[i],"-o") == 0 || strcmp(argv[i],"--outfile") == 0 ){
	    bamFileOUT =string(argv[i+1]);
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-v") == 0 || strcmp(argv[i],"--verbose") == 0 ){
	    verbose=true;
	    continue;
	}

	if(strcmp(argv[i],"--ancientdna") == 0 ){
	    ancientDNA=true;
	    continue;
	}

	if(strcmp(argv[i],"--keepOrig") == 0 ){
	    keepOrig=true;
	    continue;
	}

	if(strcmp(argv[i],"--loc") == 0 ){
	    location =destringify<double>(argv[i+1]);
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"--scale") == 0 ){
	    scale =destringify<double>(argv[i+1]);
	    i++;
	    continue;
	}



	if(strcmp(argv[i],"-f") == 0 || strcmp(argv[i],"--adapterFirstRead") == 0 ){
	    adapter_F =string(argv[i+1]);
	    i++;
	    continue;
	}


	if(strcmp(argv[i],"-s") == 0 || strcmp(argv[i],"--adapterSecondRead") == 0 ){
	    adapter_S =string(argv[i+1]);
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-c") == 0 || strcmp(argv[i],"--FirstReadChimeraFilter") == 0 ){
	    adapter_chimera =string(argv[i+1]);
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-k") == 0 || strcmp(argv[i],"--keys") == 0 ){
	    key =string(argv[i+1]);
	    i++;
	    continue;
	}
	

	if(strcmp(argv[i],"-i") == 0 || strcmp(argv[i],"--allowMissing") == 0 ){
	    allowMissing=true;
	    continue;
	}

	if(strcmp(argv[i],"-t") == 0 || strcmp(argv[i],"--trimCutoff") == 0 ){
	    trimCutoff=atoi(argv[i+1]);
	    i++;
	    continue;
	}
	
	cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl;
	return 1;	    
    }

    bamFile=argv[argc-1];

    if( (location != -1.0 && scale == -1.0) ||
	(location == -1.0 && scale != -1.0) ){
	cerr<<"Cannot specify --location without specifying --scale"<<endl;
	return 1;	    
    }
	
    if( (location != -1.0 && scale != -1.0) ){
	useDist=true;
	    
	if(ancientDNA){
	    cerr<<"Cannot specify --location/--scale and --ancientDNA"<<endl;
	    return 1;	    
	}
    }
    
    MergeTrimReads mtr (adapter_F,adapter_S,adapter_chimera,
			key1,key2,
			trimCutoff,allowMissing,ancientDNA,location,scale,useDist);

    fqwriters onereadgroup;

    if(fastqFormat){
	
	if( bamFileOUT != ""  || produceUnCompressedBAM || allowAligned){
	    cerr<<"ERROR : Cannot specify options like -o, -u or --allowAligned for fastq"<<endl;
	    return 1;
	}

	if(fastqfile1 == ""){
	    cerr<<"ERROR : Must specify as least the first file for fastq"<<endl;
	    return 1;	    
	}



	FastQParser * fqp1;
	FastQParser * fqp2;

	if(singleEndModeFQ){
	    fqp1 = new FastQParser (fastqfile1);

	    string outdirs   = fastqoutfile+".fq.gz";
	    string outdirsf  = fastqoutfile+".fail.fq.gz";

	    onereadgroup.single.open(outdirs.c_str(), ios::out);
	    onereadgroup.singlef.open(outdirsf.c_str(), ios::out);

	    if(!onereadgroup.single.good()){       cerr<<"Cannot write to file "<<outdirs<<endl; return 1; }
	    if(!onereadgroup.singlef.good()){      cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; }

	    
	}else{
	    fqp1 = new FastQParser (fastqfile1);
	    fqp2 = new FastQParser (fastqfile2);

	    string outdirs   = fastqoutfile+".fq.gz";
	    string outdir1   = fastqoutfile+"_r1.fq.gz";
	    string outdir2   = fastqoutfile+"_r2.fq.gz";

	    string outdirsf  = fastqoutfile+".fail.fq.gz";
	    string outdir1f  = fastqoutfile+"_r1.fail.fq.gz";
	    string outdir2f  = fastqoutfile+"_r2.fail.fq.gz";

	    onereadgroup.single.open(outdirs.c_str(), ios::out);
	    onereadgroup.pairr1.open(outdir1.c_str(), ios::out);
	    onereadgroup.pairr2.open(outdir2.c_str(), ios::out);

	    onereadgroup.singlef.open(outdirsf.c_str(), ios::out);
	    onereadgroup.pairr1f.open(outdir1f.c_str(), ios::out);
	    onereadgroup.pairr2f.open(outdir2f.c_str(), ios::out);

	    if(!onereadgroup.single.good()){       cerr<<"Cannot write to file "<<outdirs<<endl; return 1; }
	    if(!onereadgroup.pairr1.good()){       cerr<<"Cannot write to file "<<outdir1<<endl; return 1; }
	    if(!onereadgroup.pairr2.good()){       cerr<<"Cannot write to file "<<outdir2<<endl; return 1; }
	    
	    if(!onereadgroup.singlef.good()){      cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; }
	    if(!onereadgroup.pairr1f.good()){      cerr<<"Cannot write to file "<<outdir1f<<endl; return 1; }
	    if(!onereadgroup.pairr2f.good()){      cerr<<"Cannot write to file "<<outdir2f<<endl; return 1; }
	    
	}


	unsigned int totalSeqs=0;
	while(fqp1->hasData()){

	    FastQObj * fo1=fqp1->getData();
	    vector<string> def1=allTokens( *(fo1->getID()), ' '  );
	    string def1s=def1[0];
	

	    FastQObj * fo2;
	    string def2s;
	    string ext2s;

	    if(!singleEndModeFQ){
		if(!fqp2->hasData()){
		    cerr << "ERROR: Discrepency between fastq files at record " <<  *(fo1->getID()) <<endl;
		    return 1;
		}

		fo2=fqp2->getData();
		vector<string> def2=allTokens( *(fo2->getID()), ' ' );
		def2s=def2[0];




		if(strEndsWith(def1s,"/1")){
		    def1s=def1s.substr(0,def1s.size()-2);
		}
		if(strEndsWith(def2s,"/2")){
		    def2s=def2s.substr(0,def2s.size()-2);
		}

		if(strBeginsWith(def1s,"@")){
		    def1s=def1s.substr(1,def1s.size()-1);
		}
		if(strBeginsWith(def2s,"@")){
		    def2s=def2s.substr(1,def2s.size()-1);
		}


		if(def1s != def2s){
		    cerr << "ERROR: Discrepency between fastq files, different names " << *(fo1->getID()) <<" and "<< *(fo2->getID()) <<endl;
		    return 1;
		}

		merged result=	mtr.process_PE(*(fo1->getSeq()),*(fo1->getQual()),
					       *(fo2->getSeq()),*(fo2->getQual()));

		mtr.incrementCountall();

		if(result.code != ' '){ //keys or chimeras

		    if(result.code == 'K'){
			mtr.incrementCountfkey();
		    }else{
			if(result.code == 'D'){
			    mtr.incrementCountchimera();
			}else{
			    cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl;
			    exit(1);
			}
		    }
			
		    onereadgroup.pairr2f<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl;
		    onereadgroup.pairr1f<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
		    continue;

		}else{
		        if(result.sequence != ""){ //new sequence			    
			    onereadgroup.single<<"@"<<def1s<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl;    	    

			    if( result.sequence.length() > max(fo1->getSeq()->length(),fo2->getSeq()->length()) ){
				mtr.incrementCountmergedoverlap();
			    }else{
				mtr.incrementCountmerged();			  
			    }

			}else{ //keep as is
			    mtr.incrementCountnothing();

			    onereadgroup.pairr2<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl;
			    onereadgroup.pairr1<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
			    

			}
		}

	    }else{
		
		
		merged result=mtr.process_SR(*(fo1->getSeq()),*(fo1->getQual()));
		mtr.incrementCountall();

		if(result.code != ' '){ //either chimera or missing key

		    if(result.code == 'K'){
			mtr.incrementCountfkey();
		    }else{
			if(result.code == 'D'){
			    mtr.incrementCountchimera();
			}else{
			    cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl;
			    exit(1);
			}
		    }

		    onereadgroup.singlef<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
		    continue;
		}

		if(result.sequence != ""){ //new sequence
		    mtr.incrementCounttrimmed();
		    onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl;
		}else{
		    mtr.incrementCountnothing();
		    onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
		}

	    }
	


	    totalSeqs++;
	}
    
	delete fqp1;
	if(!singleEndModeFQ){
	    delete fqp2;
	}

	if(singleEndModeFQ){

	    onereadgroup.single.close();
	    onereadgroup.singlef.close();
	    
	}else{
	    onereadgroup.single.close();
	    onereadgroup.pairr1.close();
	    onereadgroup.pairr2.close();
	    
	    onereadgroup.singlef.close();
	    onereadgroup.pairr1f.close();
	    onereadgroup.pairr2f.close();
	}
    
	//fastq
    }else{
	//else BAM


	//  initMerge();
	//     set_adapter_sequences(adapter_F,
	// 			  adapter_S,
	// 			  adapter_chimera);
	//     set_options(trimCutoff,allowMissing,mergeoverlap);
	if(key != ""){
	    size_t found=key.find(",");
	    if (found == string::npos){ //single end reads
		key1=key;
		key2="";
	    } else{                     //paired-end
		key1=key.substr(0,found);
		key2=key.substr(found+1,key.length()-found+1);
	    }
	}






	if( bamFileOUT == ""  ){
	    cerr<<"The output must be a be specified, exiting"<<endl;
	    return 1;
	}

	if ( !reader.Open(bamFile) ) {
	    cerr << "Could not open input BAM file  "<<bamFile << endl;
	    return 1;
	}
	SamHeader header = reader.GetHeader();

    

	string pID          = "mergeTrimReadsBAM";   
	string pName        = "mergeTrimReadsBAM";   
	string pCommandLine = "";
	for(int i=0;i<(argc);i++){
	    pCommandLine += (string(argv[i])+" ");
	}
	putProgramInHeader(&header,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),".."));

	const RefVector references = reader.GetReferenceData();
	//we will not call bgzip with full compression, good for piping into another program to 
	//lessen the load on the CPU
	if(produceUnCompressedBAM) 
	    writer.SetCompressionMode(BamWriter::Uncompressed);

	if ( !writer.Open(bamFileOUT,header,references) ) {
	    cerr << "Could not open output BAM file "<<bamFileOUT << endl;
	    return 1;
	}



	SamHeader sh=reader.GetHeader();
	//Up to the user to be sure that a sequence is followed by his mate
	// if(!sh.HasSortOrder() || 
	//    sh.SortOrder != "queryname"){
	// 	cerr << "Bamfile must be sorted by queryname" << endl;
	// 	return 1;
	// }
    

	BamAlignment al;
	BamAlignment al2;
	bool al2Null=true;
    
	while ( reader.GetNextAlignment(al) ) {

	
	    if(al.IsMapped() || al.HasTag("NM") || al.HasTag("MD")  ){
		if(!allowAligned){
		    cerr << "Reads should not be aligned" << endl;
		    return 1;
		}else{
		    //should we remove tags ?
		}
	    }


	    if(al.IsPaired() && 
	       al2Null ){
		al2=al;
		al2Null=false;
		continue;
	    }else{
		if(al.IsPaired() && 
		   !al2Null){

		    bool  result =  mtr.processPair(al,al2);
		
		    if( result ){//was merged
			BamAlignment orig;
			BamAlignment orig2;

			if(keepOrig){
			    orig2 = al2;
			    orig  = al;
			}

			writer.SaveAlignment(al);

			if(keepOrig){
			    orig.SetIsDuplicate(true);
			    orig2.SetIsDuplicate(true);
			    writer.SaveAlignment(orig2);
			    writer.SaveAlignment(orig);
			}

			//the second record is empty
		    }else{
			//keep the sequences as pairs

			writer.SaveAlignment(al2);		    
			writer.SaveAlignment(al);
		    }

		    //
		    //  SINGLE END
		    //
		}else{ 
		    BamAlignment orig;
		    if(keepOrig){
			orig =al;
		    }
		    mtr.processSingle(al);

		    if(keepOrig){
			//write duplicate
			if(orig.QueryBases.length()  != al.QueryBases.length()){
			    orig.SetIsDuplicate(true);
			    writer.SaveAlignment(orig);
			}
		    }
		    writer.SaveAlignment(al);



		} //end single end
		al2Null=true;
	    }//second pair
		    

	} //while al
	reader.Close();
	writer.Close();


    } //else BAM


    cerr <<mtr.reportSingleLine()<<endl;

    if(printLog){
	ofstream fileLog;
	fileLog.open(logFileName.c_str());

	if (fileLog.is_open()){
	    fileLog <<mtr.reportMultipleLines() <<endl;

	}else{
	    cerr << "Unable to print to file "<<logFileName<<endl;
	}
	fileLog.close();
    }
    return 0;
}
Beispiel #16
0
int main(const int argc, char* const argv[]) {
    int c, min_mapQ=0, seed=chrono::system_clock::now().time_since_epoch().count();
    unsigned int flag_on=0, flag_off=0;
    string fn_tgt, fn_in, fn_out="", out_format="b";

    while ((c = getopt(argc, argv, "SbBcCt:h1Ho:q:f:F:ul:r:?T:R:L:s:@:m:x:U:")) >= 0) {
        switch (c) {
            case 's': seed = atoi(optarg); break;
            case 'm': break;
            case 'c': break;
            case 'S': break;
            case 'b': break;
            case 'C': break;
            case 'h': break;
            case 'H': break;
            case 'o': fn_out = optarg; break;
            case 'U': break;
            case 'f': flag_on |= strtol(optarg, 0, 0); break;
            case 'F': flag_off |= strtol(optarg, 0, 0); break;
            case 'q': min_mapQ = atoi(optarg); break;
            case 'u': out_format = "u"; break;
            case '1': break;
            case 'l': break;
            case 'r': break;
            case 't': fn_tgt = optarg; break;
            case 'R': break;
            case '?': return usage();
            case 'T': break;
            case 'B': break;
            case '@': break;
            case 'x': break;
            default: return usage();
        }
    }
    if (fn_tgt.compare("") == 0) return usage();
    if (argc == optind) return usage();
    fn_in = argv[optind];

    BamReader reader;
    if (!reader.Open(fn_in)) {
        cerr << "ERROR: cannot open [" << fn_in << "] for reading\n";
        return 1;
    }
    if (!reader.LocateIndex()) {
        cerr << "ERROR: cannot find BAM index for [" << fn_in << "]\n";
        return 1;
    }

    const SamHeader header = reader.GetHeader();
    if (header.SortOrder.compare("coordinate") != 0) {
        cerr << "ERROR: [" << fn_in << "] not sorted by coordinate\n";
        return 1;
    }
    const RefVector refseq = reader.GetReferenceData();

    vector<BamRegion> regions;
    vector<unsigned int> src_depths, tgt_depths;
    if (read_region_depth(fn_tgt.c_str(), reader, regions, src_depths, tgt_depths) != 0) return 1;

    BamWriter writer;
    if (!writer.Open(fn_out, header, refseq)) {
        cerr << "ERROR: cannot open [" << fn_out << "] for writing\n";
        return 1;
    }

    BamAlignment aln;
    vector<BamAlignment> reads;
    vector<string> paired, unpaired;
    unordered_map<int, int> kept;
    unordered_map<string, unsigned int> seen, sampled;
    unordered_map<string, vector<int> > pool;
    for (size_t i=0; i<regions.size(); ++i) {
        reads.clear();
        paired.clear();
        unpaired.clear();
        kept.clear();
        pool.clear();

        char region_string[256];
        sprintf(region_string, "%s:%d-%d", refseq[regions[i].LeftRefID].RefName.c_str(), regions[i].LeftPosition, regions[i].RightPosition);
        if (!reader.SetRegion(regions[i])) {
            cerr << "WARNING: failed to locate [" << region_string << "]\n";
            //cerr << "WARNING: failed to locate [" << refseq[regions[i].LeftRefID].RefName << ':' << regions[i].LeftPosition << '-' << regions[i].RightPosition << "]\n";
            continue;
        }
        while (reader.GetNextAlignment(aln)) {
            if ((aln.AlignmentFlag & flag_on) == flag_on && !(aln.AlignmentFlag & flag_off) && aln.MapQuality >= min_mapQ)
                reads.push_back(aln);
        }
        if (reads.size() == 0) continue;

        unsigned int depth = 0;
        for (size_t k=0; k<reads.size(); ++k) {
            aln = reads[k];
            string rn = aln.Name;
            if (seen.find(rn) != seen.end()) { // if seen in previous regions
                if (sampled.find(rn) != sampled.end()) { // if self or mate sampled before, sample it
                    if (sampled[rn] != aln.AlignmentFlag) kept[k] = 1; // if mate sampled before, keep it
                    depth += get_overlap(aln, regions[i]);
                }
                if (seen[rn] != aln.AlignmentFlag) seen[rn] = aln.AlignmentFlag;
            }
            else { // if not seen in previous regions
                pool[rn].push_back(k);
            }
            if (depth > tgt_depths[i]) break;
        }
        if (depth < tgt_depths[i]) {
            for (auto it=pool.begin(); it!=pool.end(); ++it) {
                if (it->second.size()>1)
                    paired.push_back(it->first);
                else
                    unpaired.push_back(it->first);
            }
            shuffle(paired.begin(), paired.end(), default_random_engine(seed));
            shuffle(unpaired.begin(), unpaired.end(), default_random_engine(seed));
            int n1=paired.size(), n2=unpaired.size(), k1, k2, k3;
            while (depth < tgt_depths[i] && n1+n2 > 0) {
                if (n1>0) {
                    k1 = pool[paired[--n1]][0];
                    k2 = pool[paired[n1]][1];
                    depth += get_overlap(reads[k1], regions[i]);
                    depth += get_overlap(reads[k2], regions[i]);
                    kept[k1] = 1; kept[k2] = 1;
                    continue;
                }
                if (n2>0) {
                    k3 = pool[unpaired[--n2]][0];
                    depth += get_overlap(reads[k3], regions[i]);
                    kept[k3] = 1;
                    continue;
                }
            }
        }
        for (auto it=pool.begin(); it!=pool.end(); ++it) {
            string rn = it->first;
            seen[rn] = reads[pool[rn].back()].AlignmentFlag;
        }
        for (auto it=kept.begin(); it!=kept.end(); ++it) {
            int k = it->first;
            string rn = reads[k].Name;
            sampled[rn] = reads[k].AlignmentFlag;
            writer.SaveAlignment(reads[k]);
        }
        cerr << "INFO: target=[" << tgt_depths[i] << "], actual=[" << depth << "], N(reads)=[" << reads.size() << "], N(kept)=[" << kept.size() << "] at [" << region_string << "]\n";
    }

    reader.Close();

    return 0;
}
Beispiel #17
0
// generates mutiple sorted temp BAM files from single unsorted BAM file
bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {
    
    // open input BAM file
    BamReader reader;
    if ( !reader.Open(m_settings->InputBamFilename) ) {
        cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename
             << " for reading... Aborting." << endl;
        return false;
    }
    
    // get basic data that will be shared by all temp/output files 
    SamHeader header = reader.GetHeader();
    header.SortOrder = ( m_settings->IsSortingByName
                       ? Constants::SAM_HD_SORTORDER_QUERYNAME
                       : Constants::SAM_HD_SORTORDER_COORDINATE );
    m_headerText = header.ToString();
    m_references = reader.GetReferenceData();
    
    // set up alignments buffer
    BamAlignment al;
    vector<BamAlignment> buffer;
    buffer.reserve( (size_t)(m_settings->MaxBufferCount*1.1) );
    bool bufferFull = false;
    
    // if sorting by name, we need to generate full char data
    // so can't use GetNextAlignmentCore()
    if ( m_settings->IsSortingByName ) {

        // iterate through file
        while ( reader.GetNextAlignment(al)) {

            // check buffer's usage
            bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );

            // store alignments until buffer is "full"
            if ( !bufferFull )
                buffer.push_back(al);

            // if buffer is "full"
            else {

                // push any unmapped reads into buffer,
                // don't want to split these into a separate temp file
                if ( !al.IsMapped() )
                    buffer.push_back(al);

                // "al" is mapped, so create a sorted temp file with current buffer contents
                // then push "al" into fresh buffer
                else {
                    CreateSortedTempFile(buffer);
                    buffer.push_back(al);
                }
            }
        }
    }

    // sorting by position, can take advantage of GNACore() speedup
    else {

        // iterate through file
        while ( reader.GetNextAlignmentCore(al) ) {

            // check buffer's usage
            bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );

            // store alignments until buffer is "full"
            if ( !bufferFull )
                buffer.push_back(al);

            // if buffer is "full"
            else {

                // push any unmapped reads into buffer,
                // don't want to split these into a separate temp file
                if ( !al.IsMapped() )
                    buffer.push_back(al);

                // "al" is mapped, so create a sorted temp file with current buffer contents
                // then push "al" into fresh buffer
                else {
                    CreateSortedTempFile(buffer);
                    buffer.push_back(al);
                }
            }
        }
    }

    // handle any leftover buffer contents
    if ( !buffer.empty() )
        CreateSortedTempFile(buffer);
    
    // close reader & return success
    reader.Close();
    return true;
}
Beispiel #18
0
void TagBam::Tag() {

    // open the annotations files for processing;
    OpenAnnoFiles();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
	if (!reader.Open(_bamFile)) {
        cerr << "Failed to open BAM file " << _bamFile << endl;
        exit(1);
    }
    
    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // set compression mode
    BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
//    if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
    writer.SetCompressionMode(compressionMode);
    // open our BAM writer
    writer.Open("stdout", bamHeader, refs);

    // rip through the BAM file and test for overlaps with each annotation file.
    BamAlignment al;
    vector<BED> hits;

    while (reader.GetNextAlignment(al)) {
        if (al.IsMapped() == true) {
            BED a;
            a.chrom = refs.at(al.RefID).RefName;
            a.start = al.Position;
            a.end   = al.GetEndPosition(false, false);
            a.strand = "+";
            if (al.IsReverseStrand()) a.strand = "-";
            
            ostringstream annotations;
            // annotate the BAM file based on overlaps with the annotation files.
            for (size_t i = 0; i < _annoFiles.size(); ++i) 
            {
                // grab the current annotation file.
                BedFile *anno = _annoFiles[i];
                
                if (!_useNames && !_useScores && !_useIntervals) {
                    // add the label for this annotation file to tag if there is overlap
                    if (anno->anyHits(a.chrom, a.start, a.end, a.strand, 
                                      _sameStrand, _diffStrand, _overlapFraction, false))
                    {
                        annotations << _annoLabels[i] << ";";
                    }
                }
                // use the score field
                else if (!_useNames && _useScores && !_useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand, 0.0, false);
                    for (size_t i = 0; i < hits.size(); ++i) {
                        annotations << hits[i].score;
                        if (i < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
                // use the name field from the annotation files to populate tag
                else if (_useNames && !_useScores && !_useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand, 0.0, false);
                    for (size_t j = 0; j < hits.size(); ++j) {
                        annotations << hits[j].name;
                        if (j < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
                // use the full interval information annotation files to populate tag
                else if (!_useNames && !_useScores && _useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand,  0.0, false);
                    for (size_t j = 0; j < hits.size(); ++j) {
                        annotations << _annoLabels[i]  << ":" << 
                                        hits[j].chrom  << ":" <<
                                        hits[j].start  << "-" <<
                                        hits[j].end    << "," <<
                                        hits[j].name   << "," <<
                                        hits[j].score  << "," <<
                                        hits[j].strand;
                        if (j < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
            }
            // were there any overlaps with which to make a tag?
            if (annotations.str().size() > 0) {
                al.AddTag(_tag, "Z", annotations.str().substr(0, annotations.str().size() - 1)); // get rid of the last ";"
            }
        }
        writer.SaveAlignment(al);
    }
    reader.Close();
    writer.Close();
    // close the annotations files;
    CloseAnnoFiles();
}
int 
main(int argc, char* argv[]) {

	// validate argument count
	if( argc != 2 ) {
		cerr << "USAGE: " << argv[0] << " <input BAM file> " << endl;
		return EXIT_FAILURE;
	}

	string filename = argv[1];
	//cerr << "Printing alignments from file: " << filename << endl;
	
	BamReader reader;
	if (!reader.Open(filename)) {
        cerr << "could not open filename " << filename << endl;
        return EXIT_FAILURE;
    }
    cerr << filename << ": Done opening" << endl;

    // Header can't be used to accurately determine sort order because samtools never
    // changes it; instead, check after loading each read as is done with "samtools index"

    // We don't need to load an index (right?)
	// if (!reader.LocateIndex()) {
    //     const string index_filename = filename + ".bai";
	//     if (!reader.OpenIndex(index_filename)) {
    //         cerr << "could not open index" << endl;
    //     }
    // }


    const SamHeader header = reader.GetHeader();
    cerr << filename << ": Done getting header" << endl;
    const RefVector refs = reader.GetReferenceData();
    cerr << filename << ": Done getting reference data" << endl;
	
    BamWriter writer;
    if (! output_bam_filename.empty()) {
        if (! writer.Open(output_bam_filename, header, refs)) {
            cerr << "Could not open BAM output file " << output_bam_filename << endl;
            return EXIT_FAILURE;
        }
        cerr << filename << ": Done opening BAM output file " << output_bam_filename << endl;
    }

    alignmentMap read1Map;  // a single map, for all reads awaiting their mate
    typedef map<string,int32_t> stringMap;
    typedef stringMap::iterator stringMapI;
    stringMap ref_mates;
    // alignmentMap read1Map, read2Map;

	BamAlignment full_al;
    int32_t count = 0;
    uint32_t max_reads_in_map = 0;
    int32_t n_reads_skipped_unmapped = 0;
    int32_t n_reads_skipped_mate_unmapped = 0;
    int32_t n_reads_skipped_wont_see_mate = 0;
    int32_t n_reads_skipped_mate_tail_est = 0;
    int32_t n_reads_skipped_ref_mate = 0;
    int32_t n_reads = 0;
    int32_t n_singleton_reads = 0;
    int32_t last_RefID = -1;
    int32_t last_Position = -1;

    cerr << filename << ": Looking for up to " << pairs_to_process << " link pairs,"
        << " total tail = " << link_pair_total_tail 
        << " critical tail = " << link_pair_crit_tail 
        << ", must be on diff chromosome = " << link_pair_diff_chrom << endl;

	while (reader.GetNextAlignment(full_al) 
           && (! pairs_to_process || count < pairs_to_process)) {

        BamAlignment al = full_al;

        //printAlignmentInfo(al, refs);
        //++count;
        ++n_reads;

        if (last_RefID < 0) last_RefID = al.RefID;
        if (last_Position < 0) last_Position = al.Position;
        if (al.RefID > last_RefID) {
            // We've moved to the next reference sequence
            // Clean up reads with mates expected here that haven't been seen
            if (debug_ref_mate) {
                cerr << "MISSED " << ref_mates.size() << " ref_mates on this reference "
                    << last_RefID << " " << refs[last_RefID].RefName << endl;
            }
            for (stringMapI rmI = ref_mates.begin(); rmI != ref_mates.end(); ++rmI) {
                ++n_reads_skipped_ref_mate;
                read1Map.erase(read1Map.find(rmI->first));
                ref_mates.erase(ref_mates.find(rmI->first));
            }
            last_RefID = al.RefID;
            last_Position = al.Position;
        } else if (! isCoordinateSorted(al.RefID, al.Position, last_RefID, last_Position)) {
            cerr << filename << " is not sorted, " << al.Name << " out of position" << endl;
            return EXIT_FAILURE;
        }

        if (! al.IsMapped()) { ++n_reads_skipped_unmapped; continue; }

        if (! al.IsMateMapped()) { ++n_reads_skipped_mate_unmapped; continue; }

        alignmentMapI mI = read1Map.find(al.Name);

        if (mI == read1Map.end()) {
            // the read name has not been seen before

            if (al.MateRefID < al.RefID
                || (al.MateRefID == al.RefID && al.MatePosition < al.Position)) {
                // we should have seen its mate earlier, so skip it
                ++n_reads_skipped_wont_see_mate;
                continue;
            }

            // If the mate likely to also be a link pair candidate, add the read
            int32_t mate_tail_est = readTailS(al.IsMateMapped(), al.IsMateReverseStrand(),
                            al.MatePosition, refs[al.MateRefID].RefLength, max_read_length);
            if (mate_tail_est <= mate_tail_est_crit) {
                // the mate tail estimate suggests it might be a link pair candidate
                read1Map[al.Name] = al;  // add the read to the map
            } else {
                // the mate tail estimate appears too long for the mate to be a candidate
                ++n_reads_skipped_mate_tail_est;
                continue;
            }

            if (read1Map.size() > max_reads_in_map) max_reads_in_map = read1Map.size();
            if (al.MateRefID == al.RefID && al.MatePosition >= al.Position) {
                // the mate is expected later on this contig
                ref_mates[al.Name] = al.MateRefID;
            }

        } else {
            // get the mate's alignment, and process the pair

            const BamAlignment& al_mate = mI->second;

            if (processReadPair(al, al_mate, refs, link_pair_total_tail, 
                                link_pair_crit_tail, link_pair_diff_chrom)) {
                ++count;

                // write to the new BAM file, if the string is not empty
                if (! output_bam_filename.empty()) {
                    writer.SaveAlignment(al_mate);  // the first one seen
                    writer.SaveAlignment(al);  // the second one seen
                }
            }

            read1Map.erase(mI);

            if (al.MateRefID == al.RefID) {
                stringMapI rmI = ref_mates.find(al.Name);
                if (rmI == ref_mates.end()) {
                    cerr << "expected a ref_mate, couldn't find its name: " << al.Name << endl;
                    return EXIT_FAILURE;
                }
                ref_mates.erase(rmI);
            }

        }

	}

	cerr << "===============================" << endl;
    cerr << read1Map.size() << " alignments left in read1Map" << endl;
    cerr << max_reads_in_map << " maximum number of reads in read1Map" << endl;
    cerr << count << " pairs processed" << endl;
	cerr << "===============================" << endl;
    cerr << n_reads << " total reads" << endl;
    cerr << n_singleton_reads << " singleton reads" << endl;
    cerr << n_reads_skipped_unmapped << " reads skipped because unmapped" << endl;
    cerr << n_reads_skipped_mate_unmapped << " reads skipped because mate unmapped" << endl;
    cerr << n_reads_skipped_wont_see_mate << " reads skipped because mate won't be seen" << endl;
    cerr << n_reads_skipped_mate_tail_est << " reads skipped because mate tail appears too long" << endl;
    cerr << n_reads_skipped_ref_mate << " reads skipped because mate not on reference" << endl;

	reader.Close();
    if (! output_bam_filename.empty()) {
	    writer.Close();
    }
	return EXIT_SUCCESS;
}
Beispiel #20
0
void BamToFastq::PairedFastqUseTags() {

    // open the 1st fastq file for writing
    ofstream fq1(_fastq1.c_str(), ios::out);
    if ( !fq1 ) {
        cerr << "Error: The first fastq file (" << _fastq1 << ") could not be opened.  Exiting!" << endl;
        exit (1);
    }
    // open the 2nd fastq file for writing
    ofstream fq2(_fastq2.c_str(), ios::out);
    if ( !fq2 ) {
        cerr << "Error: The second fastq file (" << _fastq2 << ") could not be opened.  Exiting!" << endl;
        exit (1);
    }

    // open the BAM file
    BamReader reader;
    reader.Open(_bamFile);
    // rip through the BAM file and convert each mapped entry to BEDPE
    BamAlignment bam1, bam2;
    while (reader.GetNextAlignment(bam1)) {
        
        reader.GetNextAlignment(bam2);        
        if (bam1.Name != bam2.Name) {
            while (bam1.Name != bam2.Name)
            {
                if (bam1.IsPaired()) 
                {
                    cerr << "*****WARNING: Query " << bam1.Name
                         << " is marked as paired, but it's mate does not occur"
                         << " next to it in your BAM file.  Skipping. " << endl;
                }
                bam1 = bam2;
                reader.GetNextAlignment(bam2);
            }
        }
        else if (bam1.IsPaired() && bam2.IsPaired()) {
            // assume the R2 and Q2 tags are on the + strand.
            string mateSequence, mateQualities;
            bam1.GetTag("R2", mateSequence);
            bam1.GetTag("Q2", mateQualities);

            string seq1  = bam1.QueryBases;
            string qual1 = bam1.Qualities;
            if (bam1.IsReverseStrand() == true) {
                reverseComplement(seq1);
                reverseSequence(qual1);
            }
            
            // since the info for both ends are contained in each BAM record,
            // we only need to process one of the two records (bam1) in order
            // to produce FASTQ entries for both ends.
            // NOTE: Assumes that R2 and Q2 have already been rev 
            //      and revcomped if necessary
            if (bam1.IsFirstMate() == true) {
                // end1
                fq1 << "@" << bam1.Name << "/1" << endl;
                fq1 << seq1 << endl;
                fq1 << "+" << endl;
                fq1 << qual1 << endl;
                // end2
                fq2 << "@" << bam1.Name << "/2" <<endl;
                fq2 << mateSequence << endl;
                fq2 << "+" << endl;
                fq2 << mateQualities << endl;
            }
            else {
                // end 2
                fq2 << "@" << bam1.Name << "/2" <<endl;
                fq2 << seq1 << endl;
                fq2 << "+" << endl;
                fq2 << qual1 << endl;
                // end 1
                fq1 << "@" << bam1.Name << "/1" <<endl;
                fq1 << mateSequence << endl;
                fq1 << "+" << endl;
                fq1 << mateQualities << endl;
            }
        }
    }
    reader.Close();
}
Beispiel #21
0
void BamToFastq::PairedFastq() {
    // open the 1st fastq file for writing
    ofstream fq1(_fastq1.c_str(), ios::out);
    if ( !fq1 ) {
        cerr << "Error: The first fastq file (" << _fastq1 << ") could not be opened.  Exiting!" << endl;
        exit (1);
    }
    // open the 2nd fastq file for writing
    ofstream fq2(_fastq2.c_str(), ios::out);
    if ( !fq2 ) {
        cerr << "Error: The second fastq file (" << _fastq2 << ") could not be opened.  Exiting!" << endl;
        exit (1);
    }
    // open the BAM file
    BamReader reader;
    reader.Open(_bamFile);
    // rip through the BAM file and convert each mapped entry to BEDPE
    BamAlignment bam1, bam2;
    bool shouldConsumeReads = true;
    while (true) {
        
        if (shouldConsumeReads) {
            if (!reader.GetNextAlignment(bam1) || !reader.GetNextAlignment(bam2)) break;
        } else {
            shouldConsumeReads = true;
        }
        if (bam1.Name != bam2.Name) {
            while (bam1.Name != bam2.Name)
            {
                if (bam1.IsPaired()) 
                {
                    cerr << "*****WARNING: Query " << bam1.Name
                         << " is marked as paired, but its mate does not occur"
                         << " next to it in your BAM file.  Skipping. " << endl;
                }
                bam1 = bam2;
                if (!reader.GetNextAlignment(bam2)) break;
                shouldConsumeReads = false;
            }
        }
        else if (bam1.IsPaired() && bam2.IsPaired()) {
            // extract the sequence and qualities for the BAM "query"
            string seq1  = bam1.QueryBases;
            string qual1 = bam1.Qualities;
            string seq2  = bam2.QueryBases;
            string qual2 = bam2.Qualities;
            if (bam1.IsReverseStrand() == true) {
                reverseComplement(seq1);
                reverseSequence(qual1);
            }
            if (bam2.IsReverseStrand() == true) {
                reverseComplement(seq2);
                reverseSequence(qual2);
            }
            fq1 << "@" << bam1.Name << "/1" << endl;
            fq1 << seq1 << endl;
            fq1 << "+" << endl;
            fq1 << qual1 << endl;
            
            fq2 << "@" << bam2.Name << "/2" << endl;
            fq2 << seq2 << endl;
            fq2 << "+" << endl;
            fq2 << qual2 << endl;
        }
    }
    reader.Close();
}
Beispiel #22
0
int main (int argc, char *argv[]) {

    bool mapped  =false;
    bool unmapped=false;

    const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+
			      "This program takes a BAM file as input and produces\n"+
			      "another where the putative deaminated bases have\n"+
			      "have been cut\n"+
			      "\n"+
			      "Options:\n");
			      // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+
			      // "\t"+"-m , --mapped"   +"\n\t\t"+"For an mapped bam file"+"\n");
			      
			      

    if( (argc== 1) ||
	(argc== 2 && string(argv[1]) == "-h") ||
	(argc== 2 && string(argv[1]) == "-help") ||
	(argc== 2 && string(argv[1]) == "--help") ){
	cout<<"Usage:"<<endl;
	cout<<usage<<endl;
	cout<<""<<endl;
	return 1;
    }

    // for(int i=1;i<(argc-1);i++){ //all but the last arg

    // 	if(strcmp(argv[i],"-m") == 0 || strcmp(argv[i],"--mapped") == 0 ){
    // 	    mapped=true;
    // 	    continue;
    // 	}

    // 	if(strcmp(argv[i],"-u") == 0 || strcmp(argv[i],"--unmapped") == 0 ){
    // 	    unmapped=true;
    // 	    continue;
    // 	}
       
    // 	cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl;
    // 	return 1;
    // }

    if(argc != 3){
	cerr<<"Error: Must specify the input and output BAM files";
	return 1;
    }

    string inbamFile =argv[argc-2];
    string outbamFile=argv[argc-1];

    // if(!mapped && !unmapped){
    // 	cerr << "Please specify whether you reads are mapped or unmapped" << endl;
    // 	return 1;
    // }
    // if(mapped && unmapped){
    // 	cerr << "Please specify either mapped or unmapped but not both" << endl;
    // 	return 1;
    // }

    BamReader reader;

    if ( !reader.Open(inbamFile) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
    }


    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writer;
    if ( !writer.Open(outbamFile, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamAlignment al;
    // BamAlignment al2;
    // bool al2Null=true;
    
    while ( reader.GetNextAlignment(al) ) {

	    if(al.IsPaired() ){  

		if(al.IsFirstMate() ){ //5' end, need to check first base only
		    if(al.IsReverseStrand()){ //
			if(!al.IsMapped()){
			    cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
			    //return 1;
			}
			int indexToCheck;
			//last
			indexToCheck=al.QueryBases.length()-1;
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);			 
			    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			    al.Qualities  = al.Qualities.substr(0, indexToCheck);
			}
		    }else{
			int indexToCheck;
			//first base
			indexToCheck=0;
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			    al.Qualities  = al.Qualities.substr(indexToCheck+1);
			}
		    }
		}else{ //3' end, need to check last two bases only
		    if( al.IsSecondMate() ){
			if(al.IsReverseStrand()){ //
			    if(!al.IsMapped()){
				cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
				//return 1;
			    }
			    int indexToCheck;

			    //second to last
			    indexToCheck=al.QueryBases.length()-2;
			    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				//al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				al.QueryBases = al.QueryBases.substr(0,indexToCheck);
				al.Qualities  = al.Qualities.substr(0, indexToCheck);
			    }else{
				//last
				indexToCheck=al.QueryBases.length()-1;
				if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
				    al.Qualities  = al.Qualities.substr(0, indexToCheck);
				}
			    }
			}else{
			    int indexToCheck;
			    //second base
			    indexToCheck=1;
			    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				//al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				al.QueryBases = al.QueryBases.substr(indexToCheck+1);
				al.Qualities  = al.Qualities.substr(indexToCheck+1);
			    }else{
				//first base
				indexToCheck=0;
				if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
				    al.Qualities  = al.Qualities.substr(indexToCheck+1);
				}
			    }

			}
		    }else{
			cerr << "Wrong state" << endl;
			return 1;
		    }
		}

	    }//end of paired end
	    else{//we consider single reads to have been sequenced from 5' to 3'

		if(al.IsReverseStrand()){ //need to consider 
		    if(!al.IsMapped()){
			cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
			//return 1;
		    }

		    int indexToCheck;



		    //second base
		    indexToCheck=1;
		    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"51 "<<al.QueryBases<<endl;
			// cout<<"51 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			al.Qualities  = al.Qualities.substr(indexToCheck+1);
			// cout<<"52 "<<al.QueryBases<<endl;
			// cout<<"52 "<<al.Qualities<<endl;
		    }else{
			//first base
			indexToCheck=0;
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    // al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    // cout<<"61 "<<al.QueryBases<<endl;
			    // cout<<"61 "<<al.Qualities<<endl;
			    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			    al.Qualities  = al.Qualities.substr(indexToCheck+1);
			    // cout<<"62 "<<al.QueryBases<<endl;
			    // cout<<"62 "<<al.Qualities<<endl;
			}

		    }


		    //last
		    indexToCheck=al.QueryBases.length()-1;
		    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"21 "<<al.QueryBases<<endl;
			// cout<<"21 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			al.Qualities  = al.Qualities.substr(0, indexToCheck);
			// cout<<"22 "<<al.QueryBases<<endl;
			// cout<<"22 "<<al.Qualities<<endl;
		    }
		}else{

		    int indexToCheck;
		    //first base
		    indexToCheck=0;
		    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);

			// cout<<"11 "<<al.QueryBases<<endl;
			// cout<<"11 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			al.Qualities  = al.Qualities.substr(indexToCheck+1);
			// cout<<"12 "<<al.QueryBases<<endl;
			// cout<<"12 "<<al.Qualities<<endl;

		    }

		    //second to last
		    indexToCheck=al.QueryBases.length()-2;
		    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"31 "<<al.QueryBases<<endl;
			// cout<<"31 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			al.Qualities  = al.Qualities.substr(0, indexToCheck);
			// cout<<"32 "<<al.QueryBases<<endl;
			// cout<<"32 "<<al.Qualities<<endl;
		    }else{

			//last
			indexToCheck=al.QueryBases.length()-1;
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    // al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    // cout<<"41 "<<al.QueryBases<<endl;
			    // cout<<"41 "<<al.Qualities<<endl;
			    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			    al.Qualities  = al.Qualities.substr(0, indexToCheck);
			    // cout<<"42 "<<al.QueryBases<<endl;
			    // cout<<"42 "<<al.Qualities<<endl;
			}

		    }
		}
	    }//end of single end

	    writer.SaveAlignment(al);		


    }//    while ( reader.GetNextAlignment(al) ) {

    reader.Close();
    writer.Close();
   
    return 0;
}
int main (int argc, char *argv[]) {

    int  minBaseQuality = 0;

    string usage=string(""+string(argv[0])+"  [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+
			"\nThis program divides aligned single end reads into potentially deaminated\n"+
			"\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+
			"\nThis is like filterDeaminatedVCF but it loads the VCF before then labels the reads instead of doing it on the fly\n"+
			"\nwhich is good if you have many reads in the bam file.\n"+			
			"\nTip: if you do not need one of them, use /dev/null as your output\n"+
			"\narguments:\n"+
			"\t"+"--bq    [base qual]  : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+
			"\t"+"--1000g [vcf file]   : VCF file from 1000g to get the putative A and T positions in modern humans (Default: "+vcf1000g+")\n"+
			"\n");

    if(argc == 1 ||
       argc < 4  ||
       (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") )
       ){
	cerr << "Usage "<<usage<<endl;
	return 1;       
    }


    for(int i=1;i<(argc-2);i++){ 

	
        if(string(argv[i]) == "--bq"){
	    minBaseQuality=destringify<int>(argv[i+1]);
            i++;
            continue;
	}

        if(string(argv[i]) == "--1000g"){
	    vcf1000g=string(argv[i+1]);
            i++;
            continue;
	}



    }

    unsigned int maxSizeChromosome=250000000;//larger than chr1 hg19
    bool * hasCnoT;
    bool * hasGnoA;
    bool * thousandGenomesHasA;
    bool * thousandGenomesHasT;

    cerr<<"Trying to allocating memory"<<endl;
    try{
	hasCnoT              = new bool[ maxSizeChromosome ];
	hasGnoA              = new bool[ maxSizeChromosome ];
	thousandGenomesHasA  = new bool[ maxSizeChromosome ];
	thousandGenomesHasT  = new bool[ maxSizeChromosome ];
    }catch(bad_alloc& exc){
	cerr<<"ERROR: allocating memory failed"<<endl;
	return 1;
    }
    cerr<<"Success in allocating memory"<<endl;

    for(unsigned int i = 0;i<maxSizeChromosome;i++){
	hasCnoT[i]=false;
	hasGnoA[i]=false;
	thousandGenomesHasA[i]=false;
	thousandGenomesHasT[i]=false;
    }

    string bamfiletopen = string( argv[ argc-5 ] );
    string vcffiletopen = string( argv[ argc-4 ] );
    string chrname      = string( argv[ argc-3 ] );
    string deambam      = string( argv[ argc-2 ] );
    string nondeambam   = string( argv[ argc-1 ] );

    cerr<<"Reading consensus VCF "<<vcffiletopen<<"  ... "<<endl;

    VCFreader vcfr (vcffiletopen,
 		    // vcffiletopen+".tbi",
 		    // chrname,
 		    // 1,
 		    // maxSizeChromosome,
 		    0);

    
    while(vcfr.hasData()){
	SimpleVCF * toprint=vcfr.getData();

	if(toprint->getRef().length() != 1 )
	    continue;
	

	//if the VCF has a at least one G but no A
	if(  toprint->hasAtLeastOneG() && 
	     !toprint->hasAtLeastOneA() ){
	    hasGnoA[ toprint->getPosition() ] =true;
	}

	if(  toprint->hasAtLeastOneC() && 
	     !toprint->hasAtLeastOneT() ){
	    hasCnoT[ toprint->getPosition() ] =true;    
	}

	   
    }
    cerr<<"done reading VCF"<<endl;







    cerr<<"Reading 1000g VCF :"<<vcf1000g<<"  ..."<<endl;
    string line1000g;
    ifstream myFile1000g;
    
    myFile1000g.open(vcf1000g.c_str(), ios::in);

    if (myFile1000g.is_open()){
	while ( getline (myFile1000g,line1000g)){
	    vector<string> fields=allTokens(line1000g,'\t');
	    //0 chr
	    //1 pos
	    //2 id
	    //3 ref
	    //4 alt

	    //check if same chr
	    if(fields[0] != chrname){
		cerr <<"Error, wrong chromosome in 1000g file for line=  "<<line1000g<<endl;
		return 1;
	    }

	    //skip indels
	    if(fields[3].size() != 1 ||
	       fields[4].size() != 1 )
		continue;

	    char         ref=toupper(fields[3][0]);
	    char         alt=toupper(fields[4][0]);
	    unsigned int pos=destringify<unsigned int>( fields[1] );
	    
	    thousandGenomesHasA[ pos ] = ( (ref=='A') || (alt=='A') );
	    thousandGenomesHasT[ pos ] = ( (ref=='T') || (alt=='T') );
	    
	}
	myFile1000g.close();
    }else{
	cerr <<"Unable to open file "<<vcf1000g<<endl;
	return 1;
    }



    cerr<<"done reading 1000g VCF"<<endl;











    BamReader reader;
    
    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM file"<< bamfiletopen << endl;
    	return 1;
    }


    //positioning the bam file
    int refid=reader.GetReferenceID(chrname);
    if(refid < 0){
	cerr << "Cannot retrieve the reference ID for "<< chrname << endl;
	return 1;
    }
    //cout<<"redif "<<refid<<endl;	    

    //setting the BAM reader at that position
    reader.SetRegion(refid,
		     0,
		     refid,
		     -1); 	



    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writerDeam;
    if ( !writerDeam.Open(deambam,      header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamWriter writerNoDeam;
    if ( !writerNoDeam.Open(nondeambam, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }



    unsigned int totalReads      =0;
    unsigned int deaminatedReads =0;
    unsigned int ndeaminatedReads =0;
    unsigned int skipped      =0;



    //iterating over the alignments for these regions
    BamAlignment al;
    int i;

    while ( reader.GetNextAlignment(al) ) {
	// cerr<<al.Name<<endl;

	//skip unmapped
	if(!al.IsMapped()){
	    skipped++;
	    continue;
	}

	//skip paired end !
	if(al.IsPaired() ){  
	    continue;
	    // cerr<<"Paired end not yet coded"<<endl;
	    // return 1;
	}


	string reconstructedReference = reconstructRef(&al);



	char refeBase;
	char readBase;
	bool isDeaminated;
	if(al.Qualities.size() != reconstructedReference.size()){
	    cerr<<"Quality line is not the same size as the reconstructed reference"<<endl;
	    return 1;
	}

	isDeaminated=false;

	if(al.IsReverseStrand()){

	    //first base next to 3'
	    i = 0 ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    if(  readBase  == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		if( hasGnoA[al.Position+1] && 
		    !thousandGenomesHasA[al.Position+1] )
		    isDeaminated=true; 


		// transformRef(&refeBase,&readBase);

		// vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     // cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;			
		// 	cerr<<numberOfDeletions(&al)<<endl;			
		// 	cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }
		    
		//     //if the VCF has a at least one G but no A
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }


	    //second base next to 3'
	    i = 1;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		if( hasGnoA[al.Position+2] && 
		   !thousandGenomesHasA[al.Position+2] )
		    isDeaminated=true; 


		// transformRef(&refeBase,&readBase);


		// vcfr.repositionIterator(chrname,al.Position+2,al.Position+2);

		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     // cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<"Problem2 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     //if the VCF has at least one G but no A 
		//     // if(toprint->hasAtLeastOneG() &&
		//     //    toprint->getAlt().find("A") == string::npos){
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }
	    }

	    //last  base next to 5'
	    i = (al.QueryBases.length()-1) ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);


		int lengthMatches=countMatchesRecons(reconstructedReference,0);		
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasGnoA[positionJump] && 
		   !thousandGenomesHasA[positionJump] )
		    isDeaminated=true; 


		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();

		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;
		    
		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem3 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }


		//     //if the VCF has at least one G but no A
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }

	}else{

		
	    //first base next to 5'
	    i = 0;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C' 
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){ 

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		// transformRef(&refeBase,&readBase);

		if(hasCnoT[al.Position+1]  && 
		   !thousandGenomesHasT[al.Position+1] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;
		    
		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;			
		// 	cerr<<"Problem4 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     //if the VCF has at least one C but no T
		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }

		// }

		//cout<<al.Position+
		 
	    }

	    //second last base next to 3'
	    i = (al.QueryBases.length()-2);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C'  &&
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  



		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		//transformRef(&refeBase,&readBase);		
		int lengthMatches=countMatchesRecons(reconstructedReference,1);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasCnoT[positionJump] && 
		   !thousandGenomesHasT[positionJump] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem5 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }
		// }

		 

	    }

	    //last base next to 3'
	    i = (al.QueryBases.length()-1);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //&& refeBase  == 'C' 
	    if( readBase  == 'T'  && int(al.Qualities[i]-offset) >= minBaseQuality){  
		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		

		int lengthMatches=countMatchesRecons(reconstructedReference,0);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasCnoT[positionJump] && 
		   !thousandGenomesHasT[positionJump] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem6 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }	

	   
	    
	}
		  



	totalReads++;

	if(isDeaminated){
	    deaminatedReads++;
	    writerDeam.SaveAlignment(al);		
	}else{
	    ndeaminatedReads++;
	    writerNoDeam.SaveAlignment(al);		
	}


    
    }//end for each read









    reader.Close();
    writerDeam.Close();
    writerNoDeam.Close();
    delete(hasCnoT);
    delete(hasGnoA);

    cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl;

   
    return 0;
}
Beispiel #24
0
// close requested BAM files
bool BamMultiReaderPrivate::CloseFiles(const std::vector<std::string>& filenames)
{

    bool errorsEncountered = false;
    m_errorString.clear();

    // iterate over filenames
    std::vector<std::string>::const_iterator filesIter = filenames.begin();
    std::vector<std::string>::const_iterator filesEnd = filenames.end();
    for (; filesIter != filesEnd; ++filesIter) {
        const std::string& filename = (*filesIter);
        if (filename.empty()) continue;

        // iterate over readers
        std::vector<MergeItem>::iterator readerIter = m_readers.begin();
        std::vector<MergeItem>::iterator readerEnd = m_readers.end();
        for (; readerIter != readerEnd; ++readerIter) {
            MergeItem& item = (*readerIter);
            BamReader* reader = item.Reader;
            if (reader == 0) continue;

            // if reader matches requested filename
            if (reader->GetFilename() == filename) {

                // remove reader's entry from alignment cache
                m_alignmentCache->Remove(reader);

                // clean up reader & its alignment
                if (!reader->Close()) {
                    m_errorString.append(1, '\t');
                    m_errorString.append(reader->GetErrorString());
                    m_errorString.append(1, '\n');
                    errorsEncountered = true;
                }
                delete reader;
                reader = 0;

                // delete reader's alignment entry
                BamAlignment* alignment = item.Alignment;
                delete alignment;
                alignment = 0;

                // remove reader from reader list
                m_readers.erase(readerIter);

                // on match, just go on to next filename
                // (no need to keep looking and item iterator is invalid now anyway)
                break;
            }
        }
    }

    // make sure we clean up properly if all readers were closed
    if (m_readers.empty()) {

        // clean up merger
        if (m_alignmentCache) {
            m_alignmentCache->Clear();
            delete m_alignmentCache;
            m_alignmentCache = 0;
        }

        // reset merge flags
        m_hasUserMergeOrder = false;
        m_mergeOrder = BamMultiReader::RoundRobinMerge;
    }

    // return whether all readers closed OK
    return !errorsEncountered;
}
Beispiel #25
0
//{{{ bool merge_sorted_files(string out_file_name,
bool merge_sorted_files(string out_file_name,
						int buff_count,
						string header_text,
						RefVector &ref)
{

    map<string,BamReader*> bam_readers;
    priority_queue< BamAlignment, vector<BamAlignment>, inter_chrom_rev_sort > q;

    for (int i = 0; i < buff_count; ++i) {
        stringstream temp_name;
        temp_name << out_file_name << i;

        BamReader *reader = new BamReader();

        if ( !reader->Open(temp_name.str()) ) {
            cerr << "sort ERROR: could not open " << 
                    temp_name.str() << " for reading... Aborting." << endl;
            return false;
        }

        bam_readers[temp_name.str()] = reader;
        // place an item from each bam onto the q
        BamAlignment al;
        if (reader->GetNextAlignment(al))
        q.push(al);
    }

    BamWriter merged_writer;
    if ( !merged_writer.Open(out_file_name, header_text, ref) ) {
        cerr << "sort ERROR: could not open " << out_file_name
                << " for writing." << endl;
        return false;
    }


    while (!q.empty()) {
        BamAlignment al = q.top();
        q.pop();
        merged_writer.SaveAlignment(al);

        BamReader *reader = bam_readers[al.Filename];

        BamAlignment new_al;

        if (reader->GetNextAlignment(new_al))
            q.push(new_al);
    }

    merged_writer.Close();

    //close and remove temp files
    map<string,BamReader*>::iterator it;
    for (it = bam_readers.begin(); it != bam_readers.end(); ++it) {
        BamReader *reader =	it->second;
        reader->Close();
        delete reader;
        remove(it->first.c_str());
    }

    return true;
}
Beispiel #26
0
void BedIntersect::IntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB = new BedFile(_bedBFile);
    _bedB->loadBedFileIntoMap();

    // create a dummy BED A file for printing purposes if not
    // using BAM output.
    if (_bamOutput == false) {
        _bedA = new BedFile(_bedAFile);
        _bedA->bedType = 12;
    }
    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);
    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs    = reader.GetReferenceData();
    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }
    vector<BED> hits;
    // reserve some space
    hits.reserve(100);
    BamAlignment bam;    
    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        // save an unaligned read if -v
        if (!bam.IsMapped()) {
            if (_noHit == true)
                writer.SaveAlignment(bam);
            continue;
        }   
        // break alignment into discrete blocks,
        bedVector bed_blocks;
        string chrom = refs.at(bam.RefID).RefName;
        GetBamBlocks(bam, chrom, bed_blocks, false, true);
        // create a basic BED entry from the BAM alignment
        BED bed;
        MakeBedFromBam(bam, chrom, bed_blocks, bed);
        bool overlapsFound = false;
        if ((_bamOutput == true) && (_obeySplits == false))
        {
            overlapsFound = _bedB->anyHits(bed.chrom, bed.start, bed.end, 
                                           bed.strand, _sameStrand, _diffStrand,
                                           _overlapFraction, _reciprocal);
        }
        else if ( ((_bamOutput == true)  && (_obeySplits == true)) ||
                  ((_bamOutput == false) && (_obeySplits == true)) )
        {
            // find the hits that overlap with the full span of the blocked BED
            _bedB->allHits(bed.chrom, bed.start, bed.end, bed.strand,
                           hits, _sameStrand, _diffStrand,
                           _overlapFraction, _reciprocal);
            // find the overlaps between the block in A and B
            overlapsFound = FindBlockedOverlaps(bed, bed_blocks, hits, _bamOutput);
        }
        else if ((_bamOutput == false) && (_obeySplits == false))
        {
            FindOverlaps(bed, hits);
        }
        // save the BAM alignment if overlap reqs. were met
        if (_bamOutput == true) {
            if ((overlapsFound == true) && (_noHit == false))
                writer.SaveAlignment(bam);
            else if ((overlapsFound == false) && (_noHit == true))
                writer.SaveAlignment(bam);
        }
        hits.clear();
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
Beispiel #27
0
int main (int argc, char *argv[]) {

    int  minBaseQuality = 0;

    string usage=string(""+string(argv[0])+"  [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+
			"\nThis program divides aligned single end reads into potentially deaminated\n"+
			"\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+
			"\nTip: if you do not need one of them, use /dev/null as your output\n"+
			"arguments:\n"+
			"\t"+"--bq  [base qual]   : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+
			"\n");

    if(argc == 1 ||
       argc < 4  ||
       (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") )
       ){
	cerr << "Usage "<<usage<<endl;
	return 1;       
    }


    for(int i=1;i<(argc-2);i++){ 

	
        if(string(argv[i]) == "--bq"){
	    minBaseQuality=destringify<int>(argv[i+1]);
            i++;
            continue;
	}

    }

    string bamfiletopen = string( argv[ argc-5 ] );
    string vcffiletopen = string( argv[ argc-4 ] );
    string chrname      = string( argv[ argc-3 ] );
    string deambam      = string( argv[ argc-2 ] );
    string nondeambam   = string( argv[ argc-1 ] );

    //dummy reader, will need to reposition anyway
    VCFreader vcfr (vcffiletopen,
 		    vcffiletopen+".tbi",
 		    chrname,
 		    1,
 		    1,
 		    0);

    BamReader reader;
    
    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM file"<< bamfiletopen << endl;
    	return 1;
    }

    // if ( !reader.LocateIndex()  ) {
    // 	cerr << "The index for the BAM file cannot be located" << endl;
    // 	return 1;
    // }

    // if ( !reader.HasIndex()  ) {
    // 	cerr << "The BAM file has not been indexed." << endl;
    // 	return 1;
    // }

    //positioning the bam file
    int refid=reader.GetReferenceID(chrname);
    if(refid < 0){
	cerr << "Cannot retrieve the reference ID for "<< chrname << endl;
	return 1;
    }
    //cout<<"redif "<<refid<<endl;	    

    //setting the BAM reader at that position
    reader.SetRegion(refid,
		     0,
		     refid,
		     -1); 	



    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writerDeam;
    if ( !writerDeam.Open(deambam,      header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamWriter writerNoDeam;
    if ( !writerNoDeam.Open(nondeambam, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }



    unsigned int totalReads      =0;
    unsigned int deaminatedReads =0;
    unsigned int ndeaminatedReads =0;
    unsigned int skipped      =0;



    //iterating over the alignments for these regions
    BamAlignment al;
    int i;

    while ( reader.GetNextAlignment(al) ) {
	// cerr<<al.Name<<endl;

	//skip unmapped
	if(!al.IsMapped()){
	    skipped++;
	    continue;
	}

	//skip paired end !
	if(al.IsPaired() ){  
	    continue;
	    // cerr<<"Paired end not yet coded"<<endl;
	    // return 1;
	}


	string reconstructedReference = reconstructRef(&al);



	char refeBase;
	char readBase;
	bool isDeaminated;
	if(al.Qualities.size() != reconstructedReference.size()){
	    cerr<<"Quality line is not the same size as the reconstructed reference"<<endl;
	    return 1;
	}

	isDeaminated=false;

	if(al.IsReverseStrand()){

	    //first base next to 3'
	    i = 0 ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    if(  readBase  == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    // cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;			
			cerr<<numberOfDeletions(&al)<<endl;			
			cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }
		    
		    //if the VCF has a at least one G but no A
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}

	    }


	    //second base next to 3'
	    i = 1;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+2,al.Position+2);

		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    // cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<"Problem2 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    //if the VCF has at least one G but no A 
		    // if(toprint->hasAtLeastOneG() &&
		    //    toprint->getAlt().find("A") == string::npos){
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}
	    }

	    //last  base next to 5'
	    i = (al.QueryBases.length()-1) ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);


		int lengthMatches=countMatchesRecons(reconstructedReference,0);		
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();

		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;
		    
		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem3 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }


		    //if the VCF has at least one G but no A
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}

	    }

	}else{

		
	    //first base next to 5'
	    i = 0;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C' 
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){ 

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;
		    
		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<numberOfDeletions(&al)<<endl;			
			cerr<<"Problem4 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    //if the VCF has at least one C but no T
		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }

		}

		//cout<<al.Position+
		 
	    }

	    //second last base next to 3'
	    i = (al.QueryBases.length()-2);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C'  &&
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  



		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		
		int lengthMatches=countMatchesRecons(reconstructedReference,1);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);

		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem5 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }
		}

		 

	    }

	    //last base next to 3'
	    i = (al.QueryBases.length()-1);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //&& refeBase  == 'C' 
	    if( readBase  == 'T'  && int(al.Qualities[i]-offset) >= minBaseQuality){  
		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		

		int lengthMatches=countMatchesRecons(reconstructedReference,0);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);

		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem6 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }
		}

	    }	

	   
	    
	}
		  



	totalReads++;

	if(isDeaminated){
	    deaminatedReads++;
	    writerDeam.SaveAlignment(al);		
	}else{
	    ndeaminatedReads++;
	    writerNoDeam.SaveAlignment(al);		
	}


    
    }//end for each read









    reader.Close();
    writerDeam.Close();
    writerNoDeam.Close();

    cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl;

   
    return 0;
}
Beispiel #28
0
// close the BAM files
void BamMultiReader::Close(void) {
    for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
        BamReader* reader = it->first;
        reader->Close();  // close the reader
    }
}
Beispiel #29
0
int main (int argc, char *argv[]) {

     if( (argc!= 3) ||
    	(argc== 2 && string(argv[1]) == "-h") ||
    	(argc== 2 && string(argv[1]) == "-help") ||
    	(argc== 2 && string(argv[1]) == "--help") ){
	 cerr<<"Usage:splitByRG [in bam] [out prefix]"<<endl<<"this program creates one bam file per RG in the with the outprefix\nFor example splitByRG in.bam out will create\nout.rg1.bam\nout.rg2.bam\n"<<endl;
    	return 1;
    }


     string bamfiletopen = string(argv[1]);
     // if(!strEndsWith(bamfiletopen,".bam")){

     // }
     string bamDirOutPrefix    = string(argv[2]);
     map<string,BamWriter *> rg2BamWriter;
     
     // if(!isDirectory(bamDirOut)){
     // 	 cerr<<"ERROR: the out directory does not exist"<<endl;
     // 	return 1;
     // }

     BamReader reader;

     if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
     }

    SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();
    vector<RefData>  refData=reader.GetReferenceData();
    string pID          = "splitByRG";   
    string pName        = "splitByRG";   
    string pCommandLine = "";
    for(int i=0;i<(argc);i++){
        pCommandLine += (string(argv[i])+" ");
    }
    putProgramInHeader(&header,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),".."));


    SamReadGroupDictionary 	srgd=header.ReadGroups;
    for(SamReadGroupConstIterator srgci=srgd.ConstBegin();
	srgci<srgd.ConstEnd();
	srgci++){
	//cout<<*srgci<<endl;
	const SamReadGroup rg = (*srgci);
	//cout<<rg.ID<<endl;
	rg2BamWriter[rg.ID] = new  BamWriter();
	rg2BamWriter[rg.ID]->Open(bamDirOutPrefix+"."+rg.ID+".bam",header,references); 
    }



    BamAlignment al;
    unsigned int total=0;
    while ( reader.GetNextAlignment(al) ) {

	// al.SetIsFailedQC(false);
	// writer.SaveAlignment(al);
	// if(al.IsMapped () ){
	//     if(rg2BamWriter.find(refData[al.RefID].RefName) == rg2BamWriter.end()){ //new
	// 	rg2BamWriter[refData[al.RefID].RefName] = new  BamWriter();
	// 	if ( !rg2BamWriter[refData[al.RefID].RefName]->Open(bamDirOutPrefix+"."+refData[al.RefID].RefName+".bam",header,references) ) {
	// 	    cerr     << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<refData[al.RefID].RefName<<".bam" << endl;
	// 	    return 1;
	// 	}
	
	//     }else{
	// 	rg2BamWriter[refData[al.RefID].RefName]->SaveAlignment(al);
	//     }
	// }else{
	//     unmapped.SaveAlignment(al);
	// }
	if(al.HasTag("RG")){
	    string rgTag;
	    al.GetTag("RG",rgTag);
	    //cout<<rgTag<<endl;
	    if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new
		cerr<<"Found new RG "<<rgTag<<endl;
		rg2BamWriter[rgTag] = new  BamWriter();
	 	if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) {
	 	    cerr     << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl;
	 	    return 1;
	 	}
		rg2BamWriter[rgTag]->SaveAlignment(al);	    	   
	    }else{
		rg2BamWriter[rgTag]->SaveAlignment(al);	    	   
	    }
	}else{
	    string rgTag="unknown";	    
	    //cout<<rgTag<<endl;
	    if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new
		cerr<<"Found new RG "<<rgTag<<endl;
		rg2BamWriter[rgTag] = new  BamWriter();
	 	if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) {
	 	    cerr     << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl;
	 	    return 1;
	 	}
		rg2BamWriter[rgTag]->SaveAlignment(al);	    	   
	    }else{
		rg2BamWriter[rgTag]->SaveAlignment(al);	    	   
	    }

	    // cerr << "Cannot get RG tag for " << al.Name<<endl;
	    // return 1;
	}

	total++;
    } //while al

    reader.Close();
    // writer.Close();
    
    // unmapped.Close();

    map<string,BamWriter *>::iterator rg2BamWriterIt;
    for (rg2BamWriterIt =rg2BamWriter.begin(); 
	 rg2BamWriterIt!=rg2BamWriter.end(); 
	 rg2BamWriterIt++){
	rg2BamWriterIt->second->Close();
    }
    cerr<<"Wrote succesfully "<<total<<" reads"<<endl;


    return 0;
}
Beispiel #30
0
int main (int argc, char *argv[]) {
    if( (argc== 1) ||
	(argc== 2 && string(argv[1]) == "-h") ||
	(argc== 2 && string(argv[1]) == "-help") ||
	(argc== 2 && string(argv[1]) == "--help") ){
	cout<<"Usage:"<<endl;
	cout<<""<<endl;
	cout<<"plotQualScore input.bam"<<endl;
	return 1;
    }

    string bamfiletopen = string(argv[1]);
    BamReader reader;

    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
    }

    // if ( !reader.LocateIndex() ){
    // 	cerr << "warning: cannot locate index for file " << bamfiletopen<<endl;
    // 	//return 1;
    // }

    BamAlignment al;
    BamAlignment al2;
    
    bool unsurePEorSE=true;
    bool pe=true;
    int strLength=-1;
    int vecLengthToUse=-1;

    map<short,unsigned long>  ** counterA = 0;
    map<short,unsigned long>  ** counterC = 0;
    map<short,unsigned long>  ** counterG = 0;
    map<short,unsigned long>  ** counterT = 0;
    
    int lengthIndex1=0;
    int lengthIndex2=0;
    string seqInd1;
    string seqInd2;
    string qualInd1;
    string qualInd2;
    int offsetInd2;

    while ( reader.GetNextAlignment(al) ) {
	if(unsurePEorSE){
	    strLength=al.QueryBases.length();
	    if(al.IsPaired()){
		pe=true;
		vecLengthToUse=2*strLength;		
	    }else{
		pe=false;
		vecLengthToUse=strLength;
	    }
	    string index1;
	    string index2;
	
	    if(al.HasTag("XI")){
		al.GetTag("XI",index1);
		vecLengthToUse+=index1.length();
		lengthIndex1=index1.length();
	    }

	    if(al.HasTag("XJ")){
		al.GetTag("XJ",index2);
		vecLengthToUse+=index2.length();
		lengthIndex2=index2.length();
	    }

	    counterA     = new map<short,unsigned long>  * [vecLengthToUse];
	    counterC     = new map<short,unsigned long>  * [vecLengthToUse];
	    counterG     = new map<short,unsigned long>  * [vecLengthToUse];
	    counterT     = new map<short,unsigned long>  * [vecLengthToUse];
	    for(int i=0;i<vecLengthToUse;i++){
		counterA[i]=new map<short,unsigned long>  ();
		counterC[i]=new map<short,unsigned long>  ();
		counterG[i]=new map<short,unsigned long>  ();
		counterT[i]=new map<short,unsigned long>  ();
		for(short k=minQualScore;k<=maxQualScore;k++){
		    (*counterA[i])[k]=0;
		    (*counterC[i])[k]=0;
		    (*counterG[i])[k]=0;
		    (*counterT[i])[k]=0;
		}
	    }
	    unsurePEorSE=false;
	}else{
	    if(pe  &&
	       !al.IsPaired()){
		cerr << "Cannot have unpaired reads in PE mode" << endl;
		return 1;
	    }

	    if(!pe  &&
	       al.IsPaired()){
		cerr << "Cannot have unpaired reads in SE mode" << endl;
		return 1;
	    }
	}
	
	if(al.QueryBases.length() !=  al.Qualities.length()){
	    cerr << "Cannot have different lengths for sequence and quality" << endl;
	    return 1;
	}
	if(int(al.QueryBases.length()) !=  strLength){
	    cerr << "Cannot have different lengths for sequence and quality" << endl;
	    return 1;
	}

	if(pe){
	    if(al.IsFirstMate()){
		reader.GetNextAlignment(al2);
		if(al2.QueryBases.length() !=  al2.Qualities.length()){
		    cerr << "Cannot have different lengths for sequence and quality" << endl;
		    return 1;
		}

	    }else{
		cerr << "First read should be the first mate" << endl;
		return 1;	    
	    }
	}



	//cycle
	for(unsigned int i=0;i<al.QueryBases.length();i++){
	    short x=(short(al.Qualities[i])-qualOffset);
	    if(al.QueryBases[i] == 'A'){
	    	(*counterA[i])[x]++;
	    }
	    if(al.QueryBases[i] == 'C'){
	    	(*counterC[i])[x]++;
	    }
	    if(al.QueryBases[i] == 'G'){
	    	(*counterG[i])[x]++;
	    }
	    if(al.QueryBases[i] == 'T'){
	    	(*counterT[i])[x]++;
	    }
	}

	//The indices for al and al2 should hopefully be the same 
	if(lengthIndex1>0){
	    al.GetTag("XI",seqInd1);
	    al.GetTag("YI",qualInd1);
	    int j;

	    for(int i=0;i<lengthIndex1;i++){
		j=i+al.QueryBases.length();
		short x=(short(qualInd1[i])-qualOffset);
		if(seqInd1[i] == 'A'){
		    (*counterA[j])[x]++;
		}
		if(seqInd1[i] == 'C'){
		    (*counterC[j])[x]++;
		}
		if(seqInd1[i] == 'G'){
		    (*counterG[j])[x]++;
		}
		if(seqInd1[i] == 'T'){
		    (*counterT[j])[x]++;
		}
	    }
	}

	if(pe){
	    offsetInd2=al.QueryBases.length()+lengthIndex1+al2.QueryBases.length();
	    int j;
	    for(unsigned int i=0;i<al2.QueryBases.length();i++){
		j=i+al.QueryBases.length()+lengthIndex1;
		short x=(short(al2.Qualities[i])-qualOffset);
		if(al2.QueryBases[i] == 'A'){
		    (*counterA[j])[x]++;
		}
		if(al2.QueryBases[i] == 'C'){
		    (*counterC[j])[x]++;
		}
		if(al2.QueryBases[i] == 'G'){
		    (*counterG[j])[x]++;
		}
		if(al2.QueryBases[i] == 'T'){
		    (*counterT[j])[x]++;
		}
	    }
	}else{
	    offsetInd2=al.QueryBases.length()+lengthIndex1;
	}

	//The indices for al and al2 should hopefully be the same 
	if(lengthIndex2>0){
	    al.GetTag("XJ",seqInd2);
	    al.GetTag("YJ",qualInd2);
	    int j;

	    for(int i=0;i<lengthIndex2;i++){
		j=offsetInd2+i;
		short x=(short(qualInd2[i])-qualOffset);
		if(seqInd2[i] == 'A'){
		    (*counterA[j])[x]++;
		}
		if(seqInd2[i] == 'C'){
		    (*counterC[j])[x]++;
		}
		if(seqInd2[i] == 'G'){
		    (*counterG[j])[x]++;
		}
		if(seqInd2[i] == 'T'){
		    (*counterT[j])[x]++;
		}
	    }
	}

    }
    reader.Close();

    cout<<"cycle\t"<<"nuc\t";
    for(short k=minQualScore;k<maxQualScore;k++){
	cout<<k<<"\t";
    }
    cout<<maxQualScore<<endl;

    for(int i=0;i<vecLengthToUse;i++){
	cout<<(i+1)<<"\t";
	cout<<"A\t";
	for(short k=minQualScore;k<maxQualScore;k++){
	    cout<<(*counterA[i])[k]<<"\t";
	}
	cout<<(*counterA[i])[maxQualScore]<<endl;
	cout<<(i+1)<<"\t";
	cout<<"C\t";
	for(short k=minQualScore;k<maxQualScore;k++){
	    cout<<(*counterC[i])[k]<<"\t";
	}
	cout<<(*counterC[i])[maxQualScore]<<endl;
	cout<<(i+1)<<"\t";
	cout<<"G\t";
	for(short k=minQualScore;k<maxQualScore;k++){
	    cout<<(*counterG[i])[k]<<"\t";
	}
	cout<<(*counterG[i])[maxQualScore]<<endl;
	cout<<(i+1)<<"\t";
	cout<<"T\t";
	for(short k=minQualScore;k<maxQualScore;k++){
	    cout<<(*counterT[i])[k]<<"\t";
	}
	cout<<(*counterT[i])[maxQualScore]<<endl;	
    }



    
    return 0;
}