Esempio n. 1
0
//{{{ SV_Pair:: SV_Pair(const BamAlignment &bam_a,
// if both reads are on the same chrome, then read_l must map before read_r
// if the reads are on different strands then read_l must be on the lexo
// lesser chrom (using the string.compare() method)
SV_Pair::
SV_Pair(const BamAlignment &bam_a,
        const BamAlignment &bam_b,
        const RefVector &refs,
        int _weight,
        int _ev_id,
        SV_PairReader *_reader)
{
    reader = _reader;

    if ( bam_a.MapQuality < bam_b.MapQuality )
        min_mapping_quality = bam_a.MapQuality;
    else
        min_mapping_quality = bam_b.MapQuality;

    struct interval tmp_a, tmp_b;
    tmp_a.start = bam_a.Position;
    tmp_a.end = bam_a.GetEndPosition(false, false) - 1;
    tmp_a.chr = refs.at(bam_a.RefID).RefName;

    if ( bam_a.IsReverseStrand() == true )
        tmp_a.strand = '-';
    else
        tmp_a.strand = '+';

    tmp_b.start = bam_b.Position;
    tmp_b.end = bam_b.GetEndPosition(false, false) - 1;
    tmp_b.chr = refs.at(bam_b.RefID).RefName;

    if ( bam_b.IsReverseStrand() == true )
        tmp_b.strand = '-';
    else
        tmp_b.strand = '+';

    //if ( tmp_a.chr.compare(tmp_b.chr) > 0 ) {
    if ( bam_a.RefID < bam_b.RefID ) {
        read_l = tmp_a;
        read_r = tmp_b;
        //} else if ( tmp_a.chr.compare(tmp_b.chr) < 0 ) {
    } else if ( bam_a.RefID > bam_b.RefID) {
        read_l = tmp_b;
        read_r = tmp_a;
    } else { // ==
        if (tmp_a.start > tmp_b.start) {
            read_l = tmp_b;
            read_r = tmp_a;
        } else {
            read_l = tmp_a;
            read_r = tmp_b;
        }
    }

    weight = _weight;
    ev_id = _ev_id;
}
int computeInsertSize(const BamAlignment & firstEnd, const BamAlignment & secondEnd) {
    if (!firstEnd.IsMapped() || !secondEnd.IsMapped()) {
        return 0;
    }
    if (!firstEnd.RefID == secondEnd.RefID) {
        return 0;
    }
    
    const int firstEnd5PrimePosition = firstEnd.IsReverseStrand()? firstEnd.Position + firstEnd.Length: firstEnd.Position;
    const int secondEnd5PrimePosition = secondEnd.IsReverseStrand()? secondEnd.Position + secondEnd.Length: secondEnd.Position;
    
    const int adjustment = (secondEnd5PrimePosition >= firstEnd5PrimePosition) ? +1 : -1;
    return secondEnd5PrimePosition - firstEnd5PrimePosition + adjustment;
}
int32_t
readTail(const BamAlignment& al, 
        const RefVector& refs)
{
    return readTailS(al.IsMapped(), al.IsReverseStrand(), al.Position, 
                     refs[al.RefID].RefLength, al.AlignedBases.length());
}
Esempio n. 4
0
void
clipAlignment(BamAlignment &al)
{
	int offset, length;
	CigarOp cop1 = al.CigarData[0];
	CigarOp cop2 = al.CigarData[al.CigarData.size() - 1];

	if (copcomp(cop2, cop1)) {
		offset = 0;
		length = min(al.Length, (signed)cop1.Length);
	} else {
		offset = al.Length - min(al.Length, (signed)cop2.Length);
		length = min(al.Length, (signed)cop2.Length);
	}

	try {
		al.Qualities = al.Qualities.substr(offset, length);
		al.QueryBases = al.QueryBases.substr(offset, length);
	} catch (exception &e) {
		cout << "ERROR: substr failed in clipAlignment()" << endl;
		cout << al.Name << " " << (al.IsReverseStrand() ? "(-)" : "(+)");
		cout << " offset: " << offset << " length: " << length
		     << " taglen: " << al.Length << endl;
		cout << "cop1: " << cop1.Length << cop1.Type << endl;
		cout << "cop2: " << cop2.Length << cop2.Type << endl;
		exit(1);
	}
}
Esempio n. 5
0
void BamToFastq::SingleFastq() {
    // open the 1st fastq file for writing
    ofstream fq(_fastq1.c_str(), ios::out);
    if ( !fq ) {
        cerr << "Error: The first fastq file (" << _fastq1 << ") could not be opened.  Exiting!" << endl;
        exit (1);
    }
    // open the BAM file
    BamReader reader;
    reader.Open(_bamFile);
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        // extract the sequence and qualities for the BAM "query"
        string seq  = bam.QueryBases;
        string qual = bam.Qualities;
        if (bam.IsReverseStrand() == true) {
            reverseComplement(seq);
            reverseSequence(qual);
        }
        fq << "@" << bam.Name << endl;
        fq << seq << endl;
        fq << "+" << endl;
        fq << qual << endl;
    }
}
Esempio n. 6
0
// print BamAlignment in FASTQ format
// N.B. - uses QueryBases NOT AlignedBases
void ConvertTool::ConvertToolPrivate::PrintFastq(const BamAlignment& a) { 
  
    // @BamAlignment.Name
    // BamAlignment.QueryBases
    // +
    // BamAlignment.Qualities
    //
    // N.B. - QueryBases are reverse-complemented (& Qualities reversed) if aligned to reverse strand .
    //        Name is appended "/1" or "/2" if paired-end, to reflect which mate this entry is.
  
    // handle paired-end alignments
    string name = a.Name;
    if ( a.IsPaired() )
        name.append( (a.IsFirstMate() ? "/1" : "/2") );
  
    // handle reverse strand alignment - bases & qualities
    string qualities = a.Qualities;
    string sequence  = a.QueryBases;
    if ( a.IsReverseStrand() ) {
        Utilities::Reverse(qualities);
        Utilities::ReverseComplement(sequence);
    }
  
    // write to output stream
    m_out << "@" << name << endl
          << sequence    << endl
          << "+"         << endl
          << qualities   << endl;
}
Esempio n. 7
0
    void getBamBlocks(const BamAlignment &bam, const RefVector &refs,
                      vector<BED> &blocks, bool breakOnDeletionOps) {

        CHRPOS currPosition = bam.Position;
        CHRPOS blockStart   = bam.Position;
        string chrom        = refs.at(bam.RefID).RefName;
        string name         = bam.Name;
        string strand       = "+";
        string score        = ToString(bam.MapQuality);
        char  prevOp        = '\0';
        if (bam.IsReverseStrand()) strand = "-";
        bool blocksFound = false;

        vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin();
        vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end();
        for ( ; cigItr != cigEnd; ++cigItr ) {
            if (cigItr->Type == 'M') {
                currPosition += cigItr->Length;
                // we only want to create a new block if the current M op
                // was preceded by an N op or a D op (and we are breaking on D ops)
                if ((prevOp == 'D' && breakOnDeletionOps == true) || (prevOp == 'N')) {
                    blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) );
                    blockStart = currPosition;
                }
            }
            else if (cigItr->Type == 'D') {
                if (breakOnDeletionOps == false)
                    currPosition += cigItr->Length;
                else {
                    blocksFound = true;
                    currPosition += cigItr->Length;
                    blockStart    = currPosition;
                }
            }
            else if (cigItr->Type == 'N') {
                blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) );
                blocksFound = true;
                currPosition += cigItr->Length;
                blockStart    = currPosition;
            }
            else if (cigItr->Type == 'S' || cigItr->Type == 'H' || cigItr->Type == 'P' || cigItr->Type == 'I') {
                // do nothing
            }
            else {
                cerr << "Input error: invalid CIGAR type (" << cigItr->Type
                    << ") for: " << bam.Name << endl;
                exit(1);
            }
            prevOp = cigItr->Type;
        }
        // if there were no splits, we just create a block representing the contiguous alignment.
        if (blocksFound == false) {
            blocks.push_back( BED(chrom, bam.Position, currPosition, name, score, strand) );
        }
    }
Esempio n. 8
0
void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a)
{

    // tab-delimited, 0-based half-open
    // (e.g. a 50-base read aligned to pos 10 could have BED coordinates (10, 60) instead of BAM coordinates (10, 59) )
    // <chromName> <chromStart> <chromEnd> <readName> <score> <strand>

    m_out << m_references.at(a.RefID).RefName << '\t' << a.Position << '\t' << a.GetEndPosition()
          << '\t' << a.Name << '\t' << a.MapQuality << '\t' << (a.IsReverseStrand() ? '-' : '+')
          << std::endl;
}
Esempio n. 9
0
void BedCoverage::CollectCoverageBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedCovFileIntoMap();

    // open the BAM file
    BamReader reader;
    reader.Open(bamFile);

    // get header & reference information
    string header = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // convert each aligned BAM entry to BED
    // and compute coverage on B
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        if (bam.IsMapped()) {
            // treat the BAM alignment as a single "block"
            if (_obeySplits == false) {
                // construct a new BED entry from the current BAM alignment.
                BED a;
                a.chrom  = refs.at(bam.RefID).RefName;
                a.start  = bam.Position;
                a.end    = bam.GetEndPosition(false, false);
                a.strand = "+";
                if (bam.IsReverseStrand()) a.strand = "-";

                _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly);
            }
            // split the BAM alignment into discrete blocks and
            // look for overlaps only within each block.
            else {
                // vec to store the discrete BED "blocks" from a
                bedVector bedBlocks;
                // since we are counting coverage, we do want to split blocks when a
                // deletion (D) CIGAR op is encountered (hence the true for the last parm)
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, false, true);
                // use countSplitHits to avoid over-counting each split chunk
                // as distinct read coverage.
                _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly);
            }
        }
    }
    // report the coverage (summary or histogram) for BED B.
    if (_countsOnly == true)
        ReportCounts();
    else 
        ReportCoverage();
    // close the BAM file
    reader.Close();
}
Esempio n. 10
0
// use current input alignment to update BAM file alignment stats
void StatsTool::StatsToolPrivate::ProcessAlignment(const BamAlignment& al) {
  
    // increment total alignment counter
    ++numReads;
    
    // check the paired-independent flags
    if ( al.IsDuplicate() ) ++numDuplicates;
    if ( al.IsFailedQC()  ) ++numFailedQC;
    if ( al.IsMapped()    ) ++numMapped;
    
    // check forward/reverse strand
    if ( al.IsReverseStrand() ) 
        ++numReverseStrand; 
    else 
        ++numForwardStrand;
    
    // if alignment is paired-end
    if ( al.IsPaired() ) {
      
        // increment PE counter
        ++numPaired;
      
        // increment first mate/second mate counters
        if ( al.IsFirstMate()  ) ++numFirstMate;
        if ( al.IsSecondMate() ) ++numSecondMate;
        
        // if alignment is mapped, check mate status
        if ( al.IsMapped() ) {
            // if mate mapped
            if ( al.IsMateMapped() ) 
                ++numBothMatesMapped;
            // else singleton
            else 
                ++numSingletons;
        }
        
        // check for explicit proper pair flag
        if ( al.IsProperPair() ) ++numProperPair;
        
        // store insert size for first mate 
        if ( settings->IsShowingInsertSizeSummary && al.IsFirstMate() && (al.InsertSize != 0) ) {
            int insertSize = abs(al.InsertSize);
            insertSizes.push_back( insertSize );
        }
    }
}
Esempio n. 11
0
//increases the counters mismatches and typesOfMismatches of a given BamAlignment object
inline void increaseCounters(BamAlignment & al,string & reconstructedReference,int firstCycleRead,int increment){

    char refeBase;
    char readBase;
    int cycleToUse=firstCycleRead;
    // cout<<"name "<<al.Name<<endl;
    // cout<<"firstCycleRead "<<firstCycleRead<<endl;
    // cout<<"increment      "<<increment<<endl;

    for(int i=0;i<numberOfCycles;i++,cycleToUse+=increment){
        // cout<<"i = "<<i<<" cyc "<<cycleToUse<<endl;
	refeBase=toupper(reconstructedReference[i]);
	readBase=toupper(         al.QueryBases[i]);
		     
	//match
	if(refeBase == 'M'){
	    matches[cycleToUse]++;
	    continue;
	}

	if(refeBase == 'S' ||refeBase == 'I'){ //don't care about soft clipped or indels
	    continue;
	}
		    
	//mismatch
	if( isResolvedDNA(refeBase)  && 
	    isResolvedDNA(readBase) ){
	    if(al.IsReverseStrand()){ //need to take the complement
		refeBase=complement(refeBase);
		readBase=complement(readBase);
	    }
	    if(readBase == refeBase){
		cerr<<"Internal error in reconstruction of read "<<al.Name<<", contact developer"<<endl;
		exit(1);;
	    }						
	    mismatches[cycleToUse]++;
	    typesOfMismatches[dimer2index(refeBase,readBase)][cycleToUse]++;
	    continue;
	}		    		     
    }
}
Esempio n. 12
0
 void getBamBlocks(const BamAlignment &bam, const RefVector &refs, 
                   BedVec &blocks, bool breakOnDeletionOps) {
 
 	CHRPOS currPosition = bam.Position;
     CHRPOS blockStart   = bam.Position;
     string chrom        = refs.at(bam.RefID).RefName;
     string name         = bam.Name;
     string strand       = "+";
     float  score        = bam.MapQuality;
     if (bam.IsReverseStrand()) strand = "-"; 
 	
 	vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin();
 	vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end();
     for ( ; cigItr != cigEnd; ++cigItr ) {
         if (cigItr->Type == 'M') {
             currPosition += cigItr->Length;
 			blocks.push_back( Bed(chrom, blockStart, currPosition, name, score, strand) );
 			blockStart    = currPosition;
         }
         else if (cigItr->Type == 'D') {
             if (breakOnDeletionOps == false)
                 currPosition += cigItr->Length;
             else {
                 currPosition += cigItr->Length;
                 blockStart    = currPosition;
             }
         }
         else if (cigItr->Type == 'N') {
             currPosition += cigItr->Length;
             blockStart    = currPosition;            }
         else if (cigItr->Type == 'S' || cigItr->Type == 'H' || cigItr->Type == 'P' || cigItr->Type == 'I') {
             // do nothing
         }
         else {
             cerr << "Input error: invalid CIGAR type (" << cigItr->Type
 				<< ") for: " << bam.Name << endl;
 			exit(1);
         }
 	}
 }
Esempio n. 13
0
/*
 * snip() doesn't leave a valid BamAlignment; it contains
 * correct FASTQ data.  Handles negative strand alignments:
 * 'start=0' will always correspond to the 5'-most basepair
 * in the read.
 */
BamAlignment
snip(BamAlignment &a, int start, int len)
{
	BamAlignment copy(a);

	/* Handle reverse strand mappings */
	int converted_start = copy.IsReverseStrand() ?
		copy.Length - start - len : start;
	copy.Length = len;
	try {
		copy.QueryBases = copy.QueryBases.substr(converted_start, len);
		copy.Qualities = copy.Qualities.substr(converted_start, len);
	} catch (exception &e) {
		cout << "ERROR: substr failed in snip(" << a.Name << ", "
		     << start << ", " << len << ")" << endl;
		cout << (a.IsReverseStrand() ? "(-)" : "(+)")
		     << ", converted_start: " << converted_start << endl;
		cout << a.QueryBases << endl;
		cout << a.Qualities << endl;
		exit(1);
	}
	return copy;
}
Esempio n. 14
0
// print BamAlignment in FASTA format
// N.B. - uses QueryBases NOT AlignedBases
void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a) { 
    
    // >BamAlignment.Name
    // BamAlignment.QueryBases (up to FASTA_LINE_MAX bases per line)
    // ...
    //
    // N.B. - QueryBases are reverse-complemented if aligned to reverse strand
  
    // print header
    m_out << ">" << a.Name << endl;
    
    // handle reverse strand alignment - bases 
    string sequence = a.QueryBases;
    if ( a.IsReverseStrand() )
        Utilities::ReverseComplement(sequence);
    
    // if sequence fits on single line
    if ( sequence.length() <= FASTA_LINE_MAX )
        m_out << sequence << endl;
    
    // else split over multiple lines
    else {
      
        size_t position = 0;
        size_t seqLength = sequence.length(); // handle reverse strand alignment - bases & qualitiesth();
        
        // write subsequences to each line
        while ( position < (seqLength - FASTA_LINE_MAX) ) {
            m_out << sequence.substr(position, FASTA_LINE_MAX) << endl;
            position += FASTA_LINE_MAX;
        }
        
        // write final subsequence
        m_out << sequence.substr(position) << endl;
    }
}
void GenericIndividualSnpCall::PyroHMMsnp(Fasta &fastaObj, BamReader &bamObj, int chrID, int leftPosition, int rightPosition, GenericProbabilisticAlignment &probAligner, list<Allele>& allelesInBlock, VariantCallSetting& snpCallSettings, vector<GenericVariant> &variantResults)
{
    VariantCallSetting settingForPyroHMMsnp = snpCallSettings;

    // allele pool
    vector<Allele> allelePool;
    for (list<Allele>::iterator allelesInBlockIter=allelesInBlock.begin(); allelesInBlockIter!=allelesInBlock.end(); allelesInBlockIter++)
    {
        allelePool.push_back(*allelesInBlockIter);
    }

    // add 10bp flanking segment at each side
    int windowLeftPosition  = leftPosition  - snpCallSettings.m_flankingSize;
    int windowRightPosition = rightPosition + snpCallSettings.m_flankingSize;

    // genome
    string genome;
    fastaObj.GetSequence(chrID, windowLeftPosition, windowRightPosition, genome);

    int    globalDepth;
    double globalMapQual;
    int    globalStrandPos;
    int    globalStrandNeg;

    vector<PyroHMMsnp_Sequence_t> readsInWindow;

    // rewind BAM reader
    bamObj.Rewind();
    // set BAM region
    bamObj.SetRegion(chrID, windowLeftPosition, chrID, windowRightPosition);
    // read alignment
    BamAlignment al;
    while (bamObj.GetNextAlignment(al))
    {
        // skip if it is not a good alignment
        if (!GenericBamAlignmentTools::goodAlignment(al))
        {
            continue;
        }

        // skip if it is not valid at length
        if (!GenericBamAlignmentTools::validReadLength(al, m_minReadLength))
        {
            continue;
        }

        // skip if it is not valid at map quality
        if (!GenericBamAlignmentTools::validMapQuality(al, m_minMapQuality))
        {
            continue;
        }

        // skip if it is not valid at alignment identity
        if (!GenericBamAlignmentTools::validReadIdentity(al, m_maxMismatchFrac))
        {
            continue;
        }

        // global info
        globalDepth   += 1;
        globalMapQual += al.MapQuality*al.MapQuality;
        if (al.IsReverseStrand())
            globalStrandNeg += 1;
        else
            globalStrandPos += 1;

        // get local alignment
        string t_localRead, t_localGenome;
        Cigar  t_cigar;
        BamMD  t_md;
        int    t_numMismatch, t_numInDel;
        GenericBamAlignmentTools::getLocalAlignment(al, windowLeftPosition, windowRightPosition-windowLeftPosition,
                                                    t_localRead, t_localGenome, t_cigar, t_md,
                                                    t_numMismatch, t_numInDel);

        if (t_localRead.empty() || t_localGenome.empty())
            continue;


        // save into set
        PyroHMMsnp_Sequence_t t_seq;
        t_seq.t_ID           = GenericBamAlignmentTools::getBamAlignmentID(al);
        t_seq.t_sequence     = t_localRead;
        t_seq.t_cigar        = t_cigar;
        t_seq.t_md           = t_md;
        t_seq.t_numMismatch  = t_numMismatch;
        t_seq.t_numInDel     = t_numInDel;
        t_seq.t_mapQualScore = al.MapQuality;


        if (al.Position>windowLeftPosition)
            t_seq.t_startPositionShift = al.Position-windowLeftPosition;
        else
            t_seq.t_startPositionShift = 0;

        if (al.GetEndPosition()<windowRightPosition)
            t_seq.t_endPositionShift = windowRightPosition-al.GetEndPosition();
        else
            t_seq.t_endPositionShift = 0;

        readsInWindow.push_back(t_seq);
    }

    int numData = readsInWindow.size();

    // construct the consensus sequence graph
    GenericDagGraph consensusGraph;
    vector<string>  consensusGraphReads;
    vector<Cigar>   consensusGraphReadCigars;
    vector<int>     consensusGraphReadStarts;

    // set of aligned reads to construct the graph
    for (int i=0; i<numData; ++i)
    {
        consensusGraphReads.push_back(readsInWindow[i].t_sequence);
        consensusGraphReadCigars.push_back(readsInWindow[i].t_cigar);
        consensusGraphReadStarts.push_back(readsInWindow[i].t_startPositionShift);
    }

    // build up the graph
    consensusGraph.buildDagGraph(genome, consensusGraphReads, consensusGraphReadCigars, consensusGraphReadStarts);
    consensusGraph.edgePruning(snpCallSettings.m_graphPruneLevel);

    // search topK paths, excluding reference
    vector<string>       topRankConsensusGraphPaths;
    vector<list<Vertex>> topRankConsensusGraphPathVertexs;
    vector<double>       topRankConsensusGraphPathWeights;
    consensusGraph.topRankPathsExcludeGenome(30, topRankConsensusGraphPaths, topRankConsensusGraphPathVertexs, topRankConsensusGraphPathWeights);

    // change vertex list to vertex set
    vector<set<Vertex>>  topRankConsensusGraphPathVertexSet;
    for (int i=0; i<topRankConsensusGraphPathVertexs.size(); i++)
    {
        list<Vertex>::iterator vertexIter = topRankConsensusGraphPathVertexs[i].begin();
        set<Vertex> vertexSet;
        for (; vertexIter!=topRankConsensusGraphPathVertexs[i].end(); vertexIter++)
        {
            vertexSet.insert(*vertexIter);
        }
        topRankConsensusGraphPathVertexSet.push_back(vertexSet);
    }

    // get variant vertices
    vector<int>    allelePositions;
    vector<string> alleleChars;
    for (list<Allele>::iterator alleleIter=allelesInBlock.begin(); alleleIter!=allelesInBlock.end(); alleleIter++)
    {
        Allele allele = *alleleIter;
        allelePositions.push_back(allele.m_chrPosition-windowLeftPosition);
        alleleChars.push_back(allele.m_allele);
    }
    // map allele to graph vertex
    set<Vertex> variantVertexs;
    map<int,Vertex> mapAlleleToVertex;
    map<Vertex,int> mapVertexToAllele;
    for (int v=0; v<consensusGraph.m_numVertexs; v++)
    {
        if (consensusGraph.m_skip[v])
            continue;

        if (!consensusGraph.m_isMismatch[v])
            continue;

        int gp = consensusGraph.m_genomePosition[v] - 1;


        for (int j=0; j<allelePool.size(); j++)
        {
            int ap = allelePositions[j];
            if (ap==gp)
            {
                if (alleleChars[j]==consensusGraph.m_labels[v])
                {
                    variantVertexs.insert(v);
                    mapAlleleToVertex[j] = v;
                    mapVertexToAllele[v] = j;
                }
            }
        }
    }


    // set up the haplotypes
    vector<string> haplotypes;
    vector<int>    haplotypeToPathIndex;
    vector<set<Vertex>> haplotypeVariantVertexs;

    haplotypes.push_back(genome);
    haplotypeToPathIndex.push_back(-1);
    haplotypeVariantVertexs.push_back(set<Vertex>());

    int kk = 0;
    for (int i=0; i<topRankConsensusGraphPaths.size(); i++)
    {
        if (kk>=snpCallSettings.m_topK)
            continue;

        bool hasVariantVertex = false;
        int  deltaLength = (topRankConsensusGraphPaths[i].length()-genome.length());
        deltaLength = abs(deltaLength);

        if (deltaLength>5)
            continue;

        set<Vertex> pathVertexs = topRankConsensusGraphPathVertexSet[i];
        set<Vertex> pathVariantVertexs;
        for (set<Vertex>::iterator variantIter=variantVertexs.begin(); variantIter!=variantVertexs.end(); variantIter++)
        {
            if (pathVertexs.find(*variantIter)!=pathVertexs.end())
            {
                hasVariantVertex = true;
                pathVariantVertexs.insert(*variantIter);
            }
        }

        int totalNumberVariantVertexInPath = 0;
        for (set<Vertex>::iterator vertexIter=pathVertexs.begin(); vertexIter!=pathVertexs.end(); vertexIter++)
        {
            int v = *vertexIter;
            if (consensusGraph.m_isMismatch[v])
            {
                totalNumberVariantVertexInPath += 1;
            }
        }

        if (hasVariantVertex && totalNumberVariantVertexInPath<=pathVariantVertexs.size())
        {
            haplotypes.push_back(topRankConsensusGraphPaths[i]);
            haplotypeToPathIndex.push_back(i);
            haplotypeVariantVertexs.push_back(pathVariantVertexs);

            kk++;
        }
    }

    int numHaplotypes = haplotypes.size();

    // skip if there is no variant haplotype
    if (numHaplotypes==1)
    {
        return;
    }

    // compute haplotype data likelihood
    vector<vector<long double>> haplotypeDataLikelihoods(numHaplotypes);
    PyroHMMsnpHaplotypeDataLikelihood(probAligner, snpCallSettings.m_band, numHaplotypes, haplotypes, readsInWindow, haplotypeDataLikelihoods);


    // genotype
    vector<vector<int>> genotypes;
    set<set<int>> genotypeDiscovered;
    for (int i=0; i<numHaplotypes; i++)
    {
        vector<int> precedeHaplotypes;
        PyroHMMsnpGenotypeSet(snpCallSettings.m_ploidy, i, numHaplotypes, precedeHaplotypes, genotypes, genotypeDiscovered);
    }

    int numGenotypes = genotypes.size();

    // genotype variant vertex
    vector<set<Vertex>> genotypeVariantVertexs;
    for (int i=0; i<numGenotypes; i++)
    {
        set<Vertex> variantVertexInGenotype;
        for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++)
        {
            int haplotype = genotypes[i][j];
            set<Vertex> variantVertexInHaplotype = haplotypeVariantVertexs[haplotype];
            variantVertexInGenotype.insert(variantVertexInHaplotype.begin(), variantVertexInHaplotype.end());
        }
        genotypeVariantVertexs.push_back(variantVertexInGenotype);
    }

    // genotype priors
    vector<long double> genotypePriors(numGenotypes);
    PyroHMMsnpGenotypePrior(numGenotypes, genotypes, settingForPyroHMMsnp, genotypePriors);

    // genotype likelihoods
    vector<long double> genotypeLikelihoods(numGenotypes);
    PyroHMMsnpGenotypeLikelihood(numGenotypes, genotypes, readsInWindow.size(), haplotypeDataLikelihoods, snpCallSettings, genotypeLikelihoods);

    // genotype posteriors
    vector<long double> genotypePosteriors(numGenotypes);
    PyroHMMsnpGenotypePosterior(numGenotypes, genotypePriors, genotypeLikelihoods, genotypePosteriors);

    // search maximal genotype posterior
    long double maxGenotypePosterior = 0;
    int inferGenotype;
    for (int i=1; i<numGenotypes; i++)
    {
        if (maxGenotypePosterior<genotypePosteriors[i])
        {
            maxGenotypePosterior = genotypePosteriors[i];
            inferGenotype = i;
        }
    }

    // all variant vertexs in the inferred genotype
    set<Vertex> inferGenotypeVariantVertexs = genotypeVariantVertexs[inferGenotype];

    // count haploid type of variant
    map<Vertex,vector<int>> inferGenotypeVariantHaploidType;
    set<Vertex>::iterator inferVariantIter = inferGenotypeVariantVertexs.begin();
    for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++)
    {
        int v = *inferVariantIter;
        vector<int> variantHaploidType;
        for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++)
        {
            int haplotype = genotypes[inferGenotype][j];
            set<Vertex> variantVertexInHaplotype = haplotypeVariantVertexs[haplotype];
            if (variantVertexInHaplotype.find(v)==variantVertexInHaplotype.end())
            {
                variantHaploidType.push_back(0);
            }else
            {
                variantHaploidType.push_back(1);
            }
        }
        inferGenotypeVariantHaploidType[v] = variantHaploidType;
    }
    // variant score
    map<Vertex,long double> inferGenotypeVariantScore;
    inferVariantIter = inferGenotypeVariantVertexs.begin();
    for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++)
    {
        int v = *inferVariantIter;
        long double variantScore = 0;
        for (int i=0; i<numGenotypes; i++)
        {
            set<Vertex> variantVertexInGenotype = genotypeVariantVertexs[i];
            if (variantVertexInGenotype.find(v)!=variantVertexInGenotype.end())
                variantScore += genotypePosteriors[i];
        }

        inferGenotypeVariantScore[v] = variantScore;
    }

    // save variant result
    inferVariantIter = inferGenotypeVariantVertexs.begin();
    for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++)
    {
        GenericVariant result;

        int v = *inferVariantIter;
        int a = mapVertexToAllele[v];

        int variantChrID;
        int variantChrPos;

        vector<int> haploidType = inferGenotypeVariantHaploidType[v];
        for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++)
        {
            if (haploidType[j]==0)
            {
                int g = consensusGraph.m_genomePosition[v];

                Allele allele;
                allele.m_allele = consensusGraph.m_labels[g];
                result.m_alleles.push_back(allele);
            }else
            {
                Allele allele = allelePool[a];
                result.m_alleles.push_back(allele);

                variantChrID  = allele.m_chrID;
                variantChrPos = allele.m_chrPosition;
            }
        }

        result.m_chrID           = variantChrID;
        result.m_chrPosition     = variantChrPos;
        result.m_probScoreRef    = genotypePosteriors[0];
        result.m_probScoreVar    = genotypePosteriors[inferGenotype];
        result.m_variantType     = VARIANT_SNP;
        long double variantScore = inferGenotypeVariantScore[v];
        if (fabs(1-variantScore)<1e-300)
            result.m_quality     = 3000;
        else if (variantScore<1e-300)
            result.m_quality     = 0;
        else
            result.m_quality     = -10*log10(1-variantScore);

        char refBase;
        fastaObj.GetBase(result.m_chrID, result.m_chrPosition, refBase);
        result.m_reference       = refBase;

        for (int i=0; i<result.m_alleles.size(); i++)
        {
            if (result.m_alleles[i].m_allele==result.m_reference)
                result.m_haploidType.push_back(0);
            else
                result.m_haploidType.push_back(1);
        }


        // filter
        if (result.m_quality>=snpCallSettings.m_variantQualityFilter)
            variantResults.push_back(result);

    }

}
int main_asequantmultirg(const vector<string> &all_args)
{
    Init(all_args);
    
    cerr << "* Reading bam file " << endl;
    OpenBam(bam_reader, bam_file);
    bam_reader.OpenIndex(bam_file + ".bai");
    
    vector<string> readGroupVector;
    SamHeader header = bam_reader.GetHeader();
    SamReadGroupDictionary headerRG = header.ReadGroups;
    for (SamReadGroupIterator it = headerRG.Begin(); it != headerRG.End(); it ++)
    {
        readGroupVector.push_back(it -> ID);
    }
    
    
    vector<RefData> chroms = bam_reader.GetReferenceData();
    
    cout << "#CHROM" << "\t" << "POS" << "\t" << "REF" << "\t" << "ALT";
    for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++)
    {
        cout << "\t" << *it;
    }
    cout << endl;
    
    StlFor(chrom_idx, chroms)
    {
        string &chrom = chroms[chrom_idx].RefName;
        vector<Snp> snps = snps_by_chrom[chrom];
        
        int s = 0; // Index into snp array
        
        BamAlignment bam;
        bam_reader.Jump(chrom_idx);
        
        string align;
        string qualities;
        
        cerr << "* On chrom " << chrom << endl;

        while (bam_reader.GetNextAlignment(bam) && bam.RefID == chrom_idx) 
        {
	  if (bam.MapQuality < min_map_qual || !bam.IsMapped())
                continue;
       
            string currentRG;
            Assert(bam.GetReadGroup(currentRG));
            
            int start = AlignStart(bam);
            int end = AlignEnd(bam);
            
            // Move the current SNP pointer so that it is ahead of the read's start (since bam alignments are in sorted order)
            while (s < snps.size() && snps[s].pos < start)
                ++s;
            
            // Stop everything if we have visited all SNPs on this chrom
            if (s >= snps.size())
                break;
            
            // Find any/all SNPs that are within the bam alignment
            int n = 0; // Number of SNPs overlapped
            while ((s + n) < snps.size() && snps[s + n].pos < end) // Then it overlaps!
                ++n;
            
            // Now, look at each SNP and see which way it votes
            AlignedString(bam, align);
            AlignedQualities(bam, qualities);
            Assert(align.size() == qualities.size());

            // Now, tally votes
            for (int i = 0; i < n; ++i)
            {
                Snp &snp = snps[s + i];
                char base = align[snp.pos - start]; // Base from the read
                int qual = int(qualities[snp.pos - start]) - ascii_offset; // Base from the read
                
                //AssertMsg(qual >= 0 && qual <= 100, ToStr(qual) + "\n" + bam.Name + "\n" + CigarToStr(bam.CigarData) + "\n" + bam.QueryBases + "\n" + bam.Qualities);
                
                if (base == '-' || qual < min_base_qual)
                    continue;
                
                map<string, Counts> &RG_counts = bam.IsReverseStrand() ? snp.rev : snp.fwd;
                
                map<string, Counts>::iterator searchIt = RG_counts.find(currentRG);
                
                if (searchIt == RG_counts.end())
                {
                    if (base == snp.ref)
                    {
                        RG_counts[currentRG].num_ref = 1;
                        RG_counts[currentRG].num_alt = 0;
                        RG_counts[currentRG].num_other = 0;
                    }
                    else if (base == snp.alt)
                    {
                        RG_counts[currentRG].num_ref = 0;
                        RG_counts[currentRG].num_alt = 1;
                        RG_counts[currentRG].num_other = 0;
                    }
                    else
                    {
                        RG_counts[currentRG].num_ref = 0;
                        RG_counts[currentRG].num_alt = 0;
                        RG_counts[currentRG].num_other = 1;
                    }
                }
                else
                {
                    if (base == snp.ref)
                    {
                        searchIt -> second.num_ref += 1;
                    }
                    else if (base == snp.alt)
                    {
                        searchIt -> second.num_alt += 1;
                    }
                    else
                    {
                        searchIt -> second.num_other += 1;
                    }
                }
            }
        }
        
        // Output counts
        for (int s = 0; s < snps.size(); ++s)
        {
            cout << chrom << "\t" << snps[s].pos + 1 << "\t" << snps[s].ref << "\t" << snps[s].alt;
            for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++)
            {
                map<string, Counts>::iterator searchIt = snps[s].fwd.find(*it);
                if (searchIt != snps[s].fwd.end())
                {
                    cout << "\t" << searchIt -> second.num_ref << "," << searchIt -> second.num_alt << "," << searchIt -> second.num_other << ",";
                }
                else
                {
                    cout << "\t" << "0,0,0,";
                }
                searchIt = snps[s].rev.find(*it);
                if (searchIt != snps[s].rev.end())
                {
                    cout << searchIt -> second.num_ref << "," << searchIt -> second.num_alt << "," << searchIt -> second.num_other;
                }
                else
                {
                    cout << "0,0,0";
                }
            }
            cout << endl;
        }
    }
Esempio n. 17
0
int IonstatsTestFragments(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string input_bam_filename   = opts.GetFirstString('i', "input", "");
  string fasta_filename       = opts.GetFirstString('r', "ref", "");
  string output_json_filename = opts.GetFirstString('o', "output", "ionstats_tf.json");
  int histogram_length        = opts.GetFirstInt   ('h', "histogram-length", 400);

  if(argc < 2 or input_bam_filename.empty() or fasta_filename.empty()) {
    IonstatsTestFragmentsHelp();
    return 1;
  }

  //
  // Prepare for metric calculation
  //

  map<string,string> tf_sequences;
  PopulateReferenceSequences(tf_sequences, fasta_filename);


  BamReader input_bam;
  if (!input_bam.Open(input_bam_filename)) {
    fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str());
    return 1;
  }

  int num_tfs = input_bam.GetReferenceCount();


  SamHeader sam_header = input_bam.GetHeader();
  if(!sam_header.HasReadGroups()) {
    fprintf(stderr, "[ionstats] ERROR: no read groups in %s\n", input_bam_filename.c_str());
    return 1;
  }

  string flow_order;
  string key;
  for (SamReadGroupIterator rg = sam_header.ReadGroups.Begin(); rg != sam_header.ReadGroups.End(); ++rg) {
    if(rg->HasFlowOrder())
      flow_order = rg->FlowOrder;
    if(rg->HasKeySequence())
      key = rg->KeySequence;
  }


  // Need these metrics stratified by TF.

  vector<ReadLengthHistogram> called_histogram(num_tfs);
  vector<ReadLengthHistogram> aligned_histogram(num_tfs);
  vector<ReadLengthHistogram> AQ10_histogram(num_tfs);
  vector<ReadLengthHistogram> AQ17_histogram(num_tfs);
  vector<SimpleHistogram> error_by_position(num_tfs);
  vector<MetricGeneratorSNR> system_snr(num_tfs);
  vector<MetricGeneratorHPAccuracy> hp_accuracy(num_tfs);

  for (int tf = 0; tf < num_tfs; ++tf) {
    called_histogram[tf].Initialize(histogram_length);
    aligned_histogram[tf].Initialize(histogram_length);
    AQ10_histogram[tf].Initialize(histogram_length);
    AQ17_histogram[tf].Initialize(histogram_length);
    error_by_position[tf].Initialize(histogram_length);
  }

  vector<uint16_t> flow_signal_fz(flow_order.length());
  vector<int16_t> flow_signal_zm(flow_order.length());

  const RefVector& refs = input_bam.GetReferenceData();

  // Missing:
  //  - hp accuracy - tough, copy verbatim from TFMapper?


  BamAlignment alignment;
  vector<char>  MD_op;
  vector<int>   MD_len;
  MD_op.reserve(1024);
  MD_len.reserve(1024);
  string MD_tag;

  //
  // Main loop over mapped reads in the input BAM
  //

  while(input_bam.GetNextAlignment(alignment)) {


    if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag))
      continue;

    // The check below eliminates unexpected alignments
    if (alignment.IsReverseStrand() or alignment.Position > 5)
      continue;

    int current_tf = alignment.RefID;

    //
    // Step 1. Parse MD tag
    //

    MD_op.clear();
    MD_len.clear();

    for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) {

      int item_length = 0;
      if (*MD_ptr >= '0' and *MD_ptr <= '9') {    // Its a match
        MD_op.push_back('M');
        for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr)
          item_length = 10*item_length + *MD_ptr - '0';
      } else {
        if (*MD_ptr == '^') {                     // Its a deletion
          MD_ptr++;
          MD_op.push_back('D');
        } else                                    // Its a substitution
          MD_op.push_back('X');
        for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr)
          item_length++;
      }
      MD_len.push_back(item_length);
    }

    //
    // Step 2. Synchronously scan through Cigar and MD, doing error accounting
    //

    int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0;
    int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0;
    int increment = alignment.IsReverseStrand() ? -1 : 1;

    int AQ10_bases = 0;
    int AQ17_bases = 0;
    int num_bases = 0;
    int num_errors = 0;

    while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) {

      if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar
        cigar_idx += increment;
        continue;
      }
      if (MD_len[MD_idx] == 0) { // Try advancing MD
        MD_idx += increment;
        continue;
      }

      // Match
      if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        num_bases += advance;
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Insertion (read has a base, reference doesn't)
      } else if (alignment.CigarData[cigar_idx].Type == 'I') {
        int advance = alignment.CigarData[cigar_idx].Length;
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_bases++;
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;

      // Deletion (reference has a base, read doesn't)
      } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Substitution
      } else if (MD_op[MD_idx] == 'X') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_bases++;
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      } else {
        printf("ionstats tf: Unexpected OP combination: %s Cigar=%c, MD=%c !\n",
            alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]);
        break;
      }

      if (num_errors*10 <= num_bases)   AQ10_bases = num_bases;
      if (num_errors*50 <= num_bases)   AQ17_bases = num_bases;
    }

    //
    // Step 3. Profit
    //

    called_histogram[current_tf].Add(alignment.Length);
    aligned_histogram[current_tf].Add(num_bases);
    AQ10_histogram[current_tf].Add(AQ10_bases);
    AQ17_histogram[current_tf].Add(AQ17_bases);

    if(alignment.GetTag("ZM", flow_signal_zm))
      system_snr[current_tf].Add(flow_signal_zm, key.c_str(), flow_order);
    else if(alignment.GetTag("FZ", flow_signal_fz))
      system_snr[current_tf].Add(flow_signal_fz, key.c_str(), flow_order);


    // HP accuracy - keeping it simple

    if (!alignment.IsReverseStrand()) {

      string genome = key + tf_sequences[refs[current_tf].RefName];
      string calls = key + alignment.QueryBases;
      const char *genome_ptr = genome.c_str();
      const char *calls_ptr = calls.c_str();

      for (int flow = 0; flow < (int)flow_order.length() and *genome_ptr and *calls_ptr; ++flow) {
        int genome_hp = 0;
        int calls_hp = 0;
        while (*genome_ptr == flow_order[flow]) {
          genome_hp++;
          genome_ptr++;
        }
        while (*calls_ptr == flow_order[flow]) {
          calls_hp++;
          calls_ptr++;
        }
        hp_accuracy[current_tf].Add(genome_hp, calls_hp);
      }
    }
  }



  //
  // Processing complete, generate ionstats_tf.json
  //

  Json::Value output_json(Json::objectValue);
  output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL));
  output_json["meta"]["format_name"] = "ionstats_tf";
  output_json["meta"]["format_version"] = "1.0";

  output_json["results_by_tf"] = Json::objectValue;

  for (int tf = 0; tf < num_tfs; ++tf) {

    if (aligned_histogram[tf].num_reads() < 1000)
      continue;

    called_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["full"]);
    aligned_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["aligned"]);
    AQ10_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ10"]);
    AQ17_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ17"]);
    error_by_position[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["error_by_position"]);
    system_snr[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]);
    hp_accuracy[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]);

    output_json["results_by_tf"][refs[tf].RefName]["sequence"] = tf_sequences[refs[tf].RefName];
  }

  input_bam.Close();

  ofstream out(output_json_filename.c_str(), ios::out);
  if (out.good()) {
    out << output_json.toStyledString();
    return 0;
  } else {
    fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str());
    return 1;
  }
}
Esempio n. 18
0
//{{{ SV_SplitRead:: SV_SplitRead(vector< BamAlignment > &block,
SV_SplitRead::
SV_SplitRead(const BamAlignment &bam_a,
             const BamAlignment &bam_b,
             const RefVector &refs,
             int _weight,
             int _id,
             int _sample_id,
             SV_SplitReadReader *_reader)
{
    reader = _reader;
    sample_id = _sample_id;

    if ( bam_a.MapQuality < bam_b.MapQuality )
        min_mapping_quality = bam_a.MapQuality;
    else
        min_mapping_quality = bam_b.MapQuality;

    struct cigar_query query_a =
        calc_query_pos_from_cigar(bam_a.CigarData,
                                  bam_a.IsReverseStrand() );
    struct cigar_query query_b =
        calc_query_pos_from_cigar(bam_b.CigarData,
                                  bam_b.IsReverseStrand() );

    struct interval tmp_a, tmp_b;

    tmp_a.strand = '+';
    if (bam_a.IsReverseStrand())
        tmp_a.strand = '-';
    tmp_a.chr = refs.at(bam_a.RefID).RefName;
    tmp_a.start = bam_a.Position;
    tmp_a.end = bam_a.GetEndPosition();

    tmp_b.strand = '+';
    if (bam_b.IsReverseStrand())
        tmp_b.strand = '-';
    tmp_b.chr = refs.at(bam_b.RefID).RefName;
    tmp_b.start = bam_b.Position;
    tmp_b.end = bam_b.GetEndPosition();


    //if ( ( tmp_a.chr.compare(tmp_b.chr) > 0 ) ||
    //( ( tmp_a.chr.compare(tmp_b.chr) == 0 ) &&
    //( tmp_a.start > tmp_b.start ) ) ) {

    if ( (bam_a.RefID > bam_b.RefID) ||
            ( (bam_a.RefID == bam_b.RefID) &&
              (tmp_a.start > tmp_b.start ) ) ) {
        side_r = tmp_a;
        side_l = tmp_b;
        query_r = query_a;
        query_l = query_b;
    } else {
        side_l = tmp_a;
        side_r = tmp_b;
        query_l = query_a;
        query_r = query_b;
    }

    if (side_l.strand != side_r.strand)
        type = SV_BreakPoint::INVERSION;
    else if ( (	( side_l.strand == '+' ) &&
                ( side_r.strand == '+' ) &&
                ( query_l.qs_pos < query_r.qs_pos ) ) ||
              (	( side_l.strand == '-' ) &&
                  ( side_r.strand == '-' ) &&
                  ( query_l.qs_pos > query_r.qs_pos) ) )
        type = SV_BreakPoint::DELETION;
    else if ( ( ( side_l.strand == '+' ) &&
                ( side_r.strand == '+' ) &&
                ( query_l.qs_pos > query_r.qs_pos ) ) ||
              ( ( side_l.strand == '-' ) &&
                ( side_r.strand == '-' ) &&
                ( query_l.qs_pos < query_r.qs_pos) ) )
        type = SV_BreakPoint::DUPLICATION;
    else {
        cerr << "ERROR IN BAM FILE.  " <<
             "TYPE not detected (DELETION,DUPLICATION,INVERSION)" <<
             endl;
        cerr << "\t" << query_l.qs_pos << "," << side_l.strand << "\t" <<
             query_r.qs_pos << "," << side_r.strand << "\t" <<
             tmp_a.chr << "," << tmp_a.start << "," << tmp_a.end << "\t" <<
             tmp_b.chr << "," << tmp_b.start << "," << tmp_b.end << "\t" <<
             endl;

        throw(1);
    }

    weight = _weight;
    id = _id;
}
Esempio n. 19
0
int main (int argc, char *argv[]) {

    int  minBaseQuality = 0;

    string usage=string(""+string(argv[0])+"  [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+
			"\nThis program divides aligned single end reads into potentially deaminated\n"+
			"\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+
			"\nTip: if you do not need one of them, use /dev/null as your output\n"+
			"arguments:\n"+
			"\t"+"--bq  [base qual]   : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+
			"\n");

    if(argc == 1 ||
       argc < 4  ||
       (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") )
       ){
	cerr << "Usage "<<usage<<endl;
	return 1;       
    }


    for(int i=1;i<(argc-2);i++){ 

	
        if(string(argv[i]) == "--bq"){
	    minBaseQuality=destringify<int>(argv[i+1]);
            i++;
            continue;
	}

    }

    string bamfiletopen = string( argv[ argc-5 ] );
    string vcffiletopen = string( argv[ argc-4 ] );
    string chrname      = string( argv[ argc-3 ] );
    string deambam      = string( argv[ argc-2 ] );
    string nondeambam   = string( argv[ argc-1 ] );

    //dummy reader, will need to reposition anyway
    VCFreader vcfr (vcffiletopen,
 		    vcffiletopen+".tbi",
 		    chrname,
 		    1,
 		    1,
 		    0);

    BamReader reader;
    
    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM file"<< bamfiletopen << endl;
    	return 1;
    }

    // if ( !reader.LocateIndex()  ) {
    // 	cerr << "The index for the BAM file cannot be located" << endl;
    // 	return 1;
    // }

    // if ( !reader.HasIndex()  ) {
    // 	cerr << "The BAM file has not been indexed." << endl;
    // 	return 1;
    // }

    //positioning the bam file
    int refid=reader.GetReferenceID(chrname);
    if(refid < 0){
	cerr << "Cannot retrieve the reference ID for "<< chrname << endl;
	return 1;
    }
    //cout<<"redif "<<refid<<endl;	    

    //setting the BAM reader at that position
    reader.SetRegion(refid,
		     0,
		     refid,
		     -1); 	



    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writerDeam;
    if ( !writerDeam.Open(deambam,      header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamWriter writerNoDeam;
    if ( !writerNoDeam.Open(nondeambam, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }



    unsigned int totalReads      =0;
    unsigned int deaminatedReads =0;
    unsigned int ndeaminatedReads =0;
    unsigned int skipped      =0;



    //iterating over the alignments for these regions
    BamAlignment al;
    int i;

    while ( reader.GetNextAlignment(al) ) {
	// cerr<<al.Name<<endl;

	//skip unmapped
	if(!al.IsMapped()){
	    skipped++;
	    continue;
	}

	//skip paired end !
	if(al.IsPaired() ){  
	    continue;
	    // cerr<<"Paired end not yet coded"<<endl;
	    // return 1;
	}


	string reconstructedReference = reconstructRef(&al);



	char refeBase;
	char readBase;
	bool isDeaminated;
	if(al.Qualities.size() != reconstructedReference.size()){
	    cerr<<"Quality line is not the same size as the reconstructed reference"<<endl;
	    return 1;
	}

	isDeaminated=false;

	if(al.IsReverseStrand()){

	    //first base next to 3'
	    i = 0 ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    if(  readBase  == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    // cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;			
			cerr<<numberOfDeletions(&al)<<endl;			
			cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }
		    
		    //if the VCF has a at least one G but no A
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}

	    }


	    //second base next to 3'
	    i = 1;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+2,al.Position+2);

		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    // cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<"Problem2 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    //if the VCF has at least one G but no A 
		    // if(toprint->hasAtLeastOneG() &&
		    //    toprint->getAlt().find("A") == string::npos){
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}
	    }

	    //last  base next to 5'
	    i = (al.QueryBases.length()-1) ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);


		int lengthMatches=countMatchesRecons(reconstructedReference,0);		
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();

		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;
		    
		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem3 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }


		    //if the VCF has at least one G but no A
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}

	    }

	}else{

		
	    //first base next to 5'
	    i = 0;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C' 
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){ 

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;
		    
		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<numberOfDeletions(&al)<<endl;			
			cerr<<"Problem4 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    //if the VCF has at least one C but no T
		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }

		}

		//cout<<al.Position+
		 
	    }

	    //second last base next to 3'
	    i = (al.QueryBases.length()-2);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C'  &&
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  



		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		
		int lengthMatches=countMatchesRecons(reconstructedReference,1);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);

		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem5 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }
		}

		 

	    }

	    //last base next to 3'
	    i = (al.QueryBases.length()-1);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //&& refeBase  == 'C' 
	    if( readBase  == 'T'  && int(al.Qualities[i]-offset) >= minBaseQuality){  
		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		

		int lengthMatches=countMatchesRecons(reconstructedReference,0);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);

		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem6 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }
		}

	    }	

	   
	    
	}
		  



	totalReads++;

	if(isDeaminated){
	    deaminatedReads++;
	    writerDeam.SaveAlignment(al);		
	}else{
	    ndeaminatedReads++;
	    writerNoDeam.SaveAlignment(al);		
	}


    
    }//end for each read









    reader.Close();
    writerDeam.Close();
    writerNoDeam.Close();

    cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl;

   
    return 0;
}
Esempio n. 20
0
int IonstatsAlignment(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string input_bam_filename   = opts.GetFirstString('i', "input", "");
  string output_json_filename = opts.GetFirstString('o', "output", "ionstats_alignment.json");
  int histogram_length        = opts.GetFirstInt   ('h', "histogram-length", 400);

  if(argc < 2 or input_bam_filename.empty()) {
    IonstatsAlignmentHelp();
    return 1;
  }

  //
  // Prepare for metric calculation
  //

  BamReader input_bam;
  if (!input_bam.Open(input_bam_filename)) {
    fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str());
    return 1;
  }

  ReadLengthHistogram called_histogram;
  ReadLengthHistogram aligned_histogram;
  ReadLengthHistogram AQ7_histogram;
  ReadLengthHistogram AQ10_histogram;
  ReadLengthHistogram AQ17_histogram;
  ReadLengthHistogram AQ20_histogram;
  ReadLengthHistogram AQ47_histogram;
  SimpleHistogram error_by_position;

  called_histogram.Initialize(histogram_length);
  aligned_histogram.Initialize(histogram_length);
  AQ7_histogram.Initialize(histogram_length);
  AQ10_histogram.Initialize(histogram_length);
  AQ17_histogram.Initialize(histogram_length);
  AQ20_histogram.Initialize(histogram_length);
  AQ47_histogram.Initialize(histogram_length);
  error_by_position.Initialize(histogram_length);

  BamAlignment alignment;
  vector<char>  MD_op;
  vector<int>   MD_len;
  MD_op.reserve(1024);
  MD_len.reserve(1024);
  string MD_tag;

  //
  // Main loop over mapped reads in the input BAM
  //

  while(input_bam.GetNextAlignment(alignment)) {

    // Record read length
    called_histogram.Add(alignment.Length);

    if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag))
      continue;

    //
    // Step 1. Parse MD tag
    //

    MD_op.clear();
    MD_len.clear();

    for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) {

      int item_length = 0;
      if (*MD_ptr >= '0' and *MD_ptr <= '9') {    // Its a match
        MD_op.push_back('M');
        for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr)
          item_length = 10*item_length + *MD_ptr - '0';
      } else {
        if (*MD_ptr == '^') {                     // Its a deletion
          MD_ptr++;
          MD_op.push_back('D');
        } else                                    // Its a substitution
          MD_op.push_back('X');
        for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr)
          item_length++;
      }
      MD_len.push_back(item_length);
    }

    //
    // Step 2. Synchronously scan through Cigar and MD, doing error accounting
    //

    int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0;
    int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0;
    int increment = alignment.IsReverseStrand() ? -1 : 1;

    int AQ7_bases = 0;
    int AQ10_bases = 0;
    int AQ17_bases = 0;
    int AQ20_bases = 0;
    int AQ47_bases = 0;
    int num_bases = 0;
    int num_errors = 0;

    while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) {

      if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar
        cigar_idx += increment;
        continue;
      }
      if (MD_len[MD_idx] == 0) { // Try advancing MD
        MD_idx += increment;
        continue;
      }

      // Match
      if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        num_bases += advance;
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Insertion (read has a base, reference doesn't)
      } else if (alignment.CigarData[cigar_idx].Type == 'I') {
        int advance = alignment.CigarData[cigar_idx].Length;
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position.Add(num_bases);
          num_bases++;
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;

      // Deletion (reference has a base, read doesn't)
      } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position.Add(num_bases);
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Substitution
      } else if (MD_op[MD_idx] == 'X') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position.Add(num_bases);
          num_bases++;
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      } else {
        printf("ionstats alignment: Unexpected OP combination: %s Cigar=%c, MD=%c !\n",
            alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]);
        break;
      }

      if (num_errors*5 <= num_bases)    AQ7_bases = num_bases;
      if (num_errors*10 <= num_bases)   AQ10_bases = num_bases;
      if (num_errors*50 <= num_bases)   AQ17_bases = num_bases;
      if (num_errors*100 <= num_bases)  AQ20_bases = num_bases;
      if (num_errors == 0)              AQ47_bases = num_bases;
    }

    //
    // Step 3. Profit
    //

    if (num_bases >= 20)    aligned_histogram.Add(num_bases);
    if (AQ7_bases >= 20)    AQ7_histogram.Add(AQ7_bases);
    if (AQ10_bases >= 20)   AQ10_histogram.Add(AQ10_bases);
    if (AQ17_bases >= 20)   AQ17_histogram.Add(AQ17_bases);
    if (AQ20_bases >= 20)   AQ20_histogram.Add(AQ20_bases);
    if (AQ47_bases >= 20)   AQ47_histogram.Add(AQ47_bases);
  }

  input_bam.Close();


  //
  // Processing complete, generate ionstats_alignment.json
  //

  Json::Value output_json(Json::objectValue);
  output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL));
  output_json["meta"]["format_name"] = "ionstats_alignment";
  output_json["meta"]["format_version"] = "1.0";

  called_histogram.SaveToJson(output_json["full"]);
  aligned_histogram.SaveToJson(output_json["aligned"]);
  AQ7_histogram.SaveToJson(output_json["AQ7"]);
  AQ10_histogram.SaveToJson(output_json["AQ10"]);
  AQ17_histogram.SaveToJson(output_json["AQ17"]);
  AQ20_histogram.SaveToJson(output_json["AQ20"]);
  AQ47_histogram.SaveToJson(output_json["AQ47"]);
  error_by_position.SaveToJson(output_json["error_by_position"]);

  ofstream out(output_json_filename.c_str(), ios::out);
  if (out.good()) {
    out << output_json.toStyledString();
    return 0;
  } else {
    fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str());
    return 1;
  }

  return 0;
}
int main (int argc, char *argv[]) {

    int  minBaseQuality = 0;

    string usage=string(""+string(argv[0])+"  [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+
			"\nThis program divides aligned single end reads into potentially deaminated\n"+
			"\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+
			"\nThis is like filterDeaminatedVCF but it loads the VCF before then labels the reads instead of doing it on the fly\n"+
			"\nwhich is good if you have many reads in the bam file.\n"+			
			"\nTip: if you do not need one of them, use /dev/null as your output\n"+
			"\narguments:\n"+
			"\t"+"--bq    [base qual]  : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+
			"\t"+"--1000g [vcf file]   : VCF file from 1000g to get the putative A and T positions in modern humans (Default: "+vcf1000g+")\n"+
			"\n");

    if(argc == 1 ||
       argc < 4  ||
       (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") )
       ){
	cerr << "Usage "<<usage<<endl;
	return 1;       
    }


    for(int i=1;i<(argc-2);i++){ 

	
        if(string(argv[i]) == "--bq"){
	    minBaseQuality=destringify<int>(argv[i+1]);
            i++;
            continue;
	}

        if(string(argv[i]) == "--1000g"){
	    vcf1000g=string(argv[i+1]);
            i++;
            continue;
	}



    }

    unsigned int maxSizeChromosome=250000000;//larger than chr1 hg19
    bool * hasCnoT;
    bool * hasGnoA;
    bool * thousandGenomesHasA;
    bool * thousandGenomesHasT;

    cerr<<"Trying to allocating memory"<<endl;
    try{
	hasCnoT              = new bool[ maxSizeChromosome ];
	hasGnoA              = new bool[ maxSizeChromosome ];
	thousandGenomesHasA  = new bool[ maxSizeChromosome ];
	thousandGenomesHasT  = new bool[ maxSizeChromosome ];
    }catch(bad_alloc& exc){
	cerr<<"ERROR: allocating memory failed"<<endl;
	return 1;
    }
    cerr<<"Success in allocating memory"<<endl;

    for(unsigned int i = 0;i<maxSizeChromosome;i++){
	hasCnoT[i]=false;
	hasGnoA[i]=false;
	thousandGenomesHasA[i]=false;
	thousandGenomesHasT[i]=false;
    }

    string bamfiletopen = string( argv[ argc-5 ] );
    string vcffiletopen = string( argv[ argc-4 ] );
    string chrname      = string( argv[ argc-3 ] );
    string deambam      = string( argv[ argc-2 ] );
    string nondeambam   = string( argv[ argc-1 ] );

    cerr<<"Reading consensus VCF "<<vcffiletopen<<"  ... "<<endl;

    VCFreader vcfr (vcffiletopen,
 		    // vcffiletopen+".tbi",
 		    // chrname,
 		    // 1,
 		    // maxSizeChromosome,
 		    0);

    
    while(vcfr.hasData()){
	SimpleVCF * toprint=vcfr.getData();

	if(toprint->getRef().length() != 1 )
	    continue;
	

	//if the VCF has a at least one G but no A
	if(  toprint->hasAtLeastOneG() && 
	     !toprint->hasAtLeastOneA() ){
	    hasGnoA[ toprint->getPosition() ] =true;
	}

	if(  toprint->hasAtLeastOneC() && 
	     !toprint->hasAtLeastOneT() ){
	    hasCnoT[ toprint->getPosition() ] =true;    
	}

	   
    }
    cerr<<"done reading VCF"<<endl;







    cerr<<"Reading 1000g VCF :"<<vcf1000g<<"  ..."<<endl;
    string line1000g;
    ifstream myFile1000g;
    
    myFile1000g.open(vcf1000g.c_str(), ios::in);

    if (myFile1000g.is_open()){
	while ( getline (myFile1000g,line1000g)){
	    vector<string> fields=allTokens(line1000g,'\t');
	    //0 chr
	    //1 pos
	    //2 id
	    //3 ref
	    //4 alt

	    //check if same chr
	    if(fields[0] != chrname){
		cerr <<"Error, wrong chromosome in 1000g file for line=  "<<line1000g<<endl;
		return 1;
	    }

	    //skip indels
	    if(fields[3].size() != 1 ||
	       fields[4].size() != 1 )
		continue;

	    char         ref=toupper(fields[3][0]);
	    char         alt=toupper(fields[4][0]);
	    unsigned int pos=destringify<unsigned int>( fields[1] );
	    
	    thousandGenomesHasA[ pos ] = ( (ref=='A') || (alt=='A') );
	    thousandGenomesHasT[ pos ] = ( (ref=='T') || (alt=='T') );
	    
	}
	myFile1000g.close();
    }else{
	cerr <<"Unable to open file "<<vcf1000g<<endl;
	return 1;
    }



    cerr<<"done reading 1000g VCF"<<endl;











    BamReader reader;
    
    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM file"<< bamfiletopen << endl;
    	return 1;
    }


    //positioning the bam file
    int refid=reader.GetReferenceID(chrname);
    if(refid < 0){
	cerr << "Cannot retrieve the reference ID for "<< chrname << endl;
	return 1;
    }
    //cout<<"redif "<<refid<<endl;	    

    //setting the BAM reader at that position
    reader.SetRegion(refid,
		     0,
		     refid,
		     -1); 	



    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writerDeam;
    if ( !writerDeam.Open(deambam,      header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamWriter writerNoDeam;
    if ( !writerNoDeam.Open(nondeambam, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }



    unsigned int totalReads      =0;
    unsigned int deaminatedReads =0;
    unsigned int ndeaminatedReads =0;
    unsigned int skipped      =0;



    //iterating over the alignments for these regions
    BamAlignment al;
    int i;

    while ( reader.GetNextAlignment(al) ) {
	// cerr<<al.Name<<endl;

	//skip unmapped
	if(!al.IsMapped()){
	    skipped++;
	    continue;
	}

	//skip paired end !
	if(al.IsPaired() ){  
	    continue;
	    // cerr<<"Paired end not yet coded"<<endl;
	    // return 1;
	}


	string reconstructedReference = reconstructRef(&al);



	char refeBase;
	char readBase;
	bool isDeaminated;
	if(al.Qualities.size() != reconstructedReference.size()){
	    cerr<<"Quality line is not the same size as the reconstructed reference"<<endl;
	    return 1;
	}

	isDeaminated=false;

	if(al.IsReverseStrand()){

	    //first base next to 3'
	    i = 0 ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    if(  readBase  == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		if( hasGnoA[al.Position+1] && 
		    !thousandGenomesHasA[al.Position+1] )
		    isDeaminated=true; 


		// transformRef(&refeBase,&readBase);

		// vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     // cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;			
		// 	cerr<<numberOfDeletions(&al)<<endl;			
		// 	cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }
		    
		//     //if the VCF has a at least one G but no A
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }


	    //second base next to 3'
	    i = 1;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		if( hasGnoA[al.Position+2] && 
		   !thousandGenomesHasA[al.Position+2] )
		    isDeaminated=true; 


		// transformRef(&refeBase,&readBase);


		// vcfr.repositionIterator(chrname,al.Position+2,al.Position+2);

		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     // cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<"Problem2 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     //if the VCF has at least one G but no A 
		//     // if(toprint->hasAtLeastOneG() &&
		//     //    toprint->getAlt().find("A") == string::npos){
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }
	    }

	    //last  base next to 5'
	    i = (al.QueryBases.length()-1) ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);


		int lengthMatches=countMatchesRecons(reconstructedReference,0);		
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasGnoA[positionJump] && 
		   !thousandGenomesHasA[positionJump] )
		    isDeaminated=true; 


		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();

		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;
		    
		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem3 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }


		//     //if the VCF has at least one G but no A
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }

	}else{

		
	    //first base next to 5'
	    i = 0;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C' 
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){ 

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		// transformRef(&refeBase,&readBase);

		if(hasCnoT[al.Position+1]  && 
		   !thousandGenomesHasT[al.Position+1] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;
		    
		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;			
		// 	cerr<<"Problem4 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     //if the VCF has at least one C but no T
		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }

		// }

		//cout<<al.Position+
		 
	    }

	    //second last base next to 3'
	    i = (al.QueryBases.length()-2);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C'  &&
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  



		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		//transformRef(&refeBase,&readBase);		
		int lengthMatches=countMatchesRecons(reconstructedReference,1);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasCnoT[positionJump] && 
		   !thousandGenomesHasT[positionJump] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem5 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }
		// }

		 

	    }

	    //last base next to 3'
	    i = (al.QueryBases.length()-1);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //&& refeBase  == 'C' 
	    if( readBase  == 'T'  && int(al.Qualities[i]-offset) >= minBaseQuality){  
		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		

		int lengthMatches=countMatchesRecons(reconstructedReference,0);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasCnoT[positionJump] && 
		   !thousandGenomesHasT[positionJump] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem6 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }	

	   
	    
	}
		  



	totalReads++;

	if(isDeaminated){
	    deaminatedReads++;
	    writerDeam.SaveAlignment(al);		
	}else{
	    ndeaminatedReads++;
	    writerNoDeam.SaveAlignment(al);		
	}


    
    }//end for each read









    reader.Close();
    writerDeam.Close();
    writerNoDeam.Close();
    delete(hasCnoT);
    delete(hasGnoA);

    cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl;

   
    return 0;
}
Esempio n. 22
0
void TagBam::Tag() {

    // open the annotations files for processing;
    OpenAnnoFiles();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
	if (!reader.Open(_bamFile)) {
        cerr << "Failed to open BAM file " << _bamFile << endl;
        exit(1);
    }
    
    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // set compression mode
    BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
//    if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
    writer.SetCompressionMode(compressionMode);
    // open our BAM writer
    writer.Open("stdout", bamHeader, refs);

    // rip through the BAM file and test for overlaps with each annotation file.
    BamAlignment al;
    vector<BED> hits;

    while (reader.GetNextAlignment(al)) {
        if (al.IsMapped() == true) {
            BED a;
            a.chrom = refs.at(al.RefID).RefName;
            a.start = al.Position;
            a.end   = al.GetEndPosition(false, false);
            a.strand = "+";
            if (al.IsReverseStrand()) a.strand = "-";
            
            ostringstream annotations;
            // annotate the BAM file based on overlaps with the annotation files.
            for (size_t i = 0; i < _annoFiles.size(); ++i) 
            {
                // grab the current annotation file.
                BedFile *anno = _annoFiles[i];
                
                if (!_useNames && !_useScores && !_useIntervals) {
                    // add the label for this annotation file to tag if there is overlap
                    if (anno->anyHits(a.chrom, a.start, a.end, a.strand, 
                                      _sameStrand, _diffStrand, _overlapFraction, false))
                    {
                        annotations << _annoLabels[i] << ";";
                    }
                }
                // use the score field
                else if (!_useNames && _useScores && !_useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand, 0.0, false);
                    for (size_t i = 0; i < hits.size(); ++i) {
                        annotations << hits[i].score;
                        if (i < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
                // use the name field from the annotation files to populate tag
                else if (_useNames && !_useScores && !_useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand, 0.0, false);
                    for (size_t j = 0; j < hits.size(); ++j) {
                        annotations << hits[j].name;
                        if (j < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
                // use the full interval information annotation files to populate tag
                else if (!_useNames && !_useScores && _useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand,  0.0, false);
                    for (size_t j = 0; j < hits.size(); ++j) {
                        annotations << _annoLabels[i]  << ":" << 
                                        hits[j].chrom  << ":" <<
                                        hits[j].start  << "-" <<
                                        hits[j].end    << "," <<
                                        hits[j].name   << "," <<
                                        hits[j].score  << "," <<
                                        hits[j].strand;
                        if (j < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
            }
            // were there any overlaps with which to make a tag?
            if (annotations.str().size() > 0) {
                al.AddTag(_tag, "Z", annotations.str().substr(0, annotations.str().size() - 1)); // get rid of the last ";"
            }
        }
        writer.SaveAlignment(al);
    }
    reader.Close();
    writer.Close();
    // close the annotations files;
    CloseAnnoFiles();
}
Esempio n. 23
0
int main (int argc, char *argv[]) {

    string usage=string(""+string(argv[0])+"  [in BAM file]"+
			"\nThis program reads a BAM file and computes the error rate for each cycle\n"+
			// "\nreads and the puts the rest into another bam file.\n"+
			// "\nTip: if you do not need one of them, use /dev/null as your output\n"+
			// "arguments:\n"+
			// "\t"+"--bq  [base qual]   : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+
			"\n");

    if(argc == 1 ||
       (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") )
       ){
	cerr << "Usage "<<usage<<endl;
	return 1;       
    }


    // for(int i=1;i<(argc-2);i++){ 

	
    //     if(string(argv[i]) == "--bq"){
    // 	    minBaseQuality=destringify<int>(argv[i+1]);
    //         i++;
    //         continue;
    // 	}

    // }

    string bamfiletopen = string( argv[ argc-1 ] );
    // string deambam      = string( argv[ argc-2 ] );
    // string nondeambam   = string( argv[ argc-1 ] );


    BamReader reader;
    
    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM file"<< bamfiletopen << endl;
    	return 1;
    }








    //iterating over the alignments for these regions
    BamAlignment al;
    bool pairedEnd=false;
    bool firstRead=true;

    while ( reader.GetNextAlignment(al) ) {

	if(firstRead){ //reads are either all paired end or single end, I don't allow a mix
	    numberOfCycles=al.QueryBases.size();
	    // cout<<"numberOfCycles "<<numberOfCycles<<endl;
	    if(al.IsPaired() ){  
		pairedEnd=true;		
		matches    = vector<unsigned int> (2*numberOfCycles,0);
		mismatches = vector<unsigned int> (2*numberOfCycles,0);
		typesOfMismatches = vector< vector<unsigned int> >();
		for(int i=0;i<12;i++)
		    typesOfMismatches.push_back( vector<unsigned int> (2*numberOfCycles,0) );
	    }else{
		matches    = vector<unsigned int> (  numberOfCycles,0);
		mismatches = vector<unsigned int> (  numberOfCycles,0);
		typesOfMismatches = vector< vector<unsigned int> >();
		for(int i=0;i<12;i++)
		    typesOfMismatches.push_back( vector<unsigned int> (  numberOfCycles,0) );
	    }
	    firstRead=false;
	}

	if( (  pairedEnd && !al.IsPaired()) ||  
	    ( !pairedEnd &&  al.IsPaired())   ){
	    cerr<<"Read "<<al.Name<<" is wrong, cannot have a mixture of paired and unpaired read for this program"<<endl;
	    return 1;
	}


	//skip unmapped
	if(!al.IsMapped())
	    continue;

	if(numberOfCycles!=int(al.QueryBases.size())){
	    cerr<<"The length of read "<<al.Name<<" is wrong, should be "<<numberOfCycles<<"bp"<<endl;
	    return 1;
	}


	string reconstructedReference = reconstructRef(&al);
	if(al.Qualities.size() != reconstructedReference.size()){
	    cerr<<"Quality line is not the same size as the reconstructed reference"<<endl;
	    return 1;
	}


	if( pairedEnd ){
	    if( al.IsFirstMate() ){ //start cycle 0

		if( al.IsReverseStrand()  ){ 
		    increaseCounters(al,reconstructedReference,numberOfCycles-1,-1); //start cycle numberOfCycles-1
		}else{
		    increaseCounters(al,reconstructedReference,0               , 1); //start cycle 0
		}
       
	    }else{

		if( al.IsSecondMate()  ){ 
		    if( al.IsReverseStrand()  ){ 
			increaseCounters(al,reconstructedReference,2*numberOfCycles-1,-1); //start cycle 2*numberOfCycles-1
		    }else{
			increaseCounters(al,reconstructedReference,numberOfCycles    , 1); //start cycle numberOfCycles
		    }
		}else{
	    	    cerr<<"Reads "<<al.Name<<" must be either first or second mate"<<endl;
	    	    return 1;
	    	}
	    }	
	}else{ //single end 

	    if( al.IsReverseStrand()  ){ 
		increaseCounters(al,reconstructedReference,numberOfCycles-1,-1); //start cycle numberOfCycles-1
	    }else{
		increaseCounters(al,reconstructedReference,0               , 1); //start cycle 0
	    }
	    
	}


    }//end while  each read
	


    reader.Close();

    cout<<"cycle\tmatches\tmismatches\tmismatches%\tA>C\tA>C%\tA>G\tA>G%\tA>T\tA>T%\tC>A\tC>A%\tC>G\tC>G%\tC>T\tC>T%\tG>A\tG>A%\tG>C\tG>C%\tG>T\tG>T%\tT>A\tT>A%\tT>C\tT>C%\tT>G\tT>G%"<<endl;



    for(unsigned int i=0;i<matches.size();i++){
cout<<(i+1);
	if( (matches[i]+mismatches[i]!=0) )
	    cout<<"\t"<<matches[i]<<"\t"<<mismatches[i]<<"\t"<< 100.0*(double(mismatches[i])/double(matches[i]+mismatches[i])) ;
	else
	    cout<<"\t"<<matches[i]<<"\t"<<mismatches[i]<<"\tNA";

	for(int j=0;j<12;j++){   
	    cout<<"\t"<<typesOfMismatches[j][i];
	    if( (matches[i]+mismatches[i]!=0) )
		cout<<"\t"<<100.0*double(typesOfMismatches[j][i])/double(matches[i]+mismatches[i]);
	    else
		cout<<"\tNA";
	}

	cout<<endl;
    }




   
    return 0;
}
int main_asequantmultibam(const vector<string> &all_args)
{
    Init(all_args);
    for (int bam_files_index = 0; bam_files_index < bam_files.size(); bam_files_index ++)
    {
        BamReader bam_reader;
        bam_file = bam_files[bam_files_index];

        cerr << "* Reading bam file " << endl;
        OpenBam(bam_reader, bam_file);
        bam_reader.OpenIndex(bam_file + ".bai");

        vector<RefData> chroms = bam_reader.GetReferenceData();

        StlFor(chrom_idx, chroms)
        {
            string &chrom = chroms[chrom_idx].RefName;
            map<string, vector<Snp> >::iterator snps_ptr = snps_by_chrom.find(chrom);

            cerr << "* On chrom " << chrom << endl;

            int s = 0; // Index into snp array

            BamAlignment bam;
            bam_reader.Jump(chrom_idx);

            string align;
            string qualities;

            while (bam_reader.GetNextAlignment(bam) && bam.RefID == chrom_idx)
            {
                if (bam.MapQuality < min_map_qual)
                    continue;

                int start = AlignStart(bam);
                int end = AlignEnd(bam);

                // Move the current SNP pointer so that it is ahead of the read's start (since bam alignments are in sorted order)
                while (s < snps_ptr->second.size() && snps_ptr->second[s].pos < start)
                    ++s;

                // Stop everything if we have visited all SNPs on this chrom
                if (s >= snps_ptr->second.size())
                    break;

                // Find any/all SNPs that are within the bam alignment
                int n = 0; // Number of SNPs overlapped
                while ((s + n) < snps_ptr->second.size() && snps_ptr->second[s + n].pos < end) // Then it overlaps!
                    ++n;

                // Now, look at each SNP and see which way it votes
                AlignedString(bam, align);
                AlignedQualities(bam, qualities);
                Assert(align.size() == qualities.size());

                // Now, tally votes
                for (int i = 0; i < n; ++i)
                {
                    Snp &snp = snps_ptr->second[s + i];
                    char base = align[snp.pos - start]; // Base from the read
                    int qual = int(qualities[snp.pos - start]) - ascii_offset; // Base from the read

                    //AssertMsg(qual >= 0 && qual <= 100, ToStr(qual) + "\n" + bam.Name + "\n" + CigarToStr(bam.CigarData) + "\n" + bam.QueryBases + "\n" + bam.Qualities);

                    if (base == '-' || qual < min_base_qual)
                        continue;

                    Counts &counts = bam.IsReverseStrand() ? snp.rev[bam_files_index] : snp.fwd[bam_files_index];

                    if (base == snp.ref)
                    {
                        counts.num_ref += 1;
                    }
                    else if (base == snp.alt)
                    {
                        counts.num_alt += 1;
                    }
                    else
                    {
                        counts.num_other += 1;
                    }
                }
            }
        }
    }
Esempio n. 25
0
 bool check(const PropertyFilter& filter, const BamAlignment& al) {
   
     bool keepAlignment = true;
     const PropertyMap& properties = filter.Properties;
     PropertyMap::const_iterator propertyIter = properties.begin();
     PropertyMap::const_iterator propertyEnd  = properties.end();
     for ( ; propertyIter != propertyEnd; ++propertyIter ) {
       
         // check alignment data field depending on propertyName
         const string& propertyName = (*propertyIter).first;
         const PropertyFilterValue& valueFilter = (*propertyIter).second;
         
         if      ( propertyName == ALIGNMENTFLAG_PROPERTY )  keepAlignment &= valueFilter.check(al.AlignmentFlag);
         else if ( propertyName == CIGAR_PROPERTY ) {
             stringstream cigarSs;
             const vector<CigarOp>& cigarData = al.CigarData;
             if ( !cigarData.empty() ) {
                 vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
                 vector<CigarOp>::const_iterator cigarIter = cigarBegin;
                 vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
                 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
                     const CigarOp& op = (*cigarIter);
                     cigarSs << op.Length << op.Type;
                 }
                 keepAlignment &= valueFilter.check(cigarSs.str());
             }
         }
         else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= valueFilter.check(al.InsertSize);
         else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsDuplicate());
         else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= valueFilter.check(al.IsFailedQC());
         else if ( propertyName == ISFIRSTMATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsFirstMate());
         else if ( propertyName == ISMAPPED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsMapped());
         else if ( propertyName == ISMATEMAPPED_PROPERTY )         keepAlignment &= valueFilter.check(al.IsMateMapped());
         else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )  keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
         else if ( propertyName == ISPAIRED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsPaired());
         else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )   keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
         else if ( propertyName == ISPROPERPAIR_PROPERTY )         keepAlignment &= valueFilter.check(al.IsProperPair());
         else if ( propertyName == ISREVERSESTRAND_PROPERTY )      keepAlignment &= valueFilter.check(al.IsReverseStrand());
         else if ( propertyName == ISSECONDMATE_PROPERTY )         keepAlignment &= valueFilter.check(al.IsSecondMate());
         else if ( propertyName == ISSINGLETON_PROPERTY ) {
             const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped();
             keepAlignment &= valueFilter.check(isSingleton);
         }
         else if ( propertyName == MAPQUALITY_PROPERTY )           keepAlignment &= valueFilter.check(al.MapQuality);
         else if ( propertyName == MATEPOSITION_PROPERTY )         keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) );
         else if ( propertyName == MATEREFERENCE_PROPERTY ) {
             if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
             BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID");
             const string& refName = filterToolReferences.at(al.MateRefID).RefName;
             keepAlignment &= valueFilter.check(refName);
         }
         else if ( propertyName == NAME_PROPERTY )                 keepAlignment &= valueFilter.check(al.Name);
         else if ( propertyName == POSITION_PROPERTY )             keepAlignment &= valueFilter.check(al.Position);
         else if ( propertyName == QUERYBASES_PROPERTY )           keepAlignment &= valueFilter.check(al.QueryBases);
         else if ( propertyName == REFERENCE_PROPERTY ) {
             BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID");
             const string& refName = filterToolReferences.at(al.RefID).RefName;
             keepAlignment &= valueFilter.check(refName);
         }
         else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al);
         else BAMTOOLS_ASSERT_UNREACHABLE;
         
         // if alignment fails at ANY point, just quit and return false
         if ( !keepAlignment ) return false;
     }
   
     BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
     return keepAlignment;
 }
int main_aseregion(const vector<string> &all_args)
{
    Init(all_args);
    
    cerr << "* Reading bam file " << endl;
    OpenBam(bam_reader, bam_file);
    bam_reader.OpenIndex(bam_file + ".bai");
    
    vector<string> readGroupVector; //Obtain all the readgroups.
    SamHeader header = bam_reader.GetHeader();
    SamReadGroupDictionary headerRG = header.ReadGroups;
    for (SamReadGroupIterator it = headerRG.Begin(); it != headerRG.End(); it ++)
    {
        readGroupVector.push_back(it -> ID);
    }
    
    cout << "#CHROM" << "\t" << "StartPos" << "\t" << "EndPos";
    for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++)
    {
        cout << "\t" << *it;
    }
    cout << endl;
    
    vector<RefData> chroms = bam_reader.GetReferenceData();
    
    StlFor(chrom_idx, chroms)
    {
        string &chrom = chroms[chrom_idx].RefName;
        cerr << "* On chrom " << chrom << endl;
        
        map<string, vector<GenomicRegion> >::iterator searchIt = chrom_genomicRegions.find(chrom);
        
        BamAlignment startPointer; // This pointer will point to the region immediately before the start of current regions under inspection.
        bam_reader.Jump(chrom_idx);
        if (!bam_reader.GetNextAlignment(startPointer))
            break;
        
        int count = 0;
        // For each region, walk through all the reads correspoinding to this region and count the reads.
        for (vector<GenomicRegion>::iterator it = searchIt -> second.begin(); it != searchIt -> second.end(); ++it)
        {
            bam_reader.Jump(chrom_idx, startPointer.Position); // Fix the reading pointer.
            if (!bam_reader.GetNextAlignment(startPointer))
                break;
            int flag = 0;
            while (true)
            {
                int startEnd = startPointer.GetEndPosition();
                if (startEnd < it -> start)
                {
                    if (!bam_reader.GetNextAlignment(startPointer))
                    {
                        flag = 1;
                        break;
                    }
                }
                else
                {
                    break;
                }
            }
            
            if (flag == 1)
            {
                break;
            }
            // Now startPointer assumes its rightful position.
            BamAlignment nextPointer = startPointer; //This pointer traverse through all reads that align to the current genomic region in bed file and the iteration ends when this pointer pass through the end of the region.
            
            while (true)
            {
                int nextStart = nextPointer.Position;
                if (nextStart > it -> end)
                {
                    break; // This iteration is done.
                }
                
                if (nextPointer.MapQuality < min_map_qual)
                {
                    if (!bam_reader.GetNextAlignment(nextPointer))
                    {
                        break;
                    }
                    continue;
                }
                
                string currentRG;
                Assert(nextPointer.GetReadGroup(currentRG));
                
                map<string, int> &RG_counts = nextPointer.IsReverseStrand() ? it -> revCounts : it -> fwdCounts;
                map<string, int>::iterator searchItForRG = RG_counts.find(currentRG);
                if (searchItForRG == RG_counts.end())
                {
                    RG_counts[currentRG] = 1;
                }
                else
                {
                    ++ RG_counts[currentRG];
                }
                if (!bam_reader.GetNextAlignment(nextPointer))
                {
                    break;
                }
            }
            count ++;
            if (count % 1000 == 0)
                cerr << "Processed" << "\t" << count << endl;
        }
        
        // Output the counts
        for (vector<GenomicRegion>::iterator it = searchIt -> second.begin(); it != searchIt -> second.end(); ++it)
        {
            cout << chrom << "\t" << it -> start << "\t" << it -> end;
            for (vector<string>::iterator subIt = readGroupVector.begin(); subIt != readGroupVector.end(); ++subIt)
            {
                map<string, int>::iterator searchItForRG = it -> fwdCounts.find(*subIt);
                if (searchItForRG != it -> fwdCounts.end())
                {
                    cout << "\t" << searchItForRG -> second << ",";
                }
                else
                {
                    cout << "\t" << "0,";
                }
                searchItForRG = it -> revCounts.find(*subIt);
                if (searchItForRG != it -> revCounts.end())
                {
                    cout << searchItForRG -> second;
                }
                else
                {
                    cout << "0";
                }
            }
            cout << endl;
        }
    }
Esempio n. 27
0
	virtual void main()
	{
		//init
		QTextStream out(stdout);
		BamReader reader;
		NGSHelper::openBAM(reader, getInfile("in"));

		FastqOutfileStream out1(getOutfile("out1"), false);
		FastqOutfileStream out2(getOutfile("out2"), false);

		long long c_unpaired = 0;
		long long c_paired = 0;
		int max_cached = 0;

		//iterate through reads
		BamAlignment al;
		QHash<QByteArray, BamAlignment> al_cache;
		while (reader.GetNextAlignment(al))
		{
			//skip secondary alinments
			if(!al.IsPrimaryAlignment()) continue;

			//skip unpaired
			if(!al.IsPaired())
			{
				++c_unpaired;
				continue;
			}

			QByteArray name(al.Name.data()); //TODO use QByteArray::fromStdString (when upgraded to Qt5.4)

			//store cached read when we encounter the mate
			if (al_cache.contains(name))
			{
				BamAlignment mate = al_cache.take(name);
				//out << name << " [AL] First: " << al.IsFirstMate() << " Reverse: " << al.IsReverseStrand() << " Seq: " << al.QueryBases.data() << endl;
				//out << name << " [MA] First: " << mate.IsFirstMate() << " Reverse: " << mate.IsReverseStrand() << " Seq: " << mate.QueryBases.data() << endl;
				if (al.IsFirstMate())
				{
					write(out1, al, al.IsReverseStrand());
					write(out2, mate, mate.IsReverseStrand());
				}
				else
				{
					write(out1, mate, mate.IsReverseStrand());
					write(out2, al, al.IsReverseStrand());
				}
				++c_paired;
			}
			//cache read for later retrieval
			else
			{
				al_cache.insert(name, al);
			}

			max_cached = std::max(max_cached, al_cache.size());
		}
		reader.Close();
		out1.close();
		out2.close();

		//write debug output
		out << "Pair reads (written)            : " << c_paired << endl;
		out << "Unpaired reads (skipped)        : " << c_unpaired << endl;
		out << "Unmatched paired reads (skipped): " << al_cache.size() << endl;
		out << endl;
		out << "Maximum cached reads            : " << max_cached << endl;
	}
Esempio n. 28
0
int main (int argc, const char *argv[])
{
  printf ("------------- bamrealignment --------------\n");

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  vector<int> score_vals(4);

  string input_bam  = opts.GetFirstString  ('i', "input", "");
  string output_bam = opts.GetFirstString  ('o', "output", "");
  opts.GetOption(score_vals, "4,-6,-5,-2", 's', "scores");
  int    clipping   = opts.GetFirstInt     ('c', "clipping", 2);
  bool   anchors    = opts.GetFirstBoolean ('a', "anchors", true);
  int    bandwidth  = opts.GetFirstInt     ('b', "bandwidth", 10);
  bool   verbose    = opts.GetFirstBoolean ('v', "verbose", false);
  bool   debug      = opts.GetFirstBoolean ('d', "debug", false);
  int    format     = opts.GetFirstInt     ('f', "format", 1);
  int  num_threads  = opts.GetFirstInt     ('t', "threads", 8);
  string log_fname  = opts.GetFirstString  ('l', "log", "");
  

  if (input_bam.empty() or output_bam.empty())
    return PrintHelp();

  opts.CheckNoLeftovers();

  std::ofstream logf;
  if (log_fname.size ())
  {
    logf.open (log_fname.c_str ());
    if (!logf.is_open ())
    {
      fprintf (stderr, "bamrealignment: Failed to open log file %s\n", log_fname.c_str());
      return 1;
    }
  }

  BamReader reader;
  if (!reader.Open(input_bam)) {
    fprintf(stderr, "bamrealignment: Failed to open input file %s\n", input_bam.c_str());
    return 1;
  }

  SamHeader header = reader.GetHeader();
  RefVector refs   = reader.GetReferenceData();

  BamWriter writer;
  writer.SetNumThreads(num_threads);
  if (format == 1)
    writer.SetCompressionMode(BamWriter::Uncompressed);
  else
    writer.SetCompressionMode(BamWriter::Compressed);

  if (!writer.Open(output_bam, header, refs)) {
    fprintf(stderr, "bamrealignment: Failed to open output file %s\n", output_bam.c_str());
    return 1;
  }


  // The meat starts here ------------------------------------

  if (verbose)
    cout << "Verbose option is activated, each alignment will print to screen." << endl
         << "  After a read hit RETURN to continue to the next one," << endl
         << "  or press q RETURN to quit the program," << endl
         << "  or press s Return to silence verbose," << endl
         << "  or press c RETURN to continue printing without further prompt." << endl << endl;

  unsigned int readcounter = 0;
  unsigned int mapped_readcounter = 0;
  unsigned int realigned_readcounter = 0;
  unsigned int modified_alignment_readcounter = 0;
  unsigned int pos_update_readcounter = 0;
  unsigned int failed_clip_realigned_readcount = 0;
  
  unsigned int already_perfect_readcount = 0;
  
  unsigned int bad_md_tag_readcount = 0;
  unsigned int error_recreate_ref_readcount = 0;
  unsigned int error_clip_anchor_readcount = 0;
  unsigned int error_sw_readcount = 0;
  unsigned int error_unclip_readcount = 0;
  
  unsigned int start_position_shift;
  int orig_position;
  int new_position;

  string  md_tag, new_md_tag, input = "x";
  vector<CigarOp>    new_cigar_data;
  vector<MDelement>  new_md_data;
  bool position_shift = false;
  time_t start_time = time(NULL);

  Realigner aligner;
  aligner.verbose_ = verbose;
  aligner.debug_   = debug;
  if (!aligner.SetScores(score_vals))
    cout << "bamrealignment: Four scores need to be provided: match, mismatch, gap open, gap extend score!" << endl;

  aligner.SetAlignmentBandwidth(bandwidth);

  BamAlignment alignment;
  while(reader.GetNextAlignment(alignment)){
    readcounter ++;
    position_shift = false;
    
    if ( (readcounter % 100000) == 0 )
       cout << "Processed " << readcounter << " reads. Elapsed time: " << (time(NULL) - start_time) << endl;

    if (alignment.IsMapped()) {
      
      
      
      orig_position = alignment.Position;
      mapped_readcounter++;
      aligner.SetClipping(clipping, !alignment.IsReverseStrand());
      if (aligner.verbose_) {
    	cout << endl;
        if (alignment.IsReverseStrand())
          cout << "The read is from the reverse strand." << endl;
        else
          cout << "The read is from the forward strand." << endl;
      }

      if (!alignment.GetTag("MD", md_tag)) {
    	if (aligner.verbose_)
          cout << "Warning: Skipping read " << alignment.Name << ". It is mapped but missing MD tag." << endl;
	if (logf.is_open ())
	  logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MISSMD" << '\n';
	bad_md_tag_readcount++;
      } else if (aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors)) {
	bool clipfail = false;
	if (Realigner::CR_ERR_CLIP_ANCHOR == aligner.GetCreateRefError ())
	{
	  clipfail = true;
	  failed_clip_realigned_readcount ++;
	}

        if (!aligner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) {
          if (aligner.verbose_)
            cout << "Error in the alignment! Not updating read information." << endl;
	  if (logf.is_open ())
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "SWERR" << '\n';
	  error_sw_readcount++;
          writer.SaveAlignment(alignment);  // Write alignment unchanged
          continue;
        }

        if (!aligner.addClippedBasesToTags(new_cigar_data, new_md_data, alignment.QueryBases.size())) {
          if (aligner.verbose_)
            cout << "Error when adding clipped anchors back to tags! Not updating read information." << endl;
	  if (logf.is_open ())
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNCLIPERR" << '\n';
          writer.SaveAlignment(alignment);  // Write alignment unchanged
	  error_unclip_readcount ++;
          continue;
        }
        new_md_tag = aligner.GetMDstring(new_md_data);
        realigned_readcounter++;

        // adjust start position of read
        if (!aligner.LeftAnchorClipped() and start_position_shift != 0) {
          new_position = aligner.updateReadPosition(alignment.CigarData, (int)start_position_shift, alignment.Position);
          if (new_position != alignment.Position) {
            pos_update_readcounter++;
            position_shift = true;
            alignment.Position = new_position;
          }
        }
        
        if (position_shift || alignment.CigarData.size () != new_cigar_data.size () || md_tag != new_md_tag)
	{
	  if (logf.is_open ())
	  {
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MOD";
	    if (position_shift)
	      logf << "-SHIFT";
	    if (clipfail)
	      logf << " NOCLIP";
	    logf << '\n';
	  }
	  modified_alignment_readcounter++;
	}
	else
	{
            if (logf.is_open ())
	    {
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNMOD";
              if (clipfail)
	        logf << " NOCLIP";
	      logf << '\n';
	    }
	}

        if (aligner.verbose_){
          cout << alignment.Name << endl;
          cout << "------------------------------------------" << endl;
          // Wait for input to continue or quit program
          if (input.size() == 0)
            input = 'x';
          else if (input[0] != 'c' and input[0] != 'C')
            getline(cin, input);
          if (input.size()>0){
            if (input[0] == 'q' or input[0] == 'Q')
              return 1;
            else if (input[0] == 's' or input[0] == 'S')
              aligner.verbose_ = false;
          }
        }

        // Finally update alignment information
        alignment.CigarData = new_cigar_data;
        alignment.EditTag("MD", "Z" , new_md_tag);

      } // end of CreateRef else if
      else {
	switch (aligner.GetCreateRefError ())
	{
	  case Realigner::CR_ERR_RECREATE_REF:
            if (logf.is_open ())
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "RECRERR" << '\n';
	    error_recreate_ref_readcount++;
	    break;
	  case Realigner::CR_ERR_CLIP_ANCHOR:
            if (logf.is_open ())
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "CLIPERR" << '\n';
	    error_clip_anchor_readcount++;
	    break;
	  default:
		  //  On a good run this writes way too many reads to the log file - don't want to create a too large txt file
          //  if (logf.is_open ())
	      //logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "PERFECT" << '\n';
	    already_perfect_readcount++;
	    break;
	}
	
	if (aligner.verbose_) {
	  cout << alignment.Name << endl;
	  cout << "------------------------------------------" << endl;
	  // Wait for input to continue or quit program
	  if (input.size() == 0)
	    input = 'x';
	  else if (input[0] != 'c' and input[0] != 'C')
	    getline(cin, input);
	  if (input.size()>0){
	    if (input[0] == 'q' or input[0] == 'Q')
	      return 1;
	    else if (input[0] == 's' or input[0] == 'S')
	      aligner.verbose_ = false;
	  }
	}
      }

      // --- Debug output for Rajesh ---
      if (debug && aligner.invalid_cigar_in_input) {
        aligner.verbose_ = true;
        cout << "Invalid cigar string / md tag pair in read " << alignment.Name << endl;
        // Rerun reference generation to display error
        aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors);

        aligner.verbose_ = verbose;
        aligner.invalid_cigar_in_input = false;
      }
      // --- --- ---


    } // end of if isMapped

    writer.SaveAlignment(alignment);

  } // end while loop over reads

  if (aligner.invalid_cigar_in_input)
    cerr << "WARNING bamrealignment: There were invalid cigar string / md tag pairs in the input bam file." << endl;

  // ----------------------------------------------------------------
  // program end -- output summary information
  cout   << "                            File: " << input_bam    << endl
         << "                     Total reads: " << readcounter  << endl
         << "                    Mapped reads: " << mapped_readcounter << endl;
  if (bad_md_tag_readcount)
    cout << "            Skipped: bad MD tags: " << bad_md_tag_readcount << endl;
  if (error_recreate_ref_readcount)
    cout << " Skipped: unable to recreate ref: " << error_recreate_ref_readcount << endl;
  if (error_clip_anchor_readcount)
    cout << "  Skipped: error clipping anchor: " << error_clip_anchor_readcount << endl;
  cout  <<  "       Skipped:  already perfect: " << already_perfect_readcount << endl
        <<  "           Total reads realigned: " << mapped_readcounter - already_perfect_readcount - bad_md_tag_readcount - error_recreate_ref_readcount - error_clip_anchor_readcount << endl;
  if (failed_clip_realigned_readcount)
    cout << "                      (including  " << failed_clip_realigned_readcount << " that failed to clip)" << endl;
  if (error_sw_readcount)
    cout << " Failed to complete SW alignment: " << error_sw_readcount << endl;
  if (error_unclip_readcount)
    cout << "         Failed to unclip anchor: " << error_unclip_readcount << endl;
  cout   << "           Succesfully realigned: " << realigned_readcounter << endl
         << "             Modified alignments: " << modified_alignment_readcounter << endl
         << "                Shifted position: " << pos_update_readcounter << endl;
  
  cout << "Processing time: " << (time(NULL)-start_time) << " seconds." << endl;
  cout << "INFO: The output BAM file may be unsorted." << endl;
  cout << "------------------------------------------" << endl;
  return 0;
}
Esempio n. 29
0
void BedWindow::WindowIntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);

    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs    = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;                   // vector of potential hits
    // reserve some space
    hits.reserve(100);

    _bedA->bedType = 6;
    BamAlignment bam;
    bool overlapsFound;
    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        if (bam.IsMapped()) {
            BED a;
            a.chrom = refs.at(bam.RefID).RefName;
            a.start = bam.Position;
            a.end   = bam.GetEndPosition(false, false);

            // build the name field from the BAM alignment.
            a.name = bam.Name;
            if (bam.IsFirstMate()) a.name += "/1";
            if (bam.IsSecondMate()) a.name += "/2";

            a.score  = ToString(bam.MapQuality);
            a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-";

            if (_bamOutput == true) {
                overlapsFound = FindOneOrMoreWindowOverlaps(a);
                if (overlapsFound == true) {
                    if (_noHit == false)
                        writer.SaveAlignment(bam);
                }
                else {
                    if (_noHit == true)
                        writer.SaveAlignment(bam);
                }
            }
            else {
                FindWindowOverlaps(a, hits);
                hits.clear();
            }
        }
        // BAM IsMapped() is false
        else if (_noHit == true) {
            writer.SaveAlignment(bam);
        }
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
Esempio n. 30
0
int main (int argc, char *argv[]) {

    bool mapped  =false;
    bool unmapped=false;

    const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+
			      "This program takes a BAM file as input and produces\n"+
			      "another where the putative deaminated bases have\n"+
			      "have been cut\n"+
			      "\n"+
			      "Options:\n");
			      // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+
			      // "\t"+"-m , --mapped"   +"\n\t\t"+"For an mapped bam file"+"\n");
			      
			      

    if( (argc== 1) ||
	(argc== 2 && string(argv[1]) == "-h") ||
	(argc== 2 && string(argv[1]) == "-help") ||
	(argc== 2 && string(argv[1]) == "--help") ){
	cout<<"Usage:"<<endl;
	cout<<usage<<endl;
	cout<<""<<endl;
	return 1;
    }

    // for(int i=1;i<(argc-1);i++){ //all but the last arg

    // 	if(strcmp(argv[i],"-m") == 0 || strcmp(argv[i],"--mapped") == 0 ){
    // 	    mapped=true;
    // 	    continue;
    // 	}

    // 	if(strcmp(argv[i],"-u") == 0 || strcmp(argv[i],"--unmapped") == 0 ){
    // 	    unmapped=true;
    // 	    continue;
    // 	}
       
    // 	cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl;
    // 	return 1;
    // }

    if(argc != 3){
	cerr<<"Error: Must specify the input and output BAM files";
	return 1;
    }

    string inbamFile =argv[argc-2];
    string outbamFile=argv[argc-1];

    // if(!mapped && !unmapped){
    // 	cerr << "Please specify whether you reads are mapped or unmapped" << endl;
    // 	return 1;
    // }
    // if(mapped && unmapped){
    // 	cerr << "Please specify either mapped or unmapped but not both" << endl;
    // 	return 1;
    // }

    BamReader reader;

    if ( !reader.Open(inbamFile) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
    }


    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writer;
    if ( !writer.Open(outbamFile, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamAlignment al;
    // BamAlignment al2;
    // bool al2Null=true;
    
    while ( reader.GetNextAlignment(al) ) {

	    if(al.IsPaired() ){  

		if(al.IsFirstMate() ){ //5' end, need to check first base only
		    if(al.IsReverseStrand()){ //
			if(!al.IsMapped()){
			    cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
			    //return 1;
			}
			int indexToCheck;
			//last
			indexToCheck=al.QueryBases.length()-1;
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);			 
			    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			    al.Qualities  = al.Qualities.substr(0, indexToCheck);
			}
		    }else{
			int indexToCheck;
			//first base
			indexToCheck=0;
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			    al.Qualities  = al.Qualities.substr(indexToCheck+1);
			}
		    }
		}else{ //3' end, need to check last two bases only
		    if( al.IsSecondMate() ){
			if(al.IsReverseStrand()){ //
			    if(!al.IsMapped()){
				cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
				//return 1;
			    }
			    int indexToCheck;

			    //second to last
			    indexToCheck=al.QueryBases.length()-2;
			    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				//al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				al.QueryBases = al.QueryBases.substr(0,indexToCheck);
				al.Qualities  = al.Qualities.substr(0, indexToCheck);
			    }else{
				//last
				indexToCheck=al.QueryBases.length()-1;
				if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
				    al.Qualities  = al.Qualities.substr(0, indexToCheck);
				}
			    }
			}else{
			    int indexToCheck;
			    //second base
			    indexToCheck=1;
			    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				//al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				al.QueryBases = al.QueryBases.substr(indexToCheck+1);
				al.Qualities  = al.Qualities.substr(indexToCheck+1);
			    }else{
				//first base
				indexToCheck=0;
				if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
				    al.Qualities  = al.Qualities.substr(indexToCheck+1);
				}
			    }

			}
		    }else{
			cerr << "Wrong state" << endl;
			return 1;
		    }
		}

	    }//end of paired end
	    else{//we consider single reads to have been sequenced from 5' to 3'

		if(al.IsReverseStrand()){ //need to consider 
		    if(!al.IsMapped()){
			cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
			//return 1;
		    }

		    int indexToCheck;



		    //second base
		    indexToCheck=1;
		    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"51 "<<al.QueryBases<<endl;
			// cout<<"51 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			al.Qualities  = al.Qualities.substr(indexToCheck+1);
			// cout<<"52 "<<al.QueryBases<<endl;
			// cout<<"52 "<<al.Qualities<<endl;
		    }else{
			//first base
			indexToCheck=0;
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    // al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    // cout<<"61 "<<al.QueryBases<<endl;
			    // cout<<"61 "<<al.Qualities<<endl;
			    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			    al.Qualities  = al.Qualities.substr(indexToCheck+1);
			    // cout<<"62 "<<al.QueryBases<<endl;
			    // cout<<"62 "<<al.Qualities<<endl;
			}

		    }


		    //last
		    indexToCheck=al.QueryBases.length()-1;
		    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"21 "<<al.QueryBases<<endl;
			// cout<<"21 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			al.Qualities  = al.Qualities.substr(0, indexToCheck);
			// cout<<"22 "<<al.QueryBases<<endl;
			// cout<<"22 "<<al.Qualities<<endl;
		    }
		}else{

		    int indexToCheck;
		    //first base
		    indexToCheck=0;
		    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);

			// cout<<"11 "<<al.QueryBases<<endl;
			// cout<<"11 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			al.Qualities  = al.Qualities.substr(indexToCheck+1);
			// cout<<"12 "<<al.QueryBases<<endl;
			// cout<<"12 "<<al.Qualities<<endl;

		    }

		    //second to last
		    indexToCheck=al.QueryBases.length()-2;
		    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"31 "<<al.QueryBases<<endl;
			// cout<<"31 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			al.Qualities  = al.Qualities.substr(0, indexToCheck);
			// cout<<"32 "<<al.QueryBases<<endl;
			// cout<<"32 "<<al.Qualities<<endl;
		    }else{

			//last
			indexToCheck=al.QueryBases.length()-1;
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    // al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    // cout<<"41 "<<al.QueryBases<<endl;
			    // cout<<"41 "<<al.Qualities<<endl;
			    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			    al.Qualities  = al.Qualities.substr(0, indexToCheck);
			    // cout<<"42 "<<al.QueryBases<<endl;
			    // cout<<"42 "<<al.Qualities<<endl;
			}

		    }
		}
	    }//end of single end

	    writer.SaveAlignment(al);		


    }//    while ( reader.GetNextAlignment(al) ) {

    reader.Close();
    writer.Close();
   
    return 0;
}