Exemple #1
0
/*
 * Set up a read group from the first alignment seen for it.
 *
 * The group name is read from the alignment; "none" is used when the
 * alignment reports no read group.  If the name is not listed in
 * `blacklist`, the two outputs <prefix>/<name>_1.fq.gz and
 * <prefix>/<name>_2.fq.gz are opened.  The alignment is then handed to
 * witness().
 */
ReadGroup::ReadGroup(BamAlignment &al, int max_isize, int isize_samples,
	string prefix, list<string> blacklist) :
	max_isize(max_isize),
	isize_samples(isize_samples),
	prefix(prefix),
	blacklisted(false)
{
	nreads = 0;

	if (!al.GetReadGroup(name))
		name = "none";

	/* Walk the blacklist until this group's name turns up */
	list<string>::iterator entry = blacklist.begin();
	while (!blacklisted && entry != blacklist.end()) {
		if (*entry == name)
			blacklisted = true;
		++entry;
	}

	/* Blacklisted groups never get output files */
	if (!blacklisted) {
		const string base = prefix + "/" + name;
		f1.open((base + "_1.fq.gz").c_str());
		f2.open((base + "_2.fq.gz").c_str());
	}

	witness(al);
}
Exemple #2
0
/*
 * Trim al.QueryBases and al.Qualities down to the stretch covered by
 * either the first or the last CIGAR operation.
 *
 * copcomp(last, first) picks the side: when it returns true the leading
 * cop1.Length bases are kept, otherwise the trailing cop2.Length bases
 * are kept.  The kept length is capped at al.Length.  An alignment with
 * an empty CIGAR is returned untouched (indexing it would be undefined
 * behaviour).  If substr() throws, diagnostics are printed and the
 * process exits with status 1.
 */
void
clipAlignment(BamAlignment &al)
{
	/* No CIGAR operations: nothing to clip against */
	if (al.CigarData.empty())
		return;

	int offset, length;
	CigarOp cop1 = al.CigarData.front();
	CigarOp cop2 = al.CigarData.back();

	if (copcomp(cop2, cop1)) {
		/* Keep the head of the read */
		offset = 0;
		length = min(al.Length, (signed)cop1.Length);
	} else {
		/* Keep the tail of the read */
		length = min(al.Length, (signed)cop2.Length);
		offset = al.Length - length;
	}

	try {
		al.Qualities = al.Qualities.substr(offset, length);
		al.QueryBases = al.QueryBases.substr(offset, length);
	} catch (const exception &) {
		cout << "ERROR: substr failed in clipAlignment()" << endl;
		cout << al.Name << " " << (al.IsReverseStrand() ? "(-)" : "(+)");
		cout << " offset: " << offset << " length: " << length
		     << " taglen: " << al.Length << endl;
		cout << "cop1: " << cop1.Length << cop1.Type << endl;
		cout << "cop2: " << cop2.Length << cop2.Type << endl;
		exit(1);
	}
}
pos_t VariantProcessor::processMatchOrMismatch(const BamAlignment& alignment, 
					       vector<VariantPtr>& read_variants, 
					       const uint32_t& op_length, const string& refseq, 
					       const pos_t& refpos, const pos_t& readpos) {
  // Process a matching or mismatching sequence in the CIGAR string,
  // adding any SNP variants present.
  //
  // For each of the op_length bases, the read base at readpos + i is
  // compared with the reference base at refseq[refpos + i]. Every
  // difference is recorded as a SNP Variant at alignment.Position + i,
  // inserted into block_variants and appended to read_variants.
  //
  // NOTE(review): the original had no return statement, which is
  // undefined behaviour for a non-void function. It now returns
  // op_length (the number of bases examined) so the result is defined;
  // confirm against callers which value they actually expect.
  const int endpos = alignment.GetEndPosition();
  const int n_bases = static_cast<int>(op_length);
  for (int i = 0; i < n_bases; i++) {
    assert(alignment.Position + i < endpos);
    char query_base = alignment.QueryBases[readpos + i];
    assert(refpos + i < refseq.size());
    char ref_base = refseq[refpos + i];
    if (ref_base != query_base) {
      // SNP
      string ref(1, ref_base), alt(1, query_base);
      // Qualities is indexed like QueryBases, i.e. by read offset; the
      // reference offset used before could read past the end of the
      // string once refpos exceeds readpos.
      char qual_base = alignment.Qualities[readpos + i];
      (void)qual_base; // not consumed yet

      VariantPtr snp(new Variant(VariantType::SNP, alignment.RefID, 
				 alignment.Position+i, 1, 0, ref, alt));
      block_variants.insert(snp);
      read_variants.push_back(snp);
    }
  }
  (void)endpos; // only read by the assert above
  return op_length;
}
Exemple #4
0
// setAsUnpaired: copies every record of the input BAM to the output BAM
// with the "paired" flag cleared. Only unmapped reads are supported; the
// program stops with an error at the first mapped read.
//
// Exit status: 0 on success, 1 on usage error, open failure or on
// encountering a mapped read.
int main (int argc, char *argv[]) {

    // Both the input and the output path are required. Fewer than two
    // arguments (which includes a lone -h/-help/--help) prints the usage
    // text; this also keeps argv[2] from being read when it is absent.
    if( argc < 3 ){
	cout<<"Usage:setAsUnpaired [in bam] [outbam]"<<endl<<"this program takes flags all paired sequences as singles"<<endl;
	return 1;
    }

    string bamfiletopen = string(argv[1]);
    string bamFileOUT   = string(argv[2]);

    BamReader reader;
    BamWriter writer;

    if ( !reader.Open(bamfiletopen) ) {
	cerr << "Could not open input BAM files." << endl;
	return 1;
    }

    // The output reuses the header and reference list of the input
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();
    if ( !writer.Open(bamFileOUT,header,references) ) {
	cerr << "Could not open output BAM file "<<bamFileOUT << endl;
	return 1;
    }

    BamAlignment al;

    while ( reader.GetNextAlignment(al) ) {
	if(al.IsMapped()){
	    cerr << "Cannot yet handle mapped reads " << endl;
	    return 1;
	}

	al.SetIsPaired (false);

	writer.SaveAlignment(al);

    } //while al

    reader.Close();
    writer.Close();

    return 0;
}
Exemple #5
0
int main (int argc, char *argv[]) {

     if( (argc== 1) ||
    	(argc== 2 && string(argv[1]) == "-h") ||
    	(argc== 2 && string(argv[1]) == "-help") ||
    	(argc== 2 && string(argv[1]) == "--help") ){
	 cout<<"Usage:editDist [in bam]"<<endl<<"this program returns the NM field of all aligned reads"<<endl;
	 return 1;
     }

     string bamfiletopen = string(argv[1]);
     // cout<<bamfiletopen<<endl;
     BamReader reader;
     // cout<<"ok"<<endl;
     if ( !reader.Open(bamfiletopen) ) {
	 cerr << "Could not open input BAM files." << endl;
	 return 1;
     }

     BamAlignment al;
     // cout<<"ok"<<endl;
     while ( reader.GetNextAlignment(al) ) {
	 // cout<<al.Name<<endl;
	 if(!al.IsMapped())
	     continue;

	 if(al.HasTag("NM") ){
	     int editDist;
	     if(al.GetTag("NM",editDist) ){
		 cout<<editDist<<endl;
	     }else{
		 cerr<<"Cannot retrieve NM field for "<<al.Name<<endl;
		 return 1;
	     }
	 }else{
	     cerr<<"Warning: read "<<al.Name<<" is aligned but has no NM field"<<endl;
	 }

		    

     } //while al

     reader.Close();

     return 0;
}
// Read every alignment from `reader`, grouping mapped reads into blocks.
//
// Unmapped reads are skipped. For each mapped read, isBlockEnd() is
// consulted once the read starts past block_start; when it reports the
// end of a block, processBlockAlignments() runs and the block is reset
// at the current read's position, otherwise the read goes through
// processAlignment(). Returns the number of mapped reads seen.
int VariantProcessor::run() {

  int nmapped = 0;
  last_aln_pos = 0;
  // Must start as false: it is tested below even on iterations where
  // isBlockEnd() was not called (read not yet past block_start).
  bool stop = false;
  BamAlignment al; // TODO: mem copying issues?
  
  if (!reader.IsOpen()) {
    std::cerr << "error: BAM file '" << filename << "' is not open." << std::endl;
  }

  while (reader.GetNextAlignment(al)) {
    if (!al.IsMapped()) continue;

    // TODO add chromosome checking code here

    assert(al.Position >= last_aln_pos); // ensure is sorted

    if (al.Position > block_start) {
      // only check if we can stop if we've moved in the block
      stop = isBlockEnd(al);
    }

    if (stop) {
      // end of block; process all variants in this block and output
      // haplotype count statistics
      processBlockAlignments();
      
      // reset block
      // NOTE(review): the read that triggered the block end is not
      // passed to processAlignment() here - confirm that is intended.
      blockReset((pos_t) al.Position);
      stop = false;
    } else {
      // process read
      last_aln_pos = processAlignment(al);
    }
    
    // for debug TODO
    for (set<VariantPtr>::const_iterator it = block_variants.begin(); it != block_variants.end(); ++it) {
      (*it)->print();
    }

    nmapped++;
  }

  return nmapped;
}
Exemple #7
0
// Classifies an alignment relative to the currently specified region:
// BEFORE_REGION, WITHIN_REGION or AFTER_REGION.
// This *internal* method should ONLY be called when (at least) the left bound is specified.
BamReaderPrivate::RegionState BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {

    // alignment lies on a reference that precedes the left bound's reference
    if ( bAlignment.RefID < Region.LeftRefID )
        return BEFORE_REGION;

    // alignment lies on a reference past the left bound's reference
    if ( bAlignment.RefID > Region.LeftRefID ) {

        // no right bound: everything after the left reference is inside
        if ( !Region.isRightBoundSpecified() )
            return WITHIN_REGION;

        // strictly between the two bound references
        if ( bAlignment.RefID < Region.RightRefID )
            return WITHIN_REGION;

        // past the right bound's reference
        if ( bAlignment.RefID > Region.RightRefID )
            return AFTER_REGION;

        // on the right bound's reference: inside only if it starts at or before the bound
        return ( bAlignment.Position <= Region.RightPosition ) ? WITHIN_REGION
                                                                : AFTER_REGION;
    }

    // from here on the alignment is on the left bound's reference

    // starts before the left boundary: inside only if it reaches the boundary
    if ( bAlignment.Position < Region.LeftPosition ) {
        return ( bAlignment.GetEndPosition() >= Region.LeftPosition ) ? WITHIN_REGION
                                                                       : BEFORE_REGION;
    }

    // starts at or after the left boundary: outside only when a right bound
    // exists on this same reference and the alignment starts beyond it
    if ( Region.isRightBoundSpecified() &&
         Region.LeftRefID == Region.RightRefID &&
         bAlignment.Position > Region.RightPosition )
        return AFTER_REGION;

    return WITHIN_REGION;
}
Exemple #8
0
    void getBamBlocks(const BamAlignment &bam, const RefVector &refs,
                      vector<BED> &blocks, bool breakOnDeletionOps) {

        CHRPOS currPosition = bam.Position;
        CHRPOS blockStart   = bam.Position;
        string chrom        = refs.at(bam.RefID).RefName;
        string name         = bam.Name;
        string strand       = "+";
        string score        = ToString(bam.MapQuality);
        char  prevOp        = '\0';
        if (bam.IsReverseStrand()) strand = "-";
        bool blocksFound = false;

        vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin();
        vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end();
        for ( ; cigItr != cigEnd; ++cigItr ) {
            if (cigItr->Type == 'M') {
                currPosition += cigItr->Length;
                // we only want to create a new block if the current M op
                // was preceded by an N op or a D op (and we are breaking on D ops)
                if ((prevOp == 'D' && breakOnDeletionOps == true) || (prevOp == 'N')) {
                    blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) );
                    blockStart = currPosition;
                }
            }
            else if (cigItr->Type == 'D') {
                if (breakOnDeletionOps == false)
                    currPosition += cigItr->Length;
                else {
                    blocksFound = true;
                    currPosition += cigItr->Length;
                    blockStart    = currPosition;
                }
            }
            else if (cigItr->Type == 'N') {
                blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) );
                blocksFound = true;
                currPosition += cigItr->Length;
                blockStart    = currPosition;
            }
            else if (cigItr->Type == 'S' || cigItr->Type == 'H' || cigItr->Type == 'P' || cigItr->Type == 'I') {
                // do nothing
            }
            else {
                cerr << "Input error: invalid CIGAR type (" << cigItr->Type
                    << ") for: " << bam.Name << endl;
                exit(1);
            }
            prevOp = cigItr->Type;
        }
        // if there were no splits, we just create a block representing the contiguous alignment.
        if (blocksFound == false) {
            blocks.push_back( BED(chrom, bam.Position, currPosition, name, score, strand) );
        }
    }
// Streams every alignment from bamReader, feeds the usable ones into
// `statistics`, prints the statistics to stdout and closes the BAM reader
// and the genome FASTA. Always returns 1.
int DataStatisticsTool::Execute()
{
    BamAlignment alignObj;
    while(bamReader.GetNextAlignment(alignObj))
    {
        // An alignment is used only if it is a mapped, primary,
        // non-duplicate, QC-passing record with an MD tag; paired reads
        // must additionally be a proper pair with a mapped mate.
        const bool skip =
            alignObj.IsDuplicate()
            || alignObj.IsFailedQC()
            || !alignObj.IsMapped()
            || !alignObj.IsPrimaryAlignment()
            || (alignObj.IsPaired()
                && (!alignObj.IsProperPair() || !alignObj.IsMateMapped()))
            || !alignObj.HasTag("MD");
        if (skip)
            continue;

//        // debug
//        GenericBamAlignmentTools::printBamAlignmentCigar(alignObj);
//        GenericBamAlignmentTools::printBamAlignmentMD(alignObj);

        // left-shift indels before extracting sequences
        GenericBamAlignmentTools::leftShiftInDel(alignObj);

        // aligned read / genome sequences for this record
        string alignRead;
        string alignGenome;
        GenericBamAlignmentTools::getAlignmentSequences(alignObj, alignRead, alignGenome);

        statistics.update(alignRead, alignGenome);
    }

    // report
    cout << statistics << endl;
//    statistics.printMatchMismatch();

    // release inputs
    bamReader.Close();
    genomeFasta.Close();

    return 1;
}
Exemple #10
0
// Fetches the next alignment and fully parses its character data.
// Returns false when no alignment is available or when parsing fails;
// in the latter case the reader's error string is set.
bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) {

    // nothing more to read
    if ( !GetNextAlignmentCore(alignment) )
        return false;

    // remember which file this alignment came from
    alignment.Filename = m_filename;

    // character data parsed fine
    if ( alignment.BuildCharData() )
        return true;

    // parsing failed: surface the alignment's own error text
    const string alError = alignment.GetErrorString();
    string message("could not populate alignment data: \n\t");
    message += alError;
    SetErrorString("BamReader::GetNextAlignment", message);
    return false;
}
Exemple #11
0
//{{{ void process_pair(const BamAlignment &curr,
void
SV_Pair::
process_pair(const BamAlignment &curr,
             const RefVector refs,
             map<string, BamAlignment> &mapped_pairs,
             UCSCBins<SV_BreakPoint*> &r_bin,
             int weight,
             int ev_id,
             SV_PairReader *reader)
{
    if (mapped_pairs.find(curr.Name) == mapped_pairs.end())
        mapped_pairs[curr.Name] = curr;
    else {
        SV_Pair *new_pair = new SV_Pair(mapped_pairs[curr.Name],
                                        curr,
                                        refs,
                                        weight,
                                        ev_id,
                                        reader);
        //cerr << count_clipped(curr.CigarData) << "\t" <<
                //count_clipped(mapped_pairs[curr.Name].CigarData) << endl;
                
        if ( new_pair->is_sane() &&  
             new_pair->is_aberrant() &&
             (count_clipped(curr.CigarData) > 0) &&
             (count_clipped(mapped_pairs[curr.Name].CigarData) > 0) ) {
            SV_BreakPoint *new_bp = new_pair->get_bp();

#ifdef TRACE

            cerr << "READ\t" << 
                    refs.at(mapped_pairs[curr.Name].RefID).RefName << "," <<
                    mapped_pairs[curr.Name].Position << "," <<
                    (mapped_pairs[curr.Name].GetEndPosition(false, false) - 1)
                        << "\t" <<
                    refs.at(curr.RefID).RefName << "," <<
                    curr.Position << "," <<
                    (curr.GetEndPosition(false, false) - 1)
                        <<
                    endl;

            cerr << "\tPE\t" << *new_bp << endl;
#endif
            new_bp->cluster(r_bin);
        } else {
            delete(new_pair);
        }

        mapped_pairs.erase(curr.Name);
    }
}
string createReferenceSequence(const BamAlignment& alignment) {
  // Recreate a reference sequence for a particular alignment. This is
  // the reference sequence that is identical to the reference at this
  // spot. This means skipping insertions or soft clipped regions in
  // reads, adding deletions back in, and keeping read matches.
  //
  // The MD tag is read from the alignment and tokenized with
  // TokenizeMD(); the returned string is built from the read bases for
  // match tokens and from the token's own sequence for SNP and deletion
  // tokens.
  const vector<CigarOp>& cigar = alignment.CigarData;
  const string& querybases = alignment.QueryBases;
  string md_tag;
  alignment.GetTag("MD", md_tag);
  
  vector<MDToken> tokens;
  string refseq, alignedseq; // final ref bases; aligned portion of ref bases
  int md_len = TokenizeMD(md_tag, tokens);

  // Create reference-aligned sequence of read; doesn't contain soft
  // clips or insertions. Then, deletions and reference alleles are
  // added onto this.
  int pos=0;
  for (vector<CigarOp>::const_iterator op = cigar.begin(); op != cigar.end(); ++op) {
    switch (op->Type) {
    case 'M':
    case '=':
    case 'X':
      // aligned bases: present in both read and reference
      alignedseq.append(querybases.substr(pos, op->Length));
      pos += op->Length;
      break;
    case 'S':
    case 'I':
      // present in the read only: step over them
      pos += op->Length;
      break;
    default:
      // D, N, H and P consume no read bases, so the read position must
      // not move and nothing is copied (deletions come from the MD tag)
      break;
    }
  }

  // the size of the aligned sequence MUST equal what is returned from
  // TokenizeMD: the number of aligned bases. Note the real reference
  // sequence is this length + deletions, which we add in below.
  assert(alignedseq.size() == static_cast<size_t>(md_len));

  pos = 0;
  for (vector<MDToken>::const_iterator it = tokens.begin(); it != tokens.end(); ++it) {
    if (it->type == MDType::isMatch) {
      refseq.append(alignedseq.substr(pos, it->length));
      pos += it->length;
    } else if (it->type == MDType::isSNP) {
      assert(it->length == it->seq.size());
      refseq.append(it->seq);
      pos += it->length;
    } else if (it->type == MDType::isDel) {
      // does not increment position in alignedseq
      assert(it->length == it->seq.size());
      refseq.append(it->seq);
    } else {
      assert(false);
    }
  }
  (void)md_len; // only read by the assert above
  return refseq;
}
Exemple #13
0
// Sweeps positions 0..refLen-1 of reference `refID` and, for each
// position, increments hist[depth] where depth is the number of reads
// that have started at or before the position and not yet ended.
//
// `al` must hold the next unprocessed alignment on entry; further
// alignments are pulled with GetNextAlignment(al, reader, refID). Read
// ends are tracked in a ring buffer of maxReadLen slots, so (as the
// assert states) a read must span fewer than maxReadLen positions.
// `hist` is grown on demand.
void CountDepth(Histogram& hist, BamMultiReader& reader, BamAlignment& al, int32_t refID, int64_t refLen)
{
    bool moreReads = (al.RefID == refID);

    int32_t maxReadLen = 1000;
    vector<int64_t> readEnds(maxReadLen);

    int64_t depth = 0;
    for(int64_t pos=0; pos<refLen; ++pos){
        // open every read that starts here
        while(moreReads and al.Position == pos){
            ++depth;
            assert(al.GetEndPosition() - pos < maxReadLen);
            ++readEnds[al.GetEndPosition() % maxReadLen];
            moreReads = GetNextAlignment(al, reader, refID);
        }
        // close every read that ends here, then recycle the slot
        depth -= readEnds[pos % maxReadLen];
        assert(depth >= 0);
        readEnds[pos % maxReadLen] = 0;
        // Grow geometrically, but always to at least one bin: with an
        // empty histogram and depth 0, doubling would leave it empty and
        // the increment below would write out of bounds.
        if(depth >= hist.size())
            hist.resize(depth > 0 ? 2 * depth : 1);
        ++hist[depth];
    }
}
Exemple #14
0
// Fetches the next alignment and fully parses its character data.
// Returns false when no alignment is available, otherwise the result of
// parsing the character data.
bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) {

    // nothing more to read
    if ( !GetNextAlignmentCore(alignment) )
        return false;

    // remember which file this alignment came from
    alignment.Filename = m_filename;

    // success/failure of parsing char data is the overall result
    return alignment.BuildCharData();
}
// Decides whether a pair of alignments is accepted as a link pair.
//
// Exits the program if both alignments claim the same mate order.
// Returns false when checkLinkPair() yields 0, or when critTail is
// non-zero and neither read is a link pair candidate; returns true
// otherwise. With debug_processReadPair set, details of both reads are
// printed to stdout.
bool 
processReadPair(const BamAlignment& al1, 
        const BamAlignment& al2, 
        const RefVector& refs, 
        const int32_t totalTail, 
        const int32_t critTail, 
        const bool diff_ref)
{
    // the two reads must not both be first mates or both be second mates
    if ((al1.IsFirstMate() && al2.IsFirstMate())
        || (al1.IsSecondMate() && al2.IsSecondMate())) {
        cerr << "Incompatible mate orders: name1 = " << al1.Name 
             << " is1stmate " << al1.IsFirstMate() << " is2ndmate " << al1.IsSecondMate()
             << " name2 = " << al2.Name 
             << " is1stmate " << al2.IsFirstMate() << " is2ndmate " << al2.IsSecondMate()
             << endl;
        exit(1);
    }

    // reject all but link pairs
    const int32_t total_tail = checkLinkPair(al1, al2, refs, totalTail, critTail, diff_ref);
    if (total_tail == 0) {
        return false;
    }

    // with a critical tail set, at least one read must be a candidate
    if (critTail != 0) {
        const bool anyCandidate = checkLinkPairCandidate(al1, refs, critTail)
                                  || checkLinkPairCandidate(al2, refs, critTail);
        if (!anyCandidate) {
            return false;
        }
    }

    if (debug_processReadPair) cout << "---------------------------------" << endl;
    const int32_t lpc_tail1 = checkLinkPairCandidate(al1, refs, critTail);
    const int32_t lpc_tail2 = checkLinkPairCandidate(al2, refs, critTail);

    if (!debug_processReadPair) {
        return true;
    }

    // debug report: per-read info, candidate direction, combined tail
    printAlignmentInfo(al1, refs);
    if (lpc_tail1) {
        cout << "LINK PAIR CANDIDATE ";
        cout << ((lpc_tail1 > 0) ? "--->" : "<---") << " " << lpc_tail1 << endl;
    }
    printAlignmentInfo(al2, refs);
    if (lpc_tail2) {
        cout << "LINK PAIR CANDIDATE ";
        cout << ((lpc_tail2 > 0) ? "--->" : "<---") << " " << lpc_tail2 << endl;
    }
    cout << "TOTAL TAIL " << (abs(readTail(al1, refs)) + abs(readTail(al2, refs))) << endl;

    return true;
}
Exemple #16
0
// Decides whether an alignment should be realigned.
//
//  - all-N query:            never realigned
//  - unmapped read:          always realigned
//  - empty CIGAR:            always realigned (reported on stderr)
//  - otherwise:              realigned when any of the mismatch quality
//                            sum, softclip quality sum, gap count or gap
//                            length in `stats` reaches its threshold in
//                            `params`
// `stats` is filled by countMismatchesAndGaps() in the last case.
bool shouldRealign(BamAlignment& alignment,
                   string& ref,
                   long int offset,
                   Parameters& params,
                   AlignmentStats& stats) {

    const bool verbose = params.debug;

    if (allN(alignment.QueryBases)) {
        if (verbose) {
            cerr << "not realigning because query is all Ns! " << alignment.Name << endl;
        }
        return false;
    }

    if (!alignment.IsMapped()) {
        if (verbose) {
            cerr << "realigning because read " << alignment.Name << " is not mapped " << endl;
        }
        return true;
    }

    // reported regardless of the debug setting
    if (alignment.CigarData.empty()) {
        cerr << "realigning because alignment " << alignment.Name << " @ " << alignment.Position
             << " has empty (or corrupted?) CIGAR" << endl;
        return true;
    }

    Cigar cigar(alignment.CigarData);
    countMismatchesAndGaps(alignment, cigar, ref, offset, stats, params.debug);

    const bool overThreshold =
        stats.mismatch_qsum >= params.mismatch_qsum_threshold
        || stats.softclip_qsum >= params.softclip_qsum_threshold
        || stats.gaps >= params.gap_count_threshold
        || stats.gapslen >= params.gap_length_threshold;

    if (overThreshold && params.debug) {
        cerr << "realigning because read " << alignment.Name
             << " meets mismatch (q" << stats.mismatch_qsum << " in " << stats.mismatches << ")"
             << " softclip (q" << stats.softclip_qsum << " in " << stats.softclips << ")"
             << " gap count (" << stats.gaps << ")"
             << " or gap length (" << stats.gapslen << ")"
             << " thresholds" << endl;
        cerr << cigar << endl;
    }

    return overThreshold;
}
Exemple #17
0
/**
 * Looks up the library name for a record via its RG tag.
 *
 * Returns the Library field of the matching read group in the header.
 * "Unknown Library" is returned when the record has no (or an empty) RG
 * tag, when the header has no read group with that id, or when the read
 * group does not declare a library.
 */
string MarkDuplicates::getLibraryName(SamHeader & header, const BamAlignment & rec) {     
    
    static const string RG("RG");
    static const string unknown_library("Unknown Library");

    string read_group;
    rec.GetTag(RG, read_group);

    // no usable read group id, or the header does not know it
    if (read_group.empty() || !header.ReadGroups.Contains(read_group)) {
        return unknown_library;
    }

    const SamReadGroup & rg = header.ReadGroups[read_group];
    if (!rg.HasLibrary()) {
        return unknown_library;
    }

    return rg.Library;
}
Exemple #18
0
//bool SampleManager::IdentifySample(Alignment& ra) const
bool SampleManager::IdentifySample(const BamAlignment& alignment, int& sample_index, bool& primary_sample) const
{

  string read_group;
  if (!alignment.GetTag("RG", read_group)) {
    cerr << "ERROR: Couldn't find read group id (@RG tag) for BAM Alignment " << alignment.Name << endl;
    exit(1);
  }

  map<string,int>::const_iterator I = read_group_to_sample_idx_.find(read_group);
  if (I == read_group_to_sample_idx_.end())
    return false;

  sample_index =I->second;
  primary_sample = (sample_index == primary_sample_);

  return true;
}
Exemple #19
0
// Adds one alignment to this contig's coverage tracks.
//
// The read is classified with computeReadType(); unmapped and low-quality
// reads are ignored. Every other read contributes [Position,
// Position + Length) to readCov and to the track matching its class. A
// properly paired first mate additionally contributes an interval of
// |InsertSize| bases, starting at the smaller of the read and mate
// positions, to insertCov.
void Contig::updateContig(BamAlignment b, int max_nsert, bool is_mp) {
	readStatus read_status = computeReadType(b, max_nsert, is_mp);

	// reads that are unmapped or of low quality do not count at all
	if (read_status == unmapped or read_status == lowQualty) {
		return;
	}

	uint32_t startRead     = b.Position;
	uint32_t endRead       = startRead + b.Length; // position where the read ends
	uint32_t startMateRead = b.MatePosition;

	// overall read coverage
	updateCov(startRead, endRead, readCov);

	// insert coverage: counted once per proper pair, on the first mate
	if (b.IsFirstMate() && read_status == pair_proper) {
		int iSize = abs(b.InsertSize);
		uint32_t insertStart = (startRead < startMateRead) ? startRead : startMateRead;
		updateCov(insertStart, insertStart + iSize, insertCov);
	}

	// per-class coverage
	switch (read_status) {
	case singleton:
		updateCov(startRead, endRead, singCov);
		break;
	case pair_wrongChrs:
		updateCov(startRead, endRead, mdcCov);
		break;
	case pair_wrongDistance:   // shares a track with wrong orientation
	case pair_wrongOrientation:
		updateCov(startRead, endRead, woCov);
		break;
	case pair_proper:
		updateCov(startRead, endRead, cmCov);
		break;
	default:
		cout << read_status << " --> This should never be printed\n";
		break;
	}
}
// Updates the model from one alignment group.
// Can be either unique or multi-mapping reads.
//
// Multi-mapping groups (size > 1) only update the fragment length
// distribution, each alignment weighted by 1/size. Otherwise the sequence
// model is updated for mate 1 and, when model_type >= 2, for mate 2.
//
// The getSEQ/getQUAL/getCIGAR/getMD calls fill member buffers and must
// run in every build, so they are made outside assert(); only their
// results are asserted. (Inside assert() they vanish under NDEBUG.)
inline void PROBerReadModel_iCLIP::update(AlignmentGroup& ag) {
	int size = ag.size();
	BamAlignment *ba = NULL;
	char dir;

	if (size > 1) {
		assert(model_type >= 2);
		double frac = 1.0 / size;
		for (int i = 0; i < size; ++i) {
			ba = ag.getAlignment(i);
			fld->update(ba->getInsertSize(), frac);
		}
		return;
	}

	bool ok = ag.getSEQ(seq);
	assert(ok);
	if (model_type & 1) {
		ok = ag.getQUAL(qual);
		assert(ok);
	}
	for (int i = 0; i < size; ++i) {
		ba = ag.getAlignment(i);
		dir = ba->getMateDir();
		ok = ba->getCIGAR(cigar);
		assert(ok);
		ok = ba->getMD(mdstr);
		assert(ok);
		refseq.setUp(dir, cigar, mdstr, seq);
		seqmodel->update(1.0, dir, 0, &refseq, &cigar, &seq, ((model_type & 1) ? &qual : NULL));
	}

	if (model_type >= 2) {
		// same again for the second mate
		ok = ag.getSEQ(seq, 2);
		assert(ok);
		if (model_type & 1) {
			ok = ag.getQUAL(qual, 2);
			assert(ok);
		}
		for (int i = 0; i < size; ++i) {
			ba = ag.getAlignment(i);
			dir = ba->getMateDir(2);
			ok = ba->getCIGAR(cigar, 2);
			assert(ok);
			ok = ba->getMD(mdstr, 2);
			assert(ok);
			refseq.setUp(dir, cigar, mdstr, seq);
			seqmodel->update(1.0, dir, 0, &refseq, &cigar, &seq, ((model_type & 1) ? &qual : NULL));
		}
	}

	(void)ok; // only inspected by assert()
}
// Computes normalized conditional probabilities for every alignment in
// the group and stores them in conprbs[0 .. ag.size()-1].
//
// Each entry starts as the sequence-model probability of mate 1; when
// model_type >= 2 it is multiplied by the mate-2 probability and by the
// fragment length probability. The entries are then divided by their
// sum, or left unscaled when the sum is not positive.
//
// The getSEQ/getQUAL/getCIGAR/getMD calls fill member buffers and must
// run in every build, so they are made outside assert(); only their
// results are asserted. (Inside assert() they vanish under NDEBUG.)
inline void PROBerReadModel_iCLIP::calcProbs(AlignmentGroup& ag, double* conprbs) {
	int size = ag.size();
	BamAlignment *ba = NULL;
	char dir;

	bool ok = ag.getSEQ(seq);
	assert(ok);
	if (model_type & 1) {
		ok = ag.getQUAL(qual);
		assert(ok);
	}
	for (int i = 0; i < size; ++i) {
		ba = ag.getAlignment(i);
		dir = ba->getMateDir();
		ok = ba->getCIGAR(cigar);
		assert(ok);
		ok = ba->getMD(mdstr);
		assert(ok);
		refseq.setUp(dir, cigar, mdstr, seq);
		conprbs[i] = seqmodel->getProb(dir, 0, &refseq, &cigar, &seq, ((model_type & 1) ? &qual : NULL));
	}

	if (model_type >= 2) {
		// fold in the second mate and the fragment length
		ok = ag.getSEQ(seq, 2);
		assert(ok);
		if (model_type & 1) {
			ok = ag.getQUAL(qual, 2);
			assert(ok);
		}
		for (int i = 0; i < size; ++i) {
			ba = ag.getAlignment(i);
			dir = ba->getMateDir(2);
			ok = ba->getCIGAR(cigar, 2);
			assert(ok);
			ok = ba->getMD(mdstr, 2);
			assert(ok);
			refseq.setUp(dir, cigar, mdstr, seq);
			conprbs[i] *= seqmodel->getProb(dir, 0, &refseq, &cigar, &seq, ((model_type & 1) ? &qual : NULL));

			conprbs[i] *= fld->getProb(ba->getInsertSize()); // fragment length distribution
		}
	}

	(void)ok; // only inspected by assert()

	double sum = 0.0;
	for (int i = 0; i < size; ++i) sum += conprbs[i];

	// a non-positive total would make the division meaningless; dividing
	// by 1 leaves the values as they are
	//assert(sum > 0.0);
	if (sum <= 0.0) sum = 1.0;
	
	for (int i = 0; i < size; ++i) conprbs[i] /= sum;  
}
Exemple #22
0
// Updates the per-cycle counters (matches, mismatches, typesOfMismatches)
// for one alignment.
//
// Position i of reconstructedReference is compared with position i of
// al.QueryBases for i in [0, numberOfCycles). The counters are indexed by
// a cycle number that starts at firstCycleRead and moves by `increment`
// for every position. A reference character of 'M' counts as a match;
// 'S' and 'I' are skipped. Anything else counts as a mismatch when both
// characters are resolved DNA bases; on reverse-strand reads both bases
// are complemented before the mismatch type is recorded.
inline void increaseCounters(BamAlignment & al,string & reconstructedReference,int firstCycleRead,int increment){

    int cycle = firstCycleRead;

    for(int idx=0; idx<numberOfCycles; idx++, cycle+=increment){
	char refeBase = toupper(reconstructedReference[idx]);
	char readBase = toupper(al.QueryBases[idx]);

	// match
	if(refeBase == 'M'){
	    matches[cycle]++;
	    continue;
	}

	// soft clipped bases and indels are not counted
	if(refeBase == 'S' || refeBase == 'I')
	    continue;

	// only fully resolved bases can form a mismatch
	if( !(isResolvedDNA(refeBase) && isResolvedDNA(readBase)) )
	    continue;

	// report the substitution on the read's own strand
	if(al.IsReverseStrand()){
	    refeBase=complement(refeBase);
	    readBase=complement(readBase);
	}

	// identical bases here mean the reconstruction is inconsistent
	if(readBase == refeBase){
	    cerr<<"Internal error in reconstruction of read "<<al.Name<<", contact developer"<<endl;
	    exit(1);
	}

	mismatches[cycle]++;
	typesOfMismatches[dimer2index(refeBase,readBase)][cycle]++;
    }
}
 // Splits a BAM alignment into reference-coordinate blocks and appends
 // one Bed record per block to `blocks`.
 //
 // A block is a run of reference positions covered by M ops (and by D
 // ops when breakOnDeletionOps is false). N ops always end the current
 // block; D ops end it only when breakOnDeletionOps is true. S, H, P and
 // I ops do not move the reference position. Any other op type is
 // reported on stderr and terminates the program.
 //
 // Previously a block was emitted at every M op, which cut one aligned
 // segment into several abutting blocks at each insertion and at each
 // deletion even when breakOnDeletionOps was false. Blocks are now closed
 // only at real splits and once at the end.
 void getBamBlocks(const BamAlignment &bam, const RefVector &refs, 
                   BedVec &blocks, bool breakOnDeletionOps) {
 
     CHRPOS currPosition = bam.Position;
     CHRPOS blockStart   = bam.Position;
     string chrom        = refs.at(bam.RefID).RefName;
     string name         = bam.Name;
     string strand       = "+";
     float  score        = bam.MapQuality;
     if (bam.IsReverseStrand()) strand = "-"; 
 
     vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin();
     vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end();
     for ( ; cigItr != cigEnd; ++cigItr ) {
         const char opType = cigItr->Type;

         if (opType == 'M') {
             // extends the current block
             currPosition += cigItr->Length;
         }
         else if (opType == 'N' || (opType == 'D' && breakOnDeletionOps)) {
             // a split: close the block collected so far (if any bases
             // were covered), then restart after the gap
             if (currPosition > blockStart) {
                 blocks.push_back( Bed(chrom, blockStart, currPosition, name, score, strand) );
             }
             currPosition += cigItr->Length;
             blockStart    = currPosition;
         }
         else if (opType == 'D') {
             // deletion kept inside the block
             currPosition += cigItr->Length;
         }
         else if (opType == 'S' || opType == 'H' || opType == 'P' || opType == 'I') {
             // do nothing
         }
         else {
             cerr << "Input error: invalid CIGAR type (" << cigItr->Type
                  << ") for: " << bam.Name << endl;
             exit(1);
         }
     }

     // flush the trailing block, if it covers any reference bases
     if (currPosition > blockStart) {
         blocks.push_back( Bed(chrom, blockStart, currPosition, name, score, strand) );
     }
 }
Exemple #24
0
/*
 * Returns a copy of `a` whose QueryBases and Qualities are cut down to
 * `len` characters and whose Length is set to `len`.  The result is not
 * a valid BamAlignment; it only carries correct FASTQ data.
 *
 * `start` counts from the 5' end of the read: for reverse strand
 * alignments the offset into the stored strings is mirrored, so
 * 'start=0' always corresponds to the 5'-most basepair.  If substr()
 * throws, diagnostics are printed and the process exits with status 1.
 */
BamAlignment
snip(BamAlignment &a, int start, int len)
{
	BamAlignment copy(a);

	/* Translate the 5'-relative start into an offset in the stored strings */
	int converted_start = start;
	if (copy.IsReverseStrand())
		converted_start = copy.Length - start - len;

	copy.Length = len;

	try {
		copy.QueryBases = copy.QueryBases.substr(converted_start, len);
		copy.Qualities = copy.Qualities.substr(converted_start, len);
	} catch (exception &e) {
		cout << "ERROR: substr failed in snip(" << a.Name << ", "
		     << start << ", " << len << ")" << endl;
		cout << (a.IsReverseStrand() ? "(-)" : "(+)")
		     << ", converted_start: " << converted_start << endl;
		cout << a.QueryBases << endl;
		cout << a.Qualities << endl;
		exit(1);
	}

	return copy;
}
Exemple #25
0
// Writes one BamAlignment to m_out as a FASTA record.
// N.B. - uses QueryBases NOT AlignedBases
void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a) { 
    
    // Layout:
    //   >BamAlignment.Name
    //   BamAlignment.QueryBases (up to FASTA_LINE_MAX bases per line)
    //   ...
    //
    // N.B. - QueryBases are reverse-complemented if aligned to reverse strand
  
    // header line
    m_out << ">" << a.Name << endl;
    
    // bases, flipped for reverse strand alignments
    string sequence = a.QueryBases;
    if ( a.IsReverseStrand() )
        Utilities::ReverseComplement(sequence);
    
    const size_t seqLength = sequence.length();
    
    // short enough for a single line
    if ( seqLength <= FASTA_LINE_MAX ) {
        m_out << sequence << endl;
        return;
    }
    
    // otherwise emit full-width lines while more than one line's worth
    // remains past the current position ...
    size_t position = 0;
    for ( ; position < (seqLength - FASTA_LINE_MAX); position += FASTA_LINE_MAX )
        m_out << sequence.substr(position, FASTA_LINE_MAX) << endl;
    
    // ... and finish with whatever is left
    m_out << sequence.substr(position) << endl;
}
Exemple #26
0
// print BamAlignment in SAM format
//
// Writes one tab-delimited record for `a` to m_out, terminated by endl:
//   <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ]
//
// Fields with no value are written as "*" (or "0" for mate position and
// insert size). Tag data is decoded from the raw a.TagData buffer; tag types
// not listed in the switch below are not decoded.
void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) {
  
    // write name & alignment flag
    m_out << a.Name << "\t" << a.AlignmentFlag << "\t";
    
    // write reference name ("*" when RefID does not index m_references)
    if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) 
        m_out << m_references[a.RefID].RefName << "\t";
    else 
        m_out << "*\t";
    
    // write position & map quality (position is emitted as Position+1)
    m_out << a.Position+1 << "\t" << a.MapQuality << "\t";
    
    // write CIGAR as concatenated <length><type> pairs, or "*" if there is none
    const vector<CigarOp>& cigarData = a.CigarData;
    if ( cigarData.empty() ) m_out << "*\t";
    else {
        vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
        vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
        for ( ; cigarIter != cigarEnd; ++cigarIter ) {
            const CigarOp& op = (*cigarIter);
            m_out << op.Length << op.Type;
        }
        m_out << "\t";
    }
    
    // write mate reference name, mate position, & insert size
    // ("=" stands for "same reference as this alignment")
    if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
        if ( a.MateRefID == a.RefID )
            m_out << "=\t";
        else
            m_out << m_references[a.MateRefID].RefName << "\t";
        m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t";
    } 
    else
        m_out << "*\t0\t0\t";
    
    // write sequence
    if ( a.QueryBases.empty() )
        m_out << "*\t";
    else
        m_out << a.QueryBases << "\t";
    
    // write qualities (a leading 0xFF byte is treated as "no qualities")
    if ( a.Qualities.empty() || (a.Qualities.at(0) == (char)0xFF) )
        m_out << "*";
    else
        m_out << a.Qualities;
    
    // write tag data
    // each entry in the buffer is: 2-char tag name, 1-char type code, value
    const char* tagData = a.TagData.c_str();
    const size_t tagDataLength = a.TagData.length();
    
    size_t index = 0;
    while ( index < tagDataLength ) {

        // write tag name   
        string tagName = a.TagData.substr(index, 2);
        m_out << "\t" << tagName << ":";
        index += 2;
        
        // get data type
        char type = a.TagData.at(index);
        ++index;
        switch ( type ) {
            case (Constants::BAM_TAG_TYPE_ASCII) :
                m_out << "A:" << tagData[index];
                ++index;
                break;

            // 8-bit integers: the buffer is plain char, whose signedness is
            // implementation-defined, so go through the explicitly signed or
            // unsigned type to get the right value on every platform
            case (Constants::BAM_TAG_TYPE_INT8)  :
                m_out << "i:" << static_cast<int>(static_cast<signed char>(tagData[index]));
                ++index;
                break;

            case (Constants::BAM_TAG_TYPE_UINT8) :
                m_out << "i:" << static_cast<int>(static_cast<unsigned char>(tagData[index]));
                ++index;
                break;

            case (Constants::BAM_TAG_TYPE_INT16) :
                m_out << "i:" << BamTools::UnpackSignedShort(&tagData[index]);
                index += sizeof(int16_t);
                break;

            case (Constants::BAM_TAG_TYPE_UINT16) :
                m_out << "i:" << BamTools::UnpackUnsignedShort(&tagData[index]);
                index += sizeof(uint16_t);
                break;

            case (Constants::BAM_TAG_TYPE_INT32) :
                m_out << "i:" << BamTools::UnpackSignedInt(&tagData[index]);
                index += sizeof(int32_t);
                break;

            case (Constants::BAM_TAG_TYPE_UINT32) :
                m_out << "i:" << BamTools::UnpackUnsignedInt(&tagData[index]);
                index += sizeof(uint32_t);
                break;

            case (Constants::BAM_TAG_TYPE_FLOAT) :
                m_out << "f:" << BamTools::UnpackFloat(&tagData[index]);
                index += sizeof(float);
                break;

            // NUL-terminated text: copy up to the terminator, then step over it
            case (Constants::BAM_TAG_TYPE_HEX)    :
            case (Constants::BAM_TAG_TYPE_STRING) :
                m_out << type << ":";
                while (tagData[index]) {
                    m_out << tagData[index];
                    ++index;
                }
                ++index;
                break;
        }

        // a NUL where the next tag name should start ends the tag list
        if ( tagData[index] == '\0') 
            break;
    }

    m_out << endl;
}
Exemple #27
0
// print BamAlignment in JSON format
//
// Writes one JSON object for `a` to m_out, terminated by endl. Members are
// emitted in this order, and optional ones are left out when they have no
// value: name, alignmentFlag, reference, position, mapQuality, cigar, mate,
// queryBases, qualities, filename, tags.
//
// Every member up to "qualities" is written with a trailing comma; "filename"
// is always present and closes that run, and "tags" supplies its own leading
// comma, so the object never ends in a dangling comma.
void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) {
  
    // write name & alignment flag
    m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\",";
    
    // write reference name (skipped when RefID does not index m_references)
    if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) 
        m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\",";
    
    // write position & map quality (position is emitted as Position+1)
    m_out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ",";
    
    // write CIGAR as an array of "<length><type>" strings
    const vector<CigarOp>& cigarData = a.CigarData;
    if ( !cigarData.empty() ) {
        m_out << "\"cigar\":[";
        vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
        vector<CigarOp>::const_iterator cigarIter  = cigarBegin;
        vector<CigarOp>::const_iterator cigarEnd   = cigarData.end();
        for ( ; cigarIter != cigarEnd; ++cigarIter ) {
            const CigarOp& op = (*cigarIter);
            if (cigarIter != cigarBegin)
                m_out << ",";
            m_out << "\"" << op.Length << op.Type << "\"";
        }
        m_out << "],";
    }
    
    // write mate reference name, mate position, & insert size
    if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
        m_out << "\"mate\":{"
              << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\","
              << "\"position\":" << a.MatePosition+1
              << ",\"insertSize\":" << a.InsertSize << "},";
    }
    
    // write sequence
    if ( !a.QueryBases.empty() ) 
        m_out << "\"queryBases\":\"" << a.QueryBases << "\",";
    
    // write qualities as numbers (character value minus 33);
    // a leading 0xFF byte is treated as "no qualities"
    if ( !a.Qualities.empty() && a.Qualities.at(0) != (char)0xFF ) {
        string::const_iterator s = a.Qualities.begin();
        m_out << "\"qualities\":[" << static_cast<short>(*s) - 33;
        ++s;
        for ( ; s != a.Qualities.end(); ++s )
            m_out << "," << static_cast<short>(*s) - 33;
        m_out << "],";
    }
    
    // write alignment's source BAM file
    // the value is a JSON string: it must be quoted, and quotes/backslashes
    // inside it (e.g. Windows paths) must be escaped
    m_out << "\"filename\":\"";
    for ( string::const_iterator c = a.Filename.begin(); c != a.Filename.end(); ++c ) {
        if ( *c == '\"' || *c == '\\' )
            m_out << '\\';
        m_out << *c;
    }
    m_out << "\"";

    // write tag data
    // each entry in the buffer is: 2-char tag name, 1-char type code, value
    const char* tagData = a.TagData.c_str();
    const size_t tagDataLength = a.TagData.length();
    size_t index = 0;
    if ( index < tagDataLength ) {

        // the separating comma belongs to "tags", so an alignment without
        // tags does not leave a trailing comma after "filename"
        m_out << ",\"tags\":{";
        
        while ( index < tagDataLength ) {

            if ( index > 0 )
                m_out << ",";
            
            // write tag name
            m_out << "\"" << a.TagData.substr(index, 2) << "\":";
            index += 2;
            
            // get data type
            char type = a.TagData.at(index);
            ++index;
            switch ( type ) {
                case (Constants::BAM_TAG_TYPE_ASCII) :
                    m_out << "\"" << tagData[index] << "\"";
                    ++index; 
                    break;
                
                // 8-bit integers: the buffer is plain char, whose signedness
                // is implementation-defined, so go through the explicitly
                // signed or unsigned type to get the right value everywhere
                case (Constants::BAM_TAG_TYPE_INT8)  :
                    m_out << static_cast<int>(static_cast<signed char>(tagData[index]));
                    ++index; 
                    break;

                case (Constants::BAM_TAG_TYPE_UINT8) :
                    m_out << static_cast<int>(static_cast<unsigned char>(tagData[index]));
                    ++index; 
                    break;
                
                case (Constants::BAM_TAG_TYPE_INT16) :
                    m_out << BamTools::UnpackSignedShort(&tagData[index]);
                    index += sizeof(int16_t);
                    break;

                case (Constants::BAM_TAG_TYPE_UINT16) :
                    m_out << BamTools::UnpackUnsignedShort(&tagData[index]);
                    index += sizeof(uint16_t);
                    break;
                    
                case (Constants::BAM_TAG_TYPE_INT32) :
                    m_out << BamTools::UnpackSignedInt(&tagData[index]);
                    index += sizeof(int32_t);
                    break;

                case (Constants::BAM_TAG_TYPE_UINT32) :
                    m_out << BamTools::UnpackUnsignedInt(&tagData[index]);
                    index += sizeof(uint32_t);
                    break;

                case (Constants::BAM_TAG_TYPE_FLOAT) :
                    m_out << BamTools::UnpackFloat(&tagData[index]);
                    index += sizeof(float);
                    break;
                
                // NUL-terminated text: copy up to the terminator (escaping
                // quotes and backslashes for JSON), then step over it
                case (Constants::BAM_TAG_TYPE_HEX)    :
                case (Constants::BAM_TAG_TYPE_STRING) :
                    m_out << "\""; 
                    while (tagData[index]) {
                        if (tagData[index] == '\"')
                            m_out << "\\\""; // escape for json
                        else if (tagData[index] == '\\')
                            m_out << "\\\\"; // escape for json
                        else
                            m_out << tagData[index];
                        ++index;
                    }
                    m_out << "\""; 
                    ++index; 
                    break;      
            }
            
            // a NUL where the next tag name should start ends the tag list
            if ( tagData[index] == '\0') 
                break;
        }

        m_out << "}";
    }

    m_out << "}" << endl;
}
// Tallies, per SNP and per read group, how many reads support the reference
// allele, the alternate allele, or some other base, separately for forward-
// and reverse-strand reads, and prints one tab-separated row per SNP to cout.
//
// Reads `bam_file` through `bam_reader` and takes the SNPs from
// `snps_by_chrom`; these, like `min_map_qual`, `min_base_qual` and
// `ascii_offset`, are declared outside this function.
// NOTE(review): presumably they are populated by Init(all_args) - confirm.
//
// Output columns: #CHROM, POS, REF, ALT, then one column per read group ID
// from the BAM header holding
//   fwd_ref,fwd_alt,fwd_other,rev_ref,rev_alt,rev_other
//
// NOTE(review): the end of this function (closing brace / return statement)
// is not present in this excerpt.
int main_asequantmultirg(const vector<string> &all_args)
{
    Init(all_args);
    
    cerr << "* Reading bam file " << endl;
    OpenBam(bam_reader, bam_file);
    bam_reader.OpenIndex(bam_file + ".bai");
    
    // Collect the read group IDs from the BAM header; they define the
    // per-read-group output columns and their order
    vector<string> readGroupVector;
    SamHeader header = bam_reader.GetHeader();
    SamReadGroupDictionary headerRG = header.ReadGroups;
    for (SamReadGroupIterator it = headerRG.Begin(); it != headerRG.End(); it ++)
    {
        readGroupVector.push_back(it -> ID);
    }
    
    
    vector<RefData> chroms = bam_reader.GetReferenceData();
    
    // Header row
    cout << "#CHROM" << "\t" << "POS" << "\t" << "REF" << "\t" << "ALT";
    for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++)
    {
        cout << "\t" << *it;
    }
    cout << endl;
    
    // Process one reference sequence at a time
    StlFor(chrom_idx, chroms)
    {
        string &chrom = chroms[chrom_idx].RefName;
        // Local copy of this chromosome's SNPs: the counts accumulated below
        // live in this copy, not in snps_by_chrom
        vector<Snp> snps = snps_by_chrom[chrom];
        
        int s = 0; // Index into snp array
        
        BamAlignment bam;
        bam_reader.Jump(chrom_idx);
        
        string align;
        string qualities;
        
        cerr << "* On chrom " << chrom << endl;

        // Walk the alignments until the reader moves on to another reference
        while (bam_reader.GetNextAlignment(bam) && bam.RefID == chrom_idx) 
        {
	  // Skip unmapped reads and reads below the mapping-quality threshold
	  if (bam.MapQuality < min_map_qual || !bam.IsMapped())
                continue;
       
            // Every counted read must carry a read group
            string currentRG;
            Assert(bam.GetReadGroup(currentRG));
            
            // NOTE(review): presumably the half-open reference interval
            // [start, end) covered by the read - confirm against
            // AlignStart / AlignEnd
            int start = AlignStart(bam);
            int end = AlignEnd(bam);
            
            // Move the current SNP pointer so that it is ahead of the read's start (since bam alignments are in sorted order)
            while (s < snps.size() && snps[s].pos < start)
                ++s;
            
            // Stop everything if we have visited all SNPs on this chrom
            if (s >= snps.size())
                break;
            
            // Find any/all SNPs that are within the bam alignment
            int n = 0; // Number of SNPs overlapped
            while ((s + n) < snps.size() && snps[s + n].pos < end) // Then it overlaps!
                ++n;
            
            // Now, look at each SNP and see which way it votes
            // NOTE(review): the indexing below assumes `align` and
            // `qualities` hold one entry per reference position starting at
            // `start`, with '-' marking positions the read does not cover -
            // confirm against AlignedString / AlignedQualities
            AlignedString(bam, align);
            AlignedQualities(bam, qualities);
            Assert(align.size() == qualities.size());

            // Now, tally votes
            for (int i = 0; i < n; ++i)
            {
                Snp &snp = snps[s + i];
                char base = align[snp.pos - start]; // Base from the read
                int qual = int(qualities[snp.pos - start]) - ascii_offset; // Base quality from the read, with ascii_offset removed
                
                //AssertMsg(qual >= 0 && qual <= 100, ToStr(qual) + "\n" + bam.Name + "\n" + CigarToStr(bam.CigarData) + "\n" + bam.QueryBases + "\n" + bam.Qualities);
                
                // Ignore gap characters and bases below the quality threshold
                if (base == '-' || qual < min_base_qual)
                    continue;
                
                // Counts are kept per strand, keyed by read group
                map<string, Counts> &RG_counts = bam.IsReverseStrand() ? snp.rev : snp.fwd;
                
                map<string, Counts>::iterator searchIt = RG_counts.find(currentRG);
                
                // First observation for this read group: create the entry and
                // set all three counters explicitly
                if (searchIt == RG_counts.end())
                {
                    if (base == snp.ref)
                    {
                        RG_counts[currentRG].num_ref = 1;
                        RG_counts[currentRG].num_alt = 0;
                        RG_counts[currentRG].num_other = 0;
                    }
                    else if (base == snp.alt)
                    {
                        RG_counts[currentRG].num_ref = 0;
                        RG_counts[currentRG].num_alt = 1;
                        RG_counts[currentRG].num_other = 0;
                    }
                    else
                    {
                        RG_counts[currentRG].num_ref = 0;
                        RG_counts[currentRG].num_alt = 0;
                        RG_counts[currentRG].num_other = 1;
                    }
                }
                // Existing entry: bump the matching counter
                else
                {
                    if (base == snp.ref)
                    {
                        searchIt -> second.num_ref += 1;
                    }
                    else if (base == snp.alt)
                    {
                        searchIt -> second.num_alt += 1;
                    }
                    else
                    {
                        searchIt -> second.num_other += 1;
                    }
                }
            }
        }
        
        // Output counts
        // One row per SNP (position printed as pos + 1); read groups with no
        // observations on a strand are reported as 0,0,0
        for (int s = 0; s < snps.size(); ++s)
        {
            cout << chrom << "\t" << snps[s].pos + 1 << "\t" << snps[s].ref << "\t" << snps[s].alt;
            for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++)
            {
                // Forward-strand counts, followed by a comma
                map<string, Counts>::iterator searchIt = snps[s].fwd.find(*it);
                if (searchIt != snps[s].fwd.end())
                {
                    cout << "\t" << searchIt -> second.num_ref << "," << searchIt -> second.num_alt << "," << searchIt -> second.num_other << ",";
                }
                else
                {
                    cout << "\t" << "0,0,0,";
                }
                // Reverse-strand counts
                searchIt = snps[s].rev.find(*it);
                if (searchIt != snps[s].rev.end())
                {
                    cout << searchIt -> second.num_ref << "," << searchIt -> second.num_alt << "," << searchIt -> second.num_other;
                }
                else
                {
                    cout << "0,0,0";
                }
            }
            cout << endl;
        }
    }
Exemple #29
0
// Computes per-test-fragment (TF) read statistics from a BAM file and writes
// them to a JSON file.
//
// Command-line options (parsed with OptArgs):
//   -i / --input             input BAM file (required)
//   -r / --ref               FASTA file handed to PopulateReferenceSequences (required)
//   -o / --output            output JSON file (default "ionstats_tf.json")
//   -h / --histogram-length  size used to initialize the histograms (default 400)
//
// Each reference sequence in the BAM is treated as one TF. For every mapped,
// forward-strand read that starts at reference position <= 5 and carries an
// MD tag, the CIGAR and MD tag are walked together to count aligned bases and
// errors; the results feed the called/aligned/AQ10/AQ17 length histograms,
// the error-by-position histogram, the SNR metric and the homopolymer
// accuracy metric of that TF. Only TFs with at least 1000 reads in the
// aligned histogram are written to the output.
//
// Returns 0 on success, 1 when required options are missing, the BAM cannot
// be opened, it has no read groups, or the output file cannot be written.
int IonstatsTestFragments(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string input_bam_filename   = opts.GetFirstString('i', "input", "");
  string fasta_filename       = opts.GetFirstString('r', "ref", "");
  string output_json_filename = opts.GetFirstString('o', "output", "ionstats_tf.json");
  int histogram_length        = opts.GetFirstInt   ('h', "histogram-length", 400);

  if(argc < 2 or input_bam_filename.empty() or fasta_filename.empty()) {
    IonstatsTestFragmentsHelp();
    return 1;
  }

  //
  // Prepare for metric calculation
  //

  map<string,string> tf_sequences;
  PopulateReferenceSequences(tf_sequences, fasta_filename);


  BamReader input_bam;
  if (!input_bam.Open(input_bam_filename)) {
    fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str());
    return 1;
  }

  int num_tfs = input_bam.GetReferenceCount();


  SamHeader sam_header = input_bam.GetHeader();
  if(!sam_header.HasReadGroups()) {
    fprintf(stderr, "[ionstats] ERROR: no read groups in %s\n", input_bam_filename.c_str());
    return 1;
  }

  // Flow order and key sequence come from the read groups; when several read
  // groups define them, the last one wins.
  string flow_order;
  string key;
  for (SamReadGroupIterator rg = sam_header.ReadGroups.Begin(); rg != sam_header.ReadGroups.End(); ++rg) {
    if(rg->HasFlowOrder())
      flow_order = rg->FlowOrder;
    if(rg->HasKeySequence())
      key = rg->KeySequence;
  }


  // Need these metrics stratified by TF.

  vector<ReadLengthHistogram> called_histogram(num_tfs);
  vector<ReadLengthHistogram> aligned_histogram(num_tfs);
  vector<ReadLengthHistogram> AQ10_histogram(num_tfs);
  vector<ReadLengthHistogram> AQ17_histogram(num_tfs);
  vector<SimpleHistogram> error_by_position(num_tfs);
  vector<MetricGeneratorSNR> system_snr(num_tfs);
  vector<MetricGeneratorHPAccuracy> hp_accuracy(num_tfs);

  for (int tf = 0; tf < num_tfs; ++tf) {
    called_histogram[tf].Initialize(histogram_length);
    aligned_histogram[tf].Initialize(histogram_length);
    AQ10_histogram[tf].Initialize(histogram_length);
    AQ17_histogram[tf].Initialize(histogram_length);
    error_by_position[tf].Initialize(histogram_length);
  }

  vector<uint16_t> flow_signal_fz(flow_order.length());
  vector<int16_t> flow_signal_zm(flow_order.length());

  const RefVector& refs = input_bam.GetReferenceData();

  // Missing:
  //  - hp accuracy - tough, copy verbatim from TFMapper?


  // Scratch buffers reused for every read: the MD tag split into parallel
  // lists of operations ('M' match, 'X' substitution, 'D' deletion) and
  // their lengths.
  BamAlignment alignment;
  vector<char>  MD_op;
  vector<int>   MD_len;
  MD_op.reserve(1024);
  MD_len.reserve(1024);
  string MD_tag;

  //
  // Main loop over mapped reads in the input BAM
  //

  while(input_bam.GetNextAlignment(alignment)) {


    if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag))
      continue;

    // The check below eliminates unexpected alignments
    if (alignment.IsReverseStrand() or alignment.Position > 5)
      continue;

    int current_tf = alignment.RefID;

    // current_tf indexes every per-TF vector below; skip records whose
    // reference id does not name one of the references in the header
    if (current_tf < 0 or current_tf >= num_tfs)
      continue;

    //
    // Step 1. Parse MD tag
    //

    MD_op.clear();
    MD_len.clear();

    for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) {

      const char *item_start = MD_ptr;
      int item_length = 0;
      if (*MD_ptr >= '0' and *MD_ptr <= '9') {    // Its a match
        MD_op.push_back('M');
        for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr)
          item_length = 10*item_length + *MD_ptr - '0';
      } else {
        if (*MD_ptr == '^') {                     // Its a deletion
          MD_ptr++;
          MD_op.push_back('D');
        } else                                    // Its a substitution
          MD_op.push_back('X');
        for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr)
          item_length++;

        // A character that is neither a digit, '^' nor A-Z is consumed by
        // none of the loops above. Step over it so a malformed tag cannot
        // keep this loop spinning (and growing MD_op/MD_len) forever; the
        // zero-length item it leaves behind is skipped in step 2.
        if (MD_ptr == item_start)
          ++MD_ptr;
      }
      MD_len.push_back(item_length);
    }

    //
    // Step 2. Synchronously scan through Cigar and MD, doing error accounting
    //

    // Reverse-strand reads were filtered out above, so in practice the scan
    // always starts at index 0 and moves forward.
    int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0;
    int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0;
    int increment = alignment.IsReverseStrand() ? -1 : 1;

    int AQ10_bases = 0;
    int AQ17_bases = 0;
    int num_bases = 0;
    int num_errors = 0;

    // Lengths in CigarData and MD_len are consumed in place: each pass takes
    // the overlap of the current CIGAR op and MD item off both of them, and
    // an entry is stepped over once its remaining length reaches zero.
    while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) {

      if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar
        cigar_idx += increment;
        continue;
      }
      if (MD_len[MD_idx] == 0) { // Try advancing MD
        MD_idx += increment;
        continue;
      }

      // Match
      if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        num_bases += advance;
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Insertion (read has a base, reference doesn't)
      // Insertions do not appear in the MD tag, so only the CIGAR advances
      } else if (alignment.CigarData[cigar_idx].Type == 'I') {
        int advance = alignment.CigarData[cigar_idx].Length;
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_bases++;
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;

      // Deletion (reference has a base, read doesn't)
      // Counts as errors but does not add read bases
      } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Substitution
      } else if (MD_op[MD_idx] == 'X') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_bases++;
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      } else {
        printf("ionstats tf: Unexpected OP combination: %s Cigar=%c, MD=%c !\n",
            alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]);
        break;
      }

      // Track the longest prefix whose error rate is still at most 10%
      // (AQ10) or at most 2% (AQ17)
      if (num_errors*10 <= num_bases)   AQ10_bases = num_bases;
      if (num_errors*50 <= num_bases)   AQ17_bases = num_bases;
    }

    //
    // Step 3. Profit
    //

    called_histogram[current_tf].Add(alignment.Length);
    aligned_histogram[current_tf].Add(num_bases);
    AQ10_histogram[current_tf].Add(AQ10_bases);
    AQ17_histogram[current_tf].Add(AQ17_bases);

    // Flow signal for SNR: prefer the "ZM" tag, fall back to "FZ"
    if(alignment.GetTag("ZM", flow_signal_zm))
      system_snr[current_tf].Add(flow_signal_zm, key.c_str(), flow_order);
    else if(alignment.GetTag("FZ", flow_signal_fz))
      system_snr[current_tf].Add(flow_signal_fz, key.c_str(), flow_order);


    // HP accuracy - keeping it simple

    if (!alignment.IsReverseStrand()) {

      // Walk the flow order over the expected sequence (key + TF reference)
      // and the called sequence (key + read) side by side; for each flow,
      // the homopolymer length is the run of bases equal to that flow's
      // nucleotide at the current position of each sequence.
      string genome = key + tf_sequences[refs[current_tf].RefName];
      string calls = key + alignment.QueryBases;
      const char *genome_ptr = genome.c_str();
      const char *calls_ptr = calls.c_str();

      for (int flow = 0; flow < (int)flow_order.length() and *genome_ptr and *calls_ptr; ++flow) {
        int genome_hp = 0;
        int calls_hp = 0;
        while (*genome_ptr == flow_order[flow]) {
          genome_hp++;
          genome_ptr++;
        }
        while (*calls_ptr == flow_order[flow]) {
          calls_hp++;
          calls_ptr++;
        }
        hp_accuracy[current_tf].Add(genome_hp, calls_hp);
      }
    }
  }



  //
  // Processing complete, generate ionstats_tf.json
  //

  Json::Value output_json(Json::objectValue);
  output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL));
  output_json["meta"]["format_name"] = "ionstats_tf";
  output_json["meta"]["format_version"] = "1.0";

  output_json["results_by_tf"] = Json::objectValue;

  for (int tf = 0; tf < num_tfs; ++tf) {

    // TFs with fewer than 1000 reads are left out of the report
    if (aligned_histogram[tf].num_reads() < 1000)
      continue;

    called_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["full"]);
    aligned_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["aligned"]);
    AQ10_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ10"]);
    AQ17_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ17"]);
    error_by_position[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["error_by_position"]);
    system_snr[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]);
    hp_accuracy[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]);

    output_json["results_by_tf"][refs[tf].RefName]["sequence"] = tf_sequences[refs[tf].RefName];
  }

  input_bam.Close();

  ofstream out(output_json_filename.c_str(), ios::out);
  if (out.good()) {
    out << output_json.toStyledString();
    return 0;
  } else {
    fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str());
    return 1;
  }
}
void PileupEngine::PileupEnginePrivate::ParseAlignmentCigar(const BamAlignment& al) {
  
    // skip if unmapped
    if ( !al.IsMapped() ) return;
    
    // intialize local variables
    int  genomePosition      = al.Position;
    int  positionInAlignment = 0;
    bool isNewReadSegment    = true;
    bool saveAlignment       = true;    
    PileupAlignment pileupAlignment(al);
    
    // iterate over CIGAR operations
    const int numCigarOps = (const int)al.CigarData.size();
    for (int i = 0; i < numCigarOps; ++i ) { 
        const CigarOp& op = al.CigarData.at(i);
      
        // if op is MATCH
        if ( op.Type == 'M' ) {
          
            // if match op overlaps current position
            if ( genomePosition + (int)op.Length > CurrentPosition ) {
              
                // set pileup data
                pileupAlignment.IsCurrentDeletion   = false;
                pileupAlignment.IsNextDeletion      = false;
                pileupAlignment.IsNextInsertion     = false;
                pileupAlignment.PositionInAlignment = positionInAlignment + (CurrentPosition - genomePosition);
                
                // check for beginning of read segment
                if ( genomePosition == CurrentPosition && isNewReadSegment ) 
                    pileupAlignment.IsSegmentBegin = true;
                
                // if we're at the end of a match operation
                if ( genomePosition + (int)op.Length - 1 == CurrentPosition ) {
                    
                    // if not last operation
                    if ( i < numCigarOps - 1 ) {
                        
                        // check next CIGAR op
                        const CigarOp& nextOp = al.CigarData.at(i+1);
                        
                        // if next CIGAR op is DELETION
                        if ( nextOp.Type == 'D') {
                            pileupAlignment.IsNextDeletion = true;
                            pileupAlignment.DeletionLength = nextOp.Length;
                        }
                        
                        // if next CIGAR op is INSERTION
                        else if ( nextOp.Type == 'I' ) {
                            pileupAlignment.IsNextInsertion = true;
                            pileupAlignment.InsertionLength = nextOp.Length;
                        }
                            
                        // if next CIGAR op is either DELETION or INSERTION
                        if ( nextOp.Type == 'D' || nextOp.Type == 'I' ) {

                            // if there is a CIGAR op after the DEL/INS
                            if ( i < numCigarOps - 2 ) {
                                const CigarOp& nextNextOp = al.CigarData.at(i+2);
                                
                                // if next CIGAR op is clipping or ref_skip
                                if ( nextNextOp.Type == 'S' || 
                                     nextNextOp.Type == 'N' ||
                                     nextNextOp.Type == 'H' )
                                    pileupAlignment.IsSegmentEnd = true;
                            } 
                            else {
                                pileupAlignment.IsSegmentEnd = true;
                                
                                // if next CIGAR op is clipping or ref_skip
                                if ( nextOp.Type == 'S' || 
                                     nextOp.Type == 'N' ||
                                     nextOp.Type == 'H' )
                                    pileupAlignment.IsSegmentEnd = true;
                            }
                        }
                        
                        // otherwise
                        else { 
                        
                            // if next CIGAR op is clipping or ref_skip
                            if ( nextOp.Type == 'S' || 
                                 nextOp.Type == 'N' ||
                                 nextOp.Type == 'H' )
                                pileupAlignment.IsSegmentEnd = true;
                        }
                    }
                    
                    // else this is last operation
                    else pileupAlignment.IsSegmentEnd = true;
                }
            }
          
            // increment markers
            genomePosition      += op.Length;
            positionInAlignment += op.Length;
        } 
        
        // if op is DELETION
        else if ( op.Type == 'D' ) {
          
            // if deletion op overlaps current position
            if ( genomePosition + (int)op.Length > CurrentPosition ) {
              
                // set pileup data
                pileupAlignment.IsCurrentDeletion   = true;
                pileupAlignment.IsNextDeletion      = false;
                pileupAlignment.IsNextInsertion     = true;
                pileupAlignment.PositionInAlignment = positionInAlignment + (CurrentPosition - genomePosition);
            }
            
            // increment marker
            genomePosition += op.Length;
        }

        // if op is REF_SKIP
        else if ( op.Type == 'N' ) {
            genomePosition += op.Length;
        }
        
        // if op is INSERTION or SOFT_CLIP
        else if ( op.Type == 'I' || op.Type == 'S' ) {
            positionInAlignment += op.Length;
        }
        
        // checl for beginning of new read segment
        if ( op.Type == 'N' ||
             op.Type == 'S' ||
             op.Type == 'H' )
            isNewReadSegment = true;
        else 
            isNewReadSegment = false;
      
        // if we've moved beyond current position
        if ( genomePosition > CurrentPosition ) {
            if ( op.Type == 'N' ) saveAlignment = false; // ignore alignment if REF_SKIP
            break;
        }
    }

    // save pileup position if flag is true
    if ( saveAlignment )
        CurrentPileupData.PileupAlignments.push_back( pileupAlignment );
}