Пример #1
0
// Parsing a CIGAR string that holds several operations must yield one
// CigarOperation per "<length><type>" token, in the order they appear.
TEST(CigarStringTest, FromStdString_MultipleOps)
{
    const string input = "100=2D34I6=6X6=";

    Cigar parsed = Cigar::FromStdString(input);

    // Stop here if the count is wrong: the at() calls below assume six entries.
    ASSERT_TRUE(parsed.size() == 6);

    // "100="
    const CigarOperation& first = parsed.at(0);
    EXPECT_TRUE(first.Char()   == '=');
    EXPECT_TRUE(first.Length() == 100);

    // "2D"
    const CigarOperation& second = parsed.at(1);
    EXPECT_TRUE(second.Char()   == 'D');
    EXPECT_TRUE(second.Length() == 2);

    // "34I"
    const CigarOperation& third = parsed.at(2);
    EXPECT_TRUE(third.Char()   == 'I');
    EXPECT_TRUE(third.Length() == 34);

    // "6="
    const CigarOperation& fourth = parsed.at(3);
    EXPECT_TRUE(fourth.Char()   == '=');
    EXPECT_TRUE(fourth.Length() == 6);

    // "6X"
    const CigarOperation& fifth = parsed.at(4);
    EXPECT_TRUE(fifth.Char()   == 'X');
    EXPECT_TRUE(fifth.Length() == 6);

    // "6="
    const CigarOperation& sixth = parsed.at(5);
    EXPECT_TRUE(sixth.Char()   == '=');
    EXPECT_TRUE(sixth.Length() == 6);
}
Пример #2
0
/**
 * \brief Builds a gap-free CIGAR (soft clip / align / soft clip) for the fragment
 *        and hands it to updateFragmentCigar.
 *
 * The read's strand sequence is narrowed by adapter clipping, read masking and
 * reference clipping. Whatever is trimmed from either end becomes a SOFT_CLIP
 * operation and the remaining span becomes a single ALIGN operation. The
 * operations are appended to \p cigarBuffer starting at its current size.
 *
 * \return the value returned by updateFragmentCigar; when that value is 0 the
 *         fragment is marked unaligned before returning.
 */
unsigned UngappedAligner::alignUngapped(
    FragmentMetadata &fragmentMetadata,
    Cigar &cigarBuffer,
    const flowcell::ReadMetadata &readMetadata,
    const matchSelector::FragmentSequencingAdapterClipper &adapterClipper,
    const reference::ContigList &contigList,
    const isaac::reference::ContigAnnotations &contigAnnotations) const
{
    // Operations for this fragment are appended after whatever the buffer already holds.
    const unsigned cigarOffset = cigarBuffer.size();

    // The alignment is deliberately not reset here (no fragmentMetadata.resetAlignment())
    // so that the seed-based anchors are preserved.
    ISAAC_ASSERT_MSG(!fragmentMetadata.isAligned(), "alignUngapped is expected to be performend on a clean fragment");
    fragmentMetadata.resetClipping();

    const reference::Contig &contig = contigList[fragmentMetadata.contigId];
    const Read &read = fragmentMetadata.getRead();
    const std::vector<char> &strandSequence = read.getStrandSequence(fragmentMetadata.reverse);

    // [keptBegin, keptEnd) shrinks as each clipping step trims the ends.
    std::vector<char>::const_iterator keptBegin = strandSequence.begin();
    std::vector<char>::const_iterator keptEnd = strandSequence.end();

    adapterClipper.clip(contig, fragmentMetadata, keptBegin, keptEnd);
    clipReadMasking(read, fragmentMetadata, keptBegin, keptEnd);
    clipReference(contig.size(), fragmentMetadata, keptBegin, keptEnd);

    // Lengths of the three segments the read has been split into.
    const unsigned leadingClip = std::distance(strandSequence.begin(), keptBegin);
    const unsigned alignedBases = std::distance(keptBegin, keptEnd);
    const unsigned trailingClip = std::distance(keptEnd, strandSequence.end());

    // Zero-length operations are never emitted.
    if (leadingClip)
    {
        cigarBuffer.addOperation(leadingClip, Cigar::SOFT_CLIP);
    }
    if (alignedBases)
    {
        cigarBuffer.addOperation(alignedBases, Cigar::ALIGN);
    }
    if (trailingClip)
    {
        cigarBuffer.addOperation(trailingClip, Cigar::SOFT_CLIP);
    }

    const unsigned result = updateFragmentCigar(
        readMetadata, contigList, contigAnnotations, fragmentMetadata,
        fragmentMetadata.reverse, fragmentMetadata.contigId, fragmentMetadata.position, cigarBuffer, cigarOffset);

    if (0 == result)
    {
        fragmentMetadata.setUnaligned();
    }

    return result;
}
Пример #3
0
// A Cigar holding a single 100-long SEQUENCE_MATCH must serialize to "100=".
TEST(CigarStringTest, ToStdString_SingleOp)
{
    Cigar cigar;
    const CigarOperation match(CigarOperationType::SEQUENCE_MATCH, 100);
    cigar.push_back(match);

    const string expected = "100=";
    EXPECT_EQ(expected, cigar.ToStdString());
}
Пример #4
0
// Returns the larger of the record's two clip counts, signed by which end it is on:
// the begin-clip count as-is when it is >= the end-clip count, otherwise the
// end-clip count negated.
int getMaxClipLen( SamRecord & sam_rec )
{
	Cigar * cigar_info = sam_rec.getCigarInfo();
	const int leading = cigar_info->getNumBeginClips();
	const int trailing = cigar_info->getNumEndClips();
	return (leading >= trailing) ? leading : -trailing;
}
Пример #5
0
// Parsing "100=" must yield exactly one operation: type '=' with length 100.
TEST(CigarStringTest, FromStdString_SingleOp)
{
    const string input = "100=";
    Cigar parsed = Cigar::FromStdString(input);

    // front() below is only meaningful when exactly one operation was parsed.
    ASSERT_TRUE(parsed.size() == 1);

    const CigarOperation& only = parsed.front();
    EXPECT_TRUE(only.Char()   == '=');
    EXPECT_TRUE(only.Length() == 100);
}
Пример #6
0
// A Cigar built from six operations must serialize to the concatenation of
// their "<length><type>" tokens, in insertion order.
TEST(CigarStringTest, ToStdString_MultipleOps)
{
    Cigar cigar;

    // 100=
    cigar.push_back(CigarOperation(CigarOperationType::SEQUENCE_MATCH, 100));
    // 2D
    cigar.push_back(CigarOperation(CigarOperationType::DELETION, 2));
    // 34I
    cigar.push_back(CigarOperation(CigarOperationType::INSERTION, 34));
    // 6=
    cigar.push_back(CigarOperation(CigarOperationType::SEQUENCE_MATCH, 6));
    // 6X
    cigar.push_back(CigarOperation(CigarOperationType::SEQUENCE_MISMATCH, 6));
    // 6=
    cigar.push_back(CigarOperation(CigarOperationType::SEQUENCE_MATCH, 6));

    const string expected = "100=2D34I6=6X6=";
    EXPECT_EQ(expected, cigar.ToStdString());
}
Пример #7
0
// Folds one more read into an existing cluster.
//
//   is_in_coord : one flag per MEI type (must hold exactly NMEI entries); a set
//                 flag increments that type's left or right counter.
//   new_site    : the cluster being updated in place.
//   rec         : the read being added.
//
// Updates the breakpoint estimate, the evidence count, the [start, end] span,
// the clip-only flags and the per-type left/right counters.
void Sites::addToCurrentCluster( vector<bool> & is_in_coord, SingleSite & new_site, SamRecord & rec )
{
	if (is_in_coord.size() != NMEI)
		morphError("[Sites::addToCurrentCluster] is_in_coord size error");

	// update breakpoint: incremental mean over all evidence seen so far,
	// i.e. new = ep/(n+1) + old*n/(n+1), rounded
	int old_evi = new_site.evidence;
	float a1 = (float)1 / float(old_evi+1);
	int ep = getEstimatedBreakPoint(rec);
	new_site.breakp = round( a1 * (float)ep + (float)new_site.breakp * (1-a1));
	new_site.evidence++; 

	// update position: the two bounds are independent, so a read that sticks
	// out on both sides must extend both of them
	if (rec.get1BasedPosition() < new_site.start)
		new_site.start = rec.get1BasedPosition();
	if (rec.get1BasedAlignmentEnd() > new_site.end)
		new_site.end = rec.get1BasedAlignmentEnd();

	// update info
	if (rec.getFlag() & 0x10) {
		// flag bit 0x10 set: counts towards the right side
		if (new_site.right_clip_only) {
			// a single read with fewer than MIN_CLIP/2 begin clips clears the flag
			Cigar * myCigar = rec.getCigarInfo();
			int begin_clip = myCigar->getNumBeginClips();
			if ( begin_clip < MIN_CLIP/2)
				new_site.right_clip_only = 0;	
		}
		for(int m=0; m<NMEI; m++) {
			if (is_in_coord[m])
				new_site.right[m]++;
		}	
	}
	else {
		// flag bit 0x10 clear: counts towards the left side
		if (new_site.left_clip_only) {
			// a single read with fewer than MIN_CLIP/2 end clips clears the flag
			Cigar * myCigar = rec.getCigarInfo();
			int end_clip = myCigar->getNumEndClips();
			if (end_clip < MIN_CLIP/2)
				new_site.left_clip_only = 0;			
		}
		for( int m=0; m<NMEI; m++) {
			if (is_in_coord[m])
				new_site.left[m]++;
		}
	}
}
Пример #8
0
// Initializes new_site as a fresh cluster seeded by a single read.
//
//   is_in_coord : one flag per MEI type (must hold exactly NMEI entries).
//   new_site    : the cluster record to (re)initialize.
//   rec         : the seeding read.
//
// Counters start at 1 for the side the read falls on (right when flag bit 0x10
// is set, left otherwise) and 0 elsewhere. The matching *_clip_only flag is
// cleared when the read has fewer than MIN_CLIP/2 clips on the relevant end.
void Sites::setNewCluster( vector<bool> & is_in_coord, SingleSite & new_site, SamRecord & rec )
{
	if (is_in_coord.size() != NMEI)
		morphError("[Sites::setNewCluster] is_in_coord size error");

	// scalar fields
	new_site.breakp = getEstimatedBreakPoint(rec);
	new_site.rcount = 1;
	new_site.evidence = 1;
	new_site.left_clip_only = 1;
	new_site.right_clip_only = 1;
	new_site.depth = current_depth;
	new_site.depth_add = 1;

	// the span is taken from the read regardless of which side it anchors
	const bool right_anchor = (rec.getFlag() & 0x10) != 0;
	new_site.start = rec.get1BasedPosition();
	new_site.end = rec.get1BasedAlignmentEnd();

	// clip-only flag for the anchoring side
	Cigar * cigar_info = rec.getCigarInfo();
	if (right_anchor) {
		const int begin_clip = cigar_info->getNumBeginClips();
		if (begin_clip < MIN_CLIP/2)
			new_site.right_clip_only = 0;
	}
	else {
		const int end_clip = cigar_info->getNumEndClips();
		if (end_clip < MIN_CLIP/2)
			new_site.left_clip_only = 0;
	}

	// per-type counters: zero everything, then mark the anchoring side
	for (int m = 0; m < NMEI; m++) {
		new_site.left[m] = 0;
		new_site.right[m] = 0;
		if (!is_in_coord[m])
			continue;
		if (right_anchor)
			new_site.right[m] = 1;
		else
			new_site.left[m] = 1;
	}
}
Пример #9
0
/**
 * Shifts the indels of an alignment as far left as the sequence context allows
 * and rewrites `cigar` accordingly.
 *
 * @param alternateSequence  the read/alternate sequence the cigar describes
 * @param cigar              (length, operation) pairs; replaced by the rebuilt
 *                           cigar whenever at least one indel is present
 * @param referenceSequence  the reference sequence the cigar aligns against
 * @param debug              only consulted when compiled with VERBOSE_DEBUG
 * @return true if the textual form of the rebuilt cigar differs from the
 *         input cigar, false if it is unchanged or the input has no indels
 *
 * The rebuilt cigar contains only S, M, I and D operations: H operations are
 * not re-emitted and N spans are folded into the surrounding M lengths.
 * Calls exit(1) if an indel ends up to the left of the preceding one.
 */
bool leftAlign(string& alternateSequence, Cigar& cigar, string& referenceSequence, bool debug = false) {

    int arsOffset = 0; // pointer to insertion point in aligned reference sequence
    string alignedReferenceSequence = referenceSequence;
    int aabOffset = 0;
    string alignmentAlignedBases = alternateSequence;

    // store information about the indels
    vector<VCFIndelAllele> indels;

    int rp = 0;  // read position, 0-based relative to read
    int sp = 0;  // sequence position

    string softBegin;
    string softEnd;

    // textual forms of the cigar before and after realignment; compared at the
    // end to decide the return value
    stringstream cigar_before, cigar_after;
    for (vector<pair<int, string> >::const_iterator c = cigar.begin();
        c != cigar.end(); ++c) {
        unsigned int l = c->first;
        char t = c->second.at(0);

        cigar_before << l << t;
        if (t == 'M') { // match or mismatch
            sp += l;
            rp += l;
        } else if (t == 'D') { // deletion
            indels.push_back(VCFIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l)));
            alignmentAlignedBases.insert(rp + aabOffset, string(l, '-'));
            aabOffset += l;
            sp += l;  // update reference sequence position
        } else if (t == 'I') { // insertion
            indels.push_back(VCFIndelAllele(true, l, sp, rp, alternateSequence.substr(rp, l)));
            alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-'));
            arsOffset += l;
            rp += l;
        } else if (t == 'S') { // soft clip, clipped sequence present in the read not matching the reference
            // remove these bases from the refseq and read seq, but don't modify the alignment sequence
            if (rp == 0) {
                alignedReferenceSequence = string(l, '*') + alignedReferenceSequence;
                softBegin = alignmentAlignedBases.substr(0, l);
            } else {
                alignedReferenceSequence = alignedReferenceSequence + string(l, '*');
                softEnd = alignmentAlignedBases.substr(alignmentAlignedBases.size() - l, l);
            }
            rp += l;
        } else if (t == 'H') { // hard clip on the read, clipped sequence is not present in the read
        } else if (t == 'N') { // skipped region in the reference not present in read, aka splice
            sp += l;
        }
    }


    // total reference span covered by the alignment; used to emit the trailing M
    int alignedLength = sp;

    VCFLEFTALIGN_DEBUG("| " << cigar_before.str() << endl
       << "| " << alignedReferenceSequence << endl
       << "| " << alignmentAlignedBases << endl);

    // if no indels, return the alignment
    if (indels.empty()) { return false; }

    // for each indel, from left to right
    //     while the indel sequence repeated to the left and we're not matched up with the left-previous indel
    //         move the indel left

    vector<VCFIndelAllele>::iterator previous = indels.begin();
    for (vector<VCFIndelAllele>::iterator id = indels.begin(); id != indels.end(); ++id) {

        // left shift by repeats
        //
        // from 1 base to the length of the indel, attempt to shift left
        // if the move would cause no change in alignment optimality (no
        // introduction of mismatches, and by definition no change in gap
        // length), move to the new position.
        // in practice this moves the indel left when we reach the size of
        // the repeat unit.
        //
        VCFIndelAllele& indel = *id;
        int i = 1;
        while (i <= indel.length) {

            int steppos = indel.position - i;
            int readsteppos = indel.readPosition - i;

#ifdef VERBOSE_DEBUG
            if (debug) {
                if (steppos >= 0 && readsteppos >= 0) {
                    cerr << referenceSequence.substr(steppos, indel.length) << endl;
                    cerr << alternateSequence.substr(readsteppos, indel.length) << endl;
                    cerr << indel.sequence << endl;
                }
            }
#endif
            while (steppos >= 0 && readsteppos >= 0
                   && indel.sequence == referenceSequence.substr(steppos, indel.length)
                   && indel.sequence == alternateSequence.substr(readsteppos, indel.length)
                   && (id == indels.begin()
                       || (previous->insertion && steppos >= previous->position)
                       || (!previous->insertion && steppos >= previous->position + previous->length))) {
                VCFLEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " shifting " << i << "bp left" << endl);
                indel.position -= i;
                indel.readPosition -= i;
                steppos = indel.position - i;
                readsteppos = indel.readPosition - i;
            }
            // advance to the next step size that divides the indel length
            do {
                ++i;
            } while (i <= indel.length && indel.length % i != 0);
        }

        // left shift indels with exchangeable flanking sequence
        //
        // for example:
        //
        //    GTTACGTT           GTTACGTT
        //    GT-----T   ---->   G-----TT
        //
        // GTGTGACGTGT           GTGTGACGTGT
        // GTGTG-----T   ---->   GTG-----TGT
        //
        // GTGTG-----T           GTG-----TGT
        // GTGTGACGTGT   ---->   GTGTGACGTGT
        //
        //
        int steppos = indel.position - 1;
        int readsteppos = indel.readPosition - 1;
        while (steppos >= 0 && readsteppos >= 0
               && alternateSequence.at(readsteppos) == referenceSequence.at(steppos)
               && alternateSequence.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
               && (id == indels.begin()
                   || (previous->insertion && indel.position - 1 >= previous->position)
                   || (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) {
            VCFLEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " exchanging bases " << 1 << "bp left" << endl);
            // rotate the indel sequence right by one base to match its new position
            indel.sequence = indel.sequence.at(indel.sequence.size() - 1) + indel.sequence.substr(0, indel.sequence.size() - 1);
            indel.position -= 1;
            indel.readPosition -= 1;
            steppos = indel.position - 1;
            readsteppos = indel.readPosition - 1;
        }
        // tracks previous indel, so we don't run into it with the next shift
        previous = id;
    }

    // bring together floating indels
    // from left to right
    // check if we could merge with the next indel
    // if so, adjust so that we will merge in the next step
    if (indels.size() > 1) {
        previous = indels.begin();
        for (vector<VCFIndelAllele>::iterator id = (indels.begin() + 1); id != indels.end(); ++id) {
            VCFIndelAllele& indel = *id;
            // parsimony: could we shift right and merge with the previous indel?
            // if so, do it
            int prev_end_ref = previous->insertion ? previous->position : previous->position + previous->length;
            int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length;
            // only indels of the same type separated by a gap on both the
            // reference and the read are candidates; an insertion occupies
            // `length` bases of the read, a deletion `length` bases of the reference
            if (previous->insertion == indel.insertion
                    && ((previous->insertion
                        && (previous->position < indel.position
                        && previous->readPosition + previous->length < indel.readPosition))
                        ||
                        (!previous->insertion
                        && (previous->position + previous->length < indel.position)
                        && (previous->readPosition < indel.readPosition)
                        ))) {
                if (previous->homopolymer()) {
                    string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref);
                    string readseq = alternateSequence.substr(prev_end_read, indel.position - prev_end_ref);
                    VCFLEFTALIGN_DEBUG("seq: " << seq << endl << "readseq: " << readseq << endl);
                    if (previous->sequence.at(0) == seq.at(0)
                            && FBhomopolymer(seq)
                            && FBhomopolymer(readseq)) {
                        VCFLEFTALIGN_DEBUG("moving " << *previous << " right to " 
                                << (indel.insertion ? indel.position : indel.position - previous->length) << endl);
                        previous->position = indel.insertion ? indel.position : indel.position - previous->length;
                    }
                } 
                else {
                    int pos = previous->position;
                    while (pos < (int) referenceSequence.length() &&
                            ((previous->insertion && pos + previous->length <= indel.position)
                            ||
                            (!previous->insertion && pos + previous->length < indel.position))
                            && previous->sequence 
                                == referenceSequence.substr(pos + previous->length, previous->length)) {
                        pos += previous->length;
                    }
                    // NOTE(review): pos starts at previous->position and is only ever
                    // increased by the loop above, so `pos < previous->position` cannot
                    // hold and this branch never runs. Left as-is until the intended
                    // comparison is confirmed.
                    if (pos < previous->position &&
                        ((previous->insertion && pos + previous->length == indel.position)
                        ||
                        (!previous->insertion && pos == indel.position - previous->length))
                       ) {
                        VCFLEFTALIGN_DEBUG("right-merging tandem repeat: moving " << *previous << " right to " << pos << endl);
                        previous->position = pos;
                    }
                }
            }
            previous = id;
        }
    }

    // for each indel
    //     if ( we're matched up to the previous insertion (or deletion) 
    //          and it's also an insertion or deletion )
    //         merge the indels
    //
    // and simultaneously reconstruct the cigar

    Cigar newCigar;

    if (!softBegin.empty()) {
        newCigar.push_back(make_pair(softBegin.size(), "S"));
    }

    vector<VCFIndelAllele>::iterator id = indels.begin();
    VCFIndelAllele last = *id++;
    if (last.position > 0) {
        newCigar.push_back(make_pair(last.position, "M"));
        newCigar.push_back(make_pair(last.length, (last.insertion ? "I" : "D")));
    } else {
        newCigar.push_back(make_pair(last.length, (last.insertion ? "I" : "D")));
    }
    // reference position just past the last emitted indel
    int lastend = last.insertion ? last.position : (last.position + last.length);
    VCFLEFTALIGN_DEBUG(last << ",");

    for (; id != indels.end(); ++id) {
        VCFIndelAllele& indel = *id;
        VCFLEFTALIGN_DEBUG(indel << ",");
        if (indel.position < lastend) {
            cerr << "impossibility?: indel realigned left of another indel" << endl
                 << referenceSequence << endl << alternateSequence << endl;
            exit(1);
        } else if (indel.position == lastend && indel.insertion == last.insertion) {
            // adjacent indel of the same type: extend the previous operation
            pair<int, string>& op = newCigar.back();
            op.first += indel.length;
        } else if (indel.position >= lastend) {  // also catches differential indels, but with the same position
            // an insertion directly next to a deletion has no matched bases in
            // between; skip the M so no zero-length operation is emitted
            // (same rule as for the first indel above)
            if (indel.position > lastend) {
                newCigar.push_back(make_pair(indel.position - lastend, "M"));
            }
            newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D")));
        }
        last = *id;
        lastend = last.insertion ? last.position : (last.position + last.length);
    }
    
    if (lastend < alignedLength) {
        newCigar.push_back(make_pair(alignedLength - lastend, "M"));
    }

    if (!softEnd.empty()) {
        newCigar.push_back(make_pair(softEnd.size(), "S"));
    }

    VCFLEFTALIGN_DEBUG(endl);

    cigar = newCigar;

    for (vector<pair<int, string> >::const_iterator c = cigar.begin();
        c != cigar.end(); ++c) {
        unsigned int l = c->first;
        char t = c->second.at(0);
        cigar_after << l << t;
    }

    //cerr << cigar_before.str() << " changes to " << cigar_after.str() << endl;
    VCFLEFTALIGN_DEBUG(cigar_after.str() << endl);

    // check if we're realigned
    if (cigar_after.str() == cigar_before.str()) {
        return false;
    } else {
        return true;
    }

}
Пример #10
0
// A default-constructed Cigar must serialize to the empty string.
TEST(CigarStringTest, ToStdString_Empty)
{
    Cigar cigar;
    const string expected;
    const string actual = cigar.ToStdString();
    EXPECT_EQ(expected, actual);
}
Пример #11
0
// Parsing an empty string must produce a Cigar with no operations.
TEST(CigarStringTest, FromStdString_Empty)
{
    const string noOps = "";
    const Cigar parsed = Cigar::FromStdString(noOps);
    EXPECT_TRUE(parsed.empty());
}
Пример #12
0
// Builds the three padded alignment strings (pad_source, pad_target, pad_match)
// by walking the record's CIGAR.
//
// pad_source is filled from the member t_dna, pad_target from the record's
// sequence (bam_record.get_seq()), and pad_match gets one marker per column:
//   '|' equal bases (or an IUPAC-compatible pair when iupac_flag is set)
//   ' ' mismatch, or padding ('P')
//   '+' insertion / soft clip
//   '-' deletion / skipped region
// Also sets is_three_prime_soft_clipped as a side effect.
void BAMUtils::padded_alignment() {
	Cigar cig = bam_record.get_cigar();
	Sequence tdna = bam_record.get_seq();

	int sdna_pos = 0;	// next unread position in t_dna
	int tdna_pos = 0;	// next unread position in the record's sequence
	pad_source.reserve(t_dna.length());
	pad_target.reserve(t_dna.length());
	pad_match.reserve(t_dna.length());
	Sequence::iterator tdna_itr = tdna.get_iterator();
	int tot = 0;		// index of the current cigar operation
	//find out if the first cigar op could be soft clipped or not
	is_three_prime_soft_clipped = false;


	for (Cigar::iterator i = cig.get_iterator(); i.good(); i.next()) {
		// Reverse-strand records look at operations with index > get_length() - 3,
		// all others at operations with index < 2. Every operation looked at
		// overwrites the flag, so the last one examined decides its final value.
		if (this->bam_record.mapped_reverse_strand()) {
			if (tot > ( cig.get_length( ) - 3) ){
				is_three_prime_soft_clipped = (i.op() == 'S');
			}
		} else {
			if (tot < 2) {
				is_three_prime_soft_clipped = (i.op() == 'S');
			}
		}

		if (i.op() == 'I' ) {
			// insertion: gap in the source, bases copied from the record's sequence
			pad_source.append(i.len(), '-');
					
			int count = 0;
			tdna_itr.set_position(tdna_pos);
			
			while (tdna_itr.good()) {
				if (count >= i.len()) {
					break;
				} else {
					pad_target += tdna_itr.get();
					tdna_itr.next();
					
					tdna_pos++;
					count++;
				}
			}
			pad_match.append(i.len(), '+');
		}
		else if(i.op() == 'D' || i.op() == 'N') {
			// deletion / skip: bases copied from t_dna, gap in the target
			pad_source.append( t_dna.substr(sdna_pos, i.len()));
			sdna_pos += i.len();
			pad_target.append(i.len(), '-');
			pad_match.append(i.len(), '-');
		}
		else if(i.op() == 'P') {
			// padding: present in neither sequence
			pad_source.append(i.len(), '*');

			pad_target.append(i.len(), '*');
			pad_match.append(i.len(), ' ');
		} else if (i.op() == 'S') {
			// soft clip: optionally shown as padding; the clipped bases are
			// always skipped in the record's sequence
			if (!truncate_soft_clipped) {

					pad_source.append(i.len(), '-');
					pad_match.append(i.len(), '+');
					pad_target.append(i.len(), '+');

			}	
			int count = 0;
			while (tdna_itr.good()) {
				if (count >= i.len()) {
					break;
				}		
				tdna_pos++;
				tdna_itr.next();

				count++;
			}
		}
		
		else if (i.op() == 'H') {
			//nothing for clipped bases
		}else {
			// every remaining operation is treated as an aligned block:
			// compare the two sequences column by column
			std::string ps = t_dna.substr(sdna_pos,i.len()); //tdna is really qdna
			std::string pt;
			pt.reserve(i.len());

			tdna_itr.set_position(tdna_pos);
			int count = 0;
			
			while (tdna_itr.good()) {
				if (count < i.len()) {
					pt += tdna_itr.get();
				} else {
					break;
				}

				tdna_itr.next();
				count++;

			}
			for (unsigned int z = 0; z < ps.length(); z++) {
				// pt can be shorter than ps when the record's sequence runs out;
				// never index past its end
				const char target_base = (z < pt.length()) ? pt[z] : '\0';
				if (ps[z] == target_base) {
					pad_match += '|';
				} else if (ps[z] != 'A' && ps[z] != 'C' && ps[z] != 'G' && ps[z] != 'T') {
					// source symbol is not a plain A/C/G/T: it may be an IUPAC
					// code that is compatible with the target base
					if (iupac_flag) {
						
						std::vector<char> nukes(IUPAC::get_base(ps[z]));
						bool replaced = false;
						unsigned int nuke_ptr = 0;
						for (unsigned int n = 0; n < nukes.size(); n++) {
							if (nukes[n] == target_base) {
								pad_match += '|';
								replaced  = true;
								nuke_ptr = n;
								break;
							}
						}
						if (!replaced) {
							pad_match += ' ';
						}
						else if (!keep_iupac) {
							// substitute the matching base for the IUPAC symbol
							ps[z] = nukes[nuke_ptr];
						}//keep_iupac
					}//iupac_flag
					else {
						pad_match += ' ';
					}
				}//end else if checking ps[z] against nukes
				else {
					// plain base that differs from the target: mismatch
					pad_match += ' ';
				}


			}//end for loop
			pad_source += ps;
			pad_target += pt;
			sdna_pos += i.len();
			tdna_pos += i.len();
		}
		tot++;

	}
	/*
	std::cerr << "pad_source: " << pad_source << std::endl;
	std::cerr << "pad_target: " << pad_target << std::endl;
	std::cerr << "pad_match : " << pad_match << std::endl;
	*/
}
Пример #13
0
// Rebuilds the member t_dna from the record's sequence, CIGAR and MD string,
// and adds the length of every soft clip to soft_clipped_bases.
//
// Pass 1 walks the CIGAR and collects the aligned bases of the sequence
// ('M', '=' and 'X' operations), skipping inserted and soft-clipped bases.
// Pass 2 walks the MD string: a number copies that many collected bases, a
// letter is appended as-is in place of one collected base, and letters after
// '^' are appended without consuming any collected base.
void BAMUtils::dna() {
	MD md = bam_record.get_md();
	Cigar cig = bam_record.get_cigar();
	Sequence qseq = bam_record.get_seq();

	std::string seq;	// aligned bases only
	Sequence::iterator qseq_itr = qseq.get_iterator();
	for (Cigar::iterator i = cig.get_iterator(); i.good(); i.next()) {
		const char op = i.op();

		if (op == 'M' || op == '=' || op == 'X') {
			// '=' and 'X' are aligned bases exactly like 'M'
			for (int count = 0; qseq_itr.good() && count < i.len(); count++) {
				seq += qseq_itr.get();
				qseq_itr.next();
			}
		} else if (op == 'I' || op == 'S') {
			// bases that are in the read but not part of the aligned span
			for (int count = 0; qseq_itr.good() && count < i.len(); count++) {
				qseq_itr.next();
			}

			if (op == 'S') {
				soft_clipped_bases += i.len();
			}
		}
	}

	t_dna.reserve(seq.length());
	int start = 0;		// next unread position in seq
	MD::iterator md_itr = md.get_iterator();
	std::string num;	// digits of the run length currently being read
	coord_t md_len = 0;

	while (md_itr.good()) {
		const char cur = md_itr.get();

		// <cctype> classifiers require a value representable as unsigned char
		if (std::isdigit(static_cast<unsigned char>(cur))) {
			num += cur;
		}
		else {
			// a non-digit ends the current number: copy that many bases
			if (num.length() > 0) {
				md_len = convert(num);
				num.clear();

				t_dna += seq.substr(start, md_len);
				start += md_len;
			}
		}

		if (cur == '^') {
			// deletion: append the following letters without advancing in seq
			md_itr.next();
			while (md_itr.good()) {
				const char nuc = md_itr.get();
				if (!std::isalpha(static_cast<unsigned char>(nuc))) {
					break;
				}
				t_dna += nuc;
				md_itr.next();
			}
			if (!md_itr.good()) {
				// MD string ended inside the deletion; nothing left to read
				break;
			}
			// the character that ended the deletion starts the next number;
			// keep it, because the next() below moves past it
			num += md_itr.get();

		} else if (std::isalpha(static_cast<unsigned char>(cur))) {
			// single letter: appended in place of one base of seq
			t_dna += cur;
			start++;
		}
		md_itr.next();
	}
	//clean up residual num if there is any
	if (num.length() > 0) {
		md_len = convert(num);
		num.clear();
		t_dna += seq.substr(start, md_len);
		start += md_len;
	}
}
Пример #14
0
int Stats::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String indexFile = "";
    bool basic = false;
    bool noeof = false;
    bool params = false;
    bool qual = false;
    bool phred = false;
    int maxNumReads = -1;
    bool unmapped = false;
    String pBaseQC = "";
    String cBaseQC = "";
    String regionList = "";
    int excludeFlags = 0;
    int requiredFlags = 0;
    bool withinRegion = false;
    int minMapQual = 0;
    String dbsnp = "";
    PosList *dbsnpListPtr = NULL;
    bool baseSum = false;
    int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER_GROUP("Types of Statistics")
        LONG_PARAMETER("basic", &basic)
        LONG_PARAMETER("qual", &qual)
        LONG_PARAMETER("phred", &phred)
        LONG_STRINGPARAMETER("pBaseQC", &pBaseQC)
        LONG_STRINGPARAMETER("cBaseQC", &cBaseQC)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_INTPARAMETER("maxNumReads", &maxNumReads)
        LONG_PARAMETER("unmapped", &unmapped)
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
        LONG_STRINGPARAMETER("regionList", &regionList)
        LONG_INTPARAMETER("excludeFlags", &excludeFlags)
        LONG_INTPARAMETER("requiredFlags", &requiredFlags)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters")
        LONG_PARAMETER("withinRegion", &withinRegion)
        LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters")
        LONG_PARAMETER("baseSum", &baseSum)
        LONG_INTPARAMETER("bufferSize", &bufferSize)
        LONG_INTPARAMETER("minMapQual", &minMapQual)
        LONG_STRINGPARAMETER("dbsnp", &dbsnp)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument for stats, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Use the index file if unmapped or regionList is not empty.
    bool useIndex = (unmapped|| (!regionList.IsEmpty()));

    // IndexFile is required, so check to see if it has been set.
    if(useIndex && (indexFile == ""))
    {
        // In file was not specified, so set it to the in file
        // + ".bai"
        indexFile = inFile + ".bai";
    }
    ////////////////////////////////////////
    // Setup in case pileup is used.
    Pileup<PileupElementBaseQCStats> pileup(bufferSize);
    // Initialize start/end positions.
    myStartPos = 0;
    myEndPos = -1;
    
    // Open the output qc file if applicable.
    IFILE baseQCPtr = NULL;
    if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty())
    {
        usage();
        inputParameters.Status();
        // Cannot specify both types of baseQC.
        std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl;
        return(-1);
    }
    else if(!pBaseQC.IsEmpty())
    {
        baseQCPtr = ifopen(pBaseQC, "w");
        PileupElementBaseQCStats::setPercentStats(true);
    }
    else if(!cBaseQC.IsEmpty())
    {
        baseQCPtr = ifopen(cBaseQC, "w");
        PileupElementBaseQCStats::setPercentStats(false);
    }

    if(baseQCPtr != NULL)
    {
        PileupElementBaseQCStats::setOutputFile(baseQCPtr);
        PileupElementBaseQCStats::printHeader();
    }
    if((baseQCPtr != NULL) || baseSum)
    {
        PileupElementBaseQCStats::setMapQualFilter(minMapQual);
        PileupElementBaseQCStats::setBaseSum(baseSum);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the file for reading.
    SamFile samIn;
    if(!samIn.OpenForRead(inFile))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    samIn.SetReadFlags(requiredFlags, excludeFlags);

    // Set whether or not basic statistics should be generated.
    samIn.GenerateStatistics(basic);

    // Read the sam header.
    SamFileHeader samHeader;
    if(!samIn.ReadHeader(samHeader))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    // Open the bam index file for reading if we are
    // doing unmapped reads (also set the read section).
    if(useIndex)
    {
        samIn.ReadBamIndex(indexFile);

        if(unmapped)
        {
            samIn.SetReadSection(-1);
        }

        if(!regionList.IsEmpty())
        {
            myRegionList = ifopen(regionList, "r");
        }
    }

    //////////////////////////
    // Read dbsnp if specified and doing baseQC
    if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty()))
    {
        // Read the dbsnp file.
        IFILE fdbSnp;
        fdbSnp = ifopen(dbsnp,"r");
        // Determine how many entries.
        const SamReferenceInfo& refInfo = samHeader.getReferenceInfo();
        int maxRefLen = 0;
        for(int i = 0; i < refInfo.getNumEntries(); i++)
        {
            int refLen = refInfo.getReferenceLength(i);
            if(refLen >= maxRefLen)
            {
                maxRefLen = refLen + 1;
            }
        }
        
        dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen);

        if(fdbSnp==NULL)
        {
            std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n";
        }
        else if(dbsnpListPtr == NULL)
        {
            std::cerr << "Failed to init the memory allocation for the dbsnpList.\n";
        }
        else
        {
            // Read the dbsnp file.
            StringArray tokens;
            String buffer;
            int position = 0;
            int refID = 0;

            // Loop til the end of the file.
            while (!ifeof(fdbSnp))
            {
                // Read the next line.
                buffer.ReadLine(fdbSnp);
                // If it does not have at least 2 columns, 
                // continue to the next line.
                if (buffer.IsEmpty() || buffer[0] == '#') continue;
                tokens.AddTokens(buffer);
                if(tokens.Length() < 2) continue;

                if(!tokens[1].AsInteger(position))
                {
                    std::cerr << "Improperly formatted region line, start position "
                              << "(2nd column) is not an integer: "
                              << tokens[1]
                              << "; Skipping to the next line.\n";         
                    continue;
                }

                // Look up the reference name.
                refID = samHeader.getReferenceID(tokens[0]);
                if(refID != SamReferenceInfo::NO_REF_ID)
                {
                    // Reference id was found, so add it to the dbsnp
                    dbsnpListPtr->addPosition(refID, position);
                }
        
                tokens.Clear();
                buffer.Clear();
            }
        }
        ifclose(fdbSnp);
    }

    // Read the sam records.
    SamRecord samRecord;

    int numReads = 0;

    //////////////////////
    // Setup in case doing a quality count.
    // Quality histogram.
    const int MAX_QUAL = 126;
    const int START_QUAL = 33;
    uint64_t qualCount[MAX_QUAL+1];
    for(int i = 0; i <= MAX_QUAL; i++)
    {
        qualCount[i] = 0;
    }
    
    const int START_PHRED = 0;
    const int PHRED_DIFF = START_QUAL - START_PHRED;
    const int MAX_PHRED = MAX_QUAL - PHRED_DIFF;
    uint64_t phredCount[MAX_PHRED+1];
    for(int i = 0; i <= MAX_PHRED; i++)
    {
        phredCount[i] = 0;
    }
    
    int refPos = 0;
    Cigar* cigarPtr = NULL;
    char cigarChar = '?';
    // Exclude clips from the qual/phred counts if unmapped reads are excluded.
    bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED;

    //////////////////////////////////
    // When not reading by sections, getNextSection returns true
    // the first time, then false the next time.
    while(getNextSection(samIn))
    {
        // Keep reading records from the file until SamFile::ReadRecord
        // indicates to stop (returns false).
        while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord))
        {
            // Another record was read, so increment the number of reads.
            ++numReads;
            // See if the quality histogram should be genereated.
            if(qual || phred)
            {
                // Get the quality.
                const char* qual = samRecord.getQuality();
                // Check for no quality ('*').
                if((qual[0] == '*') && (qual[1] == 0))
                {
                    // This record does not have a quality string, so no 
                    // quality processing is necessary.
                }
                else
                {
                    int index = 0;
                    cigarPtr = samRecord.getCigarInfo();
                    cigarChar = '?';
                    refPos = samRecord.get0BasedPosition();
                    if(!qualExcludeClips && (cigarPtr != NULL))
                    {
                        // Offset the reference position by any soft clips
                        // by subtracting the queryIndex of this start position.
                        // refPos is now the start position of the clips.
                        refPos -= cigarPtr->getQueryIndex(0);
                    }

                    while(qual[index] != 0)
                    {
                        // Skip this quality if it is clipped and we are skipping clips.
                        if(cigarPtr != NULL)
                        {
                            cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index);
                        }
                        if(qualExcludeClips && Cigar::isClip(cigarChar))
                        {
                            // Skip a clipped quality.
                            ++index;
                            // Increment the position.
                            continue;
                        }

                        if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos))
                        {
                            // We have hit the end of the region, stop processing this
                            // quality string.
                            break;
                        }

                        if(withinRegion && (refPos < myStartPos))
                        {
                            // This position is not in the target.
                            ++index;
                            // Update the position if this is found in the reference or a clip.
                            if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                            {
                                ++refPos;
                            }
                            continue;
                        }

                        // Check for valid quality.
                        if((qual[index] < START_QUAL) || (qual[index] > MAX_QUAL))
                        {
                            if(qual)
                            {
                                std::cerr << "Invalid Quality found: " << qual[index] 
                                          << ".  Must be between "
                                          << START_QUAL << " and " << MAX_QUAL << ".\n";
                            }
                            if(phred)
                            {
                                std::cerr << "Invalid Phred Quality found: " << qual[index] - PHRED_DIFF
                                          << ".  Must be between "
                                          << START_QUAL << " and " << MAX_QUAL << ".\n";
                            }
                            // Skip an invalid quality.
                            ++index;
                            // Update the position if this is found in the reference or a clip.
                            if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                            {
                                ++refPos;
                            }
                            continue;
                        }
                        
                        // Increment the count for this quality.
                        ++(qualCount[(int)(qual[index])]);
                        ++(phredCount[(int)(qual[index]) - PHRED_DIFF]);
                        // Update the position if this is found in the reference or a clip.
                        if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                        {
                            ++refPos;
                        }
                        ++index;
                    }
                }
            }

            // Check the next thing to do for the read.
            if((baseQCPtr != NULL) || baseSum)
            {
                // Pileup the bases for this read.
                pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr);
            }
        }

        // Done with a section, move on to the next one.

        // New section, so flush the pileup.
        pileup.flushPileup();
    }

    // Flush the rest of the pileup.
    if((baseQCPtr != NULL) || baseSum)
    {
        // Pileup the bases.
        pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr);
        PileupElementBaseQCStats::printSummary();
        ifclose(baseQCPtr);
    }

    std::cerr << "Number of records read = " << 
        samIn.GetCurrentRecordCount() << std::endl;

    if(basic)
    {
        std::cerr << std::endl;
        samIn.PrintStatistics();
    }

    // Print the quality stats.
    if(qual)
    {
        std::cerr << std::endl;
        std::cerr << "Quality\tCount\n";
        for(int i = START_QUAL; i <= MAX_QUAL; i++)
        {
            std::cerr << i << "\t" << qualCount[i] << std::endl;
        }
    }
    // Print the phred quality stats.
    if(phred)
    {
        std::cerr << std::endl;
        std::cerr << "Phred\tCount\n";
        for(int i = START_PHRED; i <= MAX_PHRED; i++)
        {
            std::cerr << i << "\t" << phredCount[i] << std::endl;
        }
    }

    SamStatus::Status status = samIn.GetStatus();
    if(status == SamStatus::NO_MORE_RECS)
    {
        // A status of NO_MORE_RECS means that all reads were successful.
        status = SamStatus::SUCCESS;
    }

    return(status);
}
Пример #15
0
// Soft clip the cigar from the front and/or the back, writing the value
// into the new cigar.
//
// Parameters:
//   oldCigar      - the cigar to clip; it is only read by this method.
//   numFrontClips - number of read (query) bases to soft clip from the front.
//   numBackClips  - number of read (query) bases to soft clip from the back.
//   startPos      - in/out: passed to oldCigar.getRefPosition() as the
//                   alignment start; when bases are clipped from the front
//                   and a reference position is found for the last clipped
//                   base, it is updated to one past that position.
//   updatedCigar  - receives the clipped cigar.  Operations are appended to
//                   it; this method does not clear it first.
//
// Returns:
//   NONE     - both clip counts are 0; updatedCigar and startPos are untouched.
//   FILTERED - the clips cover the whole read (returned before updatedCigar
//              is written), or the resulting cigar contains only clips
//              (updatedCigar has been written, startPos has not).
//   CLIPPED  - part of the read was clipped and updatedCigar holds the result.
SamFilter::FilterStatus SamFilter::softClip(Cigar& oldCigar, 
                                            int32_t numFrontClips,
                                            int32_t numBackClips,
                                            int32_t& startPos,
                                            CigarRoller& updatedCigar)
{
    // Read length as reported by the cigar's expected query base count.
    int32_t readLength = oldCigar.getExpectedQueryBaseCount();
    // 0-based read index of the first base that is clipped from the back.
    int32_t endClipPos = readLength - numBackClips;
    FilterStatus status = NONE;

    if((numFrontClips != 0) || (numBackClips != 0))
    {
        // Clipping from front and/or from the back.

        // Check to see if the entire read was clipped.
        int32_t totalClips = numFrontClips + numBackClips;
        if(totalClips >= readLength)
        {
            /////////////////////////////
            // The entire read is clipped, so rather than clipping it,
            // filter it out.
            return(FILTERED);
        }

        // Part of the read was clipped.
        status = CLIPPED;

        // Loop through, creating an updated cigar.
        // This index is shared by all three loops below, so each loop
        // resumes where the previous one stopped.
        int origCigarOpIndex = 0;

        // Track how many read positions are covered up to this
        // point by the cigar to determine up to what
        // point in the cigar is affected by this clipping.
        int32_t numPositions = 0;

        // Track if any non-clips are in the new cigar.
        bool onlyClips = true;

        // Most recently examined operation.  It is read again after the
        // front-clip loop to write the unclipped remainder of that operation.
        const Cigar::CigarOperator* op = NULL;

        //////////////////
        // Clip from front
        // Consume operations until at least numFrontClips read positions
        // have been covered (or the cigar runs out).
        while((origCigarOpIndex < oldCigar.size()) &&
              (numPositions < numFrontClips))
        {
            op = &(oldCigar.getOperator(origCigarOpIndex));
            switch(op->operation)
            {
                case Cigar::hardClip:
                    // Keep this operation as the new clips do not
                    // affect other clips.
                    updatedCigar += *op;
                    break;
                case Cigar::del:
                case Cigar::skip:
                    // Skip and delete are going to be dropped, and
                    // are not in the read, so the read index doesn't
                    // need to be updated
                    break;
                case Cigar::insert:
                case Cigar::match:
                case Cigar::mismatch:
                case Cigar::softClip:
                    // Update the read index as these types
                    // are found in the read.
                    numPositions += op->count;
                    break;
                case Cigar::none:
                default:
                    // Nothing to do for none.
                    break;
            };
            ++origCigarOpIndex;
        }

        // If bases were clipped from the front, add the clip and
        // any partial cigar operation as necessary.
        if(numFrontClips != 0)
        {
            // Add the softclip to the front of the read.
            updatedCigar.Add(Cigar::softClip, numFrontClips);

            // Add the rest of the last Cigar operation if
            // it is not entirely clipped.
            // newCount is how far the last consumed operation extends
            // past the front clip.
            int32_t newCount = numPositions - numFrontClips;
            if(newCount > 0)
            {
                // Before adding it, check to see if the same
                // operation is clipped from the end.
                // numPositions greater than the endClipPos
                // means that it is equal or past that position,
                // so shorten the number of positions.
                if(numPositions > endClipPos)
                {
                    newCount -= (numPositions - endClipPos);
                }
                if(newCount > 0)
                {
                    updatedCigar.Add(op->operation, newCount);
                    if(!Cigar::isClip(op->operation))
                    {
                        onlyClips = false;
                    }
                }
            }
        }

        // Add operations until the point of the end clip is reached.
        // For example...
        //   2M1D3M = MMDMMM  readLength = 5
        // readIndex: 01 234
        //   at cigarOpIndex 0 (2M), numPositions = 2.
        //   at cigarOpIndex 1 (1D), numPositions = 2.
        //   at cigarOpIndex 2 (3M), numPositions = 5.
        // if endClipPos = 2, we still want to consume the 1D, so
        // need to keep looping until numPositions > endClipPos
        while((origCigarOpIndex < oldCigar.size()) &&
              (numPositions <= endClipPos))
        {
            op = &(oldCigar.getOperator(origCigarOpIndex));

            // Update the numPositions count if the operations indicates
            // bases within the read.
            if(!Cigar::foundInQuery(op->operation))
            {
                // This operation is not in the query read sequence,
                // so it is not yet to the endClipPos, just add the
                // operation do not increment the number of positions.
                updatedCigar += *op;
                if(!Cigar::isClip(op->operation))
                {
                    onlyClips = false;
                }
            }
            else
            {
                // This operation appears in the query sequence, so
                // check to see if the clip occurs in this operation.

                // endClipPos is 0 based & numPositions is a count.
                // If endClipPos is 4, then it is the 5th position.
                // If 4 positions are covered so far (numPositions = 4), 
                // then we are right at endClipPos: 4-4 = 0, none of 
                // this operation should be kept. 
                // If only 3 positions were covered, then we are at offset
                // 3, so offset 3 should be added: 4-3 = 1.
                // The loop condition guarantees numPositions <= endClipPos,
                // so this unsigned subtraction cannot go negative.
                uint32_t numPosTilClip = endClipPos - numPositions;

                if(numPosTilClip < op->count)
                {
                    // this operation is partially clipped, write the part
                    // that was not clipped if it is not all clipped.
                    if(numPosTilClip != 0)
                    {
                        updatedCigar.Add(op->operation,
                                     numPosTilClip);
                        if(!Cigar::isClip(op->operation))
                        {
                            onlyClips = false;
                        }
                    }
                }
                else
                {
                    // This operation is not clipped, so add it
                    updatedCigar += *op;
                    if(!Cigar::isClip(op->operation))
                    {
                        onlyClips = false;
                    }
                }
                // This operation occurs in the query sequence, so 
                // increment the number of positions covered.
                numPositions += op->count;
            }

            // Move to the next cigar position.
            ++origCigarOpIndex;
        }

        //////////////////
        // Add the softclip to the back.
        if(numBackClips != 0)
        {
            // Add the softclip to the end
            updatedCigar.Add(Cigar::softClip, numBackClips);
        }

        //////////////////
        // Add any hardclips remaining in the original cigar to the back.
        // Every other remaining operation is dropped.
        while(origCigarOpIndex < oldCigar.size())
        {
            op = &(oldCigar.getOperator(origCigarOpIndex));
            if(op->operation == Cigar::hardClip)
            {
                // Keep this operation as the new clips do not
                // affect other clips.
                updatedCigar += *op;
            }
            ++origCigarOpIndex;
        }

        // Check to see if the new cigar is only clips.
        if(onlyClips)
        {
            // Only clips in the new cigar, so mark the read as filtered
            // instead of updating the cigar.
            /////////////////////////////
            // The entire read was clipped.
            status = FILTERED;
        }
        else
        {
            // Part of the read was clipped.
            // Update the starting position if a clip was added to
            // the front.
            if(numFrontClips > 0)
            {
                // Convert from query index to reference position (from the
                // old cigar)
                // Get the position for the last front clipped position by
                // getting the position associated with the clipped base on
                // the reference.  Then add one to get to the first
                // non-clipped position.
                int32_t lastFrontClipPos = numFrontClips - 1;
                int32_t newStartPos = oldCigar.getRefPosition(lastFrontClipPos, 
                                                              startPos);
                // NOTE(review): when getRefPosition() reports INDEX_NA for
                // the last clipped base, startPos is left unchanged even
                // though bases were clipped from the front - presumably this
                // happens when that base has no reference position (e.g. it
                // lies in an insertion); confirm this is intended.
                if(newStartPos != Cigar::INDEX_NA)
                {
                    // Add one to get first non-clipped position.
                    startPos = newStartPos + 1;
                }
            }
        }
    }
    return(status);
}
Пример #16
0
// Soft clip the start of the read, up to and including the base aligned at
// the specified reference position, building the result in newCigar.
//
// Parameters:
//   record             - record whose cigar and start position are read.
//   refPosition0Based  - 0-based reference position to clip through.
//   newCigar           - cleared, then filled with the clipped cigar (or a
//                        copy of the record's cigar when the position lies
//                        before the start of the record).
//   new0BasedPosition  - set to the reference position of the first kept
//                        base, or to the record's original position when no
//                        kept match/mismatch is found.  It is not assigned
//                        on the early NO_CLIP returns.
//
// Returns NO_CLIP when nothing is clipped, otherwise the 0-based read index
// of the last clipped base.
int32_t CigarHelper::softClipBeginByRefPos(SamRecord& record, 
                                           int32_t refPosition0Based,
                                           CigarRoller& newCigar,
                                           int32_t &new0BasedPosition)
{
    newCigar.clear();

    Cigar* origCigar = record.getCigarInfo();
    if(origCigar == NULL)
    {
        // The cigar could not be retrieved from the record.
        ErrorHandler::handleError("Soft clipping, but failed to read the cigar");
        return(NO_CLIP);
    }

    // Nothing can be clipped without a cigar or an alignment position.
    if((origCigar->size() == 0) || (record.get0BasedPosition() == -1))
    {
        return(NO_CLIP);
    }

    // The clip position is before the read starts: keep the cigar as is.
    if(refPosition0Based < record.get0BasedPosition())
    {
        newCigar.Set(record.getCigar());
        return(NO_CLIP);
    }

    // Count of read bases clipped so far, which is also the 0-based read
    // index of the first base that has not been clipped.
    int32_t numClipped = 0;
    // Set once the soft clip has been written to newCigar; after that every
    // remaining operation is copied through unchanged.
    bool softClipAdded = false;
    new0BasedPosition = record.get0BasedPosition();

    const int numOps = origCigar->size();
    for(int opIndex = 0; opIndex < numOps; ++opIndex)
    {
        const Cigar::CigarOperator& curOp = origCigar->getOperator(opIndex);

        if(softClipAdded)
        {
            // Past the clip point, so copy the operation as is.
            newCigar += curOp;
            continue;
        }

        const bool inQuery = Cigar::foundInQuery(curOp);

        if(!Cigar::foundInReference(curOp))
        {
            // Operation does not consume the reference.  Of these, only
            // hard clips are written; soft clips are merged into the single
            // soft clip added once the clip point is known.
            if(curOp.operation == Cigar::hardClip)
            {
                if(opIndex == 0)
                {
                    // Leading hard clip: keep it at the front.
                    newCigar += curOp;
                }
                else if(opIndex == (numOps - 1))
                {
                    // Trailing hard clip reached without a match/mismatch
                    // to keep (the soft clip cannot have been written yet,
                    // otherwise this point is not reached): write the soft
                    // clip, restore the original position, then append the
                    // hard clip.
                    newCigar.Add(Cigar::softClip, numClipped);
                    new0BasedPosition = record.get0BasedPosition();
                    softClipAdded = true;
                    newCigar += curOp;
                }
                // Any other hard clip is skipped.
            }

            if(inQuery)
            {
                // These read bases are all before the clip point.
                numClipped += curOp.count;
            }
            continue;
        }

        // match, mismatch, deletion, skip: move the reference position to
        // just past this operation.
        new0BasedPosition += curOp.count;

        if(!inQuery)
        {
            // deletion/skip: consumes reference only, nothing to clip or keep.
            continue;
        }

        // match/mismatch: work out how many of its bases lie after the
        // clip position and are therefore kept.
        uint32_t numKept = 0;
        if(refPosition0Based < new0BasedPosition)
        {
            // The clip position was reached within (or before) this
            // operation.
            numKept = new0BasedPosition - refPosition0Based - 1;
            if(numKept > curOp.count)
            {
                // The clip position was passed in an earlier operation
                // that had nothing to keep, so keep all of this one.
                numKept = curOp.count;
            }
        }

        // Whatever is not kept from this operation is clipped.
        numClipped += (curOp.count - numKept);

        // Only write the clip once there is a match/mismatch to keep,
        // otherwise keep accumulating clipped bases.
        if(numKept > 0)
        {
            // Step back to the reference position of the first kept base.
            new0BasedPosition -= numKept;

            newCigar.Add(Cigar::softClip, numClipped);
            newCigar.Add(curOp.operation, numKept);
            softClipAdded = true;
        }
    } // End loop through cigar.

    if(!softClipAdded)
    {
        // No match/mismatch was kept, so the clip covers everything seen;
        // the position is reset to the record's original one.
        newCigar.Add(Cigar::softClip, numClipped);
        new0BasedPosition = record.get0BasedPosition();
    }

    // numClipped is the first 0-based read index that is not clipped,
    // so the last clipped index is one less.
    return(numClipped - 1);
}
Пример #17
0
// Soft clip the end of the read, starting at the base aligned at the
// specified reference position, building the result in newCigar.
//
// Parameters:
//   record             - record whose cigar, position and read length are read.
//   refPosition0Based  - 0-based reference position at which clipping starts.
//   newCigar           - cleared, then filled with the clipped cigar (or a
//                        copy of the record's cigar when the position lies
//                        past the end of the alignment).
//
// Returns NO_CLIP when nothing is clipped, otherwise the 0-based read index
// of the first clipped base.
int32_t CigarHelper::softClipEndByRefPos(SamRecord& record, 
                                         int32_t refPosition0Based,
                                         CigarRoller& newCigar)
{
    newCigar.clear();

    Cigar* origCigar = record.getCigarInfo();
    if(origCigar == NULL)
    {
        // The cigar could not be retrieved from the record.
        ErrorHandler::handleError("Soft clipping, but failed to read the cigar");
        return(NO_CLIP);
    }

    // Nothing can be clipped without a cigar or an alignment position.
    if((origCigar->size() == 0) || (record.get0BasedPosition() == -1))
    {
        return(NO_CLIP);
    }

    // The clip position is past the end of the alignment: keep the cigar.
    if(refPosition0Based > record.get0BasedAlignmentEnd())
    {
        newCigar.Set(record.getCigar());
        return(NO_CLIP);
    }

    // Reference position just past the operations examined so far.
    int32_t refPosAfterOp = record.get0BasedPosition();
    // Number of read bases kept in front of the clip, i.e. the 0-based read
    // index of the first clipped base.
    int32_t numQueryKept = 0;

    const int numOps = origCigar->size();
    for(int opIndex = 0; opIndex < numOps; ++opIndex)
    {
        const Cigar::CigarOperator& curOp = origCigar->getOperator(opIndex);
        const bool inQuery = Cigar::foundInQuery(curOp);

        if(Cigar::foundInReference(curOp))
        {
            // match, mismatch, deletion, skip: advance to just past it.
            refPosAfterOp += curOp.count;
        }

        if(refPosition0Based >= refPosAfterOp)
        {
            // The clip position has not been reached by the end of this
            // operation, so keep all of it.
            newCigar += curOp;
            if(inQuery)
            {
                numQueryKept += curOp.count;
            }
            continue;
        }

        // The clip position has been reached by the end of this operation.
        if(inQuery)
        {
            // Keep the part of the operation that precedes the clip
            // position, if there is any.
            const int32_t numKeep =
                curOp.count - (refPosAfterOp - refPosition0Based);
            if(numKeep > 0)
            {
                newCigar.Add(curOp.operation, numKeep);
                numQueryKept += numKeep;
            }
        }
        else if(Cigar::isClip(curOp))
        {
            // A clip that is not in the query (hard clip): write it as is.
            newCigar.Add(curOp.operation, curOp.count);
        }
        // Any other operation (skip/deletion) is not written.

        // Found the clip point, so stop reading operations.
        break;
    } // End loop through cigar.

    // Before adding the soft clip, strip from the end of the new cigar any
    // pad/delete/skip (they must not sit right before a soft clip) and any
    // soft clip (it is folded into the one added below).  Stop at the first
    // hard clip or non-clip query operation.
    int tailIndex = newCigar.size() - 1;
    while(tailIndex >= 0)
    {
        const Cigar::CigarOperator& tailOp = newCigar.getOperator(tailIndex);
        const bool tailInQuery = Cigar::foundInQuery(tailOp);
        const bool tailIsClip = Cigar::isClip(tailOp);

        if(tailInQuery != tailIsClip)
        {
            // Either a non-clip query operation or a hard clip: keep it
            // and stop removing.
            break;
        }

        if(tailInQuery)
        {
            // Soft clip: its bases are no longer counted as kept, so they
            // become part of the soft clip size calculated below.
            numQueryKept -= tailOp.count;
        }
        newCigar.Remove(tailIndex);
        --tailIndex;
    }

    // Everything in the read after the kept bases is soft clipped.
    newCigar.Add(Cigar::softClip, record.getReadLength() - numQueryKept);

    // Re-attach a trailing hard clip from the original cigar (which is known
    // to be non-empty at this point).
    const Cigar::CigarOperator& finalOp = origCigar->getOperator(numOps - 1);
    if(finalOp.operation == Cigar::hardClip)
    {
        newCigar += finalOp;
    }

    return(numQueryKept);
}
Пример #18
0
/**
 * \brief Converts 'length' components of cigarBuffer, starting at index
 *        'offset', to a string by delegating to the iterator-range overload
 *        of toString.
 *
 * \param cigarBuffer buffer holding the cigar components
 * \param offset      index of the first component to convert
 * \param length      number of components to convert
 *
 * The requested range must lie within cigarBuffer; this is asserted.
 */
std::string Cigar::toString(const Cigar &cigarBuffer, unsigned offset, unsigned length)
{
    // 'offset + length' is evaluated in unsigned arithmetic and can wrap
    // around, which would let an out-of-range request pass the check. Test
    // the two bounds separately so that no overflow is possible.
    ISAAC_ASSERT_MSG((offset <= cigarBuffer.size()) && (length <= cigarBuffer.size() - offset),
                     "Requested end is outside of cigarBuffer");
    return toString(cigarBuffer.begin() + offset, cigarBuffer.begin() + offset + length);
}
Пример #19
0
// Add an entry to this pileup element.
//
// Calls the base class addEntry, looks up the reference allele the first
// time through, grows the per-entry buffers when they are full, and then
// stores the base, mapping quality, base quality, strand and cycle of the
// record at this element's reference position.
//
// If the record has a deletion at this position, a '-' entry is stored when
// myAddDelAsBase is set; otherwise nothing is stored.
//
// On a memory allocation failure an error is written to stderr and the
// record is not added.  Throws std::runtime_error if the record's cigar
// cannot be retrieved.  In every case where no entry is stored, myIndex is
// left at its previous value.
void PileupElementBaseQual::addEntry(SamRecord& record)
{
    // Call the base class:
    PileupElement::addEntry(record);

    if(myRefAllele.empty())
    {
        genomeIndex_t markerIndex = (*myRefSeq).getGenomePosition(getChromosome(), static_cast<uint32_t>(getRefPosition()+1));
        myRefAllele = (*myRefSeq)[markerIndex];
    }

    // Increment the index
    ++myIndex;

    // if the index has gone beyond the allocated space, double the size.
    // Each buffer is reallocated through a temporary so that the original
    // pointer is not lost if realloc fails.  myAllocatedSize is only updated
    // after every buffer has been grown.
    if(myIndex >= myAllocatedSize)
    {
        char* tempBuffer = (char*)realloc(myBases, myAllocatedSize * 2);
        if(tempBuffer == NULL)
        {
            std::cerr << "Memory Allocation Failure\n";
            // No entry was stored, so give the index back.
            --myIndex;
            return;
        }
        myBases = tempBuffer;
        int8_t* tempInt8Buffer = (int8_t*)realloc(myMapQualities, myAllocatedSize * 2 * sizeof(int8_t));
        if(tempInt8Buffer == NULL)
        {
            std::cerr << "Memory Allocation Failure\n";
            --myIndex;
            return;
        }
        myMapQualities = tempInt8Buffer; 
        tempInt8Buffer = (int8_t*)realloc(myQualities, myAllocatedSize * 2 * sizeof(int8_t));
        if(tempInt8Buffer == NULL)
        {
            std::cerr << "Memory Allocation Failure\n";
            --myIndex;
            return;
        }
        myQualities = tempInt8Buffer;
        tempBuffer = (char*)realloc(myStrands, myAllocatedSize * 2);
        if(tempBuffer == NULL)
        {
            std::cerr << "Memory Allocation Failure\n";
            --myIndex;
            return;
        }
        myStrands = tempBuffer;
        tempInt8Buffer = (int8_t*)realloc(myCycles, myAllocatedSize * 2 * sizeof(int8_t));
        if(tempInt8Buffer == NULL)
        {
            std::cerr << "Memory Allocation Failure\n";
            --myIndex;
            return;
        }
        myCycles = tempInt8Buffer; 
        int16_t* tempInt16Buffer = (int16_t*)realloc(myGLScores, myAllocatedSize * 2 * sizeof(int16_t));
        // Check the pointer that was just returned (the 16-bit buffer), not
        // the 8-bit temporary from the previous reallocation.
        if(tempInt16Buffer == NULL)
        {
            std::cerr << "Memory Allocation Failure\n";
            --myIndex;
            return;
        }
        myGLScores = tempInt16Buffer;
        myAllocatedSize = myAllocatedSize * 2;
    }

    Cigar* cigar = record.getCigarInfo();

    if(cigar == NULL)
    {
        // No entry was stored, so give the index back before reporting.
        --myIndex;
        throw std::runtime_error("Failed to retrieve cigar info from the record.");
    }

    int32_t readIndex = 
        cigar->getQueryIndex(getRefPosition(), record.get0BasedPosition());

    // If the readPosition is N/A, this is a deletion.
    if(readIndex != CigarRoller::INDEX_NA)
    {
        char base = record.getSequence(readIndex);
        int8_t mapQual = record.getMapQuality();
        //-33 to obtain the PHRED base quality
        char qual = record.getQuality(readIndex) - 33;
        if(qual == UNSET_QUAL)
        {
            qual = ' ';
        }
        char strand = (record.getFlag() & 0x0010) ? 'R' : 'F';
        // Cycle is 1-based and counted from the other end of the read for
        // reverse-strand records.
        int cycle = strand == 'F' ? readIndex + 1 : record.getReadLength() -  readIndex;
        myBases[myIndex] = base;
        myMapQualities[myIndex] = mapQual;
        myQualities[myIndex] = qual;
        myStrands[myIndex] = strand;
        myCycles[myIndex] = cycle;
    }
    else if(myAddDelAsBase)
    {
        // Record the deletion as a '-' base with no quality or cycle.
        int8_t mapQual = record.getMapQuality();
        char strand = (record.getFlag() & 0x0010) ? 'R' : 'F';
        myBases[myIndex] = '-';
        myMapQualities[myIndex] = mapQual;
        myQualities[myIndex] = -1;
        myStrands[myIndex] = strand;
        myCycles[myIndex] = -1;
    }
    else
    {
        // Do not add a deletion.
        // Did not add any entries, so decrement the index counter since the
        // index was not used.
        --myIndex;
    }
}
Пример #20
0
/// Feed one SAM/BAM record into the recalibration table being built.
///
/// Updates the per-category read counters (mapped/unmapped, secondary,
/// duplicate, QC failure), decides whether the record may be used, and, if so,
/// walks the read cycle by cycle, handing every eligible base to
/// hasherrormodel.setCell() together with its reference base.
///
/// A record is skipped (and myNumBuildSkipped incremented) when:
///   - its flag intersects myIntBuildExcludeFlags,
///   - its mapping quality is 0 or 255,
///   - its position cannot be resolved in the reference genome,
///   - the quality string to use does not have the read's length, or
///   - the cigar cannot be retrieved.
///
/// @param samRecord record to process.
/// @return true if the record was used for building the table, false if it
///         was skipped.
/// @throws std::runtime_error if no reference genome is available;
///         processParams() may also throw (see the comment at its call).
///
/// NOTE: the working buffers below are function-local statics that are reused
/// on every call, so concurrent calls would share them.
bool Recab::processReadBuildTable(SamRecord& samRecord)
{
    static BaseData data;
    static std::string chromosomeName;
    static std::string readGroup;
    static std::string aligTypes;

    int seqLen = samRecord.getReadLength();
    
    // Check if the parameters have been processed.
    if(!myParamsSetup)
    {
        // This throws an exception if the reference cannot be setup.
        processParams();
    }

    uint16_t  flag = samRecord.getFlag();

    // Bookkeeping only: none of these counters causes the record to be
    // skipped by itself; skipping is decided by the exclude flags below.
    if(!SamFlag::isMapped(flag))
    {
        // Unmapped, skip processing
        ++myUnMappedCount;
    }
    else
    {
        // This read is mapped.
        ++myMappedCount;
    }

    if(SamFlag::isSecondary(flag))
    {
        // Secondary read
        ++mySecondaryCount;
    }
    if(SamFlag::isDuplicate(flag))
    {
        ++myDupCount;
    }
    if(SamFlag::isQCFailure(flag))
    {
        ++myQCFailCount;
    }

    // Check if the flag contains an exclude.
    if((flag & myIntBuildExcludeFlags) != 0)
    {
        // Do not use this read for building the recalibration table.
        ++myNumBuildSkipped;
        return(false);
    }

    if(samRecord.getMapQuality() == 0)
    {
        // 0 mapping quality, so skip processing.
        ++myMapQual0Count;
        ++myNumBuildSkipped;
        return(false);
    }
    if(samRecord.getMapQuality() == 255)
    {
        // 255 mapping quality, so skip processing.
        ++myMapQual255Count;
        ++myNumBuildSkipped;
        return(false);
    }
    
    chromosomeName = samRecord.getReferenceName();
    readGroup = samRecord.getString("RG").c_str();

    // Look for the read group in the map.
    // TODO - extra string constructor??
    RgInsertReturn insertRet = 
        myRg2Id.insert(std::pair<std::string, uint16_t>(readGroup, 0));
    if(insertRet.second == true)
    {
        // New element inserted: its id is its index in myId2Rg.
        insertRet.first->second = myId2Rg.size();
        myId2Rg.push_back(readGroup);
    }

    data.rgid = insertRet.first->second;

    // Strand of the read; drives the iteration direction below.
    const bool reverse = SamFlag::isReverse(flag);

    if(myReferenceGenome == NULL)
    {
        throw std::runtime_error("Failed to setup Reference File.\n");
    }

    genomeIndex_t mapPos = 
        myReferenceGenome->getGenomePosition(chromosomeName.c_str(), 
                                             samRecord.get1BasedPosition());

    if(mapPos==INVALID_GENOME_INDEX)
    {
        // The position is converted to long explicitly so that the variadic
        // argument always matches the %ld conversion, whatever integer type
        // the accessor returns.
        Logger::gLogger->warning("INVALID_GENOME_INDEX (chrom:pos %s:%ld) and record skipped... Reference in BAM is different from the ref used here!", chromosomeName.c_str(), static_cast<long>(samRecord.get1BasedPosition()));

        ++myNumBuildSkipped;
        return false;
    }

    if(!myQField.IsEmpty())
    {
        // Check if there is an old quality.
        const String* oldQPtr = 
            samRecord.getStringTag(myQField.c_str());
        if((oldQPtr != NULL) && (oldQPtr->Length() == seqLen))
        {
            // There is an old quality, so use that.
            myQualityStrings.oldq = oldQPtr->c_str();
        }
        else
        {
            // Tag was not found (or has the wrong length), so use the
            // current quality.  Only warn the first time this happens.
            ++myNumQualTagErrors;
            if(myNumQualTagErrors == 1)
            {
                Logger::gLogger->warning("Recab: %s tag was not found/invalid, so using the quality field in records without the tag", myQField.c_str());
            }
            myQualityStrings.oldq = samRecord.getQuality();
        }
    }
    else
    {
        myQualityStrings.oldq = samRecord.getQuality();
    }

    if(myQualityStrings.oldq.length() != (unsigned int)seqLen)
    {
        Logger::gLogger->warning("Quality is not the correct length, so skipping recalibration on that record.");
        ++myNumBuildSkipped;
        return(false);
    }

    aligTypes = "";
    Cigar* cigarPtr = samRecord.getCigarInfo();

    if(cigarPtr == NULL)
    {
        Logger::gLogger->warning("Failed to get the cigar");
        ++myNumBuildSkipped;
        return(false);
    }

    // This read will be used for building the recab table.
    ++myNumBuildReads;

    ////////////////
    ////// iterate sequence
    ////////////////
    genomeIndex_t refPos = 0;
    int32_t refOffset = 0;
    int32_t prevRefOffset = Cigar::INDEX_NA;
    // Reverse-strand reads are walked from the last stored base to the first,
    // so that data.cycle always counts upwards from 0.
    int32_t seqPos = 0;
    int seqIncr = 1;
    if(reverse)
    {
        seqPos = seqLen - 1;
        seqIncr = -1;
    }

    // read
    if(!SamFlag::isPaired(flag) || SamFlag::isFirstFragment(flag))
        // Mark as first if it is not paired or if it is the
        // first in the pair.
        data.read = 0;
    else
        data.read = 1;

    // Set unsetbase for curBase.
    // This will be used for the prebase of cycle 0.
    data.curBase = 'K';

    for (data.cycle = 0; data.cycle < seqLen; data.cycle++, seqPos += seqIncr)
    {
        // Store the previous current base in preBase.
        data.preBase = data.curBase;

        // Get the current base before checking if we are going to
        // process this position so it will be set for the next position.
        data.curBase = samRecord.getSequence(seqPos);
        if(reverse)
        {
            // Complement the current base.
            // The prebase is already complemented.
            data.curBase = 
                BaseAsciiMap::base2complement[(unsigned int)(data.curBase)];
        }
        
        // Get the reference offset.
        refOffset = cigarPtr->getRefOffset(seqPos);
        if(refOffset == Cigar::INDEX_NA)
        {
            // Not a match/mismatch, so continue to the next one which will
            // not have a previous match/mismatch.
            // Set previous ref offset to a negative so
            // the next one won't be kept.
            prevRefOffset = -2;
            continue;
        }

        // This one is a match.
        refPos = mapPos + refOffset;

        // Check to see if we should process this position.
        // Do not process if it is cycle 0 and:
        //   1) current base is in dbsnp
        if(data.cycle == 0)
        {
            if(!(myDbsnpFile.IsEmpty()) && myDbSNP[refPos])
            {
                // Save the previous reference offset.
                ++myNumDBSnpSkips;
                prevRefOffset = refOffset;
                continue;
            }
        }
        else
        {
            // Do not process if it is not cycle 0 and:
            //   1) previous reference position not adjacent 
            //      (not a match/mismatch)
            //   2) previous base is in dbsnp
            //   3) current base is in dbsnp
            if((!myKeepPrevNonAdjacent && (refOffset != (prevRefOffset + seqIncr))) ||
               (data.preBase == 'K'))
            {
                // Save the previous reference offset.
                prevRefOffset = refOffset;
                continue;
            }
            if(!(myDbsnpFile.IsEmpty()) && 
               (myDbSNP[refPos] ||
                (!myKeepPrevDbsnp && myDbSNP[refPos - seqIncr])))
            {
                ++myNumDBSnpSkips;
                // Save the previous reference offset.
                prevRefOffset = refOffset;
                continue;
            }
        }
        
        // Save the previous reference offset.
        prevRefOffset = refOffset;

        // Set the reference & read bases in the Covariates
        char refBase = (*myReferenceGenome)[refPos];

        if(BaseUtilities::isAmbiguous(refBase))
        {
            // N reference, so skip it when building the table.
            ++myAmbiguous;
            continue;
        }

        if(reverse)
        {
            // The read base was complemented above, so complement the
            // reference base too before comparing them.
            refBase = BaseAsciiMap::base2complement[(unsigned int)(refBase)];
        }

        // Get quality char
        data.qual = 
            BaseUtilities::getPhredBaseQuality(myQualityStrings.oldq[seqPos]);

        // skip bases with quality below the minimum set.
        if(data.qual < myMinBaseQual)
        {
            ++mySubMinQual;
            continue;
        }

        if(BaseUtilities::areEqual(refBase, data.curBase)
           && (BaseAsciiMap::base2int[(unsigned int)(data.curBase)] < 4))
            myBMatchCount++;
        else
            myBMismatchCount++;

        hasherrormodel.setCell(data, refBase);
        myBasecounts++;
    }
    return true;
}
Пример #21
0
/// Realign a BAM stream against a variant graph (DAG) built from a VCF file.
///
/// Alignments are read from "stdin".  For each one, a DAG over a window of the
/// reference (params.dag_window_size) is (re)built from the VCF variants when
/// the read moves to a new reference sequence or far enough through the
/// current window.  Reads selected by shouldRealign() are aligned to the graph
/// with gswalign(); the new alignment is kept only if it passes the
/// acceptance test below, otherwise the original alignment is restored.
///
/// Unless params.dry_run is set, alignments are written to "stdout" through a
/// position-keyed queue so that output stays sorted even when realignment
/// moves a read.  In dry-run mode the read, its graph mapping and the
/// flattened cigar are printed to cout instead.
///
/// The process exits with status 1 if stdin/stdout cannot be opened as BAM or
/// if no VCF file is given / it cannot be opened.
///
/// @param params run configuration (reference, VCF, scoring, thresholds,
///               debug/dry-run switches).
void realign_bam(Parameters& params) {

    FastaReference reference;
    reference.open(params.fasta_reference);

    int dag_window_size = params.dag_window_size;
    
    // open BAM file
    BamReader reader;
    if (!reader.Open("stdin")) {
        cerr << "could not open stdin for reading" << endl;
        exit(1);
    }

    BamWriter writer;
    if (!params.dry_run && !writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }

    // store the names of all the reference sequences in the BAM file
    map<int, string> referenceIDToName;
    vector<RefData> referenceSequences = reader.GetReferenceData();
    int i = 0;
    for (RefVector::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
        referenceIDToName[i] = r->RefName;
        ++i;
    }

    vcf::VariantCallFile vcffile;
    if (!params.vcf_file.empty()) {
        if (!vcffile.open(params.vcf_file)) {
            cerr << "could not open VCF file " << params.vcf_file << endl;
            exit(1);
        }
    } else {
        cerr << "realignment requires VCF file" << endl;
        exit(1);
    }
    vcf::Variant var(vcffile);

    BamAlignment alignment;
    // output queue: alignment position -> alignments starting there
    map<long int, vector<BamAlignment> > alignmentSortQueue;

    // get alignment
    // assemble DAG in region around alignment
    // loop for each alignment in BAM:
    //     update DAG when current alignment gets close to edge of assembled DAG
    //     attempt to realign if read has a certain number of mismatches + gaps or softclips, weighted by basequal
    //     if alignment to DAG has fewer mismatches and gaps than original alignment, use it
    //         flatten read into reference space (for now just output alleles from VCF un-spanned insertions)
    //     write read to queue for streaming re-sorting (some positional change will occur)

    long int dag_start_position = 0;
    string currentSeqname;
    string ref;
    //vector<Cigar> cigars; // contains the Cigar strings of nodes in the graph
    //vector<long int> refpositions; // contains the reference start coords of nodes in the graph
    ReferenceMappings ref_map;
    gssw_graph* graph = gssw_graph_create(0);
    int8_t* nt_table = gssw_create_nt_table();
    int8_t* mat = gssw_create_score_matrix(params.match, params.mism);

    int total_reads = 0;
    int total_realigned = 0;
    int total_improved = 0;
    bool emptyDAG = false; // if the dag is constructed over empty sequence
                           // such as when realigning reads mapped to all-N sequence
    if (params.debug) {
        cerr << "about to start processing alignments" << endl;
    }

    while (reader.GetNextAlignment(alignment)) {

        string& seqname = referenceIDToName[alignment.RefID];

        if (params.debug) {
            cerr << "--------------------------------------------" << endl
                 << "processing alignment " << alignment.Name << " at "
                 << seqname << ":" << alignment.Position << endl;
        }

        /*
        if (!alignment.IsMapped() && graph->size == 0) {
            if (params.debug) {
                cerr << "unable to build DAG using unmapped read "
                     << alignment.Name << " @ "
                     << seqname << ":" << alignment.Position
                     << " no previous mapped read found and DAG currently empty" << endl;
            }
            alignmentSortQueue[dag_start_position+dag_window_size].push_back(alignment);
            continue;
        }
        */

        ++total_reads;

        // kept so a rejected or failed realignment can be rolled back
        BamAlignment originalAlignment = alignment;
        long unsigned int initialAlignmentPosition = alignment.Position;
        //if (dag_start_position == 1) {
        //    dag_start_position = max(1, (int)initialAlignmentPosition - dag_window_size/2);
        //}

        // should we construct a new DAG?  do so when 3/4 of the way through the current one
        // center on current position + 1/2 dag window
        // TODO check this scheme using some scribbles on paper
        // alignment.IsMapped()
        if ((seqname != currentSeqname
             || ((alignment.Position + (alignment.QueryBases.size()/2)
                  > (3*dag_window_size/4) + dag_start_position)))
            && alignment.Position < reference.sequenceLength(seqname)) {

            if (seqname != currentSeqname) {
                if (params.debug) {
                    cerr << "switched ref seqs" << endl;
                }
                dag_start_position = max((long int) 0,
                                         (long int) (alignment.GetEndPosition() - dag_window_size/2));
            // recenter DAG
            } else if (!ref_map.empty()) {
                dag_start_position = dag_start_position + dag_window_size/2;
                dag_start_position = max(dag_start_position,
                                         (long int) (alignment.GetEndPosition() - dag_window_size/2));
            } else {
                dag_start_position = alignment.Position - dag_window_size/2;
            }
            dag_start_position = max((long int)0, dag_start_position);

            // TODO get sequence length and use to bound noted window size (edge case)
            //cerr << "getting ref " << seqname << " " << max((long int) 0, dag_start_position) << " " << dag_window_size << endl;

            // get variants for new DAG
            vector<vcf::Variant> variants;
            if (!vcffile.setRegion(seqname,
                                   dag_start_position + 1,
                                   dag_start_position + dag_window_size)) {
                // this is not necessarily an error; there should be a better way to check for VCF file validity
                /*
                cerr << "could not set region on VCF file to " << currentSeqname << ":"
                     << dag_start_position << "-" << dag_start_position + ref.size()
                     << endl;
                */
                //exit(1);
            } else {

                // check first variant: while a variant sits at (or before) the
                // first base of the window, move the window start back by one
                // and query the region again
                if (vcffile.getNextVariant(var)) {
                    while (var.position <= dag_start_position + 1) {
                        //cerr << "var position == dag_start_position " << endl;
                        dag_start_position -= 1;
                        vcffile.setRegion(seqname,
                                          dag_start_position + 1,
                                          dag_start_position + dag_window_size);
                        if (!vcffile.getNextVariant(var)) { break; }
                    }
                }

                vcffile.setRegion(seqname,
                                  dag_start_position + 1,
                                  dag_start_position + dag_window_size);

                // collect only the variants that lie entirely inside the window
                while (vcffile.getNextVariant(var)) {
                    if (params.debug) cerr << "getting variant at " << var.sequenceName << ":" << var.position << endl;
                    //cerr << var.position << " + " << var.ref.length() << " <= " << dag_start_position << " + " << dag_window_size << endl;
                    //cerr << var.position << " >= " << dag_start_position << endl;
                    if (var.position + var.ref.length() <= dag_start_position + dag_window_size
                        && var.position >= dag_start_position) {
                        variants.push_back(var);
                    }
                }

            }

            //cerr << "dag_start_position " << dag_start_position << endl;
            ref = reference.getSubSequence(seqname,
                                           max((long int) 0, dag_start_position),
                                           dag_window_size); // 0/1 conversion

            // clear graph and metadata
            ref_map.clear();
            //cigars.clear();
            //refpositions.clear();
            gssw_graph_destroy(graph);

            if (params.debug) { cerr << "constructing DAG" << endl; }
            // and build the DAG
            graph = gssw_graph_create(0);
            constructDAGProgressive(graph,
                                    ref_map,
                                    ref,
                                    seqname,
                                    variants,
                                    dag_start_position,
                                    nt_table,
                                    mat,
                                    params.flat_input_vcf);

            if (params.debug) {
                cerr << "graph has " << graph->size << " nodes" << endl;
                cerr << "DAG generated from input variants over "
                     << seqname << ":" << dag_start_position << "-" << dag_start_position + dag_window_size
                     << endl;
            }
            if (params.display_dag) {
                gssw_graph_print(graph);
                /*
                for (Backbone::iterator b = backbone.begin(); b != backbone.end(); ++b) {
                    cout << b->first << " "
                         << b->first->id << " "
                         << b->second.ref_position << " "
                         << b->second.cigar << endl
                         << b->first->seq << endl;
                }
                */
            }

            // explicit grouping; same meaning as the unparenthesised form
            if ((graph->size == 1 && allN(ref)) || graph->size == 0) {
                if (params.debug) {
                    cerr << "DAG is empty (1 node, all N).  Alignment is irrelevant." << endl;
                }
                emptyDAG = true;
            } else {
                emptyDAG = false;
            }

        }

        AlignmentStats stats_before;
        bool was_mapped = alignment.IsMapped();
        bool has_realigned = false;
        if (was_mapped) {
            // the read runs past the DAG window: extend the reference string
            // so it covers the whole alignment
            if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                ref = reference.getSubSequence(seqname,
                                               max((long int) 0, dag_start_position),
                                               alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
            }
        }

        if (params.debug) {
            if (emptyDAG) {
                cerr << "cannot realign against empty (all-N single node) graph" << endl;
            }
        }

        if (!emptyDAG && shouldRealign(alignment, ref, dag_start_position, params, stats_before)) {

            ++total_realigned;

            if (params.debug) {
                cerr << "realigning: " << alignment.Name
                     << " " << alignment.QueryBases << endl
                     << " aligned @ " << alignment.Position
                     << " to variant graph over "
                     << seqname
                     << ":" << dag_start_position
                     << "-" << dag_start_position + dag_window_size << endl;
            }

            try {

                Cigar flat_cigar;
                string read = alignment.QueryBases;
                string qualities = alignment.Qualities;
                int score;
                long int position;
                string strand;
                gssw_graph_mapping* gm =
                    gswalign(graph,
                             ref_map,
                             read,
                             qualities,
                             params,
                             position,
                             score,
                             flat_cigar,
                             strand,
                             nt_table,
                             mat);

                // The mapping is only needed for the dry-run report.  Render
                // it BEFORE destroying it (it used to be printed after the
                // destroy call, i.e. read after being freed), then release it
                // right away so the code below cannot leak it.
                string mapping_description;
                if (params.dry_run) {
                    mapping_description = graph_mapping_to_string(gm);
                }
                gssw_graph_mapping_destroy(gm);

                if (params.dry_run) {

                    if (strand == "-" && !alignment.IsMapped()) {
                        read = reverseComplement(read);
                    }
                    cout << read << endl;
                    cout << mapping_description << endl;
                    cout << score << " " << strand << " "
                         << position << " "
                         << flat_cigar << endl;

                } else {

                    /*
                    if (strand == "-") {
                        read = reverseComplement(trace_report.read);
                    }
                   */
 
                    // TODO the qualities are not on the right side of the read
                    if (strand == "-" && alignment.IsMapped()) {
                        // if we're realigning, this is always true unless we swapped strands
                        alignment.SetIsReverseStrand(true);
                        //reverse(alignment.Qualities.begin(), alignment.Qualities.end()); // reverse qualities
                    }
                    //alignment.QueryBases = reverseComplement(trace_report.read);
                    alignment.QueryBases = read;
                    alignment.Qualities = qualities;

                    alignment.Position = position;// + 1;// + 1;//(trace_report.node->position - 1) + trace_report.x;
                    alignment.SetIsMapped(true);
                    if (!alignment.MapQuality) {
                        alignment.MapQuality = 20; // horrible hack...  at least approximate with alignment mismatches against graph
                    }

                    // check if somehow we've ended up with an indel at the ends
                    // if so, grab the reference sequence right beyond it and add
                    // a single match to the cigar, allowing variant detection methods
                    // to run on the results without internal modification
                    Cigar& cigar = flat_cigar; // alias: edits to cigar edit flat_cigar
                    //cerr << flat_cigar << " " << flat_cigar.readLen() << " " << flat_cigar.refLen() << endl;
                    int flankSize = params.flatten_flank;
                    // leading indel (possibly behind a soft clip): prepend
                    // flankSize reference bases as a match
                    if (cigar.front().isIndel() ||
                        (cigar.front().isSoftclip() && cigar.at(1).isIndel())) {
                        alignment.Position -= flankSize;
                        string refBase = reference.getSubSequence(seqname, alignment.Position, flankSize);
                        if (cigar.front().isSoftclip()) {
                            // drop the soft-clipped bases and their cigar element
                            alignment.QueryBases.erase(alignment.QueryBases.begin(),
                                                       alignment.QueryBases.begin()+cigar.front().length);
                            alignment.Qualities.erase(alignment.Qualities.begin(),
                                                       alignment.Qualities.begin()+cigar.front().length);
                            cigar.erase(cigar.begin());
                        }
                        alignment.QueryBases.insert(0, refBase);
                        alignment.Qualities.insert(0, string(flankSize, shortInt2QualityChar(30)));
                        Cigar newCigar; newCigar.push_back(CigarElement(flankSize, 'M'));
                        newCigar.append(flat_cigar);
                        flat_cigar = newCigar;
                    }
                    // trailing indel (possibly before a soft clip): append
                    // flankSize reference bases as a match
                    if (cigar.back().isIndel() ||
                        (cigar.back().isSoftclip() && cigar.at(cigar.size()-2).isIndel())) {
                        string refBase = reference.getSubSequence(seqname,
                                                                  alignment.Position
                                                                  + flat_cigar.refLen(),
                                                                  flankSize);
                        if (cigar.back().isSoftclip()) {
                            alignment.QueryBases.erase(alignment.QueryBases.end()-cigar.back().length,
                                                       alignment.QueryBases.end());
                            alignment.Qualities.erase(alignment.Qualities.end()-cigar.back().length,
                                                      alignment.Qualities.end());
                            cigar.pop_back();
                        }
                        Cigar newCigar; newCigar.push_back(CigarElement(flankSize, 'M'));
                        flat_cigar.append(newCigar);
                        //flat_cigar.append(newCigar);
                        alignment.QueryBases.append(refBase);
                        alignment.Qualities.append(string(flankSize, shortInt2QualityChar(30)));
                    }

                    flat_cigar.toCigarData(alignment.CigarData);
                    //cerr << flat_cigar << " " << flat_cigar.readLen() << " " << flat_cigar.refLen() << endl;

                    if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                        ref = reference.getSubSequence(seqname,
                                                       max((long int) 0, dag_start_position),
                                                       alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
                    }

                    AlignmentStats stats_after;
                    countMismatchesAndGaps(alignment, flat_cigar, ref, dag_start_position, stats_after, params.debug);
                    /*
                    if ((!was_mapped || (stats_before.softclip_qsum >= stats_after.softclip_qsum
                                         && stats_before.mismatch_qsum >= stats_after.mismatch_qsum))
                         && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {
                    */
                    /*
                    if ((!was_mapped || (stats_before.softclip_qsum + stats_before.mismatch_qsum
                                         >= stats_after.softclip_qsum + stats_after.mismatch_qsum))
                         && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {
                    */

                    // we accept the new alignment if...
                    if (!was_mapped  // it wasn't mapped previously
                        // or if we have removed soft clips or mismatches (per quality) from the alignment
                        //|| ((stats_before.softclip_qsum >= stats_after.softclip_qsum
                        //     && stats_before.mismatch_qsum >= stats_after.mismatch_qsum)
                        || ((stats_before.softclip_qsum + stats_before.mismatch_qsum
                             >= stats_after.softclip_qsum + stats_after.mismatch_qsum)
                            // and if we have added gaps, we have added them to remove mismatches or softclips
                            && (stats_before.gaps >= stats_after.gaps // accept any time we reduce gaps while not increasing softclips/mismatches
                                || (stats_before.gaps < stats_after.gaps // and allow gap increases when they improve the alignment
                                    && (stats_before.softclip_qsum 
                                        + stats_before.mismatch_qsum
                                        >
                                        stats_after.softclip_qsum
                                        + stats_after.mismatch_qsum))))
                            // and the alignment must not have more than the acceptable number of gaps, softclips, or mismatches
                            // as provided in input parameters
                        && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {

                        // keep the alignment
                        // TODO require threshold of softclips to keep alignment (or count of gaps, mismatches,...)
                        if (params.debug) {
                            cerr << "realigned " << alignment.Name << " to graph, which it maps to with "
                                 << stats_after.mismatch_qsum << "q in mismatches and "
                                 << stats_after.softclip_qsum << "q in soft clips" << endl;
                        }
                        ++total_improved;
                        has_realigned = true;
                    } else {
                        // reset to old version of alignment
                        if (params.debug) {
                            cerr << "failed realignment of " << alignment.Name << " to graph, which it maps to with: " 
                                 << stats_after.mismatch_qsum << "q in mismatches " << "(vs " << stats_before.mismatch_qsum << "q before), and "
                                 << stats_after.softclip_qsum << "q in soft clips " << "(vs " << stats_before.softclip_qsum << "q before) " << endl;
                        }
                        has_realigned = false;
                        alignment = originalAlignment;
                    }
                }

            } catch (...) {
                // Deliberate best-effort: any failure while realigning this
                // read is reported and the read is passed through unchanged.
                cerr << "exception when realigning " << alignment.Name
                     << " at position " << referenceIDToName[alignment.RefID]
                     << ":" << alignment.Position
                     << " " << alignment.QueryBases << endl;
                // reset to original alignment
                has_realigned = false;
                alignment = originalAlignment;

            }
        }

        // ensure correct order if alignments move
        long int maxOutputPos = initialAlignmentPosition - dag_window_size;
        // if we switched sequences we need to flush out all the reads from the previous one
        if (seqname != currentSeqname) {
            // so the max output position is set past the end of the last chromosome
            if (!currentSeqname.empty()) {
                maxOutputPos = reference.sequenceLength(currentSeqname) + dag_window_size;
            }
            currentSeqname = seqname;
        }

        if (!params.dry_run) {
            // write out every queued alignment that can no longer be overtaken
            map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
            for ( ; p != alignmentSortQueue.end(); ++p) {
                // except if we are running in unsorted mode, stop when we are at the window size
                if (!params.unsorted_output && p->first > maxOutputPos) {
                    break; // no more to do
                } else {
                    for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a) {
                        writer.SaveAlignment(*a);
                    }
                }
            }
            if (p != alignmentSortQueue.begin()) {
                alignmentSortQueue.erase(alignmentSortQueue.begin(), p);
            }
            // queue the current alignment (only realigned ones if requested)
            if (!params.only_realigned || has_realigned) {
                alignmentSortQueue[alignment.Position].push_back(alignment);
            }
        }
    } // end GetNextAlignment loop

    // flush whatever is still queued
    if (!params.dry_run) {
        map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
        for ( ; p != alignmentSortQueue.end(); ++p) {
            for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a)
                writer.SaveAlignment(*a);
        }
    }

    gssw_graph_destroy(graph);
    free(nt_table);
    free(mat);

    reader.Close();
    writer.Close();

    if (params.debug) {
        cerr << "total reads:\t" << total_reads << endl;
        cerr << "realigned:\t" << total_realigned << endl;
        cerr << "improved:\t" << total_improved << endl;
    }

}