Beispiel #1
0
//{{{ SV_Pair:: SV_Pair(const BamAlignment &bam_a,
// Builds a read pair from two alignments and stores them as read_l / read_r:
//   - alignments on different references: the one with the smaller RefID
//     becomes read_l (reference names are NOT compared as strings)
//   - alignments on the same reference: the one with the smaller start
//     becomes read_l; bam_a wins a tie
// min_mapping_quality is the lower of the two MapQuality values.
// Interval ends are stored as GetEndPosition(false, false) - 1.
SV_Pair::
SV_Pair(const BamAlignment &bam_a,
        const BamAlignment &bam_b,
        const RefVector &refs,
        int _weight,
        int _ev_id,
        SV_PairReader *_reader)
{
    reader = _reader;

    min_mapping_quality = ( bam_a.MapQuality < bam_b.MapQuality ) ?
                          bam_a.MapQuality : bam_b.MapQuality;

    // interval covered by the first alignment
    struct interval first;
    first.start = bam_a.Position;
    first.end = bam_a.GetEndPosition(false, false) - 1;
    first.chr = refs.at(bam_a.RefID).RefName;
    first.strand = bam_a.IsReverseStrand() ? '-' : '+';

    // interval covered by the second alignment
    struct interval second;
    second.start = bam_b.Position;
    second.end = bam_b.GetEndPosition(false, false) - 1;
    second.chr = refs.at(bam_b.RefID).RefName;
    second.strand = bam_b.IsReverseStrand() ? '-' : '+';

    // decide which alignment sits on the left-hand side
    const bool a_is_left =
        ( bam_a.RefID < bam_b.RefID ) ||
        ( ( bam_a.RefID == bam_b.RefID ) &&
          !( first.start > second.start ) );

    if ( a_is_left ) {
        read_l = first;
        read_r = second;
    } else {
        read_l = second;
        read_r = first;
    }

    weight = _weight;
    ev_id = _ev_id;
}
Beispiel #2
0
// Computes how the alignments of a BAM file cover the intervals of BED "B".
//
// Every mapped alignment is turned into one BED entry (or, when _obeySplits
// is set, into one entry per block returned by GetBamBlocks) and counted
// against _bedB. After the whole file has been read, either the counts or the
// coverage are reported, depending on _countsOnly.
//
// bamFile: path of the BAM file to read. The program exits with status 1 if
//          the file cannot be opened.
void BedCoverage::CollectCoverageBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedCovFileIntoMap();

    // open the BAM file; without this check an unreadable file would
    // silently be reported as "no coverage anywhere"
    BamReader reader;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get reference information (needed to map RefID -> chromosome name)
    RefVector refs = reader.GetReferenceData();

    // convert each aligned BAM entry to BED
    // and compute coverage on B
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        // unmapped reads cannot contribute coverage
        if (!bam.IsMapped())
            continue;

        // treat the BAM alignment as a single "block"
        if (_obeySplits == false) {
            // construct a new BED entry from the current BAM alignment.
            BED a;
            a.chrom  = refs.at(bam.RefID).RefName;
            a.start  = bam.Position;
            a.end    = bam.GetEndPosition(false, false);
            a.strand = "+";
            if (bam.IsReverseStrand()) a.strand = "-";

            _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly);
        }
        // split the BAM alignment into discrete blocks and
        // look for overlaps only within each block.
        else {
            // vec to store the discrete BED "blocks" from a
            bedVector bedBlocks;
            // since we are counting coverage, we do want to split blocks when a
            // deletion (D) CIGAR op is encountered (hence the true for the last parm)
            GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, false, true);
            // use countSplitHits to avoid over-counting each split chunk
            // as distinct read coverage.
            _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly);
        }
    }
    // report the coverage (summary or histogram) for BED B.
    if (_countsOnly == true)
        ReportCounts();
    else
        ReportCoverage();
    // close the BAM file
    reader.Close();
}
Beispiel #3
0
//{{{ void process_pair(const BamAlignment &curr,
void
SV_Pair::
process_pair(const BamAlignment &curr,
             const RefVector refs,
             map<string, BamAlignment> &mapped_pairs,
             UCSCBins<SV_BreakPoint*> &r_bin,
             int weight,
             int ev_id,
             SV_PairReader *reader)
{
    if (mapped_pairs.find(curr.Name) == mapped_pairs.end())
        mapped_pairs[curr.Name] = curr;
    else {
        SV_Pair *new_pair = new SV_Pair(mapped_pairs[curr.Name],
                                        curr,
                                        refs,
                                        weight,
                                        ev_id,
                                        reader);
        //cerr << count_clipped(curr.CigarData) << "\t" <<
                //count_clipped(mapped_pairs[curr.Name].CigarData) << endl;
                
        if ( new_pair->is_sane() &&  
             new_pair->is_aberrant() &&
             (count_clipped(curr.CigarData) > 0) &&
             (count_clipped(mapped_pairs[curr.Name].CigarData) > 0) ) {
            SV_BreakPoint *new_bp = new_pair->get_bp();

#ifdef TRACE

            cerr << "READ\t" << 
                    refs.at(mapped_pairs[curr.Name].RefID).RefName << "," <<
                    mapped_pairs[curr.Name].Position << "," <<
                    (mapped_pairs[curr.Name].GetEndPosition(false, false) - 1)
                        << "\t" <<
                    refs.at(curr.RefID).RefName << "," <<
                    curr.Position << "," <<
                    (curr.GetEndPosition(false, false) - 1)
                        <<
                    endl;

            cerr << "\tPE\t" << *new_bp << endl;
#endif
            new_bp->cluster(r_bin);
        } else {
            delete(new_pair);
        }

        mapped_pairs.erase(curr.Name);
    }
}
// Sums, over the references that precede index `id`, each reference length
// plus Parameter::Instance()->max_dist. At most ref.size() entries are summed.
// `id` is converted to size_t before the comparison, exactly as before.
long get_ref_lengths(int id, RefVector ref) {
	const size_t wanted = (size_t) id;
	const size_t stop = (wanted < ref.size()) ? wanted : ref.size();

	long offset = 0;
	for (size_t idx = 0; idx != stop; ++idx) {
		offset += (long) ref[idx].RefLength + (long) Parameter::Instance()->max_dist;
	}
	return offset;
}
Beispiel #5
0
// Builds the genome description from a list of references: every entry
// records its length in _chromSizes (keyed by name) and appends its name to
// _chromList, in the order the references are given.
GenomeFile::GenomeFile(const RefVector &genome) {
    const size_t refCount = genome.size();
    for (size_t idx = 0; idx != refCount; ++idx) {
        const string name = genome[idx].RefName;
        const int size    = genome[idx].RefLength;

        _chromList.push_back(name);
        _chromSizes[name] = size;
    }
}
Beispiel #6
0
long Breakpoint::calc_pos(long pos, RefVector ref) {
	size_t i = 0;
	pos -= ref[i].RefLength;
	while (i < ref.size() && pos >= 0) {
		i++;
		pos -= ref[i].RefLength;
	}
	return pos + ref[i].RefLength;
}
Beispiel #7
0
    void getBamBlocks(const BamAlignment &bam, const RefVector &refs,
                      vector<BED> &blocks, bool breakOnDeletionOps) {

        CHRPOS currPosition = bam.Position;
        CHRPOS blockStart   = bam.Position;
        string chrom        = refs.at(bam.RefID).RefName;
        string name         = bam.Name;
        string strand       = "+";
        string score        = ToString(bam.MapQuality);
        char  prevOp        = '\0';
        if (bam.IsReverseStrand()) strand = "-";
        bool blocksFound = false;

        vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin();
        vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end();
        for ( ; cigItr != cigEnd; ++cigItr ) {
            if (cigItr->Type == 'M') {
                currPosition += cigItr->Length;
                // we only want to create a new block if the current M op
                // was preceded by an N op or a D op (and we are breaking on D ops)
                if ((prevOp == 'D' && breakOnDeletionOps == true) || (prevOp == 'N')) {
                    blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) );
                    blockStart = currPosition;
                }
            }
            else if (cigItr->Type == 'D') {
                if (breakOnDeletionOps == false)
                    currPosition += cigItr->Length;
                else {
                    blocksFound = true;
                    currPosition += cigItr->Length;
                    blockStart    = currPosition;
                }
            }
            else if (cigItr->Type == 'N') {
                blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) );
                blocksFound = true;
                currPosition += cigItr->Length;
                blockStart    = currPosition;
            }
            else if (cigItr->Type == 'S' || cigItr->Type == 'H' || cigItr->Type == 'P' || cigItr->Type == 'I') {
                // do nothing
            }
            else {
                cerr << "Input error: invalid CIGAR type (" << cigItr->Type
                    << ") for: " << bam.Name << endl;
                exit(1);
            }
            prevOp = cigItr->Type;
        }
        // if there were no splits, we just create a block representing the contiguous alignment.
        if (blocksFound == false) {
            blocks.push_back( BED(chrom, bam.Position, currPosition, name, score, strand) );
        }
    }
long fuck_off(long pos, RefVector ref, std::string &chr) {
	size_t i = 0;
	pos -= (ref[i].RefLength + Parameter::Instance()->max_dist);

	while (i < ref.size() && pos >= 0) {
		i++;
		pos -= ((long) ref[i].RefLength + (long) Parameter::Instance()->max_dist);
	}
	chr = ref[i].RefName;
	return pos + ref[i].RefLength + (long) Parameter::Instance()->max_dist;
}
Beispiel #9
0
std::string Breakpoint::get_chr(long pos, RefVector ref) {
//	std::cout << "pos: " << pos << std::endl;
	size_t id = 0;
	while (id < ref.size() && pos >= 0) {
		pos -= (long) ref[id].RefLength;
		//	std::cout << id << std::endl;
		id++;
	}

	return ref[id - 1].RefName;
}
//-------------------------------------------------------------------------
// Reorders the lines of this XList by their element count.
//
// order: "descend" puts the lines with the most elements first, "ascend" puts
//        them last. Any other value leaves the list untouched.
//
// The previous implementation emptied the list before looking at `order`, so
// an unrecognised value left the XList empty and the temporary copies
// unreferenced. The order is now validated before anything is modified.
void XList::sortByElementNumber(String order){

	const bool descending = (order == "descend");
	const bool ascending  = (order == "ascend");
	if(!descending && !ascending) return;

	// Record the element count of every line together with its index, and
	// copy every line into a temporary vector
	LKVector spk(0,0);
	RefVector<XLine> tmpX;
	for(unsigned long i=0;i<_vector.size();i++){
		LKVector::type sps;
		sps.idx = i;
		sps.lk = _vector.getObject(i).getElementCount();
		spk.addValue(sps);

		// NOTE(review): the copies are heap-allocated and handed to _vector
		// below; presumably _vector takes care of deleting them -- confirm.
		XLine *ll = new XLine(_vector.getObject(i));
		tmpX.addObject(*ll);
	}

	// Largest element count first
	spk.descendingSort();

	// Remove all elements from the XList
	_vector.deleteAllObjects();

	// Refill the XList, walking the ranking forwards or backwards
	const unsigned long count = tmpX.size();
	for(unsigned long i=0;i<count;i++){
		const unsigned long rank = descending ? i : count-1-i;
		_vector.addObject(tmpX.getObject(spk[rank].idx));
	}
}
Beispiel #11
0
 void getBamBlocks(const BamAlignment &bam, const RefVector &refs, 
                   BedVec &blocks, bool breakOnDeletionOps) {
 
 	CHRPOS currPosition = bam.Position;
     CHRPOS blockStart   = bam.Position;
     string chrom        = refs.at(bam.RefID).RefName;
     string name         = bam.Name;
     string strand       = "+";
     float  score        = bam.MapQuality;
     if (bam.IsReverseStrand()) strand = "-"; 
 	
 	vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin();
 	vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end();
     for ( ; cigItr != cigEnd; ++cigItr ) {
         if (cigItr->Type == 'M') {
             currPosition += cigItr->Length;
 			blocks.push_back( Bed(chrom, blockStart, currPosition, name, score, strand) );
 			blockStart    = currPosition;
         }
         else if (cigItr->Type == 'D') {
             if (breakOnDeletionOps == false)
                 currPosition += cigItr->Length;
             else {
                 currPosition += cigItr->Length;
                 blockStart    = currPosition;
             }
         }
         else if (cigItr->Type == 'N') {
             currPosition += cigItr->Length;
             blockStart    = currPosition;            }
         else if (cigItr->Type == 'S' || cigItr->Type == 'H' || cigItr->Type == 'P' || cigItr->Type == 'I') {
             // do nothing
         }
         else {
             cerr << "Input error: invalid CIGAR type (" << cigItr->Type
 				<< ") for: " << bam.Name << endl;
 			exit(1);
         }
 	}
 }
Beispiel #12
0
// Intersects the alignments of a BAM file ("A") with the BED "B" file.
//
// With _bamOutput set, alignments are written as BAM to stdout: those with an
// overlap, or (when _noHit is set) those without one, including unmapped
// reads. Without _bamOutput, overlaps are reported through FindOverlaps /
// FindBlockedOverlaps. _obeySplits selects block-wise overlap detection.
//
// bamFile: path of the BAM file to read. The program exits with status 1 if
//          the file cannot be opened.
void BedIntersect::IntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB = new BedFile(_bedBFile);
    _bedB->loadBedFileIntoMap();

    // create a dummy BED A file for printing purposes if not
    // using BAM output.
    if (_bamOutput == false) {
        _bedA = new BedFile(_bedAFile);
        _bedA->bedType = 12;
    }
    // open the BAM file; previously the result was ignored, so an unreadable
    // file looked like an input without any alignments
    BamReader reader;
    BamWriter writer;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }
    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs    = reader.GetReferenceData();
    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }
    vector<BED> hits;
    // reserve some space
    hits.reserve(100);
    BamAlignment bam;
    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        // save an unaligned read if -v; the writer is only opened for BAM
        // output, so never hand it an alignment otherwise
        if (!bam.IsMapped()) {
            if ((_bamOutput == true) && (_noHit == true))
                writer.SaveAlignment(bam);
            continue;
        }
        // break alignment into discrete blocks,
        bedVector bed_blocks;
        string chrom = refs.at(bam.RefID).RefName;
        GetBamBlocks(bam, chrom, bed_blocks, false, true);
        // create a basic BED entry from the BAM alignment
        BED bed;
        MakeBedFromBam(bam, chrom, bed_blocks, bed);
        bool overlapsFound = false;
        if (_obeySplits == true)
        {
            // find the hits that overlap with the full span of the blocked BED
            _bedB->allHits(bed.chrom, bed.start, bed.end, bed.strand,
                           hits, _sameStrand, _diffStrand,
                           _overlapFraction, _reciprocal);
            // find the overlaps between the block in A and B
            overlapsFound = FindBlockedOverlaps(bed, bed_blocks, hits, _bamOutput);
        }
        else if (_bamOutput == true)
        {
            // BAM output only needs to know whether there is any overlap
            overlapsFound = _bedB->anyHits(bed.chrom, bed.start, bed.end,
                                           bed.strand, _sameStrand, _diffStrand,
                                           _overlapFraction, _reciprocal);
        }
        else
        {
            FindOverlaps(bed, hits);
        }
        // save the BAM alignment if overlap reqs. were met: an overlap is
        // wanted unless -v (_noHit) asks for the alignments without one
        if ((_bamOutput == true) && (overlapsFound != _noHit))
            writer.SaveAlignment(bam);
        hits.clear();
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
Beispiel #13
0
void TagBam::Tag() {

    // open the annotations files for processing;
    OpenAnnoFiles();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
	if (!reader.Open(_bamFile)) {
        cerr << "Failed to open BAM file " << _bamFile << endl;
        exit(1);
    }
    
    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // set compression mode
    BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
//    if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
    writer.SetCompressionMode(compressionMode);
    // open our BAM writer
    writer.Open("stdout", bamHeader, refs);

    // rip through the BAM file and test for overlaps with each annotation file.
    BamAlignment al;
    vector<BED> hits;

    while (reader.GetNextAlignment(al)) {
        if (al.IsMapped() == true) {
            BED a;
            a.chrom = refs.at(al.RefID).RefName;
            a.start = al.Position;
            a.end   = al.GetEndPosition(false, false);
            a.strand = "+";
            if (al.IsReverseStrand()) a.strand = "-";
            
            ostringstream annotations;
            // annotate the BAM file based on overlaps with the annotation files.
            for (size_t i = 0; i < _annoFiles.size(); ++i) 
            {
                // grab the current annotation file.
                BedFile *anno = _annoFiles[i];
                
                if (!_useNames && !_useScores && !_useIntervals) {
                    // add the label for this annotation file to tag if there is overlap
                    if (anno->anyHits(a.chrom, a.start, a.end, a.strand, 
                                      _sameStrand, _diffStrand, _overlapFraction, false))
                    {
                        annotations << _annoLabels[i] << ";";
                    }
                }
                // use the score field
                else if (!_useNames && _useScores && !_useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand, 0.0, false);
                    for (size_t i = 0; i < hits.size(); ++i) {
                        annotations << hits[i].score;
                        if (i < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
                // use the name field from the annotation files to populate tag
                else if (_useNames && !_useScores && !_useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand, 0.0, false);
                    for (size_t j = 0; j < hits.size(); ++j) {
                        annotations << hits[j].name;
                        if (j < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
                // use the full interval information annotation files to populate tag
                else if (!_useNames && !_useScores && _useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand,  0.0, false);
                    for (size_t j = 0; j < hits.size(); ++j) {
                        annotations << _annoLabels[i]  << ":" << 
                                        hits[j].chrom  << ":" <<
                                        hits[j].start  << "-" <<
                                        hits[j].end    << "," <<
                                        hits[j].name   << "," <<
                                        hits[j].score  << "," <<
                                        hits[j].strand;
                        if (j < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
            }
            // were there any overlaps with which to make a tag?
            if (annotations.str().size() > 0) {
                al.AddTag(_tag, "Z", annotations.str().substr(0, annotations.str().size() - 1)); // get rid of the last ";"
            }
        }
        writer.SaveAlignment(al);
    }
    reader.Close();
    writer.Close();
    // close the annotations files;
    CloseAnnoFiles();
}
Beispiel #14
0
// this has been copied from bamtools utilities, since it isn't in the API. Original file is bamtools_utilities.cpp.
// Like the rest of Bamtools, it is under the BSD license.
bool Filter::ParseRegionString(const string& regionString, BamRegion& region)
{
    // -------------------------------
    // parse region string
    
    // check first for empty string
    if ( regionString.empty() ) 
        return false;   
    
    // non-empty string, look for a colom
    size_t foundFirstColon = regionString.find(':');
    
    // store chrom strings, and numeric positions
    string chrom;
    int startPos;
    int stopPos;
    
    // no colon found
    // going to use entire contents of requested chromosome 
    // just store entire region string as startChrom name
    // use BamReader methods to check if its valid for current BAM file
    if ( foundFirstColon == string::npos ) {
        chrom = regionString;
        startPos   = 0;
        stopPos    = -1;
    }
    
    // colon found, so we at least have some sort of startPos requested
    else {
        
        // store start chrom from beginning to first colon
        chrom = regionString.substr(0,foundFirstColon);
        
        // look for ".." after the colon
        size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
        
        // no dots found
        // so we have a startPos but no range
        // store contents before colon as startChrom, after as startPos
        if ( foundRangeDots == string::npos ) {
            startPos   = atoi( regionString.substr(foundFirstColon+1).c_str() ); 
            stopPos    = -1;
        } 
        
        // ".." found, so we have some sort of range selected
        else {
            
            // store startPos between first colon and range dots ".."
            startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
            
            // look for second colon
            size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
            
            // no second colon found
            // so we have a "standard" chrom:start..stop input format (on single chrom)
            if ( foundSecondColon == string::npos ) {
                stopPos    = atoi( regionString.substr(foundRangeDots+2).c_str() );
            } else {
                return false;
            }
        }
    }
    
    // -------------------------------
    // validate reference IDs & genomic positions
    
    const RefVector references = getReferences();
    
    int RefID = -1;
    for(int i = 0; i < references.size(); i++) {
        if(references[i].RefName == chrom)
            RefID = i;
    }
    
    // if startRefID not found, return false
    if ( RefID == -1 ) {
        cerr << "Can't find chromosome'" << chrom << "'" << endl;
        return false;
    }
    
    // startPos cannot be greater than or equal to reference length
    const RefData& startReference = references.at(RefID);
    if ( startPos >= startReference.RefLength ) {
        cerr << "Start position (" << startPos << ") after end of the reference sequence (" << startReference.RefLength << ")" << endl;
        return false;
    }
    
    // stopPosition cannot be larger than reference length
    const RefData& stopReference = references.at(RefID);
    if ( stopPos > stopReference.RefLength ) {
        cerr << "Start position (" << stopPos << ") after end of the reference sequence (" << stopReference.RefLength << ")" << endl;
        return false;
    }

    // if no stopPosition specified, set to reference end
    if ( stopPos == -1 ) stopPos = stopReference.RefLength;
    
    // -------------------------------
    // set up Region struct & return
    
    region.LeftRefID     = RefID;
    region.LeftPosition  = startPos;
    region.RightRefID    = RefID;;
    region.RightPosition = stopPos;
    return true;
}
Beispiel #15
0
 bool check(const PropertyFilter& filter, const BamAlignment& al) {
   
     bool keepAlignment = true;
     const PropertyMap& properties = filter.Properties;
     PropertyMap::const_iterator propertyIter = properties.begin();
     PropertyMap::const_iterator propertyEnd  = properties.end();
     for ( ; propertyIter != propertyEnd; ++propertyIter ) {
       
         // check alignment data field depending on propertyName
         const string& propertyName = (*propertyIter).first;
         const PropertyFilterValue& valueFilter = (*propertyIter).second;
         
         if      ( propertyName == ALIGNMENTFLAG_PROPERTY )  keepAlignment &= valueFilter.check(al.AlignmentFlag);
         else if ( propertyName == CIGAR_PROPERTY ) {
             stringstream cigarSs;
             const vector<CigarOp>& cigarData = al.CigarData;
             if ( !cigarData.empty() ) {
                 vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
                 vector<CigarOp>::const_iterator cigarIter = cigarBegin;
                 vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
                 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
                     const CigarOp& op = (*cigarIter);
                     cigarSs << op.Length << op.Type;
                 }
                 keepAlignment &= valueFilter.check(cigarSs.str());
             }
         }
         else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= valueFilter.check(al.InsertSize);
         else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsDuplicate());
         else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= valueFilter.check(al.IsFailedQC());
         else if ( propertyName == ISFIRSTMATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsFirstMate());
         else if ( propertyName == ISMAPPED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsMapped());
         else if ( propertyName == ISMATEMAPPED_PROPERTY )         keepAlignment &= valueFilter.check(al.IsMateMapped());
         else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )  keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
         else if ( propertyName == ISPAIRED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsPaired());
         else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )   keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
         else if ( propertyName == ISPROPERPAIR_PROPERTY )         keepAlignment &= valueFilter.check(al.IsProperPair());
         else if ( propertyName == ISREVERSESTRAND_PROPERTY )      keepAlignment &= valueFilter.check(al.IsReverseStrand());
         else if ( propertyName == ISSECONDMATE_PROPERTY )         keepAlignment &= valueFilter.check(al.IsSecondMate());
         else if ( propertyName == ISSINGLETON_PROPERTY ) {
             const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped();
             keepAlignment &= valueFilter.check(isSingleton);
         }
         else if ( propertyName == MAPQUALITY_PROPERTY )           keepAlignment &= valueFilter.check(al.MapQuality);
         else if ( propertyName == MATEPOSITION_PROPERTY )         keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) );
         else if ( propertyName == MATEREFERENCE_PROPERTY ) {
             if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
             BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID");
             const string& refName = filterToolReferences.at(al.MateRefID).RefName;
             keepAlignment &= valueFilter.check(refName);
         }
         else if ( propertyName == NAME_PROPERTY )                 keepAlignment &= valueFilter.check(al.Name);
         else if ( propertyName == POSITION_PROPERTY )             keepAlignment &= valueFilter.check(al.Position);
         else if ( propertyName == QUERYBASES_PROPERTY )           keepAlignment &= valueFilter.check(al.QueryBases);
         else if ( propertyName == REFERENCE_PROPERTY ) {
             BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID");
             const string& refName = filterToolReferences.at(al.RefID).RefName;
             keepAlignment &= valueFilter.check(refName);
         }
         else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al);
         else BAMTOOLS_ASSERT_UNREACHABLE;
         
         // if alignment fails at ANY point, just quit and return false
         if ( !keepAlignment ) return false;
     }
   
     BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
     return keepAlignment;
 }
Beispiel #16
0
// Computes genome coverage from the alignments of a BAM file.
//
// The BAM header references are loaded into _genome, then every mapped
// alignment adds coverage according to the first mode that applies:
//   _pair_chip_    : coverage spanning the pair, added once per pair by the
//                    first mate
//   _haveSize      : a span of _fragmentSize positions anchored at the read
//   default        : the blocks returned by GetBamBlocks
//   _only_5p_end / _only_3p_end : a single position at the 5' / 3' end
// Alignments on the wrong strand (when _filterByStrand is set) are skipped.
// After the last alignment, the final chromosome, the empty chromosomes and
// the overall coverage are reported.
//
// bamFile: path of the BAM file to read. The program exits with status 1 if
//          the file cannot be opened.
// NOTE(review): StartNewChrom() is called whenever the reference name
// changes, which suggests the input is expected to be grouped by
// chromosome -- confirm.
void BedGenomeCoverage::CoverageBam(string bamFile) {

    ResetChromCoverage();

    // open the BAM file
    BamReader reader;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string header = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // load the BAM header references into a BEDTools "genome file"
    _genome = new GenomeFile(refs);
    // convert each aligned BAM entry to BED
    // and compute coverage on B
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        // skip if the read is unaligned
        if (bam.IsMapped() == false)
            continue;

        // strand used for the strand filter below only
        bool _isReverseStrand = bam.IsReverseStrand();

        //changing second mate's strand to opposite
        if( _dUTP && bam.IsPaired() && bam.IsMateMapped() && bam.IsSecondMate())
            _isReverseStrand = !bam.IsReverseStrand();

        // skip if we care about strands and the strand isn't what
        // the user wanted
        if ( (_filterByStrand == true) &&
             ((_requestedStrand == "-") != _isReverseStrand) )
            continue;

        // extract the chrom, start and end from the BAM alignment
        // (end is made inclusive by the -1)
        string chrom(refs.at(bam.RefID).RefName);
        CHRPOS start = bam.Position;
        CHRPOS end = bam.GetEndPosition(false, false) - 1;

        // are we on a new chromosome?
        if ( chrom != _currChromName )
            StartNewChrom(chrom);
        if(_pair_chip_) {
            // Skip if not a proper pair
            if (bam.IsPaired() && (!bam.IsProperPair() or !bam.IsMateMapped()) )
                continue;
            // Skip if wrong coordinates
            if( ( (bam.Position<bam.MatePosition) && bam.IsReverseStrand() ) ||
                ( (bam.MatePosition < bam.Position) && bam.IsMateReverseStrand() ) ) {
                    //chemically designed: left on positive strand, right on reverse one
                    continue;
            }

            /*if(_haveSize) {
                if (bam.IsFirstMate() && bam.IsReverseStrand()) { //put fragmentSize in to the middle of pair end_fragment
                    int mid = bam.MatePosition+abs(bam.InsertSize)/2;
                    if(mid<_fragmentSize/2)
                        AddCoverage(0, mid+_fragmentSize/2);
                    else
                        AddCoverage(mid-_fragmentSize/2, mid+_fragmentSize/2);
                }
                else if (bam.IsFirstMate() && bam.IsMateReverseStrand()) { //put fragmentSize in to the middle of pair end_fragment
                    int mid = start+abs(bam.InsertSize)/2;
                    if(mid<_fragmentSize/2)
                        AddCoverage(0, mid+_fragmentSize/2);
                    else
                        AddCoverage(mid-_fragmentSize/2, mid+_fragmentSize/2);
                }
            } else */

            // only the first mate adds coverage, so each pair is counted once
            if (bam.IsFirstMate() && bam.IsReverseStrand()) { //prolong to the mate to the left
                AddCoverage(bam.MatePosition, end);
            }
            else if (bam.IsFirstMate() && bam.IsMateReverseStrand()) { //prolong to the mate to the right
                AddCoverage(start, start + abs(bam.InsertSize) - 1);
            }
        } else if (_haveSize) {
            // a span of _fragmentSize positions: ending at the read's end for
            // reverse-strand reads, starting at the read's start otherwise
            if(bam.IsReverseStrand()) {
                if(end<_fragmentSize) { //sometimes fragmentSize is bigger :(
                    AddCoverage(0, end);
                } else {
                    AddCoverage(end + 1 - _fragmentSize, end );
                }
            } else {
                AddCoverage(start,start+_fragmentSize - 1);
            }
        } else
        // add coverage accordingly.
        // (this if/else-if chain is the body of the "else" just above)
        if (!_only_5p_end && !_only_3p_end) {
            bedVector bedBlocks;
            // we always want to split blocks when a D CIGAR op is found.
            // if the user invokes -split, we want to also split on N ops.
            if (_obeySplits) { // "D" true, "N" true
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, true);
            }
            else { // "D" true, "N" false
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, false);
            }
            AddBlockedCoverage(bedBlocks);
        }
        else if (_only_5p_end) {
            // 5' end: the start for forward reads, the end for reverse reads
            CHRPOS pos = ( !bam.IsReverseStrand() ) ? start : end;
            AddCoverage(pos,pos);
        }
        else if (_only_3p_end) {
            // 3' end: the end for forward reads, the start for reverse reads
            CHRPOS pos = ( bam.IsReverseStrand() ) ? start : end;
            AddCoverage(pos,pos);
        }
    }
    // close the BAM
    reader.Close();

    // process the results of the last chromosome.
    ReportChromCoverage(_currChromCoverage, _currChromSize,
            _currChromName, _currChromDepthHist);

    // report all empty chromsomes
    PrintEmptyChromosomes();

    // report the overall coverage if asked.
    PrintFinalCoverage();
}
Beispiel #17
0
//{{{ SV_SplitRead:: SV_SplitRead(vector< BamAlignment > &block,
// Builds one piece of split-read evidence from two alignments.
//
// The alignments are sorted into a left and a right side: the one with the
// smaller RefID goes left; when both share a RefID the one with the smaller
// start position goes left, and bam_a stays left on a tie.  The breakpoint
// type is then chosen from the strands of the two sides and from the
// relative order of the qs_pos values that calc_query_pos_from_cigar
// computes from each alignment's CIGAR data.
//
// Throws the int 1, after writing a report to stderr, when both sides are on
// the same strand and their qs_pos values are equal (no type fits).
SV_SplitRead::
SV_SplitRead(const BamAlignment &bam_a,
             const BamAlignment &bam_b,
             const RefVector &refs,
             int _weight,
             int _id,
             int _sample_id,
             SV_SplitReadReader *_reader)
{
    reader = _reader;
    sample_id = _sample_id;

    // keep the lower of the two mapping qualities
    min_mapping_quality = (bam_a.MapQuality < bam_b.MapQuality)
                          ? bam_a.MapQuality
                          : bam_b.MapQuality;

    const bool a_reversed = bam_a.IsReverseStrand();
    const bool b_reversed = bam_b.IsReverseStrand();

    struct cigar_query query_a =
        calc_query_pos_from_cigar(bam_a.CigarData, a_reversed);
    struct cigar_query query_b =
        calc_query_pos_from_cigar(bam_b.CigarData, b_reversed);

    // reference-space interval covered by each alignment
    struct interval tmp_a, tmp_b;

    tmp_a.strand = a_reversed ? '-' : '+';
    tmp_a.chr = refs.at(bam_a.RefID).RefName;
    tmp_a.start = bam_a.Position;
    tmp_a.end = bam_a.GetEndPosition();

    tmp_b.strand = b_reversed ? '-' : '+';
    tmp_b.chr = refs.at(bam_b.RefID).RefName;
    tmp_b.start = bam_b.Position;
    tmp_b.end = bam_b.GetEndPosition();

    // bam_a belongs on the right when it sorts strictly after bam_b by
    // (RefID, start)
    const bool a_is_right =
        (bam_a.RefID > bam_b.RefID) ||
        ( (bam_a.RefID == bam_b.RefID) && (tmp_a.start > tmp_b.start) );

    side_l  = a_is_right ? tmp_b   : tmp_a;
    side_r  = a_is_right ? tmp_a   : tmp_b;
    query_l = a_is_right ? query_b : query_a;
    query_r = a_is_right ? query_a : query_b;

    if (side_l.strand != side_r.strand) {
        type = SV_BreakPoint::INVERSION;
    } else {
        // both sides share one strand, and it is either '+' or '-'
        const bool forward = (side_l.strand == '+');
        const bool l_before_r = (query_l.qs_pos < query_r.qs_pos);
        const bool r_before_l = (query_l.qs_pos > query_r.qs_pos);

        if (forward ? l_before_r : r_before_l) {
            type = SV_BreakPoint::DELETION;
        } else if (forward ? r_before_l : l_before_r) {
            type = SV_BreakPoint::DUPLICATION;
        } else {
            // equal qs_pos on the same strand: nothing matches
            cerr << "ERROR IN BAM FILE.  " <<
                 "TYPE not detected (DELETION,DUPLICATION,INVERSION)" <<
                 endl;
            cerr << "\t" << query_l.qs_pos << "," << side_l.strand << "\t" <<
                 query_r.qs_pos << "," << side_r.strand << "\t" <<
                 tmp_a.chr << "," << tmp_a.start << "," << tmp_a.end << "\t" <<
                 tmp_b.chr << "," << tmp_b.start << "," << tmp_b.end << "\t" <<
                 endl;

            throw(1);
        }
    }

    weight = _weight;
    id = _id;
}
Beispiel #18
0
// ValidateReaders checks that all the readers point to BAM files representing
// alignments against the same set of reference sequences, and that the
// sequences are identically ordered.
//
// Every reader is compared against the first one: sort order first, then the
// reference count, then each reference's name and length in order.  Entries
// whose reader pointer is null are skipped.
//
// Returns true when 0 or 1 readers are open, or when every check passes.
// Returns false when the first reader's pointer is null (no error string is
// set in that case) or at the first mismatch, after recording a description
// of it through SetErrorString.  This function only reports the failure; it
// does not terminate the program itself.
bool BamMultiReaderPrivate::ValidateReaders() const
{

    m_errorString.clear();

    // skip if 0 or 1 readers opened
    if (m_readers.empty() || (m_readers.size() == 1)) return true;

    // retrieve first reader
    const MergeItem& firstItem = m_readers.front();
    const BamReader* firstReader = firstItem.Reader;
    if (firstReader == 0) return false;

    // retrieve first reader's header data
    const SamHeader& firstReaderHeader = firstReader->GetHeader();
    const std::string& firstReaderSortOrder = firstReaderHeader.SortOrder;

    // retrieve first reader's reference data
    const RefVector& firstReaderRefData = firstReader->GetReferenceData();
    const int firstReaderRefCount = firstReader->GetReferenceCount();
    const int firstReaderRefSize = firstReaderRefData.size();

    // iterate over all readers
    std::vector<MergeItem>::const_iterator readerIter = m_readers.begin();
    std::vector<MergeItem>::const_iterator readerEnd = m_readers.end();
    for (; readerIter != readerEnd; ++readerIter) {
        const MergeItem& item = (*readerIter);
        BamReader* reader = item.Reader;
        if (reader == 0) continue;

        // get current reader's header data
        const SamHeader& currentReaderHeader = reader->GetHeader();
        const std::string& currentReaderSortOrder = currentReaderHeader.SortOrder;

        // check compatible sort order
        if (currentReaderSortOrder != firstReaderSortOrder) {
            const std::string message =
                std::string("mismatched sort order in ") + reader->GetFilename() + ", expected " +
                firstReaderSortOrder + ", but found " + currentReaderSortOrder;
            SetErrorString("BamMultiReader::ValidateReaders", message);
            return false;
        }

        // get current reader's reference data
        // (bound by const reference, like the first reader's, to avoid
        // copying the whole vector for every reader)
        const RefVector& currentReaderRefData = reader->GetReferenceData();
        const int currentReaderRefCount = reader->GetReferenceCount();
        const int currentReaderRefSize = currentReaderRefData.size();

        // init reference data iterators
        RefVector::const_iterator firstRefIter = firstReaderRefData.begin();
        RefVector::const_iterator firstRefEnd = firstReaderRefData.end();
        RefVector::const_iterator currentRefIter = currentReaderRefData.begin();

        // compare reference counts from BamReader ( & container size, in case of BR error)
        if ((currentReaderRefCount != firstReaderRefCount) ||
            (firstReaderRefSize != currentReaderRefSize)) {
            std::stringstream s;
            s << "mismatched reference count in " << reader->GetFilename() << ", expected "
              << firstReaderRefCount << ", but found " << currentReaderRefCount;
            SetErrorString("BamMultiReader::ValidateReaders", s.str());
            return false;
        }

        // this will be ok; we just checked above that we have identically-sized sets of references
        // here we simply check if they are all, in fact, equal in content
        while (firstRefIter != firstRefEnd) {
            const RefData& firstRef = (*firstRefIter);
            const RefData& currentRef = (*currentRefIter);

            // compare reference name & length
            if ((firstRef.RefName != currentRef.RefName) ||
                (firstRef.RefLength != currentRef.RefLength)) {
                std::stringstream s;
                s << "mismatched references found in " << reader->GetFilename()
                  << ", expected: " << std::endl;

                // print first reader's reference data into the same message
                // stream, using the same "name length" layout as the
                // "but found" list below so the two can be compared directly
                RefVector::const_iterator refIter = firstReaderRefData.begin();
                RefVector::const_iterator refEnd = firstReaderRefData.end();
                for (; refIter != refEnd; ++refIter) {
                    const RefData& entry = (*refIter);
                    s << entry.RefName << ' ' << entry.RefLength << std::endl;
                }

                s << "but found: " << std::endl;

                // print current reader's reference data
                refIter = currentReaderRefData.begin();
                refEnd = currentReaderRefData.end();
                for (; refIter != refEnd; ++refIter) {
                    const RefData& entry = (*refIter);
                    s << entry.RefName << ' ' << entry.RefLength << std::endl;
                }

                SetErrorString("BamMultiReader::ValidateReaders", s.str());
                return false;
            }

            // update iterators
            ++firstRefIter;
            ++currentRefIter;
        }
    }

    // if we get here, everything checks out
    return true;
}
Beispiel #19
0
// Runs the window intersection with a BAM file as the "A" input.
//
// Each mapped alignment read from bamFile is turned into a BED record
// (chrom, start, end, name, score, strand) and tested against the "B" file.
// When _bamOutput is set, alignments are written as BAM to stdout: those
// with at least one window overlap when _noHit is false, those with none
// when _noHit is true.  Otherwise the record is handed to
// FindWindowOverlaps.
//
// Unmapped alignments are written only when _noHit and _bamOutput are both
// set; the BAM writer is opened only under _bamOutput, so it must not be
// used otherwise.
void BedWindow::WindowIntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);

    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs    = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;                   // vector of potential hits
    // reserve some space
    hits.reserve(100);

    // NOTE(review): presumably 6 because the records built below carry the
    // six BED fields chrom/start/end/name/score/strand — confirm.
    _bedA->bedType = 6;
    BamAlignment bam;
    bool overlapsFound;
    // walk through every alignment in the BAM file.
    while (reader.GetNextAlignment(bam)) {

        if (bam.IsMapped()) {
            BED a;
            a.chrom = refs.at(bam.RefID).RefName;
            a.start = bam.Position;
            a.end   = bam.GetEndPosition(false, false);

            // build the name field from the BAM alignment.
            a.name = bam.Name;
            if (bam.IsFirstMate()) a.name += "/1";
            if (bam.IsSecondMate()) a.name += "/2";

            a.score  = ToString(bam.MapQuality);
            a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-";

            if (_bamOutput == true) {
                overlapsFound = FindOneOrMoreWindowOverlaps(a);
                // keep hits normally; keep only the misses under _noHit
                if (overlapsFound == true) {
                    if (_noHit == false)
                        writer.SaveAlignment(bam);
                }
                else {
                    if (_noHit == true)
                        writer.SaveAlignment(bam);
                }
            }
            else {
                FindWindowOverlaps(a, hits);
                // empty the scratch vector for the next alignment
                hits.clear();
            }
        }
        // BAM IsMapped() is false: an unmapped read cannot overlap anything,
        // so it counts as a "no hit".  Only write it when BAM output was
        // requested, because the writer is not open otherwise.
        else if ((_noHit == true) && (_bamOutput == true)) {
            writer.SaveAlignment(bam);
        }
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
Beispiel #20
0
int main(int argc, char* argv[])
{

    //{{{ setup
    double trim_threshold = 1e-10;
    double merge_threshold = 1e-10;
    int min_weight = 0;
    int min_sample_weight = 0;
    bool show_evidence = false;
    bool has_pe_bams = false,
	has_sr_bams = false,
	has_bedpes = false;
    CHR_POS window_size = 1000000;
    string inter_chrom_file_prefix = "./";
    int call_id = 0;
    bool has_next = false;
    vector<SV_EvidenceReader*>::iterator i_er;
    UCSCBins<SV_BreakPoint*> r_bin;
    srand (time(NULL));
    string exclude_bed_file;
    bool has_exclude = false;
    string genome_file; 
    bool has_genome_file = false;
    int print_prob = 0;
    int bedpe_output = 0;
    //vector<string> bam_files;
    //}}}

    //{{{ check to see if we should print out some help
    if (argc == 1)
        ShowHelp();

    for(int i = 1; i < argc; i++) {
        int parameterLength = (int)strlen(argv[i]);

        if( (PARAMETER_CHECK("-h", 2, parameterLength)) ||
                (PARAMETER_CHECK("--help", 6, parameterLength))) {
            ShowHelp();
        }
    }
    //}}}

    //{{{ do some parsing and setup
    vector<SV_EvidenceReader*> evidence_readers;
    map<pair<string,string>, SV_EvidenceReader*> bam_evidence_readers;

    for(int i = 1; i < argc; i++) {
        int parameterLength = (int)strlen(argv[i]);

        if(PARAMETER_CHECK("-pe", 3, parameterLength)) {
            //{{{
            has_pe_bams = true;
            SV_PairReader *pe_r = new SV_PairReader();

            if ((i+1) < argc) {
                char *params = argv[i + 1];
                char *param_val, *brka, *brkb;

                for (	param_val = strtok_r(params, ",", &brka);
                        param_val;
                        param_val = strtok_r(NULL, ",", &brka)) {
                    char *param = strtok_r(param_val, ":", &brkb);
                    char *val = strtok_r(NULL, ":", &brkb);

                    if (val == NULL) {
                        cerr << "Parameter required for " << param << endl;
                        ShowHelp();
                    }

                    if ( ! pe_r->add_param(param, val) ) {
                        cerr << "Unknown pair end parameter:" << param << endl;
                        ShowHelp();
                    }
                }
            }

            string msg = pe_r->check_params();
            if ( msg.compare("") == 0 ) {
                // Add to list of readers
                // Set the distro map

                pe_r->initialize();
                SV_Evidence::distros[pe_r->ev_id] =
                    pair<log_space*,log_space*>(
                        SV_Pair::get_bp_interval_probability('+',
                                pe_r->distro_size,
                                pe_r->distro),
                        SV_Pair::get_bp_interval_probability('-',
                                pe_r->distro_size,
                                pe_r->distro));
                SV_Evidence::distros_size[pe_r->ev_id] = pe_r->distro_size;
            } else {
                cerr << "missing pair end parameters:" << msg << endl;
                ShowHelp();
            }

	    // create SV_EvidenceReaders by (BAM, read_group) pairs
	    if (pe_r->read_group.size() == 0)
	        bam_evidence_readers[pair<string,string> (pe_r->get_source_file_name(),"")] = pe_r;
	    else {
	        for (vector<string>::iterator it = pe_r->read_group.begin();
		     it != pe_r->read_group.end();
		     ++it) {
		    pair<string,string> ev_pair (pe_r->get_source_file_name(),*it);
		    bam_evidence_readers[ev_pair] = pe_r;
		}
	    }

            i++;
            //}}}
        }

        else if(PARAMETER_CHECK("-bedpe", 6, parameterLength)) {
            //{{{
            has_bedpes = true;
            SV_BedpeReader *be_r = new SV_BedpeReader();

            if ((i+1) < argc) {
                char *params = argv[i + 1];
                char *param_val, *brka, *brkb;

                for (	param_val = strtok_r(params, ",", &brka);
                        param_val;
                        param_val = strtok_r(NULL, ",", &brka)) {
                    char *param = strtok_r(param_val, ":", &brkb);
                    char *val = strtok_r(NULL, ":", &brkb);

                    if (val == NULL) {
                        cerr << "Parameter requied for " << param << endl;
                        ShowHelp();
                    }

                    if ( ! be_r->add_param(param, val) ) {
                        cerr << "Unknown bedpe parameter:" << param << endl;
                        ShowHelp();
                    }
                }
            }

            string msg = be_r->check_params();
            if ( msg.compare("") == 0 ) {
                be_r->initialize();
                SV_Evidence::distros[be_r->ev_id] =
                    pair<log_space*,log_space*>(
                        SV_Bedpe::
                        get_bp_interval_probability('+',
                                                    be_r->distro_size,
                                                    be_r->distro),
                        SV_Bedpe::
                        get_bp_interval_probability('-',
                                                    be_r->distro_size,
                                                    be_r->distro));
                SV_Evidence::distros_size[be_r->ev_id] = be_r->distro_size;
                evidence_readers.push_back(be_r);
            } else {
                cerr << "missing bedpe parameters:" << msg << endl;
                ShowHelp();
            }

            i++;
            //}}}
        }

        else if(PARAMETER_CHECK("-sr", 3, parameterLength)) {
            //{{{
            has_sr_bams = true;
            SV_SplitReadReader *sr_r = new SV_SplitReadReader();

            if ((i+1) < argc) {
                char *params = argv[i + 1];
                char *param_val, *brka, *brkb;

                for (	param_val = strtok_r(params, ",", &brka);
                        param_val;
                        param_val = strtok_r(NULL, ",", &brka)) {
                    char *param = strtok_r(param_val, ":", &brkb);
                    char *val = strtok_r(NULL, ":", &brkb);

                    if (val == NULL) {
                        cerr << "Parameter required for " << param << endl;
                        ShowHelp();
                    }

                    if ( ! sr_r->add_param(param, val) ) {
                        cerr << "Unknown split read parameter:" << param << endl;
                        ShowHelp();
                    }
                }
            }

            string msg = sr_r->check_params();
            if ( msg.compare("") == 0 ) {
                sr_r->initialize();
                SV_Evidence::distros[sr_r->ev_id] =
                    pair<log_space*,log_space*>(
                        SV_SplitRead::
                        get_bp_interval_probability('+',
                                                    sr_r->back_distance),
                        SV_SplitRead::
                        get_bp_interval_probability('-',
                                                    sr_r->back_distance));

                SV_Evidence::distros_size[sr_r->ev_id] =
                    sr_r->back_distance * 2 + 1;
            } else {
                cerr << "missing split read parameters:" << msg << endl;
                ShowHelp();
            }

	    // create SV_EvidenceReaders by (BAM, read_group) pairs
	    if (sr_r->read_group.size() == 0)
	        bam_evidence_readers[pair<string,string> (sr_r->get_source_file_name(),"")] = sr_r;
	    else {
	        for (vector<string>::iterator it = sr_r->read_group.begin();
		     it != sr_r->read_group.end();
		     ++it) {
		    pair<string,string> ev_pair (sr_r->get_source_file_name(),*it);
		    bam_evidence_readers[ev_pair] = sr_r;
		}
	    }

            i++;
            //}}}
        }

        else if(PARAMETER_CHECK("-tt", 3, parameterLength)) {
            if ((i+1) < argc) {
                trim_threshold = 1 - atof(argv[i + 1]);
                i++;
            }
        }

        else if(PARAMETER_CHECK("-mt", 3, parameterLength)) {
            if ((i+1) < argc) {
                merge_threshold = atof(argv[i + 1]);
                i++;
            }
        }

        else if(PARAMETER_CHECK("-mw", 3, parameterLength)) {
            if ((i+1) < argc) {
                min_weight = atoi(argv[i + 1]);
                i++;
            }
        }

        else if(PARAMETER_CHECK("-msw", 4, parameterLength)) {
            if ((i+1) < argc) {
                min_sample_weight = atoi(argv[i + 1]);
                i++;
            }
        }
	
        else if(PARAMETER_CHECK("-w", 2, parameterLength)) {
            if ((i+1) < argc) {
                window_size = atoi(argv[i + 1]);
                i++;
            }
        }

        else if(PARAMETER_CHECK("-x", 2, parameterLength)) {
            if ((i+1) < argc) {
                exclude_bed_file = argv[i + 1];
                has_exclude = true;
                i++;
            }
        }

        else if(PARAMETER_CHECK("-g", 2, parameterLength)) {
            if ((i+1) < argc) {
                genome_file = argv[i + 1];
                has_genome_file = true;
                i++;
            }
        }


        else if(PARAMETER_CHECK("-t", 2, parameterLength)) {
            if ((i+1) < argc) {
                inter_chrom_file_prefix = argv[i + 1];
                i++;
            }
        }

        else if(PARAMETER_CHECK("-e", 2, parameterLength)) {
            show_evidence = true;
        }

        else if(PARAMETER_CHECK("-P", 2, parameterLength)) {
            print_prob = 1;
        }

	else if(PARAMETER_CHECK("-b", 2, parameterLength)) {
            bedpe_output = 1;
        }

        else {
            cerr << endl << "*****ERROR: Unrecognized parameter: " <<
                 argv[i] << " *****" << endl << endl;
            ShowHelp();
        }
    }

    if (min_weight == 0 && min_sample_weight == 0) {
        cerr << endl << "*****ERROR: must set min weight or min sample weight  *****" <<
             endl << endl;
        ShowHelp();
    }

    SV_BreakPoint::p_trim_threshold = trim_threshold;
    SV_BreakPoint::p_merge_threshold = merge_threshold;

    // append rand number to the temp inter-chrom file
    inter_chrom_file_prefix = inter_chrom_file_prefix + ToString(rand());

    if (has_exclude)
        parse_exclude_file(exclude_bed_file, SV_Evidence::exclude_regions);

    SV_BamReader *bam_r;
    if (has_pe_bams || has_sr_bams) {
        bam_r = new SV_BamReader(&bam_evidence_readers);
        bam_r->set_inter_chrom_file_name(inter_chrom_file_prefix + ".bam");
        bam_r->initialize();
        evidence_readers.push_back(bam_r);
    }

    map<string, int> genome_order;
    if (has_genome_file) {
        GenomeFile *genome;
        genome  = new GenomeFile(genome_file);
        vector<string> chroms = genome->getChromList();
        vector<string>::iterator chr_itr;
        int chr_count = 0;
        for (chr_itr = chroms.begin(); chr_itr != chroms.end(); ++chr_itr) {
            genome_order[*chr_itr] = chr_count;
            chr_count += 1;
        }
    } else if (has_pe_bams || has_sr_bams) {
        //map<string, SV_EvidenceReader*> bam_evidence_readers;
        //map<string, SV_EvidenceReader*>::iterator bam_itr;
        //bam_itr = bam_evidence_readers.begin();
        RefVector refs = bam_r->refs;
        vector<RefData>::iterator ref_itr;
        int chr_count = 0;
        for (ref_itr = refs.begin(); ref_itr != refs.end(); ++ref_itr) {
            RefData r = *ref_itr;
            genome_order[r.RefName] = chr_count;
            chr_count += 1;
        }
    } else {
            cerr << endl << "*****ERROR: Unknown chromosome order.  " <<
                "Chromosome order must be\nspecified by either bam header " <<
                "or a genome file *****" << endl << endl;
	    ShowHelp();
    }

    //}}} end parsing 

    //{{{ Test if there lines to process in each input file
    for ( i_er = evidence_readers.begin();
            i_er != evidence_readers.end();
            ++i_er) {
        SV_EvidenceReader *er = *i_er;
        has_next = has_next || er->has_next();
    }
    //}}}

    // print VCF header
    if (! bedpe_output) {
	SV_VcfVariant *header_var = new SV_VcfVariant();
	map<int,string>::iterator s_itr;
	for (s_itr = SV_EvidenceReader::sample_names.begin();
	     s_itr != SV_EvidenceReader::sample_names.end();
	     ++s_itr)
	    header_var->add_sample(s_itr->second);

	// add appropriate fields to active_formats
	header_var->set_sample_field(SV_EvidenceReader::
				     sample_names.begin()->second,
				     "GT",
				     "./.");
	header_var->set_sample_field(SV_EvidenceReader::
				     sample_names.begin()->second,
				     "SU",
				     "0");
	
	if (has_pe_bams)
	    header_var->set_sample_field(SV_EvidenceReader::
					 sample_names.begin()->second,
					 "PE",
					 "0");
	if (has_sr_bams)
	    header_var->set_sample_field(SV_EvidenceReader::
					 sample_names.begin()->second,
					 "SR",
					 "0");

	if (has_bedpes)
	    header_var->set_sample_field(SV_EvidenceReader::
					 sample_names.begin()->second,
					 "BD",
					 "0");
    	header_var->print_header();
        delete(header_var);
    }
    
    //{{{ process the intra-chrom events that were saved to a file
    CHR_POS max_pos = 0;
    string last_min_chr = "";
    while ( has_next ) {
        string min_chr = "";
        //{{{ find min_chr among all input files
        for ( i_er = evidence_readers.begin();
                i_er != evidence_readers.end();
                ++i_er) {
            SV_EvidenceReader *er = *i_er;

            if ( er->has_next() ) {
                string curr_chr = er->get_curr_chr();
                if ( ( min_chr.compare("") == 0 ) ||
                     ( genome_order[curr_chr] < genome_order[min_chr] ) ) {
                    min_chr = curr_chr;
                }
            }
        }
        //}}}

        //{{{ if the chrome switches, reset the max_pos
        if (last_min_chr.compare(min_chr) != 0) {
            max_pos = window_size;
            last_min_chr = min_chr;
        }
        //}}}

        cerr << min_chr << "\t" << max_pos << endl;
        bool input_processed = true;

        while (input_processed) {
            input_processed = false;

            //{{{ read the files
            for ( i_er = evidence_readers.begin();
                    i_er != evidence_readers.end();
                    ++i_er) {

                SV_EvidenceReader *er = *i_er;

                if ( er->has_next() ) {
                    string curr_chr = er->get_curr_chr();
                    CHR_POS curr_pos = er->get_curr_pos();
                    if ( ( genome_order[curr_chr] <= genome_order[min_chr] ) &&
                         ( curr_pos < max_pos) ) {
                        er->process_input_chr_pos(curr_chr, max_pos, r_bin);
                        input_processed = true;
                    }
                }
            }
            //}}}

            //{{{ call breakpoints
            vector< UCSCElement<SV_BreakPoint*> > values =
                r_bin.values(min_chr, max_pos);

            vector< UCSCElement<SV_BreakPoint*> >::iterator it;
            for (it = values.begin(); it < values.end(); ++it) {
                SV_BreakPoint *bp = it->value;

                // Make sure both ends of the bp are less than or equal to the
                // current chrom
                if (bp->weight >= min_weight
		    && bp->get_max_sample_weight() >= min_sample_weight) {
                    //bp->do_it();
                    // make sure there was not an error with trimming
                    if (bp->trim_intervals() > 0) {
		        if (bedpe_output) {
			    bp->print_bedpe(++call_id, print_prob);
			    if (show_evidence)
			        bp->print_evidence("\t");
			}
			else {
			    SV_VcfVariant *vcf_var =
				new SV_VcfVariant(bp,
						  ++call_id,
						  print_prob);
			    vcf_var->print_var();
                            delete(vcf_var);
			}
                    }
                }

                if (r_bin.remove(*it, false, false, true) != 0) {
                    cerr << "Error removing element:" << *bp << endl;
                    abort();
                }
                bp->free_evidence();
                delete bp;
            }
            //}}}

            // move the window
            max_pos = max_pos *2;
        }

        //{{{ Test if there is still input lines
        has_next = false;
        for ( i_er = evidence_readers.begin();
                i_er != evidence_readers.end();
                ++i_er) {
            SV_EvidenceReader *er = *i_er;
            has_next = has_next || er->has_next();
        }
        //}}}
    }
    //}}}

    //{{{ terminate input files
    for ( i_er = evidence_readers.begin();
            i_er != evidence_readers.end();
            ++i_er) {
        SV_EvidenceReader *er = *i_er;
        er->terminate();
    }
    //}}}

    //{{{ Call remaining intra breakpoints
    vector< UCSCElement<SV_BreakPoint*> > values = r_bin.values();
    vector< UCSCElement<SV_BreakPoint*> >::iterator it;

    for ( it = values.begin();
            it != values.end(); ++it) {
        SV_BreakPoint *bp = it->value;

	if (bp->weight >= min_weight
	    && bp->get_max_sample_weight() >= min_sample_weight) {
            //bp->do_it();
            if (bp->trim_intervals() > 0) {
		if (bedpe_output) {
		    bp->print_bedpe(++call_id, print_prob);
		    if (show_evidence)
			bp->print_evidence("\t");
		}
		else {
		    SV_VcfVariant *vcf_var =
			new SV_VcfVariant(bp,
					  ++call_id,
					  print_prob);
		    vcf_var->print_var();
		}
            }
        }

        if (r_bin.remove(*it, false, false, true) != 0) {
            cerr << "Error removing element" << endl;
            abort();
        }
        bp->free_evidence();
        delete bp;
    }
    //}}}

    //{{{ process the inter-chrom events that were saved to a file
    string intra_bam_file_name = inter_chrom_file_prefix + ".bam";
    ifstream intra_bam_file( intra_bam_file_name.c_str() );
    if (intra_bam_file.good()) {
        intra_bam_file.close();

        sort_inter_chrom_bam( inter_chrom_file_prefix + ".bam",
                              inter_chrom_file_prefix + ".sort.bam");

        SV_InterChromBamReader *ic_r = new SV_InterChromBamReader(
            inter_chrom_file_prefix + ".sort.bam",
            &bam_evidence_readers);
        ic_r->initialize();

        vector<SV_EvidenceReader*> inter_chrom_evidence_readers;
        inter_chrom_evidence_readers.push_back(ic_r);

        // There are two files containg all of the inter-chrom events, one bam
        // and one bedpe, each line in the file corresponds to the properies
        // set in one of the readers.  Each line has a "LS" (lumpy source)
        // property that gives its source file name.  Using that entry, the
        // line will be sent to the reader for processing.

        // get new evidence readers for both bedpe and bam inter-chrom
        // readers
        has_next = true;

        int32_t last_min_primary_refid = -1;
        int32_t last_min_secondary_refid = -1;
        max_pos = 0;
        while ( has_next ) {
            string min_primary_chr = "";
            string min_secondary_chr = "";
            int32_t min_primary_refid = -1;
            int32_t min_secondary_refid = -1;

            //{{{ find min_chr pair among all input files
            for ( i_er = inter_chrom_evidence_readers.begin();
                    i_er != inter_chrom_evidence_readers.end();
                    ++i_er) {
                SV_EvidenceReader *er = *i_er;

                if ( er->has_next() ) {
                    int32_t curr_primary_refid = er->get_curr_primary_refid();
                    int32_t curr_secondary_refid =
                        er->get_curr_secondary_refid();

                    if ( (( min_primary_refid == -1 ) &&
                            ( min_secondary_refid == -1 )) ||
                            (( curr_primary_refid < min_primary_refid)  &&
                             ( curr_secondary_refid < min_secondary_refid)) ) {
                        min_primary_refid = curr_primary_refid;
                        min_secondary_refid = curr_secondary_refid;
                        min_secondary_chr = er->get_curr_secondary_chr();
                        min_primary_chr = er->get_curr_primary_chr();
                    }
                }
            }
            //}}}

            // if the chrome pair switches, reset the max_pos
            if ( (last_min_primary_refid != min_primary_refid) ||
                    (last_min_secondary_refid != min_secondary_refid) ) {
                max_pos = window_size;
                last_min_primary_refid = min_primary_refid;
                last_min_secondary_refid = min_secondary_refid;
            }

            bool input_processed = true;

            while (input_processed) {
                input_processed = false;

                //{{{ read the files, process anything in frame
                for ( i_er = inter_chrom_evidence_readers.begin();
                        i_er != inter_chrom_evidence_readers.end();
                        ++i_er) {

                    SV_EvidenceReader *er = *i_er;

                    if ( er->has_next() ) {
                        int32_t curr_primary_refid =
                            er->get_curr_primary_refid();
                        int32_t curr_secondary_refid =
                            er->get_curr_secondary_refid();
                        CHR_POS curr_pos = er->get_curr_primary_pos();

                        if ( (curr_primary_refid <= min_primary_refid) &&
                                (curr_secondary_refid <= min_secondary_refid) &&
                                (curr_pos < max_pos) ) {

                            er->process_input_chr_pos(
                                er->get_curr_primary_chr(),
                                er->get_curr_secondary_chr(),
                                max_pos,
                                r_bin);
                            input_processed = true;
                        }
                    }
                }
                //}}}

                //{{{ get breakpoints
                // get anything that has ends in both chroms
                vector< UCSCElement<SV_BreakPoint*> > values =
                    r_bin.values(min_secondary_chr);

                vector< UCSCElement<SV_BreakPoint*> >::iterator it;

                for (it = values.begin(); it < values.end(); ++it) {
                    SV_BreakPoint *bp = it->value;
		    if (bp->weight >= min_weight
			&& bp->get_max_sample_weight() >= min_sample_weight) {
                        //bp->do_it();
                        if (bp->trim_intervals() > 0) {
			    if (bedpe_output) {
				bp->print_bedpe(++call_id, print_prob);
				if (show_evidence)
				    bp->print_evidence("\t");
			    }
			    else {
				SV_VcfVariant *vcf_var =
				    new SV_VcfVariant(bp,
						      ++call_id,
						      print_prob);
				vcf_var->print_var();
			    }
                        }
                    }

                    if (r_bin.remove(*it, false, false, true) != 0) {
                        cerr << "Error removing element" << endl;
                        abort();
                    }
                    bp->free_evidence();
                    delete bp;
                }
                //}}}

                max_pos = max_pos * 2;
            }

            has_next = false;
            //{{{ Test if there is still input lines
            for ( i_er = inter_chrom_evidence_readers.begin();
                    i_er != inter_chrom_evidence_readers.end();
                    ++i_er) {
                SV_EvidenceReader *er = *i_er;
                has_next = has_next || er->has_next();
            }
            //}}}
        }

        //{{{ Call remaining break points
        values = r_bin.values();

        for (it = values.begin(); it != values.end(); ++it) {
            SV_BreakPoint *bp = it->value;
	    if (bp->weight >= min_weight
		&& bp->get_max_sample_weight() >= min_sample_weight) {
                //bp->do_it();
                if (bp->trim_intervals() > 0) {
		    if (bedpe_output) {
			bp->print_bedpe(++call_id, print_prob);
			if (show_evidence)
			    bp->print_evidence("\t");
		    }
		    else {
			SV_VcfVariant *vcf_var =
			    new SV_VcfVariant(bp,
					      ++call_id,
					      print_prob);
			vcf_var->print_var();
		    }
                }
            }

            if (r_bin.remove(*it, false, false, true) != 0) {
                cerr << "Error removing element" << endl;
                abort();
            }
            bp->free_evidence();
            delete bp;
        }
        //}}}

        for ( i_er = inter_chrom_evidence_readers.begin();
                i_er != inter_chrom_evidence_readers.end();
                ++i_er) {
            SV_EvidenceReader *er = *i_er;
            delete(er);
        }
    }
    //}}}

    //{{{ free up stuff
    string s = inter_chrom_file_prefix + ".bam";
    remove(s.c_str());
    s = inter_chrom_file_prefix + ".sort.bam";
    remove(s.c_str());
    map<int, pair<log_space*,log_space*> >::iterator e_it;
    for( e_it =  SV_Evidence::distros.begin();
            e_it !=  SV_Evidence::distros.end();
            ++e_it) {
        free(e_it->second.first);
        free(e_it->second.second);
    }
#if 0
    for ( i_er = evidence_readers.begin();
            i_er != evidence_readers.end();
            ++i_er) {
        SV_EvidenceReader *er = *i_er;
        delete(er);
    }
#endif
    evidence_readers.clear();
    bam_evidence_readers.clear();
    //}}}
    return 0;
}
Beispiel #21
0
// Same as ParseRegionString() above, but accepts a BamMultiReader
bool ParseRegionString(const string& regionString,
                                  const BamReader& reader,
                                  BamRegion& region)
{
    // -------------------------------
    // parse region string

    // check first for empty string
    if ( regionString.empty() )
        return false;

    //cerr << "ParseRegionString Input: " << regionString << endl;

    // non-empty string, look for a colom
    size_t foundFirstColon = regionString.find(':');

    // store chrom strings, and numeric positions
    string startChrom;
    string stopChrom;
    int startPos;
    int stopPos;

    // no colon found
    // going to use entire contents of requested chromosome
    // just store entire region string as startChrom name
    // use BamReader methods to check if its valid for current BAM file
    if ( foundFirstColon == string::npos ) {
        startChrom = regionString;
        startPos   = 0;
        stopChrom  = regionString;
        stopPos    = -1;
    }

    // colon found, so we at least have some sort of startPos requested
    else {
        // store start chrom from beginning to first colon
        startChrom = regionString.substr(0,foundFirstColon);

        // look for ".." after the colon
        size_t foundRangeDots = regionString.find("..", foundFirstColon+1);

        // no dots found
        // so we have a startPos but no range
        // store contents before colon as startChrom, after as startPos
        if ( foundRangeDots == string::npos )
        {
            startPos   = atoi( regionString.substr(foundFirstColon+1).c_str() );
            stopChrom  = startChrom;
            stopPos    = -1;
        }

        // ".." found, so we have some sort of range selected
        else {

            // store startPos between first colon and range dots ".."
            startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );

            // look for second colon
            size_t foundSecondColon = regionString.find(':', foundRangeDots+1);

            // no second colon found
            // so we have a "standard" chrom:start..stop input format (on single chrom)
            if ( foundSecondColon == string::npos ) {
                stopChrom  = startChrom;
                stopPos    = atoi( regionString.substr(foundRangeDots+2).c_str() );
            }

            // second colon found
            // so we have a range requested across 2 chrom's
            else {
                stopChrom  = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
                stopPos    = atoi( regionString.substr(foundSecondColon+1).c_str() );
            }
        }
    }
    
    // -------------------------------
    // validate reference IDs & genomic positions
    const RefVector references = reader.GetReferenceData();

    // if startRefID not found, return false
    int startRefID = reader.GetReferenceID(startChrom);
    if ( startRefID == -1 ) return false;
    // startPos cannot be greater than or equal to reference length
    const RefData& startReference = references.at(startRefID);
    if ( startPos >= startReference.RefLength ) return false;

    // if stopRefID not found, return false
    int stopRefID = reader.GetReferenceID(stopChrom);
    if ( stopRefID == -1 ) return false;

    // stopPosition cannot be larger than reference length
    const RefData& stopReference = references.at(stopRefID);
    if ( stopPos > stopReference.RefLength ) return false;

    // if no stopPosition specified, set to reference end
    if ( stopPos == -1 ) stopPos = stopReference.RefLength;

    // -------------------------------
    // set up Region struct & return

    region.LeftRefID     = startRefID;
    region.LeftPosition  = startPos;
    region.RightRefID    = stopRefID;;
    region.RightPosition = stopPos;

    //cerr << "ParseRegionString " << region.LeftRefID <<  " " << region.LeftPosition << " " << region.RightPosition << endl;
    return true;
}
Beispiel #22
0
int main ( int argc, char *argv[] ) { 

  struct parameters *param = 0;
  param = interface(param, argc, argv);

  //region file input (the region file should be sorted as the same way as the bam file)
  ifstream region_f;
  region_f.open(param->region_f, ios_base::in);  // the region file is opened

  //bam input and generate index if not yet 
  //-------------------------------------------------------------------------------------------------------+
  // BAM input (file or filenames?)                                                                        |
  //-------------------------------------------------------------------------------------------------------+
  char *fof = param->mapping_f;
  FILE *IN=NULL;
  char linefof[5000];
  int filecount=0;
  vector <string> fnames;

  if (strchr(fof,' ')!=NULL) {
    char *ptr;
    ptr=strtok(fof," ");
    while (ptr!=NULL) {
      fnames.push_back(ptr);
      filecount++;
      ptr=strtok(NULL," ");
    }
  } else {
    IN=fopen(fof,"rt");
    if (IN!=NULL) {
      long linecount=0;
      while (fgets(linefof,5000-1,IN)!=NULL) {
        linecount++;
        if (linefof[0]!='#' && linefof[0]!='\n') {
          char *ptr=strchr(linefof,'\n');
          if (ptr!=NULL && ptr[0]=='\n') {
            ptr[0]='\0';
          }
          FILE *dummy=NULL;
          dummy=fopen(linefof,"rt");
          if (dummy!=NULL) {     // seems to be a file of filenames...
            fclose(dummy);
            fnames.push_back(linefof);
            filecount++;
          } else if (filecount==0 || linecount>=1000-1) {  // seems to be a single file
            fnames.push_back(fof);
            filecount++;
            break;
          }
        }
      }
      fclose(IN);
    }
  }  //file or file name decided and stored in vector "fnames"

  cerr << "the input mapping files are:" << endl;
  vector <string>::iterator fit = fnames.begin();
  for(; fit != fnames.end(); fit++) {
    cerr << *fit << endl;
  }

  //-------------------------------------------------------------------------------------------------------+
  // end of file or filenames                                                                              |
  //-------------------------------------------------------------------------------------------------------+

  // open the BAM file(s)
  BamMultiReader reader;
  reader.Open(fnames);

  // get header & reference information
  string header = reader.GetHeaderText();
  RefVector refs = reader.GetReferenceData();

  if ( ! reader.LocateIndexes() )     // opens any existing index files that match our BAM files
    reader.CreateIndexes();         // creates index files for BAM files that still lack one


  // locus bias
  struct lb empty_profile = {0,0,0,0};
  vector <struct lb> locus_b(1000, empty_profile);
  // output locus bias file
  string locus_bias_set = param->lbias;
  ofstream locus_bias;
  if ( locus_bias_set != "" ) {
    locus_bias.open(param->lbias);
    if ( !locus_bias ) {
      cerr << "can not open locus_bias file.\n";
      exit(0);
    }
  }

  //should decide which chromosome
  string line;
  string old_chr = "SRP";
  string type = param->type;

  //whether do some position-level pile-up stuff
  bool posc = false;
  ofstream posc_f;
  ofstream chrmap_f;
  string poscset = param->posc;
  if ( poscset != "" ) {
    posc = true;
    posc_f.open(param->posc);
    chrmap_f.open(param->chrmap);
  }

  bool noChr;
  if ( param->nochr == 1 ){
    noChr = true;
  } else {
    noChr = false;
  }

  //regions for the input of region file
  deque <struct region> regions;

  getline(region_f, line); //get the first line
  eatline(line,regions,noChr);
  
  deque <struct region>::iterator it = regions.begin();

  while ( it->chr != old_chr ) {

    old_chr = it->chr;  // set the current chr as old chr

    int chr_id  = reader.GetReferenceID(it->chr);

    if ( chr_id == -1 ) {  //reference not found

      for (; it != regions.end() && it->chr == old_chr; ) {
        gene_processing(*it,locus_b);           // print the old region info
        it = regions.erase(it);         // erase the current region
      }
  
      while ( regions.empty() ) {    
        getline(region_f, line);
        if ( region_f.eof() ){
          cerr << "finished: end of region file, zone 0" << endl;
          break;
        }
        eatline(line, regions,noChr);
        it = regions.begin();
        if (it->chr == old_chr){  
          gene_processing(*it,locus_b);      
          regions.clear();
          continue;
        }
      }
      continue;
    }

    int chr_len = refs.at(chr_id).RefLength;

    if ( !reader.SetRegion(chr_id, 1, chr_id, chr_len) ) // here set region
      {
        cerr << "bamtools count ERROR: Jump region failed " << it->chr << endl;
        reader.Close();
        exit(1);
      }

    //pile-up pos stats
    set <string> fragment;
    map <string, unsigned int> pileup;
    bool isposPileup = false;
    unsigned int old_start   = 0;
    unsigned int total_tags  = 0;
    unsigned int total_pos   = 0;
    unsigned int pileup_pos  = 0;


    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {

      if ( bam.IsMapped() == false ) continue;   // skip unaligned reads

      unsigned int unique;
      bam.GetTag("NH", unique);
      if (param->unique == 1) {
        if (unique != 1) {                       // skipe uniquelly mapped reads
          continue;
        }
      }

      if (read_length == 0){
        read_length = bam.Length;
      }

      //cout << bam.Name << endl;
      string chrom = refs.at(bam.RefID).RefName;
      string strand = "+";
      if (bam.IsReverseStrand()) strand = "-";

      unsigned int alignmentStart =  bam.Position+1;
      unsigned int mateStart;
      if (type == "p") mateStart = bam.MatePosition+1;
      unsigned int alignmentEnd = bam.GetEndPosition();
      unsigned int cigarEnd;
      vector <int> blockLengths;
      vector <int> blockStarts;
      blockStarts.push_back(0);
      ParseCigar(bam.CigarData, blockStarts, blockLengths, cigarEnd);


      // position check for unique mapped reads (because is paired-end reads, shoule base on fragment level for paired end reads)
      if (posc == true && unique == 1) {

        if (type == "p" && fragment.count(bam.Name) > 0) 
          fragment.erase(bam.Name);

        else {

          total_tags++;
          if (type == "p"){
            fragment.insert(bam.Name);
          }
          string alignSum;
          if (type == "p") {
             alignSum = int2str(alignmentStart) + "\t" + int2str(mateStart) + "\t.\t" + strand;
          } else {
             alignSum = int2str(alignmentStart) + "\t" + int2str(alignmentEnd) + "\t.\t" + strand;
          }

          if ( alignmentStart != old_start ) {
            isposPileup = false;
            map <string, unsigned int>::iterator pit = pileup.begin();            
            for (; pit != pileup.end(); pit++) {
              posc_f << chrom << "\truping\tpileup\t" << pit->first << "\t.\t" << "Pileup=" << pit->second << endl;     //print pileup
            }
            pileup.clear();           //clear pileup set
            pileup.insert( pair <string, unsigned int> (alignSum, 1) );  //insert the new read
            total_pos++;
          }

          else if ( alignmentStart == old_start ) { // same starts
            if ( pileup.count(alignSum) > 0 ) {  // pileup
              if ( pileup[alignSum] == 1 && isposPileup == false ) { 
                pileup_pos++; isposPileup = true;
              }
              pileup[alignSum]++;
            }
            else {
              pileup.insert( pair <string, unsigned int> (alignSum, 1) );
            }
          } //same starts

        }   //new fragment

        old_start = alignmentStart;
      } // do pos check



      float incre = 1.;
      if (blockStarts.size() > 1) incre = 0.5;     // incre half for junction reads
      incre /= static_cast < float >(unique);        // for multi aligned reads

      deque <struct region>::iterator iter = regions.begin();

      if ( iter->start > alignmentEnd ) continue;  // skip reads not overlapping with the first region

      while ( iter->chr == old_chr && iter->start <= alignmentEnd && iter != regions.end() ) {

        if (iter->end < alignmentStart) {            // the region end is beyond the alignmentStart

          gene_processing(*iter,locus_b);            // processing
          iter = regions.erase(iter);                // this region should be removed
          if ( regions.empty() ) { 
            getline(region_f, line);                        // get a line of region file
            if ( ! region_f.eof() ) {
              eatline(line, regions, noChr);                         // eat a line and put it into the duque
              iter = regions.begin();
            }
            else {  // it's reaching the end of the region file
              cerr << "finished: end of region file, zone 3" << endl;
              break;
            }
          }
          continue;
        }

        if (iter->end >= alignmentStart && iter->start <= alignmentEnd) {  //overlapping, should take action

          vector <int>::iterator cigit = blockStarts.begin();
          for (; cigit != blockStarts.end(); cigit++) {
            unsigned int current_start = *cigit + alignmentStart;
            int current_pos = current_start - (iter->start);
            //cout << iter->chr << "\t" << iter->start << "\t" << iter->end << "\t" << current_start << endl;
            if ( (iter->tags).count(current_pos) > 0 ) {
              (iter->tags)[current_pos] += incre;
            }
            else
              (iter->tags).insert( pair<int, float>(current_pos, incre) );  
          }

        }  // overlapping take action!

        if ( (iter+1) != regions.end() )
          iter++;                                           // if this region is not the last element in the deque
        else {                                              // the last element
          getline(region_f, line);                          // get a line of region file
          if ( ! region_f.eof() ){
            eatline(line, regions, noChr);                         // eat a line and put it into the duque
            iter = regions.end();
            iter--;
          }
          else {  //it's reaching the end of the region file
            cerr << "finished: end of region file, zone 4" << endl;
            break;
          }
        }

      } //while

    }  // read a bam


    // print chr map
    if (posc == true) {
      chrmap_f << old_chr << "\t" << total_tags << "\t" << total_pos << "\t" << pileup_pos << endl;
    } 
 
    //somehow to loop back
    it = regions.begin();                   //reset to begin
    for (; it != regions.end() && it->chr == old_chr; ) {
      gene_processing(*it,locus_b);              // print the old region info
      it = regions.erase(it);             // erase the current region
    }
  
    while ( regions.empty() ) {    

      getline(region_f, line);
      if ( region_f.eof() ){
        cerr << "finished: end of region file, zone 5" << endl;
        //print locus bias
        for (unsigned int l = 0; l < 1000; l++){
	  locus_bias << l << "\t" << locus_b[l].ps << "\t" << locus_b[l].hs << "\t" << locus_b[l].pe << "\t" << locus_b[l].he << endl;
	}
        exit(0);
      }
      eatline(line, regions, noChr);
      it = regions.begin();
      if (it->chr == old_chr){
        gene_processing(*it, locus_b);      
        regions.clear();
        continue;
      }
    }

  } // region chr != old chr
      
  regions.clear();
  reader.Close();
  region_f.close();
  return 0;

} //main
Beispiel #23
0
// Intersects every mapped alignment of a BAM file with the "B" BED file.
//
// Each mapped alignment is converted to a BED record (chrom, start, end,
// name with /1 or /2 mate suffix, mapping quality as score, strand).
//
// - With _bamOutput set, alignments are written as BAM to stdout: those with
//   at least one overlap when _noHit is false, those with none when _noHit is
//   true.
// - Otherwise the overlaps are handed to FindOverlaps().
//
// With _obeySplits set, the alignment is first cut into discrete blocks by
// getBamBlocks() and each block is tested on its own, rather than the
// alignment's full span.
//
// bamFile : path of the BAM file to read
void BedIntersect::IntersectBam(string bamFile) {

	// load the "B" bed file into a map so
	// that we can easily compare "A" to it for overlaps
	_bedB->loadBedFileIntoMap();

	// open the BAM file
	BamReader reader;
	BamWriter writer;
	reader.Open(bamFile);

	// get header & reference information
	string header  = reader.GetHeaderText();
	RefVector refs = reader.GetReferenceData();

	// open a BAM output to stdout if we are writing BAM
	if (_bamOutput == true) {
		writer.Open("stdout", header, refs, _isUncompressedBam);
	}

	vector<BED> hits;
	// reserve some space
	hits.reserve(100);

	_bedA->bedType = 6;
	BamAlignment bam;
	// process the alignments one at a time
	while (reader.GetNextAlignment(bam)) {

		// unmapped alignments have no coordinates to intersect
		if (!bam.IsMapped())
			continue;

		BED a;
		a.chrom = refs.at(bam.RefID).RefName;
		a.start = bam.Position;
		a.end   = bam.GetEndPosition(false);

		// build the name field from the BAM alignment.
		a.name = bam.Name;
		if (bam.IsFirstMate()) a.name += "/1";
		if (bam.IsSecondMate()) a.name += "/2";

		a.score  = ToString(bam.MapQuality);

		a.strand = "+";
		if (bam.IsReverseStrand()) a.strand = "-";

		if (_bamOutput == true) {
			bool overlapsFound = false;

			// treat the BAM alignment as a single "block"
			if (_obeySplits == false) {
				overlapsFound = FindOneOrMoreOverlap(a);
			}
			// split the BAM alignment into discrete blocks and
			// look for overlaps only within each block.
			else {
				bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
				// we don't want to split on "D" ops, hence the "false"
				getBamBlocks(bam, refs, bedBlocks, false);

				vector<BED>::const_iterator bedItr  = bedBlocks.begin();
				vector<BED>::const_iterator bedEnd  = bedBlocks.end();
				for (; bedItr != bedEnd; ++bedItr) {
					// test the block itself, not the whole alignment span,
					// otherwise the split is ignored
					if (FindOneOrMoreOverlap(*bedItr))
						overlapsFound = true;
				}
			}

			// keep hits normally, keep non-hits when _noHit is requested
			if (overlapsFound != _noHit) {
				writer.SaveAlignment(bam);
			}
		}
		else {
			// treat the BAM alignment as a single BED "block"
			if (_obeySplits == false) {
				FindOverlaps(a, hits);
				hits.clear();
			}
			// split the BAM alignment into discrete BED blocks and
			// look for overlaps only within each block.
			else {
				bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
				getBamBlocks(bam, refs, bedBlocks, false);

				vector<BED>::const_iterator bedItr  = bedBlocks.begin();
				vector<BED>::const_iterator bedEnd  = bedBlocks.end();
				for (; bedItr != bedEnd; ++bedItr) {
					FindOverlaps(*bedItr, hits);
					hits.clear();
				}
			}
		}
	}

	// close the relevant BAM files.
	reader.Close();
	if (_bamOutput == true) {
		writer.Close();
	}
}
Beispiel #24
0
namespace BamTools {

// -------------------------------
// string literal constants

// property names recognised by BamAlignmentChecker::check()
const string ALIGNMENTFLAG_PROPERTY       = "alignmentFlag";
const string CIGAR_PROPERTY               = "cigar";
const string INSERTSIZE_PROPERTY          = "insertSize";
const string ISDUPLICATE_PROPERTY         = "isDuplicate";
const string ISFAILEDQC_PROPERTY          = "isFailedQC";
const string ISFIRSTMATE_PROPERTY         = "isFirstMate";
const string ISMAPPED_PROPERTY            = "isMapped";
const string ISMATEMAPPED_PROPERTY        = "isMateMapped";
const string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand";
const string ISPAIRED_PROPERTY            = "isPaired";
const string ISPRIMARYALIGNMENT_PROPERTY  = "isPrimaryAlignment";
const string ISPROPERPAIR_PROPERTY        = "isProperPair";
const string ISREVERSESTRAND_PROPERTY     = "isReverseStrand";
const string ISSECONDMATE_PROPERTY        = "isSecondMate";
const string ISSINGLETON_PROPERTY         = "isSingleton";
const string MAPQUALITY_PROPERTY          = "mapQuality";
const string MATEPOSITION_PROPERTY        = "matePosition";
const string MATEREFERENCE_PROPERTY       = "mateReference";
const string NAME_PROPERTY                = "name";
const string POSITION_PROPERTY            = "position";
const string QUERYBASES_PROPERTY          = "queryBases";
const string REFERENCE_PROPERTY           = "reference";
const string TAG_PROPERTY                 = "tag";

// boolalpha
const string TRUE_STR  = "true";
const string FALSE_STR = "false";

// reference data used to translate RefID / MateRefID into reference names
RefVector filterToolReferences;

// Decides whether a BamAlignment satisfies a PropertyFilter.
struct BamAlignmentChecker {

    // Tests the alignment against every property in the filter.
    //
    // filter : properties (name -> value filter) that must all hold
    // al     : alignment to test
    //
    // Returns true only if every property check passes; stops at the first
    // failing property.
    bool check(const PropertyFilter& filter, const BamAlignment& al) {

        bool keepAlignment = true;
        const PropertyMap& properties = filter.Properties;
        PropertyMap::const_iterator propertyIter = properties.begin();
        PropertyMap::const_iterator propertyEnd  = properties.end();
        for ( ; propertyIter != propertyEnd; ++propertyIter ) {

            // check alignment data field depending on propertyName
            const string& propertyName = (*propertyIter).first;
            const PropertyFilterValue& valueFilter = (*propertyIter).second;

            if      ( propertyName == ALIGNMENTFLAG_PROPERTY )  keepAlignment &= valueFilter.check(al.AlignmentFlag);
            else if ( propertyName == CIGAR_PROPERTY ) {
                // rebuild the CIGAR text (e.g. "50M") and compare it as a string;
                // an alignment without CIGAR data is not tested by this property
                const vector<CigarOp>& cigarData = al.CigarData;
                if ( !cigarData.empty() ) {
                    stringstream cigarSs;
                    vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
                    vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
                    for ( ; cigarIter != cigarEnd; ++cigarIter ) {
                        const CigarOp& op = (*cigarIter);
                        cigarSs << op.Length << op.Type;
                    }
                    keepAlignment &= valueFilter.check(cigarSs.str());
                }
            }
            else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= valueFilter.check(al.InsertSize);
            else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsDuplicate());
            else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= valueFilter.check(al.IsFailedQC());
            else if ( propertyName == ISFIRSTMATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsFirstMate());
            else if ( propertyName == ISMAPPED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsMapped());
            else if ( propertyName == ISMATEMAPPED_PROPERTY )         keepAlignment &= valueFilter.check(al.IsMateMapped());
            else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )  keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
            else if ( propertyName == ISPAIRED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsPaired());
            else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )   keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
            else if ( propertyName == ISPROPERPAIR_PROPERTY )         keepAlignment &= valueFilter.check(al.IsProperPair());
            else if ( propertyName == ISREVERSESTRAND_PROPERTY )      keepAlignment &= valueFilter.check(al.IsReverseStrand());
            else if ( propertyName == ISSECONDMATE_PROPERTY )         keepAlignment &= valueFilter.check(al.IsSecondMate());
            else if ( propertyName == ISSINGLETON_PROPERTY ) {
                // singleton: a paired, mapped read whose mate is unmapped
                const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped();
                keepAlignment &= valueFilter.check(isSingleton);
            }
            else if ( propertyName == MAPQUALITY_PROPERTY )           keepAlignment &= valueFilter.check(al.MapQuality);
            // the mate's position (not its reference ID) is what this property filters on
            else if ( propertyName == MATEPOSITION_PROPERTY )         keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MatePosition) );
            else if ( propertyName == MATEREFERENCE_PROPERTY ) {
                // without a mapped mate there is no mate reference to compare
                if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
                BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID");
                const string& refName = filterToolReferences.at(al.MateRefID).RefName;
                keepAlignment &= valueFilter.check(refName);
            }
            else if ( propertyName == NAME_PROPERTY )                 keepAlignment &= valueFilter.check(al.Name);
            else if ( propertyName == POSITION_PROPERTY )             keepAlignment &= valueFilter.check(al.Position);
            else if ( propertyName == QUERYBASES_PROPERTY )           keepAlignment &= valueFilter.check(al.QueryBases);
            else if ( propertyName == REFERENCE_PROPERTY ) {
                BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID");
                const string& refName = filterToolReferences.at(al.RefID).RefName;
                keepAlignment &= valueFilter.check(refName);
            }
            else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al);
            else BAMTOOLS_ASSERT_UNREACHABLE;

            // if alignment fails at ANY point, just quit and return false
            if ( !keepAlignment ) return false;
        }

        BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
        return keepAlignment;
    }

    // Tests a tag filter of the form "XX:<token>" against the alignment.
    //
    // "XX" is the two character tag name; the remainder after the separator
    // is parsed by FilterEngine<BamAlignmentChecker>::parseToken() into a
    // value and a comparison type matching the tag's stored type.
    //
    // Returns false if the filter value is not a string, is shorter than
    // "XX:x", the tag is absent from the alignment, the tag type is unknown,
    // or the tag value / filter token cannot be read; otherwise the result of
    // the comparison.
    bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) {

        // ensure filter contains string data
        Variant entireTagFilter = valueFilter.Value;
        if ( !entireTagFilter.is_type<string>() ) return false;

        // localize string from variant
        const string& entireTagFilterString = entireTagFilter.get<string>();

        // ensure we have at least "XX:x"
        if ( entireTagFilterString.length() < 4 ) return false;

        // get tagName & lookup in alignment
        // if found, set tagType to tag type character
        // if not found, return false
        const string& tagName = entireTagFilterString.substr(0,2);
        char tagType = '\0';
        if ( !al.GetTagType(tagName, tagType) ) return false;

        // remove tagName & ":" from beginning tagFilter
        string tagFilterString = entireTagFilterString.substr(3);

        // switch on tag type to set tag query value & parse filter token
        int32_t  intFilterValue,    intQueryValue;
        uint32_t uintFilterValue,   uintQueryValue;
        float    realFilterValue,   realQueryValue;
        string   stringFilterValue, stringQueryValue;

        PropertyFilterValue tagFilter;
        PropertyFilterValue::ValueCompareType compareType;
        bool keepAlignment = false;
        switch (tagType) {

            // signed int tag type
            case 'c' :
            case 's' :
            case 'i' :
                if ( al.GetTag(tagName, intQueryValue) ) {
                    if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, intFilterValue, compareType) ) {
                        tagFilter.Value = intFilterValue;
                        tagFilter.Type  = compareType;
                        keepAlignment   = tagFilter.check(intQueryValue);
                    }
                }
                break;

            // unsigned int tag type
            case 'C' :
            case 'S' :
            case 'I' :
                if ( al.GetTag(tagName, uintQueryValue) ) {
                    if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, uintFilterValue, compareType) ) {
                        tagFilter.Value = uintFilterValue;
                        tagFilter.Type  = compareType;
                        keepAlignment   = tagFilter.check(uintQueryValue);
                    }
                }
                break;

            // 'real' tag type
            case 'f' :
                if ( al.GetTag(tagName, realQueryValue) ) {
                    if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, realFilterValue, compareType) ) {
                        tagFilter.Value = realFilterValue;
                        tagFilter.Type  = compareType;
                        keepAlignment   = tagFilter.check(realQueryValue);
                    }
                }
                break;

            // string tag type
            case 'A':
            case 'Z':
            case 'H':
                if ( al.GetTag(tagName, stringQueryValue) ) {
                    if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, stringFilterValue, compareType) ) {
                        tagFilter.Value = stringFilterValue;
                        tagFilter.Type  = compareType;
                        keepAlignment   = tagFilter.check(stringQueryValue);
                    }
                }
                break;

            // unknown tag type
            default :
                keepAlignment = false;
        }

        return keepAlignment;
    }
};

} // namespace BamTools
Beispiel #25
0
bool RandomTool::RandomToolPrivate::Run(void) {

    // set to default stdin if no input files provided
    if ( !m_settings->HasInput && !m_settings->HasInputFilelist )
        m_settings->InputFiles.push_back(Options::StandardIn());

    // add files in the filelist to the input file list
    if ( m_settings->HasInputFilelist ) {

        ifstream filelist(m_settings->InputFilelist.c_str(), ios::in);
        if ( !filelist.is_open() ) {
            cerr << "bamtools random ERROR: could not open input BAM file list... Aborting." << endl;
            return false;
        }

        string line;
        while ( getline(filelist, line) )
            m_settings->InputFiles.push_back(line);
    }

    // open our reader
    BamMultiReader reader;
    if ( !reader.Open(m_settings->InputFiles) ) {
        cerr << "bamtools random ERROR: could not open input BAM file(s)... Aborting." << endl;
        return false;
    }

    // look up index files for all BAM files
    reader.LocateIndexes();

    // make sure index data is available
    if ( !reader.HasIndexes() ) {
        cerr << "bamtools random ERROR: could not load index data for all input BAM file(s)... Aborting." << endl;
        reader.Close();
        return false;
    }

    // get BamReader metadata
    const string headerText = reader.GetHeaderText();
    const RefVector references = reader.GetReferenceData();
    if ( references.empty() ) {
        cerr << "bamtools random ERROR: no reference data available... Aborting." << endl;
        reader.Close();
        return false;
    }

    // determine compression mode for BamWriter
    bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
                              !m_settings->IsForceCompression );
    BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
    if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;

    // open BamWriter
    BamWriter writer;
    writer.SetCompressionMode(compressionMode);
    if ( !writer.Open(m_settings->OutputFilename, headerText, references) ) {
        cerr << "bamtools random ERROR: could not open " << m_settings->OutputFilename
             << " for writing... Aborting." << endl;
        reader.Close();
        return false;
    }

    // if user specified a REGION constraint, attempt to parse REGION string
    BamRegion region;
    if ( m_settings->HasRegion && !Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
        cerr << "bamtools random ERROR: could not parse REGION: " << m_settings->Region << endl;
        cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
             << endl;
        reader.Close();
        writer.Close();
        return false;
    }

    // seed our random number generator
    srand( time(NULL) );

    // grab random alignments
    BamAlignment al;
    unsigned int i = 0;
    while ( i < m_settings->AlignmentCount ) {

        int randomRefId    = 0;
        int randomPosition = 0;

        // use REGION constraints to select random refId & position
        if ( m_settings->HasRegion ) {

            // select a random refId
            randomRefId = getRandomInt(region.LeftRefID, region.RightRefID);

            // select a random position based on randomRefId
            const int lowerBoundPosition = ( (randomRefId == region.LeftRefID)
                                             ? region.LeftPosition
                                             : 0 );
            const int upperBoundPosition = ( (randomRefId == region.RightRefID)
                                             ? region.RightPosition
                                             : (references.at(randomRefId).RefLength - 1) );
            randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
        }

        // otherwise select from all possible random refId & position
        else {

            // select random refId
            randomRefId = getRandomInt(0, (int)references.size() - 1);

            // select random position based on randomRefId
            const int lowerBoundPosition = 0;
            const int upperBoundPosition = references.at(randomRefId).RefLength - 1;
            randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
        }

        // if jump & read successful, save first alignment that overlaps random refId & position
        if ( reader.Jump(randomRefId, randomPosition) ) {
            while ( reader.GetNextAlignmentCore(al) ) {
                if ( al.RefID == randomRefId && al.Position >= randomPosition ) {
                    writer.SaveAlignment(al);
                    ++i;
                    break;
                }
            }
        }
    }

    // cleanup & exit
    reader.Close();
    writer.Close();
    return true;
}
Beispiel #26
0
 /**
  * Construct a Vector using a reference Vector. This will be used to do
  * implicit casts from RefVector to Vector for most of the Vector operations.
  *
  * The first three elements of the underlying data are taken from vec.x(),
  * vec.y() and vec.z(); the fourth element is initialised to 0.0.
  *
  * @param vec  the RefVector to copy
  */
 Vector::Vector(const RefVector& vec) :
   data({vec.x(), vec.y(), vec.z(), 0.0}) { }