Exemple #1
0
//{{{ SV_Pair:: SV_Pair(const BamAlignment &bam_a,
// Builds a pair from two alignments and decides which one is the "left"
// read (read_l) and which is the "right" read (read_r):
//   - different references: the alignment with the smaller RefID is read_l
//   - same reference: the alignment with the smaller start is read_l
//     (bam_a is read_l when both starts are equal)
// Each interval end is stored as GetEndPosition(false, false) - 1.
// min_mapping_quality is the smaller MapQuality of the two alignments.
SV_Pair::
SV_Pair(const BamAlignment &bam_a,
        const BamAlignment &bam_b,
        const RefVector &refs,
        int _weight,
        int _ev_id,
        SV_PairReader *_reader)
{
    reader = _reader;

    min_mapping_quality = (bam_a.MapQuality < bam_b.MapQuality)
                          ? bam_a.MapQuality
                          : bam_b.MapQuality;

    // describe the first alignment as an interval
    struct interval tmp_a;
    tmp_a.start = bam_a.Position;
    tmp_a.end = bam_a.GetEndPosition(false, false) - 1;
    tmp_a.chr = refs.at(bam_a.RefID).RefName;
    tmp_a.strand = bam_a.IsReverseStrand() ? '-' : '+';

    // describe the second alignment as an interval
    struct interval tmp_b;
    tmp_b.start = bam_b.Position;
    tmp_b.end = bam_b.GetEndPosition(false, false) - 1;
    tmp_b.chr = refs.at(bam_b.RefID).RefName;
    tmp_b.strand = bam_b.IsReverseStrand() ? '-' : '+';

    // bam_a goes on the right when it sits on a later reference, or on the
    // same reference but strictly after bam_b
    const bool a_goes_right =
        (bam_a.RefID > bam_b.RefID) ||
        ((bam_a.RefID == bam_b.RefID) && (tmp_a.start > tmp_b.start));

    if (a_goes_right) {
        read_l = tmp_b;
        read_r = tmp_a;
    } else {
        read_l = tmp_a;
        read_r = tmp_b;
    }

    weight = _weight;
    ev_id = _ev_id;
}
Exemple #2
0
// Classifies an alignment relative to the window [winstartpos, winendpos].
//
// Returns:
//   0 - the read counts as being in the window: it lies entirely inside it,
//       starts inside it, spans it, or ends inside it with its last base at
//       least AllowableBasesInWindow positions past winstartpos
//   1 - the read is treated as lying before the window
//   2 - the read starts after the window
//
// The read's last covered base is taken as GetEndPosition() - 1.
int IsWithinWindow(BamAlignment& alignment, int winstartpos, int winendpos, int AllowableBasesInWindow) {
    const int readStart = alignment.Position;
    const int readLast = alignment.GetEndPosition() - 1;

    // read lies entirely inside the window
    if ((readStart >= winstartpos) && (readLast <= winendpos))
        return 0;

    // read is partially in the window: it either starts inside it, or ends
    // inside it with enough bases past the window start
    const bool startsInside = (readStart >= winstartpos) && (readStart <= winendpos);
    const bool endsInsideEnough = (readLast <= winendpos)
                                  && ((readLast - AllowableBasesInWindow) >= winstartpos);
    if (startsInside || endsInsideEnough)
        return 0;

    // window lies inside the read
    if ((readStart < winstartpos) && (readLast >= winendpos))
        return 0;

    // read ends before the window begins
    if ((readStart < winstartpos) && (readLast < winstartpos))
        return 1;

    // read begins after the window ends
    if (readStart > winendpos)
        return 2;

    // Remaining case: the read starts before the window and ends inside it,
    // but its last base is fewer than AllowableBasesInWindow positions past
    // winstartpos. Previously control fell off the end of the function here
    // (undefined behaviour).
    // NOTE(review): reported as "before the window" (1) because the read
    // starts left of it and was deliberately excluded above - confirm that
    // this is the intended code for callers.
    return 1;
}
pos_t VariantProcessor::processMatchOrMismatch(const BamAlignment& alignment, 
					       vector<VariantPtr>& read_variants, 
					       const uint32_t& op_length, const string& refseq, 
					       const pos_t& refpos, const pos_t& readpos) {
  // Process a matching or mismatching sequence (CIGAR 'M' op), adding a
  // SNP variant for every base where the read differs from the reference.
  //
  //   alignment      the read being processed
  //   read_variants  receives every SNP found in this op (appended)
  //   op_length      length of the CIGAR op
  //   refseq         reference sequence for this alignment
  //   refpos         offset of the op's first base within refseq
  //   readpos        offset of the op's first base within QueryBases
  //
  // Each SNP is also inserted into block_variants.
  //
  // Returns the offset within refseq just past this op (refpos + op_length).
  // Previously the function had no return statement, which is undefined
  // behaviour for a non-void function.
  //
  // NOTE(review): positions assume refseq[0] corresponds to
  // alignment.Position, as implied by the caller starting refpos at 0.
  const int endpos = alignment.GetEndPosition();
  const int length = static_cast<int>(op_length);
  for (int i = 0; i < length; i++) {
    // The reference coordinate of this base is the alignment start plus the
    // offset into refseq; using only "Position + i" ignored refpos and gave
    // wrong coordinates for every op after the first.
    assert(alignment.Position + refpos + i < endpos);
    const char query_base = alignment.QueryBases[readpos + i];
    assert(refpos + i < refseq.size());
    const char ref_base = refseq[refpos + i];
    if (ref_base != query_base) {
      // SNP
      string ref(1, ref_base), alt(1, query_base);
      // TODO: base quality is not used yet; when it is, it must be indexed
      // like QueryBases (readpos + i), not with the reference offset.

      VariantPtr snp(new Variant(VariantType::SNP, alignment.RefID, 
				 alignment.Position + refpos + i, 1, 0, ref, alt));
      block_variants.insert(snp);
      read_variants.push_back(snp);
    }
  }
  return refpos + op_length;
}
pos_t VariantProcessor::processAlignment(const BamAlignment& alignment) {
  /*
     Walks the CIGAR string of one alignment against a reference sequence
     rebuilt by createReferenceSequence(), dispatching every operation to
     the matching process* helper, then stores the alignment in
     block_alignments.

     A missing NM or MD tag is reported on stderr but processing continues.
     Any CIGAR operation other than S, M, I or D is reported and terminates
     the program with exit(1).

     Currently always returns 0 (placeholder, see TODO below).
  */

  if (!(alignment.HasTag("NM") && alignment.HasTag("MD"))) {
    std::cerr << "error: BamAlignment '" << alignment.Name << 
      "' does not have either NM or MD tags" << std::endl;
  }

  // The tags are fetched but not consumed yet.
  string md_tag;
  int nm_tag;
  alignment.GetTag("MD", md_tag);
  alignment.GetTag("NM", nm_tag);

  // Reference sequence for this read, reconstructed from the alignment.
  string refseq = createReferenceSequence(alignment);

  // Variants are found by comparing the reconstructed reference with the
  // query sequence while stepping through the CIGAR operations.
  vector<VariantPtr> read_variants;
  const vector<CigarOp>& cigar = alignment.CigarData;

  int refpos = 0;   // offset into refseq
  int readpos = 0;  // offset into the query sequence

  for (vector<CigarOp>::size_type idx = 0; idx < cigar.size(); ++idx) {
    const CigarOp& op = cigar[idx];

    switch (op.Type) {
    case 'S':
      // soft clip: present in the read only
      readpos += op.Length;
      break;

    case 'M':
      // match or SNP: advances both read and reference
      processMatchOrMismatch(alignment, read_variants, op.Length, refseq, refpos, readpos);
      readpos += op.Length;
      refpos += op.Length;
      break;

    case 'I':
      // insertion: advances the read only
      processInsertion(alignment, read_variants, op.Length, refseq, refpos, readpos);
      readpos += op.Length;
      break;

    case 'D':
      // deletion w.r.t. reference: advances the reference only
      processDeletion(alignment, read_variants, op.Length, refseq, refpos, readpos);
      refpos += op.Length;
      break;

    default:
      cerr << "error: unidentified CIGAR type: " << op.Type << endl;
      exit(1);
    }
  }

  // Add to alignments list
  block_alignments.push_back(alignment);
  return 0; // TODO(vsbuffalo)
}
Exemple #5
0
// Classifies an alignment against the currently specified region:
// BEFORE_REGION, WITHIN_REGION or AFTER_REGION.
// This *internal* method should ONLY be called when (at least) the left
// bound of the region is specified.
BamReaderPrivate::RegionState BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {

    // alignment is on a reference sequence before the left bound
    if ( bAlignment.RefID < Region.LeftRefID )
        return BEFORE_REGION;

    // alignment is on a reference sequence after the left bound
    if ( bAlignment.RefID > Region.LeftRefID ) {

        // no right boundary: everything past the left bound is in the region
        if ( !Region.isRightBoundSpecified() )
            return WITHIN_REGION;

        // reference strictly between the two boundaries
        if ( bAlignment.RefID < Region.RightRefID )
            return WITHIN_REGION;

        // reference past the right boundary
        if ( bAlignment.RefID > Region.RightRefID )
            return AFTER_REGION;

        // on the right bound reference: in the region only when the
        // alignment starts at or before the right boundary
        return ( bAlignment.Position <= Region.RightPosition ) ? WITHIN_REGION
                                                               : AFTER_REGION;
    }

    // from here on the alignment is on the left bound reference

    // alignment starts before the left boundary: it is in the region only
    // when its end position reaches the boundary
    if ( bAlignment.Position < Region.LeftPosition ) {
        return ( bAlignment.GetEndPosition() >= Region.LeftPosition ) ? WITHIN_REGION
                                                                      : BEFORE_REGION;
    }

    // alignment starts at or after the left boundary: it is past the region
    // only when a right boundary exists on this same reference and the
    // alignment starts beyond it
    const bool startsPastRightBound = Region.isRightBoundSpecified() &&
                                      Region.LeftRefID == Region.RightRefID &&
                                      bAlignment.Position > Region.RightPosition;

    return startsPastRightBound ? AFTER_REGION : WITHIN_REGION;
}
// Writes one alignment to m_out as a single tab-delimited BED line:
//   <chromName> <chromStart> <chromEnd> <readName> <score> <strand>
// chromStart is a.Position and chromEnd is a.GetEndPosition(); the score
// column carries the mapping quality. The line ends with std::endl, so the
// stream is flushed after every record.
void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a)
{
    // BED is 0-based half-open
    // (e.g. a 50-base read aligned to pos 10 could have BED coordinates (10, 60) instead of BAM coordinates (10, 59) )
    const char strand = a.IsReverseStrand() ? '-' : '+';

    m_out << m_references.at(a.RefID).RefName << '\t';
    m_out << a.Position << '\t';
    m_out << a.GetEndPosition() << '\t';
    m_out << a.Name << '\t';
    m_out << a.MapQuality << '\t';
    m_out << strand << std::endl;
}
Exemple #7
0
// Computes the coverage of the "B" BED features by the mapped alignments of
// a BAM file and reports it.
//
//   bamFile  path of the BAM file to read
//
// Every mapped alignment is either counted as one interval or, when
// _obeySplits is set, split into blocks first. The result is printed with
// ReportCounts() when _countsOnly is set, otherwise with ReportCoverage().
//
// Terminates the program with exit(1) if the BAM file cannot be opened;
// previously the failure was ignored and an all-zero report was produced.
void BedCoverage::CollectCoverageBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedCovFileIntoMap();

    // open the BAM file, refusing to continue with an unreadable input
    BamReader reader;
    if (!reader.Open(bamFile)) {
        cerr << "Error: could not open BAM file " << bamFile << " for reading." << endl;
        exit(1);
    }

    // reference information, needed to turn RefIDs into chromosome names
    RefVector refs = reader.GetReferenceData();

    // convert each aligned BAM entry to BED
    // and compute coverage on B
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        // unmapped reads contribute no coverage
        if (!bam.IsMapped())
            continue;

        if (_obeySplits == false) {
            // treat the BAM alignment as a single "block":
            // construct a new BED entry from the current BAM alignment.
            BED a;
            a.chrom  = refs.at(bam.RefID).RefName;
            a.start  = bam.Position;
            a.end    = bam.GetEndPosition(false, false);
            a.strand = bam.IsReverseStrand() ? "-" : "+";

            _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly);
        }
        else {
            // split the BAM alignment into discrete blocks and
            // look for overlaps only within each block.
            bedVector bedBlocks;
            // since we are counting coverage, we do want to split blocks when a
            // deletion (D) CIGAR op is encountered (hence the true for the last parm)
            GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, false, true);
            // use countSplitHits to avoid over-counting each split chunk
            // as distinct read coverage.
            _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly);
        }
    }

    // report the coverage (summary or histogram) for BED B.
    if (_countsOnly == true)
        ReportCounts();
    else
        ReportCoverage();

    // close the BAM file
    reader.Close();
}
Exemple #8
0
//{{{ void process_pair(const BamAlignment &curr,
void
SV_Pair::
process_pair(const BamAlignment &curr,
             const RefVector refs,
             map<string, BamAlignment> &mapped_pairs,
             UCSCBins<SV_BreakPoint*> &r_bin,
             int weight,
             int ev_id,
             SV_PairReader *reader)
{
    if (mapped_pairs.find(curr.Name) == mapped_pairs.end())
        mapped_pairs[curr.Name] = curr;
    else {
        SV_Pair *new_pair = new SV_Pair(mapped_pairs[curr.Name],
                                        curr,
                                        refs,
                                        weight,
                                        ev_id,
                                        reader);
        //cerr << count_clipped(curr.CigarData) << "\t" <<
                //count_clipped(mapped_pairs[curr.Name].CigarData) << endl;
                
        if ( new_pair->is_sane() &&  
             new_pair->is_aberrant() &&
             (count_clipped(curr.CigarData) > 0) &&
             (count_clipped(mapped_pairs[curr.Name].CigarData) > 0) ) {
            SV_BreakPoint *new_bp = new_pair->get_bp();

#ifdef TRACE

            cerr << "READ\t" << 
                    refs.at(mapped_pairs[curr.Name].RefID).RefName << "," <<
                    mapped_pairs[curr.Name].Position << "," <<
                    (mapped_pairs[curr.Name].GetEndPosition(false, false) - 1)
                        << "\t" <<
                    refs.at(curr.RefID).RefName << "," <<
                    curr.Position << "," <<
                    (curr.GetEndPosition(false, false) - 1)
                        <<
                    endl;

            cerr << "\tPE\t" << *new_bp << endl;
#endif
            new_bp->cluster(r_bin);
        } else {
            delete(new_pair);
        }

        mapped_pairs.erase(curr.Name);
    }
}
Exemple #9
0
// Sweeps positions 0 .. refLen-1 of one reference and adds, for every
// position, one count to hist[depth], where depth is the number of reads
// currently open at that position.
//
//   hist    depth histogram, grown on demand (hist[d] = number of positions
//           with depth d)
//   reader  source of further alignments, read through GetNextAlignment()
//   al      current alignment on entry; advanced as reads are consumed
//   refID   reference being processed; reads are only consumed while
//           al.RefID == refID held on entry and GetNextAlignment() keeps
//           returning true
//   refLen  number of positions to sweep
//
// A read is opened when the sweep reaches al.Position and closed when it
// reaches al.GetEndPosition(). Pending read ends are kept in a ring buffer
// of maxReadLen slots, so a read must span fewer than maxReadLen positions;
// this is only checked by an assert.
void CountDepth(Histogram& hist, BamMultiReader& reader, BamAlignment& al, int32_t refID, int64_t refLen)
{
    bool moreReads = (al.RefID == refID);

    const int32_t maxReadLen = 1000;
    vector<int64_t> readEnds(maxReadLen);

    int64_t depth = 0;
    for(int64_t pos=0; pos<refLen; ++pos){
        // open every read that starts at this position
        while(moreReads and al.Position == pos){
            ++depth;
            const int64_t endPos = al.GetEndPosition();
            assert(endPos - pos < maxReadLen);
            ++readEnds[endPos % maxReadLen];
            moreReads = GetNextAlignment(al, reader, refID);
        }

        // close every read that ends at this position, then free the slot
        depth -= readEnds[pos % maxReadLen];
        assert(depth >= 0);
        readEnds[pos % maxReadLen] = 0;

        // Grow the histogram so that index `depth` is valid. The "+ 1"
        // matters for an empty histogram at depth 0: 2 * 0 would resize it
        // to zero elements and the increment below would write out of bounds.
        if(depth >= hist.size())
            hist.resize(2 * depth + 1);
        ++hist[depth];
    }
}
Exemple #10
0
//{{{ SV_SplitRead:: SV_SplitRead(const BamAlignment &bam_a,
// Builds split-read evidence from the two alignments of one read.
//
// The alignment on the smaller RefID (or, on the same reference, with the
// smaller start; bam_a on ties) becomes side_l / query_l, the other one
// side_r / query_r. The breakpoint type is then derived from the strands
// and from the order of the two query start positions (qs_pos):
//   - strands differ                              -> INVERSION
//   - both '+' and left query first, or
//     both '-' and right query first              -> DELETION
//   - both '+' and right query first, or
//     both '-' and left query first               -> DUPLICATION
// Anything else (e.g. equal qs_pos) is reported on stderr and throws the
// integer 1.
SV_SplitRead::
SV_SplitRead(const BamAlignment &bam_a,
             const BamAlignment &bam_b,
             const RefVector &refs,
             int _weight,
             int _id,
             int _sample_id,
             SV_SplitReadReader *_reader)
{
    reader = _reader;
    sample_id = _sample_id;

    min_mapping_quality = (bam_a.MapQuality < bam_b.MapQuality)
                          ? bam_a.MapQuality
                          : bam_b.MapQuality;

    // query-space coordinates of both alignments
    struct cigar_query query_a =
        calc_query_pos_from_cigar(bam_a.CigarData,
                                  bam_a.IsReverseStrand() );
    struct cigar_query query_b =
        calc_query_pos_from_cigar(bam_b.CigarData,
                                  bam_b.IsReverseStrand() );

    // reference-space intervals of both alignments
    struct interval tmp_a;
    tmp_a.strand = bam_a.IsReverseStrand() ? '-' : '+';
    tmp_a.chr = refs.at(bam_a.RefID).RefName;
    tmp_a.start = bam_a.Position;
    tmp_a.end = bam_a.GetEndPosition();

    struct interval tmp_b;
    tmp_b.strand = bam_b.IsReverseStrand() ? '-' : '+';
    tmp_b.chr = refs.at(bam_b.RefID).RefName;
    tmp_b.start = bam_b.Position;
    tmp_b.end = bam_b.GetEndPosition();

    // bam_a is the right side when it is on a later reference, or on the
    // same reference but strictly after bam_b
    const bool a_is_right =
        (bam_a.RefID > bam_b.RefID) ||
        ((bam_a.RefID == bam_b.RefID) && (tmp_a.start > tmp_b.start));

    if (a_is_right) {
        side_l = tmp_b;
        side_r = tmp_a;
        query_l = query_b;
        query_r = query_a;
    } else {
        side_l = tmp_a;
        side_r = tmp_b;
        query_l = query_a;
        query_r = query_b;
    }

    const bool both_plus   = (side_l.strand == '+') && (side_r.strand == '+');
    const bool both_minus  = (side_l.strand == '-') && (side_r.strand == '-');
    const bool left_first  = query_l.qs_pos < query_r.qs_pos;
    const bool right_first = query_l.qs_pos > query_r.qs_pos;

    if (side_l.strand != side_r.strand) {
        type = SV_BreakPoint::INVERSION;
    } else if ((both_plus && left_first) || (both_minus && right_first)) {
        type = SV_BreakPoint::DELETION;
    } else if ((both_plus && right_first) || (both_minus && left_first)) {
        type = SV_BreakPoint::DUPLICATION;
    } else {
        cerr << "ERROR IN BAM FILE.  " <<
             "TYPE not detected (DELETION,DUPLICATION,INVERSION)" <<
             endl;
        cerr << "\t" << query_l.qs_pos << "," << side_l.strand << "\t" <<
             query_r.qs_pos << "," << side_r.strand << "\t" <<
             tmp_a.chr << "," << tmp_a.start << "," << tmp_a.end << "\t" <<
             tmp_b.chr << "," << tmp_b.start << "," << tmp_b.end << "\t" <<
             endl;

        throw(1);
    }

    weight = _weight;
    id = _id;
}
Exemple #11
0
void realign_bam(Parameters& params) {

    FastaReference reference;
    reference.open(params.fasta_reference);

    bool suppress_output = false;

    int dag_window_size = params.dag_window_size;
    
    // open BAM file
    BamReader reader;
    if (!reader.Open("stdin")) {
        cerr << "could not open stdin for reading" << endl;
        exit(1);
    }

    BamWriter writer;
    if (!params.dry_run && !writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }

    // store the names of all the reference sequences in the BAM file
    map<int, string> referenceIDToName;
    vector<RefData> referenceSequences = reader.GetReferenceData();
    int i = 0;
    for (RefVector::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
        referenceIDToName[i] = r->RefName;
        ++i;
    }

    vcf::VariantCallFile vcffile;
    if (!params.vcf_file.empty()) {
        if (!vcffile.open(params.vcf_file)) {
            cerr << "could not open VCF file " << params.vcf_file << endl;
            exit(1);
        }
    } else {
        cerr << "realignment requires VCF file" << endl;
        exit(1);
    }
    vcf::Variant var(vcffile);

    BamAlignment alignment;
    map<long int, vector<BamAlignment> > alignmentSortQueue;

    // get alignment
    // assemble DAG in region around alignment
    // loop for each alignment in BAM:
    //     update DAG when current alignment gets close to edge of assembled DAG
    //     attempt to realign if read has a certain number of mismatches + gaps or softclips, weighted by basequal
    //     if alignment to DAG has fewer mismatches and gaps than original alignment, use it
    //         flatten read into reference space (for now just output alleles from VCF un-spanned insertions)
    //     write read to queue for streaming re-sorting (some positional change will occur)

    long int dag_start_position = 0;
    string currentSeqname;
    string ref;
    //vector<Cigar> cigars; // contains the Cigar strings of nodes in the graph
    //vector<long int> refpositions; // contains the reference start coords of nodes in the graph
    ReferenceMappings ref_map;
    gssw_graph* graph = gssw_graph_create(0);
    int8_t* nt_table = gssw_create_nt_table();
    int8_t* mat = gssw_create_score_matrix(params.match, params.mism);

    int total_reads = 0;
    int total_realigned = 0;
    int total_improved = 0;
    bool emptyDAG = false; // if the dag is constructed over empty sequence
                           // such as when realigning reads mapped to all-N sequence
    if (params.debug) {
        cerr << "about to start processing alignments" << endl;
    }

    while (reader.GetNextAlignment(alignment)) {

        string& seqname = referenceIDToName[alignment.RefID];

        if (params.debug) {
            cerr << "--------------------------------------------" << endl
                 << "processing alignment " << alignment.Name << " at "
                 << seqname << ":" << alignment.Position << endl;
        }

        /*
        if (!alignment.IsMapped() && graph->size == 0) {
            if (params.debug) {
                cerr << "unable to build DAG using unmapped read "
                     << alignment.Name << " @ "
                     << seqname << ":" << alignment.Position
                     << " no previous mapped read found and DAG currently empty" << endl;
            }
            alignmentSortQueue[dag_start_position+dag_window_size].push_back(alignment);
            continue;
        }
        */

        ++total_reads;

        BamAlignment originalAlignment = alignment;
        long unsigned int initialAlignmentPosition = alignment.Position;
        //if (dag_start_position == 1) {
        //    dag_start_position = max(1, (int)initialAlignmentPosition - dag_window_size/2);
        //}

        // should we construct a new DAG?  do so when 3/4 of the way through the current one
        // center on current position + 1/2 dag window
        // TODO check this scheme using some scribbles on paper
        // alignment.IsMapped()
        if ((seqname != currentSeqname
             || ((alignment.Position + (alignment.QueryBases.size()/2)
                  > (3*dag_window_size/4) + dag_start_position)))
            && alignment.Position < reference.sequenceLength(seqname)) {

            if (seqname != currentSeqname) {
                if (params.debug) {
                    cerr << "switched ref seqs" << endl;
                }
                dag_start_position = max((long int) 0,
                                         (long int) (alignment.GetEndPosition() - dag_window_size/2));
            // recenter DAG
            } else if (!ref_map.empty()) {
                dag_start_position = dag_start_position + dag_window_size/2;
                dag_start_position = max(dag_start_position,
                                         (long int) (alignment.GetEndPosition() - dag_window_size/2));
            } else {
                dag_start_position = alignment.Position - dag_window_size/2;
            }
            dag_start_position = max((long int)0, dag_start_position);

            // TODO get sequence length and use to bound noted window size (edge case)
            //cerr << "getting ref " << seqname << " " << max((long int) 0, dag_start_position) << " " << dag_window_size << endl;

            // get variants for new DAG
            vector<vcf::Variant> variants;
            if (!vcffile.setRegion(seqname,
                                   dag_start_position + 1,
                                   dag_start_position + dag_window_size)) {
                // this is not necessarily an error; there should be a better way to check for VCF file validity
                /*
                cerr << "could not set region on VCF file to " << currentSeqname << ":"
                     << dag_start_position << "-" << dag_start_position + ref.size()
                     << endl;
                */
                //exit(1);
            } else {

                // check first variant
                if (vcffile.getNextVariant(var)) {
                    while (var.position <= dag_start_position + 1) {
                        //cerr << "var position == dag_start_position " << endl;
                        dag_start_position -= 1;
                        vcffile.setRegion(seqname,
                                          dag_start_position + 1,
                                          dag_start_position + dag_window_size);
                        if (!vcffile.getNextVariant(var)) { break; }
                    }
                }

                vcffile.setRegion(seqname,
                                  dag_start_position + 1,
                                  dag_start_position + dag_window_size);

                while (vcffile.getNextVariant(var)) {
                    if (params.debug) cerr << "getting variant at " << var.sequenceName << ":" << var.position << endl;
                    //cerr << var.position << " + " << var.ref.length() << " <= " << dag_start_position << " + " << dag_window_size << endl;
                    //cerr << var.position << " >= " << dag_start_position << endl;
                    if (var.position + var.ref.length() <= dag_start_position + dag_window_size
                        && var.position >= dag_start_position) {
                        variants.push_back(var);
                    }
                }

            }

            //cerr << "dag_start_position " << dag_start_position << endl;
            ref = reference.getSubSequence(seqname,
                                           max((long int) 0, dag_start_position),
                                           dag_window_size); // 0/1 conversion

            // clear graph and metadata
            ref_map.clear();
            //cigars.clear();
            //refpositions.clear();
            gssw_graph_destroy(graph);

            if (params.debug) { cerr << "constructing DAG" << endl; }
            // and build the DAG
            graph = gssw_graph_create(0);
            constructDAGProgressive(graph,
                                    ref_map,
                                    ref,
                                    seqname,
                                    variants,
                                    dag_start_position,
                                    nt_table,
                                    mat,
                                    params.flat_input_vcf);

            if (params.debug) {
                cerr << "graph has " << graph->size << " nodes" << endl;
                cerr << "DAG generated from input variants over "
                     << seqname << ":" << dag_start_position << "-" << dag_start_position + dag_window_size
                     << endl;
            }
            if (params.display_dag) {
                gssw_graph_print(graph);
                /*
                for (Backbone::iterator b = backbone.begin(); b != backbone.end(); ++b) {
                    cout << b->first << " "
                         << b->first->id << " "
                         << b->second.ref_position << " "
                         << b->second.cigar << endl
                         << b->first->seq << endl;
                }
                */
            }

            if (graph->size == 1 && allN(ref) || graph->size == 0) {
                if (params.debug) {
                    cerr << "DAG is empty (1 node, all N).  Alignment is irrelevant." << endl;
                }
                emptyDAG = true;
            } else {
                emptyDAG = false;
            }

        }

        AlignmentStats stats_before;
        bool was_mapped = alignment.IsMapped();
        bool has_realigned = false;
        if (was_mapped) {
            if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                ref = reference.getSubSequence(seqname,
                                               max((long int) 0, dag_start_position),
                                               alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
            }
        }

        if (params.debug) {
            if (emptyDAG) {
                cerr << "cannot realign against empty (all-N single node) graph" << endl;
            }
        }

        if (!emptyDAG && shouldRealign(alignment, ref, dag_start_position, params, stats_before)) {

            ++total_realigned;

            if (params.debug) {
                cerr << "realigning: " << alignment.Name
                     << " " << alignment.QueryBases << endl
                     << " aligned @ " << alignment.Position
                     << " to variant graph over "
                     << seqname
                     << ":" << dag_start_position
                     << "-" << dag_start_position + dag_window_size << endl;
            }

            //{
            try {

                Cigar flat_cigar;
                string read = alignment.QueryBases;
                string qualities = alignment.Qualities;
                int score;
                long int position;
                string strand;
                gssw_graph_mapping* gm =
                    gswalign(graph,
                             ref_map,
                             read,
                             qualities,
                             params,
                             position,
                             score,
                             flat_cigar,
                             strand,
                             nt_table,
                             mat);
                //
                gssw_graph_mapping_destroy(gm);

                if (params.dry_run) {

                    if (strand == "-" && !alignment.IsMapped()) {
                        read = reverseComplement(read);
                    }
                    cout << read << endl;
                    cout << graph_mapping_to_string(gm) << endl;
                    cout << score << " " << strand << " "
                         << position << " "
                         << flat_cigar << endl;

                } else {

                    /*
                    if (strand == "-") {
                        read = reverseComplement(trace_report.read);
                    }
                   */
 
                    // TODO the qualities are not on the right side of the read
                    if (strand == "-" && alignment.IsMapped()) {
                        // if we're realigning, this is always true unless we swapped strands
                        alignment.SetIsReverseStrand(true);
                        //reverse(alignment.Qualities.begin(), alignment.Qualities.end()); // reverse qualities
                    }
                    //alignment.QueryBases = reverseComplement(trace_report.read);
                    alignment.QueryBases = read;
                    alignment.Qualities = qualities;

                    alignment.Position = position;// + 1;// + 1;//(trace_report.node->position - 1) + trace_report.x;
                    alignment.SetIsMapped(true);
                    if (!alignment.MapQuality) {
                        alignment.MapQuality = 20; // horrible hack...  at least approximate with alignment mismatches against graph
                    }

                    // check if somehow we've ended up with an indel at the ends
                    // if so, grab the reference sequence right beyond it and add
                    // a single match to the cigar, allowing variant detection methods
                    // to run on the results without internal modification
                    Cigar& cigar = flat_cigar;
                    //cerr << flat_cigar << " " << flat_cigar.readLen() << " " << flat_cigar.refLen() << endl;
                    int flankSize = params.flatten_flank;
                    if (cigar.front().isIndel() ||
                        (cigar.front().isSoftclip() && cigar.at(1).isIndel())) {
                        alignment.Position -= flankSize;
                        string refBase = reference.getSubSequence(seqname, alignment.Position, flankSize);
                        if (cigar.front().isSoftclip()) {
                            alignment.QueryBases.erase(alignment.QueryBases.begin(),
                                                       alignment.QueryBases.begin()+cigar.front().length);
                            alignment.Qualities.erase(alignment.Qualities.begin(),
                                                       alignment.Qualities.begin()+cigar.front().length);
                            cigar.erase(cigar.begin());
                        }
                        alignment.QueryBases.insert(0, refBase);
                        alignment.Qualities.insert(0, string(flankSize, shortInt2QualityChar(30)));
                        Cigar newCigar; newCigar.push_back(CigarElement(flankSize, 'M'));
                        newCigar.append(flat_cigar);
                        flat_cigar = newCigar;
                    }
                    if (cigar.back().isIndel() ||
                        (cigar.back().isSoftclip() && cigar.at(cigar.size()-2).isIndel())) {
                        string refBase = reference.getSubSequence(seqname,
                                                                  alignment.Position
                                                                  + flat_cigar.refLen(),
                                                                  flankSize);
                        if (cigar.back().isSoftclip()) {
                            alignment.QueryBases.erase(alignment.QueryBases.end()-cigar.back().length,
                                                       alignment.QueryBases.end());
                            alignment.Qualities.erase(alignment.Qualities.end()-cigar.back().length,
                                                      alignment.Qualities.end());
                            cigar.pop_back();
                        }
                        Cigar newCigar; newCigar.push_back(CigarElement(flankSize, 'M'));
                        flat_cigar.append(newCigar);
                        //flat_cigar.append(newCigar);
                        alignment.QueryBases.append(refBase);
                        alignment.Qualities.append(string(flankSize, shortInt2QualityChar(30)));
                    }

                    flat_cigar.toCigarData(alignment.CigarData);
                    //cerr << flat_cigar << " " << flat_cigar.readLen() << " " << flat_cigar.refLen() << endl;

                    if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                        ref = reference.getSubSequence(seqname,
                                                       max((long int) 0, dag_start_position),
                                                       alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
                    }

                    AlignmentStats stats_after;
                    countMismatchesAndGaps(alignment, flat_cigar, ref, dag_start_position, stats_after, params.debug);
                    /*
                    if ((!was_mapped || (stats_before.softclip_qsum >= stats_after.softclip_qsum
                                         && stats_before.mismatch_qsum >= stats_after.mismatch_qsum))
                         && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {
                    */
                    /*
                    if ((!was_mapped || (stats_before.softclip_qsum + stats_before.mismatch_qsum
                                         >= stats_after.softclip_qsum + stats_after.mismatch_qsum))
                         && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {
                    */

                    // we accept the new alignment if...
                    if (!was_mapped  // it wasn't mapped previously
                        // or if we have removed soft clips or mismatches (per quality) from the alignment
                        //|| ((stats_before.softclip_qsum >= stats_after.softclip_qsum
                        //     && stats_before.mismatch_qsum >= stats_after.mismatch_qsum)
                        || ((stats_before.softclip_qsum + stats_before.mismatch_qsum
                             >= stats_after.softclip_qsum + stats_after.mismatch_qsum)
                            // and if we have added gaps, we have added them to remove mismatches or softclips
                            && (stats_before.gaps >= stats_after.gaps // accept any time we reduce gaps while not increasing softclips/mismatches
                                || (stats_before.gaps < stats_after.gaps // and allow gap increases when they improve the alignment
                                    && (stats_before.softclip_qsum 
                                        + stats_before.mismatch_qsum
                                        >
                                        stats_after.softclip_qsum
                                        + stats_after.mismatch_qsum))))
                            // and the alignment must not have more than the acceptable number of gaps, softclips, or mismatches
                            // as provided in input parameters
                        && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {

                        // keep the alignment
                        // TODO require threshold of softclips to keep alignment (or count of gaps, mismatches,...)
                        if (params.debug) {
                            cerr << "realigned " << alignment.Name << " to graph, which it maps to with "
                                 << stats_after.mismatch_qsum << "q in mismatches and "
                                 << stats_after.softclip_qsum << "q in soft clips" << endl;
                        }
                        ++total_improved;
                        has_realigned = true;
                    } else {
                        // reset to old version of alignment
                        if (params.debug) {
                            cerr << "failed realignment of " << alignment.Name << " to graph, which it maps to with: " 
                                 << stats_after.mismatch_qsum << "q in mismatches " << "(vs " << stats_before.mismatch_qsum << "q before), and "
                                 << stats_after.softclip_qsum << "q in soft clips " << "(vs " << stats_before.softclip_qsum << "q before) " << endl;
                        }
                        has_realigned = false;
                        alignment = originalAlignment;
                    }
                }
                //} // try block

            } catch (...) {
                cerr << "exception when realigning " << alignment.Name
                     << " at position " << referenceIDToName[alignment.RefID]
                     << ":" << alignment.Position
                     << " " << alignment.QueryBases << endl;
                // reset to original alignment
                has_realigned = false;
                alignment = originalAlignment;

            }
        }

        // ensure correct order if alignments move
        long int maxOutputPos = initialAlignmentPosition - dag_window_size;
        // if we switched sequences we need to flush out all the reads from the previous one
        string lastSeqname = currentSeqname;
        if (seqname != currentSeqname) {
            // so the max output position is set past the end of the last chromosome
            if (!currentSeqname.empty()) {
                maxOutputPos = reference.sequenceLength(currentSeqname) + dag_window_size;
            }
            currentSeqname = seqname;
        }

        if (!params.dry_run) {
            map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
            for ( ; p != alignmentSortQueue.end(); ++p) {
                // except if we are running in unsorted mode, stop when we are at the window size
                if (!params.unsorted_output && p->first > maxOutputPos) {
                    break; // no more to do
                } else {
                    for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a) {
                        writer.SaveAlignment(*a);
                    }
                }
            }
            if (p != alignmentSortQueue.begin()) {
                alignmentSortQueue.erase(alignmentSortQueue.begin(), p);
            }
            if (!params.only_realigned || has_realigned) {
                alignmentSortQueue[alignment.Position].push_back(alignment);
            }
        }
    } // end GetNextAlignment loop

    if (!params.dry_run) {
        map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
        for ( ; p != alignmentSortQueue.end(); ++p) {
            for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a)
                writer.SaveAlignment(*a);
        }
    }

    gssw_graph_destroy(graph);
    free(nt_table);
	free(mat);

    reader.Close();
    writer.Close();

    if (params.debug) {
        cerr << "total reads:\t" << total_reads << endl;
        cerr << "realigned:\t" << total_realigned << endl;
        cerr << "improved:\t" << total_improved << endl;
    }

}
Exemple #12
0
void TagBam::Tag() {

    // open the annotations files for processing;
    OpenAnnoFiles();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
	if (!reader.Open(_bamFile)) {
        cerr << "Failed to open BAM file " << _bamFile << endl;
        exit(1);
    }
    
    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // set compression mode
    BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
//    if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
    writer.SetCompressionMode(compressionMode);
    // open our BAM writer
    writer.Open("stdout", bamHeader, refs);

    // rip through the BAM file and test for overlaps with each annotation file.
    BamAlignment al;
    vector<BED> hits;

    while (reader.GetNextAlignment(al)) {
        if (al.IsMapped() == true) {
            BED a;
            a.chrom = refs.at(al.RefID).RefName;
            a.start = al.Position;
            a.end   = al.GetEndPosition(false, false);
            a.strand = "+";
            if (al.IsReverseStrand()) a.strand = "-";
            
            ostringstream annotations;
            // annotate the BAM file based on overlaps with the annotation files.
            for (size_t i = 0; i < _annoFiles.size(); ++i) 
            {
                // grab the current annotation file.
                BedFile *anno = _annoFiles[i];
                
                if (!_useNames && !_useScores && !_useIntervals) {
                    // add the label for this annotation file to tag if there is overlap
                    if (anno->anyHits(a.chrom, a.start, a.end, a.strand, 
                                      _sameStrand, _diffStrand, _overlapFraction, false))
                    {
                        annotations << _annoLabels[i] << ";";
                    }
                }
                // use the score field
                else if (!_useNames && _useScores && !_useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand, 0.0, false);
                    for (size_t i = 0; i < hits.size(); ++i) {
                        annotations << hits[i].score;
                        if (i < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
                // use the name field from the annotation files to populate tag
                else if (_useNames && !_useScores && !_useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand, 0.0, false);
                    for (size_t j = 0; j < hits.size(); ++j) {
                        annotations << hits[j].name;
                        if (j < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
                // use the full interval information annotation files to populate tag
                else if (!_useNames && !_useScores && _useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand,  0.0, false);
                    for (size_t j = 0; j < hits.size(); ++j) {
                        annotations << _annoLabels[i]  << ":" << 
                                        hits[j].chrom  << ":" <<
                                        hits[j].start  << "-" <<
                                        hits[j].end    << "," <<
                                        hits[j].name   << "," <<
                                        hits[j].score  << "," <<
                                        hits[j].strand;
                        if (j < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
            }
            // were there any overlaps with which to make a tag?
            if (annotations.str().size() > 0) {
                al.AddTag(_tag, "Z", annotations.str().substr(0, annotations.str().size() - 1)); // get rid of the last ";"
            }
        }
        writer.SaveAlignment(al);
    }
    reader.Close();
    writer.Close();
    // close the annotations files;
    CloseAnnoFiles();
}
Exemple #13
0
/**
 * Entry point: streams the alignments of one or more BAM files chromosome by
 * chromosome and accumulates, for every region of the region file, a
 * per-position count of alignment block starts (region.tags).  Finished
 * regions are handed to gene_processing().
 *
 * Inputs are taken from the parameter structure returned by interface():
 *   - region_f   region file, expected to be sorted like the BAM file(s)
 *   - mapping_f  either a space separated list of BAM files, a file of file
 *                names, or a single BAM file
 *   - lbias      optional locus bias output file
 *   - posc/chrmap optional position pile-up outputs
 *   - type ("p" selects paired-end handling), unique, nochr
 *
 * Returns 0 on success.  The process exits with status 1 when the region
 * file, the BAM input or the locus bias file cannot be opened, or when
 * jumping to a chromosome fails.
 */
int main ( int argc, char *argv[] ) { 

  struct parameters *param = 0;
  param = interface(param, argc, argv);

  //region file input (the region file should be sorted as the same way as the bam file)
  ifstream region_f;
  region_f.open(param->region_f, ios_base::in);  // the region file is opened
  if ( !region_f ) {
    // without this check every later getline() fails without ever reaching
    // eof(), so the region refill loops below could never terminate
    cerr << "can not open region file.\n";
    exit(1);
  }

  //bam input and generate index if not yet 
  //-------------------------------------------------------------------------------------------------------+
  // BAM input (file or filenames?)                                                                        |
  //-------------------------------------------------------------------------------------------------------+
  char *fof = param->mapping_f;
  FILE *IN=NULL;
  char linefof[5000];
  int filecount=0;
  vector <string> fnames;

  if (strchr(fof,' ')!=NULL) {
    // space separated list of file names given directly on the command line
    char *ptr;
    ptr=strtok(fof," ");
    while (ptr!=NULL) {
      fnames.push_back(ptr);
      filecount++;
      ptr=strtok(NULL," ");
    }
  } else {
    IN=fopen(fof,"rt");
    if (IN!=NULL) {
      long linecount=0;
      while (fgets(linefof,5000-1,IN)!=NULL) {
        linecount++;
        if (linefof[0]!='#' && linefof[0]!='\n') {
          char *ptr=strchr(linefof,'\n');
          if (ptr!=NULL && ptr[0]=='\n') {
            ptr[0]='\0';
          }
          // a line that names an openable file marks the input as a file of filenames
          FILE *dummy=NULL;
          dummy=fopen(linefof,"rt");
          if (dummy!=NULL) {     // seems to be a file of filenames...
            fclose(dummy);
            fnames.push_back(linefof);
            filecount++;
          } else if (filecount==0 || linecount>=1000-1) {  // seems to be a single file
            fnames.push_back(fof);
            filecount++;
            break;
          }
        }
      }
      fclose(IN);
    }
  }  //file or file name decided and stored in vector "fnames"

  if ( fnames.empty() ) {
    cerr << "no readable mapping file was given.\n";
    exit(1);
  }

  cerr << "the input mapping files are:" << endl;
  vector <string>::iterator fit = fnames.begin();
  for(; fit != fnames.end(); fit++) {
    cerr << *fit << endl;
  }

  //-------------------------------------------------------------------------------------------------------+
  // end of file or filenames                                                                              |
  //-------------------------------------------------------------------------------------------------------+

  // open the BAM file(s)
  BamMultiReader reader;
  if ( !reader.Open(fnames) ) {
    cerr << "can not open the mapping file(s).\n";
    exit(1);
  }

  // get header & reference information
  string header = reader.GetHeaderText();
  RefVector refs = reader.GetReferenceData();

  if ( ! reader.LocateIndexes() )     // opens any existing index files that match our BAM files
    reader.CreateIndexes();         // creates index files for BAM files that still lack one


  // locus bias
  struct lb empty_profile = {0,0,0,0};
  vector <struct lb> locus_b(1000, empty_profile);
  // output locus bias file
  string locus_bias_set = param->lbias;
  ofstream locus_bias;
  if ( locus_bias_set != "" ) {
    locus_bias.open(param->lbias);
    if ( !locus_bias ) {
      cerr << "can not open locus_bias file.\n";
      exit(1);                       // this is a failure: do not report success
    }
  }

  //should decide which chromosome
  string line;
  string old_chr = "SRP";
  string type = param->type;

  //whether do some position-level pile-up stuff
  bool posc = false;
  ofstream posc_f;
  ofstream chrmap_f;
  string poscset = param->posc;
  if ( poscset != "" ) {
    posc = true;
    posc_f.open(param->posc);
    chrmap_f.open(param->chrmap);
  }

  bool noChr;
  if ( param->nochr == 1 ){
    noChr = true;
  } else {
    noChr = false;
  }

  //regions for the input of region file
  deque <struct region> regions;

  getline(region_f, line); //get the first line
  eatline(line,regions,noChr);
  
  deque <struct region>::iterator it = regions.begin();

  // set as soon as the region file has been read to its end; the chromosome
  // loop then stops instead of dereferencing an iterator into an empty deque
  bool region_file_done = false;

  while ( !region_file_done && it != regions.end() && it->chr != old_chr ) {

    old_chr = it->chr;  // set the current chr as old chr

    int chr_id  = reader.GetReferenceID(it->chr);

    if ( chr_id == -1 ) {  //reference not found

      // the BAM files do not know this chromosome: flush all of its regions
      for (; it != regions.end() && it->chr == old_chr; ) {
        gene_processing(*it,locus_b);           // print the old region info
        it = regions.erase(it);         // erase the current region
      }
  
      while ( regions.empty() ) {    
        getline(region_f, line);
        if ( region_f.eof() ){
          cerr << "finished: end of region file, zone 0" << endl;
          region_file_done = true;
          break;
        }
        eatline(line, regions,noChr);
        it = regions.begin();
        if (it != regions.end() && it->chr == old_chr){  
          gene_processing(*it,locus_b);      
          regions.clear();
          continue;
        }
      }
      continue;
    }

    int chr_len = refs.at(chr_id).RefLength;

    if ( !reader.SetRegion(chr_id, 1, chr_id, chr_len) ) // here set region
      {
        cerr << "bamtools count ERROR: Jump region failed " << it->chr << endl;
        reader.Close();
        exit(1);
      }

    //pile-up pos stats
    set <string> fragment;
    map <string, unsigned int> pileup;
    bool isposPileup = false;
    unsigned int old_start   = 0;
    unsigned int total_tags  = 0;
    unsigned int total_pos   = 0;
    unsigned int pileup_pos  = 0;


    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {

      if ( bam.IsMapped() == false ) continue;   // skip unaligned reads

      // number of reported hits (NH tag); alignments without a usable NH tag
      // are treated as uniquely mapped so that 'unique' is never read
      // uninitialised and never used as a zero divisor below
      unsigned int unique = 1;
      if ( !bam.GetTag("NH", unique) || unique == 0 ) {
        unique = 1;
      }
      if (param->unique == 1) {
        if (unique != 1) {                       // keep only uniquely mapped reads
          continue;
        }
      }

      if (read_length == 0){
        read_length = bam.Length;
      }

      //cout << bam.Name << endl;
      string chrom = refs.at(bam.RefID).RefName;
      string strand = "+";
      if (bam.IsReverseStrand()) strand = "-";

      unsigned int alignmentStart =  bam.Position+1;
      unsigned int mateStart = 0;                 // only meaningful for paired-end data
      if (type == "p") mateStart = bam.MatePosition+1;
      unsigned int alignmentEnd = bam.GetEndPosition();
      unsigned int cigarEnd;
      vector <int> blockLengths;
      vector <int> blockStarts;
      blockStarts.push_back(0);
      ParseCigar(bam.CigarData, blockStarts, blockLengths, cigarEnd);


      // position check for unique mapped reads (because is paired-end reads, shoule base on fragment level for paired end reads)
      if (posc == true && unique == 1) {

        if (type == "p" && fragment.count(bam.Name) > 0) 
          fragment.erase(bam.Name);               // second mate of a fragment already counted

        else {

          total_tags++;
          if (type == "p"){
            fragment.insert(bam.Name);
          }
          string alignSum;
          if (type == "p") {
             alignSum = int2str(alignmentStart) + "\t" + int2str(mateStart) + "\t.\t" + strand;
          } else {
             alignSum = int2str(alignmentStart) + "\t" + int2str(alignmentEnd) + "\t.\t" + strand;
          }

          if ( alignmentStart != old_start ) {
            isposPileup = false;
            map <string, unsigned int>::iterator pit = pileup.begin();            
            for (; pit != pileup.end(); pit++) {
              posc_f << chrom << "\truping\tpileup\t" << pit->first << "\t.\t" << "Pileup=" << pit->second << endl;     //print pileup
            }
            pileup.clear();           //clear pileup set
            pileup.insert( pair <string, unsigned int> (alignSum, 1) );  //insert the new read
            total_pos++;
          }

          else if ( alignmentStart == old_start ) { // same starts
            if ( pileup.count(alignSum) > 0 ) {  // pileup
              if ( pileup[alignSum] == 1 && isposPileup == false ) { 
                pileup_pos++; isposPileup = true;
              }
              pileup[alignSum]++;
            }
            else {
              pileup.insert( pair <string, unsigned int> (alignSum, 1) );
            }
          } //same starts

        }   //new fragment

        old_start = alignmentStart;
      } // do pos check



      float incre = 1.;
      if (blockStarts.size() > 1) incre = 0.5;     // incre half for junction reads
      incre /= static_cast < float >(unique);        // for multi aligned reads

      // once the region file is exhausted ("zone 3") there is nothing left to
      // count into; keep reading alignments only for the pile-up statistics
      if ( regions.empty() ) continue;

      deque <struct region>::iterator iter = regions.begin();

      if ( iter->start > alignmentEnd ) continue;  // skip reads not overlapping with the first region

      // the end() test must come first: iter may be end() after an erase
      while ( iter != regions.end() && iter->chr == old_chr && iter->start <= alignmentEnd ) {

        if (iter->end < alignmentStart) {            // the region ends before the alignmentStart

          gene_processing(*iter,locus_b);            // processing
          iter = regions.erase(iter);                // this region should be removed
          if ( regions.empty() ) { 
            getline(region_f, line);                        // get a line of region file
            if ( ! region_f.eof() ) {
              eatline(line, regions, noChr);                         // eat a line and put it into the deque
              iter = regions.begin();
            }
            else {  // it's reaching the end of the region file
              cerr << "finished: end of region file, zone 3" << endl;
              break;
            }
          }
          continue;
        }

        if (iter->end >= alignmentStart && iter->start <= alignmentEnd) {  //overlapping, should take action

          vector <int>::iterator cigit = blockStarts.begin();
          for (; cigit != blockStarts.end(); cigit++) {
            unsigned int current_start = *cigit + alignmentStart;
            int current_pos = current_start - (iter->start);
            //cout << iter->chr << "\t" << iter->start << "\t" << iter->end << "\t" << current_start << endl;
            if ( (iter->tags).count(current_pos) > 0 ) {
              (iter->tags)[current_pos] += incre;
            }
            else
              (iter->tags).insert( pair<int, float>(current_pos, incre) );  
          }

        }  // overlapping take action!

        if ( (iter+1) != regions.end() )
          iter++;                                           // if this region is not the last element in the deque
        else {                                              // the last element
          getline(region_f, line);                          // get a line of region file
          if ( ! region_f.eof() ){
            eatline(line, regions, noChr);                         // eat a line and put it into the deque
            // growing the deque invalidates iter, so re-anchor it on the last element
            iter = regions.end();
            iter--;
          }
          else {  //it's reaching the end of the region file
            cerr << "finished: end of region file, zone 4" << endl;
            break;
          }
        }

      } //while

    }  // read a bam


    // print chr map
    if (posc == true) {
      chrmap_f << old_chr << "\t" << total_tags << "\t" << total_pos << "\t" << pileup_pos << endl;
    } 
 
    //somehow to loop back
    it = regions.begin();                   //reset to begin
    for (; it != regions.end() && it->chr == old_chr; ) {
      gene_processing(*it,locus_b);              // print the old region info
      it = regions.erase(it);             // erase the current region
    }
  
    while ( regions.empty() ) {    

      getline(region_f, line);
      if ( region_f.eof() ){
        cerr << "finished: end of region file, zone 5" << endl;
        region_file_done = true;          // locus bias is written after the loop
        break;
      }
      eatline(line, regions, noChr);
      it = regions.begin();
      if (it != regions.end() && it->chr == old_chr){
        gene_processing(*it, locus_b);      
        regions.clear();
        continue;
      }
    }

  } // region chr != old chr

  //print locus bias (only once the whole region file has been consumed)
  if ( region_file_done ) {
    for (unsigned int l = 0; l < 1000; l++){
      locus_bias << l << "\t" << locus_b[l].ps << "\t" << locus_b[l].hs << "\t" << locus_b[l].pe << "\t" << locus_b[l].he << endl;
    }
  }
      
  regions.clear();
  reader.Close();
  region_f.close();
  return 0;

} //main
// Counts, for every genomic region of every chromosome in the BAM file, how
// many alignments fall into the region, split by read group and by strand,
// and prints one tab separated row per region to stdout:
//     chrom  start  end  <fwd>,<rev>  <fwd>,<rev> ...   (one pair per read group)
//
// all_args is forwarded to Init().  bam_reader, bam_file, chrom_genomicRegions
// and min_map_qual are not declared in this block; they come from elsewhere in
// the file.  StlFor and Assert are project macros whose expansions are not
// visible here.
int main_aseregion(const vector<string> &all_args)
{
    Init(all_args);
    
    cerr << "* Reading bam file " << endl;
    OpenBam(bam_reader, bam_file);
    bam_reader.OpenIndex(bam_file + ".bai");
    
    vector<string> readGroupVector; //Obtain all the readgroups.
    SamHeader header = bam_reader.GetHeader();
    SamReadGroupDictionary headerRG = header.ReadGroups;
    for (SamReadGroupIterator it = headerRG.Begin(); it != headerRG.End(); it ++)
    {
        readGroupVector.push_back(it -> ID);
    }
    
    // Header row: fixed columns followed by one column per read group ID.
    cout << "#CHROM" << "\t" << "StartPos" << "\t" << "EndPos";
    for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++)
    {
        cout << "\t" << *it;
    }
    cout << endl;
    
    vector<RefData> chroms = bam_reader.GetReferenceData();
    
    StlFor(chrom_idx, chroms)
    {
        string &chrom = chroms[chrom_idx].RefName;
        cerr << "* On chrom " << chrom << endl;
        
        // NOTE(review): searchIt is dereferenced below without being compared
        // against chrom_genomicRegions.end(); looks unsafe for a BAM
        // chromosome that has no regions -- confirm against how the map is filled.
        map<string, vector<GenomicRegion> >::iterator searchIt = chrom_genomicRegions.find(chrom);
        
        BamAlignment startPointer; // This pointer will point to the region immediately before the start of current regions under inspection.
        bam_reader.Jump(chrom_idx);
        // No alignment could be read after the jump: stop processing altogether
        // (this leaves the chromosome loop, not just this chromosome).
        if (!bam_reader.GetNextAlignment(startPointer))
            break;
        
        int count = 0;
        // For each region, walk through all the reads corresponding to this region and count the reads.
        for (vector<GenomicRegion>::iterator it = searchIt -> second.begin(); it != searchIt -> second.end(); ++it)
        {
            bam_reader.Jump(chrom_idx, startPointer.Position); // Fix the reading pointer.
            if (!bam_reader.GetNextAlignment(startPointer))
                break;
            // flag becomes 1 when the reader runs out of alignments while
            // skipping reads that end before the region starts.
            int flag = 0;
            while (true)
            {
                int startEnd = startPointer.GetEndPosition();
                if (startEnd < it -> start)
                {
                    if (!bam_reader.GetNextAlignment(startPointer))
                    {
                        flag = 1;
                        break;
                    }
                }
                else
                {
                    break;
                }
            }
            
            if (flag == 1)
            {
                break;
            }
            // Now startPointer assumes its rightful position.
            BamAlignment nextPointer = startPointer; //This pointer traverse through all reads that align to the current genomic region in bed file and the iteration ends when this pointer pass through the end of the region.
            
            while (true)
            {
                int nextStart = nextPointer.Position;
                if (nextStart > it -> end)
                {
                    break; // This iteration is done.
                }
                
                // Alignments below the mapping quality threshold are skipped, not counted.
                if (nextPointer.MapQuality < min_map_qual)
                {
                    if (!bam_reader.GetNextAlignment(nextPointer))
                    {
                        break;
                    }
                    continue;
                }
                
                string currentRG;
                // NOTE(review): the read group lookup happens inside Assert(); if
                // that macro can be compiled out, currentRG would stay empty -- confirm.
                Assert(nextPointer.GetReadGroup(currentRG));
                
                // Pick the per-strand counter map of the region and bump the
                // entry of this alignment's read group.
                map<string, int> &RG_counts = nextPointer.IsReverseStrand() ? it -> revCounts : it -> fwdCounts;
                map<string, int>::iterator searchItForRG = RG_counts.find(currentRG);
                if (searchItForRG == RG_counts.end())
                {
                    RG_counts[currentRG] = 1;
                }
                else
                {
                    ++ RG_counts[currentRG];
                }
                if (!bam_reader.GetNextAlignment(nextPointer))
                {
                    break;
                }
            }
            count ++;
            // Progress report every 1000 regions.
            if (count % 1000 == 0)
                cerr << "Processed" << "\t" << count << endl;
        }
        
        // Output the counts
        for (vector<GenomicRegion>::iterator it = searchIt -> second.begin(); it != searchIt -> second.end(); ++it)
        {
            cout << chrom << "\t" << it -> start << "\t" << it -> end;
            // One "<forward>,<reverse>" column per read group; read groups
            // without a counter entry are printed as 0.
            for (vector<string>::iterator subIt = readGroupVector.begin(); subIt != readGroupVector.end(); ++subIt)
            {
                map<string, int>::iterator searchItForRG = it -> fwdCounts.find(*subIt);
                if (searchItForRG != it -> fwdCounts.end())
                {
                    cout << "\t" << searchItForRG -> second << ",";
                }
                else
                {
                    cout << "\t" << "0,";
                }
                searchItForRG = it -> revCounts.find(*subIt);
                if (searchItForRG != it -> revCounts.end())
                {
                    cout << searchItForRG -> second;
                }
                else
                {
                    cout << "0";
                }
            }
            cout << endl;
        }
    }
Exemple #15
0
int CropBamTool::CropBam()
{
    // open bam files
    BamMultiReader bamReader;
    bamReader.Open(bamFiles);

    // the dictionary of chromosomes
    RefVector genome = bamReader.GetReferenceData();

    // get the scanning window
    vector<tuple<int,int,int>> windows;
    int numWindows = GenericRegionTools::toScanWindow(genome, regionStrings, windows);

    unordered_set<string> readpool;

    // temporary struct for sequence object
    typedef struct {
        string name;
        int head_soft_clip;
        int tail_soft_clip;
        string seq;
        string qual;
    }cropbam_seq_t;

    // temporary struct for unique seqs
    map<string,list<cropbam_seq_t>> uniqueSeqPool;

    // lambda expression for output
    auto Output = [this](cropbam_seq_t &a){
          if (this->outFormat=="fasta"){
              cout << ">" << a.name << "\t"
                   << "head_soft_clip=" << a.head_soft_clip << "\t"
                   << "tail_soft_clip=" << a.tail_soft_clip << "\t"
                   << endl
                   << a.seq << endl;
          }
          if (this->outFormat=="fastq"){
              cout << "@" << a.name << "\t"
                   << "head_soft_clip=" << a.head_soft_clip << "\t"
                   << "tail_soft_clip=" << a.tail_soft_clip << "\t"
                   << endl
                   << a.seq << endl;
              cout << "+" << endl
                   << a.qual << endl;
          }
    };

    // loop over windows
    omp_set_dynamic(0);
    omp_set_num_threads(numThreads);
    #pragma omp parallel for shared(genome)
    for (int i=0; i<numWindows; i++)
    {
        clock_t tStart = clock();

        bamReader.Open(bamFiles);

        int wId = get<0>(windows[i]);
        int wLp = get<1>(windows[i]);
        int wRp = get<2>(windows[i]);

        if (verbose>=1) Verbose("process the window " + genome[wId].RefName + ":" + to_string(wLp+1) + "-" + to_string(wRp));

        // rewind the bam reader
        bamReader.Rewind();
        // set the region
        bamReader.SetRegion(wId, wLp, wId, wRp);

        int numReads = 0;
        // retrieve the alignment
        BamAlignment aln;
        while (bamReader.GetNextAlignment(aln))
        {
            // skip the alignment if it doesn't overlap the window
            if (aln.Position>=wRp || aln.GetEndPosition()<=wLp)
                continue;

            // skip the invalid alignment
            if (!isValidAlignment(aln, readLenThres, mapQualThres, alnFlagMarker))
                continue;

            // skip the alignment harboring too many mismatches
            if (!GenericBamAlignmentTools::validReadIdentity(aln, 1-alnIdenThres))
                continue;

            stringstream keyss;
            keyss << GenericBamAlignmentTools::getBamAlignmentName(aln) << "-"
                  << wId << "-" << wLp << "-" << wRp;
            string key = keyss.str();
            auto ptr = readpool.find(key);
            if (ptr!=readpool.end())
                continue;
            readpool.emplace(key);

            // get the partial read
            string readSegment, readQualSegment, genomeSegment;
            GenericBamAlignmentTools::getLocalAlignment(aln, wLp, wRp-wLp, readSegment, readQualSegment, genomeSegment);

            // add soft clip
            int hsc=0;
            auto ptr0 = aln.CigarData.begin();
            if (aln.Position>=wLp && (ptr0->Type=='S' || ptr0->Type=='H'))
            {
                stringstream headClipSeq, headClipQual;
                for (int i=0; i<ptr0->Length; i++)
                {
                    headClipSeq << aln.QueryBases[i];
                    headClipQual << aln.Qualities[i];
                }

                if (keepClip)
                {
                    readSegment=headClipSeq.str()+readSegment;
                    readQualSegment=headClipQual.str()+readQualSegment;
                }

                hsc += ptr0->Length;
            }
            int tsc=0;
            auto ptr1 = aln.CigarData.rbegin();
            if (aln.GetEndPosition()<wRp && (ptr1->Type=='S' || ptr1->Type=='H'))
            {
                string ss="", qs="";
                auto str=aln.QueryBases.rbegin();
                auto qtr=aln.Qualities.rbegin();
                for (int i=0; i<ptr1->Length; i++,str++,qtr++)
                {
                    ss=(*str)+ss;
                    qs=(*qtr)+qs;
                }
                if (keepClip)
                {
                    readSegment=readSegment+ss;
                    readQualSegment=readQualSegment+qs;
                }
                tsc += ptr1->Length;
            }

            if (readSegment.length()>=segmentLenThres)
            {
                cropbam_seq_t a;
                a.name = GenericBamAlignmentTools::getBamAlignmentName(aln);
                a.head_soft_clip = hsc;
                a.tail_soft_clip = tsc;
                a.seq = readSegment;
                a.qual = readQualSegment;
                if (uniqueSeqPool.count(a.seq)==0)
                    uniqueSeqPool[a.seq] = list<cropbam_seq_t>(1,a);
                else
                    uniqueSeqPool[a.seq].emplace_back(a);
//                if (outFormat=="fasta"){
//                    cout << ">" << GenericBamAlignmentTools::getBamAlignmentName(aln) << "\t"
//                         << "head_soft_clip=" << hsc << "\t"
//                         << "tail_soft_clip=" << tsc << "\t"
//                         << endl
//                         << readSegment << endl;
//                }

//                if (outFormat=="fastq"){
//                    cout << "@" << GenericBamAlignmentTools::getBamAlignmentName(aln) << "\t"
//                         << "head_soft_clip=" << hsc << "\t"
//                         << "tail_soft_clip=" << tsc << "\t"
//                         << endl
//                         << readSegment << endl;
//                    cout << "+" << endl
//                         << readQualSegment << endl;
//                }

                numReads++;
            }
        }

        numReads = 0;
        if (useUnique){
            ofstream of;
            of.open(outFreq);
            for (auto a : uniqueSeqPool){
                if (a.second.size()>=thresFreq){
                    Output(*a.second.begin());
                    of << a.second.begin()->name << "\t" << a.second.size() << endl;
                    numReads ++;
                }
            }
            of.close();
        }else{
            for (auto a : uniqueSeqPool){
                for (auto b : a.second){
                    Output(b);
                    numReads ++;
                }
            }
        }

        clock_t tEnd = clock();

        if (verbose>=1) Verbose("retrieve " + to_string(numReads) + " reads");
        if (verbose>=1) Verbose("time elapsed " + to_string((double)(tEnd-tStart)/CLOCKS_PER_SEC) + " seconds");
    }

    return 0;
}
// Intersects every mapped alignment of a BAM file ("A") with the "B" BED file.
//
// bamFile: path handed to BamReader::Open.
//
// Each mapped alignment is converted to a BED record (chrom, start, end,
// name with /1 or /2 mate suffix, MAPQ as score, strand).
//  - When _bamOutput is set, the alignment itself is written as BAM to stdout
//    if it has at least one overlap (or, when _noHit is set, if it has none).
//  - Otherwise FindOverlaps() is invoked for the alignment (or for each of its
//    blocks) and is responsible for reporting.
// When _obeySplits is set, overlaps are evaluated per alignment block as
// produced by getBamBlocks() rather than over the whole alignment span.
void BedIntersect::IntersectBam(string bamFile) {

	// load the "B" bed file into a map so
	// that we can easily compare "A" to it for overlaps
	_bedB->loadBedFileIntoMap();

	// open the BAM file
	BamReader reader;
	BamWriter writer;
	reader.Open(bamFile);

	// get header & reference information
	string header  = reader.GetHeaderText();
	RefVector refs = reader.GetReferenceData();

	// open a BAM output to stdout if we are writing BAM
	if (_bamOutput == true) {
		// open our BAM writer
		writer.Open("stdout", header, refs, _isUncompressedBam);
	}

	vector<BED> hits;
	// reserve some space
	hits.reserve(100);

	_bedA->bedType = 6;
	BamAlignment bam;
	// process each alignment in file order
	while (reader.GetNextAlignment(bam)) {

		// unmapped reads have no coordinates to intersect
		if (!bam.IsMapped())
			continue;

		BED a;
		a.chrom = refs.at(bam.RefID).RefName;
		a.start = bam.Position;
		a.end   = bam.GetEndPosition(false);

		// build the name field from the BAM alignment.
		a.name = bam.Name;
		if (bam.IsFirstMate()) a.name += "/1";
		if (bam.IsSecondMate()) a.name += "/2";

		a.score  = ToString(bam.MapQuality);

		a.strand = "+";
		if (bam.IsReverseStrand()) a.strand = "-";

		if (_bamOutput == true) {
			bool overlapsFound = false;
			// treat the BAM alignment as a single "block"
			if (_obeySplits == false) {
				overlapsFound = FindOneOrMoreOverlap(a);
			}
			// split the BAM alignment into discrete blocks and
			// look for overlaps only within each block.
			else {
				bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
				// we don't want to split on "D" ops, hence the "false"
				getBamBlocks(bam, refs, bedBlocks, false);

				vector<BED>::const_iterator bedItr = bedBlocks.begin();
				vector<BED>::const_iterator bedEnd = bedBlocks.end();
				for (; bedItr != bedEnd; ++bedItr) {
					// test the individual block, not the whole alignment span;
					// testing "a" here would make split handling a no-op.
					if (FindOneOrMoreOverlap(*bedItr) == true)
						overlapsFound = true;
				}
			}
			if (overlapsFound == true) {
				if (_noHit == false)
					writer.SaveAlignment(bam);
			}
			else {
				if (_noHit == true) {
					writer.SaveAlignment(bam);
				}
			}
		}
		else {
			// treat the BAM alignment as a single BED "block"
			if (_obeySplits == false) {
				FindOverlaps(a, hits);
				hits.clear();
			}
			// split the BAM alignment into discrete BED blocks and
			// look for overlaps only within each block.
			else {
				bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
				getBamBlocks(bam, refs, bedBlocks, false);

				vector<BED>::const_iterator bedItr = bedBlocks.begin();
				vector<BED>::const_iterator bedEnd = bedBlocks.end();
				for (; bedItr != bedEnd; ++bedItr) {
					FindOverlaps(*bedItr, hits);
					hits.clear();
				}
			}
		}
	}

	// close the relevant BAM files.
	reader.Close();
	if (_bamOutput == true) {
		writer.Close();
	}
}
Exemple #17
0
// Implements the "bbctools create" command.
//
// Reads either a single BBC file or one or more BAM files (the positional
// arguments of optParser) and, depending on the options given, produces any of:
// a new BBC file (--bbc), BCI index (--index), CBC coarse file (--coarse),
// per-region coverage statistics (--covStats), summary statistics (--sumStats)
// and a read-origin file (--readOrigin). At most one output may be piped to
// STDOUT ("-").
//
// Returns 0 on success, -1 for an invalid option value/combination and 1 for
// an input/output or processing failure. Diagnostics are written to cerr.
//
// NOTE(review): objects allocated with new (regions, bbcCreate, readTracker)
// are not released on the early error returns; the caller presumably exits the
// process on a non-zero result - confirm before reusing this as a library call.
int bbctools_create( BbcUtils::OptParser &optParser ) {
	const vector<string> cmdArgs = optParser.getArgs();

	// remove .bbc extension from bbc file root, if present
	string bbcfileRoot = optParser.getOptValue( "bbc" );
	int i = bbcfileRoot.size() - 4;
	if( i > 0 && bbcfileRoot.substr(i,4) == ".bbc" ) {
		bbcfileRoot = bbcfileRoot.substr(0,i);
	}
	bool f_bci = optParser.getOptBoolean("index");
	bool f_cbc = optParser.getOptBoolean("coarse");

	string targetRegions   = optParser.getOptValue("regions");
	string annotationFields = optParser.getOptValue( "annotationFields");
	vector<string> auxRegionSplit = BbcUtils::mapToPairList(annotationFields);

	string  sumstatsFile = optParser.getOptValue("sumStats");
	string  covstatsFile = optParser.getOptValue("covStats");
	string  readOrigFile = optParser.getOptValue("readOrigin");
	string  readType     = optParser.getOptValue("readType");
	string  covDepths    = optParser.getOptValue("covDepths","-");
	double  minPcCov     = optParser.getOptNumber("minPcCov");
	int32_t primerLength = optParser.getOptInteger( "primerLength", (readType == "AmpliSeq" ? 30 : 0) );
	int32_t maxE2eEndGap = optParser.getOptInteger( "e2eGap", (readType == "AmpliSeq" ? 2 : 0) );

	bool   autoCreateBamIndex = optParser.getOptBoolean("autoCreateBamIndex");
	bool     samdepth         = optParser.getOptBoolean("samdepth");
	int32_t  filterQuality    = optParser.getOptInteger("minMAPQ");
	int32_t  minAlignLength   = optParser.getOptInteger("minAlignLength");
	bool     filterDuplicates = optParser.getOptBoolean("noDups");
	bool     filterUnique     = optParser.getOptBoolean("unique");
	// SAM flag bits that cause a read to be skipped; 0x400 is added with --noDups
	uint32_t skipFlag         = filterDuplicates ? 0x704 : 0x304;
	// --unique overrides --minMAPQ with a threshold of 1
	uint16_t minMapQuality    = filterUnique ? 1 : filterQuality;

	bool onlyOnTargetReads = optParser.getOptBoolean("onTargetReads");
	bool onlyOnTargetBases = optParser.getOptBoolean("onTargetBases");

	// possible future options
	bool invertOnTarget = false;

	// check basic valid argument values and combinations
	int numOuts  = !bbcfileRoot.empty() + !covstatsFile.empty() + !sumstatsFile.empty() + !readOrigFile.empty();
	int numPipes = (bbcfileRoot == "-") + (covstatsFile == "-") + (sumstatsFile == "-") + (readOrigFile == "-");
	if( numOuts == 0 && !f_bci && !f_cbc ) {
		bbcfileRoot = "-";	// default if no other output specified
	} else if( numPipes > 1 ) {
		cerr << "Error: bbctools create: Only one file output (--covStats, --sumStats, --readOrigin or --bbc) may be piped to STDOUT." << endl;
		return -1;
	} else if( samdepth && numOuts ) {
		cerr << "Error: bbctools create: --samdepth (-s) option may only be used without other output options." << endl;
		return -1;
	}
	// check if single argument is a BBC file and leave open for reading if so
	BbcView bbcView;
	bool haveBbcFile = cmdArgs.size() == 1 && bbcView.Open( cmdArgs[0], true );
	bbcView.SelectPrintStream( samdepth ? "SAMDEPTH" : "BBCVIEW" );

	// check distinction between default and explicit no target regions - only for BBC input
	bool explicitNoTargetRegions = false;
	if( targetRegions == "-" ) {
		explicitNoTargetRegions = haveBbcFile;
		targetRegions = "";
	}
	if( targetRegions.empty() ) {
		if( onlyOnTargetBases && explicitNoTargetRegions && !invertOnTarget ) {
			cerr << "Warning: bbctools create --onTargetBases (-b) option with --regions '-' produces no coverage." << endl;
		} else if( onlyOnTargetReads ) {
			cerr << "Error: bbctools create --onTargetReads (-r) option requires a --regions file." << endl;
			return -1;
		}
	}
	// check for legal BBC create options
	if( f_bci || f_cbc ) {
		if( (bbcfileRoot.empty() || bbcfileRoot == "-") && !haveBbcFile ) {
			string opt = f_bci ? "--index (-i)" : "--coarse (-c)";
			cerr << "Error: bbctools create "+opt+" option requires the --bbc (-B) option or a BBC source file." << endl;
			return -1;
		}
	}
	BamMultiReader bamReader;
	if( haveBbcFile ) {
		// warn for options that do not work with BBC input
		if( filterQuality > 0 || filterDuplicates || filterUnique || minAlignLength ) {
			cerr << "Warning: SAM flag, alignment length and MAPQ filters ignored for BBC source file." << endl;
		}
		if( samdepth ) {
			cerr << "Error: --samdepth option is not supported for BBC source files." << endl;
			return -1;
		}
		if( !readOrigFile.empty() ) {
			cerr << "Error: --readOrigin option is not supported for BBC source files." << endl;
			return -1;
		}
	} else {
		// check / open for multiple BAM file inputs
		if ( !bamReader.Open(cmdArgs) ) {
			if( cmdArgs.size() == 1 ) cerr << "ERROR: Could not read input BAM file:";
			else cerr << "ERROR: Could not read all input BAM files:";
			// get and clean up bamtools error msg
			string errMsg = bamReader.GetErrorString();
			size_t pos = errMsg.find_first_of('\n');
			if( pos != string::npos ) errMsg = errMsg.substr(pos+1);
			pos = errMsg.find("::");
			if( pos != string::npos ) {
				pos = errMsg.find(": ");
				if( pos != string::npos ) errMsg = errMsg.substr(pos+1);
			}
			errMsg = BbcUtils::stringTrim(errMsg);
			// capitalize first letter; toupper() requires a value representable
			// as unsigned char, so cast to avoid UB on negative char values
			if( !errMsg.empty() ) {
				errMsg[0] = static_cast<char>( toupper( static_cast<unsigned char>(errMsg[0]) ) );
			}
			cerr << endl << errMsg << "." << endl;
			return 1;
		}
	}
	// grab reference list from either input source
	const RefVector &references = haveBbcFile ? bbcView.GetReferenceData() : bamReader.GetReferenceData();
	if( !references.size() ) {
		// Issue would already been detected if input was BBC file
		cerr << "ERROR: " << (cmdArgs.size() > 1 ? "One or more " : "");
		cerr << "BAM file contains unaligned reads (no references).\n";
		return 1;
	}
	// check/set up target regions input regions/region statistics output
	RegionCoverage *regions = NULL;
	string covstatsStaticFields;
	bool trackRegionBaseCov = !covDepths.empty();
	if( covstatsFile.empty() ) {
		trackRegionBaseCov = false;
		if( !annotationFields.empty() ) {
			cerr << "Warning: --annotationFields (A) option ignored without --covStats (-C) option." << endl;
		}
		if( !covDepths.empty() && covDepths != "-" ) {
			cerr << "Warning: --covDepths (-D) option ignored without --covStats (-C) option." << endl;
		}
		if( !readType.empty() ) {
			cerr << "Warning: --readType (-T) option ignored without --covStats (-C) option." << endl;
		}
		// read regions for input only and/or creating sumStats
		if( !targetRegions.empty() || explicitNoTargetRegions || !sumstatsFile.empty() ) {
			regions = new RegionCoverage(references);
		}
	} else if( readType == "trgreads" || readType == "amplicon" || readType == "AmpliSeq" ) {
		if( haveBbcFile ) {
			cerr << "Creation of read coverage requires BAM file input." << endl;
			return -1;
		}
		AmpliconRegionStatistics *ampRegionStats = new AmpliconRegionStatistics(references);
		ampRegionStats->SetGenericReads( readType == "trgreads" );
		ampRegionStats->SetSigFacCoverage( minPcCov/100 );
		ampRegionStats->SetMaxUpstreamPrimerStart( primerLength );
		ampRegionStats->SetMaxE2eEndDistance( maxE2eEndGap );
		covstatsStaticFields = "overlaps,";
		covstatsStaticFields += (minPcCov > 0) ? "fwd_cov,rev_cov" : "fwd_e2e,rev_e2e";
		covstatsStaticFields += ",total_reads,fwd_reads,rev_reads";
		regions = ampRegionStats;
	} else if( readType == "trgbases" ) {
		if( haveBbcFile && targetRegions.empty() && !explicitNoTargetRegions ) {
			cerr << "Warning: Assuming reference contigs for base coverage targets (=> option --regions -)" << endl;
		}
		RegionStatistics *regionStats = new RegionStatistics(references);
		covstatsStaticFields = "covered,uncov_5p,uncov_3p,ave_basereads,fwd_basereads,rev_basereads";
		trackRegionBaseCov = true;
		regions = regionStats;
	} else if( readType == "covdepth" || readType.empty() ) {
		// output (sorted) targets file with only covDepth stats (if any)
		regions = new RegionCoverage(references);
	} else {
		cerr << "Unknown read type '" << readType << "'" << endl;
		return -1;
	}
	// Load the input regions or default to whole reference contig targets
	if( regions ) {
		regions->SetCovAtDepths( covDepths == "-" ? "20,100,500" : covDepths );
		if( targetRegions.empty() ) {
			regions->SetWholeContigTargets();
			// set contigs as explicit regions means all reads will seen as on-target
			// for consistency these are inverted (for input from BBC)
			invertOnTarget = true;
		} else {
			string auxFieldIdx = auxRegionSplit.size() ? auxRegionSplit[0] : "";
			string errMsg = regions->Load( targetRegions, "BED", auxFieldIdx );
			if( !errMsg.empty() ) {
				cerr << "ERROR: " + errMsg + "\n";
				return 1;
			}
		}
		if( onlyOnTargetReads && haveBbcFile ) {
			cerr << "Error: bbctools create --onTargetReads option is not supported for BBC source file." << endl;
			return -1;
		}
	}
	//
	// Perform all bbctools create utilities
	//
	BbcCreate *bbcCreate = NULL;
	if( !bbcfileRoot.empty() && (bbcfileRoot != "-" || !haveBbcFile) ) {
		bbcCreate = new BbcCreate(references);
		if( bbcfileRoot != "-" && !bbcCreate->Open(bbcfileRoot+".bbc") ) {
			return 1;
		}
		bbcCreate->SetNoOffTargetPositions(onlyOnTargetBases);
	}
	bbcView.SetNoOffTargetPositions(onlyOnTargetBases);
	// Stream input to output creators
	if( haveBbcFile ) {
		// BBC reader and driver via BbcView object
		if( bbcfileRoot != "-" || !covstatsFile.empty() ) {
			// disable BbcView text stream if using for file creation
			bbcView.SelectPrintStream("NONE");
		}
		// process input BBC for just new BBC and target coverage (defer BCI/CBC)
		bbcView.SetBbcCreate(bbcCreate);
		bbcView.SetRegionCoverage(regions);
		// explicitNoTargetRegions intended for explicitly removing on-target coverage
		bbcView.SetInvertOnTarget(explicitNoTargetRegions ^ invertOnTarget);
		if( bbcCreate || regions || bbcfileRoot == "-" ) {
			bbcView.ReadAll();
		}
	} else {
		// Test read tracking option for file write
		TrackReads *readTracker = NULL;
		try {
			if( !readOrigFile.empty() )
				readTracker = new TrackReads( readOrigFile, regions );
		} catch( std::runtime_error & ) {
			cerr << "ERROR: Unable to write to read tracking file " << readOrigFile << endl;
			return 1;
		}
		// BAM reader, BaseCoverage driver, dispatching to BbcCreate and BbcView objects
		BaseCoverage baseCov(references);
		baseCov.SetRegionCoverage(regions);
		baseCov.SetBbcCreate(bbcCreate);
		baseCov.SetInvertOnTarget(invertOnTarget);
		if( bbcfileRoot == "-" ) {
			baseCov.SetBbcView(&bbcView);
		}
		// Certain options require that all reads are processed, invalidating other performance options
		bool trackAllReads = !sumstatsFile.empty() || readTracker;
		// Implicit set of onlyOnTargetReads for performance when only these reads are required
		bool useBaseCov = (bbcfileRoot == "-" || bbcCreate);
		if( !targetRegions.empty() && !trackAllReads ) {
			onlyOnTargetReads |= onlyOnTargetBases;
			if( samdepth || !useBaseCov ) onlyOnTargetReads = true;
		}
		useBaseCov |= trackRegionBaseCov;
		// do not allow jumping if sumStats option is used - need to count all reads
		bool bamReaderSetRegions = (s_useBamReaderJump && !trackAllReads);
		int trgContig = 0, trgSrtPos = 0, trgEndPos = 0;
		int minJumpLen = s_initialMinJumpLen;
		int maxReadLen = s_initialMaxReadLen;
		if( onlyOnTargetReads ) {
			// load/create BAM index files for targeted reading
			// Note: BamIndex::BAMTOOLS format performed very badly and cannot use mixed with BTI/BAI files
			if( bamReaderSetRegions && !bamReader.LocateIndexes() ) {
				string plural( cmdArgs.size() > 1 ? "s" : "" );
				if( autoCreateBamIndex ) {
					cerr << "Warning: Did not locate BAM index (BAI) file" << plural << ", creating bamtools version..." << endl;
					// to avoid bug use new instance of BamMultiReader
					BamMultiReader bamReader2;
					if( !bamReader2.Open(cmdArgs) || !bamReader2.CreateIndexes() ) {
						cerr << "WARNING: Failed to create BAM index file" << plural << "." << endl;
						bamReaderSetRegions = false;
					} else {
						if( cmdArgs.size() == 1 ) {
							cerr << "Successfully created BAM index file: " << BbcUtils::fileName(cmdArgs[0]) << ".bai" << endl;
						} else {
							cerr << "Successfully created BAM index files." << endl;
						}
						// re-locate indexes with first reader - could not seem to locate BTI files created!
						if( !bamReader.LocateIndexes() ) {
							cerr << "WARNING: Failed to locate BAM index file" << plural << " just created!" << endl;
							bamReaderSetRegions = false;
						}
					}
				} else {
					cerr << "Warning: BAM index file" << plural << " not located for targeted BAM access." << endl;
					bamReaderSetRegions = false;
				}
			}
			// cancel region filtering if there are no regions to iterate (unexpected)
			if( !regions->GetNextRegion( trgContig, trgSrtPos, trgEndPos ) ) {
				onlyOnTargetReads = bamReaderSetRegions = false;
			}
			if( bamReaderSetRegions ) {
				bamReader.Jump( trgContig, trgSrtPos-maxReadLen );
			}
		}
		BamAlignment aln;
		while( bamReader.GetNextAlignmentCore(aln) ) {
			// appears to be an undocumented behavior here
			if( aln.RefID < 0 ) continue;
			// skip filtered reads by flag, length or mapping quality
			if( aln.AlignmentFlag & skipFlag ) continue;
			if( aln.MapQuality < minMapQuality ) continue;
			int32_t endPos = aln.GetEndPosition();
			if( minAlignLength > 0 ) {
				if( endPos - aln.Position < minAlignLength ) continue;
			}
			// screen for on-target reads
			if( onlyOnTargetReads ) {
				// find next region overlapping or beyond of current read
				bool moreRegions = true;
				bool setRegion = false;
				while( aln.RefID > trgContig || (aln.RefID == trgContig && aln.Position > trgEndPos) ) {
					if( !regions->GetNextRegion( trgContig, trgSrtPos, trgEndPos ) ) {
						moreRegions = false;
						break;
					}
					setRegion = bamReaderSetRegions;
				}
				if( !moreRegions ) {
					// prevent further on-target checks and exit early if not using sumStats
					onlyOnTargetReads = false;
					if( trackAllReads ) {
						// force tracking of off-target reads
						regions->TrackReadsOnRegion(aln,endPos);
						if( readTracker ) readTracker->Write(aln,endPos);
						continue;
					}
					break;
				}
				if( setRegion ) {
					// track max read length for future index jumps - just in case long reads ever used
					if( endPos - aln.Position > maxReadLen ) {
						maxReadLen = endPos - aln.Position;
						if( maxReadLen > minJumpLen ) minJumpLen = maxReadLen;
					}
					if( aln.RefID != trgContig || trgSrtPos - aln.Position > minJumpLen ) {
						bamReader.Jump( trgContig, trgSrtPos-maxReadLen );
					}
				}
				if( aln.RefID < trgContig || endPos < trgSrtPos ) {
					// force tracking of off-target reads
					if( trackAllReads ) {
						regions->TrackReadsOnRegion(aln,endPos);
						if( readTracker ) readTracker->Write(aln,endPos);
					}
					continue;	// current is before next target region - fetch the next within bounds
				}
			}
			// record base coverage and region coverage statistics
			if( useBaseCov ) {
				endPos = baseCov.AddAlignment(aln,endPos);
				if( endPos <= 0 ) {
					if( endPos == 0 ) continue;	// read was silently ignored
					cerr << "ERROR: BAM file is not correctly sorted vs. reference." << endl;
					return 1;
				}
			}
			// record read coverage and region coverage statistics
			if( regions ) {
				regions->TrackReadsOnRegion(aln,endPos);
			}
			if( readTracker ) {
				readTracker->Write(aln,endPos);
			}
		}
		// flush and close objects associated with output
		baseCov.Flush();
		// release the read tracker so its destructor runs (presumably this is what
		// finalizes the read-origin file - confirm); done before regions is deleted
		// below because the tracker was constructed with the regions pointer
		delete readTracker;
	}
	// Output in-memory region stats file and ensure BBC file is closed
	if( regions ) {
		// build output fields title string
		string outFields = "contig_id,contig_srt,contig_end";
		// second element is used as the annotation field titles; require it to exist
		if( auxRegionSplit.size() > 1 ) outFields += "," + auxRegionSplit[1];
		if( !covstatsStaticFields.empty() ) outFields += "," + covstatsStaticFields;
		regions->Write( covstatsFile, outFields );
		if( !sumstatsFile.empty() ) {
			regions->WriteSummary( sumstatsFile, invertOnTarget );
		}
		delete regions;
	}
	delete bbcCreate;

	// Complete remaining file creation options using a BBC file input
	// NOTE: Using BbbCreate for this would require code duplication and concurrent file output streaming
	if( f_bci || f_cbc ) {
		// Check BBC file source
		if( haveBbcFile ) {
			// derive output root from the input BBC file name
			bbcfileRoot = cmdArgs[0];
			int extPos = bbcfileRoot.size() - 4;
			if( extPos > 0 && bbcfileRoot.substr(extPos,4) == ".bbc" ) {
				bbcfileRoot = bbcfileRoot.substr(0,extPos);
			}
		} else if( !bbcView.Open( bbcfileRoot+".bbc", true ) ) {
			// report the file that was actually opened (.bbc, not .bam)
			cerr << "ERROR: Unexpected failure to read new BBC file '"+bbcfileRoot+".bbc'" << endl;
			return 1;
		}
		if( f_bci ) {
			BbcIndex indexer( bbcfileRoot+".bci" );
			if( !bbcView.CreateIndex(indexer) ) {
				cerr << "ERROR: Failed to create index file '" << bbcfileRoot << ".bci'" << endl;
				return 1;
			}
		}
		if( f_cbc ) {
			// CBC generation can use BCI file but is no faster since whole BBC file is read
			BbcCoarse cbcWriter( bbcfileRoot+".cbc" );
			if( !bbcView.CreateCbc(cbcWriter) ) {
				cerr << "ERROR: Failed to create coarse base coverage file '" << bbcfileRoot << ".cbc'" << endl;
				return 1;
			}
		}
	}
	return 0;
}
Exemple #18
0
// Length of the coordinate overlap between an alignment and a region.
// Compares aln.Position / aln.GetEndPosition() against region.LeftPosition /
// region.RightPosition only; reference IDs are not examined here.
// Returns 0 when the two spans do not intersect.
int get_overlap(BamAlignment& aln, BamRegion& region) {
    const int sharedStart = max(aln.Position, region.LeftPosition);
    const int sharedStop  = min(aln.GetEndPosition(), region.RightPosition);
    const int sharedLen   = sharedStop - sharedStart;
    return (sharedLen > 0) ? sharedLen : 0;
}
// builds index from associated BAM file & writes out to index file
bool BamToolsIndex::Create(void) {

    // skip if BamReader is invalid or not open
    if ( m_reader == 0 || !m_reader->IsOpen() ) {
        SetErrorString("BamToolsIndex::Create", "could not create index: reader is not open");
        return false;
    }

    // rewind BamReader
    if ( !m_reader->Rewind() ) {
        const string readerError = m_reader->GetErrorString();
        const string message = "could not create index: \n\t" + readerError;
        SetErrorString("BamToolsIndex::Create", message);
        return false;
    }

    try {
        // open new index file (read & write)
        const string indexFilename = m_reader->Filename() + Extension();
        OpenFile(indexFilename, IBamIODevice::ReadWrite);

        // initialize BtiFileSummary with number of references
        const int& numReferences = m_reader->GetReferenceCount();
        InitializeFileSummary(numReferences);

        // intialize output file header
        WriteHeader();

        // index building markers
        uint32_t currentBlockCount      = 0;
        int64_t currentAlignmentOffset  = m_reader->Tell();
        int32_t blockRefId              = -1;
        int32_t blockMaxEndPosition     = -1;
        int64_t blockStartOffset        = currentAlignmentOffset;
        int32_t blockStartPosition      = -1;

        // plow through alignments, storing index entries
        BamAlignment al;
        BtiReferenceEntry refEntry;
        while ( m_reader->LoadNextAlignment(al) ) {

            // if moved to new reference
            if ( al.RefID != blockRefId ) {

                // if first pass, check:
                if ( currentBlockCount == 0 ) {

                    // write any empty references up to (but not including) al.RefID
                    for ( int i = 0; i < al.RefID; ++i )
                        WriteReferenceEntry( BtiReferenceEntry(i) );
                }

                // not first pass:
                else {

                    // store previous BTI block data in reference entry
                    const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
                    refEntry.Blocks.push_back(block);

                    // write reference entry, then clear
                    WriteReferenceEntry(refEntry);
                    ClearReferenceEntry(refEntry);

                    // write any empty references between (but not including)
                    // the last blockRefID and current al.RefID
                    for ( int i = blockRefId+1; i < al.RefID; ++i )
                        WriteReferenceEntry( BtiReferenceEntry(i) );

                    // reset block count
                    currentBlockCount = 0;
                }

                // set ID for new reference entry
                refEntry.ID = al.RefID;
            }

            // if beginning of block, update counters
            if ( currentBlockCount == 0 ) {
                blockRefId          = al.RefID;
                blockStartOffset    = currentAlignmentOffset;
                blockStartPosition  = al.Position;
                blockMaxEndPosition = al.GetEndPosition();
            }

            // increment block counter
            ++currentBlockCount;

            // check end position
            const int32_t alignmentEndPosition = al.GetEndPosition();
            if ( alignmentEndPosition > blockMaxEndPosition )
                blockMaxEndPosition = alignmentEndPosition;

            // if block is full, get offset for next block, reset currentBlockCount
            if ( currentBlockCount == m_blockSize ) {

                // store previous block data in reference entry
                const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
                refEntry.Blocks.push_back(block);

                // update markers
                blockStartOffset  = m_reader->Tell();
                currentBlockCount = 0;
            }

            // not the best name, but for the next iteration, this value will be the offset of the
            // *current* alignment. this is necessary because we won't know if this next alignment
            // is on a new reference until we actually read it
            currentAlignmentOffset = m_reader->Tell();
        }

        // after finishing alignments, if any data was read, check:
        if ( blockRefId >= 0 ) {

            // store last BTI block data in reference entry
            const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
            refEntry.Blocks.push_back(block);

            // write last reference entry, then clear
            WriteReferenceEntry(refEntry);
            ClearReferenceEntry(refEntry);

            // then write any empty references remaining at end of file
            for ( int i = blockRefId+1; i < numReferences; ++i )
                WriteReferenceEntry( BtiReferenceEntry(i) );
        }

    } catch ( BamException& e ) {
        m_errorString = e.what();
        return false;
    }

    // rewind BamReader
    if ( !m_reader->Rewind() ) {
        const string readerError = m_reader->GetErrorString();
        const string message = "could not create index: \n\t" + readerError;
        SetErrorString("BamToolsIndex::Create", message);
        return false;
    }

    // return success
    return true;
}
Exemple #20
0
// Accumulates genome coverage from the mapped alignments of a BAM file and
// reports it chromosome by chromosome.
//
// bamFile: path handed to BamReader::Open; the process exits with status 1 if
//          the file cannot be opened.
//
// How an alignment contributes depends on the member flags, tested in this
// order: _pair_chip_ (span covering the mate pair), _haveSize (fixed
// _fragmentSize extension), full blocked coverage, 5' end only, 3' end only.
void BedGenomeCoverage::CoverageBam(string bamFile) {

    ResetChromCoverage();

    BamReader reader;
    const bool opened = reader.Open(bamFile);
    if (!opened) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // header text and reference list of the input
    string header = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // the BAM references serve as the BEDTools "genome file"
    _genome = new GenomeFile(refs);

    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {

        // unaligned reads carry no coverage
        if (!bam.IsMapped())
            continue;

        // strand used for filtering; flipped for second mates in dUTP mode
        bool effectiveReverse = bam.IsReverseStrand();
        if (_dUTP && bam.IsPaired() && bam.IsMateMapped() && bam.IsSecondMate())
            effectiveReverse = !effectiveReverse;

        // drop reads on the strand the user did not ask for
        if (_filterByStrand == true) {
            const bool wantReverse = (_requestedStrand == "-");
            if (wantReverse != effectiveReverse)
                continue;
        }

        // chrom name plus first and last covered position of the alignment
        string chrom(refs.at(bam.RefID).RefName);
        CHRPOS start = bam.Position;
        CHRPOS end = bam.GetEndPosition(false, false) - 1;

        // switch accumulators when the chromosome changes
        if (chrom != _currChromName)
            StartNewChrom(chrom);

        if (_pair_chip_) {
            // paired reads must be proper pairs with a mapped mate
            if (bam.IsPaired() && !(bam.IsProperPair() && bam.IsMateMapped()))
                continue;

            // expected layout: left read on the forward strand, right read on
            // the reverse strand; anything else is skipped
            const bool leftReadIsReverse  = (bam.Position < bam.MatePosition) && bam.IsReverseStrand();
            const bool leftMateIsReverse  = (bam.MatePosition < bam.Position) && bam.IsMateReverseStrand();
            if (leftReadIsReverse || leftMateIsReverse)
                continue;

            // only the first mate contributes, spanning over to its mate
            if (bam.IsFirstMate()) {
                if (bam.IsReverseStrand()) {
                    // mate lies to the left
                    AddCoverage(bam.MatePosition, end);
                }
                else if (bam.IsMateReverseStrand()) {
                    // mate lies to the right
                    AddCoverage(start, start + abs(bam.InsertSize) - 1);
                }
            }
        }
        else if (_haveSize) {
            if (!bam.IsReverseStrand()) {
                AddCoverage(start, start + _fragmentSize - 1);
            }
            else if (end < _fragmentSize) {
                // fragment would run past the chromosome start; clamp to 0
                AddCoverage(0, end);
            }
            else {
                AddCoverage(end + 1 - _fragmentSize, end);
            }
        }
        else if (!_only_5p_end && !_only_3p_end) {
            // blocks are always split at "D" CIGAR ops; "N" ops split them
            // only when the user asked for split handling
            const bool breakOnSkips = _obeySplits ? true : false;
            bedVector bedBlocks;
            GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, breakOnSkips);
            AddBlockedCoverage(bedBlocks);
        }
        else if (_only_5p_end) {
            CHRPOS pos = bam.IsReverseStrand() ? end : start;
            AddCoverage(pos, pos);
        }
        else if (_only_3p_end) {
            CHRPOS pos = bam.IsReverseStrand() ? start : end;
            AddCoverage(pos, pos);
        }
    }

    reader.Close();

    // flush the chromosome that was still being accumulated
    ReportChromCoverage(_currChromCoverage, _currChromSize,
            _currChromName, _currChromDepthHist);

    // chromosomes that never received an alignment
    PrintEmptyChromosomes();

    // genome-wide summary, if requested
    PrintFinalCoverage();
}
// Calls SNPs for one block of candidate alleles.
//
// Steps performed, in order:
//   1. fetch the reference sequence for [leftPosition, rightPosition] widened
//      by snpCallSettings.m_flankingSize on each side;
//   2. collect the reads in that window that pass the alignment, length,
//      mapping-quality and identity filters, clipped to the window;
//   3. build and prune a consensus DAG from the reference and those reads and
//      take its top-ranked non-reference paths;
//   4. keep paths that carry at least one candidate allele (and no other
//      mismatch vertex) as haplotypes, next to the reference haplotype;
//   5. enumerate genotypes, compute priors / likelihoods / posteriors and pick
//      the non-reference genotype with the largest posterior;
//   6. append one GenericVariant per variant vertex of that genotype to
//      variantResults when its quality passes m_variantQualityFilter.
//
// Parameters:
//   fastaObj          reference sequence accessor
//   bamObj            BAM reader; it is rewound and its region is reset here
//   chrID             reference id of the block
//   leftPosition      first position of the block
//   rightPosition     last position of the block
//   probAligner       aligner handed to the haplotype likelihood computation
//   allelesInBlock    candidate alleles located in the block
//   snpCallSettings   calling parameters (flank size, ploidy, topK, ...)
//   variantResults    output; accepted variants are appended
//
// Returns early without touching variantResults when no variant haplotype is
// found, or when no non-reference genotype has a posterior above zero.
void GenericIndividualSnpCall::PyroHMMsnp(Fasta &fastaObj, BamReader &bamObj, int chrID, int leftPosition, int rightPosition, GenericProbabilisticAlignment &probAligner, list<Allele>& allelesInBlock, VariantCallSetting& snpCallSettings, vector<GenericVariant> &variantResults)
{
    VariantCallSetting settingForPyroHMMsnp = snpCallSettings;

    // allele pool: random-access copy of the candidate alleles
    vector<Allele> allelePool(allelesInBlock.begin(), allelesInBlock.end());

    // add the configured flanking segment at each side
    int windowLeftPosition  = leftPosition  - snpCallSettings.m_flankingSize;
    int windowRightPosition = rightPosition + snpCallSettings.m_flankingSize;

    // genome
    string genome;
    fastaObj.GetSequence(chrID, windowLeftPosition, windowRightPosition, genome);

    // window-wide read statistics; they must start at zero because they are
    // only ever accumulated below
    int    globalDepth     = 0;
    double globalMapQual   = 0;
    int    globalStrandPos = 0;
    int    globalStrandNeg = 0;

    vector<PyroHMMsnp_Sequence_t> readsInWindow;

    // rewind BAM reader
    bamObj.Rewind();
    // set BAM region
    bamObj.SetRegion(chrID, windowLeftPosition, chrID, windowRightPosition);
    // read alignment
    BamAlignment al;
    while (bamObj.GetNextAlignment(al))
    {
        // skip if it is not a good alignment
        if (!GenericBamAlignmentTools::goodAlignment(al))
            continue;

        // skip if it is not valid at length
        if (!GenericBamAlignmentTools::validReadLength(al, m_minReadLength))
            continue;

        // skip if it is not valid at map quality
        if (!GenericBamAlignmentTools::validMapQuality(al, m_minMapQuality))
            continue;

        // skip if it is not valid at alignment identity
        if (!GenericBamAlignmentTools::validReadIdentity(al, m_maxMismatchFrac))
            continue;

        // global info
        globalDepth   += 1;
        globalMapQual += al.MapQuality*al.MapQuality;
        if (al.IsReverseStrand())
            globalStrandNeg += 1;
        else
            globalStrandPos += 1;

        // get local alignment
        string t_localRead, t_localGenome;
        Cigar  t_cigar;
        BamMD  t_md;
        int    t_numMismatch, t_numInDel;
        GenericBamAlignmentTools::getLocalAlignment(al, windowLeftPosition, windowRightPosition-windowLeftPosition,
                                                    t_localRead, t_localGenome, t_cigar, t_md,
                                                    t_numMismatch, t_numInDel);

        if (t_localRead.empty() || t_localGenome.empty())
            continue;

        // save into set
        PyroHMMsnp_Sequence_t t_seq;
        t_seq.t_ID           = GenericBamAlignmentTools::getBamAlignmentID(al);
        t_seq.t_sequence     = t_localRead;
        t_seq.t_cigar        = t_cigar;
        t_seq.t_md           = t_md;
        t_seq.t_numMismatch  = t_numMismatch;
        t_seq.t_numInDel     = t_numInDel;
        t_seq.t_mapQualScore = al.MapQuality;

        // offset of the read start from the window start (0 if it starts
        // at or before the window)
        if (al.Position>windowLeftPosition)
            t_seq.t_startPositionShift = al.Position-windowLeftPosition;
        else
            t_seq.t_startPositionShift = 0;

        // offset of the read end from the window end (0 if it reaches or
        // passes the window end)
        const int alignmentEnd = al.GetEndPosition();
        if (alignmentEnd<windowRightPosition)
            t_seq.t_endPositionShift = windowRightPosition-alignmentEnd;
        else
            t_seq.t_endPositionShift = 0;

        readsInWindow.push_back(t_seq);
    }

    int numData = readsInWindow.size();

    // construct the consensus sequence graph
    GenericDagGraph consensusGraph;
    vector<string>  consensusGraphReads;
    vector<Cigar>   consensusGraphReadCigars;
    vector<int>     consensusGraphReadStarts;

    // set of aligned reads to construct the graph
    for (int i=0; i<numData; ++i)
    {
        consensusGraphReads.push_back(readsInWindow[i].t_sequence);
        consensusGraphReadCigars.push_back(readsInWindow[i].t_cigar);
        consensusGraphReadStarts.push_back(readsInWindow[i].t_startPositionShift);
    }

    // build up the graph
    consensusGraph.buildDagGraph(genome, consensusGraphReads, consensusGraphReadCigars, consensusGraphReadStarts);
    consensusGraph.edgePruning(snpCallSettings.m_graphPruneLevel);

    // search topK paths, excluding reference
    vector<string>       topRankConsensusGraphPaths;
    vector<list<Vertex>> topRankConsensusGraphPathVertexs;
    vector<double>       topRankConsensusGraphPathWeights;
    consensusGraph.topRankPathsExcludeGenome(30, topRankConsensusGraphPaths, topRankConsensusGraphPathVertexs, topRankConsensusGraphPathWeights);

    // change vertex list to vertex set
    vector<set<Vertex>>  topRankConsensusGraphPathVertexSet;
    const int numPathVertexLists = static_cast<int>(topRankConsensusGraphPathVertexs.size());
    for (int i=0; i<numPathVertexLists; i++)
    {
        const list<Vertex>& pathVertexList = topRankConsensusGraphPathVertexs[i];
        topRankConsensusGraphPathVertexSet.push_back(set<Vertex>(pathVertexList.begin(), pathVertexList.end()));
    }

    // get variant vertices: allele positions are made relative to the window
    vector<int>    allelePositions;
    vector<string> alleleChars;
    for (list<Allele>::iterator alleleIter=allelesInBlock.begin(); alleleIter!=allelesInBlock.end(); ++alleleIter)
    {
        allelePositions.push_back(alleleIter->m_chrPosition-windowLeftPosition);
        alleleChars.push_back(alleleIter->m_allele);
    }

    // map allele to graph vertex: a mismatch vertex matches an allele when
    // both the window-relative position and the label agree
    set<Vertex> variantVertexs;
    map<int,Vertex> mapAlleleToVertex;
    map<Vertex,int> mapVertexToAllele;
    const int numAlleles = static_cast<int>(allelePool.size());
    for (int v=0; v<consensusGraph.m_numVertexs; v++)
    {
        if (consensusGraph.m_skip[v])
            continue;

        if (!consensusGraph.m_isMismatch[v])
            continue;

        int gp = consensusGraph.m_genomePosition[v] - 1;

        for (int j=0; j<numAlleles; j++)
        {
            if (allelePositions[j]!=gp)
                continue;

            if (alleleChars[j]==consensusGraph.m_labels[v])
            {
                variantVertexs.insert(v);
                mapAlleleToVertex[j] = v;
                mapVertexToAllele[v] = j;
            }
        }
    }

    // set up the haplotypes; index 0 is always the reference
    vector<string> haplotypes;
    vector<int>    haplotypeToPathIndex;
    vector<set<Vertex>> haplotypeVariantVertexs;

    haplotypes.push_back(genome);
    haplotypeToPathIndex.push_back(-1);
    haplotypeVariantVertexs.push_back(set<Vertex>());

    int kk = 0;
    const int numPaths = static_cast<int>(topRankConsensusGraphPaths.size());
    for (int i=0; i<numPaths; i++)
    {
        // enough variant haplotypes collected
        if (kk>=snpCallSettings.m_topK)
            break;

        // discard paths whose length differs from the reference by more than 5
        int deltaLength = static_cast<int>(topRankConsensusGraphPaths[i].length())
                        - static_cast<int>(genome.length());
        deltaLength = abs(deltaLength);

        if (deltaLength>5)
            continue;

        // candidate-allele vertices that lie on this path
        const set<Vertex>& pathVertexs = topRankConsensusGraphPathVertexSet[i];
        set<Vertex> pathVariantVertexs;
        for (set<Vertex>::iterator variantIter=variantVertexs.begin(); variantIter!=variantVertexs.end(); ++variantIter)
        {
            if (pathVertexs.find(*variantIter)!=pathVertexs.end())
                pathVariantVertexs.insert(*variantIter);
        }
        const bool hasVariantVertex = !pathVariantVertexs.empty();

        // all mismatch vertices on this path
        int totalNumberVariantVertexInPath = 0;
        for (set<Vertex>::const_iterator vertexIter=pathVertexs.begin(); vertexIter!=pathVertexs.end(); ++vertexIter)
        {
            int v = *vertexIter;
            if (consensusGraph.m_isMismatch[v])
                totalNumberVariantVertexInPath += 1;
        }

        // accept the path only if every mismatch on it is a candidate allele
        if (hasVariantVertex && totalNumberVariantVertexInPath<=static_cast<int>(pathVariantVertexs.size()))
        {
            haplotypes.push_back(topRankConsensusGraphPaths[i]);
            haplotypeToPathIndex.push_back(i);
            haplotypeVariantVertexs.push_back(pathVariantVertexs);

            kk++;
        }
    }

    int numHaplotypes = haplotypes.size();

    // skip if there is no variant haplotype
    if (numHaplotypes==1)
    {
        return;
    }

    // compute haplotype data likelihood
    vector<vector<long double>> haplotypeDataLikelihoods(numHaplotypes);
    PyroHMMsnpHaplotypeDataLikelihood(probAligner, snpCallSettings.m_band, numHaplotypes, haplotypes, readsInWindow, haplotypeDataLikelihoods);

    // genotype
    vector<vector<int>> genotypes;
    set<set<int>> genotypeDiscovered;
    for (int i=0; i<numHaplotypes; i++)
    {
        vector<int> precedeHaplotypes;
        PyroHMMsnpGenotypeSet(snpCallSettings.m_ploidy, i, numHaplotypes, precedeHaplotypes, genotypes, genotypeDiscovered);
    }

    int numGenotypes = genotypes.size();

    // genotype variant vertex: union of the variant vertices of its haplotypes
    vector<set<Vertex>> genotypeVariantVertexs;
    for (int i=0; i<numGenotypes; i++)
    {
        set<Vertex> variantVertexInGenotype;
        for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++)
        {
            int haplotype = genotypes[i][j];
            const set<Vertex>& variantVertexInHaplotype = haplotypeVariantVertexs[haplotype];
            variantVertexInGenotype.insert(variantVertexInHaplotype.begin(), variantVertexInHaplotype.end());
        }
        genotypeVariantVertexs.push_back(variantVertexInGenotype);
    }

    // genotype priors
    vector<long double> genotypePriors(numGenotypes);
    PyroHMMsnpGenotypePrior(numGenotypes, genotypes, settingForPyroHMMsnp, genotypePriors);

    // genotype likelihoods
    vector<long double> genotypeLikelihoods(numGenotypes);
    PyroHMMsnpGenotypeLikelihood(numGenotypes, genotypes, readsInWindow.size(), haplotypeDataLikelihoods, snpCallSettings, genotypeLikelihoods);

    // genotype posteriors
    vector<long double> genotypePosteriors(numGenotypes);
    PyroHMMsnpGenotypePosterior(numGenotypes, genotypePriors, genotypeLikelihoods, genotypePosteriors);

    // search maximal genotype posterior; genotype 0 is skipped on purpose
    // (its posterior is reported below as the reference score)
    long double maxGenotypePosterior = 0;
    int inferGenotype = -1;
    for (int i=1; i<numGenotypes; i++)
    {
        if (maxGenotypePosterior<genotypePosteriors[i])
        {
            maxGenotypePosterior = genotypePosteriors[i];
            inferGenotype = i;
        }
    }

    // no non-reference genotype has a positive posterior: nothing to report
    if (inferGenotype<0)
    {
        return;
    }

    // all variant vertexs in the inferred genotype
    const set<Vertex>& inferGenotypeVariantVertexs = genotypeVariantVertexs[inferGenotype];

    // count haploid type of variant: per haplotype of the inferred genotype,
    // 1 if it carries the variant vertex and 0 otherwise
    map<Vertex,vector<int>> inferGenotypeVariantHaploidType;
    set<Vertex>::const_iterator inferVariantIter = inferGenotypeVariantVertexs.begin();
    for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); ++inferVariantIter)
    {
        int v = *inferVariantIter;
        vector<int> variantHaploidType;
        for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++)
        {
            int haplotype = genotypes[inferGenotype][j];
            const set<Vertex>& variantVertexInHaplotype = haplotypeVariantVertexs[haplotype];
            if (variantVertexInHaplotype.find(v)==variantVertexInHaplotype.end())
                variantHaploidType.push_back(0);
            else
                variantHaploidType.push_back(1);
        }
        inferGenotypeVariantHaploidType[v] = variantHaploidType;
    }

    // variant score: summed posterior of every genotype containing the vertex
    map<Vertex,long double> inferGenotypeVariantScore;
    inferVariantIter = inferGenotypeVariantVertexs.begin();
    for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); ++inferVariantIter)
    {
        int v = *inferVariantIter;
        long double variantScore = 0;
        for (int i=0; i<numGenotypes; i++)
        {
            const set<Vertex>& variantVertexInGenotype = genotypeVariantVertexs[i];
            if (variantVertexInGenotype.find(v)!=variantVertexInGenotype.end())
                variantScore += genotypePosteriors[i];
        }

        inferGenotypeVariantScore[v] = variantScore;
    }

    // save variant result
    inferVariantIter = inferGenotypeVariantVertexs.begin();
    for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); ++inferVariantIter)
    {
        GenericVariant result;

        int v = *inferVariantIter;
        int a = mapVertexToAllele[v];

        // the variant's coordinates are those of its candidate allele; taking
        // them up front keeps them defined on every path through the loop
        int variantChrID  = allelePool[a].m_chrID;
        int variantChrPos = allelePool[a].m_chrPosition;

        const vector<int>& haploidType = inferGenotypeVariantHaploidType[v];
        for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++)
        {
            if (haploidType[j]==0)
            {
                // haplotype without the variant: use the label stored at the
                // index given by the vertex's genome position
                int g = consensusGraph.m_genomePosition[v];

                Allele allele;
                allele.m_allele = consensusGraph.m_labels[g];
                result.m_alleles.push_back(allele);
            }else
            {
                Allele allele = allelePool[a];
                result.m_alleles.push_back(allele);

                variantChrID  = allele.m_chrID;
                variantChrPos = allele.m_chrPosition;
            }
        }

        result.m_chrID           = variantChrID;
        result.m_chrPosition     = variantChrPos;
        result.m_probScoreRef    = genotypePosteriors[0];
        result.m_probScoreVar    = genotypePosteriors[inferGenotype];
        result.m_variantType     = VARIANT_SNP;

        // phred-style quality from the variant score, capped at 3000 when the
        // score is numerically 1 and set to 0 when it is numerically 0
        long double variantScore = inferGenotypeVariantScore[v];
        if (fabs(1-variantScore)<1e-300)
            result.m_quality     = 3000;
        else if (variantScore<1e-300)
            result.m_quality     = 0;
        else
            result.m_quality     = -10*log10(1-variantScore);

        char refBase;
        fastaObj.GetBase(result.m_chrID, result.m_chrPosition, refBase);
        result.m_reference       = refBase;

        // 0 = allele equals the reference base, 1 = it differs
        const int numResultAlleles = static_cast<int>(result.m_alleles.size());
        for (int i=0; i<numResultAlleles; i++)
        {
            if (result.m_alleles[i].m_allele==result.m_reference)
                result.m_haploidType.push_back(0);
            else
                result.m_haploidType.push_back(1);
        }

        // filter
        if (result.m_quality>=snpCallSettings.m_variantQualityFilter)
            variantResults.push_back(result);
    }

}
Exemple #22
0
// Streams every alignment of a BAM file and tests it against the windows
// built from the "B" BED file.
//
// For each mapped alignment a BED record is built (chrom, start, end, name
// with a /1 or /2 mate suffix, mapping quality as score, strand). Then:
//   - with _bamOutput set, the alignment is written to a BAM stream on stdout
//     when it has a window overlap and _noHit is false, or when it has no
//     overlap and _noHit is true;
//   - otherwise the record is passed to FindWindowOverlaps.
// Unmapped alignments are written only when both _bamOutput and _noHit are
// set.
//
// bamFile: path of the BAM file to read.
void BedWindow::WindowIntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);

    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs    = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;                   // vector of potential hits
    // reserve some space
    hits.reserve(100);

    _bedA->bedType = 6;
    BamAlignment bam;
    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        if (!bam.IsMapped()) {
            // An unmapped read can never overlap a window, so it only counts
            // as a "no hit". The writer is opened only when _bamOutput is
            // set, so it must not be used otherwise.
            if (_bamOutput == true && _noHit == true) {
                writer.SaveAlignment(bam);
            }
            continue;
        }

        BED a;
        a.chrom = refs.at(bam.RefID).RefName;
        a.start = bam.Position;
        a.end   = bam.GetEndPosition(false, false);

        // build the name field from the BAM alignment.
        a.name = bam.Name;
        if (bam.IsFirstMate()) a.name += "/1";
        if (bam.IsSecondMate()) a.name += "/2";

        a.score  = ToString(bam.MapQuality);
        a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-";

        if (_bamOutput == true) {
            // keep hits when _noHit is false, keep misses when it is true
            bool overlapsFound = FindOneOrMoreWindowOverlaps(a);
            bool wantMisses    = (_noHit == true);
            if (overlapsFound != wantMisses) {
                writer.SaveAlignment(bam);
            }
        }
        else {
            FindWindowOverlaps(a, hits);
            hits.clear();
        }
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}