Exemple #1
0
// Searches the reference around the variant position for a multi-nucleotide repeat (MNR)
// of period rep_period and stores its span in start_window (inclusive) / end_window (exclusive).
// Principle: a window as long as the repeat period is slid over the reference. As long as we
// stay inside the repeat, the base entering the window equals the base leaving the window.
// Example with period 2: XYZACACA|CA|CACAIJK
// Returns true if the repeat stretch found spans at least 3*rep_period bases, false otherwise.
// Note that start_window / end_window may already have been modified when false is returned.
bool AlleleIdentity::IdentifyMultiNucRepeatSection(const LocalReferenceContext &seq_context, unsigned int rep_period,
    const ReferenceReader &ref_reader, int chr_idx) {

  const unsigned int first_variant_base = seq_context.position0 + left_anchor;
  if (first_variant_base + rep_period >= (unsigned long)ref_reader.chr_size(chr_idx))
    return false;

  // Seed the window with one repeat period of reference bases starting at the variant
  CircluarBuffer<char> window(rep_period);
  for (unsigned int offset = 0; offset < rep_period; ++offset)
    window.assign(offset, ref_reader.base(chr_idx, first_variant_base + offset));

  // Extend to the left: find the (inclusive) start position of the MNR region
  start_window = first_variant_base - 1; // 1 anchor base
  window.shiftLeft(1);
  while (start_window > 0) {
    if (window.first() != ref_reader.base(chr_idx, start_window))
      break;
    start_window--;
    window.shiftLeft(1);
  }

  // Extend to the right: find the (exclusive) end position of the MNR region
  end_window = first_variant_base + rep_period;
  if (end_window >= ref_reader.chr_size(chr_idx))
    return false;

  // Re-seed the window, the left extension has rotated it
  for (unsigned int offset = 0; offset < rep_period; ++offset)
    window.assign(offset, ref_reader.base(chr_idx, first_variant_base + offset));
  window.shiftRight(1);
  while (end_window < ref_reader.chr_size(chr_idx)) {
    if (window.last() != ref_reader.base(chr_idx, end_window))
      break;
    end_window++;
    window.shiftRight(1);
  }

  // A stretch shorter than three repeat periods does not count as an MNR
  if ((end_window - start_window) < (3*(int)rep_period))
    return false;

  // Widen the window where it does not lie fully outside of the variant allele
  if (start_window >= seq_context.position0)
    start_window = seq_context.my_hp_start_pos[0] - 1;
  if (end_window <= seq_context.right_hp_start) {
    end_window = status.isInsertion
        ? (seq_context.right_hp_start + seq_context.right_hp_length + 1)
        : (seq_context.right_hp_start + 1);
  }

  // Keep the window within the chromosome
  if (start_window < 0)
    start_window = 0;
  if (end_window > ref_reader.chr_size(chr_idx))
    end_window = ref_reader.chr_size(chr_idx);

  return true;
}
Exemple #2
0
// Entry point for variant classification of one alternative allele.
// Stores the allele, characterizes the variant and predicts its sequence motif error
// probabilities. Returns true if the allele is a usable candidate; otherwise the allele is
// flagged as problematic, the filter reason "BADCANDIDATE" is recorded and false is returned.
bool AlleleIdentity::getVariantType(
  const string _altAllele,
  const LocalReferenceContext &reference_context,
  const TIonMotifSet & ErrorMotifs,
  const ClassifyFilters &filter_variant,
  const ReferenceReader &ref_reader,
  int chr_idx) {

  altAllele = _altAllele;

  // The alternative allele must not reach beyond the end of the chromosome
  const bool allele_fits =
      (reference_context.position0 + (long)altAllele.length()) <= ref_reader.chr_size(chr_idx);
  bool valid_candidate = reference_context.context_detected and allele_fits;

  // From here on the variant position is guaranteed to be valid
  if (valid_candidate) {
    valid_candidate = CharacterizeVariantStatus(reference_context, ref_reader, chr_idx);
    PredictSequenceMotifSSE(reference_context, ErrorMotifs, ref_reader, chr_idx);
    if (valid_candidate)
      valid_candidate = CheckValidAltAllele(reference_context);
  }

  if (valid_candidate)
    return true;

  status.isProblematicAllele = true;
  filterReasons.push_back("BADCANDIDATE");
  return false;
}
Exemple #3
0
// Identifies a special sequence motive in the vicinity of an insertion:
// a homopolymer of the given base with a length of at least 3 that lies within
// a few homopolymer boundaries to the left or to the right of the given position.
// The result is stored in status.isDyslexic and also returned.
bool AlleleIdentity::IdentifyDyslexicMotive(char base, int position,
    const ReferenceReader &ref_reader, int chr_idx) {

  // Number of homopolymer boundaries we are willing to cross on either side
  const unsigned int max_boundaries = 4;

  status.isDyslexic = false;

  // Scan the left vicinity, walking towards the start of the chromosome
  unsigned int boundaries_crossed = 0;
  unsigned int same_base_pairs = 0;
  for (long pos = position-2; pos > 0 and boundaries_crossed < max_boundaries; --pos) {
    if (ref_reader.base(chr_idx,pos) != ref_reader.base(chr_idx,pos-1)) {
      // A new homopolymer starts here
      ++boundaries_crossed;
      same_base_pairs = 0;
    }
    else if (ref_reader.base(chr_idx,pos) == base) {
      // Two matching adjacent pairs correspond to a 3-mer or longer
      if (++same_base_pairs >= 2) {
        status.isDyslexic = true;
        return true;
      }
    }
  }

  // Scan the right vicinity, walking towards the end of the chromosome
  boundaries_crossed = 0;
  same_base_pairs = 0;
  for (long pos = position+1; pos < ref_reader.chr_size(chr_idx) and boundaries_crossed < max_boundaries; ++pos) {
    if (ref_reader.base(chr_idx,pos) != ref_reader.base(chr_idx,pos-1)) {
      ++boundaries_crossed;
      same_base_pairs = 0;
    }
    else if (ref_reader.base(chr_idx,pos) == base) {
      if (++same_base_pairs >= 2) {
        status.isDyslexic = true;
        return true;
      }
    }
  }

  return false;
}
Exemple #4
0
void AlleleIdentity::PredictSequenceMotifSSE(const LocalReferenceContext &reference_context,
                             const TIonMotifSet & ErrorMotifs,
                             const ReferenceReader &ref_reader, int chr_idx) {

  //cout << "Hello from PredictSequenceMotifSSE" << endl;
  sse_prob_positive_strand = 0;
  sse_prob_negative_strand = 0;
  //long vcf_position = reference_context.position0+1;
  long var_position = reference_context.position0 + left_anchor; // This points to the first deleted base

  string seqContext;
  // status.isHPIndel && status.isDeletion implies reference_context.my_hp_length.at(left_anchor) > 1
  if (status.isHPIndel && status.isDeletion) {

    // cout << start_pos << "\t" << variant_context.refBaseAtCandidatePosition << variant_context.ref_hp_length << "\t" << variant_context.refBaseLeft << variant_context.left_hp_length << "\t" << variant_context.refBaseRight  << variant_context.right_hp_length << "\t";

    unsigned context_left = var_position >= 10 ? 10 : var_position;
    //if (var_position + reference_context.my_hp_length.at(left_anchor) + 10 < ref_reader.chr_size(chr_idx))
      seqContext = ref_reader.substr(chr_idx, var_position - context_left, context_left + (unsigned int)reference_context.my_hp_length[left_anchor] + 10);
    //  else
    //  seqContext = ref_reader.substr(chr_idx, var_position - context_left);

    if (seqContext.length() > 0 && context_left < seqContext.length()) {
      sse_prob_positive_strand = ErrorMotifs.get_sse_probability(seqContext, context_left);

       //cout << seqContext << "\t" << context_left << "\t" << sse_prob_positive_strand << "\t";

      context_left = seqContext.length() - context_left - 1;
      string reverse_seqContext;
      ReverseComplement(seqContext, reverse_seqContext);

      sse_prob_negative_strand = ErrorMotifs.get_sse_probability(reverse_seqContext, context_left);

     // cout << reverse_seqContext << "\t" << context_left << "\t" << sse_prob_negative_strand << "\t";

    }
  }
}
Exemple #5
0
// Opens the input BAM file(s) and prepares everything that is derived from their headers.
//   1. Configures the multi reader to merge by coordinate, opens all BAM files and locates
//      their index files.
//   2. Retrieves the combined header and requires it to contain read groups.
//   3. If more than one BAM file is given, appends every comment line of the individual
//      files that is not yet part of the combined header.
//   4. Verifies that the reference sequences of the BAM file(s) match the fasta reference
//      in number, name and length.
//   5. Collects the version strings of the header program entries whose ID starts with
//      "bc" (BaseCaller) or "tmap" (TMAP) into comma separated lists.
// ref_reader    : reference the BAM files are validated against
// bam_filenames : BAM files to be opened
// Any failure to open a file or index, a missing read group, or a reference mismatch prints
// a message to cerr and terminates the program with exit(1).
void BAMWalkerEngine::InitializeBAMs(const ReferenceReader& ref_reader, const vector<string>& bam_filenames)
{
  if (not bam_reader_.SetExplicitMergeOrder(BamMultiReader::MergeByCoordinate)) {
    cerr << "ERROR: Could not set merge order to BamMultiReader::MergeByCoordinate" << endl;
    exit(1);
  }

  if (not bam_reader_.Open(bam_filenames)) {
    cerr << "ERROR: Could not open input BAM file(s) : " << bam_reader_.GetErrorString() << endl;
    exit(1);
  }
  if (not bam_reader_.LocateIndexes()) {
    cerr << "ERROR: Could not open BAM index file(s) : " << bam_reader_.GetErrorString() << endl;
    exit(1);
  }

  // BAM multi reader combines the read group information of the different BAMs but does not merge comment sections
  bam_header_ = bam_reader_.GetHeader();
  if (!bam_header_.HasReadGroups()) {
    cerr << "ERROR: there is no read group in BAM files specified" << endl;
    exit(1);
  }

  // Manually merge comment sections of BAM files if we have more than one BAM file
  if (bam_filenames.size() > 1) {

    unsigned int num_duplicates = 0;
    unsigned int num_merged = 0;

    for (unsigned int bam_idx = 0; bam_idx < bam_filenames.size(); bam_idx++) {

      // Each file is opened on its own to get access to its individual header
      BamReader reader;
      if (not reader.Open(bam_filenames.at(bam_idx))) {
        cerr << "TVC ERROR: Failed to open input BAM file " << reader.GetErrorString() << endl;
        exit(1);
      }
      SamHeader header = reader.GetHeader();

      for (unsigned int i_co = 0; i_co < header.Comments.size(); i_co++) {

        // Step 1: Check if this comment is already part of the merged header
        unsigned int m_co = 0;
        while (m_co < bam_header_.Comments.size() and bam_header_.Comments.at(m_co) != header.Comments.at(i_co))
          m_co++;

        if (m_co < bam_header_.Comments.size()) {
          num_duplicates++;
          continue;
        }

        // Step 2: Add comment line to merged header if it is a new one
        num_merged++;
        bam_header_.Comments.push_back(header.Comments.at(i_co));
      }
    }
    // Verbose what we did
    cout << "Merged " << num_merged << " unique comment lines into combined BAM header. Encountered " << num_duplicates << " duplicate comments." << endl;
  }

  //
  // Reference sequences in the bam file must match that in the fasta file
  //

  vector<RefData> referenceSequences = bam_reader_.GetReferenceData();

  if ((int)referenceSequences.size() != ref_reader.chr_count()) {
    cerr << "ERROR: Reference in BAM file does not match fasta file" << endl
         << "       BAM has " << referenceSequences.size()
         << " chromosomes while fasta has " << ref_reader.chr_count() << endl;
    exit(1);
  }

  for (int chr_idx = 0; chr_idx < ref_reader.chr_count(); ++chr_idx) {
    if (referenceSequences[chr_idx].RefName != ref_reader.chr_str(chr_idx)) {
      // Note the leading blank in " in BAM is ", without it the number and the text run together
      cerr << "ERROR: Reference in BAM file does not match fasta file" << endl
           << "       Chromosome #" << (chr_idx+1) << " in BAM is " << referenceSequences[chr_idx].RefName
           << " while fasta has " << ref_reader.chr_str(chr_idx) << endl;
      exit(1);
    }
    if (referenceSequences[chr_idx].RefLength != ref_reader.chr_size(chr_idx)) {
      // Note the leading blank in " in BAM has length ", without it name and text run together
      cerr << "ERROR: Reference in BAM file does not match fasta file" << endl
           << "       Chromosome " << referenceSequences[chr_idx].RefName
           << " in BAM has length " << referenceSequences[chr_idx].RefLength
           << " while fasta has " << ref_reader.chr_size(chr_idx) << endl;
      exit(1);
    }
  }


  //
  // Retrieve BaseCaller and TMAP version strings from BAM header
  //

  // Sets are used so that every distinct version is listed only once
  set<string> basecaller_versions;
  set<string> tmap_versions;
  for (SamProgramIterator I = bam_header_.Programs.Begin(); I != bam_header_.Programs.End(); ++I) {
    if (I->ID.substr(0,2) == "bc")
      basecaller_versions.insert(I->Version);
    if (I->ID.substr(0,4) == "tmap")
      tmap_versions.insert(I->Version);
  }
  basecaller_version_ = "";
  for (set<string>::const_iterator I = basecaller_versions.begin(); I != basecaller_versions.end(); ++I) {
    if (not basecaller_version_.empty())
      basecaller_version_ += ", ";
    basecaller_version_ += *I;
  }
  tmap_version_ = "";
  for (set<string>::const_iterator I = tmap_versions.begin(); I != tmap_versions.end(); ++I) {
    if (not tmap_version_.empty())
      tmap_version_ += ", ";
    tmap_version_ += *I;
  }

}
Exemple #6
0
void TargetsManager::Initialize(const ReferenceReader& ref_reader, const string& _targets, bool _trim_ampliseq_primers /*const ExtendParameters& parameters*/)
{

  //
  // Step 1. Retrieve raw target definitions
  //

  list<UnmergedTarget>  raw_targets;

  if (not _targets.empty()) {
    LoadRawTargets(ref_reader, _targets, raw_targets);

  } else {
    for (int chr = 0; chr < ref_reader.chr_count(); ++chr) {
      raw_targets.push_back(UnmergedTarget());
      UnmergedTarget& target = raw_targets.back();
      target.begin = 0;
      target.end = ref_reader.chr_size(chr);
      target.chr = chr;
    }
  }

  //
  // Step 2. Sort raw targets and transfer to the vector
  //

  int num_unmerged = raw_targets.size();
  vector<UnmergedTarget*> raw_sort;
  raw_sort.reserve(num_unmerged);
  for (list<UnmergedTarget>::iterator I = raw_targets.begin(); I != raw_targets.end(); ++I)
    raw_sort.push_back(&(*I));
  sort(raw_sort.begin(), raw_sort.end(), CompareTargets);

  unmerged.reserve(num_unmerged);
  bool already_sorted = true;
  list<UnmergedTarget>::iterator I = raw_targets.begin();
  for (int idx = 0; idx < num_unmerged; ++idx, ++I) {
    if (raw_sort[idx] != &(*I) and already_sorted) {
      already_sorted = false;
      cerr << "TargetsManager: BED not sorted at position " << idx;
      cerr << " replaced " << I->name << ":" << I->chr << ":" << I->begin << "-" << I->end;
      cerr << " with " << raw_sort[idx]->name << ":" << raw_sort[idx]->chr << ":" << raw_sort[idx]->begin << "-" << raw_sort[idx]->end << endl;
    }
    unmerged.push_back(*raw_sort[idx]);
  }



  //
  // Step 3. Merge targets and link merged/unmerged entries
  //

  merged.reserve(num_unmerged);
  bool already_merged = true;
  for (int idx = 0; idx < num_unmerged; ++idx) {
    if (idx and merged.back().chr == unmerged[idx].chr and merged.back().end >= unmerged[idx].begin) {
      merged.back().end = max(merged.back().end, unmerged[idx].end);
      already_merged = false;
    } else {
      merged.push_back(MergedTarget());
      merged.back().chr = unmerged[idx].chr;
      merged.back().begin = unmerged[idx].begin;
      merged.back().end = unmerged[idx].end;
      merged.back().first_unmerged = idx;
    }
    unmerged[idx].merged = merged.size();
  }

  if (_targets.empty()) {
    cout << "TargetsManager: No targets file specified, processing entire reference" << endl;

  } else  {
    cout << "TargetsManager: Loaded targets file " << _targets << endl;

    cout << "TargetsManager: " << num_unmerged << " target(s)";
    if (not already_merged)
      cout << " (" << merged.size() << " after merging)";
    cout << endl;
    if (not already_sorted)
      cout << "TargetsManager: Targets required sorting" << endl;

    trim_ampliseq_primers = _trim_ampliseq_primers;
    if (trim_ampliseq_primers)
      cout << "TargetsManager: Trimming of AmpliSeq primers is enabled" << endl;
  }


}
Exemple #7
0
// Reads target regions from a BED file and appends them to raw_targets.
//   ref_reader   : reference used to translate chromosome names into indices and to
//                  validate the coordinates against the chromosome sizes
//   bed_filename : name of the BED file to read
//   raw_targets  : list the parsed targets are appended to, in file order
// Lines starting with "track" and lines that are empty or contain only whitespace are skipped.
// Every other line needs at least the three fields chromosome, begin and end; an optional
// fourth field is used as region name unless it is ".".
// The program is terminated with exit(1) if the file cannot be opened, a line cannot be
// parsed, a chromosome name is unknown, the coordinates are out of bounds or inverted,
// or no target was loaded at all.
void TargetsManager::LoadRawTargets(const ReferenceReader& ref_reader, const string& bed_filename, list<UnmergedTarget>& raw_targets)
{
  FILE *bed_file = fopen(bed_filename.c_str(), "r");
  if (not bed_file) {
    cerr << "ERROR: Unable to open target file " << bed_filename << " : " << strerror(errno) << endl;
    exit(1);
  }

  // The name buffers are as large as the line buffer, so an unbounded %s cannot overflow them
  char line[4096];
  char chr_name[4096];
  int begin;
  int end;
  char region_name[4096];
  int line_number = 0;

  while (fgets(line, sizeof(line), bed_file)) {
    ++line_number;

    if (strncmp(line,"track",5) == 0) {
      // Parse track line if needed
      continue;
    }

    int num_fields = sscanf(line, "%s\t%d\t%d\t%s", chr_name, &begin, &end, region_name);

    // Skip blank lines. sscanf reports EOF (a negative value) rather than 0 when the input
    // ends before the first conversion, so both cases have to be covered here. Testing for
    // 0 only would turn every blank line into a fatal parse error below.
    if (num_fields <= 0)
      continue;
    if (num_fields < 3) {
      cerr << "ERROR: Failed to parse target file line " << line_number << endl;
      exit(1);
    }

    raw_targets.push_back(UnmergedTarget());
    UnmergedTarget& target = raw_targets.back();
    target.begin = begin;
    target.end = end;
    target.chr = ref_reader.chr_idx(chr_name);
    // region_name is only valid if a fourth field was converted
    if (num_fields > 3 and strcmp(region_name,".") != 0)
      target.name = region_name;

    if (target.chr < 0) {
      cerr << "ERROR: Target region " << target.name << " (" << chr_name << ":" << begin << "-" << end << ")"
           << " has unrecognized chromosome name" << endl;
      exit(1);
    }

    if (begin < 0 || end > ref_reader.chr_size(target.chr)) {
      cerr << "ERROR: Target region " << target.name << " (" << chr_name << ":" << begin << "-" << end << ")"
           << " is outside of reference sequence bounds ("
           << chr_name << ":0-" << ref_reader.chr_size(target.chr) << ")" << endl;
      exit(1);
    }
    if (end < begin) {
      cerr << "ERROR: Target region " << target.name << " (" << chr_name << ":" << begin << "-" << end << ")"
           << " has inverted coordinates" << endl;
      exit(1);
    }
    // Hands the raw line to AddExtraTrim together with the number of fields parsed above
    AddExtraTrim(target, line, num_fields);
  }

  fclose(bed_file);

  if (raw_targets.empty()) {
    cerr << "ERROR: No targets loaded from " << bed_filename
         << " after parsing " << line_number << " lines" << endl;
    exit(1);
  }
}
// Builds the read hypotheses for one read by walking along its alignment and splicing the
// alleles of the variant candidate into the read sequence.
//   my_hypotheses[0]   : null hypothesis, the read as called
//   my_hypotheses[1]   : reference hypothesis
//   my_hypotheses[2..] : one hypothesis per entry of my_ensemble.allele_identity_vector
// Further outputs:
//   splice_start_flow, splice_end_flow : reset to -1, then set through GetSpliceFlows if the
//                                        splicing got that far
//   same_as_null_hypothesis            : resized to the number of hypotheses, all false,
//                                        then handed to GetSpliceFlows
//   changed_alignment                  : false unless the local realignment returned a
//                                        non-empty alignment and flagged it as changed
// Returns true if the splicing was successful. If it was not, all hypotheses are replaced by
// the null hypothesis and false is returned.
// NOTE(review): '+' / '-' in the pretty alignment presumably denote insertion / deletion
// relative to the reference; the exact index handling lives in IncrementAlignmentIndices - confirm.
bool SpliceVariantHypotheses(const Alignment &current_read, const EnsembleEval &my_ensemble,
                        const LocalReferenceContext &local_context, PersistingThreadObjects &thread_objects,
                        int &splice_start_flow, int &splice_end_flow, vector<string> &my_hypotheses,
                        vector<bool> & same_as_null_hypothesis, bool & changed_alignment, const InputStructures &global_context,
                        const ReferenceReader &ref_reader, int chr_idx)
{

  // Hypotheses: 1) Null; read as called 2) Reference Hypothesis 3-?) Variant Hypotheses
  my_hypotheses.resize(my_ensemble.allele_identity_vector.size()+2);
  same_as_null_hypothesis.assign(my_hypotheses.size(), false);

  // Set up variables to log the flows we splice into
  splice_start_flow = -1;
  splice_end_flow = -1;
  int splice_start_idx = -1;
  vector<int> splice_end_idx;
  splice_end_idx.assign(my_hypotheses.size(), -1);

  // 1) Null hypothesis is read as called
  if (global_context.resolve_clipped_bases) {
    // Soft clipped bases on both ends are cut off
    unsigned int null_hyp_length = current_read.read_bases.length() - current_read.left_sc - current_read.right_sc;
    my_hypotheses[0] = current_read.read_bases.substr(current_read.start_sc, null_hyp_length);
  }
  else
    my_hypotheses[0] = current_read.read_bases;

  // Initialize hypotheses variables for splicing
  for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) {
    my_hypotheses[i_hyp].clear();
    my_hypotheses[i_hyp].reserve(current_read.alignment.QueryBases.length() + 20 + local_context.reference_allele.length());
    // Add soft clipped bases on the left side of alignment if desired
    if (!global_context.resolve_clipped_bases)
      my_hypotheses[i_hyp] += current_read.alignment.QueryBases.substr(0, current_read.left_sc);
  }

  // Running indices into the read (QueryBases) and the reference while walking the alignment
  int read_idx = current_read.left_sc;
  int ref_idx  = current_read.alignment.Position;
  int read_idx_max = current_read.alignment.QueryBases.length() - current_read.right_sc;
  bool did_splicing = false;
  bool just_did_splicing = false;
  string pretty_alignment;
  changed_alignment = false;

  // do realignment of a small region around variant if desired
  if (my_ensemble.doRealignment) {
    pretty_alignment = SpliceDoRealignement(thread_objects, current_read, local_context.position0,
                                            changed_alignment, global_context.DEBUG, ref_reader, chr_idx);
    if (pretty_alignment.empty() and global_context.DEBUG > 0)
      cerr << "Realignment returned an empty string in read " << current_read.alignment.Name << endl;
  }

  // Fall back to the original alignment if there was no realignment or it failed
  if (pretty_alignment.empty()) {
    pretty_alignment = current_read.pretty_aln;
    changed_alignment = false;
  }

  // Now fill in 2) and 3)

  for (unsigned int pretty_idx = 0; pretty_idx < pretty_alignment.length(); pretty_idx++) {

    // Where are we relative to the variant window and to the span of the reference allele?
    bool outside_of_window = ref_idx < my_ensemble.multiallele_window_start or ref_idx >= my_ensemble.multiallele_window_end;
    bool outside_ref_allele = (long)ref_idx < local_context.position0 or ref_idx >= (int)(local_context.position0 + local_context.reference_allele.length());

    // Basic sanity checks
    // Give up if the read is used up or the reference index ran past the end of the chromosome.
    // Only a '+' symbol is still accepted with the reference index exactly at the chromosome end.
    if (read_idx >= read_idx_max
        or  ref_idx > ref_reader.chr_size(chr_idx)
        or (ref_idx == ref_reader.chr_size(chr_idx) and pretty_alignment[pretty_idx] != '+')) {
      did_splicing = false;
      break;
    }

    // --- Splice ---
    // Done once, when the reference index reaches the variant position inside the window
    if (ref_idx == local_context.position0 and !did_splicing and !outside_of_window) {
      // Add insertions before variant window
      while (pretty_idx < pretty_alignment.length() and pretty_alignment[pretty_idx] == '+') {
    	for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++)
          my_hypotheses[i_hyp].push_back(current_read.alignment.QueryBases[read_idx]);
        read_idx++;
        pretty_idx++;
      }
      did_splicing = SpliceAddVariantAlleles(current_read, pretty_alignment, my_ensemble,
    		                    local_context, my_hypotheses, pretty_idx, global_context.DEBUG);
      just_did_splicing = did_splicing;
    } // --- ---

    // Have reference bases inside of window but outside of span of reference allele
    if (outside_ref_allele and !outside_of_window and pretty_alignment[pretty_idx] != '+') {
      for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++)
        my_hypotheses[i_hyp].push_back(ref_reader.base(chr_idx,ref_idx));
    }

    // Have read bases as called outside of variant window
    if (outside_of_window and pretty_alignment[pretty_idx] != '-') {
      for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++)
        my_hypotheses[i_hyp].push_back(current_read.alignment.QueryBases[read_idx]);

      // --- Information to log flows. Indices are w.r.t. aligned portion of the read
      if (!did_splicing) { // Log index of the last base left of window which is the same for all hypotheses.
        splice_start_idx = read_idx - current_read.left_sc;
      }
      else if (just_did_splicing) { // Log length of hypothesis after splicing
    	splice_end_idx[0] = read_idx  - current_read.left_sc;
    	int clipped_bases = 0;
    	if (!global_context.resolve_clipped_bases)
    	  clipped_bases = current_read.left_sc;
        for (unsigned int i_hyp=1; i_hyp<my_hypotheses.size(); i_hyp++)
          splice_end_idx[i_hyp] = my_hypotheses[i_hyp].length()-1 - clipped_bases; // Hyp length depends on whether there is resolving!
        just_did_splicing = false;
      }
      // --- ---
    }

    IncrementAlignmentIndices(pretty_alignment[pretty_idx], ref_idx, read_idx);

  } // end of for loop over extended pretty alignment

  // Check whether the whole reference allele fit
  // It seems that with primer trimming ion TVC, many a read throw this warning
  if (ref_idx < (int)(local_context.position0 + local_context.reference_allele.length())) {
    did_splicing = false;
    if (global_context.DEBUG>0)
      cout << "Warning in Splicing: Reference allele "<< local_context.reference_allele << " did not fit into read " << current_read.alignment.Name << endl;
  }

  if (did_splicing) {
    // --- Add soft clipped bases to the right of the alignment and reverse complement ---
    for (unsigned int i_hyp = 1; i_hyp<my_hypotheses.size(); i_hyp++) {
      if (!global_context.resolve_clipped_bases)
        my_hypotheses[i_hyp] += current_read.alignment.QueryBases.substr(current_read.alignment.QueryBases.length()-current_read.right_sc, current_read.right_sc);

      if (current_read.is_reverse_strand)
        RevComplementInPlace(my_hypotheses[i_hyp]);
    }

    // Get the main flows before and after splicing
    splice_end_flow = GetSpliceFlows(current_read, global_context, my_hypotheses, same_as_null_hypothesis,
                                     splice_start_idx, splice_end_idx, splice_start_flow);
    // NOTE(review): unlike the other warnings in this function, this one is printed at any DEBUG level
    if (splice_start_flow < 0 or splice_end_flow <= splice_start_flow) {
      did_splicing = false;
      cout << "Warning in Splicing: Splice flows are not valid in read " << current_read.alignment.Name
           << ". splice start flow: "<< splice_start_flow << " splice end flow " << splice_end_flow << endl;
    }
  }

  // Check for non-ACGT bases in hypotheses
  // The scan includes the null hypothesis and stops at the first invalid character found
  bool valid_bases = true;
  for (unsigned int i_hyp=0; i_hyp<my_hypotheses.size(); i_hyp++) {
	unsigned int iBase = 0;
	while (iBase<my_hypotheses[i_hyp].length() and valid_bases){
      if (my_hypotheses[i_hyp].at(iBase) == 'A' or my_hypotheses[i_hyp].at(iBase) == 'C' or
          my_hypotheses[i_hyp].at(iBase) == 'G' or my_hypotheses[i_hyp].at(iBase) == 'T')
      iBase++;
      else
        valid_bases = false;
	}
  }
  if (not valid_bases){
    cerr << "Non-Fatal ERROR in Splicing for " << local_context.contigName << ":" << local_context.position0+1
         << ": Read Hypotheses for " << current_read.alignment.Name << " contain non-ACGT characters." << endl;
    did_splicing = false;
  }

  // --- Fail safe for hypotheses and verbose
  // Without successful splicing every hypothesis is set to the read as called
  if (!did_splicing) {
	for (unsigned int i_hyp=1; i_hyp<my_hypotheses.size(); i_hyp++)
      my_hypotheses[i_hyp] = my_hypotheses[0];
    if (global_context.DEBUG > 1) {
      cout << "Failed to splice " << local_context.reference_allele << "->";
      for (unsigned int i_alt = 0; i_alt<my_ensemble.allele_identity_vector.size(); i_alt++) {
    	cout << my_ensemble.allele_identity_vector[i_alt].altAllele;
        if (i_alt < my_ensemble.allele_identity_vector.size()-1)
          cout << ",";
      }
      cout << " into read " << current_read.alignment.Name << endl;
    }
  }
  else if (global_context.DEBUG > 1) {
	cout << "Spliced " << local_context.reference_allele << "->";
    for (unsigned int i_alt = 0; i_alt<my_ensemble.allele_identity_vector.size(); i_alt++) {
      cout << my_ensemble.allele_identity_vector[i_alt].altAllele;
      if (i_alt < my_ensemble.allele_identity_vector.size()-1)
        cout << ",";
    }
    cout << " into ";
    if (current_read.is_reverse_strand) cout << "reverse ";
    else cout << "forward ";
    cout <<	"strand read read " << current_read.alignment.Name << endl;
    cout << "- Read as called: " << my_hypotheses[0] << endl;
    cout << "- Reference Hyp.: " << my_hypotheses[1] << endl;
    for (unsigned int i_hyp = 2; i_hyp<my_hypotheses.size(); i_hyp++)
      cout << "- Variant Hyp. " << (i_hyp-1) << ": " << my_hypotheses[i_hyp] << endl;
    cout << "- Splice start flow: " << splice_start_flow << " Splice end flow: " << splice_end_flow << endl;
  }

  return did_splicing;
};
// Realigns a short stretch of the read around the variant position against the reference
// and returns the pretty alignment string of the whole read with that stretch replaced.
//   thread_objects    : provides the realigner that does the actual alignment
//   current_read      : read with its alignment (pretty_aln), soft clipping and query bases
//   variant_position  : reference position around which the realignment window is placed
//   changed_alignment : set to true if the realigned stretch differs from the original one and
//                       to false if it is identical; not touched when an empty string is returned
//   DEBUG             : verbosity, values > 1 print the computed indices and enable realigner output
// Returns an empty string if nothing could be realigned: the variant position is not covered
// by the alignment, the realignment window has zero size, or the realigner failed.
// The window is grown on either side of the variant position until it holds at least
// min_bases reference bases and ends on a matching base ('|') that is not followed / preceded
// by the same reference base, i.e. it does not cut through a homopolymer.
string SpliceDoRealignement (PersistingThreadObjects &thread_objects, const Alignment &current_read, long variant_position,
		                     bool &changed_alignment, int DEBUG, const ReferenceReader &ref_reader, int chr_idx) {

  // We do not allow any clipping since we align a short substring
  thread_objects.realigner.SetClipping(0, true);
  string new_alignment;


  // --- Get index positions at snp variant position
  // Walk along the alignment until the reference index reaches the variant position
  int read_idx = current_read.left_sc;
  int ref_idx  = current_read.alignment.Position;
  unsigned int pretty_idx = 0;

  while (pretty_idx < current_read.pretty_aln.length() and ref_idx < variant_position) {
    IncrementAlignmentIndices(current_read.pretty_aln[pretty_idx], ref_idx, read_idx);
    pretty_idx++;
  }
  if (DEBUG > 1)
    cout << "Computed variant position as (red, ref, pretty) " << read_idx << " " << ref_idx << " " << pretty_idx << endl;

  // Variant position not within the aligned part of the read or the reference: nothing to realign
  if (pretty_idx >= current_read.pretty_aln.length()
       or ref_idx  >= ref_reader.chr_size(chr_idx)
       or read_idx >= (int)current_read.alignment.QueryBases.length() - current_read.right_sc)
    return new_alignment;

  // --- Get small sequence context for very local realignment ------------------------
  // Minimum number of reference bases wanted on either side of the variant position
  int min_bases = 5;

  // Looking at alignment to the left of variant position to find right place to cut sequence
  int read_left = read_idx;
  int ref_left  = ref_idx;
  unsigned int pretty_left = pretty_idx;
  bool continue_looking = pretty_idx > 0;

  while (continue_looking) {
    pretty_left--;
	DecrementAlignmentIndices(current_read.pretty_aln[pretty_left], ref_left, read_left);

	// Stopping criterion
	// Reached the first symbol of the alignment
	if (pretty_left < 1) {
      continue_looking = false;
      break;
	}
	if (ref_idx - ref_left < min_bases)
      continue_looking = true;
	else {
	  // make sure to start with a matching base and don't split large HPs
	  if (current_read.pretty_aln[pretty_left] != '|'
          or (ref_reader.base(chr_idx,ref_left+1) == ref_reader.base(chr_idx,ref_left)))
	    continue_looking = true;
	  else
	    continue_looking = false;
	}
  }
  if (DEBUG > 1)
    cout << "Computed left realignment window as (red, ref, pretty) " << read_left << " " << ref_left << " " << pretty_left << endl;


  // Looking at alignment to the right to find right place to cut sequence
  int read_right = read_idx;
  int ref_right  = ref_idx;
  unsigned int pretty_right = pretty_idx;
  continue_looking = pretty_idx < current_read.pretty_aln.length()-1;

  while (continue_looking) {
  	IncrementAlignmentIndices(current_read.pretty_aln[pretty_right], ref_right, read_right);
    pretty_right++;
  	// Stopping criterion (half open interval)
  	// Reached the end of the alignment or of the chromosome
  	if (pretty_right >= current_read.pretty_aln.length()
        or ref_right >= ref_reader.chr_size(chr_idx)) {
      continue_looking = false;
      break;
  	}
  	if (ref_right - ref_idx < min_bases)
        continue_looking = true;
  	else {
  	  // make sure to stop with a matching base and don't split large HPs
  	  if (current_read.pretty_aln[pretty_right-1] != '|'
          or (ref_reader.base(chr_idx,ref_right-1) == ref_reader.base(chr_idx,ref_right)))
  	    continue_looking = true;
  	  else
  	    continue_looking = false;
  	}
  }
  if (DEBUG > 1)
    cout << "Computed right realignment window as (red, ref, pretty) " << read_right << " " << ref_right << " " << pretty_right << endl;
  // Put in some sanity checks for alignment boundaries found...


  // --- Realign -------------------------
  // Output containers of the realigner; only its pretty alignment string is used below
  unsigned int start_position_shift;
  vector<CigarOp>    new_cigar_data;
  vector<MDelement>  new_md_data;

  // printouts
  if (DEBUG > 1) {
    thread_objects.realigner.verbose_ = true;
    cout << "Realigned " << current_read.alignment.Name << " from " << endl;
  }
  // The window is empty in both the read and the reference
  if (read_left >= read_right and ref_left >= ref_right) {
    if (DEBUG > 1)
      cout << "ERROR: realignment window has zero size! " << endl;
    return new_alignment;
  }

  // Hand the read and reference substrings of the window plus the current alignment to the realigner
  string old_alignment = current_read.pretty_aln.substr(pretty_left, pretty_right-pretty_left);
  thread_objects.realigner.SetSequences(current_read.alignment.QueryBases.substr(read_left, read_right-read_left),
                         ref_reader.substr(chr_idx, ref_left, ref_right-ref_left), old_alignment, true);

  if (!thread_objects.realigner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) {
    if (DEBUG > 1)
      cout << "ERROR: realignment failed! " << endl;
    return new_alignment;
  }

  // --- Fuse realigned partial sequence back into pretty_aln string
  new_alignment = current_read.pretty_aln;
  if (old_alignment == thread_objects.realigner.pretty_aln()) {
    changed_alignment = false;
  }
  else {
    new_alignment.replace(pretty_left, (pretty_right-pretty_left), thread_objects.realigner.pretty_aln());
    changed_alignment = true;
  }
  return new_alignment;
}
Exemple #10
0
// Classifies every alt allele of the current variant and derives the common
// evaluation window shared by all of them.
//
// Steps, in order:
//   1. Detect the local reference context of the variant (seq_context).
//   2. For each alt allele: copy hotspot / debug / filter settings, determine
//      the variant type and compute the allele's own window.
//   3. Take the union of all allele windows (smallest start, largest end) as
//      the multi-allele window; both bounds stay at -1 if there are no alleles.
//   4. OR the per-allele realignment request into doRealignment for alleles
//      that act as SNP / MNP when the matching realignment option is enabled.
//      doRealignment is only ever set here, never cleared.
//   5. Write the multi-allele window back into every allele.
//
// parameters      supplies the filter_variant controls read below
// global_context  supplies the DEBUG level and the error motifs
// ref_reader      reference access for the chromosome chr_idx
// chr_idx         index of the chromosome the variant lies on
void EnsembleEval::SetupAllAlleles(const ExtendParameters &parameters,
                                                 const InputStructures  &global_context,
                                                 const ReferenceReader &ref_reader,
                                                 int chr_idx)
{
  seq_context.DetectContext(*variant, global_context.DEBUG, ref_reader, chr_idx);
  allele_identity_vector.resize(variant->alt.size());

  // NOTE: the loop counters below are deliberately 'unsigned int'. A narrower
  // counter (e.g. uint8_t) wraps around at 256 and would never reach the loop
  // bound for records with more than 255 alt alleles.
  if (global_context.DEBUG > 0 and variant->alt.size()>0) {
    cout << "Investigating variant candidate " << seq_context.reference_allele
         << " -> " << variant->alt[0];
    for (unsigned int i_allele = 1; i_allele < allele_identity_vector.size(); i_allele++)
      cout << ',' << variant->alt[i_allele];
    cout << endl;
  }

  //now calculate the allele type (SNP/Indel/MNV/HPIndel etc.) and window for hypothesis calculation for each alt allele.
  for (unsigned int i_allele = 0; i_allele < allele_identity_vector.size(); i_allele++) {

    // TODO: Hotspot should be an allele property but we only set all or none to Hotspots, depending on the vcf record
    allele_identity_vector[i_allele].status.isHotSpot = variant->isHotSpot;
    allele_identity_vector[i_allele].filterReasons.clear();
    allele_identity_vector[i_allele].DEBUG = global_context.DEBUG;

    allele_identity_vector[i_allele].indelActAsHPIndel = parameters.my_controls.filter_variant.indel_as_hpindel;

    allele_identity_vector[i_allele].getVariantType(variant->alt[i_allele], seq_context,
        global_context.ErrorMotifs,  parameters.my_controls.filter_variant, ref_reader, chr_idx);
    allele_identity_vector[i_allele].CalculateWindowForVariant(seq_context, global_context.DEBUG, ref_reader, chr_idx);
  }

  //GetMultiAlleleVariantWindow();
  // -1 marks "not yet set"; the first allele seen initializes both bounds.
  multiallele_window_start = -1;
  multiallele_window_end   = -1;


  // Mark Ensemble for realignment if any of the possible variants should be realigned
  // TODO: Should we exclude already filtered alleles?
  for (unsigned int i_allele = 0; i_allele < allele_identity_vector.size(); i_allele++) {
    //if (!allele_identity_vector[i_allele].status.isNoCallVariant) {
    if (allele_identity_vector[i_allele].start_window < multiallele_window_start or multiallele_window_start == -1)
      multiallele_window_start = allele_identity_vector[i_allele].start_window;
    if (allele_identity_vector[i_allele].end_window > multiallele_window_end or multiallele_window_end == -1)
      multiallele_window_end = allele_identity_vector[i_allele].end_window;

    if (allele_identity_vector[i_allele].ActAsSNP() && parameters.my_controls.filter_variant.do_snp_realignment) {
      doRealignment = doRealignment or allele_identity_vector[i_allele].status.doRealignment;
    }
    if (allele_identity_vector[i_allele].ActAsMNP() && parameters.my_controls.filter_variant.do_mnp_realignment) {
      doRealignment = doRealignment or allele_identity_vector[i_allele].status.doRealignment;
    }
  }
  // Hack: pass allele windows back down the object
  for (unsigned int i_allele = 0; i_allele < allele_identity_vector.size(); i_allele++) {
    allele_identity_vector[i_allele].start_window = multiallele_window_start;
    allele_identity_vector[i_allele].end_window = multiallele_window_end;
  }


  if (global_context.DEBUG > 0) {
    cout << "Realignment for this candidate is turned " << (doRealignment ? "on" : "off") << endl;
    cout << "Final window for multi-allele: " << ": (" << multiallele_window_start << ") ";
    for (int p_idx = multiallele_window_start; p_idx < multiallele_window_end; p_idx++)
      cout << ref_reader.base(chr_idx,p_idx);
    cout << " (" << multiallele_window_end << ") " << endl;
  }
}
Exemple #11
0
void AlleleIdentity::CalculateWindowForVariant(const LocalReferenceContext &seq_context, int DEBUG,
    const ReferenceReader &ref_reader, int chr_idx) {

  // If we have an invalid vcf candidate, set a length zero window and exit
  if (!seq_context.context_detected or status.isProblematicAllele) {
    start_window = seq_context.position0;
    end_window = seq_context.position0;
    return;
  }

  // Check for MNRs first, for InDelLengths 2,3,4,5
  if (status.isIndel and !status.isHPIndel and inDelLength < 5)
    for (int rep_period = 2; rep_period < 6; rep_period++)
      if (IdentifyMultiNucRepeatSection(seq_context, rep_period, ref_reader, chr_idx)) {
        if (DEBUG > 0) {
          cout << "MNR found in allele " << seq_context.reference_allele << " -> " << altAllele << endl;
          cout << "Window for allele " << altAllele << ": (" << start_window << ") ";
          for (int p_idx = start_window; p_idx < end_window; p_idx++)
            cout << ref_reader.base(chr_idx,p_idx);
          cout << " (" << end_window << ") " << endl;
        }
        return; // Found a matching period and computed window
      }

  // not an MNR. Moving on along to InDels.
  if (status.isIndel) {
	// Default variant window
    end_window = seq_context.right_hp_start +1; // Anchor base to the right of allele
    start_window = seq_context.position0;

    // Adjustments if necessary
    if (status.isDeletion)
      if (seq_context.my_hp_start_pos[left_anchor] == seq_context.my_hp_start_pos[0])
        start_window = seq_context.my_hp_start_pos[0] - 1;

    if (status.isInsertion) {
      if (left_anchor == 0) {
        start_window = seq_context.my_hp_start_pos[0] - 1;
      }
      else if (altAllele[left_anchor] == altAllele[left_anchor - 1] and
          seq_context.position0 > (seq_context.my_hp_start_pos[left_anchor - 1] - 1)) {
        start_window = seq_context.my_hp_start_pos[left_anchor - 1] - 1;
      }
      if (altAllele[altAllele.length() - 1] == seq_context.ref_right_hp_base) {
        end_window += seq_context.right_hp_length;
      }
    }

    // Safety
    if (start_window < 0)
      start_window = 0;
    if (end_window > ref_reader.chr_size(chr_idx))
      end_window = ref_reader.chr_size(chr_idx);
  }
  else {
    // SNPs and MNVs are 1->1 base replacements
    start_window = seq_context.position0;
    end_window = seq_context.position0 + seq_context.reference_allele.length();
  } // */

  if (DEBUG > 0) {
    cout << "Window for allele " << altAllele << ": (" << start_window << ") ";
    for (int p_idx = start_window; p_idx < end_window; p_idx++)
      cout << ref_reader.base(chr_idx,p_idx);
    cout << " (" << end_window << ") " << endl;
  }
}
// open BAM input file
void BAMWalkerEngine::InitializeBAMs(const ReferenceReader& ref_reader, const vector<string>& bam_filenames)
{
  if (not bam_reader_.SetExplicitMergeOrder(BamMultiReader::MergeByCoordinate)) {
    cerr << "ERROR: Could not set merge order to BamMultiReader::MergeByCoordinate" << endl;
    exit(1);
  }

  if (not bam_reader_.Open(bam_filenames)) {
    cerr << "ERROR: Could not open input BAM file(s) : " << bam_reader_.GetErrorString();
    exit(1);
  }
  if (not bam_reader_.LocateIndexes()) {
    cerr << "ERROR: Could not open BAM index file(s) : " << bam_reader_.GetErrorString();
    exit(1);
  }


  bam_header_ = bam_reader_.GetHeader();
  if (!bam_header_.HasReadGroups()) {
    cerr << "ERROR: there is no read group in BAM files specified" << endl;
    exit(1);
  }

  //
  // Reference sequences in the bam file must match that in the fasta file
  //

  vector<RefData> referenceSequences = bam_reader_.GetReferenceData();

  if ((int)referenceSequences.size() != ref_reader.chr_count()) {
    cerr << "ERROR: Reference in BAM file does not match fasta file" << endl
         << "       BAM has " << referenceSequences.size()
         << " chromosomes while fasta has " << ref_reader.chr_count() << endl;
    exit(1);
  }

  for (int chr_idx = 0; chr_idx < ref_reader.chr_count(); ++chr_idx) {
    if (referenceSequences[chr_idx].RefName != ref_reader.chr_str(chr_idx)) {
      cerr << "ERROR: Reference in BAM file does not match fasta file" << endl
           << "       Chromosome #" << (chr_idx+1) << "in BAM is " << referenceSequences[chr_idx].RefName
           << " while fasta has " << ref_reader.chr_str(chr_idx) << endl;
      exit(1);
    }
    if (referenceSequences[chr_idx].RefLength != ref_reader.chr_size(chr_idx)) {
      cerr << "ERROR: Reference in BAM file does not match fasta file" << endl
           << "       Chromosome " << referenceSequences[chr_idx].RefName
           << "in BAM has length " << referenceSequences[chr_idx].RefLength
           << " while fasta has " << ref_reader.chr_size(chr_idx) << endl;
      exit(1);
    }
  }


  //
  // Retrieve BaseCaller and TMAP version strings from BAM header
  //

  set<string> basecaller_versions;
  set<string> tmap_versions;
  for (SamProgramIterator I = bam_header_.Programs.Begin(); I != bam_header_.Programs.End(); ++I) {
    if (I->ID.substr(0,2) == "bc")
      basecaller_versions.insert(I->Version);
    if (I->ID.substr(0,4) == "tmap")
      tmap_versions.insert(I->Version);
  }
  basecaller_version_ = "";
  for (set<string>::const_iterator I = basecaller_versions.begin(); I != basecaller_versions.end(); ++I) {
    if (not basecaller_version_.empty())
      basecaller_version_ += ", ";
    basecaller_version_ += *I;
  }
  tmap_version_ = "";
  for (set<string>::const_iterator I = tmap_versions.begin(); I != tmap_versions.end(); ++I) {
    if (not tmap_version_.empty())
      tmap_version_ += ", ";
    tmap_version_ += *I;
  }

}