Example #1
0
/// Compares one bioseq against every leaf sequence found in a (possibly
/// nested) collection of seq-entries.
///
/// Entries that are sets are descended recursively.  A leaf sequence is
/// passed on to the two-bioseq overload only when its protein/non-protein
/// kind, as reported by is_prot_entry(), agrees with `against_prot`.
///
/// @param left          bioseq used as the left side of every comparison
/// @param against_seqs  seq-entries to walk
/// @param against_prot  true: compare only against protein entries;
///                      false: compare only against non-protein entries
/// @return always -1 (results of the nested calls are not propagated)
int CReadBlastApp::AnalyzeSeqsViaBioseqs(CBioseq& left,
    CBioseq_set::TSeq_set& against_seqs, bool against_prot)
{
  if(PrintDetails())
    {
    NcbiCerr << "AnalyzeSeqsViaBioseqs(left, against_seqs, against_prot): "
             << GetStringDescr(left)
             << ", against_prot= " << against_prot << "\n";
    }

  NON_CONST_ITERATE( CBioseq_set::TSeq_set, entry, against_seqs)
    {
    // a nested set: walk its members with the same left-hand sequence
    if((*entry)->IsSet())
      {
      AnalyzeSeqsViaBioseqs(left, (*entry)->SetSet().SetSeq_set(), against_prot);
      continue;
      }

    const CBioseq& candidate = (*entry)->GetSeq();
    // only a (currently disabled) trace ever consumed this description;
    // the lookup itself is retained as it was
    string name = GetStringDescr(candidate);

    // the requested kind and the actual kind must coincide
    const bool candidate_is_prot = is_prot_entry(candidate);
    if(against_prot != candidate_is_prot) continue;

    AnalyzeSeqsViaBioseqs(left, (*entry)->SetSeq() );
    }

  return -1;
}
Example #2
0
/// Walks a (possibly nested) pool of seq-entries and, for every leaf sequence
/// of the requested kind, runs the single-sequence analysis followed by the
/// comparison of that sequence against `against_seqs`.
///
/// @param in_pool_seqs  seq-entries supplying the left-hand sequences
/// @param against_seqs  seq-entries each selected left sequence is compared to
/// @param in_pool_prot  true: pick protein entries from the pool;
///                      false: pick non-protein entries
/// @param against_prot  forwarded unchanged to the per-sequence comparison
/// @return always -1 (results of the nested calls are not propagated)
int CReadBlastApp::AnalyzeSeqsViaBioseqs(
    CBioseq_set::TSeq_set& in_pool_seqs,
    CBioseq_set::TSeq_set& against_seqs,
    bool in_pool_prot, bool against_prot)
{
  if(PrintDetails())
    {
    NcbiCerr << "AnalyzeSeqsViaBioseqs(in_pool_seqs, against_seqs, in_pool_prot, against_prot): "
             << ", in_pool_prot= " << in_pool_prot
             << ", against_prot= " << against_prot << "\n";
    }

  NON_CONST_ITERATE( CBioseq_set::TSeq_set, entry, in_pool_seqs)
    {
    // a nested set: descend, keeping the same "against" collection
    if((*entry)->IsSet())
      {
      AnalyzeSeqsViaBioseqs((*entry)->SetSet().SetSeq_set(), against_seqs,
                            in_pool_prot, against_prot);
      continue;
      }

    const CBioseq& pool_seq = (*entry)->GetSeq();
    string name = GetStringDescr(pool_seq);
    if(PrintDetails())
      {
      NcbiCerr << "AnalyzeSeqsViaBioseqs(in_pool_seqs, against_seqs, in_pool_prot, against_prot): "
               << "left seq entry "
               << name << "\n";
      }

    // skip leaves whose kind differs from the requested one
    const bool pool_seq_is_prot = is_prot_entry(pool_seq);
    if(in_pool_prot != pool_seq_is_prot) continue;

    // first the stand-alone analysis, then the comparison against the rest
    AnalyzeSeqsViaBioseqs1((*entry)->SetSeq());
    AnalyzeSeqsViaBioseqs((*entry)->SetSeq(), against_seqs, against_prot);
    }

  return -1;
}
Example #3
0
/// Analyses a single pair of bioseqs.
///
/// Work is done only for the combination "left is a protein entry, right is
/// not": in that case the protein is checked against the annotations of the
/// right-hand sequence via overlaps_prot_na().  Every other combination is a
/// no-op.
///
/// @param left   candidate protein bioseq
/// @param right  candidate non-protein bioseq whose annotations are examined
/// @return always -1
int CReadBlastApp::AnalyzeSeqsViaBioseqs(CBioseq& left, CBioseq& right)
{
  // anything other than protein-vs-non-protein is ignored
  if(!is_prot_entry(left) || is_prot_entry(right))
    {
    return -1;
    }

  overlaps_prot_na(left, right.GetAnnot());
  return -1;
}
Example #4
0
/// Fills `seqs` with one simplified record per protein entry ("CDS") and per
/// RNA feature, then tries to attach a gene locus tag to each record.
///
/// Steps, in order:
///   1. every non-set protein seq-entry becomes a "CDS" record carrying its
///      genomic location;
///   2. every gene feature is collected into a local list, every RNA feature
///      becomes a record named after the most recently seen gene locus tag;
///   3. both lists are sorted with less_simple_seq;
///   4. genes whose boundaries coincide exactly with a record donate their
///      locus tag to it and are removed from the gene list;
///   5. each remaining gene is matched against a window of still untagged
///      records and the record with the smallest gene_feat_fit() value gets
///      the tag;
///   6. tagged CDS records get their "Genomic Location: " feature comments
///      rewritten, untagged records are reported as errors, unused genes as
///      warnings.
///
/// @param seqs  output list; records are appended and the list is re-sorted
///
/// NOTE(review): exons[0] is read for every gene and record without an
/// emptiness check; this presumably relies on addLoctoSimpleSeq() always
/// adding at least one exon - confirm.
int CReadBlastApp::CollectSimpleSeqs(TSimpleSeqs& seqs)
{
// collect stuff from proteins
  for(CTypeIterator< CSeq_entry > s = Begin(); s; ++s)
    {
    if(s->IsSet()) continue;
    if(!is_prot_entry(s->GetSeq())) continue;
    TSimpleSeq seq;
    seq.description = GetProtName(s->GetSeq());
    seq.name = GetStringDescr (s->GetSeq());
    seq.type = "CDS";
    seq.seq = CRef<CBioseq>(&(s->SetSeq()));
    const CSeq_loc&  loc = getGenomicLocation(s->GetSeq());
    addLoctoSimpleSeq(seq, loc);
    seqs.push_back(seq);
    if(PrintDetails())
      {
      NcbiCerr << "DEBUG: CollectSimpleSeqs(): added loc to CDS: "
               << "(" << seq.name  << ")"
               << "(" << printed_range(loc) << ")"
               << "(" << seq.key << ":" << printed_range(seq) << ")"
               <<  NcbiEndl;
      }
    }
// collect features from RNAs and genes
  string name;       // locus tag of the most recently seen gene feature
  TSimpleSeqs genes; // gene features still waiting to be matched to a record
  for(CTypeIterator< CSeq_feat > f = Begin(); f; ++f)
    {
    const CSeq_loc&  loc = f->GetLocation();
    if(f->GetData().IsGene())
      {
      name = "Bad or no locus tag";
      if (f->GetData().GetGene().CanGetLocus_tag())
        name =  f->GetData().GetGene().GetLocus_tag();
// I am assuming that each RNA feature is preceded by a gene
      TSimpleSeq gene;
      gene.type = "gene";
      gene.locus_tag = name;
      addLoctoSimpleSeq(gene, loc);
      genes.push_back(gene);
      if(PrintDetails())
      {
      NcbiCerr << "DEBUG: CollectSimpleSeqs(): added loc to gene: "
               << "(" << name << ")"
               << "(" << printed_range(loc) << ")"
               << "(" << gene.key << ":" << printed_range(gene) << ")"
               <<  NcbiEndl;
      }
      continue;
      }
    else if(!f->GetData().IsRna()) continue;
    // only RNA features get past this point
    CRNA_ref::EType rna_type = f->GetData().GetRna().GetType();
    // the default text (including its spelling) is stored in the record when
    // no better description is available, so it is kept byte-for-byte
    string description="Bad or no descriptioin";
    if ( rna_type == CRNA_ref::eType_tRNA )
      {
      if ( f->GetData().GetRna().CanGetExt() )
        {
        string type1;
        try { type1 = Get3type(f->GetData().GetRna());}
        catch  (...)
          {
          // report, then let the original exception continue to the caller
          NcbiCerr << "simple_overlaps: FATAL: cannot get aminoacid type for one trna feats" << NcbiEndl;
          throw;
          }
        description = "tRNA:" + type1;
        }
      } // if tRNA
    else
      {
      if(f->GetData().GetRna().CanGetExt() &&
         f->GetData().GetRna().GetExt().IsName())
         description = f->GetData().GetRna().GetExt().GetName();
      }
    TSimpleSeq seq;
    if      ( rna_type == CRNA_ref::eType_tRNA ) { seq.type = "tRNA"; }
    else if ( rna_type == CRNA_ref::eType_rRNA ) { seq.type = GetRRNAtype(f->GetData().GetRna());}
    else if ( rna_type == CRNA_ref::eType_premsg ) { seq.type = "premsg"; }
    else if ( rna_type == CRNA_ref::eType_mRNA ) { seq.type = "mRNA"; }
    else if ( rna_type == CRNA_ref::eType_snRNA ) { seq.type = "snRNA"; }
    else if ( rna_type == CRNA_ref::eType_scRNA ) { seq.type = "scRNA"; }
    else if ( rna_type == CRNA_ref::eType_snoRNA ) { seq.type = "snoRNA"; }
    else if ( rna_type == CRNA_ref::eType_other ) { seq.type = "other RNA"; }
    else { seq.type = "unknown RNA"; }
    seq.name = name;
    seq.description = description;
    addLoctoSimpleSeq(seq, loc);
    seqs.push_back(seq);
    } // features

// need to tidy up  before doing what is next
  seqs.sort(less_simple_seq);
  genes.sort(less_simple_seq);

// now go over all gene features and match them to seqs features;
// first of all do all exact locations
// `seq` is deliberately not reset between genes: both lists are sorted, so
// the cursor only ever moves forward
  TSimpleSeqs::iterator seq = seqs.begin();
  for(TSimpleSeqs::iterator gene = genes.begin(); gene!=genes.end(); )
    {
    string gene_range = printed_range(gene);
    int seq_from=0, seq_to=0;
    int gene_from = gene->exons[0].from;
    int gene_to   = gene->exons[0].to;
    // slide forward to the first record whose key is not below the gene key
    for(;seq!=seqs.end(); seq++)
       {
       string seq_range = printed_range(seq);
       seq_from = seq->exons[0].from;
       seq_to   = seq->exons[0].to;
       if(PrintDetails())
         {
         NcbiCerr << "DEBUG: CollectSimpleSeqs(): sliding seq " << seq_range << "(key: " << seq->key << ") to reach gene " << gene_range << "(key: " << gene->key << "), locus=" << gene->locus_tag << NcbiEndl;
         }
       if(gene->key<=seq->key) break;
       }
    if(seq==seqs.end()) break; // no records left to compare with

      seq_from = seq->exons[0].from;
      seq_to   = seq->exons[0].to;
      string seq_range = printed_range(seq);
      if(PrintDetails())
        {
        NcbiCerr << "DEBUG: CollectSimpleSeqs(): sliding seq " << seq_range << "(key: " << seq->key << ") reached gene " << gene_range << "(key: " << gene->key << "), locus=" << gene->locus_tag << NcbiEndl;
        }
      // overall extent of the record: first exon start .. last exon end,
      // taken from the opposite exons when the record is not on the plus strand
      seq_to  = seq->exons[seq->exons.size()-1].to;
      if(seq->exons[0].strand != eNa_strand_plus) // JIRA-PR-147
        {
        seq_to   = seq->exons[0].to;
        seq_from= seq->exons[seq->exons.size()-1].from;
        }
      gene_to = gene->exons[gene->exons.size()-1].to;
      if(seq_to==gene_to && seq_from==gene_from)  // match
        {
        seq->locus_tag = gene->locus_tag;
        // erase() hands back the element after the removed one
        gene = genes.erase(gene);
        }
      else gene++;

    }
/////////////////////////////
// now try to assign non-exact gene-CDS matches
/////////////////////////////
  seq=seqs.begin();
  for(TSimpleSeqs::iterator gene = genes.begin(); gene!=genes.end(); )
    {
    string gene_printed_range = printed_range(gene);
    if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches: gene: " << gene_printed_range << NcbiEndl;
    int gene_from = gene->exons[0].from;
// find first sequence that could match a gene
    TSimpleSeqs::iterator seq_start=seq;
    for(;seq_start!=seqs.end(); seq_start++)
       {
       if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: trying seq_start: "
                                   << printed_range(seq_start) << NcbiEndl;
       if(seq_start->locus_tag != "")
         {
         // report the tag of the record actually being skipped
         if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: "
                                     << seq_start->locus_tag << ", continue..."<<NcbiEndl;
         continue; // this is done
         }

       int seq_from = seq_start->exons[0].strand == eNa_strand_plus ? seq_start->exons[0].from : seq_start->exons[seq_start->exons.size()-1].from;
       if(gene_from<=seq_from) break; // in case there are cross-origin seqs, they will be in the end of seqs list, so they will be tested the last, thus this incorrect sliding should be fine
       }
    if(seq_start==seqs.end()) break; // done with seqs
// now check if other ends fit
    if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_start: " << printed_range(seq_start) << NcbiEndl;
    int seq_to =  seq_start->exons[0].strand == eNa_strand_plus
      ? seq_start->exons[seq_start->exons.size()-1].to
      : seq_start->exons[0].to;
    int gene_to = gene->exons[gene->exons.size()-1].to;
    if ( gene->exons[0].strand != eNa_strand_plus ) gene_to = gene->exons[0].to;
    if (seq_to > gene_to)
      {
      if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: sequences jumped over this gene, this gene does not fit any sequence, will be flagged later"  << NcbiEndl;
// sequences jumped over this gene, this gene does not fit any sequence, will be flagged later
      gene++;
      continue;
      }
// end find first sequence that could match a gene
// find first sequence that does not match a gene
    TSimpleSeqs::iterator seq_end = seq_start;
    int nmatches=0; // untagged CDS records ending inside the gene
    for(;seq_end!=seqs.end() &&
         gene_to >= (seq_end->exons[0].strand == eNa_strand_plus
           ? seq_end->exons[seq_end->exons.size()-1].to
           : seq_end->exons[0].to);
        seq_end++)
       {
       if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: trying to find: current_seq_end "
                                   << printed_range(seq_end)
                                   << ", gene_to = " << gene_to
                                   << ", seq_end.to = " << (seq_end->exons[0].strand == eNa_strand_plus
                                                           ? seq_end->exons[seq_end->exons.size()-1].to
                                                           : seq_end->exons[0].to)
                                   << NcbiEndl;

       if(seq_end->type == "CDS" && seq_end->locus_tag == "" ) nmatches++;
       }
    // widen the window by one element so that the first record reaching past
    // the gene end is still offered to gene_feat_fit() below
    if(seq_end!=seqs.end() ) seq_end++;
    if(PrintDetails())
      {
      if(seq_end!=seqs.end() )
        NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_end: " << printed_range(seq_end) << NcbiEndl;
      else
        NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_end: end()"  << NcbiEndl;
      }
// end find first sequence that does not match a gene
    if(PrintDetails())
       {
       if(seq_end!=seqs.end() )
         NcbiCerr << "non-exact gene-CDS matches(" << nmatches << "): seq_end: " << printed_range(seq_end) << NcbiEndl;
       else
         NcbiCerr << "non-exact gene-CDS matches(" << nmatches << "): seq_end: end()" << NcbiEndl;
       }
    if(nmatches>1)
      {
      string range = printed_range(gene);
      NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: WARNING: gene["<<gene_printed_range<<"] matches several (" << nmatches << ") CDS features: "
         << "locus = " << gene->locus_tag << ", "
         << "[" << range << "]" << NcbiEndl;
      }

// look at all found fits
    bool gene_used=false;
// find best fit and assign locus tag only for that feature
    TSimpleSeqs::iterator best_seq=seqs.end();
    int best_gene_feat_fit = 0x0FFFFFFF; // intentionally less than the const in gene_feat_fit function
    // note: `seq` is left at seq_end afterwards, which is where the search
    // for the next gene starts
    for(seq=seq_start; seq!=seq_end; seq++)
      {
      if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: match: " << printed_range(seq)  << NcbiEndl;
      if(seq->locus_tag != "") continue; // this is done already
      if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: match: " << printed_range(seq)
         << " does not have a locus tag yet"
         << NcbiEndl;
/*
      if(seq->type != "CDS" )
        {
        string range = printed_range(seq);
        NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: ERROR: non-CDS sequence does not have a gene with exactly the same boundaries: "
         << "type = " << seq->type << ", "
         << "name = " << seq->name << ", "
         << "[" << range << "]" << NcbiEndl;
        }
      else
*/
        {
        int fit=gene_feat_fit(seq, gene_from, gene_to);
        // "<=": on equal scores the later record wins
        if(fit <= best_gene_feat_fit )
          {
          best_seq=seq; best_gene_feat_fit = fit;
          }
        }
      } // for(seq=seq_start; seq!=seq_end; seq++)
// found suitable seqs
   if(best_seq!=seqs.end())
      {
      best_seq->locus_tag = gene->locus_tag;
      gene_used = true;
      }
// go to next gene
    if(gene_used) gene=genes.erase(gene);
    else gene++;
    }

// swipe over seqs flag those that do not have locus tag
  NON_CONST_ITERATE(TSimpleSeqs,seq, seqs)
    {
    if(seq->locus_tag != "")
      {
      if(seq->type == "CDS")
         {
         // replace every "Genomic Location: ..." comment found under the
         // protein with one that carries the assigned locus tag
         for(CTypeIterator<CSeq_feat> feat=::Begin(*(seq->seq)); feat; ++feat)
           {
           if(feat->CanGetComment() && feat->GetComment().find("Genomic Location: ") != string::npos)
             {
             string comment = "Genomic Location: " + seq->locus_tag;
             feat->SetComment(comment);
             }
           }
         }
      continue;
      }
    string range = printed_range(seq);
    NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: ERROR: feature does not have a matching gene: "
      << "type = " << seq->type << ", "
      << "name = " << seq->name << ", "
      << "[" << range << "]" << NcbiEndl;
    }
// swipe over genes and flag those that are not used
  NON_CONST_ITERATE(TSimpleSeqs,gene, genes)
    {
    string range = printed_range(gene);
    NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: WARNING: gene does not match any feature: "
         << "locus = " << gene->locus_tag << ", "
         << "[" << range << "]" << NcbiEndl;
    }
Example #5
0
// w.out CTypeConstIterator
/// Runs the overlap and frameshift analyses over one level of seq-entries,
/// recursing into nested sets.
///
/// First pass, per entry:
///   - a nested set is analysed recursively;
///   - a non-protein sequence has its annotations checked with overlaps_na();
///   - a protein sequence is compared with overlaps() against the following
///     entries selected by skip_toprot(), for as long as overlaps() keeps
///     returning true.  When skip_toprot() reports that nothing is left, the
///     whole first pass stops.
///
/// Second pass, per non-set entry that has BLAST hits: the entry is paired
/// with the next candidate selected by skip_to_valid_seq_cand(); if
/// fit_blast() accepts the pair, a frameshift misc_feature is appended via
/// append_misc_feature().  The pass stops as soon as no candidate is left.
///
/// @param seqs  seq-entries of the current level
/// @return always -1
int CReadBlastApp::AnalyzeSeqs(CBioseq_set::TSeq_set& seqs)
{
   // bind by reference: a by-value CArgs would copy the whole argument set
   // on every (recursive) invocation
   const CArgs& args = GetArgs();
   IncreaseVerbosity();
   string tblFile;
   if (args["outTbl"].HasValue())
      tblFile = args["outTbl"].AsString();
   else
      tblFile = "/dev/null";
   // NOTE(review): tblOut is opened (append mode) but never written to in
   // this function; kept because opening it creates the file - confirm
   // whether that side effect is still wanted.
   ofstream tblOut(tblFile.c_str(), IOS_BASE::app | IOS_BASE::out );
   NON_CONST_ITERATE( CBioseq_set::TSeq_set, left, seqs)
     {
     if((*left)->IsSet())
       {
       if(PrintDetails())
           NcbiCerr << "AnalyzeSeqs: going down: "
                    << NcbiEndl;
       CBioseq_set::TSeq_set& seqs2 = (*left)->SetSet().SetSeq_set();
       PushVerbosity();
       AnalyzeSeqs(seqs2);
       PopVerbosity();
       continue;
       }

     if(PrintDetails())
          NcbiCerr << "AnalyzeSeqs: left: "
                   // <<  CSeq_id::GetStringDescr ((*left)->GetSeq(), CSeq_id::eFormat_FastA) << NcbiEndl;
                   <<  GetStringDescr ((*left)->GetSeq()) << NcbiEndl;
/////////////////////////////////
// not a protein. Do NA stuff
     if( !is_prot_entry((*left)->GetSeq())  )
       {
// NA, process all RNA and what not annotations here and compare for overlaps
       // CheckMissingRibosomalRNA((*left)->GetSeq().GetAnnot() );

// check overlaps of the sequence with other features
       overlaps_na((*left)->GetSeq().GetAnnot() );
       continue;
       }
///////////////////////////////////
// compare to...
     CBioseq_set::TSeq_set::iterator right = left;
     bool again=true;
     bool last_right=false; // set when skip_toprot() finds nothing more
     while(again) // have overlaps
       {
       again=false;
       ++right;
       if(!skip_toprot(right, seqs)) {last_right=true; break;}
       if(PrintDetails())
          {
          NcbiCerr << "AnalyzeSeqs: right: "
                   <<  GetStringDescr ((*right)->GetSeq()) << NcbiEndl;
          }
// analyze for overlaps with the next one
       PushVerbosity();
// if there are overlaps, keep on working on left, iterating through right
       again=overlaps((*left)->GetSeq(), (*right)->GetSeq() );
       PopVerbosity();
       }
     // nothing to the right of this entry: later entries have nothing either
     if (last_right) break;
     if(PrintDetails())
       NcbiCerr << "AnalyzeSeqs: finished lower level seq, overlaps: "
                << NcbiEndl;
     }

   NON_CONST_ITERATE( CBioseq_set::TSeq_set, left, seqs)
     {
     if((*left)->IsSet()) continue;
// does not hit. Skip
     if( !has_blast_hits((*left)->GetSeq()) ) continue;
     if(PrintDetails()) NcbiCerr << "AnalyzeSeqs: left: valid" <<  NcbiEndl;
     CBioseq_set::TSeq_set::iterator right = left;  ++right;
     if(!skip_to_valid_seq_cand(right, seqs)) break;
     if(PrintDetails()) NcbiCerr << "AnalyzeSeqs: right: valid" <<  NcbiEndl;
     string common_subject;
     bool fit_blast_result = fit_blast((*left)->GetSeq(), (*right)->GetSeq(), common_subject);
     // the four flags below currently feed only the trace output; the
     // stricter condition that used them is kept commented out further down
     bool lhp = hasProblems((*left)->GetSeq(), m_diag, eFrameShift);
     bool rhp = hasProblems((*right)->GetSeq(), m_diag, eFrameShift);
     bool lhoe = hasProblems((*left)->GetSeq(), m_diag, eMayBeNotFrameShift);
     bool rhoe = hasProblems((*right)->GetSeq(), m_diag, eMayBeNotFrameShift);
     if(PrintDetails())
       NcbiCerr << "AnalyzeSeqs: after fit_blast:"
        << fit_blast_result
        << lhp
        << lhoe
        << rhp
        << rhoe
        << NcbiEndl;
//     if(fit_blast_result && (lhp && !lhoe) && (rhp && !rhoe))
     if(fit_blast_result)
        {
// go to the same sequence set, find first NA, add misc_feature
        append_misc_feature(seqs, GetStringDescr((*left)->GetSeq()), eFrameShift);
        }
     if(PrintDetails())
       NcbiCerr << "AnalyzeSeqs: finished lower level seq, frameshifts: "
                << NcbiEndl;
     }
   DecreaseVerbosity();
   return -1;
}