Example #1
0
// Verifies that no locus_tag is shared by more than one gene feature.
//
// Walks every CSeq_feat reachable from Begin(), counts how many gene features
// carry each locus_tag, and reports on NcbiCerr every tag that occurs more
// than once. Gene features for which CanGetLocus_tag() is false are ignored.
//
// Throws std::runtime_error if at least one duplicated locus_tag was found.
// A bare `throw;` must not be used here: with no exception in flight it calls
// std::terminate() instead of raising an error the caller could handle.
void CReadBlastApp::CheckUniqLocusTag()
{
  typedef map<string, int> THaveIt;
  THaveIt locuses; // locus_tag -> number of gene features carrying it
  for(CTypeIterator< CSeq_feat > f = Begin(); f; ++f)
    {
    if(!f->GetData().IsGene()) continue;
    if(!f->GetData().GetGene().CanGetLocus_tag()) continue;
    // Index directly by the returned tag; no temporary string copy is needed.
    ++locuses[f->GetData().GetGene().GetLocus_tag()];
    }

  // Report every duplicated tag before failing, so that a single run lists
  // all of the offenders rather than just the first one.
  bool bad=false;
  ITERATE(THaveIt, locus, locuses)
    {
    if(locus->second<2) continue;
    bad=true;
    NcbiCerr << "ERROR: CReadBlastApp::CheckUniqLocusTag : more than one gene with the same locus_tag: "
             << locus->first 
             << " = "
             << locus->second
             << NcbiEndl;
    }
  if(bad)
    {
    NcbiCerr << "FATAL: sequences or genes with the same locus_tag are not allowed" << NcbiEndl;
    throw std::runtime_error("CReadBlastApp::CheckUniqLocusTag: sequences or genes with the same locus_tag are not allowed");
    }
}
Example #2
0
void CReadBlastApp::GetGenomeLen()
{
    for (CTypeIterator<CBioseq> seq = Begin();  seq;  ++seq)
      {
// check if na
      if(seq->GetInst().GetMol()!=CSeq_inst::eMol_dna) continue;
      if(PrintDetails()) NcbiCerr << "GetGenomeLen: found DNA" << NcbiEndl;
      m_length = seq->GetInst().GetLength(); // let the toolkit take care of exception
      } // end iteration over all genomic sequences
}
Example #3
0
int CReadBlastApp::CollectSimpleSeqs(TSimpleSeqs& seqs)
{
// collect stuff from proteins
  for(CTypeIterator< CSeq_entry > s = Begin(); s; ++s)
    {
    if(s->IsSet()) continue;
    if(!is_prot_entry(s->GetSeq())) continue;
    TSimpleSeq seq;
    seq.description = GetProtName(s->GetSeq());
    seq.name = GetStringDescr (s->GetSeq());
    seq.type = "CDS";
    seq.seq = CRef<CBioseq>(&(s->SetSeq()));
    const CSeq_loc&  loc = getGenomicLocation(s->GetSeq());
    addLoctoSimpleSeq(seq, loc);
    seqs.push_back(seq);
    if(PrintDetails())
      {
      NcbiCerr << "DEBUG: CollectSimpleSeqs(): added loc to CDS: " 
               << "(" << seq.name  << ")"
               << "(" << printed_range(loc) << ")"
               << "(" << seq.key << ":" << printed_range(seq) << ")"
               <<  NcbiEndl;
      }
    }
// collect features from RNAs and genes
  string name;
  TSimpleSeqs genes;
  for(CTypeIterator< CSeq_feat > f = Begin(); f; ++f)
    {
    const CSeq_loc&  loc = f->GetLocation();
    if(f->GetData().IsGene())
      {
      name = "Bad or no locus tag";
      if (f->GetData().GetGene().CanGetLocus_tag())
        name =  f->GetData().GetGene().GetLocus_tag();
// I am assuming that each RNA feature is preceded by a gene
      TSimpleSeq gene; 
      gene.type = "gene";
      gene.locus_tag = name;
      addLoctoSimpleSeq(gene, loc);
      genes.push_back(gene);
      if(PrintDetails())
      {
      NcbiCerr << "DEBUG: CollectSimpleSeqs(): added loc to gene: " 
               << "(" << name << ")"
               << "(" << printed_range(loc) << ")"
               << "(" << gene.key << ":" << printed_range(gene) << ")"
               <<  NcbiEndl;
      }
      continue;
      }
    else if(!f->GetData().IsRna()) continue;
    CRNA_ref::EType rna_type = f->GetData().GetRna().GetType();
    string description="Bad or no descriptioin";
    if ( rna_type == CRNA_ref::eType_tRNA )
      {
      if ( f->GetData().GetRna().CanGetExt() )
        {
        string type1;
        try { type1 = Get3type(f->GetData().GetRna());}
        catch  (...)
          {
          NcbiCerr << "simple_overlaps: FATAL: cannot get aminoacid type for one trna feats" << NcbiEndl;
          throw;
          }
        description = "tRNA:" + type1;
        }
      } // if tRNA
    else
      {
      if(f->GetData().GetRna().CanGetExt() &&
         f->GetData().GetRna().GetExt().IsName())
         description = f->GetData().GetRna().GetExt().GetName();
      }
    TSimpleSeq seq;
    if      ( rna_type == CRNA_ref::eType_tRNA ) { seq.type = "tRNA"; }
    else if ( rna_type == CRNA_ref::eType_rRNA ) { seq.type = GetRRNAtype(f->GetData().GetRna());}
    else if ( rna_type == CRNA_ref::eType_premsg ) { seq.type = "premsg"; }
    else if ( rna_type == CRNA_ref::eType_mRNA ) { seq.type = "mRNA"; }
    else if ( rna_type == CRNA_ref::eType_snRNA ) { seq.type = "snRNA"; }
    else if ( rna_type == CRNA_ref::eType_scRNA ) { seq.type = "scRNA"; }
    else if ( rna_type == CRNA_ref::eType_snoRNA ) { seq.type = "snoRNA"; }
    else if ( rna_type == CRNA_ref::eType_other ) { seq.type = "other RNA"; }
    else { seq.type = "unknown RNA"; }
    seq.name = name;
    seq.description = description;
    addLoctoSimpleSeq(seq, loc);
    seqs.push_back(seq);
    } // features

// need to tidy up  before doing what is next
  seqs.sort(less_simple_seq);
  genes.sort(less_simple_seq);

// now go over all gene features and match them to seqs features;
// first of all do all exact locations
  TSimpleSeqs::iterator seq = seqs.begin();
  for(TSimpleSeqs::iterator gene = genes.begin(); gene!=genes.end(); )
    {
    string gene_range = printed_range(gene);
    int seq_from=0, seq_to=0;
    int gene_from = gene->exons[0].from;
    int gene_to   = gene->exons[0].to;
    for(;seq!=seqs.end(); seq++)
       {
       string seq_range = printed_range(seq);
       seq_from = seq->exons[0].from;
       seq_to   = seq->exons[0].to;
       if(PrintDetails()) 
         {
         NcbiCerr << "DEBUG: CollectSimpleSeqs(): sliding seq " << seq_range << "(key: " << seq->key << ") to reach gene " << gene_range << "(key: " << gene->key << "), locus=" << gene->locus_tag << NcbiEndl;
         }
       if(gene->key<=seq->key) break; 
       }
    if(seq==seqs.end()) break; 

      seq_from = seq->exons[0].from;
      seq_to   = seq->exons[0].to;
      string seq_range = printed_range(seq);
      if(PrintDetails()) 
        {
        NcbiCerr << "DEBUG: CollectSimpleSeqs(): sliding seq " << seq_range << "(key: " << seq->key << ") reached gene " << gene_range << "(key: " << gene->key << "), locus=" << gene->locus_tag << NcbiEndl;
        }
      seq_to  = seq->exons[seq->exons.size()-1].to;
      if(seq->exons[0].strand != eNa_strand_plus) // JIRA-PR-147
        {
        seq_to   = seq->exons[0].to;
        seq_from= seq->exons[seq->exons.size()-1].from;
        }
      gene_to = gene->exons[gene->exons.size()-1].to;
      if(seq_to==gene_to && seq_from==gene_from)  // match
        { 
        seq->locus_tag = gene->locus_tag; 
        gene=genes.erase(gene++); 
        }
      else gene++;
    
    }
/////////////////////////////
// now try to assign non-exact gene-CDS matches
/////////////////////////////
  seq=seqs.begin();
  for(TSimpleSeqs::iterator gene = genes.begin(); gene!=genes.end(); )
    {
    string gene_printed_range = printed_range(gene);
    if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches: gene: " << gene_printed_range << NcbiEndl;
    int gene_from = gene->exons[0].from;
// find first sequence that could match a gene
    TSimpleSeqs::iterator seq_start=seq;
    for(;seq_start!=seqs.end(); seq_start++)
       {
       if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: trying seq_start: " 
                                   << printed_range(seq_start) << NcbiEndl;
       if(seq_start->locus_tag != "") 
         {
         if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: " 
                                     << seq->locus_tag << ", continue..."<<NcbiEndl;
         continue; // this is done
         }

       int seq_from = seq_start->exons[0].strand == eNa_strand_plus ? seq_start->exons[0].from : seq_start->exons[seq_start->exons.size()-1].from;
       if(gene_from<=seq_from) break; // in case there are cross-origin seqs, they will be in the end of seqs list, so they will be tested the last, thus this incorrect sliding should be fine
       }
    if(seq_start==seqs.end()) break; // done with seqs
// now check if other ends fit
    if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_start: " << printed_range(seq_start) << NcbiEndl;
    int seq_to =  seq_start->exons[0].strand == eNa_strand_plus 
      ? seq_start->exons[seq_start->exons.size()-1].to
      : seq_start->exons[0].to;
    int gene_to = gene->exons[gene->exons.size()-1].to;
    if ( gene->exons[0].strand != eNa_strand_plus ) gene_to = gene->exons[0].to;
    if (seq_to > gene_to) 
      {
      if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: sequences jumped over this gene, this gene does not fit any sequence, will be flagged later"  << NcbiEndl;
// sequences jumped over this gene, this gene does not fit any sequence, will be flagged later
      gene++;
      continue;
      }
// end find first sequence that could match a gene
// find first sequence that does not match a gene
    TSimpleSeqs::iterator seq_end = seq_start;
    int nmatches=0;
    for(;seq_end!=seqs.end() &&
         gene_to >= (seq_end->exons[0].strand == eNa_strand_plus 
           ? seq_end->exons[seq_end->exons.size()-1].to
           : seq_end->exons[0].to);
        seq_end++)
       {
       if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: trying to find: current_seq_end " 
                                   << printed_range(seq_end)  
                                   << ", gene_to = " << gene_to
                                   << ", seq_end.to = " << (seq_end->exons[0].strand == eNa_strand_plus
                                                           ? seq_end->exons[seq_end->exons.size()-1].to
                                                           : seq_end->exons[0].to)
                                   << NcbiEndl;

       if(seq_end->type == "CDS" && seq_end->locus_tag == "" ) nmatches++;
       }
    if(seq_end!=seqs.end() ) seq_end++;
    if(PrintDetails()) 
      {
      if(seq_end!=seqs.end() ) 
        NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_end: " << printed_range(seq_start) << NcbiEndl;
      else
        NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_end: end()"  << NcbiEndl;
      }
// end find first sequence that does not match a gene
    if(PrintDetails()) 
       {
       if(seq_end!=seqs.end() )
         NcbiCerr << "non-exact gene-CDS matches(" << nmatches << "): seq_end: " << printed_range(seq_end) << NcbiEndl;
       else
         NcbiCerr << "non-exact gene-CDS matches(" << nmatches << "): seq_end: end()" << NcbiEndl;
       }
    if(nmatches>1)
      {
      string range = printed_range(gene);
      NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: WARNING: gene["<<gene_printed_range<<"] matches several (" << nmatches << ") CDS features: "
         << "locus = " << gene->locus_tag << ", "
         << "[" << range << "]" << NcbiEndl;
      }

// look at all found fits
    bool gene_used=false;
// find best fit and assign locus tag only for that feature
    TSimpleSeqs::iterator best_seq=seqs.end();
    int best_gene_feat_fit = 0x0FFFFFFF; // intentionally less than the const in gene_feat_fit function
    for(seq=seq_start; seq!=seq_end; seq++)
      { 
      if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: match: " << printed_range(seq)  << NcbiEndl;
      if(seq->locus_tag != "") continue; // this is done already
      if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: match: " << printed_range(seq)  
         << " does not have a locus tag yet"
         << NcbiEndl;
/*
      if(seq->type != "CDS" )
        {
        string range = printed_range(seq);
        NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: ERROR: non-CDS sequence does not have a gene with exactly the same boundaries: "
         << "type = " << seq->type << ", "
         << "name = " << seq->name << ", "
         << "[" << range << "]" << NcbiEndl;
        }
      else 
*/
        {
        int fit=gene_feat_fit(seq, gene_from, gene_to);
        if(fit <= best_gene_feat_fit )
          {
          best_seq=seq; best_gene_feat_fit = fit; 
          }
        }
      } // for(seq=seq_start; seq!=seq_end; seq++)
// found suitable seqs
   if(best_seq!=seqs.end())
      {
      best_seq->locus_tag = gene->locus_tag;
      gene_used = true;
      }
// go to next gene
    if(gene_used) gene=genes.erase(gene);
    else gene++;
    }

// swipe over seqs flag those that do not have locus tag
  NON_CONST_ITERATE(TSimpleSeqs,seq, seqs)
    {
    if(seq->locus_tag != "") 
      {
      if(seq->type == "CDS")
         {
         for(CTypeIterator<CSeq_feat> feat=::Begin(*(seq->seq)); feat; ++feat)
           {
           if(feat->CanGetComment() && feat->GetComment().find("Genomic Location: ") != string::npos)
             {
             string comment = "Genomic Location: " + seq->locus_tag;
             feat->SetComment(comment);
             }
           }
         }
      continue;
      }
    string range = printed_range(seq);
    NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: ERROR: feature does not have a matching gene: "
      << "type = " << seq->type << ", "
      << "name = " << seq->name << ", "
      << "[" << range << "]" << NcbiEndl;
    }
// swipe over genes and flag those that are not used 
  NON_CONST_ITERATE(TSimpleSeqs,gene, genes)
    {
    string range = printed_range(gene);
    NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: WARNING: gene does not match any feature: "
         << "locus = " << gene->locus_tag << ", "
         << "[" << range << "]" << NcbiEndl;
    }