void CReadBlastApp::ugly_simple_overlaps_call(int& n_user_neighbors, int& n_ext_neighbors, TSimpleSeqs::iterator& ext_rna, TSimpleSeqs::iterator& first_user_in_range, TSimpleSeqs::iterator& first_user_non_in_range, TSimpleSeqs& seqs, int max_distance, TSimpleSeqs::iterator& first_ext_in_range, TSimpleSeqs::iterator& first_ext_non_in_range, string& bufferstr) { if(PrintDetails()) { if(first_user_in_range==seqs.end()) { NcbiCerr << "ugly_simple_overlaps_call: first_user_in_range is already at the end" << NcbiEndl; } else { NcbiCerr << "ugly_simple_overlaps_call: first_user_in_range = " << printed_range(first_user_in_range) << NcbiEndl; } } n_user_neighbors = get_neighboring_sequences(ext_rna, first_user_in_range, first_user_non_in_range, seqs, max_distance); n_ext_neighbors = get_neighboring_sequences(ext_rna, first_ext_in_range, first_ext_non_in_range, m_extRNAtable2, max_distance); if(PrintDetails()) { if(first_user_in_range==seqs.end()) { NcbiCerr << "ugly_simple_overlaps_call: after call: first_user_in_range is already at the end" << NcbiEndl; } else { NcbiCerr << "ugly_simple_overlaps_call: after call: first_user_in_range = " << printed_range(first_user_in_range) << NcbiEndl; } } strstream buffer; addSimpleTab(buffer, "CENTER_REFERENCE", ext_rna, max_distance); for(TSimpleSeqs::iterator entry = first_ext_in_range; entry!= first_ext_non_in_range; entry++) { if(entry==ext_rna) continue; // addSimpleTab(buffer, "CENTER_REFERENCE", entry); else addSimpleTab(buffer, "REFERENCE", entry, max_distance); } for(TSimpleSeqs::iterator entry = first_user_in_range; entry!=first_user_non_in_range; entry++) { addSimpleTab(buffer, "VICINITY", entry, max_distance); } buffer << '\0'; bufferstr=buffer.str(); }
// CReadBlastApp::CollectSimpleSeqs
//
// Fills `seqs` with one TSimpleSeq per protein Bioseq and per RNA feature, then
// tries to give each of them the locus tag of a gene feature by comparing
// coordinates.
//
// Steps, in the order they appear in the body:
//  1. Proteins: every CSeq_entry reachable from Begin() that is not a set and
//     passes is_prot_entry() becomes a TSimpleSeq of type "CDS". Its description
//     comes from GetProtName(), its name from GetStringDescr(), `seq` references
//     the Bioseq, and the location returned by getGenomicLocation() is added with
//     addLoctoSimpleSeq().
//  2. Features: every CSeq_feat reachable from Begin() is examined.
//       - Gene features go to a local `genes` list with type "gene" and the locus
//         tag (or "Bad or no locus tag" when none can be read). The tag is also
//         kept in `name` and reused as the name of the RNA features that follow
//         (the code assumes each RNA feature is preceded by its gene).
//       - Features that are neither gene nor RNA are skipped.
//       - RNA features are appended to `seqs`. For a tRNA with an extension the
//         description is "tRNA:" + Get3type(); a failure of Get3type() is logged
//         and rethrown. For other RNAs the description is the extension name when
//         present. The `type` string is chosen from the RNA type
//         (rRNA uses GetRRNAtype()).
//  3. Both lists are sorted with less_simple_seq.
//  4. Exact matches: one forward sweep over `genes`; `seq` slides forward through
//     `seqs` until gene->key <= seq->key. If the two ends coincide (for a
//     non-plus-strand seq the ends are read from the exons in reverse order) the
//     gene's locus tag is copied to the seq and the gene is erased from `genes`.
//     The sweep stops as soon as `seq` reaches the end of `seqs`.
//  5. Non-exact matches, for each remaining gene:
//       - find the first seq without a locus tag whose start is not before the
//         gene start; if that seq ends after the gene end, the gene is left in
//         `genes`;
//       - otherwise extend `seq_end` over the seqs that end within the gene, then
//         one element further if not already at end(); count untagged CDS entries
//         on the way and log a WARNING if there is more than one;
//       - among the untagged seqs in [seq_start, seq_end) take the one with the
//         lowest gene_feat_fit() (ties go to the later seq because the comparison
//         is <=), copy the locus tag to it and erase the gene.
//  6. Reporting: for every tagged "CDS" seq, each feature comment on its Bioseq
//     that contains "Genomic Location: " is replaced by "Genomic Location: " plus
//     the locus tag; every seq still without a tag is logged as ERROR; every gene
//     still in `genes` is logged as WARNING.
//
// Parameter:
//   seqs  list that is appended to, sorted and annotated in place.
//
// NOTE(review): in step 5 the "continue..." debug line prints seq->locus_tag
//   although the loop variable is seq_start, and the "found seq_end" debug line
//   prints printed_range(seq_start) rather than seq_end -- both look like slips
//   that only affect diagnostic output; confirm before changing.
// NOTE(review): as the text stands, there is no return statement or closing brace
//   between the last reporting loop and the next function definition -- confirm
//   against the complete file.
int CReadBlastApp::CollectSimpleSeqs(TSimpleSeqs& seqs) { // collect stuff from proteins for(CTypeIterator< CSeq_entry > s = Begin(); s; ++s) { if(s->IsSet()) continue; if(!is_prot_entry(s->GetSeq())) continue; TSimpleSeq seq; seq.description = GetProtName(s->GetSeq()); seq.name = GetStringDescr (s->GetSeq()); seq.type = "CDS"; seq.seq = CRef<CBioseq>(&(s->SetSeq())); const CSeq_loc& loc = getGenomicLocation(s->GetSeq()); addLoctoSimpleSeq(seq, loc); seqs.push_back(seq); if(PrintDetails()) { NcbiCerr << "DEBUG: CollectSimpleSeqs(): added loc to CDS: " << "(" << seq.name << ")" << "(" << printed_range(loc) << ")" << "(" << seq.key << ":" << printed_range(seq) << ")" << NcbiEndl; } } // collect features from RNAs and genes string name; TSimpleSeqs genes; for(CTypeIterator< CSeq_feat > f = Begin(); f; ++f) { const CSeq_loc& loc = f->GetLocation(); if(f->GetData().IsGene()) { name = "Bad or no locus tag"; if (f->GetData().GetGene().CanGetLocus_tag()) name = f->GetData().GetGene().GetLocus_tag(); // I am assuming that each RNA feature is preceded by a gene TSimpleSeq gene; gene.type = "gene"; gene.locus_tag = name; addLoctoSimpleSeq(gene, loc); genes.push_back(gene); if(PrintDetails()) { NcbiCerr << "DEBUG: CollectSimpleSeqs(): added loc to gene: " << "(" << name << ")" << "(" << printed_range(loc) << ")" << "(" << gene.key << ":" << printed_range(gene) << ")" << NcbiEndl; } continue; } else if(!f->GetData().IsRna()) continue; CRNA_ref::EType rna_type = f->GetData().GetRna().GetType(); string description="Bad or no descriptioin"; if ( rna_type == CRNA_ref::eType_tRNA ) { if ( f->GetData().GetRna().CanGetExt() ) { string type1; try { type1 = Get3type(f->GetData().GetRna());} catch (...) 
{ NcbiCerr << "simple_overlaps: FATAL: cannot get aminoacid type for one trna feats" << NcbiEndl; throw; } description = "tRNA:" + type1; } } // if tRNA else { if(f->GetData().GetRna().CanGetExt() && f->GetData().GetRna().GetExt().IsName()) description = f->GetData().GetRna().GetExt().GetName(); } TSimpleSeq seq; if ( rna_type == CRNA_ref::eType_tRNA ) { seq.type = "tRNA"; } else if ( rna_type == CRNA_ref::eType_rRNA ) { seq.type = GetRRNAtype(f->GetData().GetRna());} else if ( rna_type == CRNA_ref::eType_premsg ) { seq.type = "premsg"; } else if ( rna_type == CRNA_ref::eType_mRNA ) { seq.type = "mRNA"; } else if ( rna_type == CRNA_ref::eType_snRNA ) { seq.type = "snRNA"; } else if ( rna_type == CRNA_ref::eType_scRNA ) { seq.type = "scRNA"; } else if ( rna_type == CRNA_ref::eType_snoRNA ) { seq.type = "snoRNA"; } else if ( rna_type == CRNA_ref::eType_other ) { seq.type = "other RNA"; } else { seq.type = "unknown RNA"; } seq.name = name; seq.description = description; addLoctoSimpleSeq(seq, loc); seqs.push_back(seq); } // features // need to tidy up before doing what is next seqs.sort(less_simple_seq); genes.sort(less_simple_seq); // now go over all gene features and match them to seqs features; // first of all do all exact locations TSimpleSeqs::iterator seq = seqs.begin(); for(TSimpleSeqs::iterator gene = genes.begin(); gene!=genes.end(); ) { string gene_range = printed_range(gene); int seq_from=0, seq_to=0; int gene_from = gene->exons[0].from; int gene_to = gene->exons[0].to; for(;seq!=seqs.end(); seq++) { string seq_range = printed_range(seq); seq_from = seq->exons[0].from; seq_to = seq->exons[0].to; if(PrintDetails()) { NcbiCerr << "DEBUG: CollectSimpleSeqs(): sliding seq " << seq_range << "(key: " << seq->key << ") to reach gene " << gene_range << "(key: " << gene->key << "), locus=" << gene->locus_tag << NcbiEndl; } if(gene->key<=seq->key) break; } if(seq==seqs.end()) break; seq_from = seq->exons[0].from; seq_to = seq->exons[0].to; string seq_range = 
printed_range(seq); if(PrintDetails()) { NcbiCerr << "DEBUG: CollectSimpleSeqs(): sliding seq " << seq_range << "(key: " << seq->key << ") reached gene " << gene_range << "(key: " << gene->key << "), locus=" << gene->locus_tag << NcbiEndl; } seq_to = seq->exons[seq->exons.size()-1].to; if(seq->exons[0].strand != eNa_strand_plus) // JIRA-PR-147 { seq_to = seq->exons[0].to; seq_from= seq->exons[seq->exons.size()-1].from; } gene_to = gene->exons[gene->exons.size()-1].to; if(seq_to==gene_to && seq_from==gene_from) // match { seq->locus_tag = gene->locus_tag; gene=genes.erase(gene++); } else gene++; } ///////////////////////////// // now try to assign non-exact gene-CDS matches ///////////////////////////// seq=seqs.begin(); for(TSimpleSeqs::iterator gene = genes.begin(); gene!=genes.end(); ) { string gene_printed_range = printed_range(gene); if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches: gene: " << gene_printed_range << NcbiEndl; int gene_from = gene->exons[0].from; // find first sequence that could match a gene TSimpleSeqs::iterator seq_start=seq; for(;seq_start!=seqs.end(); seq_start++) { if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: trying seq_start: " << printed_range(seq_start) << NcbiEndl; if(seq_start->locus_tag != "") { if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: " << seq->locus_tag << ", continue..."<<NcbiEndl; continue; // this is done } int seq_from = seq_start->exons[0].strand == eNa_strand_plus ? 
seq_start->exons[0].from : seq_start->exons[seq_start->exons.size()-1].from; if(gene_from<=seq_from) break; // in case there are cross-origin seqs, they will be in the end of seqs list, so they will be tested the last, thus this incorrect sliding should be fine } if(seq_start==seqs.end()) break; // done with seqs // now check if other ends fit if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_start: " << printed_range(seq_start) << NcbiEndl; int seq_to = seq_start->exons[0].strand == eNa_strand_plus ? seq_start->exons[seq_start->exons.size()-1].to : seq_start->exons[0].to; int gene_to = gene->exons[gene->exons.size()-1].to; if ( gene->exons[0].strand != eNa_strand_plus ) gene_to = gene->exons[0].to; if (seq_to > gene_to) { if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: sequences jumped over this gene, this gene does not fit any sequence, will be flagged later" << NcbiEndl; // sequences jumped over this gene, this gene does not fit any sequence, will be flagged later gene++; continue; } // end find first sequence that could match a gene // find first sequence that does not match a gene TSimpleSeqs::iterator seq_end = seq_start; int nmatches=0; for(;seq_end!=seqs.end() && gene_to >= (seq_end->exons[0].strand == eNa_strand_plus ? seq_end->exons[seq_end->exons.size()-1].to : seq_end->exons[0].to); seq_end++) { if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: trying to find: current_seq_end " << printed_range(seq_end) << ", gene_to = " << gene_to << ", seq_end.to = " << (seq_end->exons[0].strand == eNa_strand_plus ? 
seq_end->exons[seq_end->exons.size()-1].to : seq_end->exons[0].to) << NcbiEndl; if(seq_end->type == "CDS" && seq_end->locus_tag == "" ) nmatches++; } if(seq_end!=seqs.end() ) seq_end++; if(PrintDetails()) { if(seq_end!=seqs.end() ) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_end: " << printed_range(seq_start) << NcbiEndl; else NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_end: end()" << NcbiEndl; } // end find first sequence that does not match a gene if(PrintDetails()) { if(seq_end!=seqs.end() ) NcbiCerr << "non-exact gene-CDS matches(" << nmatches << "): seq_end: " << printed_range(seq_end) << NcbiEndl; else NcbiCerr << "non-exact gene-CDS matches(" << nmatches << "): seq_end: end()" << NcbiEndl; } if(nmatches>1) { string range = printed_range(gene); NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: WARNING: gene["<<gene_printed_range<<"] matches several (" << nmatches << ") CDS features: " << "locus = " << gene->locus_tag << ", " << "[" << range << "]" << NcbiEndl; } // look at all found fits bool gene_used=false; // find best fit and assign locus tag only for that feature TSimpleSeqs::iterator best_seq=seqs.end(); int best_gene_feat_fit = 0x0FFFFFFF; // intentionally less than the const in gene_feat_fit function for(seq=seq_start; seq!=seq_end; seq++) { if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: match: " << printed_range(seq) << NcbiEndl; if(seq->locus_tag != "") continue; // this is done already if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: match: " << printed_range(seq) << " does not have a locus tag yet" << NcbiEndl; /* if(seq->type != "CDS" ) { string range = printed_range(seq); NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: ERROR: non-CDS sequence does not have a gene with exactly the same boundaries: " << "type = " << seq->type << ", " << "name = " << seq->name << ", " << "[" << range << "]" << NcbiEndl; } else */ 
{ int fit=gene_feat_fit(seq, gene_from, gene_to); if(fit <= best_gene_feat_fit ) { best_seq=seq; best_gene_feat_fit = fit; } } } // for(seq=seq_start; seq!=seq_end; seq++) // found suitable seqs if(best_seq!=seqs.end()) { best_seq->locus_tag = gene->locus_tag; gene_used = true; } // go to next gene if(gene_used) gene=genes.erase(gene); else gene++; } // swipe over seqs flag those that do not have locus tag NON_CONST_ITERATE(TSimpleSeqs,seq, seqs) { if(seq->locus_tag != "") { if(seq->type == "CDS") { for(CTypeIterator<CSeq_feat> feat=::Begin(*(seq->seq)); feat; ++feat) { if(feat->CanGetComment() && feat->GetComment().find("Genomic Location: ") != string::npos) { string comment = "Genomic Location: " + seq->locus_tag; feat->SetComment(comment); } } } continue; } string range = printed_range(seq); NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: ERROR: feature does not have a matching gene: " << "type = " << seq->type << ", " << "name = " << seq->name << ", " << "[" << range << "]" << NcbiEndl; } // swipe over genes and flag those that are not used NON_CONST_ITERATE(TSimpleSeqs,gene, genes) { string range = printed_range(gene); NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: WARNING: gene does not match any feature: " << "locus = " << gene->locus_tag << ", " << "[" << range << "]" << NcbiEndl; }
int CReadBlastApp::simple_overlaps() { int nabsent=0; int saved_m_verbosity_threshold = m_verbosity_threshold; // m_verbosity_threshold = 300; if(PrintDetails()) NcbiCerr << "simple_overlaps starts: " << NcbiEndl; TSimpleSeqs& seqs=m_simple_seqs; // now calculated in CopyGenestoforgotthename TSimpleSeqs::iterator first_user_in_range = seqs.begin(); TSimpleSeqs::iterator first_user_non_in_range = seqs.begin(); TSimpleSeqs::iterator first_ext_in_range = m_extRNAtable2.begin(); TSimpleSeqs::iterator first_ext_non_in_range = m_extRNAtable2.begin(); TSimpleSeqs::iterator seq = seqs.begin(); NON_CONST_ITERATE(TSimpleSeqs, ext_rna, m_extRNAtable2) { int from, to; from = ext_rna->exons[0].from; to = ext_rna->exons[ext_rna->exons.size()-1].to; ENa_strand strand = ext_rna->exons[0].strand; int range_scale = to - from; int max_distance = get_max_distance(range_scale); string type2 = ext_rna->name; string ext_rna_range = printed_range(ext_rna); if(PrintDetails()) NcbiCerr << "simple_overlaps[" << type2 << "[" << ext_rna_range << "]" << "]" << NcbiEndl; // find BEST overlap, not good enough here TSimpleSeqs best_seq; find_overlap(seq, ext_rna, seqs, best_seq); // this will slide seq along seqs bool absent = true; string diag_name = ext_rna->name; // for buffer int n_user_neighbors=0; int n_ext_neighbors = 0; string bufferstr=""; NON_CONST_ITERATE(TSimpleSeqs, seq2, best_seq) { int overlap=0; overlaps(ext_rna, seq2, overlap); strstream seq2_range_stream; string seq2_range = printed_range(seq2); if(PrintDetails()) NcbiCerr << "simple_overlaps" << "[" << type2 << "[" << ext_rna_range << "]" << "[" << seq2_range << "]" << "]" << ". 
" << "Overlap = " << overlap << NcbiEndl; if(PrintDetails()) NcbiCerr << "ext_rna->type = " << ext_rna->type << NcbiEndl; if(PrintDetails()) NcbiCerr << "seq2->type = " << seq2->type << NcbiEndl; if(PrintDetails()) NcbiCerr << "strand = " << int(strand) << NcbiEndl; if(PrintDetails()) NcbiCerr << "seq2->exons[0].strand = " << int(seq2->exons[0].strand) << NcbiEndl; absent = absent && (!overlap || ext_rna->type != seq2->type); // Absent bool bad_strand = (overlap>0 && ext_rna->type == seq2->type && strand != seq2->exons[0].strand); // BadStrand if(!bad_strand) continue; string diag_name2 = seq2->name; int from2, to2; from2 = seq2->exons[0].from; to2 = seq2->exons[seq2->exons.size()-1].to; bool undef_strand = seq2->exons[0].strand == eNa_strand_unknown; if(!bufferstr.size()) { if(PrintDetails()) { if(first_user_in_range==seqs.end()) { NcbiCerr << "simple_overlaps: first_user_in_range is already at the end" << NcbiEndl; } else { NcbiCerr << "simple_overlaps: first_user_in_range = " << printed_range(first_user_in_range) << NcbiEndl; } } ugly_simple_overlaps_call(n_user_neighbors, n_ext_neighbors, ext_rna, first_user_in_range, first_user_non_in_range, seqs, max_distance, first_ext_in_range, first_ext_non_in_range, bufferstr); } strstream misc_feat; string seq_range = printed_range(seq); EProblem trnaStrandProblem = undef_strand ? 
eTRNAUndefStrand : eTRNABadStrand; misc_feat << "RNA does not match strand for feature located at " << seq_range << NcbiEndl; misc_feat << '\0'; // this goes to the misc_feat, has to be original location, and name, corrected strand problemStr problem = {trnaStrandProblem, bufferstr, misc_feat.str(), "", "", from2, to2, strand}; m_diag[diag_name2].problems.push_back(problem); if(PrintDetails()) NcbiCerr << "simple_overlaps: adding problem:" << "\t" << diag_name << "\t" << "eTRNABadStrand" << "\t" << bufferstr << "\t" << NcbiEndl; // this goes to the log, has to be new problemStr problem2 = {trnaStrandProblem, bufferstr, "", "", "", from, to, strand}; m_diag[diag_name].problems.push_back(problem2); } // best_Seq iteration NON_CONST_ITERATE(TSimpleSeqs, seq2, best_seq)