// Pairs one sequence with every matching leaf entry of a (possibly nested)
// sequence set.
//
// Entries of against_seqs that are themselves sets are descended into
// recursively. A leaf entry is handed, together with left, to the
// two-sequence overload of AnalyzeSeqsViaBioseqs when its protein /
// non-protein kind (as reported by is_prot_entry) agrees with against_prot.
//
//   left          sequence that every selected entry is paired with
//   against_seqs  entries to scan
//   against_prot  true: select protein entries; false: select non-protein ones
//
// Always returns -1; the results of the nested calls are not inspected.
int CReadBlastApp::AnalyzeSeqsViaBioseqs(CBioseq& left,
                                         CBioseq_set::TSeq_set& against_seqs,
                                         bool against_prot)
{
    if (PrintDetails()) {
        NcbiCerr << "AnalyzeSeqsViaBioseqs(left, against_seqs, against_prot): "
                 << GetStringDescr(left)
                 << ", against_prot= " << against_prot
                 << "\n";
    }

    NON_CONST_ITERATE(CBioseq_set::TSeq_set, candidate, against_seqs) {
        if ((*candidate)->IsSet()) {
            // Nested set: repeat the scan one level down.
            CBioseq_set::TSeq_set& nested = (*candidate)->SetSet().SetSeq_set();
            AnalyzeSeqsViaBioseqs(left, nested, against_prot);
            continue;
        }

        // The description was only ever printed by a per-entry trace that is
        // currently disabled; the lookup itself is still performed.
        string candidate_descr = GetStringDescr((*candidate)->GetSeq());

        // Select the entry only when its kind is the requested one.
        const bool candidate_is_prot = is_prot_entry((*candidate)->GetSeq());
        if (candidate_is_prot == against_prot) {
            AnalyzeSeqsViaBioseqs(left, (*candidate)->SetSeq());
        }
    }

    return -1;
}
int CReadBlastApp::AnalyzeSeqsViaBioseqs( CBioseq_set::TSeq_set& in_pool_seqs, CBioseq_set::TSeq_set& against_seqs, bool in_pool_prot, bool against_prot) { if(PrintDetails()) NcbiCerr << "AnalyzeSeqsViaBioseqs(in_pool_seqs, against_seqs, in_pool_prot, against_prot): " << ", in_pool_prot= " << in_pool_prot << ", against_prot= " << against_prot << "\n"; NON_CONST_ITERATE( CBioseq_set::TSeq_set, left, in_pool_seqs) { if((*left)->IsSet()) { CBioseq_set::TSeq_set& seqs_down = (*left)->SetSet().SetSeq_set(); AnalyzeSeqsViaBioseqs(seqs_down, against_seqs, in_pool_prot, against_prot); } else { string name = GetStringDescr((*left)->GetSeq()); if(PrintDetails()) NcbiCerr << "AnalyzeSeqsViaBioseqs(in_pool_seqs, against_seqs, in_pool_prot, against_prot): " << "left seq entry " << name << "\n"; if( ( in_pool_prot && is_prot_entry((*left)->GetSeq())) || (!in_pool_prot && !is_prot_entry((*left)->GetSeq())) ) { AnalyzeSeqsViaBioseqs1((*left)->SetSeq()); AnalyzeSeqsViaBioseqs((*left)->SetSeq(), against_seqs, against_prot); } } } return -1; }
// Fills `seqs` with one TSimpleSeq record per protein entry and per RNA
// feature, then attaches gene locus tags to those records.
//
// Steps, in order:
//   1. Every non-set CSeq_entry reached from Begin() that is_prot_entry()
//      accepts is appended to `seqs` as a record of type "CDS".
//   2. Every CSeq_feat reached from Begin() is examined: gene features go to
//      a local `genes` list, RNA features are appended to `seqs`, anything
//      else is ignored.
//   3. Both lists are sorted with less_simple_seq.
//   4. Exact pass: a gene whose boundaries equal those of the seq the cursor
//      stopped on gives that seq its locus_tag and is erased from `genes`.
//   5. Non-exact pass: for each remaining gene, the untagged seq with the
//      lowest gene_feat_fit() value inside the candidate window receives the
//      gene's locus_tag (ties go to the later seq) and the gene is erased.
//   6. Seqs still lacking a locus_tag are reported as ERROR and genes still
//      in `genes` as WARNING on NcbiCerr; for tagged "CDS" seqs, feature
//      comments containing "Genomic Location: " are rewritten to carry the
//      locus tag.
//
// seqs: in/out list; records are appended, the list is sorted, and locus_tag
//       fields are filled in.
//
// NOTE(review): the tail of this function (its return statement and closing
// brace) is not visible in this view, so the return value is not documented.
int CReadBlastApp::CollectSimpleSeqs(TSimpleSeqs& seqs)
{
// collect stuff from proteins
  for(CTypeIterator< CSeq_entry > s = Begin(); s; ++s)
    {
    if(s->IsSet()) continue;
    if(!is_prot_entry(s->GetSeq())) continue;
    TSimpleSeq seq;
    seq.description = GetProtName(s->GetSeq());
    seq.name = GetStringDescr (s->GetSeq());
    seq.type = "CDS";
    seq.seq = CRef<CBioseq>(&(s->SetSeq()));
    const CSeq_loc& loc = getGenomicLocation(s->GetSeq());
    // presumably fills seq.exons and seq.key from loc; both are read below -- TODO confirm
    addLoctoSimpleSeq(seq, loc);
    seqs.push_back(seq);
    if(PrintDetails())
      {
      NcbiCerr << "DEBUG: CollectSimpleSeqs(): added loc to CDS: "
               << "(" << seq.name << ")"
               << "(" << printed_range(loc) << ")"
               << "(" << seq.key << ":" << printed_range(seq) << ")"
               << NcbiEndl;
      }
    }

// collect features from RNAs and genes
  // `name` keeps the locus tag of the most recently seen gene feature; every
  // RNA record created afterwards takes it as its name.
  string name;
  TSimpleSeqs genes;
  for(CTypeIterator< CSeq_feat > f = Begin(); f; ++f)
    {
    const CSeq_loc& loc = f->GetLocation();
    if(f->GetData().IsGene())
      {
      name = "Bad or no locus tag";
      if (f->GetData().GetGene().CanGetLocus_tag())
        name = f->GetData().GetGene().GetLocus_tag();
// I am assuming that each RNA feature is preceded by a gene
      TSimpleSeq gene;
      gene.type = "gene";
      gene.locus_tag = name;
      addLoctoSimpleSeq(gene, loc);
      genes.push_back(gene);
      if(PrintDetails())
        {
        NcbiCerr << "DEBUG: CollectSimpleSeqs(): added loc to gene: "
                 << "(" << name << ")"
                 << "(" << printed_range(loc) << ")"
                 << "(" << gene.key << ":" << printed_range(gene) << ")"
                 << NcbiEndl;
        }
      continue;
      }
    else if(!f->GetData().IsRna()) continue;

    // From here on the feature is an RNA.
    CRNA_ref::EType rna_type = f->GetData().GetRna().GetType();
    string description="Bad or no descriptioin";
    if ( rna_type == CRNA_ref::eType_tRNA )
      {
      if ( f->GetData().GetRna().CanGetExt() )
        {
        string type1;
        // Failure to obtain the amino-acid type is logged and then re-thrown.
        try { type1 = Get3type(f->GetData().GetRna());}
        catch (...)
          {
          NcbiCerr << "simple_overlaps: FATAL: cannot get aminoacid type for one trna feats" << NcbiEndl;
          throw;
          }
        description = "tRNA:" + type1;
        }
      } // if tRNA
    else
      {
      if(f->GetData().GetRna().CanGetExt() &&
         f->GetData().GetRna().GetExt().IsName())
        description = f->GetData().GetRna().GetExt().GetName();
      }

    // Map the RNA type enum to the textual record type.
    TSimpleSeq seq;
    if ( rna_type == CRNA_ref::eType_tRNA ) { seq.type = "tRNA"; }
    else if ( rna_type == CRNA_ref::eType_rRNA ) { seq.type = GetRRNAtype(f->GetData().GetRna());}
    else if ( rna_type == CRNA_ref::eType_premsg ) { seq.type = "premsg"; }
    else if ( rna_type == CRNA_ref::eType_mRNA ) { seq.type = "mRNA"; }
    else if ( rna_type == CRNA_ref::eType_snRNA ) { seq.type = "snRNA"; }
    else if ( rna_type == CRNA_ref::eType_scRNA ) { seq.type = "scRNA"; }
    else if ( rna_type == CRNA_ref::eType_snoRNA ) { seq.type = "snoRNA"; }
    else if ( rna_type == CRNA_ref::eType_other ) { seq.type = "other RNA"; }
    else { seq.type = "unknown RNA"; }
    seq.name = name;
    seq.description = description;
    addLoctoSimpleSeq(seq, loc);
    seqs.push_back(seq);
    } // features

// need to tidy up before doing what is next
  seqs.sort(less_simple_seq);
  genes.sort(less_simple_seq);

// now go over all gene features and match them to seqs features;
// first of all do all exact locations
  // `seq` is a single cursor shared by all genes in this pass: it is only
  // ever advanced, never reset.
  TSimpleSeqs::iterator seq = seqs.begin();
  for(TSimpleSeqs::iterator gene = genes.begin(); gene!=genes.end(); )
    {
    string gene_range = printed_range(gene);
    int seq_from=0, seq_to=0;
    int gene_from = gene->exons[0].from;
    int gene_to = gene->exons[0].to;
    // Advance the cursor to the first seq whose key is not below the gene's key.
    for(;seq!=seqs.end(); seq++)
      {
      string seq_range = printed_range(seq);
      seq_from = seq->exons[0].from;
      seq_to = seq->exons[0].to;
      if(PrintDetails())
        {
        NcbiCerr << "DEBUG: CollectSimpleSeqs(): sliding seq " << seq_range
                 << "(key: " << seq->key << ") to reach gene " << gene_range
                 << "(key: " << gene->key << "), locus=" << gene->locus_tag
                 << NcbiEndl;
        }
      if(gene->key<=seq->key) break;
      }
    // No seq left: the exact pass ends; remaining genes stay in `genes`.
    if(seq==seqs.end()) break;
    seq_from = seq->exons[0].from;
    seq_to = seq->exons[0].to;
    string seq_range = printed_range(seq);
    if(PrintDetails())
      {
      NcbiCerr << "DEBUG: CollectSimpleSeqs(): sliding seq " << seq_range
               << "(key: " << seq->key << ") reached gene " << gene_range
               << "(key: " << gene->key << "), locus=" << gene->locus_tag
               << NcbiEndl;
      }
    // Overall extent of the seq: plus strand uses first.from / last.to,
    // any other strand uses last.from / first.to.
    seq_to = seq->exons[seq->exons.size()-1].to;
    if(seq->exons[0].strand != eNa_strand_plus) // JIRA-PR-147
      {
      seq_to = seq->exons[0].to;
      seq_from= seq->exons[seq->exons.size()-1].from;
      }
    // The gene extent is always taken as first.from / last.to in this pass.
    gene_to = gene->exons[gene->exons.size()-1].to;
    if(seq_to==gene_to && seq_from==gene_from) // match
      {
      seq->locus_tag = gene->locus_tag;
      // erase() is handed the pre-increment position and its return value
      // (the following element) is what ends up in `gene`.
      gene=genes.erase(gene++);
      }
    else gene++;
    }

/////////////////////////////
// now try to assign non-exact gene-CDS matches
/////////////////////////////
  seq=seqs.begin();
  for(TSimpleSeqs::iterator gene = genes.begin(); gene!=genes.end(); )
    {
    string gene_printed_range = printed_range(gene);
    if(PrintDetails())
      NcbiCerr << "non-exact gene-CDS matches: gene: " << gene_printed_range << NcbiEndl;
    int gene_from = gene->exons[0].from;
// find first sequence that could match a gene
    // The search resumes from wherever `seq` was left by the previous gene.
    TSimpleSeqs::iterator seq_start=seq;
    for(;seq_start!=seqs.end(); seq_start++)
      {
      if(PrintDetails())
        NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: trying seq_start: "
                 << printed_range(seq_start) << NcbiEndl;
      if(seq_start->locus_tag != "")
        {
        // NOTE(review): this trace prints seq->locus_tag although the element
        // being examined is seq_start -- looks unintended; confirm.
        if(PrintDetails())
          NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: "
                   << seq->locus_tag << ", continue..."<<NcbiEndl;
        continue; // this is done
        }
      int seq_from = seq_start->exons[0].strand == eNa_strand_plus ?
                     seq_start->exons[0].from :
                     seq_start->exons[seq_start->exons.size()-1].from;
      if(gene_from<=seq_from) break; // in case there are cross-origin seqs, they will be in the end of seqs list, so they will be tested the last, thus this incorrect sliding should be fine
      }
    if(seq_start==seqs.end()) break; // done with seqs
// now check if other ends fit
    if(PrintDetails())
      NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_start: "
               << printed_range(seq_start) << NcbiEndl;
    int seq_to = seq_start->exons[0].strand == eNa_strand_plus ?
                 seq_start->exons[seq_start->exons.size()-1].to :
                 seq_start->exons[0].to;
    // Unlike the exact pass, the gene end is strand-adjusted here.
    int gene_to = gene->exons[gene->exons.size()-1].to;
    if ( gene->exons[0].strand != eNa_strand_plus ) gene_to = gene->exons[0].to;
    if (seq_to > gene_to)
      {
      if(PrintDetails())
        NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: sequences jumped over this gene, this gene does not fit any sequence, will be flagged later" << NcbiEndl;
// sequences jumped over this gene, this gene does not fit any sequence, will be flagged later
      gene++;
      continue;
      }
// end find first sequence that could match a gene

// find first sequence that does not match a gene
    // nmatches counts untagged "CDS" records whose end lies within the gene.
    TSimpleSeqs::iterator seq_end = seq_start;
    int nmatches=0;
    for(;seq_end!=seqs.end() &&
         gene_to >= (seq_end->exons[0].strand == eNa_strand_plus ?
                     seq_end->exons[seq_end->exons.size()-1].to :
                     seq_end->exons[0].to);
         seq_end++)
      {
      if(PrintDetails())
        NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: trying to find: current_seq_end "
                 << printed_range(seq_end)
                 << ", gene_to = " << gene_to
                 << ", seq_end.to = "
                 << (seq_end->exons[0].strand == eNa_strand_plus ?
                     seq_end->exons[seq_end->exons.size()-1].to :
                     seq_end->exons[0].to)
                 << NcbiEndl;
      if(seq_end->type == "CDS" && seq_end->locus_tag == "" ) nmatches++;
      }
    // The window is widened by one element, so the first seq that ends past
    // the gene is also evaluated by gene_feat_fit() below.
    if(seq_end!=seqs.end() ) seq_end++;
    if(PrintDetails())
      {
      // NOTE(review): the message says "found seq_end" but prints the range of
      // seq_start -- looks unintended; confirm.
      if(seq_end!=seqs.end() )
        NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_end: "
                 << printed_range(seq_start) << NcbiEndl;
      else
        NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_end: end()" << NcbiEndl;
      }
// end find first sequence that does not match a gene

    if(PrintDetails())
      {
      if(seq_end!=seqs.end() )
        NcbiCerr << "non-exact gene-CDS matches(" << nmatches << "): seq_end: "
                 << printed_range(seq_end) << NcbiEndl;
      else
        NcbiCerr << "non-exact gene-CDS matches(" << nmatches << "): seq_end: end()" << NcbiEndl;
      }
    if(nmatches>1)
      {
      string range = printed_range(gene);
      NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: WARNING: gene["<<gene_printed_range<<"] matches several (" << nmatches << ") CDS features: "
               << "locus = " << gene->locus_tag << ", "
               << "[" << range << "]" << NcbiEndl;
      }
// look at all found fits
    bool gene_used=false;
// find best fit and assign locus tag only for that feature
    TSimpleSeqs::iterator best_seq=seqs.end();
    int best_gene_feat_fit = 0x0FFFFFFF; // intentionally less than the const in gene_feat_fit function
    // This loop leaves the shared cursor `seq` at seq_end, which is where the
    // next gene's search starts.
    for(seq=seq_start; seq!=seq_end; seq++)
      {
      if(PrintDetails())
        NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: match: "
                 << printed_range(seq) << NcbiEndl;
      if(seq->locus_tag != "") continue; // this is done already
      if(PrintDetails())
        NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: match: "
                 << printed_range(seq) << " does not have a locus tag yet" << NcbiEndl;
/*
      if(seq->type != "CDS" )
        {
        string range = printed_range(seq);
        NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: ERROR: non-CDS sequence does not have a gene with exactly the same boundaries: "
                 << "type = " << seq->type << ", "
                 << "name = " << seq->name << ", "
                 << "[" << range << "]" << NcbiEndl;
        }
      else
*/
        {
        // "<=" means that on equal fit values the later seq wins.
        int fit=gene_feat_fit(seq, gene_from, gene_to);
        if(fit <= best_gene_feat_fit )
          {
          best_seq=seq;
          best_gene_feat_fit = fit;
          }
        }
      } // for(seq=seq_start; seq!=seq_end; seq++)
// found suitable seqs
    if(best_seq!=seqs.end())
      {
      best_seq->locus_tag = gene->locus_tag;
      gene_used = true;
      }
// go to next gene
    if(gene_used) gene=genes.erase(gene);
    else gene++;
    }

// swipe over seqs flag those that do not have locus tag
  NON_CONST_ITERATE(TSimpleSeqs,seq, seqs)
    {
    if(seq->locus_tag != "")
      {
      if(seq->type == "CDS")
        {
        // Replace every feature comment that contains "Genomic Location: "
        // with that prefix followed by the locus tag.
        for(CTypeIterator<CSeq_feat> feat=::Begin(*(seq->seq)); feat; ++feat)
          {
          if(feat->CanGetComment() &&
             feat->GetComment().find("Genomic Location: ") != string::npos)
            {
            string comment = "Genomic Location: " + seq->locus_tag;
            feat->SetComment(comment);
            }
          }
        }
      continue;
      }
    string range = printed_range(seq);
    NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: ERROR: feature does not have a matching gene: "
             << "type = " << seq->type << ", "
             << "name = " << seq->name << ", "
             << "[" << range << "]" << NcbiEndl;
    }

// swipe over genes and flag those that are not used
  NON_CONST_ITERATE(TSimpleSeqs,gene, genes)
    {
    string range = printed_range(gene);
    NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: WARNING: gene does not match any feature: "
             << "locus = " << gene->locus_tag << ", "
             << "[" << range << "]" << NcbiEndl;
    }
// w.out CTypeConstIterator int CReadBlastApp::AnalyzeSeqs(CBioseq_set::TSeq_set& seqs) { CArgs args = GetArgs(); IncreaseVerbosity(); string tblFile; if (args["outTbl"].HasValue()) tblFile = args["outTbl"].AsString(); else tblFile = "/dev/null"; ofstream tblOut(tblFile.c_str(), IOS_BASE::app | IOS_BASE::out ); NON_CONST_ITERATE( CBioseq_set::TSeq_set, left, seqs) { if((*left)->IsSet()) { if(PrintDetails()) NcbiCerr << "AnalyzeSeqs: going down: " << NcbiEndl; CBioseq_set::TSeq_set& seqs2 = (*left)->SetSet().SetSeq_set(); PushVerbosity(); AnalyzeSeqs(seqs2); PopVerbosity(); continue; } if(PrintDetails()) NcbiCerr << "AnalyzeSeqs: left: " // << CSeq_id::GetStringDescr ((*left)->GetSeq(), CSeq_id::eFormat_FastA) << NcbiEndl; << GetStringDescr ((*left)->GetSeq()) << NcbiEndl; ///////////////////////////////// // not a protein. Do NA stuff if( !is_prot_entry((*left)->GetSeq()) ) { // NA, process all RNA and what not annotations here and compare for overlaps // CheckMissingRibosomalRNA((*left)->GetSeq().GetAnnot() ); // check overlaps of the sequence with other features overlaps_na((*left)->GetSeq().GetAnnot() ); continue; } /////////////////////////////////// // compare to... CBioseq_set::TSeq_set::iterator right = left; bool again=true; bool last_right=false; while(again) // have overlaps { again=false; ++right; if(!skip_toprot(right, seqs)) {last_right=true; break;} if(PrintDetails()) { NcbiCerr << "AnalyzeSeqs: right: " << GetStringDescr ((*right)->GetSeq()) << NcbiEndl; } // analyze for overlaps with the next one PushVerbosity(); // if there are overlaps, keep on working on left, iterating through right again=overlaps((*left)->GetSeq(), (*right)->GetSeq() ); PopVerbosity(); } if (last_right) break; if(PrintDetails()) NcbiCerr << "AnalyzeSeqs: finished lower level seq, overlaps: " << NcbiEndl; } NON_CONST_ITERATE( CBioseq_set::TSeq_set, left, seqs) { if((*left)->IsSet()) continue; // does not hit. 
Skip if( !has_blast_hits((*left)->GetSeq()) ) continue; if(PrintDetails()) NcbiCerr << "AnalyzeSeqs: left: valid" << NcbiEndl; CBioseq_set::TSeq_set::iterator right = left; ++right; if(!skip_to_valid_seq_cand(right, seqs)) break; if(PrintDetails()) NcbiCerr << "AnalyzeSeqs: right: valid" << NcbiEndl; string common_subject; bool fit_blast_result = fit_blast((*left)->GetSeq(), (*right)->GetSeq(), common_subject); bool lhp = hasProblems((*left)->GetSeq(), m_diag, eFrameShift); bool rhp = hasProblems((*right)->GetSeq(), m_diag, eFrameShift); bool lhoe = hasProblems((*left)->GetSeq(), m_diag, eMayBeNotFrameShift); bool rhoe = hasProblems((*right)->GetSeq(), m_diag, eMayBeNotFrameShift); if(PrintDetails()) NcbiCerr << "AnalyzeSeqs: after fit_blast:" << fit_blast_result << lhp << lhoe << rhp << rhoe << NcbiEndl; // if(fit_blast_result && (lhp && !lhoe) && (rhp && !rhoe)) if(fit_blast_result) { // go to the same sequence set, find first NA, add misc_feature append_misc_feature(seqs, GetStringDescr((*left)->GetSeq()), eFrameShift); } if(PrintDetails()) NcbiCerr << "AnalyzeSeqs: finished lower level seq, frameshifts: " << NcbiEndl; } DecreaseVerbosity(); return -1; }