void cluster_mRNAs(GList<GffObj> & mrnas, GList<GLocus> & loci, int qfidx) { //mrnas sorted by start coordinate //and so are the loci //int rdisc=0; for (int t=0;t<mrnas.Count();t++) { GArray<int> mrgloci(false); GffObj* mrna=mrnas[t]; int lfound=0; //count of parent loci /*for (int l=0;l<loci.Count();l++) { if (loci[l]->end<mrna->exons.First()->start) continue; if (loci[l]->start>mrna->exons.Last()->end) break; */ for (int l=loci.Count()-1;l>=0;l--) { if (loci[l]->end<mrna->exons.First()->start) { if (mrna->exons.First()->start-loci[l]->start > GFF_MAX_LOCUS) break; continue; } if (loci[l]->start>mrna->exons.Last()->end) continue; //here we have mrna overlapping loci[l] if (loci[l]->add_mRNA(mrna)) { //a parent locus was found lfound++; mrgloci.Add(l); //locus indices added here, in decreasing order } }//loci loop //if (lfound<0) continue; //mrna was a ref duplicate, skip it if (lfound==0) { //create a locus with only this mRNA loci.Add(new GLocus(mrna, qfidx)); } else if (lfound>1) { //more than one locus found parenting this mRNA, merge loci lfound--; for (int l=0;l<lfound;l++) { int mlidx=mrgloci[l]; //largest indices first, so it's safe to remove loci[mrgloci[lfound]]->addMerge(*loci[mlidx], mrna); loci.Delete(mlidx); } } }//mrnas loop //if (rdisc>0) mrnas.Pack(); //return rdisc; }
bool GffLoader::placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant, bool matchAllIntrons, bool fuzzSpan) { bool keep=false; GTData* tdata=NULL; //int tidx=-1; /* if (debug) { GMessage(">>Placing transcript %s\n", t->getID()); debugState=true; } else debugState=false; */ //dumb TRNA case for RefSeq: gene parent link missing //try to restore it here; BUT this only works if gene feature comes first ////DEBUG ONLY: //if (strcmp(t->getID(),"id24448")==0) { //&& t->start==309180) { // GMessage("placeGf %s (%d, %d) (%d exons)\n", t->getID(),t->start, t->end, t->exons.Count()); //} //GMessage("DBG>>Placing transcript %s(%d-%d, %d exons)\n", t->getID(), t->start, t->end, t->exons.Count()); if (t->parent==NULL && t->isTranscript()) { int gidx=gdata->gfs.Count()-1; while (gidx>=0 && gdata->gfs[gidx]->end>=t->start) { GffObj& g = *(gdata->gfs[gidx]); if (g.isGene() && t->strand==g.strand && exonOverlap2Gene(t, g)) { g.children.Add(t); keep=true; if (tdata==NULL) { tdata=new GTData(t); //additional transcript data gdata->tdata.Add(tdata); } t->parent=&g; //disable printing of gene if transcriptsOnly if (transcriptsOnly) { g.udata|=4; //tag it as non-printable } const char* geneName=g.getAttr("Name"); if (t->getAttr("Name")==NULL && geneName) { t->addAttr("Name", geneName); t->addAttr("gene_name", geneName); } t->addAttr("geneID", g.getID()); break; } --gidx; } } /* if (t->exons.Count()==0 && t->children.Count()==0 && forceExons) { //a non-mRNA feature with no subfeatures //just so we get some sequence functions working, add a dummy "exon"-like subfeature here //--this could be a single "pseudogene" entry or another genomic region without exons // t->addExon(t->start,t->end); } */ if (t->exons.Count()>0) { gdata->rnas.Add(t); //added it in sorted order if (tdata==NULL) { tdata=new GTData(t); //additional transcript data gdata->tdata.Add(tdata); } keep=true; } else { if (t->isGene() || !this->transcriptsOnly) { gdata->gfs.Add(t); keep=true; //GTData* tdata=new GTData(t); //additional transcript data if (tdata==NULL) { tdata=new GTData(t); //additional transcript data gdata->tdata.Add(tdata); } return true; } else return false; //nothing to do with these non-transcript objects } if (!doCluster) return keep; if (!keep) return false; //---- place into a locus if (gdata->loci.Count()==0) { gdata->loci.Add(new GffLocus(t)); return true; //new locus on this ref seq } int nidx=qsearch_gloci(t->end, gdata->loci); //get index of nearest locus starting just ABOVE t->end //GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx); if (nidx==0) { //cannot have any overlapping loci //if (debug) GMessage(" <<no ovls possible, create locus %d-%d \n",t->start, t->end); gdata->loci.Add(new GffLocus(t)); return true; } if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end int lfound=0; //count of parent loci GArray<int> mrgloci(false); GList<GffLocus> tloci(true); //candidate parent loci to adopt this //if (debug) GMessage("\tchecking all loci from %d to 0\n",nidx-1); for (int l=nidx-1;l>=0;l--) { GffLocus& loc=*(gdata->loci[l]); if (loc.strand!='.' && t->strand!='.'&& loc.strand!=t->strand) continue; if (t->start>loc.end) { if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already continue; } if (loc.start>t->end) { //this should never be the case if nidx was found correctly GMessage("Warning: qsearch_gloci found loc.start>t.end!(t=%s)\n", t->getID()); continue; } if (loc.add_RNA(t)) { //will add this transcript to loc lfound++; mrgloci.Add(l); if (collapseRedundant) { //compare to every single transcript in this locus for (int ti=0;ti<loc.rnas.Count();ti++) { if (loc.rnas[ti]==t) continue; GTData* odata=(GTData*)(loc.rnas[ti]->uptr); //GMessage(" ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID()); GffObj* container=NULL; if (odata->replaced_by==NULL && (container=redundantTranscripts(*t, *(loc.rnas[ti]), matchAllIntrons, fuzzSpan))!=NULL) { if (container==t) { odata->replaced_by=t; preserveContainedCDS(t, loc.rnas[ti]); } else {// t is being replaced by previously defined transcript tdata->replaced_by=loc.rnas[ti]; preserveContainedCDS(loc.rnas[ti], t); } } }//for each transcript in the exon-overlapping locus } //if doCollapseRedundant } //overlapping locus } //for each existing locus if (lfound==0) { //overlapping loci not found, create a locus with only this mRNA int addidx=gdata->loci.Add(new GffLocus(t)); if (addidx<0) { //should never be the case! GMessage(" WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end); } } else { //found at least one overlapping locus lfound--; int locidx=mrgloci[lfound]; GffLocus& loc=*(gdata->loci[locidx]); //last locus index found is also the smallest index if (lfound>0) { //more than one loci found parenting this mRNA, merge loci /* if (debug) GMessage(" merging %d loci \n",lfound); */ for (int l=0;l<lfound;l++) { int mlidx=mrgloci[l]; loc.addMerge(*(gdata->loci[mlidx]), t); gdata->loci.Delete(mlidx); //highest indices first, so it's safe to remove } } int i=locidx; while (i>0 && loc<*(gdata->loci[i-1])) { //bubble down until it's in the proper order i--; gdata->loci.Swap(i,i+1); } }//found at least one overlapping locus return true; }