Example #1
0
// Distributes every transcript of `mrnas` into the loci of `loci`.
//
// For each transcript the existing loci are offered the transcript through
// GLocus::add_mRNA(); every locus that accepts it counts as a "parent" locus.
//   - no parent locus   : a new GLocus(mrna, qfidx) is appended to `loci`
//   - exactly one       : nothing more to do
//   - more than one     : all parents are merged (addMerge) into the one with
//                         the smallest index and the others are deleted
//
// Both `mrnas` and `loci` are expected to be sorted by start coordinate
// (stated by the original author; not checked here).
void cluster_mRNAs(GList<GffObj> & mrnas, GList<GLocus> & loci, int qfidx) {
	for (int mi=0; mi<mrnas.Count(); ++mi) {
		GffObj* mrna=mrnas[mi];
		// indices of the loci that accepted this transcript; since loci are
		// scanned from the end, the indices are stored in decreasing order
		GArray<int> parents(false);
		int nparents=0;

		// scan the loci backwards, starting with the last one
		int li=loci.Count();
		while (--li>=0) {
			if (loci[li]->end<mrna->exons.First()->start) {
				// locus ends before the transcript begins; stop scanning once
				// the locus start is more than GFF_MAX_LOCUS behind the transcript
				if (mrna->exons.First()->start-loci[li]->start > GFF_MAX_LOCUS) break;
				continue;
			}
			// locus begins after the transcript ends: no overlap either
			if (loci[li]->start>mrna->exons.Last()->end) continue;
			// coordinates overlap; the locus decides whether to take the transcript
			if (!loci[li]->add_mRNA(mrna)) continue;
			++nparents;
			parents.Add(li);
		}

		if (nparents==0) {
			// nobody took it: the transcript starts a locus of its own
			loci.Add(new GLocus(mrna, qfidx));
			continue;
		}
		if (nparents==1) continue;

		// several parent loci: fold them all into the last one recorded,
		// which is also the one with the smallest index
		const int lastSlot=nparents-1;
		const int survivorIdx=parents[lastSlot];
		for (int k=0; k<lastSlot; ++k) {
			int absorbedIdx=parents[k];
			loci[survivorIdx]->addMerge(*loci[absorbedIdx], mrna);
			// larger indices are removed first, so survivorIdx stays valid
			loci.Delete(absorbedIdx);
		}
	}
}
Example #2
0
// Stores feature `t` in the per-sequence data `gdata` and, when requested,
// assigns it to a locus.
//
// Steps, in order:
//   1. A parentless transcript is linked to the first strand-matching gene in
//      gdata->gfs (scanned from the end) for which exonOverlap2Gene() is true.
//   2. A feature with exons is added to gdata->rnas; one without exons is
//      added to gdata->gfs if it is a gene or transcriptsOnly is off.
//   3. If doCluster is set, the transcript is offered to the existing loci
//      (GffLocus::add_RNA); accepting loci are merged into a single one, or a
//      new locus is created when none accepts it.
//   4. If collapseRedundant is also set, `t` is compared with every other
//      transcript of each accepting locus through redundantTranscripts(), and
//      the redundant one gets its GTData::replaced_by pointed at the other.
//
// Parameters:
//   t                 feature to place
//   gdata             per-sequence containers (gfs, rnas, tdata, loci) to update
//   doCluster         when false, stop after step 2
//   collapseRedundant enables step 4
//   matchAllIntrons,
//   fuzzSpan          passed through unchanged to redundantTranscripts()
//
// Returns true when `t` was stored in gdata->rnas or gdata->gfs; false when
// `t` has no exons, is not a gene and transcriptsOnly is set.
bool GffLoader::placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant,
                                               bool matchAllIntrons, bool fuzzSpan) {
  GTData* tdata=NULL; //additional transcript data for t, created on demand

  //RefSeq tRNA case: the gene parent link is missing. Try to restore it here;
  //this only works if the gene feature was loaded before the transcript.
  if (t->parent==NULL && t->isTranscript()) {
    int gidx=gdata->gfs.Count()-1;
    while (gidx>=0 && gdata->gfs[gidx]->end>=t->start) {
      GffObj& g = *(gdata->gfs[gidx]);
      if (g.isGene() && t->strand==g.strand && exonOverlap2Gene(t, g)) {
        g.children.Add(t);
        //tdata is still NULL at this point: only the first matching gene is used
        tdata=new GTData(t);
        gdata->tdata.Add(tdata);
        t->parent=&g;
        //disable printing of the gene if transcriptsOnly
        if (transcriptsOnly) {
          g.udata|=4; //tag it as non-printable
        }
        //inherit the gene name when the transcript has no Name of its own
        const char* geneName=g.getAttr("Name");
        if (t->getAttr("Name")==NULL && geneName) {
          t->addAttr("Name", geneName);
          t->addAttr("gene_name", geneName);
        }
        t->addAttr("geneID", g.getID());
        break;
      }
      --gidx;
    }
  }

  if (t->exons.Count()>0) {
    gdata->rnas.Add(t); //presumably inserted in sorted order -- depends on how rnas was created
    if (tdata==NULL) {
      tdata=new GTData(t);
      gdata->tdata.Add(tdata);
    }
  }
  else {
    if (t->isGene() || !this->transcriptsOnly) {
      gdata->gfs.Add(t);
      if (tdata==NULL) {
        tdata=new GTData(t);
        gdata->tdata.Add(tdata);
      }
      return true;
    }
    //nothing to do with these non-transcript objects
    //NOTE(review): if the gene recovery above matched, t is already referenced
    //from g.children and gdata->tdata although false is returned here --
    //confirm that callers do not free t in that situation.
    return false;
  }

  //from here on t has exons and is stored in gdata->rnas
  if (!doCluster) return true;

  //---- place into a locus
  if (gdata->loci.Count()==0) {
    gdata->loci.Add(new GffLocus(t));
    return true; //new locus on this ref seq
  }
  int nidx=qsearch_gloci(t->end, gdata->loci); //get index of nearest locus starting just ABOVE t->end
  if (nidx==0) {
    //cannot have any overlapping loci
    gdata->loci.Add(new GffLocus(t));
    return true;
  }
  if (nidx==-1) nidx=gdata->loci.Count(); //all loci start below t->end

  int lfound=0; //count of parent loci
  GArray<int> mrgloci(false); //indices of parent loci, in decreasing order
  for (int l=nidx-1;l>=0;l--) {
    GffLocus& loc=*(gdata->loci[l]);
    //strands must agree unless one of them is undetermined ('.')
    if (loc.strand!='.' && t->strand!='.'&& loc.strand!=t->strand) continue;
    if (t->start>loc.end) {
      if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already
      continue;
    }
    if (loc.start>t->end) {
      //this should never be the case if nidx was found correctly
      GMessage("Warning: qsearch_gloci found loc.start>t.end!(t=%s)\n", t->getID());
      continue;
    }
    if (loc.add_RNA(t)) {
      //loc accepted this transcript
      lfound++;
      mrgloci.Add(l);
      if (collapseRedundant) {
        //compare to every other transcript in this locus
        for (int ti=0;ti<loc.rnas.Count();ti++) {
          if (loc.rnas[ti]==t) continue;
          GTData* odata=(GTData*)(loc.rnas[ti]->uptr);
          GffObj* container=NULL;
          //transcripts that were already replaced are not compared again
          if (odata->replaced_by==NULL &&
               (container=redundantTranscripts(*t, *(loc.rnas[ti]), matchAllIntrons, fuzzSpan))!=NULL) {
            if (container==t) {
              //t replaces the previously defined transcript
              odata->replaced_by=t;
              preserveContainedCDS(t, loc.rnas[ti]);
            }
            else {
              //t is being replaced by the previously defined transcript
              tdata->replaced_by=loc.rnas[ti];
              preserveContainedCDS(loc.rnas[ti], t);
            }
          }
        } //for each transcript in the exon-overlapping locus
      } //if collapseRedundant
    } //overlapping locus
  } //for each existing locus

  if (lfound==0) {
    //overlapping loci not found, create a locus with only this transcript
    int addidx=gdata->loci.Add(new GffLocus(t));
    if (addidx<0) {
      //should never be the case!
      GMessage("  WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end);
    }
  }
  else { //found at least one overlapping locus
    lfound--;
    //the last locus index found is also the smallest index
    int locidx=mrgloci[lfound];
    GffLocus& loc=*(gdata->loci[locidx]);
    if (lfound>0) {
      //more than one locus found parenting this transcript, merge loci
      for (int l=0;l<lfound;l++) {
        int mlidx=mrgloci[l];
        loc.addMerge(*(gdata->loci[mlidx]), t);
        gdata->loci.Delete(mlidx); //highest indices first, so it's safe to remove
      }
    }
    //the locus may have changed, so bubble it down until the list is
    //ordered again
    int i=locidx;
    while (i>0 && loc<*(gdata->loci[i-1])) {
      i--;
      gdata->loci.Swap(i,i+1);
    }
  } //found at least one overlapping locus
  return true;
}