Exemple #1
0
/**
 * Assigns every transcript in `mrnas` to a locus in `loci`.
 *
 * For each transcript the existing loci are scanned from last to first:
 *  - no locus accepts it (add_mRNA() returns false everywhere): a new
 *    GLocus holding only this transcript (tagged with `qfidx`) is added;
 *  - exactly one locus accepts it: nothing else to do;
 *  - several loci accept it: they are all merged, through addMerge(), into
 *    the accepting locus with the lowest index and then deleted.
 *
 * Both lists are expected to be sorted by start coordinate (per the
 * original author's note).
 */
void cluster_mRNAs(GList<GffObj> & mrnas, GList<GLocus> & loci, int qfidx) {
	for (int t=0; t<mrnas.Count(); t++) {
		GffObj* mrna=mrnas[t];
		GArray<int> hits(false); //indices of the loci that accepted this transcript
		int nhits=0;             //number of accepting (parent) loci
		//walk the loci backwards so that hits are recorded in decreasing index order
		int l=loci.Count();
		while (--l>=0) {
			GLocus* locus=loci[l];
			if (locus->end < mrna->exons.First()->start) {
				//locus ends before the transcript begins; give up entirely once
				//the locus start is more than GFF_MAX_LOCUS behind the transcript
				if (mrna->exons.First()->start - locus->start > GFF_MAX_LOCUS) break;
				continue;
			}
			//locus begins after the transcript ends: no overlap
			if (locus->start > mrna->exons.Last()->end) continue;
			//coordinate ranges overlap; the locus itself decides whether it takes the transcript
			if (!locus->add_mRNA(mrna)) continue;
			nhits++;
			hits.Add(l);
		}
		if (nhits==0) {
			//orphan transcript: it founds its own locus
			loci.Add(new GLocus(mrna, qfidx));
			continue;
		}
		if (nhits==1) continue;
		//several parents: fold all of them into the last recorded hit, which
		//has the smallest index and is therefore unaffected by the deletions
		const int keepPos=nhits-1;
		const int keepIdx=hits[keepPos];
		for (int h=0; h<keepPos; h++) {
			int dropIdx=hits[h]; //largest indices come first, so removal is safe
			loci[keepIdx]->addMerge(*loci[dropIdx], mrna);
			loci.Delete(dropIdx);
		}
	}
}
Exemple #2
0
//-------------------------------------------------------------------------
//	Show this dialog through DialogBoxParam and report how it was closed.
//
//	pOwner	: owner control whose window becomes the dialog's parent;
//			  may be NULL, in which case the dialog is created unowned.
//	returns	: true only when the dialog ended with IDOK. A creation
//			  failure (DialogBoxParam returning -1) is reported through
//			  GDebug::CheckWin32Error() and yields false.
//
//	The dialog is registered in g_ActiveWinDialogs for the duration of the
//	call and unregistered before returning.
//-------------------------------------------------------------------------
Bool GWinDialog::ShowDialog(GWinControl* pOwner)
{
	//	tolerate a missing owner instead of dereferencing a null pointer
	HWND OwnerHwnd = pOwner ? pOwner->Hwnd() : NULL;

	//	add dialog to list
	g_ActiveWinDialogs.Add( this );

	//	create dialog
	//	DialogBoxParam returns INT_PTR and takes its init parameter as LPARAM;
	//	both are pointer-sized, so `this` is no longer truncated on 64-bit
	//	builds (the previous u32 cast dropped the upper half of the pointer)
	INT_PTR DialogResult = DialogBoxParam( GApp::g_HInstance, MAKEINTRESOURCE( DialogResource() ), OwnerHwnd, GetDialogCallback(), (LPARAM)this );

	if ( DialogResult == -1 )
	{
		GDebug::CheckWin32Error();
	}

	//	remove dialog from list (only if it is still registered)
	int Index = g_ActiveWinDialogs.FindIndex(this);
	if ( Index >= 0 )
	{
		g_ActiveWinDialogs.RemoveAt( Index );
	}

	return (DialogResult == IDOK);
}
Exemple #3
0
/**
 * Sorts the transcripts in `mrnas` into the per-genomic-sequence containers
 * of `glstdata`, filtering and de-duplicating them on the way.
 *
 * For each transcript that survives the filters a CTData record is created,
 * tagged with `qfidx` and stored in the matching GSeqData's `tdata`; the
 * transcript itself is appended to the strand-specific list of that GSeqData
 * and flagged with isUsed(true).
 *
 * @param mrnas          transcripts to process
 * @param glstdata       per-sequence data; entries are created on demand
 * @param is_ref_set     true: treat input as reference transcripts (unstranded
 *                       ones are dropped, duplicates detected with is_RefDup());
 *                       false: treat input as query transfrags (unstranded ones
 *                       go to `umrnas`, redundancy detected with is_Redundant(),
 *                       FPKM/RPKM, cov, conf_hi and conf_lo attributes are parsed)
 * @param check_for_dups non-zero enables the duplicate/redundancy check; for
 *                       query transfrags `check_for_dups>1` is forwarded as the
 *                       third argument of is_Redundant()
 * @param qfidx          value stored in CTData::qset of every record created here
 * @param only_multiexon when true, transcripts with fewer than 2 exons are skipped
 * @return number of transcripts discarded as duplicates/redundant
 */
int parse_mRNAs(GfList& mrnas,
				 GList<GSeqData>& glstdata,
				 bool is_ref_set,
				 int check_for_dups,
				 int qfidx, bool only_multiexon) {
	int tredundant=0; //redundant transcripts discarded
	int total_kept=0; //only used for the verbose summary at the end
	int total_seen=mrnas.Count();
	for (int k=0;k<mrnas.Count();k++) {
		GffObj* m=mrnas[k];
		int i=-1;
		//f is only a lookup key for glstdata.Found() further down
		GSeqData f(m->gseq_id);
		GSeqData* gdata=NULL;
		uint tlen=m->len();
		//reject transcripts flagged with errors or whose length (plus a 500 margin) exceeds GFF_MAX_LOCUS
		//NOTE(review): tlen is a uint but is printed with %d below
		if (m->hasErrors() || (tlen+500>GFF_MAX_LOCUS)) { //should probably report these in a file too..
			if (gtf_tracking_verbose) 
			      GMessage("Warning: transcript %s discarded (structural errors found, length=%d).\n", m->getID(), tlen);
			continue;
			}
		if (only_multiexon && m->exons.Count()<2) {
			continue;
			}
		//GStr feature(m->getFeatureName());
		//feature.lower();
		//bool gene_or_locus=(feature.endsWith("gene") ||feature.index("loc")>=0);
		//if (m->exons.Count()==0 && gene_or_locus) {
		if (m->isDiscarded()) {
			//discard generic "gene" or "locus" features with no other detailed subfeatures
			if (!is_ref_set && gtf_tracking_verbose)
			   GMessage("Warning: discarding non-transfrag (GFF generic gene/locus container?) %s\n",m->getID());
			continue;
			}

		//a feature without exons gets a single exon covering its whole span,
		//so that m->exons[0] is always valid from here on
		if (m->exons.Count()==0) {
				if (gtf_tracking_verbose && !is_ref_set)
				 GMessage("Warning: %s %s found without exon segments (adding default exon).\n",m->getFeatureName(), m->getID());
				m->addExon(m->start,m->end);
				}
		//locate the container for this genomic sequence, creating it if needed
		if (glstdata.Found(&f,i)) gdata=glstdata[i];
		else {
			gdata=new GSeqData(m->gseq_id);
			glstdata.Add(gdata);
			}
		
		//expression values; they stay 0 for reference transcripts
		double fpkm=0;
		double cov=0;
		double conf_hi=0;
		double conf_lo=0;

		GList<GffObj>* target_mrnas=NULL;
		if (is_ref_set) { //-- ref transcripts
		   if (m->strand=='.') {
		     //unknown strand - discard from reference set (!)
		     continue;
		     }
		   total_kept++;
		   //'+' goes to the forward list, anything else to the reverse list
		   target_mrnas=(m->strand=='+') ? &(gdata->mrnas_f) : &(gdata->mrnas_r);
		   if (check_for_dups) {
		     //check the same-strand list (target_mrnas) for duplicate ref transcripts
		     //rpidx is used below as the position of the duplicate in target_mrnas
		     //(presumably filled in by is_RefDup - confirm against its definition)
		     int rpidx=-1;
		     GffObj* rp= is_RefDup(m, *target_mrnas, rpidx);
		     if (rp!=NULL) { //duplicate found
		      //discard one of them
		      //but let's keep the gene_name if present
		      //DEBUG:
		      //GMessage("Ref duplicates: %s = %s\n", rp->getID(), m->getID());
		      //one of the pair goes away, so the net number kept does not grow
		      tredundant++;
		      total_kept--;
		      if (betterDupRef(rp, m)) {
		           //keep the already stored rp and drop m, inheriting m's gene name if rp has none
		           if (rp->getGeneName()==NULL && m->getGeneName()!=NULL) {
		                  rp->setGeneName(m->getGeneName());
		                  }
		           continue;
		           }
		         else {
		           //keep m and drop the already stored rp, inheriting rp's gene name if m has none
		           if (m->getGeneName()==NULL && rp->getGeneName()!=NULL) {
		                  m->setGeneName(rp->getGeneName());
		                  }
		           //detach rp from its CTData record and release it for the reader
		           ((CTData*)(rp->uptr))->mrna=NULL;
		           rp->isUsed(false);
		           //presumably Forget() keeps Delete() from freeing the object itself - confirm against GList
		           target_mrnas->Forget(rpidx);
		           target_mrnas->Delete(rpidx);
		           }
		       }
		     } //check for duplicate ref transcripts
		   } //ref transcripts
		else { //-- query transfrags
		   //any strand other than '+' or '-' is normalized to '.' and stored in umrnas
		   if (m->strand=='+') { target_mrnas = &(gdata->mrnas_f); }
		     else if (m->strand=='-') { target_mrnas=&(gdata->mrnas_r); }
		        else { m->strand='.'; target_mrnas=&(gdata->umrnas); }
		   total_kept++;
		   if (check_for_dups) { //check for redundancy
		     // check if there is a redundancy between this and another already loaded Cufflinks transcript
		     // a non-negative result is used as the index of the redundant entry in target_mrnas
		     int cidx =  is_Redundant(m, target_mrnas, (check_for_dups>1));
		     if (cidx>=0) {
		        //always discard the redundant transcript with the fewer exons OR shorter
			     tredundant++;
		         total_kept--;
		    	 if (t_dominates(target_mrnas->Get(cidx),m)) {
		            //new transcript is shorter, discard it
		        	if (gtf_tracking_verbose) {
		        		GMessage(" transfrag %s discarded (made redundant by %s)\n", m->getID(), target_mrnas->Get(cidx)->getID());
		        	}
		            continue;
		        }
		        else {
		            //discard the older transfrag
		        	if (gtf_tracking_verbose) {
		        		GMessage(" transfrag %s discarded (made redundant by %s)\n", target_mrnas->Get(cidx)->getID(), m->getID());
		        	}
		            ((CTData*)(target_mrnas->Get(cidx)->uptr))->mrna=NULL;
		            target_mrnas->Get(cidx)->isUsed(false);
		            target_mrnas->Forget(cidx);
		            target_mrnas->Delete(cidx);
		            //the uptr (CTData) object is not freed here; CTData records are registered
		            //in gdata->tdata at the end of the loop body (presumably deallocated with it)
		        }
		     }
		   }// redundant transfrag check
		   //fall back to the first exon's score when the transcript has no score of its own
		   if (m->gscore==0.0)   
		     m->gscore=m->exons[0]->score; //Cufflinks exon score = isoform abundance
		   //each attribute below is parsed with strtod(), skipping one leading double quote if present
		   //const char* expr = (gtf_tracking_largeScale) ? m->getAttr("FPKM") : m->exons[0]->getAttr(m->names,"FPKM");
		   const char* expr = m->getAttr("FPKM");
		   if (expr!=NULL) {
		       if (expr[0]=='"') expr++;
		       fpkm=strtod(expr, NULL);
		       } else { //backward compatibility: read RPKM if FPKM not found
		       //expr=(gtf_tracking_largeScale) ? m->getAttr("RPKM") : m->exons[0]->getAttr(m->names,"RPKM");
		       expr=m->getAttr("RPKM");
		       if (expr!=NULL) {
		           if (expr[0]=='"') expr++;
		           fpkm=strtod(expr, NULL);
		           }
		       }
		   //const char* scov=(gtf_tracking_largeScale) ? m->getAttr("cov") : m->exons[0]->getAttr(m->names,"cov");
		   const char* scov=m->getAttr("cov");
		   if (scov!=NULL) {
		       if (scov[0]=='"') scov++; 
		       cov=strtod(scov, NULL);
		       }
		   //const char* sconf_hi=(gtf_tracking_largeScale) ? m->getAttr("conf_hi") : m->exons[0]->getAttr(m->names,"conf_hi");
		   const char* sconf_hi=m->getAttr("conf_hi");
		   if (sconf_hi!=NULL){
		       if (sconf_hi[0]=='"') sconf_hi++;
		       conf_hi=strtod(sconf_hi, NULL);
		       }
		   //const char* sconf_lo=(gtf_tracking_largeScale) ? m->getAttr("conf_lo") : m->exons[0]->getAttr(m->names,"conf_lo");
		   const char* sconf_lo=m->getAttr("conf_lo");
		   if (sconf_lo!=NULL) {
		       if (sconf_lo[0]=='"') sconf_lo++;
		       conf_lo=strtod(sconf_lo, NULL);
		       }
		   } //Cufflinks transfrags
		//the transcript survived every filter: store it and create its tracking record
		target_mrnas->Add(m);
		m->isUsed(true);
		CTData* mdata=new CTData(m);
		mdata->qset=qfidx;
		gdata->tdata.Add(mdata);
		if (!is_ref_set) {
		// Cufflinks - attributes parsing
		   mdata->FPKM=fpkm;
		   mdata->cov=cov;
		   mdata->conf_hi=conf_hi;
		   mdata->conf_lo=conf_lo;
		   }
	}//for each mrna read
	if (gtf_tracking_verbose) {
		if (is_ref_set)
       GMessage(" Kept %d ref transcripts out of %d\n", total_kept, total_seen);
		else
	   GMessage(" Kept %d transfrags out of %d\n", total_kept, total_seen);
	}
 //if (mrna_deleted>0)
 //  mrnas.Pack();
 
 //return (is_ref_set ? refdiscarded : tredundant);
	return tredundant;
}
Exemple #4
0
/**
 * Reads all features from this loader's input file and places them into
 * `seqdata`, one GenomicSeqData entry per genomic sequence.
 *
 * Features are skipped when they are "locus" meta-features carrying a
 * "transcripts" attribute, when pseudogene filtering (noPseudo) flags them,
 * or when `gf_validate` is given and rejects them. Every remaining feature
 * is handed to placeGf() together with the clustering options; features
 * that placeGf() declines are un-flagged with isUsed(false).
 *
 * @param seqdata             destination list; missing entries are created
 * @param gf_validate         optional filter callback, may be NULL
 * @param doCluster           forwarded to placeGf()
 * @param doCollapseRedundant forwarded to placeGf()
 * @param matchAllIntrons     forwarded to placeGf()
 * @param fuzzSpan            forwarded to placeGf()
 * @param forceExons          when true, each feature's exon_ftype_id is set
 *                            to gff_fid_exon
 *
 * The input file is closed afterwards unless it is stdin.
 */
void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate,
		bool doCluster, bool doCollapseRedundant,
		bool matchAllIntrons, bool fuzzSpan, bool forceExons) {
	GffReader* reader=new GffReader(f, this->transcriptsOnly, false);
	reader->showWarnings(this->showWarnings);
	//readAll arguments, per the original note: keepAttrs, mergeCloseExons, noExonAttr
	reader->readAll(this->fullAttributes, this->mergeCloseExons, this->noExonAttrs);

	//ids of the feature types and attributes whose names mark pseudogenes
	GVec<int> pseudoAttrIds;
	GVec<int> pseudoFeatureIds;
	if (this->noPseudo) {
		//feature types whose name begins with "pseudo"
		GffNameList& featNames = reader->names->feats;
		for (int fi=0; fi<featNames.Count(); fi++) {
			char* featName=featNames[fi]->name;
			if (startsWith(featName, "pseudo"))
				pseudoFeatureIds.Add(featNames[fi]->idx);
		}
		//attributes named "pseudo..." or "is" + "pseudo..." (located with strifind)
		GffNameList& attrNames = reader->names->attrs;
		for (int ai=0; ai<attrNames.Count(); ai++) {
			char* attrName=attrNames[ai]->name;
			char* hit=strifind(attrName, "pseudo");
			bool marksPseudo = (hit==attrName) ||
					(hit==attrName+2 && tolower(attrName[0])=='i' && tolower(attrName[1])=='s');
			if (marksPseudo)
				pseudoAttrIds.Add(attrNames[ai]->idx);
		}
	}

	if (verbose) GMessage("   .. loaded %d genomic features from %s\n", reader->gflst.Count(), fname.chars());

	//distribute the features over the per-sequence containers
	for (int k=0; k<reader->gflst.Count(); k++) {
		GffObj* m=reader->gflst[k];

		//locus meta-features are never kept
		bool isLocusMeta = (strcmp(m->getFeatureName(), "locus")==0 &&
				m->getAttr("transcripts")!=NULL);
		if (isLocusMeta) continue;

		if (this->noPseudo) {
			bool pseudo=false;
			//first by feature type ...
			for (int i=0; i<pseudoFeatureIds.Count() && !pseudo; ++i) {
				pseudo = (pseudoFeatureIds[i]==m->ftype_id);
			}
			//... then by a pseudo attribute whose value starts with t/T, y/Y or 1
			if (!pseudo && m->attrs!=NULL) {
				for (int i=0; i<pseudoAttrIds.Count() && !pseudo; ++i) {
					char* attrVal=m->attrs->getAttr(pseudoAttrIds[i]);
					if (attrVal==NULL) continue;
					char first=tolower(attrVal[0]);
					pseudo = (first=='t' || first=='y' || first=='1');
				}
			}
			if (pseudo) continue;
		}

		//drop "locus" attributes whose value starts with "RLOC_"
		char* rloc=m->getAttr("locus");
		if (rloc!=NULL && startsWith(rloc, "RLOC_"))
			m->removeAttr("locus", rloc);

		if (forceExons)
			m->exon_ftype_id=gff_fid_exon;

		//let the caller-supplied filter veto the feature
		if (gf_validate!=NULL && !(*gf_validate)(m, NULL))
			continue;

		m->isUsed(true); //so the gffreader won't destroy it

		//look up the container of this genomic sequence, creating it on first use
		GenomicSeqData probe(m->gseq_id);
		int pos=-1;
		GenomicSeqData* gdata=NULL;
		if (seqdata.Found(&probe, pos)) {
			gdata=seqdata[pos];
		}
		else {
			gdata=new GenomicSeqData(m->gseq_id);
			seqdata.Add(gdata);
		}

		//features that placeGf() does not keep are handed back to the reader
		if (!placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan))
			m->isUsed(false);
	}

	if (f!=stdin) { fclose(f); f=NULL; }
	delete reader;
}