Example #1
0
void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate, 
                          bool doCluster, bool doCollapseRedundant, 
						  bool matchAllIntrons, bool fuzzSpan, bool forceExons) {
	GffReader* gffr=new GffReader(f, this->transcriptsOnly, false); //not only mRNA features, not sorted
	gffr->showWarnings(this->showWarnings);
	//           keepAttrs   mergeCloseExons  noExonAttr
	gffr->readAll(this->fullAttributes,    this->mergeCloseExons,  this->noExonAttrs);
	GVec<int> pseudoAttrIds;
	GVec<int> pseudoFeatureIds;
	if (this->noPseudo) {
		GffNameList& fnames = gffr->names->feats;
		for (int i=0;i<fnames.Count();i++) {
			char* n=fnames[i]->name;
			if (startsWith(n, "pseudo")) {
				pseudoFeatureIds.Add(fnames[i]->idx);
			}
		}
		GffNameList& attrnames = gffr->names->attrs;
		for (int i=0;i<attrnames.Count();i++) {
			char* n=attrnames[i]->name;
			char* p=strifind(n, "pseudo");
			if (p==n || (p==n+2 && tolower(n[0])=='i' && tolower(n[1])=='s')) {
				pseudoAttrIds.Add(attrnames[i]->idx);
			}
		}
	}

	//int redundant=0; //redundant annotation discarded
	if (verbose) GMessage("   .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
	//int rna_deleted=0;
	//add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
	for (int k=0;k<gffr->gflst.Count();k++) {
		GffObj* m=gffr->gflst[k];
		if (strcmp(m->getFeatureName(), "locus")==0 &&
				m->getAttr("transcripts")!=NULL) {
			continue; //discard locus meta-features
		}
		if (this->noPseudo) {
			bool is_pseudo=false;
			for (int i=0;i<pseudoFeatureIds.Count();++i) {
				if (pseudoFeatureIds[i]==m->ftype_id) {
					is_pseudo=true;
					break;
				}
			}
			if (is_pseudo) continue;
			for (int i=0;i<pseudoAttrIds.Count();++i) {
				char* attrv=NULL;
				if (m->attrs!=NULL) attrv=m->attrs->getAttr(pseudoAttrIds[i]);
				if (attrv!=NULL) {
					char fc=tolower(attrv[0]);
					if (fc=='t' || fc=='y' || fc=='1') {
						is_pseudo=true;
						break;
					}
				}
			}
			if (is_pseudo) continue;
			//last resort:
			//  scan all the attribute values for "pseudogene" keyword (NCBI does that for "product" attr)
			/*
			 if (m->attrs!=NULL) {
				 for (int i=0;i<m->attrs->Count();++i) {
					 GffAttr& a=*(m->attrs->Get(i));
					 if (strifind(a.attr_val, "pseudogene")) {
						 is_pseudo=true;
						 break;
					 }
				 }
			 }
			 if (is_pseudo) continue;
			 */
		} //pseudogene detection requested
		char* rloc=m->getAttr("locus");
		if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
			m->removeAttr("locus", rloc);
		}
		/*
     if (m->exons.Count()==0 && m->children.Count()==0) {
       //a non-mRNA feature with no subfeatures
       //add a dummy exon just to have the generic exon checking work
       m->addExon(m->start,m->end);
       }
		 */
		if (forceExons) {  // && m->children.Count()==0) {
			m->exon_ftype_id=gff_fid_exon;
		}
		//GList<GffObj> gfadd(false,false); -- for gf_validate()?
		if (gf_validate!=NULL && !(*gf_validate)(m, NULL)) {
			continue;
		}
		m->isUsed(true); //so the gffreader won't destroy it
		int i=-1;
		GenomicSeqData f(m->gseq_id);
		GenomicSeqData* gdata=NULL;
		if (seqdata.Found(&f,i)) gdata=seqdata[i];
		else { //entry not created yet for this genomic seq
			gdata=new GenomicSeqData(m->gseq_id);
			seqdata.Add(gdata);
		}
		/*
		for (int k=0;k<gfadd.Count();k++) {
			bool keep=placeGf(gfadd[k], gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
			if (!keep) {
				gfadd[k]->isUsed(false);
				//DEBUG
				GMessage("Feature %s(%d-%d) is going to be discarded..\n",gfadd[k]->getID(), gfadd[k]->start, gfadd[k]->end);
			}
		}
		*/
		bool keep=placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
		if (!keep) {
			m->isUsed(false);
			//DEBUG
			//GMessage("Feature %s(%d-%d) is going to be discarded..\n",m->getID(), m->start, m->end);
		}
	} //for each read gffObj
	//if (verbose) GMessage("  .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
	if (f!=stdin) { fclose(f); f=NULL; }
	delete gffr;
}