// Three-level ordering of two transcripts:
//   1. by number of exons,
//   2. then by covlen,
//   3. then by strcmp() of their IDs.
// Returns 1 / -1 when the first two keys differ, otherwise the strcmp() result.
int cmpRedundant(GffObj& a, GffObj& b) {
  if (a.exons.Count()!=b.exons.Count())
    return (a.exons.Count()>b.exons.Count()) ? 1 : -1;
  if (a.covlen!=b.covlen)
    return (a.covlen>b.covlen) ? 1 : -1;
  // same exon count and same covlen: fall back to the ID comparison
  return strcmp(a.getID(), b.getID());
}
// Returns true only if ti (which MUST be single-exon) lies within one of tj's
// exons without crossing any intron-exon boundary of tj.
//   fuzzSpan == true : ti's exon must overlap a tj exon by at least 80% of ti.len()
//   fuzzSpan == false: ti must be strictly contained in a tj exon
// Calls GError() if ti has more than one exon.
bool unsplContained(GffObj& ti, GffObj& tj, bool fuzzSpan) {
  const int lastTi=ti.exons.Count()-1;
  const int lastTj=tj.exons.Count()-1;
  if (lastTi>0)
    GError("Error: bad unsplContained() call, 1st param must be single-exon transcript!\n");
  const int minovl=(int)(0.8 * ti.len()); //minimum overlap for fuzzSpan
  for (int k=0;k<=lastTj;k++) {
    //ti must NOT reach into the intron on either side of exon k
    if ((k>0 && ti.start<tj.exons[k]->start) ||
        (k<lastTj && ti.end>tj.exons[k]->end))
      return false;
    if (fuzzSpan) {
      //approximate match: enough of ti is covered by this exon
      if (ti.exons[0]->overlapLen(tj.exons[k])>=minovl) return true;
    }
    else {
      //strict containment
      if (ti.end<=tj.exons[k]->end && ti.start>=tj.exons[k]->start) return true;
    }
  }
  return false;
}
// Test driver: loads the GFF file named by the single command-line argument and,
// for every record that reports isTranscript(), prints one tab-separated line to
// stdout: transcript ID, gene ID, then each attribute name/value pair.
// The total number of loaded records is reported on stderr.
// Exits with status 1 (after printing usage) unless exactly one argument is given.
int main(int argc, char* argv[]) {
  // argc != 2 also rejects argc == 0, where reading argv[1] would be out of bounds
  if (argc != 2) {
    std::cerr << "Usage: TestGFFParse input\n";
    std::exit(1);
  }
  GffReader reader(argv[1]);
  reader.readAll(true);
  std::cerr << "had count of " << reader.gflst.Count() << "\n";
  size_t nfeat = reader.gflst.Count();
  for (size_t i=0; i < nfeat; ++i) {
    GffObj* f = reader.gflst[i];
    if (!f->isTranscript()) continue;
    std::cout << f->getID() << '\t' << f->getGeneID() << '\t';
    if (f->attrs) {
      // fetch the count once, as size_t, so the loop condition compares like types
      const size_t nattrs = f->attrs->Count();
      for (size_t j=0; j < nattrs; ++j) {
        std::cout << f->getAttrName(j) << "\t" << f->getAttrValue(j) << "\t";
      }
    }
    std::cout << "\n";
  }
  std::exit(0);
}
// Tests whether transcript t overlaps gene g.
// When t has exons, the test is t->exonOverlap() against g's start..end range;
// when t has no exons, it falls back to g.overlap(*t).
bool exonOverlap2Gene(GffObj* t, GffObj& g) {
  if (t->exons.Count()<=0)
    return g.overlap(*t); //no exons recorded for t
  return t->exonOverlap(g.start, g.end);
}
bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) { //returns true if the transcript passed the filter char* gname=gffrec.getGeneName(); if (gname==NULL) gname=gffrec.getGeneID(); GStr defline(gffrec.getID()); if (f_out && !fmtGTF) { const char* tname=NULL; if ((tname=gffrec.getAttr("transcript_name"))!=NULL) { gffrec.addAttr("Name", tname); gffrec.removeAttr("transcript_name"); } } if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) { const char* biotype=gffrec.getAttr("gene_biotype"); if (biotype) { gffrec.addAttr("type", biotype); gffrec.removeAttr("gene_biotype"); } else { //old Ensembl files lacking gene_biotype gffrec.addAttr("type", gffrec.getTrackName()); } //bool is_gene=false; bool is_pseudo=false; if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS()) gffrec.setFeatureName("mRNA"); else { if (strcmp(biotype, "processed_transcript")==0) gffrec.setFeatureName("proc_RNA"); else { //is_gene=endsWith(biotype, "gene"); is_pseudo=strifind(biotype, "pseudo"); if (is_pseudo) { gffrec.setFeatureName("pseudo_RNA"); } else if (endsWith(biotype, "RNA")) { gffrec.setFeatureName(biotype); } else gffrec.setFeatureName("misc_RNA"); } } } if (gname && strcmp(gname, gffrec.getID())!=0) { int* isonum=isoCounter.Find(gname); if (isonum==NULL) { isonum=new int(1); isoCounter.Add(gname,isonum); } else (*isonum)++; defline.appendfmt(" gene=%s", gname); } int seqlen=0; const char* tlabel=tracklabel; if (tlabel==NULL) tlabel=gffrec.getTrackName(); //defline.appendfmt(" track:%s",tlabel); char* cdsnt = NULL; char* cdsaa = NULL; int aalen=0; for (int i=1;i<gffrec.exons.Count();i++) { int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1; if (ilen>4000000) GMessage("Warning: very large intron (%d) for transcript %s\n", ilen, gffrec.getID()); if (ilen>maxintron) { return false; } } GList<GSeg> seglst(false,true); GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec); if (spliceCheck && gffrec.exons.Count()>1) { //check introns for splice site consensi ( GT-AG, GC-AG or 
AT-AC ) if (faseq==NULL) GError("Error: no genomic sequence available!\n"); int glen=gffrec.end-gffrec.start+1; const char* gseq=faseq->subseq(gffrec.start, glen); bool revcompl=(gffrec.strand=='-'); bool ssValid=true; for (int e=1;e<gffrec.exons.Count();e++) { const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start; int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1; GSpliceSite acceptorSite(intron,intronlen,true, revcompl); GSpliceSite donorSite(intron,intronlen, false, revcompl); //GMessage("%c intron %d-%d : %s .. %s\n", // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt); if (acceptorSite=="AG") { // GT-AG or GC-AG if (!donorSite.canonicalDonor()) { ssValid=false;break; } } else if (acceptorSite=="AC") { // if (donorSite!="AT") { ssValid=false; break; } } else { ssValid=false; break; } } //GFREE(gseq); if (!ssValid) { if (verbose) GMessage("Invalid splice sites found for '%s'\n",gffrec.getID()); return false; //don't print this one! } } bool trprint=true; int stopCodonAdjust=0; int mCDphase=0; bool hasStop=false; if (gffrec.CDphase=='1' || gffrec.CDphase=='2') mCDphase = gffrec.CDphase-'0'; if (f_y!=NULL || f_x!=NULL || validCDSonly) { if (faseq==NULL) GError("Error: no genomic sequence provided!\n"); //if (protmap && fullCDSonly) { //if (protmap && (fullCDSonly || (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) { if (validCDSonly) { //make sure the stop codon is always included //adjust_stopcodon(gffrec,3); stopCodonAdjust=adjust_stopcodon(gffrec,3); } int strandNum=0; int phaseNum=0; CDS_CHECK: cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, NULL, &seglst); if (cdsnt==NULL) trprint=false; else { //has CDS if (validCDSonly) { cdsaa=translateDNA(cdsnt, aalen, seqlen); char* p=strchr(cdsaa,'.'); hasStop=false; if (p!=NULL) { if (p-cdsaa>=aalen-2) { //stop found as the last codon *p='0';//remove it hasStop=true; if (aalen-2==p-cdsaa) { //previous to last codon is the stop codon //so correct the CDS stop accordingly 
adjust_stopcodon(gffrec,-3, &seglst); stopCodonAdjust=0; //clear artificial stop adjustment seqlen-=3; cdsnt[seqlen]=0; } aalen=p-cdsaa; } else {//stop found before the last codon trprint=false; } }//stop codon found if (trprint==false) { //failed CDS validity check //in-frame stop codon found if (altPhases && phaseNum<3) { phaseNum++; gffrec.CDphase = '0'+((mCDphase+phaseNum)%3); GFREE(cdsaa); goto CDS_CHECK; } if (gffrec.exons.Count()==1 && bothStrands) { strandNum++; phaseNum=0; if (strandNum<2) { GFREE(cdsaa); gffrec.strand = (gffrec.strand=='-') ? '+':'-'; goto CDS_CHECK; //repeat the CDS check for a different frame } } if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID()); } //has in-frame STOP if (fullCDSonly) { if (!hasStop || cdsaa[0]!='M') trprint=false; } } // CDS check requested } //has CDS } //translation or codon check/output was requested if (!trprint) { GFREE(cdsnt); GFREE(cdsaa); return false; } if (stopCodonAdjust>0 && !hasStop) { //restore stop codon location adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst); if (cdsnt!=NULL && seqlen>0) { seqlen-=stopCodonAdjust; cdsnt[seqlen]=0; } if (cdsaa!=NULL) aalen--; } if (f_y!=NULL) { //CDS translation fasta output requested //char* if (cdsaa==NULL) { //translate now if not done before cdsaa=translateDNA(cdsnt, aalen, seqlen); } if (fullattr && gffrec.attrs!=NULL) { //append all attributes found for each transcripts for (int i=0;i<gffrec.attrs->Count();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); defline.append(gffrec.getAttrValue(i)); } } printFasta(f_y, defline, cdsaa, aalen); } if (f_x!=NULL) { //CDS only if (writeExonSegs) { defline.append(" loc:"); defline.append(gffrec.getGSeqName()); defline.appendfmt("(%c)",gffrec.strand); //warning: not CDS coordinates are written here, but the exon ones defline+=(int)gffrec.start; defline+=(char)'-'; defline+=(int)gffrec.end; // -- here these are CDS substring coordinates on the spliced sequence: 
defline.append(" segs:"); for (int i=0;i<seglst.Count();i++) { if (i>0) defline.append(","); defline+=(int)seglst[i]->start; defline.append("-"); defline+=(int)seglst[i]->end; } } if (fullattr && gffrec.attrs!=NULL) { //append all attributes found for each transcript for (int i=0;i<gffrec.attrs->Count();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); defline.append(gffrec.getAttrValue(i)); } } printFasta(f_x, defline, cdsnt, seqlen); } GFREE(cdsnt); GFREE(cdsaa); if (f_w!=NULL) { //write spliced exons uint cds_start=0; uint cds_end=0; seglst.Clear(); char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst); if (exont!=NULL) { if (gffrec.CDstart>0) { defline.appendfmt(" CDS=%d-%d", cds_start, cds_end); } if (writeExonSegs) { defline.append(" loc:"); defline.append(gffrec.getGSeqName()); defline+=(char)'|'; defline+=(int)gffrec.start; defline+=(char)'-'; defline+=(int)gffrec.end; defline+=(char)'|'; defline+=(char)gffrec.strand; defline.append(" exons:"); for (int i=0;i<gffrec.exons.Count();i++) { if (i>0) defline.append(","); defline+=(int)gffrec.exons[i]->start; defline.append("-"); defline+=(int)gffrec.exons[i]->end; } defline.append(" segs:"); for (int i=0;i<seglst.Count();i++) { if (i>0) defline.append(","); defline+=(int)seglst[i]->start; defline.append("-"); defline+=(int)seglst[i]->end; } } if (fullattr && gffrec.attrs!=NULL) { //append all attributes found for each transcripts for (int i=0;i<gffrec.attrs->Count();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); defline.append(gffrec.getAttrValue(i)); } } printFasta(f_w, defline, exont, seqlen); GFREE(exont); } } //writing f_w (spliced exons) return true; }