// Binary search over a list of loci ordered by start coordinate.
//
// x    : coordinate to search for
// loci : list of loci; must be sorted by ascending start
//
// Returns the index of the first locus whose start is strictly greater than x,
// or -1 when there is no such locus (which includes the empty-list case).
int qsearch_loci(uint x, GList<GLocus>& loci) {
  const int count=loci.Count();
  // Upper-bound search on the half-open index range [lo, hi):
  // every index below lo has start<=x, every index from hi up has start>x.
  // With count==0 the loop body never runs, so no element is ever accessed.
  int lo=0;
  int hi=count;
  while (lo<hi) {
    int mid=lo+((hi-lo)>>1);
    if (loci[mid]->start<=x) lo=mid+1;
                        else hi=mid;
    }
  return (lo<count) ? lo : -1;
}
// Binary search over a list of transcripts ordered by start coordinate.
//
// x     : coordinate to search for
// mrnas : list of transcripts; must be sorted by ascending start
//
// Returns the index of the first transcript whose start is strictly greater
// than x, or -1 when there is no such transcript (which includes the
// empty-list case).
int qsearch_mrnas(uint x, GList<GffObj>& mrnas) {
  const int count=mrnas.Count();
  // Upper-bound search on the half-open index range [lo, hi):
  // every index below lo has start<=x, every index from hi up has start>x.
  // With count==0 the loop body never runs, so no element is ever accessed.
  int lo=0;
  int hi=count;
  while (lo<hi) {
    int mid=lo+((hi-lo)>>1);
    if (mrnas[mid]->start<=x) lo=mid+1;
                         else hi=mid;
    }
  return (lo<count) ? lo : -1;
}
void cluster_mRNAs(GList<GffObj> & mrnas, GList<GLocus> & loci, int qfidx) { //mrnas sorted by start coordinate //and so are the loci //int rdisc=0; for (int t=0;t<mrnas.Count();t++) { GArray<int> mrgloci(false); GffObj* mrna=mrnas[t]; int lfound=0; //count of parent loci /*for (int l=0;l<loci.Count();l++) { if (loci[l]->end<mrna->exons.First()->start) continue; if (loci[l]->start>mrna->exons.Last()->end) break; */ for (int l=loci.Count()-1;l>=0;l--) { if (loci[l]->end<mrna->exons.First()->start) { if (mrna->exons.First()->start-loci[l]->start > GFF_MAX_LOCUS) break; continue; } if (loci[l]->start>mrna->exons.Last()->end) continue; //here we have mrna overlapping loci[l] if (loci[l]->add_mRNA(mrna)) { //a parent locus was found lfound++; mrgloci.Add(l); //locus indices added here, in decreasing order } }//loci loop //if (lfound<0) continue; //mrna was a ref duplicate, skip it if (lfound==0) { //create a locus with only this mRNA loci.Add(new GLocus(mrna, qfidx)); } else if (lfound>1) { //more than one locus found parenting this mRNA, merge loci lfound--; for (int l=0;l<lfound;l++) { int mlidx=mrgloci[l]; //largest indices first, so it's safe to remove loci[mrgloci[lfound]]->addMerge(*loci[mlidx], mrna); loci.Delete(mlidx); } } }//mrnas loop //if (rdisc>0) mrnas.Pack(); //return rdisc; }
// Numbers every locus and gathers, per locus, the gene names and GeneIDs
// carried by its transcripts.
//
// For each locus of each entry in ref_data:
//   - locus_num is set from a counter running across all sequences (1-based)
//   - upper-cased gene names of its transcripts are counted and stored in
//     loc.gene_names (one CGeneSym per distinct name, with its frequency)
//   - "GENEID:" tokens found in the comma-separated "xrefs" attribute are
//     counted the same way and stored in loc.gene_ids
void collectLocusData(GList<GenomicSeqData>& ref_data) {
  int locus_num=0;
  for (int g=0;g<ref_data.Count();g++) {
    GenomicSeqData* gdata=ref_data[g];
    for (int l=0;l<gdata->loci.Count();l++) {
      GffLocus& loc=*(gdata->loci[l]);
      GHash<int> gnames(true);  //gene name -> number of transcripts using it
      GHash<int> geneids(true); //Entrez GeneID -> number of transcripts using it
      for (int i=0;i<loc.rnas.Count();i++) {
        GffObj& t=*(loc.rnas[i]);
        GStr gname(t.getGeneName());
        if (!gname.is_empty()) {
          gname.upper();
          int* prevg=gnames.Find(gname.chars());
          if (prevg!=NULL) (*prevg)++;
                      else gnames.Add(gname, new int(1));
          }
        //parse GeneID xrefs, if any:
        GStr xrefs(t.getAttr("xrefs"));
        if (!xrefs.is_empty()) {
          xrefs.startTokenize(",");
          GStr token;
          while (xrefs.nextToken(token)) {
            token.upper();
            if (token.startsWith("GENEID:")) {
              //keep only what follows the "GENEID:" prefix
              token.cut(0,token.index(':')+1);
              int* prevg=geneids.Find(token.chars());
              if (prevg!=NULL) (*prevg)++;
                          else geneids.Add(token, new int(1));
              }
            } //for each xref
          } //xrefs parsing
        }//for each transcript
      locus_num++;
      loc.locus_num=locus_num;
      if (gnames.Count()>0) { //collect all gene names associated to this locus
        gnames.startIterate();
        int* gfreq=NULL;
        char* key=NULL;
        while ((gfreq=gnames.NextData(key))!=NULL) {
          loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq));
          }
        } //added collected gene_names
      //The guard must test the hash that was just filled, not the destination
      //list: testing loc.gene_ids here would skip the copy whenever the list
      //starts out empty, i.e. the collected GeneIDs would never be stored.
      if (geneids.Count()>0) { //collect all GeneIDs associated to this locus
        geneids.startIterate();
        int* gfreq=NULL;
        char* key=NULL;
        while ((gfreq=geneids.NextData(key))!=NULL) {
          loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq));
          }
        }
      } //for each locus
    }//for each genomic sequence
}
void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data, int check_for_dups, int qfidx, const char* fname, bool only_multiexon) { //>>>>> read all transcripts/features from a GTF/GFF3 file //int imrna_counter=0; #ifdef HEAPROFILE if (IsHeapProfilerRunning()) HeapProfilerDump("00"); #endif int loci_counter=0; if (ref_data==NULL) ref_data=&seqdata; bool isRefData=(&seqdata==ref_data); //(f, transcripts_only) GffReader* gffr=new GffReader(f, true); //load only transcript annotations gffr->showWarnings(gtf_tracking_verbose); // keepAttrs mergeCloseExons noExonAttrs gffr->readAll(!isRefData, true, isRefData || gtf_tracking_largeScale); //so it will read exon attributes only for low number of Cufflinks files #ifdef HEAPROFILE if (IsHeapProfilerRunning()) HeapProfilerDump("post_readAll"); #endif int d=parse_mRNAs(gffr->gflst, seqdata, isRefData, check_for_dups, qfidx,only_multiexon); #ifdef HEAPROFILE if (IsHeapProfilerRunning()) HeapProfilerDump("post_parse_mRNAs"); #endif if (gtf_tracking_verbose && d>0) { if (isRefData) GMessage(" %d duplicate reference transcripts discarded.\n",d); else GMessage(" %d redundant query transfrags discarded.\n",d); } //imrna_counter=gffr->mrnas.Count(); delete gffr; //free the extra memory and unused GffObjs #ifdef HEAPROFILE if (IsHeapProfilerRunning()) HeapProfilerDump("post_del_gffr"); #endif //for each genomic sequence, cluster transcripts int oriented_by_overlap=0; int initial_unoriented=0; int final_unoriented=0; GStr bname(fname); GStr s; if (!bname.is_empty()) { int di=bname.rindex('.'); if (di>0) bname.cut(di); int p=bname.rindex('/'); if (p<0) p=bname.rindex('\\'); if (p>=0) bname.remove(0,p); } FILE* fdis=NULL; FILE* frloci=NULL; for (int g=0;g<seqdata.Count();g++) { //find the corresponding refseqdata with the same gseq_id int gseq_id=seqdata[g]->get_gseqid(); if (!isRefData) { //query data, find corresponding ref data GSeqData* rdata=getRefData(gseq_id, *ref_data); 
initial_unoriented+=seqdata[g]->umrnas.Count(); if (seqdata[g]->umrnas.Count()>0) { oriented_by_overlap+=fix_umrnas(*seqdata[g], rdata, fdis); final_unoriented+=seqdata[g]->umrnas.Count(); } } //>>>>> group mRNAs into locus-clusters (based on exon overlap) cluster_mRNAs(seqdata[g]->mrnas_f, seqdata[g]->loci_f, qfidx); cluster_mRNAs(seqdata[g]->mrnas_r, seqdata[g]->loci_r, qfidx); if (!isRefData) { cluster_mRNAs(seqdata[g]->umrnas, seqdata[g]->nloci_u, qfidx); } loci_counter+=seqdata[g]->loci_f.Count(); loci_counter+=seqdata[g]->loci_r.Count(); // if (refData) { // if (frloci==NULL) { // s=bname; // s.append(".loci.lst"); // frloci=fopen(s.chars(), "w"); // } // writeLoci(frloci, seqdata[g]->loci_f); // writeLoci(frloci, seqdata[g]->loci_r); // }//write ref loci }//for each genomic sequence if (fdis!=NULL) fclose(fdis); if (frloci!=NULL) fclose(frloci); if (initial_unoriented || final_unoriented) { if (gtf_tracking_verbose) GMessage(" Found %d transfrags with undetermined strand (%d out of initial %d were fixed by overlaps)\n", final_unoriented, oriented_by_overlap, initial_unoriented); } //if (fdis!=NULL) remove(s.chars()); remove 0-length file #ifdef HEAPROFILE if (IsHeapProfilerRunning()) HeapProfilerDump("post_cluster"); #endif }
int main(int argc, char * const argv[]) { GArgs args(argc, argv, "debug;merge;cluster-only;help;force-exons;no-pseudo;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:"); args.printError(USAGE, true); if (args.getOpt('h') || args.getOpt("help")) { GMessage("%s",USAGE); exit(1); } debugMode=(args.getOpt("debug")!=NULL); decodeChars=(args.getOpt('D')!=NULL); forceExons=(args.getOpt("force-exons")!=NULL); NoPseudo=(args.getOpt("no-pseudo")!=NULL); mRNAOnly=(args.getOpt('O')==NULL); //sortByLoc=(args.getOpt('S')!=NULL); addDescr=(args.getOpt('A')!=NULL); verbose=(args.getOpt('v')!=NULL); wCDSonly=(args.getOpt('C')!=NULL); validCDSonly=(args.getOpt('V')!=NULL); altPhases=(args.getOpt('H')!=NULL); fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF bothStrands=(args.getOpt('B')!=NULL); fullCDSonly=(args.getOpt('J')!=NULL); spliceCheck=(args.getOpt('N')!=NULL); bool matchAllIntrons=(args.getOpt('K')==NULL); bool fuzzSpan=(args.getOpt('Q')!=NULL); if (args.getOpt('M') || args.getOpt("merge")) { doCluster=true; doCollapseRedundant=true; } else { if (!matchAllIntrons || fuzzSpan) { GMessage("%s",USAGE); GMessage("Error: -K or -Q options require -M/--merge option!\n"); exit(1); } } if (args.getOpt("cluster-only")) { doCluster=true; doCollapseRedundant=false; if (!matchAllIntrons || fuzzSpan) { GMessage("%s",USAGE); GMessage("Error: -K or -Q options have no effect with --cluster-only.\n"); exit(1); } } if (fullCDSonly) validCDSonly=true; if (verbose) { fprintf(stderr, "Command line was:\n"); args.printCmdLine(stderr); } fullattr=(args.getOpt('F')!=NULL); if (args.getOpt('G')==NULL) noExonAttr=!fullattr; else { noExonAttr=true; fullattr=true; } if (NoPseudo && !fullattr) { noExonAttr=true; fullattr=true; } ensembl_convert=(args.getOpt('L')!=NULL); if (ensembl_convert) { fullattr=true; noExonAttr=false; //sortByLoc=true; } mergeCloseExons=(args.getOpt('Z')!=NULL); multiExon=(args.getOpt('U')!=NULL); writeExonSegs=(args.getOpt('W')!=NULL); 
tracklabel=args.getOpt('t'); GFastaDb gfasta(args.getOpt('g')); //if (gfasta.fastaPath!=NULL) // sortByLoc=true; //enforce sorting by chromosome/contig GStr s=args.getOpt('i'); if (!s.is_empty()) maxintron=s.asInt(); FILE* f_repl=NULL; s=args.getOpt('d'); if (!s.is_empty()) { if (s=="-") f_repl=stdout; else { f_repl=fopen(s.chars(), "w"); if (f_repl==NULL) GError("Error creating file %s\n", s.chars()); } } rfltWithin=(args.getOpt('R')!=NULL); s=args.getOpt('r'); if (!s.is_empty()) { s.trim(); if (s[0]=='+' || s[0]=='-') { rfltStrand=s[0]; s.cut(0,1); } int isep=s.index(':'); if (isep>0) { //gseq name given if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) { isep--; rfltStrand=s[isep]; s.cut(isep,1); } if (isep>0) rfltGSeq=Gstrdup((s.substr(0,isep)).chars()); s.cut(0,isep+1); } GStr gsend; char slast=s[s.length()-1]; if (rfltStrand==0 && (slast=='+' || slast=='-')) { s.chomp(slast); rfltStrand=slast; } if (s.index("..")>=0) gsend=s.split(".."); else gsend=s.split('-'); if (!s.is_empty()) rfltStart=(uint)s.asInt(); if (!gsend.is_empty()) { rfltEnd=(uint)gsend.asInt(); if (rfltEnd==0) rfltEnd=MAX_UINT; } } //gseq/range filtering else { if (rfltWithin) GError("Error: option -R requires -r!\n"); //if (rfltWholeTranscript) // GError("Error: option -P requires -r!\n"); } s=args.getOpt('m'); if (!s.is_empty()) { FILE* ft=fopen(s,"r"); if (ft==NULL) GError("Error opening reference table: %s\n",s.chars()); loadRefTable(ft, reftbl); fclose(ft); } s=args.getOpt('s'); if (!s.is_empty()) { FILE* fsize=fopen(s,"r"); if (fsize==NULL) GError("Error opening info file: %s\n",s.chars()); loadSeqInfo(fsize, seqinfo); fclose(fsize); } openfw(f_out, args, 'o'); //if (f_out==NULL) f_out=stdout; if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL)) GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n"); openfw(f_w, args, 'w'); openfw(f_x, args, 'x'); openfw(f_y, args, 'y'); if 
(f_y!=NULL || f_x!=NULL) wCDSonly=true; //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL); int numfiles = args.startNonOpt(); //GList<GffObj> gfkept(false,true); //unsorted, free items on delete int out_counter=0; //number of records printed while (true) { GStr infile; if (numfiles) { infile=args.nextNonOpt(); if (infile.is_empty()) break; if (infile=="-") { f_in=stdin; infile="stdin"; } else if ((f_in=fopen(infile, "r"))==NULL) GError("Error: cannot open input file %s!\n",infile.chars()); } else infile="-"; GffLoader gffloader(infile.chars()); gffloader.transcriptsOnly=mRNAOnly; gffloader.fullAttributes=fullattr; gffloader.noExonAttrs=noExonAttr; gffloader.mergeCloseExons=mergeCloseExons; gffloader.showWarnings=(args.getOpt('E')!=NULL); gffloader.noPseudo=NoPseudo; gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan, forceExons); if (doCluster) collectLocusData(g_data); if (numfiles==0) break; } GStr loctrack("gffcl"); if (tracklabel) loctrack=tracklabel; g_data.setSorted(&gseqCmpName); GffPrintMode exonPrinting; if (fmtGTF) { exonPrinting = pgtfAny; } else { exonPrinting = forceExons ? 
pgffBoth : pgffAny; } bool firstGff3Print=!fmtGTF; if (doCluster) { //grouped in loci for (int g=0;g<g_data.Count();g++) { GenomicSeqData* gdata=g_data[g]; int gfs_i=0; for (int l=0;l<gdata->loci.Count();l++) { GffLocus& loc=*(gdata->loci[l]); //check all non-replaced transcripts in this locus: int numvalid=0; int idxfirstvalid=-1; for (int i=0;i<loc.rnas.Count();i++) { GffObj& t=*(loc.rnas[i]); if (f_out) { while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) { GffObj& gfst=*(gdata->gfs[gfs_i]); if ((gfst.udata&4)==0) { //never printed gfst.udata|=4; if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons) gfst.addExon(gfst.start,gfst.end); gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } ++gfs_i; } } GTData* tdata=(GTData*)(t.uptr); if (tdata->replaced_by!=NULL) { if (f_repl && (t.udata & 8)==0) { //t.udata|=8; fprintf(f_repl, "%s", t.getID()); GTData* rby=tdata; while (rby->replaced_by!=NULL) { fprintf(f_repl," => %s", rby->replaced_by->getID()); rby->rna->udata|=8; rby=(GTData*)(rby->replaced_by->uptr); } fprintf(f_repl, "\n"); } continue; } if (process_transcript(gfasta, t)) { t.udata|=4; //tag it as valid numvalid++; if (idxfirstvalid<0) idxfirstvalid=i; } } if (f_out && numvalid>0) { GStr locname("RLOC_"); locname.appendfmt("%08d",loc.locus_num); if (!fmtGTF) { if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s", loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand, locname.chars(), locname.chars()); //const char* loc_gname=loc.getGeneName(); if (loc.gene_names.Count()>0) { //print all gene names associated to this locus fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars()); for (int i=1;i<loc.gene_names.Count();i++) { fprintf(f_out, ",%s",loc.gene_names[i]->name.chars()); } } if (loc.gene_ids.Count()>0) { //print 
all GeneIDs names associated to this locus fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars()); for (int i=1;i<loc.gene_ids.Count();i++) { fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars()); } } fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID()); for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) { fprintf(f_out, ",%s",loc.rnas[i]->getID()); } fprintf(f_out, "\n"); } //now print all valid, non-replaced transcripts in this locus: for (int i=0;i<loc.rnas.Count();i++) { GffObj& t=*(loc.rnas[i]); GTData* tdata=(GTData*)(t.uptr); if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue; t.addAttr("locus", locname.chars()); out_counter++; if (fmtGTF) t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); else { if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } //print the parent first, if any if (t.parent!=NULL && ((t.parent->udata & 4)==0)) { GTData* pdata=(GTData*)(t.parent->uptr); if (pdata && pdata->geneinfo!=NULL) pdata->geneinfo->finalize(); t.parent->addAttr("locus", locname.chars()); t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); t.parent->udata|=4; } t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } } } //have valid transcripts to print }//for each locus //print the rest of the isolated pseudo/gene/region features not printed yet if (f_out) { while (gfs_i<gdata->gfs.Count()) { GffObj& gfst=*(gdata->gfs[gfs_i]); if ((gfst.udata&4)==0) { //never printed gfst.udata|=4; if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons) gfst.addExon(gfst.start,gfst.end); gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } ++gfs_i; } } } //for each genomic sequence } else { //not grouped into loci, print the rnas with their parents, if any int numvalid=0; for (int g=0;g<g_data.Count();g++) { GenomicSeqData* gdata=g_data[g]; int gfs_i=0; for (int 
m=0;m<gdata->rnas.Count();m++) { GffObj& t=*(gdata->rnas[m]); if (f_out) { while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) { GffObj& gfst=*(gdata->gfs[gfs_i]); if ((gfst.udata&4)==0) { //never printed gfst.udata|=4; if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons) gfst.addExon(gfst.start,gfst.end); gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } ++gfs_i; } } GTData* tdata=(GTData*)(t.uptr); if (tdata->replaced_by!=NULL) continue; if (process_transcript(gfasta, t)) { t.udata|=4; //tag it as valid numvalid++; if (f_out) { if (tdata->geneinfo) tdata->geneinfo->finalize(); out_counter++; if (fmtGTF) t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); else { if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } //print the parent first, if any if (t.parent!=NULL && ((t.parent->udata & 4)==0)) { GTData* pdata=(GTData*)(t.parent->uptr); if (pdata && pdata->geneinfo!=NULL) pdata->geneinfo->finalize(); t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); t.parent->udata|=4; } t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } }//GFF/GTF output requested } //valid transcript } //for each rna //print the rest of the isolated pseudo/gene/region features not printed yet if (f_out) { while (gfs_i<gdata->gfs.Count()) { GffObj& gfst=*(gdata->gfs[gfs_i]); if ((gfst.udata&4)==0) { //never printed gfst.udata|=4; if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons) gfst.addExon(gfst.start,gfst.end); gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } ++gfs_i; } } } //for each genomic seq } //not clustered if (f_repl && f_repl!=stdout) fclose(f_repl); seqinfo.Clear(); //if (faseq!=NULL) delete faseq; //if (gcdb!=NULL) delete gcdb; GFREE(rfltGSeq); FRCLOSE(f_in); FWCLOSE(f_out); 
FWCLOSE(f_w); FWCLOSE(f_x); FWCLOSE(f_y); }