// Writes one FASTA-style record to f: an optional ">defline" header line followed by
// the sequence characters, wrapped so that no output line holds more than 70 of them.
//   f       - destination stream
//   defline - header text; the ">" line is omitted when it is empty
//   seq     - characters to print; nothing at all is written when this is NULL
//   seqlen  - number of characters to print; when <= 0, strlen(seq) is used instead
// Nothing is written (not even the header) when the effective length is <= 0.
void printFasta(FILE* f, GStr& defline, char* seq, int seqlen) {
  if (seq==NULL) return;
  int total=seqlen;
  if (total<=0) total=strlen(seq);
  if (total<=0) return;
  if (!defline.is_empty())
    fprintf(f, ">%s\n", defline.chars());
  int col=0; //characters already written on the current output line
  const char* const stop=seq+total;
  for (const char* p=seq; p!=stop; ++p, ++col) {
    if (col==70) { //line is full: break it before writing the next character
      fputc('\n', f);
      col=0;
    }
    putc(*p, f);
  }
  fputc('\n', f); //terminate the last (possibly partial) line
}
// Numbers every locus found in ref_data and records, per locus, which gene names and
// Entrez GeneIDs its transcripts carry.
//   ref_data - list of per-genomic-sequence data; each entry's loci are updated in place
// For each locus:
//   * loc.locus_num is set from a counter that starts at 1 and runs across all
//     genomic sequences, in list order;
//   * every transcript's gene name (upper-cased) is counted, and one
//     CGeneSym(name, count) per distinct name is offered to loc.gene_names;
//   * every comma-separated token of the transcript's "xrefs" attribute that starts
//     with "GENEID:" (after upper-casing) is counted by the text following the first
//     ':', and one CGeneSym(id, count) per distinct id is offered to loc.gene_ids.
void collectLocusData(GList<GenomicSeqData>& ref_data) {
  int locus_num=0;
  for (int g=0;g<ref_data.Count();g++) {
    GenomicSeqData* gdata=ref_data[g];
    for (int l=0;l<gdata->loci.Count();l++) {
      GffLocus& loc=*(gdata->loci[l]);
      GHash<int> gnames(true); //gene names in this locus
      GHash<int> geneids(true); //Entrez GeneID: numbers
      for (int i=0;i<loc.rnas.Count();i++) {
        GffObj& t=*(loc.rnas[i]);
        GStr gname(t.getGeneName());
        if (!gname.is_empty()) {
          gname.upper();
          int* prevg=gnames.Find(gname.chars());
          if (prevg!=NULL) (*prevg)++;
          else gnames.Add(gname, new int(1));
        }
        //parse GeneID xrefs, if any:
        GStr xrefs(t.getAttr("xrefs"));
        if (!xrefs.is_empty()) {
          xrefs.startTokenize(",");
          GStr token;
          while (xrefs.nextToken(token)) {
            token.upper();
            if (token.startsWith("GENEID:")) {
              //keep only what follows the first ':' as the hash key
              token.cut(0,token.index(':')+1);
              int* prevg=geneids.Find(token.chars());
              if (prevg!=NULL) (*prevg)++;
              else geneids.Add(token, new int(1));
            }
          } //for each xref
        } //xrefs parsing
      } //for each transcript
      locus_num++;
      loc.locus_num=locus_num;
      if (gnames.Count()>0) { //collect all gene names associated to this locus
        gnames.startIterate();
        int* gfreq=NULL;
        char* key=NULL;
        while ((gfreq=gnames.NextData(key))!=NULL) {
          loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq));
        }
      } //added collected gene_names
      //Test the hash that was just filled, not the destination list: checking
      //loc.gene_ids here skipped the copy whenever that list was still empty.
      if (geneids.Count()>0) { //collect all GeneIDs associated to this locus
        geneids.startIterate();
        int* gfreq=NULL;
        char* key=NULL;
        while ((gfreq=geneids.NextData(key))!=NULL) {
          loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq));
        }
      }
    } //for each locus
  } //for each genomic sequence
}
void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data, int check_for_dups, int qfidx, const char* fname, bool only_multiexon) { //>>>>> read all transcripts/features from a GTF/GFF3 file //int imrna_counter=0; #ifdef HEAPROFILE if (IsHeapProfilerRunning()) HeapProfilerDump("00"); #endif int loci_counter=0; if (ref_data==NULL) ref_data=&seqdata; bool isRefData=(&seqdata==ref_data); //(f, transcripts_only) GffReader* gffr=new GffReader(f, true); //load only transcript annotations gffr->showWarnings(gtf_tracking_verbose); // keepAttrs mergeCloseExons noExonAttrs gffr->readAll(!isRefData, true, isRefData || gtf_tracking_largeScale); //so it will read exon attributes only for low number of Cufflinks files #ifdef HEAPROFILE if (IsHeapProfilerRunning()) HeapProfilerDump("post_readAll"); #endif int d=parse_mRNAs(gffr->gflst, seqdata, isRefData, check_for_dups, qfidx,only_multiexon); #ifdef HEAPROFILE if (IsHeapProfilerRunning()) HeapProfilerDump("post_parse_mRNAs"); #endif if (gtf_tracking_verbose && d>0) { if (isRefData) GMessage(" %d duplicate reference transcripts discarded.\n",d); else GMessage(" %d redundant cufflinks transfrags discarded.\n",d); } //imrna_counter=gffr->mrnas.Count(); delete gffr; //free the extra memory and unused GffObjs #ifdef HEAPROFILE if (IsHeapProfilerRunning()) HeapProfilerDump("post_del_gffr"); #endif //for each genomic sequence, cluster transcripts int discarded=0; GStr bname(fname); GStr s; if (!bname.is_empty()) { int di=bname.rindex('.'); if (di>0) bname.cut(di); int p=bname.rindex('/'); if (p<0) p=bname.rindex('\\'); if (p>=0) bname.remove(0,p); } FILE* fdis=NULL; FILE* frloci=NULL; for (int g=0;g<seqdata.Count();g++) { //find the corresponding refseqdata with the same gseq_id int gseq_id=seqdata[g]->get_gseqid(); if (!isRefData) { //cufflinks data, find corresponding ref data GSeqData* rdata=getRefData(gseq_id, *ref_data); if (rdata!=NULL && seqdata[g]->umrnas.Count()>0) { discarded+=fix_umrnas(*seqdata[g], rdata, 
fdis); } } //>>>>> group mRNAs into locus-clusters (based on exon overlap) cluster_mRNAs(seqdata[g]->mrnas_f, seqdata[g]->loci_f, qfidx); cluster_mRNAs(seqdata[g]->mrnas_r, seqdata[g]->loci_r, qfidx); if (!isRefData) { cluster_mRNAs(seqdata[g]->umrnas, seqdata[g]->nloci_u, qfidx); } loci_counter+=seqdata[g]->loci_f.Count(); loci_counter+=seqdata[g]->loci_r.Count(); // if (refData) { // if (frloci==NULL) { // s=bname; // s.append(".loci.lst"); // frloci=fopen(s.chars(), "w"); // } // writeLoci(frloci, seqdata[g]->loci_f); // writeLoci(frloci, seqdata[g]->loci_r); // }//write ref loci }//for each genomic sequence if (fdis!=NULL) fclose(fdis); if (frloci!=NULL) fclose(frloci); if (discarded>0) { if (gtf_tracking_verbose) GMessage("Found %d transcripts with undetermined strand.\n", discarded); } else { if (fdis!=NULL) remove(s.chars()); } #ifdef HEAPROFILE if (IsHeapProfilerRunning()) HeapProfilerDump("post_cluster"); #endif }
int main(int argc, char * const argv[]) { GArgs args(argc, argv, "debug;merge;cluster-only;help;force-exons;no-pseudo;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:"); args.printError(USAGE, true); if (args.getOpt('h') || args.getOpt("help")) { GMessage("%s",USAGE); exit(1); } debugMode=(args.getOpt("debug")!=NULL); decodeChars=(args.getOpt('D')!=NULL); forceExons=(args.getOpt("force-exons")!=NULL); NoPseudo=(args.getOpt("no-pseudo")!=NULL); mRNAOnly=(args.getOpt('O')==NULL); //sortByLoc=(args.getOpt('S')!=NULL); addDescr=(args.getOpt('A')!=NULL); verbose=(args.getOpt('v')!=NULL); wCDSonly=(args.getOpt('C')!=NULL); validCDSonly=(args.getOpt('V')!=NULL); altPhases=(args.getOpt('H')!=NULL); fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF bothStrands=(args.getOpt('B')!=NULL); fullCDSonly=(args.getOpt('J')!=NULL); spliceCheck=(args.getOpt('N')!=NULL); bool matchAllIntrons=(args.getOpt('K')==NULL); bool fuzzSpan=(args.getOpt('Q')!=NULL); if (args.getOpt('M') || args.getOpt("merge")) { doCluster=true; doCollapseRedundant=true; } else { if (!matchAllIntrons || fuzzSpan) { GMessage("%s",USAGE); GMessage("Error: -K or -Q options require -M/--merge option!\n"); exit(1); } } if (args.getOpt("cluster-only")) { doCluster=true; doCollapseRedundant=false; if (!matchAllIntrons || fuzzSpan) { GMessage("%s",USAGE); GMessage("Error: -K or -Q options have no effect with --cluster-only.\n"); exit(1); } } if (fullCDSonly) validCDSonly=true; if (verbose) { fprintf(stderr, "Command line was:\n"); args.printCmdLine(stderr); } fullattr=(args.getOpt('F')!=NULL); if (args.getOpt('G')==NULL) noExonAttr=!fullattr; else { noExonAttr=true; fullattr=true; } if (NoPseudo && !fullattr) { noExonAttr=true; fullattr=true; } ensembl_convert=(args.getOpt('L')!=NULL); if (ensembl_convert) { fullattr=true; noExonAttr=false; //sortByLoc=true; } mergeCloseExons=(args.getOpt('Z')!=NULL); multiExon=(args.getOpt('U')!=NULL); writeExonSegs=(args.getOpt('W')!=NULL); 
tracklabel=args.getOpt('t'); GFastaDb gfasta(args.getOpt('g')); //if (gfasta.fastaPath!=NULL) // sortByLoc=true; //enforce sorting by chromosome/contig GStr s=args.getOpt('i'); if (!s.is_empty()) maxintron=s.asInt(); FILE* f_repl=NULL; s=args.getOpt('d'); if (!s.is_empty()) { if (s=="-") f_repl=stdout; else { f_repl=fopen(s.chars(), "w"); if (f_repl==NULL) GError("Error creating file %s\n", s.chars()); } } rfltWithin=(args.getOpt('R')!=NULL); s=args.getOpt('r'); if (!s.is_empty()) { s.trim(); if (s[0]=='+' || s[0]=='-') { rfltStrand=s[0]; s.cut(0,1); } int isep=s.index(':'); if (isep>0) { //gseq name given if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) { isep--; rfltStrand=s[isep]; s.cut(isep,1); } if (isep>0) rfltGSeq=Gstrdup((s.substr(0,isep)).chars()); s.cut(0,isep+1); } GStr gsend; char slast=s[s.length()-1]; if (rfltStrand==0 && (slast=='+' || slast=='-')) { s.chomp(slast); rfltStrand=slast; } if (s.index("..")>=0) gsend=s.split(".."); else gsend=s.split('-'); if (!s.is_empty()) rfltStart=(uint)s.asInt(); if (!gsend.is_empty()) { rfltEnd=(uint)gsend.asInt(); if (rfltEnd==0) rfltEnd=MAX_UINT; } } //gseq/range filtering else { if (rfltWithin) GError("Error: option -R requires -r!\n"); //if (rfltWholeTranscript) // GError("Error: option -P requires -r!\n"); } s=args.getOpt('m'); if (!s.is_empty()) { FILE* ft=fopen(s,"r"); if (ft==NULL) GError("Error opening reference table: %s\n",s.chars()); loadRefTable(ft, reftbl); fclose(ft); } s=args.getOpt('s'); if (!s.is_empty()) { FILE* fsize=fopen(s,"r"); if (fsize==NULL) GError("Error opening info file: %s\n",s.chars()); loadSeqInfo(fsize, seqinfo); fclose(fsize); } openfw(f_out, args, 'o'); //if (f_out==NULL) f_out=stdout; if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL)) GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n"); openfw(f_w, args, 'w'); openfw(f_x, args, 'x'); openfw(f_y, args, 'y'); if 
(f_y!=NULL || f_x!=NULL) wCDSonly=true; //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL); int numfiles = args.startNonOpt(); //GList<GffObj> gfkept(false,true); //unsorted, free items on delete int out_counter=0; //number of records printed while (true) { GStr infile; if (numfiles) { infile=args.nextNonOpt(); if (infile.is_empty()) break; if (infile=="-") { f_in=stdin; infile="stdin"; } else if ((f_in=fopen(infile, "r"))==NULL) GError("Error: cannot open input file %s!\n",infile.chars()); } else infile="-"; GffLoader gffloader(infile.chars()); gffloader.transcriptsOnly=mRNAOnly; gffloader.fullAttributes=fullattr; gffloader.noExonAttrs=noExonAttr; gffloader.mergeCloseExons=mergeCloseExons; gffloader.showWarnings=(args.getOpt('E')!=NULL); gffloader.noPseudo=NoPseudo; gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan, forceExons); if (doCluster) collectLocusData(g_data); if (numfiles==0) break; } GStr loctrack("gffcl"); if (tracklabel) loctrack=tracklabel; g_data.setSorted(&gseqCmpName); GffPrintMode exonPrinting; if (fmtGTF) { exonPrinting = pgtfAny; } else { exonPrinting = forceExons ? 
pgffBoth : pgffAny; } bool firstGff3Print=!fmtGTF; if (doCluster) { //grouped in loci for (int g=0;g<g_data.Count();g++) { GenomicSeqData* gdata=g_data[g]; int gfs_i=0; for (int l=0;l<gdata->loci.Count();l++) { GffLocus& loc=*(gdata->loci[l]); //check all non-replaced transcripts in this locus: int numvalid=0; int idxfirstvalid=-1; for (int i=0;i<loc.rnas.Count();i++) { GffObj& t=*(loc.rnas[i]); if (f_out) { while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) { GffObj& gfst=*(gdata->gfs[gfs_i]); if ((gfst.udata&4)==0) { //never printed gfst.udata|=4; if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons) gfst.addExon(gfst.start,gfst.end); gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } ++gfs_i; } } GTData* tdata=(GTData*)(t.uptr); if (tdata->replaced_by!=NULL) { if (f_repl && (t.udata & 8)==0) { //t.udata|=8; fprintf(f_repl, "%s", t.getID()); GTData* rby=tdata; while (rby->replaced_by!=NULL) { fprintf(f_repl," => %s", rby->replaced_by->getID()); rby->rna->udata|=8; rby=(GTData*)(rby->replaced_by->uptr); } fprintf(f_repl, "\n"); } continue; } if (process_transcript(gfasta, t)) { t.udata|=4; //tag it as valid numvalid++; if (idxfirstvalid<0) idxfirstvalid=i; } } if (f_out && numvalid>0) { GStr locname("RLOC_"); locname.appendfmt("%08d",loc.locus_num); if (!fmtGTF) { if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s", loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand, locname.chars(), locname.chars()); //const char* loc_gname=loc.getGeneName(); if (loc.gene_names.Count()>0) { //print all gene names associated to this locus fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars()); for (int i=1;i<loc.gene_names.Count();i++) { fprintf(f_out, ",%s",loc.gene_names[i]->name.chars()); } } if (loc.gene_ids.Count()>0) { //print 
all GeneIDs names associated to this locus fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars()); for (int i=1;i<loc.gene_ids.Count();i++) { fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars()); } } fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID()); for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) { fprintf(f_out, ",%s",loc.rnas[i]->getID()); } fprintf(f_out, "\n"); } //now print all valid, non-replaced transcripts in this locus: for (int i=0;i<loc.rnas.Count();i++) { GffObj& t=*(loc.rnas[i]); GTData* tdata=(GTData*)(t.uptr); if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue; t.addAttr("locus", locname.chars()); out_counter++; if (fmtGTF) t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); else { if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } //print the parent first, if any if (t.parent!=NULL && ((t.parent->udata & 4)==0)) { GTData* pdata=(GTData*)(t.parent->uptr); if (pdata && pdata->geneinfo!=NULL) pdata->geneinfo->finalize(); t.parent->addAttr("locus", locname.chars()); t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); t.parent->udata|=4; } t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } } } //have valid transcripts to print }//for each locus //print the rest of the isolated pseudo/gene/region features not printed yet if (f_out) { while (gfs_i<gdata->gfs.Count()) { GffObj& gfst=*(gdata->gfs[gfs_i]); if ((gfst.udata&4)==0) { //never printed gfst.udata|=4; if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons) gfst.addExon(gfst.start,gfst.end); gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } ++gfs_i; } } } //for each genomic sequence } else { //not grouped into loci, print the rnas with their parents, if any int numvalid=0; for (int g=0;g<g_data.Count();g++) { GenomicSeqData* gdata=g_data[g]; int gfs_i=0; for (int 
m=0;m<gdata->rnas.Count();m++) { GffObj& t=*(gdata->rnas[m]); if (f_out) { while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) { GffObj& gfst=*(gdata->gfs[gfs_i]); if ((gfst.udata&4)==0) { //never printed gfst.udata|=4; if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons) gfst.addExon(gfst.start,gfst.end); gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } ++gfs_i; } } GTData* tdata=(GTData*)(t.uptr); if (tdata->replaced_by!=NULL) continue; if (process_transcript(gfasta, t)) { t.udata|=4; //tag it as valid numvalid++; if (f_out) { if (tdata->geneinfo) tdata->geneinfo->finalize(); out_counter++; if (fmtGTF) t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); else { if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } //print the parent first, if any if (t.parent!=NULL && ((t.parent->udata & 4)==0)) { GTData* pdata=(GTData*)(t.parent->uptr); if (pdata && pdata->geneinfo!=NULL) pdata->geneinfo->finalize(); t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); t.parent->udata|=4; } t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } }//GFF/GTF output requested } //valid transcript } //for each rna //print the rest of the isolated pseudo/gene/region features not printed yet if (f_out) { while (gfs_i<gdata->gfs.Count()) { GffObj& gfst=*(gdata->gfs[gfs_i]); if ((gfst.udata&4)==0) { //never printed gfst.udata|=4; if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons) gfst.addExon(gfst.start,gfst.end); gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } ++gfs_i; } } } //for each genomic seq } //not clustered if (f_repl && f_repl!=stdout) fclose(f_repl); seqinfo.Clear(); //if (faseq!=NULL) delete faseq; //if (gcdb!=NULL) delete gcdb; GFREE(rfltGSeq); FRCLOSE(f_in); FWCLOSE(f_out); 
FWCLOSE(f_w); FWCLOSE(f_x); FWCLOSE(f_y); }
int main(int argc, char * const argv[]) { //GArgs args(argc, argv, "hg:c:s:t:o:p:help;genomic-fasta=COV=PID=seq=out=disable-flag;test="); GArgs args(argc, argv, opts); fprintf(stderr, "Command line was:\n"); args.printCmdLine(stderr); args.printError(USAGE, true); //if (args.getOpt('h') || args.getOpt("help")) if (args.getOpt(OPT_HELP)) { GMessage("%s\n", USAGE); exit(1); } if (args.getOpt(OPT_NUM)) { GStr snum(args.getOpt(OPT_NUM)); int num=snum.asInt(); char* numstr=commaprintnum(num); GMessage("Number %d written with commas: %s\n", num, numstr); GFREE(numstr); } //--- GHash<GVec<int> > ends; /* testGPVec(); //exit(0); //uint pos=3; //GStr spos((int)pos); //GVec<int> *ev=ends[spos.chars()]; GPVec<Gint> v; int r(5); int rr=v.Add(new Gint(3)); //if (rr<0) { // GMessage("Error adding 0! (code %d)\n",rr); // } v.Add(new Gint(r)); v.Add(new Gint(2)); v.Add(new Gint(1)); v.Add(new Gint(4)); rr=v.Add(new Gint(0)); v[rr]->v=-1; v.Sort(cmpGint); GMessage("collection has %d elements:\n",v.Count()); for (int i=0;i<v.Count();i++) { GMessage("v[%d]=%d;\n",i,v[i]->v); } exit(0); */ //--- int numopts=args.startOpt(); if (numopts) GMessage("#### Recognized %d option arguments:\n", numopts); int optcode=0; while ((optcode=args.nextCode())) { char* r=args.getOpt(optcode); GMessage("%14s\t= %s\n", args.getOptName(optcode), (r[0]==0)?"True":r); } int numargs=args.startNonOpt(); if (numargs>0) { GMessage("\n#### Found %d non-option arguments given:\n", numargs); char* a=NULL; while ((a=args.nextNonOpt())) { GMessage("%s\n",a); } } GStr s=args.getOpt('t'); if (!s.is_empty()) { GStr token; GMessage("Tokens in \"%s\" :\n",s.chars()); s.startTokenize(";,: \t"); int c=1; while (s.nextToken(token)) { GMessage("token %2d : \"%s\"\n",c,token.chars()); c++; } } if (args.getOpt(OPT_BITVEC)) { uint numbits=4156888234; GBitVec bits(numbits); GMessage(">>> -- BitVec(%u) created (size=%u, mem=%lu) -- \n", numbits, bits.size(), bits.getMemorySize()); bits[405523342]=true; GMessage(" memory size: 
%lu , size()=%u, count()=%d \n", bits.getMemorySize(), bits.size(), bits.count()); /* //GMessage(">>> -- Start BitVec Test -- \n"); if (bits[1092]) bitError(1092); bits.resize(2049); if (bits[2048]) bitError(2048); bits[2048]=true; if (!bits[2048]) bitError(2048); bits.resize(4097); if (!bits[2048]) bitError(2048); if (bits[4096]) bitError(4096); bits[4096]=true; if (!bits[4096]) bitError(4096); GBitVec bits2(64); Gswap(bits, bits2); if (!bits2[2048]) bitError(2048); if (!bits2[4096]) bitError(4096); */ //GMessage("<<< -- End BitVec Test (size: %d, count: %d, bits2 size=%d, count=%d) --\n", /// bits.size(), bits.count(), bits2.size(), bits2.count()); } }