//read only the contig headers from the file //also checks for duplicate seqnames (just in case) bool AceParser::parseContigs() { if (f!=stdin) seek(0); off_t ctgpos; numContigs=0; while ((ctgpos=fskipTo("CO "))>=0) { numContigs++; LytCtgData* ctgdata=new LytCtgData(ctgpos); char* p=ctgdata->readName(linebuf->chars()+3, ctgIDs); if (p==NULL) { GMessage("AceParser: error parsing contig name!\n"); return false; } int ctglen, ctg_numSeqs, numseqs; //p must be after contig name within linebuf! if (sscanf(p, "%d %d", &ctglen, &numseqs)!=2) { GMessage("Error parsing contig len and seq count at:\n%s\n", linebuf->chars()); } ctg_numSeqs=numseqs; ctgdata->len=ctglen; ctgdata->numseqs=numseqs; ctgdata->offs = 1; contigs.Add(ctgdata); //ctgpos=fskipTo("CO "); } if (ctgpos==-2) return false; //parsing failed: line too long ?!? contigs.setSorted(true); return true; }
// Reads every contig header ("CO " line) and, for each one, loads the
// contig's content through loadContig(), passing seqfn along.
// Contig names are made unique through ctgIDs, which is cleared first.
// Returns false when a contig name or its "<length> <seq count>" fields
// cannot be parsed, or when fskipTo() reports -2; true otherwise.
bool AceParser::parse(fnLytSeq* seqfn) {
  //read all seqs and their positions from the file
  //also checks for duplicate seqnames (just in case)
  if (f!=stdin) seek(0);
  ctgIDs.Clear(); //contig IDs, to make them unique!
  off_t ctgpos;
  numContigs=0;
  while ((ctgpos=fskipTo("CO "))>=0) {
    numContigs++;
    LytCtgData* ctgdata=new LytCtgData(ctgpos);
    //skip the 3-character "CO " tag; p ends up just past the contig name
    char* p=ctgdata->readName(linebuf->chars()+3, ctgIDs);
    if (p==NULL) {
      GMessage("AceParser: error parsing contig name!\n");
      return false;
    }
    int ctglen=0, numseqs=0;
    //p must be after contig name within linebuf!
    if (sscanf(p, "%d %d", &ctglen, &numseqs)!=2) {
      GMessage("Error parsing contig len and seq count at:\n%s\n",
               linebuf->chars());
      //the values were not parsed: stop instead of storing garbage and
      //handing it to loadContig().
      //NOTE(review): ctgdata is not deleted here because readName() has
      //already registered its name pointer in ctgIDs - confirm ownership.
      return false;
    }
    ctgdata->len=ctglen;
    ctgdata->numseqs = numseqs;
    ctgdata->offs = 1;
    int ctgidx=contigs.Add(ctgdata);
    loadContig(ctgidx, seqfn, false);
  } //while contigs
  if (ctgpos==-2) return false; //parsing failed: line too long ?!?
  contigs.setSorted(true);
  return true;
}
bool LayoutParser::parseContigs() { //load all the file offsets for contigs if (f!=stdin) seek(0); ctgIDs.Clear(); //GHash<int> ctgIDs; //contig IDs, to make them unique! // int ctgpos; //locate the first contig line numContigs=0; while ((ctgpos=fskipTo(">"))>=0) { numContigs++; LytCtgData* ctgdata=new LytCtgData(ctgpos); char* p=ctgdata->readName(linebuf->chars()+1, ctgIDs); if (p==NULL) { GMessage("LayoutParser: error parsing contig name:\n%s\n", linebuf->chars()); return false; } int ctg_lpos, ctg_rpos, numseqs; //p must be after contig name within linebuf! ctg_lpos=0;ctg_rpos=0; if (sscanf(p, "%d %d %d", &numseqs, &ctg_lpos, &ctg_rpos)<1) { GMessage("Error parsing contig len and seq count at:\n%s\n", p); return false; } ctgdata->len=ctg_rpos-ctg_lpos+1; ctgdata->numseqs=numseqs; ctgdata->lpos=ctg_lpos; ctgdata->rpos=ctg_rpos; ctgdata->offs=ctg_lpos; contigs.Add(ctgdata); //ctgpos=fskipTo(">"); } //while lines if (ctgpos==-2) return false; contigs.setSorted(true); return true; }
void testGPVec() { GPVec<Gint> vecs[3]; vecs[1].Add(new Gint(2)); vecs[2].Add(new Gint(3)); GMessage("Added to vecs[1]:%d\n", vecs[1][0]->val()); GMessage("Added to vecs[2]:%d\n", vecs[2][0]->val()); }
// Prints a one-line summary of a locus: the optional prefix `pre` (written
// to stderr), the locus coordinates, then the comma-separated IDs of its
// rnas. The first rna is accessed unconditionally.
void printLocus(GffLocus* loc, const char* pre) {
  if (pre) fprintf(stderr, "%s", pre);
  GMessage(" [%d-%d] : ", loc->start, loc->end);
  //first ID has no separator, the others are comma-prefixed
  GMessage("%s",loc->rnas[0]->getID());
  int idx=1;
  while (idx<loc->rnas.Count()) {
    GMessage(",%s",loc->rnas[idx]->getID());
    ++idx;
  }
  GMessage("\n");
}
int Gmkdir(const char *path, bool recursive, int perms) { if (path==NULL || path[0]==0) return -1; mode_t process_mask = umask(0); //is this really needed? if (!recursive) { int r=G_mkdir(path, perms); if (r!=0) GMessage("Warning: G_mkdir(%s) failed: %s\n", path, strerror(errno)); umask(process_mask); return r; } int plen=strlen(path); char* gpath=NULL; //make sure gpath ends with / if (path[plen-1]=='/') { gpath=Gstrdup(path); } else { GMALLOC(gpath, plen+2); strcpy(gpath,path); strcat(gpath, "/"); ++plen; } //char* ss=gpath+plen-1; char* psep = gpath+plen-1; //start at the last / GDynArray<char*> dirstack(4); // stack of directories that should be created while (psep>gpath && *(psep-1)=='/') --psep; //skip double slashes *psep='\0'; int fexists=0; while ((fexists=fileExists(gpath))==0) { dirstack.Push(psep); do { --psep; } while (psep>gpath && *psep!='/'); if (psep<=gpath) { psep=NULL; break; } while (psep>gpath && *(psep-1)=='/') --psep; *psep='\0'; } if (psep) *psep='/'; while (dirstack.Count()>0) { psep=dirstack.Pop(); int mkdir_err=0; if ((mkdir_err=G_mkdir(gpath, perms))!=0) { GMessage("Warning: mkdir(%s) failed: %s\n", gpath, strerror(errno)); GFREE(gpath); umask(process_mask); return -1; } *psep='/'; } GFREE(gpath); umask(process_mask); return 0; }
// Seeks to the file position stored for `seq`, checks that an "RD " line
// is found there and returns the result of readSeq(seq).
// Returns NULL (after printing a message) when reading from stdin, when
// the seek fails or when the line at that position is not an RD entry.
char* AceParser::getSeq(LytSeqInfo* seq) {
  const bool positioned = (f!=stdin) && (seek(seq->fpos)==0);
  if (!positioned) {
    GMessage("AceParser: error seeking seq '%s'\n", seq->name);
    return NULL;
  }
  //read the line at that position: it must be the RD header
  const bool atReadEntry = (linebuf->getLine(f,f_pos)!=NULL) &&
                           startsWith(linebuf->chars(), "RD ", 3);
  if (!atReadEntry) {
    GMessage("AceParser: error seeking seq '%s'\n"
             " (no RD entry found at location %d)\n", seq->name, seq->fpos);
    return NULL;
  }
  return readSeq(seq);
}
// Seeks to the file position stored for `ctg`, checks that a "CO " line
// is found there and returns the result of readSeq().
// Returns NULL (after printing a message) when reading from stdin, when
// the seek fails or when the line at that position is not a CO entry.
char* AceParser::getContigSeq(LytCtgData* ctg) {
  const bool positioned = (f!=stdin) && (seek(ctg->fpos)==0);
  if (!positioned) {
    GMessage("AceParser: error seeking contig '%s'\n", ctg->name);
    return NULL;
  }
  //read the line at that position: it must be the contig header
  const bool atContigEntry = (linebuf->getLine(f,f_pos)!=NULL) &&
                             startsWith(linebuf->chars(), "CO ", 3);
  if (!atContigEntry) {
    GMessage("AceParser: error seeking contig '%s'\n"
             " (no CO entry found at location %d)\n", ctg->name, ctg->fpos);
    return NULL;
  }
  return readSeq();
}
char* LytCtgData::readName(char* s, GHash<int>& names) { char* p=strchrs(s, " \t"); if (p!=NULL) { char* tmp; char* tmp2; GMALLOC(tmp, (p-s+30)*sizeof(char)); strncpy(tmp, s,p-s); tmp[p-s]='\0'; GMALLOC(tmp2, (p-s+30)*sizeof(char)); strcpy(tmp2, tmp); //make it unique (by simple versioning) int v=0; while (names.hasKey(tmp2)) { v++; sprintf(tmp2, "%s.%d", tmp, v); } name=Gstrdup(tmp2); GFREE(tmp); GFREE(tmp2); names.shkAdd(name, new int(1)); p++; } else { GMessage("LytCtgData::readName: Cannot find the token delimiter in:\n%s\n", s); } return p; }
void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) { GLineReader fr(f); while (!fr.isEof()) { char* line=fr.getLine(); if (line==NULL) break; char* id=line; char* lenstr=NULL; char* text=NULL; char* p=line; while (*p!=0 && !isspace(*p)) p++; if (*p==0) continue; *p=0;p++; while (*p==' ' || *p=='\t') p++; if (*p==0) continue; lenstr=p; while (*p!=0 && !isspace(*p)) p++; if (*p!=0) { *p=0;p++; } while (*p==' ' || *p=='\t') p++; if (*p!=0) text=p; //else text remains NULL int len=0; if (!parseInt(lenstr,len)) { GMessage("Warning: could not parse sequence length: %s %s\n", id, lenstr); continue; } // --- here we have finished parsing the line si.Add(id, new SeqInfo(len,text)); } //while lines }
// Parses an "AF <name> <C|U> <offset>" line and registers the sequence
// with contig `ctg`.
// The line buffer is modified in place: the delimiter after the sequence
// name is overwritten with a terminator.
// Returns the new LytSeqInfo, or NULL when the line is not a well-formed
// AF line or when the sequence name was already registered (a message is
// printed in the latter case).
LytSeqInfo* AceParser::addSeq(char* s, LytCtgData* ctg) {
  if (!startsWith(s, "AF ", 3)) return NULL;
  char* seqname=s+3;
  char* sep=strchrs(seqname," \t");
  if (sep==NULL) return NULL;
  char strand;
  int offs;
  if (sscanf(sep+1,"%c %d", &strand, &offs)!=2) return NULL;
  *sep='\0'; //seqname is now a terminated string
  LytSeqInfo* known=seqinfo.Find(seqname);
  if (known!=NULL) {
    GMessage("Sequence '%s' already found for contig '%s (%d nt)'\n"
             " so it cannot be added for contig '%s (%d)'\n",
             seqname, known->contig->name, known->contig->len,
             ctg->name, ctg->len);
    return NULL;
  }
  //'C' in the strand column sets the 4th constructor argument to 1
  LytSeqInfo* seq = new LytSeqInfo(seqname, ctg, offs, (strand=='C') ? 1 : 0);
  seqinfo.shkAdd(seq->name, seq);
  ctg->seqs.Add(seq);
  return seq;
}
FILE* Gfopen(const char *path, char *mode) { FILE* f=NULL; if (mode==NULL) f=fopen(path, "rb"); else f=fopen(path, mode); if (f==NULL) GMessage("Error opening file '%s': %s\n", path, strerror(errno)); return f; }
void GBamRecord::set_cigar(const char* cigar) { //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call int doff=b->core.l_qname; uint8_t* after_cigar=NULL; int after_cigar_len=0; uint8_t* prev_bdata=NULL; if (b->data_len>doff) { //cigar string already allocated, replace it int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data after_cigar=b->data+d; after_cigar_len=b->data_len-d; } const char *s; char *t; int i, op; long x; b->core.n_cigar = 0; if (cigar != NULL && strcmp(cigar, "*") != 0) { for (s = cigar; *s; ++s) { if (isalpha(*s)) b->core.n_cigar++; else if (!isdigit(*s)) { GError("Error: invalid CIGAR character (%s)\n",cigar); } } if (after_cigar_len>0) { //replace/insert into existing full data prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len); memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len); free(prev_bdata); } else { realloc_bdata(b, doff + b->core.n_cigar * 4); } for (i = 0, s = cigar; i != b->core.n_cigar; ++i) { x = strtol(s, &t, 10); op = toupper(*t); if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; else if (op == 'I') op = BAM_CINS; else if (op == 'D') op = BAM_CDEL; else if (op == 'N') op = BAM_CREF_SKIP; //has_Introns=true; else if (op == 'S') op = BAM_CSOFT_CLIP; //soft_Clipped=true; else if (op == 'H') op = BAM_CHARD_CLIP; //hard_Clipped=true; else if (op == 'P') op = BAM_CPAD; else GError("Error: invalid CIGAR operation (%s)\n",cigar); s = t + 1; bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; } if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar); b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b))); } else {//no CIGAR string given if (!(b->core.flag&BAM_FUNMAP)) { GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data); b->core.flag |= BAM_FUNMAP; } b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1); } setupCoordinates(); } //set_cigar()
// Reads every contig header line (starting with '>') and, for each one,
// loads the contig's component sequences through loadContig(), passing
// seqfn along.
// The header is expected to hold, after the name: the sequence count and,
// optionally, the contig's left and right coordinates (both default to 0).
// Returns false on a malformed header, true otherwise.
bool LayoutParser::parse(fnLytSeq* seqfn) {
  //read all seqs and their positions from the file
  //also checks for duplicate seqnames (just in case)
  if (f!=stdin) seek(0);
  ctgIDs.Clear(); //contig IDs, to make them unique!
  //fskipTo() returns off_t; keeping the offset in an int would truncate
  //it for large files
  off_t ctgpos;
  numContigs=0;
  while ((ctgpos=fskipTo(">"))>=0) { //locate the contig line
    numContigs++;
    LytCtgData* ctgdata=new LytCtgData(ctgpos);
    //skip the leading '>'; p ends up just past the contig name
    char* p=ctgdata->readName(linebuf->chars()+1, ctgIDs);
    if (p==NULL) {
      GMessage("LayoutParser: error parsing contig name:\n%s\n",
               linebuf->chars());
      return false;
    }
    int numseqs=0;
    int ctg_lpos=0;
    int ctg_rpos=0;
    //p must be after contig name within linebuf!
    //only the sequence count is mandatory
    if (sscanf(p, "%d %d %d", &numseqs, &ctg_lpos, &ctg_rpos)<1) {
      GMessage("Error parsing contig len and seq count at:\n%s\n", p);
      return false;
    }
    ctgdata->numseqs=numseqs;
    ctgdata->rpos=ctg_rpos;
    ctgdata->lpos=ctg_lpos;
    ctgdata->len=ctg_rpos-ctg_lpos+1;
    ctgdata->offs=ctg_lpos;
    int ctgidx=contigs.Add(ctgdata);
    //now look and load all the component sequences
    loadContig(ctgidx, seqfn, false);
  } //while lines
  contigs.setSorted(true);
  return true;
}
// Adds a record to the fasta index, or overwrites the data of an existing
// record with the same name (printing a warning).
//  seqname   : sequence ID, used as the key in `records`
//  seqlen    : sequence length
//  foffs     : file offset stored in the record (fpos)
//  llen      : stored as line_len
//  llen_full : stored as line_blen
void GFastaIndex::addRecord(const char* seqname, uint seqlen, off_t foffs,
                            int llen, int llen_full) {
  GFastaRec* farec=records.Find(seqname);
  if (farec!=NULL) {
    //the format string has a %s conversion: the sequence name must be
    //passed along, otherwise the call reads a non-existent argument
    GMessage("Warning: duplicate sequence ID (%s) added to the fasta index! Only last entry data will be kept.\n",
             seqname);
    farec->seqlen=seqlen;
    farec->fpos=foffs;
    farec->line_len=llen;
    farec->line_blen=llen_full;
  }
  else {
    farec=new GFastaRec(seqlen,foffs,llen,llen_full);
    records.Add(seqname,farec);
    //presumably the hash keeps its own copy of the key and the record
    //points to that copy - verify against GHash::getLastKey()
    farec->seqname=records.getLastKey();
  }
}
void GWait::GUIDoWait(int code) { dowait_cnt += code; if(dowait_cnt > 0) { if(code > 0 && dowait_cnt == 1 && NULL == dowait_win) { dowait_win = new GWait(); if(dowait_win) { dowait_win->nextObj =Gdesktop->parent->focus->nextObj; Gdesktop->parent->focus->nextObj = dowait_win; //adds the new item to the Z list dowait_win->parent = Gdesktop->parent; //LCD GQueue.push(GMessage(WM_INIT, 0, (long long)((LCD_MULT *)(Gdesktop->parent))->lcd, dowait_win)); } if(!dowait_win) dowait_cnt = 0; } else { if(dowait_win) { if(code == 1) // begin wait dowait_win->add_owner(); if(code == 0) // restore wait dowait_win->SetTimer(ID_BUSY_CLOCK, BUSY_START_TIME); } } } else { if(dowait_win) { GMessage msg; dowait_win->process_destroy(msg); delete dowait_win; dowait_win = NULL; } dowait_cnt = 0; // prevent underflow } dowait_locker = NULL; }
int GFastaIndex::loadIndex(const char* finame) { //load record info from existing fasta index if (finame==NULL) finame=fai_name; if (finame!=fai_name) { fai_name=Gstrdup(finame); } if (fai_name==NULL) GError("Error: GFastaIndex::loadIndex() called with no file name!\n"); records.Clear(); haveFai=false; FILE* fi=fopen(fai_name,"rb"); if (fi==NULL) { GMessage("Warning: cannot open fasta index file: %s!\n",fai_name); return 0; } GLineReader fl(fi); char* s=NULL; while ((s=fl.nextLine())!=NULL) { if (*s=='#') continue; char* p=strchrs(s,"\t "); if (p==NULL) GError(ERR_FAIDXLINE,s); *p=0; //s now holds the genomic sequence name p++; uint len=0; int line_len=0, line_blen=0; #ifdef __WIN32__ long offset=-1; sscanf(p, "%d%ld%d%d", &len, &offset, &line_len, &line_blen); #else long long offset=-1; sscanf(p, "%d%lld%d%d", &len, &offset, &line_len, &line_blen); #endif if (len==0 || line_len==0 || line_blen==0 || line_blen<line_len) GError(ERR_FAIDXLINE,p); addRecord(s,len,offset,line_len, line_blen); } fclose(fi); haveFai=(records.Count()>0); return records.Count(); }
// Reads lines from the current file position until one starts with
// `linestart`; that line is left in linebuf.
// Returns:
//   the file offset of the beginning of the matching line,
//   -1 if no such line was found before the input ended,
//   -2 if a line starting with `butnot` (when given) was met first; a
//      message is printed in that case.
off_t LayoutParser::fskipTo(const char* linestart, const char* butnot) {
  off_t linepos=getFilePos();
  const int wantedLen=strlen(linestart);
  const int unwantedLen=(butnot!=NULL) ? strlen(butnot) : 0;
  for (;;) {
    if (linebuf->getLine(f, f_pos)==NULL) return -1;
    const bool unwanted = (unwantedLen>0) &&
                          startsWith(linebuf->line(), butnot, unwantedLen);
    if (unwanted) {
      GMessage("fSkipTo: unwanted line '%s' encountered when searching for '%s'\n",
               linebuf->line(), linestart);
      return -2;
    }
    if (startsWith(linebuf->chars(), linestart, wantedLen)) return linepos;
    //remember where the next line starts
    linepos=getFilePos();
  }
}
// Reads all transcripts from the GTF/GFF3 stream `f`, distributes them
// into `seqdata` (through parse_mRNAs) and clusters them into loci, one
// genomic sequence at a time.
//  f              : input stream handed to GffReader
//  seqdata        : receives the per-genomic-sequence transcript data
//  ref_data       : reference data set; NULL (or &seqdata) means that the
//                   data being read IS the reference set
//  check_for_dups : passed to parse_mRNAs (duplicate/redundancy checking)
//  qfidx          : passed to parse_mRNAs and cluster_mRNAs
//  fname          : input file name; only used to derive `bname`, which no
//                   live code reads any more
//  only_multiexon : passed to parse_mRNAs
// Progress messages are printed only when gtf_tracking_verbose is set.
void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data,
                int check_for_dups, int qfidx, const char* fname, bool only_multiexon) {
  //>>>>> read all transcripts/features from a GTF/GFF3 file
  //int imrna_counter=0;
#ifdef HEAPROFILE
  if (IsHeapProfilerRunning()) HeapProfilerDump("00");
#endif
  int loci_counter=0;
  if (ref_data==NULL) ref_data=&seqdata;
  //reading the reference set itself when both point to the same list
  bool isRefData=(&seqdata==ref_data);
  //(f, transcripts_only)
  GffReader* gffr=new GffReader(f, true); //load only transcript annotations
  gffr->showWarnings(gtf_tracking_verbose);
  //            keepAttrs   mergeCloseExons   noExonAttrs
  gffr->readAll(!isRefData, true, isRefData || gtf_tracking_largeScale);
  //so it will read exon attributes only for low number of Cufflinks files
#ifdef HEAPROFILE
  if (IsHeapProfilerRunning()) HeapProfilerDump("post_readAll");
#endif
  //d is the number of transcripts parse_mRNAs dropped as duplicates
  int d=parse_mRNAs(gffr->gflst, seqdata, isRefData, check_for_dups, qfidx,only_multiexon);
#ifdef HEAPROFILE
  if (IsHeapProfilerRunning()) HeapProfilerDump("post_parse_mRNAs");
#endif
  if (gtf_tracking_verbose && d>0) {
    if (isRefData) GMessage(" %d duplicate reference transcripts discarded.\n",d);
    else GMessage(" %d redundant query transfrags discarded.\n",d);
  }
  //imrna_counter=gffr->mrnas.Count();
  delete gffr; //free the extra memory and unused GffObjs
#ifdef HEAPROFILE
  if (IsHeapProfilerRunning()) HeapProfilerDump("post_del_gffr");
#endif
  //for each genomic sequence, cluster transcripts
  int oriented_by_overlap=0;
  int initial_unoriented=0;
  int final_unoriented=0;
  //base name of the input file: text from the last '.' on is cut, then
  //the part up to the last path separator is removed
  //NOTE(review): bname and s are only referenced by the commented-out
  //code below
  GStr bname(fname);
  GStr s;
  if (!bname.is_empty()) {
    int di=bname.rindex('.');
    if (di>0) bname.cut(di);
    int p=bname.rindex('/');
    if (p<0) p=bname.rindex('\\');
    if (p>=0) bname.remove(0,p);
  }
  //both stay NULL in this function unless fix_umrnas() opens fdis
  //(it receives fdis; presumably by reference - TODO confirm)
  FILE* fdis=NULL;
  FILE* frloci=NULL;
  for (int g=0;g<seqdata.Count();g++) {
    //find the corresponding refseqdata with the same gseq_id
    int gseq_id=seqdata[g]->get_gseqid();
    if (!isRefData) { //query data, find corresponding ref data
      GSeqData* rdata=getRefData(gseq_id, *ref_data);
      initial_unoriented+=seqdata[g]->umrnas.Count();
      if (seqdata[g]->umrnas.Count()>0) {
        //try to assign a strand to the unoriented transfrags; whatever is
        //left in umrnas afterwards is still unoriented
        oriented_by_overlap+=fix_umrnas(*seqdata[g], rdata, fdis);
        final_unoriented+=seqdata[g]->umrnas.Count();
      }
    }
    //>>>>> group mRNAs into locus-clusters (based on exon overlap)
    cluster_mRNAs(seqdata[g]->mrnas_f, seqdata[g]->loci_f, qfidx);
    cluster_mRNAs(seqdata[g]->mrnas_r, seqdata[g]->loci_r, qfidx);
    if (!isRefData) {
      cluster_mRNAs(seqdata[g]->umrnas, seqdata[g]->nloci_u, qfidx);
    }
    loci_counter+=seqdata[g]->loci_f.Count();
    loci_counter+=seqdata[g]->loci_r.Count();
    // if (refData) {
    //   if (frloci==NULL) {
    //     s=bname;
    //     s.append(".loci.lst");
    //     frloci=fopen(s.chars(), "w");
    //   }
    //   writeLoci(frloci, seqdata[g]->loci_f);
    //   writeLoci(frloci, seqdata[g]->loci_r);
    // }//write ref loci
  }//for each genomic sequence
  if (fdis!=NULL) fclose(fdis);
  if (frloci!=NULL) fclose(frloci);
  if (initial_unoriented || final_unoriented) {
    if (gtf_tracking_verbose) GMessage(" Found %d transfrags with undetermined strand (%d out of initial %d were fixed by overlaps)\n", final_unoriented, oriented_by_overlap, initial_unoriented);
  }
  //if (fdis!=NULL) remove(s.chars()); remove 0-length file
#ifdef HEAPROFILE
  if (IsHeapProfilerRunning()) HeapProfilerDump("post_cluster");
#endif
}
// Reads all the features from the loader's input file `f` and places them
// into `seqdata` (one GenomicSeqData per genomic sequence).
//  seqdata     : receives the loaded features; new per-sequence entries
//                are created as needed
//  gf_validate : optional filter; a feature is skipped when it returns
//                false for it
//  doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan :
//                passed unchanged to placeGf()
//  forceExons  : when set, exon_ftype_id of every feature is set to
//                gff_fid_exon
// When noPseudo is set, features flagged as pseudo (by feature type name
// or by attribute) are skipped. "locus" meta-features carrying a
// "transcripts" attribute are always skipped.
// The input file is closed (unless it is stdin) and the reader is deleted
// before returning.
void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate,
                     bool doCluster, bool doCollapseRedundant,
                     bool matchAllIntrons, bool fuzzSpan, bool forceExons) {
  GffReader* gffr=new GffReader(f, this->transcriptsOnly, false); //not only mRNA features, not sorted
  gffr->showWarnings(this->showWarnings);
  //            keepAttrs           mergeCloseExons         noExonAttr
  gffr->readAll(this->fullAttributes, this->mergeCloseExons, this->noExonAttrs);
  //ids of the feature types / attribute names that mark a pseudo feature
  GVec<int> pseudoAttrIds;
  GVec<int> pseudoFeatureIds;
  if (this->noPseudo) {
    //feature type names that start with "pseudo"
    GffNameList& fnames = gffr->names->feats;
    for (int i=0;i<fnames.Count();i++) {
      char* n=fnames[i]->name;
      if (startsWith(n, "pseudo")) {
        pseudoFeatureIds.Add(fnames[i]->idx);
      }
    }
    //attribute names that start with "pseudo", or with "is" followed by
    //"pseudo" (the "is" test is case-insensitive)
    GffNameList& attrnames = gffr->names->attrs;
    for (int i=0;i<attrnames.Count();i++) {
      char* n=attrnames[i]->name;
      char* p=strifind(n, "pseudo");
      if (p==n || (p==n+2 && tolower(n[0])=='i' && tolower(n[1])=='s')) {
        pseudoAttrIds.Add(attrnames[i]->idx);
      }
    }
  }
  //int redundant=0; //redundant annotation discarded
  if (verbose) GMessage(" .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
  //int rna_deleted=0;
  //add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
  for (int k=0;k<gffr->gflst.Count();k++) {
    GffObj* m=gffr->gflst[k];
    if (strcmp(m->getFeatureName(), "locus")==0 && m->getAttr("transcripts")!=NULL) {
      continue; //discard locus meta-features
    }
    if (this->noPseudo) {
      bool is_pseudo=false;
      //1) pseudo by feature type
      for (int i=0;i<pseudoFeatureIds.Count();++i) {
        if (pseudoFeatureIds[i]==m->ftype_id) {
          is_pseudo=true;
          break;
        }
      }
      if (is_pseudo) continue;
      //2) pseudo by attribute: the attribute must be present with a value
      //   starting with 't', 'y' or '1' (case-insensitive)
      for (int i=0;i<pseudoAttrIds.Count();++i) {
        char* attrv=NULL;
        if (m->attrs!=NULL) attrv=m->attrs->getAttr(pseudoAttrIds[i]);
        if (attrv!=NULL) {
          char fc=tolower(attrv[0]);
          if (fc=='t' || fc=='y' || fc=='1') {
            is_pseudo=true;
            break;
          }
        }
      }
      if (is_pseudo) continue;
      //last resort:
      // scan all the attribute values for "pseudogene" keyword (NCBI does that for "product" attr)
      /*
      if (m->attrs!=NULL) {
        for (int i=0;i<m->attrs->Count();++i) {
          GffAttr& a=*(m->attrs->Get(i));
          if (strifind(a.attr_val, "pseudogene")) {
            is_pseudo=true;
            break;
          }
        }
      }
      if (is_pseudo) continue;
      */
    } //pseudogene detection requested
    //drop "locus" attributes whose value starts with "RLOC_"
    char* rloc=m->getAttr("locus");
    if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
      m->removeAttr("locus", rloc);
    }
    /*
    if (m->exons.Count()==0 && m->children.Count()==0) {
      //a non-mRNA feature with no subfeatures
      //add a dummy exon just to have the generic exon checking work
      m->addExon(m->start,m->end);
    }
    */
    if (forceExons) { // && m->children.Count()==0) {
      m->exon_ftype_id=gff_fid_exon;
    }
    //GList<GffObj> gfadd(false,false); -- for gf_validate()?
    if (gf_validate!=NULL && !(*gf_validate)(m, NULL)) {
      continue;
    }
    m->isUsed(true); //so the gffreader won't destroy it
    int i=-1;
    //search key for this genomic sequence
    //(note: this local `f` shadows the member `f` inside the loop body)
    GenomicSeqData f(m->gseq_id);
    GenomicSeqData* gdata=NULL;
    if (seqdata.Found(&f,i)) gdata=seqdata[i];
    else { //entry not created yet for this genomic seq
      gdata=new GenomicSeqData(m->gseq_id);
      seqdata.Add(gdata);
    }
    /*
    for (int k=0;k<gfadd.Count();k++) {
      bool keep=placeGf(gfadd[k], gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
      if (!keep) {
        gfadd[k]->isUsed(false);
        //DEBUG
        GMessage("Feature %s(%d-%d) is going to be discarded..\n",gfadd[k]->getID(), gfadd[k]->start, gfadd[k]->end);
      }
    }
    */
    bool keep=placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
    if (!keep) {
      //not placed anywhere: clear the "used" flag set above
      m->isUsed(false);
      //DEBUG
      //GMessage("Feature %s(%d-%d) is going to be discarded..\n",m->getID(), m->start, m->end);
    }
  } //for each read gffObj
  //if (verbose) GMessage(" .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
  //here `f` is the member again (the input file)
  if (f!=stdin) { fclose(f); f=NULL; }
  delete gffr;
}
// Distributes the transcripts in `mrnas` into the per-genomic-sequence
// entries of `glstdata`, by strand, optionally discarding duplicates.
//  mrnas          : transcripts to process
//  glstdata       : per-genomic-sequence data; entries are created as needed
//  is_ref_set     : true when `mrnas` is the reference set; reference
//                   transcripts with strand '.' are dropped, while query
//                   transfrags without a +/- strand go to `umrnas`
//  check_for_dups : non-zero enables duplicate detection; for query
//                   transfrags (check_for_dups>1) is passed on to
//                   is_Redundant()
//  qfidx          : stored as `qset` in the CTData of each kept transcript
//  only_multiexon : skip transcripts with fewer than 2 exons
// For query transfrags the FPKM (or RPKM), cov, conf_hi and conf_lo
// attributes are parsed and stored in the transcript's CTData.
// Returns the number of transcripts found to be duplicates/redundant.
int parse_mRNAs(GfList& mrnas, GList<GSeqData>& glstdata, bool is_ref_set,
                int check_for_dups, int qfidx, bool only_multiexon) {
  int tredundant=0; //redundant transcripts discarded
  int total_kept=0;
  int total_seen=mrnas.Count();
  for (int k=0;k<mrnas.Count();k++) {
    GffObj* m=mrnas[k];
    int i=-1;
    GSeqData f(m->gseq_id); //search key for this genomic sequence
    GSeqData* gdata=NULL;
    uint tlen=m->len();
    if (m->hasErrors() || (tlen+500>GFF_MAX_LOCUS)) { //should probably report these in a file too..
      if (gtf_tracking_verbose) GMessage("Warning: transcript %s discarded (structural errors found, length=%d).\n", m->getID(), tlen);
      continue;
    }
    if (only_multiexon && m->exons.Count()<2) {
      continue;
    }
    //GStr feature(m->getFeatureName());
    //feature.lower();
    //bool gene_or_locus=(feature.endsWith("gene") ||feature.index("loc")>=0);
    //if (m->exons.Count()==0 && gene_or_locus) {
    if (m->isDiscarded()) {
      //discard generic "gene" or "locus" features with no other detailed subfeatures
      if (!is_ref_set && gtf_tracking_verbose) GMessage("Warning: discarding non-transfrag (GFF generic gene/locus container?) %s\n",m->getID());
      continue;
    }
    if (m->exons.Count()==0) {
      if (gtf_tracking_verbose && !is_ref_set) GMessage("Warning: %s %s found without exon segments (adding default exon).\n",m->getFeatureName(), m->getID());
      //a single exon spanning the whole feature
      m->addExon(m->start,m->end);
    }
    if (glstdata.Found(&f,i)) gdata=glstdata[i];
    else { //first transcript on this genomic sequence
      gdata=new GSeqData(m->gseq_id);
      glstdata.Add(gdata);
    }
    double fpkm=0;
    double cov=0;
    double conf_hi=0;
    double conf_lo=0;
    GList<GffObj>* target_mrnas=NULL;
    if (is_ref_set) { //-- ref transcripts
      if (m->strand=='.') {
        //unknown strand - discard from reference set (!)
        continue;
      }
      total_kept++;
      target_mrnas=(m->strand=='+') ? &(gdata->mrnas_f) : &(gdata->mrnas_r);
      if (check_for_dups) {
        //check the same-strand reference list for duplicate ref transcripts
        int rpidx=-1;
        GffObj* rp= is_RefDup(m, *target_mrnas, rpidx);
        if (rp!=NULL) { //duplicate found
          //discard one of them
          //but let's keep the gene_name if present
          //DEBUG:
          //GMessage("Ref duplicates: %s = %s\n", rp->getID(), m->getID());
          tredundant++;
          total_kept--;
          if (betterDupRef(rp, m)) {
            //keep the transcript already stored, drop the new one
            if (rp->getGeneName()==NULL && m->getGeneName()!=NULL) {
              rp->setGeneName(m->getGeneName());
            }
            continue;
          }
          else {
            //drop the stored transcript; the new one is added below
            if (m->getGeneName()==NULL && rp->getGeneName()!=NULL) {
              m->setGeneName(rp->getGeneName());
            }
            ((CTData*)(rp->uptr))->mrna=NULL;
            rp->isUsed(false);
            //presumably Forget() detaches the object so that Delete() only
            //removes the list slot - TODO confirm against GList
            target_mrnas->Forget(rpidx);
            target_mrnas->Delete(rpidx);
          }
        }
      } //check for duplicate ref transcripts
    } //ref transcripts
    else { //-- query transfrags
      if (m->strand=='+') { target_mrnas = &(gdata->mrnas_f); }
      else if (m->strand=='-') { target_mrnas=&(gdata->mrnas_r); }
      else { m->strand='.'; target_mrnas=&(gdata->umrnas); } //unoriented
      total_kept++;
      if (check_for_dups) { //check for redundancy
        // check if there is a redundancy between this and another already loaded Cufflinks transcript
        int cidx = is_Redundant(m, target_mrnas, (check_for_dups>1));
        if (cidx>=0) {
          //always discard the redundant transcript with the fewer exons OR shorter
          tredundant++;
          total_kept--;
          if (t_dominates(target_mrnas->Get(cidx),m)) {
            //new transcript is shorter, discard it
            if (gtf_tracking_verbose) {
              GMessage(" transfrag %s discarded (made redundant by %s)\n", m->getID(), target_mrnas->Get(cidx)->getID());
            }
            continue;
          }
          else {
            //discard the older transfrag
            if (gtf_tracking_verbose) {
              GMessage(" transfrag %s discarded (made redundant by %s)\n", target_mrnas->Get(cidx)->getID(), m->getID());
            }
            ((CTData*)(target_mrnas->Get(cidx)->uptr))->mrna=NULL;
            target_mrnas->Get(cidx)->isUsed(false);
            target_mrnas->Forget(cidx);
            target_mrnas->Delete(cidx);
            //the uptr (CTData) pointer will still be kept in gdata->ctdata and deallocated eventually
          }
        }
      }// redundant transfrag check
      if (m->gscore==0.0) m->gscore=m->exons[0]->score; //Cufflinks exon score = isoform abundance
      //attribute values may be quoted: a leading '"' is skipped before the
      //number is parsed
      //const char* expr = (gtf_tracking_largeScale) ? m->getAttr("FPKM") : m->exons[0]->getAttr(m->names,"FPKM");
      const char* expr = m->getAttr("FPKM");
      if (expr!=NULL) {
        if (expr[0]=='"') expr++;
        fpkm=strtod(expr, NULL);
      }
      else { //backward compatibility: read RPKM if FPKM not found
        //expr=(gtf_tracking_largeScale) ? m->getAttr("RPKM") : m->exons[0]->getAttr(m->names,"RPKM");
        expr=m->getAttr("RPKM");
        if (expr!=NULL) {
          if (expr[0]=='"') expr++;
          fpkm=strtod(expr, NULL);
        }
      }
      //const char* scov=(gtf_tracking_largeScale) ? m->getAttr("cov") : m->exons[0]->getAttr(m->names,"cov");
      const char* scov=m->getAttr("cov");
      if (scov!=NULL) {
        if (scov[0]=='"') scov++;
        cov=strtod(scov, NULL);
      }
      //const char* sconf_hi=(gtf_tracking_largeScale) ? m->getAttr("conf_hi") : m->exons[0]->getAttr(m->names,"conf_hi");
      const char* sconf_hi=m->getAttr("conf_hi");
      if (sconf_hi!=NULL){
        if (sconf_hi[0]=='"') sconf_hi++;
        conf_hi=strtod(sconf_hi, NULL);
      }
      //const char* sconf_lo=(gtf_tracking_largeScale) ? m->getAttr("conf_lo") : m->exons[0]->getAttr(m->names,"conf_lo");
      const char* sconf_lo=m->getAttr("conf_lo");
      if (sconf_lo!=NULL) {
        if (sconf_lo[0]=='"') sconf_lo++;
        conf_lo=strtod(sconf_lo, NULL);
      }
    } //Cufflinks transfrags
    //the transcript is kept: add it to the list chosen above and attach
    //its CTData
    target_mrnas->Add(m);
    m->isUsed(true);
    CTData* mdata=new CTData(m);
    mdata->qset=qfidx;
    gdata->tdata.Add(mdata);
    if (!is_ref_set) {
      // Cufflinks - attributes parsing
      mdata->FPKM=fpkm;
      mdata->cov=cov;
      mdata->conf_hi=conf_hi;
      mdata->conf_lo=conf_lo;
    }
  }//for each mrna read
  if (gtf_tracking_verbose) {
    if (is_ref_set) GMessage(" Kept %d ref transcripts out of %d\n", total_kept, total_seen);
    else GMessage(" Kept %d transfrags out of %d\n", total_kept, total_seen);
  }
  //if (mrna_deleted>0)
  //  mrnas.Pack();
  //return (is_ref_set ? refdiscarded : tredundant);
  return tredundant;
}
// Command line driver: makes sure a fasta index (.fai) is available for
// the given multi-fasta file and, when -q is given, writes the requested
// sequence (or a range of it) in fasta format.
// Options (from the GArgs format string "hFCq:r:o:"):
//   -h        print USAGE and exit
//   -F        do not look for an existing index; build one and store it
//             under the index file name stripped of its directory part
//   -q <id>   sequence to extract (nothing is extracted without it)
//   -r <rng>  range, as "start-end", "start:length" or "start..end"
//   -C        passed to copyRange() as revCompl
//   -o <file> output file (stdout when not given)
int main(int argc, char * const argv[]) {
  GArgs args(argc, argv, "hFCq:r:o:");
  int e;
  if ((e=args.isError())>0)
    GError("%s\nInvalid argument: %s\n", USAGE, argv[e]);
  if (args.getOpt('h')!=NULL){
    GMessage("%s\n", USAGE);
    exit(1);
  }
  args.startNonOpt();
  //the first non-option argument is the multi-fasta file
  GStr fadb(args.nextNonOpt());
  if (fadb.is_empty()) GError("%s Error: multi-fasta file expected!\n",USAGE);
  //index file name: <fasta file>.fai
  GStr fname(fadb);
  fname.append(".fai");
  bool createLocal=(args.getOpt('F')!=NULL);
  const char* idxname=(createLocal)? NULL : fname.chars();
  GFastaIndex faidx(fadb.chars(), idxname);
  //also tried to load the index if exists in the current directory
  GStr fnamecwd(fname); //name in current directory (without path)
  int ip=-1;
  if ((ip=fnamecwd.rindex(CHPATHSEP))>=0) {
    fnamecwd.cut(0,ip+1);
  }
  if (!createLocal) { //look for existing indexes to load
    //try the same directory as the fasta file first
    if (!faidx.hasIndex() and fileExists(fnamecwd.chars())>1) { //try current working directory next
      faidx.loadIndex(fnamecwd.chars());
    }
    if (!faidx.hasIndex()) {//could not load any index data
      //try to create it in the same directory as the fasta file
      GMessage("No fasta index found. Rebuilding..\n");
      faidx.buildIndex();
      if (faidx.getCount()==0) GError("Error: no fasta records to be indexed!\n");
      GMessage("Fasta index rebuilt.\n");
      //check if we can create a file there
      FILE* fcreate=fopen(fname.chars(), "w");
      if (fcreate==NULL)
        GMessage("Warning: cannot create fasta index %s! (permissions?)\n", fname.chars());
      else {
        fclose(fcreate);
        if (faidx.storeIndex(fname.chars())<faidx.getCount())
          GMessage("Warning: error writing the index file %s!\n",fname.chars());
      } //creating index file in the same directory as fasta file
    }//trying to create the index file
  }
  if (createLocal || !faidx.hasIndex()) {
    //simply rebuild the index in the current directory and use it:
    //remove directories in path, if any
    if (faidx.getCount()==0) {
      faidx.buildIndex();
      if (faidx.getCount()==0) GError("Error: no fasta records to be indexed!\n");
    }
    if (faidx.storeIndex(fnamecwd.chars())<faidx.getCount())
      GMessage("Warning: error writing the index file %s!\n",fnamecwd.chars());
  }
  //without a query there is nothing else to do
  GStr qry(args.getOpt('q'));
  if (qry.is_empty()) exit(0);
  GFastaRec* farec=faidx.getRecord(qry.chars());
  if (farec==NULL) {
    GMessage("Error: couldn't find fasta record for '%s'!\n",qry.chars());
    exit(1);
  }
  GFaSeqGet faseq(fadb.chars(),farec->seqlen, farec->fpos, farec->line_len, farec->line_blen);
  //TODO: read these from -r option
  //0 means "not given": replaced below by 1 and the sequence length
  uint qstart=0;
  uint qend=0; //farec->seqlen
  bool revCompl=(args.getOpt('C')!=NULL);
  char* s=args.getOpt('r');
  if (s!=NULL) {
    //the first character after the leading digits selects the range syntax
    char *p=s;
    while (isdigit(*p)) p++;
    if (*p=='-') { //start-end
      sscanf(s,"%u-%u",&qstart, &qend);
      if (qstart==0 || qend==0)
        GError("Error parsing sequence range: %s\n",s);
    }
    else if (*p==':') { //start:length
      int qlen=0;
      sscanf(s,"%u:%d", &qstart, &qlen);
      if (qstart==0 || qlen==0)
        GError("Error parsing sequence range: %s\n",s);
      qend=qstart+qlen-1;
    }
    else if (*p=='.') { //start..end
      sscanf(s,"%u..%u",&qstart, &qend);
      if (qstart==0 || qend==0)
        GError("Error parsing sequence range: %s\n",s);
    }
  }
  if (qstart==0) qstart=1;
  if (qend==0) qend=farec->seqlen;
  // call faseq.loadall() here if multiple ranges are to be extracted all
  // over this genomic sequence
  char* subseq=faseq.copyRange(qstart, qend, revCompl, true);
  FILE* f_out=NULL;
  openfwrite(f_out, args, 'o');
  if (f_out==NULL) f_out=stdout;
  writeFasta(f_out, qry.chars(), NULL, subseq, 70, qend-qstart+1);
  GFREE(subseq);
}
bool GffLoader::placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant, bool matchAllIntrons, bool fuzzSpan) { bool keep=false; GTData* tdata=NULL; //int tidx=-1; /* if (debug) { GMessage(">>Placing transcript %s\n", t->getID()); debugState=true; } else debugState=false; */ //dumb TRNA case for RefSeq: gene parent link missing //try to restore it here; BUT this only works if gene feature comes first ////DEBUG ONLY: //if (strcmp(t->getID(),"id24448")==0) { //&& t->start==309180) { // GMessage("placeGf %s (%d, %d) (%d exons)\n", t->getID(),t->start, t->end, t->exons.Count()); //} //GMessage("DBG>>Placing transcript %s(%d-%d, %d exons)\n", t->getID(), t->start, t->end, t->exons.Count()); if (t->parent==NULL && t->isTranscript()) { int gidx=gdata->gfs.Count()-1; while (gidx>=0 && gdata->gfs[gidx]->end>=t->start) { GffObj& g = *(gdata->gfs[gidx]); if (g.isGene() && t->strand==g.strand && exonOverlap2Gene(t, g)) { g.children.Add(t); keep=true; if (tdata==NULL) { tdata=new GTData(t); //additional transcript data gdata->tdata.Add(tdata); } t->parent=&g; //disable printing of gene if transcriptsOnly if (transcriptsOnly) { g.udata|=4; //tag it as non-printable } const char* geneName=g.getAttr("Name"); if (t->getAttr("Name")==NULL && geneName) { t->addAttr("Name", geneName); t->addAttr("gene_name", geneName); } t->addAttr("geneID", g.getID()); break; } --gidx; } } /* if (t->exons.Count()==0 && t->children.Count()==0 && forceExons) { //a non-mRNA feature with no subfeatures //just so we get some sequence functions working, add a dummy "exon"-like subfeature here //--this could be a single "pseudogene" entry or another genomic region without exons // t->addExon(t->start,t->end); } */ if (t->exons.Count()>0) { gdata->rnas.Add(t); //added it in sorted order if (tdata==NULL) { tdata=new GTData(t); //additional transcript data gdata->tdata.Add(tdata); } keep=true; } else { if (t->isGene() || !this->transcriptsOnly) { gdata->gfs.Add(t); keep=true; //GTData* 
tdata=new GTData(t); //additional transcript data if (tdata==NULL) { tdata=new GTData(t); //additional transcript data gdata->tdata.Add(tdata); } return true; } else return false; //nothing to do with these non-transcript objects } if (!doCluster) return keep; if (!keep) return false; //---- place into a locus if (gdata->loci.Count()==0) { gdata->loci.Add(new GffLocus(t)); return true; //new locus on this ref seq } int nidx=qsearch_gloci(t->end, gdata->loci); //get index of nearest locus starting just ABOVE t->end //GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx); if (nidx==0) { //cannot have any overlapping loci //if (debug) GMessage(" <<no ovls possible, create locus %d-%d \n",t->start, t->end); gdata->loci.Add(new GffLocus(t)); return true; } if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end int lfound=0; //count of parent loci GArray<int> mrgloci(false); GList<GffLocus> tloci(true); //candidate parent loci to adopt this //if (debug) GMessage("\tchecking all loci from %d to 0\n",nidx-1); for (int l=nidx-1;l>=0;l--) { GffLocus& loc=*(gdata->loci[l]); if (loc.strand!='.' 
&& t->strand!='.'&& loc.strand!=t->strand) continue; if (t->start>loc.end) { if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already continue; } if (loc.start>t->end) { //this should never be the case if nidx was found correctly GMessage("Warning: qsearch_gloci found loc.start>t.end!(t=%s)\n", t->getID()); continue; } if (loc.add_RNA(t)) { //will add this transcript to loc lfound++; mrgloci.Add(l); if (collapseRedundant) { //compare to every single transcript in this locus for (int ti=0;ti<loc.rnas.Count();ti++) { if (loc.rnas[ti]==t) continue; GTData* odata=(GTData*)(loc.rnas[ti]->uptr); //GMessage(" ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID()); GffObj* container=NULL; if (odata->replaced_by==NULL && (container=redundantTranscripts(*t, *(loc.rnas[ti]), matchAllIntrons, fuzzSpan))!=NULL) { if (container==t) { odata->replaced_by=t; preserveContainedCDS(t, loc.rnas[ti]); } else {// t is being replaced by previously defined transcript tdata->replaced_by=loc.rnas[ti]; preserveContainedCDS(loc.rnas[ti], t); } } }//for each transcript in the exon-overlapping locus } //if doCollapseRedundant } //overlapping locus } //for each existing locus if (lfound==0) { //overlapping loci not found, create a locus with only this mRNA int addidx=gdata->loci.Add(new GffLocus(t)); if (addidx<0) { //should never be the case! 
GMessage(" WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end); } } else { //found at least one overlapping locus lfound--; int locidx=mrgloci[lfound]; GffLocus& loc=*(gdata->loci[locidx]); //last locus index found is also the smallest index if (lfound>0) { //more than one loci found parenting this mRNA, merge loci /* if (debug) GMessage(" merging %d loci \n",lfound); */ for (int l=0;l<lfound;l++) { int mlidx=mrgloci[l]; loc.addMerge(*(gdata->loci[mlidx]), t); gdata->loci.Delete(mlidx); //highest indices first, so it's safe to remove } } int i=locidx; while (i>0 && loc<*(gdata->loci[i-1])) { //bubble down until it's in the proper order i--; gdata->loci.Swap(i,i+1); } }//found at least one overlapping locus return true; }
void GBamRecord::add_aux(const char* str) { //requires: being called AFTER add_quals() int strl=strlen(str); //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux; //int doff0=doff; if (strl < 6 || str[2] != ':' || str[4] != ':') parse_error("missing colon in auxiliary data"); tag[0] = str[0]; tag[1] = str[1]; uint8_t atype = str[3]; uint8_t* adata=abuf; int alen=0; if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility atype='A'; alen=1; adata=(uint8_t*)&str[5]; } else if (atype == 'I' || atype == 'i') { long long x=strtoll(str+5, NULL, 10); //(long long)atoll(str + 5); //long x=(long)atol(str + 5); if (x < 0) { if (x >= -127) { atype='c'; abuf[0] = (int8_t)x; alen=1; } else if (x >= -32767) { atype = 's'; *(int16_t*)abuf = (int16_t)x; alen=2; } else { atype='i'; *(int32_t*)abuf = (int32_t)x; alen=4; if (x < -2147483648ll) GMessage("Parse warning: integer %lld is out of range.", x); } } else { //x >=0 if (x <= 255) { atype = 'C'; abuf[0] = (uint8_t)x; alen=1; } else if (x <= 65535) { atype='S'; *(uint16_t*)abuf = (uint16_t)x; alen=2; } else { atype='I'; *(uint32_t*)abuf = (uint32_t)x; alen=4; if (x > 4294967295ll) GMessage("Parse warning: integer %lld is out of range.", x); } } } //integer type else if (atype == 'f') { *(float*)abuf = (float)atof(str + 5); alen = sizeof(float); } else if (atype == 'd') { //? *(float*)abuf = (float)atof(str + 9); alen=8; } else if (atype == 'Z' || atype == 'H') { if (atype == 'H') { // check whether the hex string is valid if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even"); for (int i = 0; i < strl - 5; ++i) { int c = toupper(str[5 + i]); if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))) parse_error("invalid hex character"); } } memcpy(abuf, str + 5, strl - 5); abuf[strl-5] = 0; alen=strl-4; } else parse_error("unrecognized aux type"); this->add_aux(tag, atype, alen, adata); }//add_aux()
bool AceParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) { bool forgetCtg = false; if (ctgidx>=contigs.Count()) GError("LayoutParser: invalid contig index '%d'\n", ctgidx); LytCtgData* ctgdata=contigs[ctgidx]; if (re_pos && currentContig!=NULL) { //free previously loaded contig data currentContig->seqs.Clear(); // unless it was a parse() call seqinfo.Clear(); } currentContig=ctgdata; int ctg_numSeqs=ctgdata->numseqs; if (re_pos) { seek(ctgdata->fpos); //position right where the contig definition starts char *r = linebuf->getLine(f,f_pos); if (r==NULL) return false; } if (seqfn!=NULL) { //process the contig sequence! char* ctgseq=readSeq(); forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, ctgseq); GFREE(ctgseq); //obviously the caller should have made a copy } //now look for all the component sequences if (fskipTo("AF ")<0) { GMessage("AceParser: error finding sequence offsets (AF)" " for contig '%s' (%d)\n", ctgdata->name, ctgdata->len); return false; } int numseqs=0; while (startsWith(linebuf->chars(), "AF ",3)) { if (addSeq(linebuf->chars(), ctgdata)==NULL) { GMessage("AceParser: error parsing AF entry:\n%s\n",linebuf->chars()); return false; } numseqs++; //read next line: linebuf->getLine(f,f_pos); } if (numseqs!=ctg_numSeqs) { GMessage("Invalid number of AF entries found (%d) for contig '%s' " "(length %d, numseqs %d)\n", numseqs, ctgdata->name, ctgdata->len, ctg_numSeqs); return false; } //now read each sequence entry off_t seqpos=fskipTo("RD "); numseqs=0; //count again, now the RD entries if (seqpos<0) { GMessage("AceParser: error locating first RD entry for contig '%s'\n", ctgdata->name); return false; } //int numseqs=0; //reading the actual component sequence details while (startsWith(linebuf->chars(), "RD ",3)) { char* s=linebuf->chars()+3; char* p=strchrs(s, " \t"); LytSeqInfo* seq; if (p==NULL) { GMessage("AceParser: Error parsing RD header line:\n%s\n", linebuf->chars()); return false; } *p='\0'; if ((seq=seqinfo.Find(s))==NULL) { 
GMessage("AceParser: unknown RD encountered: '%s'\n", s); return false; } p++; //now p is in linebuf after the RD name seq->fpos=seqpos; int len; if (sscanf(p, "%d", &len)!=1) { GMessage("AceParser: cannot parse RD length for '%s'\n", s); return false; } seq->setLength(len); //read the sequence data here if a callback fn was given: char* sseq=NULL; if (seqfn!=NULL) sseq=readSeq(seq); //read full sequence here if (fskipTo("QA ")<0) { GMessage("AceParser: Error finding QA entry for read %s! (fpos=%llu)\n", seq->name, (unsigned long long)f_pos); return false; } //parse QA entry: int tmpa, tmpb; if (sscanf(linebuf->chars()+3, "%d %d %d %d", &tmpa, &tmpb, &seq->left,&seq->right)!=4 || seq->left<=0 || seq->right<=0) { GMessage("AceParser: Error parsing QA entry.\n"); return false; } /* if (fskipTo("DS")<0) { GMessage("AceParser: Error closing RD entry ('DS' not found).\n"); return false; } */ seqpos=getFilePos()+1; bool forgetSeq=false; if (seqfn!=NULL) { forgetSeq=(*seqfn)(numContigs, ctgdata, seq, sseq); GFREE(sseq); } if (forgetSeq) { //parsing the whole stream -- aceconv) ctg_numSeqs--; seqinfo.Remove(seq->name); ctgdata->seqs.RemovePtr(seq); } numseqs++; if (numseqs<ctgdata->numseqs) seqpos=fskipTo("RD ", "CO "); //more sequences left to read } if (numseqs!=ctgdata->numseqs) { GMessage("Error: Invalid number of RD entries found (%d) for contig '%s' " "(length %d, numseqs %d)\n", numseqs, ctgdata->name, ctgdata->len, ctg_numSeqs); return false; } if (forgetCtg) { ctgIDs.Remove(ctgdata->name); ctgdata->seqs.Clear(); seqinfo.Clear(); contigs.RemovePtr(ctgdata); } return true; }
/* Load contig data; can be called by parse - and then no fseek is needed and the file position if right after parsing the contig summary data */ bool LayoutParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) { bool forgetCtg=false; char* r=NULL; if (ctgidx>=contigs.Count()) GError("LayoutParser: invalid contig index '%d'\n", ctgidx); LytCtgData* ctgdata=contigs[ctgidx]; if (re_pos && currentContig!=NULL) { //free previous contig data //unless it was a parse() call currentContig->seqs.Clear(); seqinfo.Clear(); } currentContig=ctgdata; if (re_pos) { seek(ctgdata->fpos); //position right where the contig definition starts r=linebuf->getLine(f,f_pos);//skip the first line if (r==NULL) return false; } if (seqfn!=NULL) forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, NULL); int ctg_numSeqs=ctgdata->numseqs; int numseqs=0; while ((r=linebuf->getLine(f,f_pos))!=NULL) { if (linebuf->length()<4) continue; if (linebuf->chars()[0]=='>') { linebuf->pushBack(); break; //reached next contig } //sequence data parsing bool forgetSeq=false; LytSeqInfo* seq=NULL; if ((seq=addSeq(linebuf->chars(), ctgdata))==NULL) { GMessage("LayoutParser: error parsing sequence entry:\n%s\n",linebuf->chars()); return false; } /* // Weird -- why would I MODIFY the given clipping of a sequence? 
//-- bool ctg_clipping = (ctgdata->rpos>ctgdata->lpos); if (ctg_clipping) { if (ctgdata->lpos > seq->offs && ctgdata->lpos < seq->offs+seq->length()) seq->left = ctgdata->lpos - seq->offs+1; if (ctgdata->rpos < seq->offs+seq->length() && ctgdata->rpos>seq->offs ) seq->right = ctgdata->rpos-seq->offs+1; } */ if (seqfn!=NULL) forgetSeq=(*seqfn)(numContigs, ctgdata, seq, NULL); if (forgetSeq) { ctg_numSeqs--; seqinfo.Remove(seq->name); ctgdata->seqs.RemovePtr(seq); } else { numseqs++; } } //while sequences if (forgetCtg) { ctgIDs.Remove(ctgdata->name); contigs.RemovePtr(ctgdata); } if (numseqs!=ctg_numSeqs) { GMessage("Mismatching number of sequences found (%d) for contig '%s' " "(length %d, numseqs %d)\n", numseqs, ctgdata->name, ctgdata->len, ctg_numSeqs); return false; } return true; }
/*
 Reads the sequence lines that follow the current file position and returns
 them concatenated in a newly allocated, NUL-terminated buffer.
 Assumes the next line is where a sequence starts; stops at the next empty line.

 Runs of digits inside the sequence are not copied: each run (which may continue
 across a line break) is collected in rlenbuf and, together with the optional
 splice markers '(' '[' ')' ']' that follow it, handed to add_intron().
 A single '=' or '-' right after the number is skipped.

 Returns NULL (after printing a message) on a read error or if a number has
 too many digits. The returned buffer MUST be freed by the caller.
*/
char* AceParser::readSeq(LytSeqInfo* sqinfo) {
  char* buf;
  static char rlenbuf[12]= {0,0,0,0,0,0,0,0,0,0,0,0}; //buffer for parsing the gap length
  int rlenbufacc=0; //how many digits accumulated in rlenbuf so far
  int buflen=512;
  GMALLOC(buf, buflen); //this MUST be freed by the caller
  buf[0]='\0';
  int accrd=0; //accumulated read length so far -- excludes interseg gaps!
  int rgpos=0; //accumulated offset including interseg gaps!
  char *r = linebuf->getLine(f,f_pos);
  int linelen=linebuf->length();
  char r_splice=0, l_splice=0;
  while (linelen>0) {
    if (r==NULL) {
      GMessage("AceParser: error reading sequence data\n");
      GFREE(buf); //do not leak the partial sequence
      return NULL;
    }
    //-- pass the line content for accumulation
    int i=0;
    while (r[i]) {
      //casts: passing a negative char to isdigit() is undefined behavior
      while (r[i] && isdigit((unsigned char)r[i])) {
        //keep one byte for the terminator written below
        if (rlenbufacc>=(int)sizeof(rlenbuf)-1) {
          GMessage("AceParser: gap length with too many digits in sequence data\n");
          GFREE(buf);
          return NULL;
        }
        rlenbuf[rlenbufacc]=r[i];
        rlenbufacc++;
        i++;
      }
      if (r[i]==0) break; //end of line reached already
      //now r[i] is surely a non-digit char
      if (rlenbufacc>0) { //have we just had a number before?
        rlenbuf[rlenbufacc]=0;
        if (r[i]=='=' || r[i]=='-') {
          i++;
          if (r[i]==0) break;
        }
        else { //check for splice site markers for this intron
          if (r[i]=='('||r[i]=='[') {
            l_splice=r[i];
            i++;
            if (r[i]==0) break;
          }
          if (r[i]==')'||r[i]==']') {
            r_splice=r[i];
            i++;
            if (r[i]==0) break;
          }
        }//splice check
        add_intron(buf, accrd, rgpos, rlenbuf, sqinfo, l_splice, r_splice);
        rlenbufacc=0;
        r_splice=0;
        l_splice=0;
      }
      //copy the run of non-digit chars that follows
      int bi=i; //start of non-digit run
      while (r[i] && !isdigit((unsigned char)r[i])) i++;
      int nl=(i-bi); //length of non-digit run
      if (nl>0) {
        int si=accrd;
        accrd+=nl;
        rgpos+=nl;
        if (accrd>=buflen-1) { //grow, always leaving room for the final NUL
          buflen=accrd+512;
          GREALLOC(buf,buflen);
        }
        //append these non-digit chars
        for (int b=0; b<nl; b++) {
          buf[si+b]=r[bi+b];
        }
      }//non-digit run
    } //while line chars
    r=linebuf->getLine(f,f_pos);
    linelen=linebuf->length();
  }//while linelen>0
  //add the 0-ending
  buf[accrd]=0;
  return buf;
}
/*
 Parses one sequence line of a layout file and registers the sequence with
 the contig ctg.

 Line format (whitespace separated):
   <name> <strand> <length> <offset> [<left clip> <right clip>] [attributes]
 where strand '-' means reverse and attributes look like "X:data". Only "G:"
 (inter-segment gaps, comma separated
 "<segEnd>[c<clip>][S|s]-<nextStart>[c<clip>][S|s]" items) is interpreted;
 L:, D:, S: and unknown attributes are skipped.

 s is modified in place (the name gets NUL-terminated).
 Returns the new LytSeqInfo, or NULL if the line cannot be parsed or the
 sequence name was already seen. Malformed G: data is a fatal error.
*/
LytSeqInfo* LayoutParser::addSeq(char* s, LytCtgData* ctg) {
  LytSeqInfo* seq;
  //s must be the line with sequence data
  char* p=strchrs(s," \t");
  if (p==NULL) return NULL;
  p++;
  char c;
  int slen, soffs, clpL, clpR;
  clpL=0;
  clpR=0;
  if (sscanf(p,"%c %d %d %d %d", &c, &slen, &soffs, &clpL, &clpR)<3) return NULL;
  p--;
  *p='\0'; //cut the line right after the name
  if ((seq=seqinfo.Find(s))!=NULL) {
    GMessage("Sequence '%s' already found for contig '%s (%d nt)'\n"
             " so it cannot be added for contig '%s (%d nt)'\n",
             s, seq->contig->name, seq->contig->len,
             ctg->name, ctg->len);
    return NULL;
  }
  seq = new LytSeqInfo(s, ctg, soffs, (c=='-') ? 1 : 0, slen, clpL, clpR);
  seqinfo.shkAdd(seq->name, seq);
  ctg->seqs.Add(seq);
  //parse optional extensions, if any
  //p sits on the terminator just written at the end of the name, so the rest
  //of the line starts one char further (adding strlen(s) again would land at
  //s+2*strlen(s), skipping attributes or running past the end of the line)
  p++;
  char* m=NULL;
  int segEnd, segRclip, nextsegStart, nextsegLclip;
  char segSplice, nextsegSplice;
  while ((m=strchr(p,':'))!=NULL) {
    switch (*(m-1)) { //the attribute letter precedes the colon
      case 'G': //segmenting info
        p=m+1; //p to the beginning of G: data
        while (*p>='1' && *p<='9') {
          segEnd=0;
          segRclip=0;
          nextsegStart=0;
          nextsegLclip=0;
          segSplice=0;
          nextsegSplice=0;
          if (!parseInt(p,segEnd))
            GError("Error [segment] at LayoutParser for %s at: %s\n", s, m-1);
          if (*p=='c') {
            p++;
            if (!parseInt(p,segRclip))
              GError("Error [segment] at LayoutParser for %s at: %s\n", s, m-1);
          }
          if (*p=='S' || *p=='s') {
            segSplice=*p;
            p++;
          }
          if (*p!='-')
            GError("Error [segment] at LayoutParser for %s at: %s\n", s, m-1);
          else p++;
          if (!parseInt(p,nextsegStart))
            GError("Error [segment] at LayoutParser for %s at: %s\n", s, m-1);
          if (*p=='c') {
            p++;
            if (!parseInt(p,nextsegLclip))
              GError("Error [segment] at LayoutParser for %s at: %s\n", s, m-1);
          }
          if (*p=='S' || *p=='s') {
            nextsegSplice=*p;
            p++;
          }
          seq->addInterSeg(segEnd,nextsegStart,segRclip,nextsegLclip,
                           segSplice, nextsegSplice);
          if (*p==',') p++;
          else break;
        } //while inter-segment parsing
        break; // 'G:' case
      case 'L': //clone mates list
        p=m+1; //p to the beginning of L: data
        break;
      case 'D': //difference sequence
        p=m+1; //p to the beginning of D: data
        break;
      case 'S': //actual sequence
        p=m+1; //p to the beginning of S: data
        break;
      default:
        p=m+1; //next attribute
    }
  }
  return seq;
}
bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) { //returns true if the transcript passed the filter char* gname=gffrec.getGeneName(); if (gname==NULL) gname=gffrec.getGeneID(); GStr defline(gffrec.getID()); if (f_out && !fmtGTF) { const char* tname=NULL; if ((tname=gffrec.getAttr("transcript_name"))!=NULL) { gffrec.addAttr("Name", tname); gffrec.removeAttr("transcript_name"); } } if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) { const char* biotype=gffrec.getAttr("gene_biotype"); if (biotype) { gffrec.addAttr("type", biotype); gffrec.removeAttr("gene_biotype"); } else { //old Ensembl files lacking gene_biotype gffrec.addAttr("type", gffrec.getTrackName()); } //bool is_gene=false; bool is_pseudo=false; if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS()) gffrec.setFeatureName("mRNA"); else { if (strcmp(biotype, "processed_transcript")==0) gffrec.setFeatureName("proc_RNA"); else { //is_gene=endsWith(biotype, "gene"); is_pseudo=strifind(biotype, "pseudo"); if (is_pseudo) { gffrec.setFeatureName("pseudo_RNA"); } else if (endsWith(biotype, "RNA")) { gffrec.setFeatureName(biotype); } else gffrec.setFeatureName("misc_RNA"); } } } if (gname && strcmp(gname, gffrec.getID())!=0) { int* isonum=isoCounter.Find(gname); if (isonum==NULL) { isonum=new int(1); isoCounter.Add(gname,isonum); } else (*isonum)++; defline.appendfmt(" gene=%s", gname); } int seqlen=0; const char* tlabel=tracklabel; if (tlabel==NULL) tlabel=gffrec.getTrackName(); //defline.appendfmt(" track:%s",tlabel); char* cdsnt = NULL; char* cdsaa = NULL; int aalen=0; for (int i=1;i<gffrec.exons.Count();i++) { int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1; if (ilen>4000000) GMessage("Warning: very large intron (%d) for transcript %s\n", ilen, gffrec.getID()); if (ilen>maxintron) { return false; } } GList<GSeg> seglst(false,true); GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec); if (spliceCheck && gffrec.exons.Count()>1) { //check introns for splice site consensi ( GT-AG, GC-AG or 
AT-AC ) if (faseq==NULL) GError("Error: no genomic sequence available!\n"); int glen=gffrec.end-gffrec.start+1; const char* gseq=faseq->subseq(gffrec.start, glen); bool revcompl=(gffrec.strand=='-'); bool ssValid=true; for (int e=1;e<gffrec.exons.Count();e++) { const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start; int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1; GSpliceSite acceptorSite(intron,intronlen,true, revcompl); GSpliceSite donorSite(intron,intronlen, false, revcompl); //GMessage("%c intron %d-%d : %s .. %s\n", // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt); if (acceptorSite=="AG") { // GT-AG or GC-AG if (!donorSite.canonicalDonor()) { ssValid=false;break; } } else if (acceptorSite=="AC") { // if (donorSite!="AT") { ssValid=false; break; } } else { ssValid=false; break; } } //GFREE(gseq); if (!ssValid) { if (verbose) GMessage("Invalid splice sites found for '%s'\n",gffrec.getID()); return false; //don't print this one! } } bool trprint=true; int stopCodonAdjust=0; int mCDphase=0; bool hasStop=false; if (gffrec.CDphase=='1' || gffrec.CDphase=='2') mCDphase = gffrec.CDphase-'0'; if (f_y!=NULL || f_x!=NULL || validCDSonly) { if (faseq==NULL) GError("Error: no genomic sequence provided!\n"); //if (protmap && fullCDSonly) { //if (protmap && (fullCDSonly || (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) { if (validCDSonly) { //make sure the stop codon is always included //adjust_stopcodon(gffrec,3); stopCodonAdjust=adjust_stopcodon(gffrec,3); } int strandNum=0; int phaseNum=0; CDS_CHECK: cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, NULL, &seglst); if (cdsnt==NULL) trprint=false; else { //has CDS if (validCDSonly) { cdsaa=translateDNA(cdsnt, aalen, seqlen); char* p=strchr(cdsaa,'.'); hasStop=false; if (p!=NULL) { if (p-cdsaa>=aalen-2) { //stop found as the last codon *p='0';//remove it hasStop=true; if (aalen-2==p-cdsaa) { //previous to last codon is the stop codon //so correct the CDS stop accordingly 
adjust_stopcodon(gffrec,-3, &seglst); stopCodonAdjust=0; //clear artificial stop adjustment seqlen-=3; cdsnt[seqlen]=0; } aalen=p-cdsaa; } else {//stop found before the last codon trprint=false; } }//stop codon found if (trprint==false) { //failed CDS validity check //in-frame stop codon found if (altPhases && phaseNum<3) { phaseNum++; gffrec.CDphase = '0'+((mCDphase+phaseNum)%3); GFREE(cdsaa); goto CDS_CHECK; } if (gffrec.exons.Count()==1 && bothStrands) { strandNum++; phaseNum=0; if (strandNum<2) { GFREE(cdsaa); gffrec.strand = (gffrec.strand=='-') ? '+':'-'; goto CDS_CHECK; //repeat the CDS check for a different frame } } if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID()); } //has in-frame STOP if (fullCDSonly) { if (!hasStop || cdsaa[0]!='M') trprint=false; } } // CDS check requested } //has CDS } //translation or codon check/output was requested if (!trprint) { GFREE(cdsnt); GFREE(cdsaa); return false; } if (stopCodonAdjust>0 && !hasStop) { //restore stop codon location adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst); if (cdsnt!=NULL && seqlen>0) { seqlen-=stopCodonAdjust; cdsnt[seqlen]=0; } if (cdsaa!=NULL) aalen--; } if (f_y!=NULL) { //CDS translation fasta output requested //char* if (cdsaa==NULL) { //translate now if not done before cdsaa=translateDNA(cdsnt, aalen, seqlen); } if (fullattr && gffrec.attrs!=NULL) { //append all attributes found for each transcripts for (int i=0;i<gffrec.attrs->Count();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); defline.append(gffrec.getAttrValue(i)); } } printFasta(f_y, defline, cdsaa, aalen); } if (f_x!=NULL) { //CDS only if (writeExonSegs) { defline.append(" loc:"); defline.append(gffrec.getGSeqName()); defline.appendfmt("(%c)",gffrec.strand); //warning: not CDS coordinates are written here, but the exon ones defline+=(int)gffrec.start; defline+=(char)'-'; defline+=(int)gffrec.end; // -- here these are CDS substring coordinates on the spliced sequence: 
defline.append(" segs:"); for (int i=0;i<seglst.Count();i++) { if (i>0) defline.append(","); defline+=(int)seglst[i]->start; defline.append("-"); defline+=(int)seglst[i]->end; } } if (fullattr && gffrec.attrs!=NULL) { //append all attributes found for each transcript for (int i=0;i<gffrec.attrs->Count();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); defline.append(gffrec.getAttrValue(i)); } } printFasta(f_x, defline, cdsnt, seqlen); } GFREE(cdsnt); GFREE(cdsaa); if (f_w!=NULL) { //write spliced exons uint cds_start=0; uint cds_end=0; seglst.Clear(); char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst); if (exont!=NULL) { if (gffrec.CDstart>0) { defline.appendfmt(" CDS=%d-%d", cds_start, cds_end); } if (writeExonSegs) { defline.append(" loc:"); defline.append(gffrec.getGSeqName()); defline+=(char)'|'; defline+=(int)gffrec.start; defline+=(char)'-'; defline+=(int)gffrec.end; defline+=(char)'|'; defline+=(char)gffrec.strand; defline.append(" exons:"); for (int i=0;i<gffrec.exons.Count();i++) { if (i>0) defline.append(","); defline+=(int)gffrec.exons[i]->start; defline.append("-"); defline+=(int)gffrec.exons[i]->end; } defline.append(" segs:"); for (int i=0;i<seglst.Count();i++) { if (i>0) defline.append(","); defline+=(int)seglst[i]->start; defline.append("-"); defline+=(int)seglst[i]->end; } } if (fullattr && gffrec.attrs!=NULL) { //append all attributes found for each transcripts for (int i=0;i<gffrec.attrs->Count();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); defline.append(gffrec.getAttrValue(i)); } } printFasta(f_w, defline, exont, seqlen); GFREE(exont); } } //writing f_w (spliced exons) return true; }
int main(int argc, char * const argv[]) { GArgs args(argc, argv, "debug;merge;cluster-only;help;force-exons;no-pseudo;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:"); args.printError(USAGE, true); if (args.getOpt('h') || args.getOpt("help")) { GMessage("%s",USAGE); exit(1); } debugMode=(args.getOpt("debug")!=NULL); decodeChars=(args.getOpt('D')!=NULL); forceExons=(args.getOpt("force-exons")!=NULL); NoPseudo=(args.getOpt("no-pseudo")!=NULL); mRNAOnly=(args.getOpt('O')==NULL); //sortByLoc=(args.getOpt('S')!=NULL); addDescr=(args.getOpt('A')!=NULL); verbose=(args.getOpt('v')!=NULL); wCDSonly=(args.getOpt('C')!=NULL); validCDSonly=(args.getOpt('V')!=NULL); altPhases=(args.getOpt('H')!=NULL); fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF bothStrands=(args.getOpt('B')!=NULL); fullCDSonly=(args.getOpt('J')!=NULL); spliceCheck=(args.getOpt('N')!=NULL); bool matchAllIntrons=(args.getOpt('K')==NULL); bool fuzzSpan=(args.getOpt('Q')!=NULL); if (args.getOpt('M') || args.getOpt("merge")) { doCluster=true; doCollapseRedundant=true; } else { if (!matchAllIntrons || fuzzSpan) { GMessage("%s",USAGE); GMessage("Error: -K or -Q options require -M/--merge option!\n"); exit(1); } } if (args.getOpt("cluster-only")) { doCluster=true; doCollapseRedundant=false; if (!matchAllIntrons || fuzzSpan) { GMessage("%s",USAGE); GMessage("Error: -K or -Q options have no effect with --cluster-only.\n"); exit(1); } } if (fullCDSonly) validCDSonly=true; if (verbose) { fprintf(stderr, "Command line was:\n"); args.printCmdLine(stderr); } fullattr=(args.getOpt('F')!=NULL); if (args.getOpt('G')==NULL) noExonAttr=!fullattr; else { noExonAttr=true; fullattr=true; } if (NoPseudo && !fullattr) { noExonAttr=true; fullattr=true; } ensembl_convert=(args.getOpt('L')!=NULL); if (ensembl_convert) { fullattr=true; noExonAttr=false; //sortByLoc=true; } mergeCloseExons=(args.getOpt('Z')!=NULL); multiExon=(args.getOpt('U')!=NULL); writeExonSegs=(args.getOpt('W')!=NULL); 
tracklabel=args.getOpt('t'); GFastaDb gfasta(args.getOpt('g')); //if (gfasta.fastaPath!=NULL) // sortByLoc=true; //enforce sorting by chromosome/contig GStr s=args.getOpt('i'); if (!s.is_empty()) maxintron=s.asInt(); FILE* f_repl=NULL; s=args.getOpt('d'); if (!s.is_empty()) { if (s=="-") f_repl=stdout; else { f_repl=fopen(s.chars(), "w"); if (f_repl==NULL) GError("Error creating file %s\n", s.chars()); } } rfltWithin=(args.getOpt('R')!=NULL); s=args.getOpt('r'); if (!s.is_empty()) { s.trim(); if (s[0]=='+' || s[0]=='-') { rfltStrand=s[0]; s.cut(0,1); } int isep=s.index(':'); if (isep>0) { //gseq name given if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) { isep--; rfltStrand=s[isep]; s.cut(isep,1); } if (isep>0) rfltGSeq=Gstrdup((s.substr(0,isep)).chars()); s.cut(0,isep+1); } GStr gsend; char slast=s[s.length()-1]; if (rfltStrand==0 && (slast=='+' || slast=='-')) { s.chomp(slast); rfltStrand=slast; } if (s.index("..")>=0) gsend=s.split(".."); else gsend=s.split('-'); if (!s.is_empty()) rfltStart=(uint)s.asInt(); if (!gsend.is_empty()) { rfltEnd=(uint)gsend.asInt(); if (rfltEnd==0) rfltEnd=MAX_UINT; } } //gseq/range filtering else { if (rfltWithin) GError("Error: option -R requires -r!\n"); //if (rfltWholeTranscript) // GError("Error: option -P requires -r!\n"); } s=args.getOpt('m'); if (!s.is_empty()) { FILE* ft=fopen(s,"r"); if (ft==NULL) GError("Error opening reference table: %s\n",s.chars()); loadRefTable(ft, reftbl); fclose(ft); } s=args.getOpt('s'); if (!s.is_empty()) { FILE* fsize=fopen(s,"r"); if (fsize==NULL) GError("Error opening info file: %s\n",s.chars()); loadSeqInfo(fsize, seqinfo); fclose(fsize); } openfw(f_out, args, 'o'); //if (f_out==NULL) f_out=stdout; if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL)) GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n"); openfw(f_w, args, 'w'); openfw(f_x, args, 'x'); openfw(f_y, args, 'y'); if 
(f_y!=NULL || f_x!=NULL) wCDSonly=true; //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL); int numfiles = args.startNonOpt(); //GList<GffObj> gfkept(false,true); //unsorted, free items on delete int out_counter=0; //number of records printed while (true) { GStr infile; if (numfiles) { infile=args.nextNonOpt(); if (infile.is_empty()) break; if (infile=="-") { f_in=stdin; infile="stdin"; } else if ((f_in=fopen(infile, "r"))==NULL) GError("Error: cannot open input file %s!\n",infile.chars()); } else infile="-"; GffLoader gffloader(infile.chars()); gffloader.transcriptsOnly=mRNAOnly; gffloader.fullAttributes=fullattr; gffloader.noExonAttrs=noExonAttr; gffloader.mergeCloseExons=mergeCloseExons; gffloader.showWarnings=(args.getOpt('E')!=NULL); gffloader.noPseudo=NoPseudo; gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan, forceExons); if (doCluster) collectLocusData(g_data); if (numfiles==0) break; } GStr loctrack("gffcl"); if (tracklabel) loctrack=tracklabel; g_data.setSorted(&gseqCmpName); GffPrintMode exonPrinting; if (fmtGTF) { exonPrinting = pgtfAny; } else { exonPrinting = forceExons ? 
pgffBoth : pgffAny; } bool firstGff3Print=!fmtGTF; if (doCluster) { //grouped in loci for (int g=0;g<g_data.Count();g++) { GenomicSeqData* gdata=g_data[g]; int gfs_i=0; for (int l=0;l<gdata->loci.Count();l++) { GffLocus& loc=*(gdata->loci[l]); //check all non-replaced transcripts in this locus: int numvalid=0; int idxfirstvalid=-1; for (int i=0;i<loc.rnas.Count();i++) { GffObj& t=*(loc.rnas[i]); if (f_out) { while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) { GffObj& gfst=*(gdata->gfs[gfs_i]); if ((gfst.udata&4)==0) { //never printed gfst.udata|=4; if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons) gfst.addExon(gfst.start,gfst.end); gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } ++gfs_i; } } GTData* tdata=(GTData*)(t.uptr); if (tdata->replaced_by!=NULL) { if (f_repl && (t.udata & 8)==0) { //t.udata|=8; fprintf(f_repl, "%s", t.getID()); GTData* rby=tdata; while (rby->replaced_by!=NULL) { fprintf(f_repl," => %s", rby->replaced_by->getID()); rby->rna->udata|=8; rby=(GTData*)(rby->replaced_by->uptr); } fprintf(f_repl, "\n"); } continue; } if (process_transcript(gfasta, t)) { t.udata|=4; //tag it as valid numvalid++; if (idxfirstvalid<0) idxfirstvalid=i; } } if (f_out && numvalid>0) { GStr locname("RLOC_"); locname.appendfmt("%08d",loc.locus_num); if (!fmtGTF) { if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s", loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand, locname.chars(), locname.chars()); //const char* loc_gname=loc.getGeneName(); if (loc.gene_names.Count()>0) { //print all gene names associated to this locus fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars()); for (int i=1;i<loc.gene_names.Count();i++) { fprintf(f_out, ",%s",loc.gene_names[i]->name.chars()); } } if (loc.gene_ids.Count()>0) { //print 
all GeneIDs names associated to this locus fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars()); for (int i=1;i<loc.gene_ids.Count();i++) { fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars()); } } fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID()); for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) { fprintf(f_out, ",%s",loc.rnas[i]->getID()); } fprintf(f_out, "\n"); } //now print all valid, non-replaced transcripts in this locus: for (int i=0;i<loc.rnas.Count();i++) { GffObj& t=*(loc.rnas[i]); GTData* tdata=(GTData*)(t.uptr); if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue; t.addAttr("locus", locname.chars()); out_counter++; if (fmtGTF) t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); else { if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } //print the parent first, if any if (t.parent!=NULL && ((t.parent->udata & 4)==0)) { GTData* pdata=(GTData*)(t.parent->uptr); if (pdata && pdata->geneinfo!=NULL) pdata->geneinfo->finalize(); t.parent->addAttr("locus", locname.chars()); t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); t.parent->udata|=4; } t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } } } //have valid transcripts to print }//for each locus //print the rest of the isolated pseudo/gene/region features not printed yet if (f_out) { while (gfs_i<gdata->gfs.Count()) { GffObj& gfst=*(gdata->gfs[gfs_i]); if ((gfst.udata&4)==0) { //never printed gfst.udata|=4; if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons) gfst.addExon(gfst.start,gfst.end); gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } ++gfs_i; } } } //for each genomic sequence } else { //not grouped into loci, print the rnas with their parents, if any int numvalid=0; for (int g=0;g<g_data.Count();g++) { GenomicSeqData* gdata=g_data[g]; int gfs_i=0; for (int 
m=0;m<gdata->rnas.Count();m++) { GffObj& t=*(gdata->rnas[m]); if (f_out) { while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) { GffObj& gfst=*(gdata->gfs[gfs_i]); if ((gfst.udata&4)==0) { //never printed gfst.udata|=4; if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons) gfst.addExon(gfst.start,gfst.end); gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } ++gfs_i; } } GTData* tdata=(GTData*)(t.uptr); if (tdata->replaced_by!=NULL) continue; if (process_transcript(gfasta, t)) { t.udata|=4; //tag it as valid numvalid++; if (f_out) { if (tdata->geneinfo) tdata->geneinfo->finalize(); out_counter++; if (fmtGTF) t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); else { if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } //print the parent first, if any if (t.parent!=NULL && ((t.parent->udata & 4)==0)) { GTData* pdata=(GTData*)(t.parent->uptr); if (pdata && pdata->geneinfo!=NULL) pdata->geneinfo->finalize(); t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); t.parent->udata|=4; } t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } }//GFF/GTF output requested } //valid transcript } //for each rna //print the rest of the isolated pseudo/gene/region features not printed yet if (f_out) { while (gfs_i<gdata->gfs.Count()) { GffObj& gfst=*(gdata->gfs[gfs_i]); if ((gfst.udata&4)==0) { //never printed gfst.udata|=4; if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; } if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons) gfst.addExon(gfst.start,gfst.end); gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars); } ++gfs_i; } } } //for each genomic seq } //not clustered if (f_repl && f_repl!=stdout) fclose(f_repl); seqinfo.Clear(); //if (faseq!=NULL) delete faseq; //if (gcdb!=NULL) delete gcdb; GFREE(rfltGSeq); FRCLOSE(f_in); FWCLOSE(f_out); 
FWCLOSE(f_w); FWCLOSE(f_x); FWCLOSE(f_y); }