void GBamRecord::set_cigar(const char* cigar) { //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call int doff=b->core.l_qname; uint8_t* after_cigar=NULL; int after_cigar_len=0; uint8_t* prev_bdata=NULL; if (b->data_len>doff) { //cigar string already allocated, replace it int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data after_cigar=b->data+d; after_cigar_len=b->data_len-d; } const char *s; char *t; int i, op; long x; b->core.n_cigar = 0; if (cigar != NULL && strcmp(cigar, "*") != 0) { for (s = cigar; *s; ++s) { if (isalpha(*s)) b->core.n_cigar++; else if (!isdigit(*s)) { GError("Error: invalid CIGAR character (%s)\n",cigar); } } if (after_cigar_len>0) { //replace/insert into existing full data prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len); memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len); free(prev_bdata); } else { realloc_bdata(b, doff + b->core.n_cigar * 4); } for (i = 0, s = cigar; i != b->core.n_cigar; ++i) { x = strtol(s, &t, 10); op = toupper(*t); if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; else if (op == 'I') op = BAM_CINS; else if (op == 'D') op = BAM_CDEL; else if (op == 'N') op = BAM_CREF_SKIP; //has_Introns=true; else if (op == 'S') op = BAM_CSOFT_CLIP; //soft_Clipped=true; else if (op == 'H') op = BAM_CHARD_CLIP; //hard_Clipped=true; else if (op == 'P') op = BAM_CPAD; else GError("Error: invalid CIGAR operation (%s)\n",cigar); s = t + 1; bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; } if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar); b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b))); } else {//no CIGAR string given if (!(b->core.flag&BAM_FUNMAP)) { GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data); b->core.flag |= BAM_FUNMAP; } b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1); } setupCoordinates(); } //set_cigar()
int GFastaIndex::storeIndex(const char* finame) { //write the hash to a file if (records.Count()==0) GError("Error at GFastaIndex:storeIndex(): no records found!\n"); FILE* fai=fopen(finame, "w"); if (fai==NULL) GError("Error creating fasta index file: %s\n",finame); int rcount=storeIndex(fai); GFREE(fai_name); fai_name=Gstrdup(finame); return rcount; }
void Gmktempdir(char* templ) { #ifdef __WIN32__ int blen=strlen(templ); if (_mktemp_s(templ, blen)!=0) GError("Error creating temp dir %s!\n", templ); #else char* cdir=mkdtemp(templ); if (cdir==NULL) GError("Error creating temp dir %s!(%s)\n", templ, strerror(errno)); #endif }
/**
 * Tell whether the single-exon transcript `ti` lies within one exon of `tj`
 * without crossing any intron-exon boundary of `tj`.
 *
 * @param ti       candidate transcript; MUST be single-exon (GError otherwise).
 * @param tj       transcript whose exons are scanned in order.
 * @param fuzzSpan when true, an overlap of at least 80% of ti's length with an
 *                 exon is enough; when false, strict containment is required.
 * @return true as soon as an exon satisfies the test; false when ti extends
 *         over an intron boundary or no exon qualifies.
 */
bool unsplContained(GffObj& ti, GffObj& tj, bool fuzzSpan) {
  const int lastExonI=ti.exons.Count()-1;
  const int lastExonJ=tj.exons.Count()-1;
  if (lastExonI>0)
    GError("Error: bad unsplContained() call, 1st param must be single-exon transcript!\n");
  //minimum overlap accepted in fuzzSpan mode
  const int minOverlap = (int)(0.8 * ti.len());
  for (int j=0; j<=lastExonJ; j++) {
    //ti must NOT reach into the intron on either side of this exon
    const bool spillsLeft = (j>0 && ti.start<tj.exons[j]->start);
    const bool spillsRight = (j<lastExonJ && ti.end>tj.exons[j]->end);
    if (spillsLeft || spillsRight) return false;
    if (fuzzSpan) {
      if (ti.exons[0]->overlapLen(tj.exons[j])>=minOverlap) return true;
    }
    else {
      //strict containment
      if (ti.start>=tj.exons[j]->start && ti.end<=tj.exons[j]->end) return true;
    }
  }
  return false;
}
void openfw(FILE* &f, GArgs& args, char opt) { GStr s=args.getOpt(opt); if (!s.is_empty()) { if (s=='-') f=stdout; else { f=fopen(s,"w"); if (f==NULL) GError("Error creating file: %s\n", s.chars()); } } }
int GFastaIndex::loadIndex(const char* finame) { //load record info from existing fasta index if (finame==NULL) finame=fai_name; if (finame!=fai_name) { fai_name=Gstrdup(finame); } if (fai_name==NULL) GError("Error: GFastaIndex::loadIndex() called with no file name!\n"); records.Clear(); haveFai=false; FILE* fi=fopen(fai_name,"rb"); if (fi==NULL) { GMessage("Warning: cannot open fasta index file: %s!\n",fai_name); return 0; } GLineReader fl(fi); char* s=NULL; while ((s=fl.nextLine())!=NULL) { if (*s=='#') continue; char* p=strchrs(s,"\t "); if (p==NULL) GError(ERR_FAIDXLINE,s); *p=0; //s now holds the genomic sequence name p++; uint len=0; int line_len=0, line_blen=0; #ifdef __WIN32__ long offset=-1; sscanf(p, "%d%ld%d%d", &len, &offset, &line_len, &line_blen); #else long long offset=-1; sscanf(p, "%d%lld%d%d", &len, &offset, &line_len, &line_blen); #endif if (len==0 || line_len==0 || line_blen==0 || line_blen<line_len) GError(ERR_FAIDXLINE,p); addRecord(s,len,offset,line_len, line_blen); } fclose(fi); haveFai=(records.Count()>0); return records.Count(); }
/**
 * Pack up to n bases (n can be 1..4) from `nt` into a byte, 2 bits per base,
 * first base in the most significant position.
 *
 * @param nt in/out: pointer into a NUL-terminated base string; advanced past
 *           every base consumed.
 * @param n  number of bases to pack.
 * @return the packed value; fewer than n bases are packed when the string
 *         ends early (this is reported via GError() only in GDEBUG builds).
 */
byte gdna2bit(char* &nt, int n) {
  byte out = 0;
  while (n && *nt) {
    n--;
    out <<= 2;
    //index through unsigned char: a plain char may be negative for bytes
    //above 127, which would index before the start of the table
    out += nt2bit[(unsigned char)*nt];
    nt++;
  }
#ifdef GDEBUG
  if (n) {
    GError("Error: attempt to read 6-mer beyond the end of the string!\n");
  }
#endif
  return out;
}
void GBamRecord::add_sequence(const char* qseq, int slen) { //must be called AFTER set_cigar (cannot replace existing sequence for now) if (qseq==NULL) return; //should we ever care about this? if (slen<0) slen=strlen(qseq); int doff = b->core.l_qname + b->core.n_cigar * 4; if (strcmp(qseq, "*")!=0) { b->core.l_qseq=slen; if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b))) GError("Error: CIGAR and sequence length are inconsistent!(%s)\n", qseq); uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff; //also allocated quals memory memset(p, 0, (b->core.l_qseq+1)/2); for (int i = 0; i < b->core.l_qseq; ++i) p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2); } else b->core.l_qseq = 0; }
bool AceParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) { bool forgetCtg = false; if (ctgidx>=contigs.Count()) GError("LayoutParser: invalid contig index '%d'\n", ctgidx); LytCtgData* ctgdata=contigs[ctgidx]; if (re_pos && currentContig!=NULL) { //free previously loaded contig data currentContig->seqs.Clear(); // unless it was a parse() call seqinfo.Clear(); } currentContig=ctgdata; int ctg_numSeqs=ctgdata->numseqs; if (re_pos) { seek(ctgdata->fpos); //position right where the contig definition starts char *r = linebuf->getLine(f,f_pos); if (r==NULL) return false; } if (seqfn!=NULL) { //process the contig sequence! char* ctgseq=readSeq(); forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, ctgseq); GFREE(ctgseq); //obviously the caller should have made a copy } //now look for all the component sequences if (fskipTo("AF ")<0) { GMessage("AceParser: error finding sequence offsets (AF)" " for contig '%s' (%d)\n", ctgdata->name, ctgdata->len); return false; } int numseqs=0; while (startsWith(linebuf->chars(), "AF ",3)) { if (addSeq(linebuf->chars(), ctgdata)==NULL) { GMessage("AceParser: error parsing AF entry:\n%s\n",linebuf->chars()); return false; } numseqs++; //read next line: linebuf->getLine(f,f_pos); } if (numseqs!=ctg_numSeqs) { GMessage("Invalid number of AF entries found (%d) for contig '%s' " "(length %d, numseqs %d)\n", numseqs, ctgdata->name, ctgdata->len, ctg_numSeqs); return false; } //now read each sequence entry off_t seqpos=fskipTo("RD "); numseqs=0; //count again, now the RD entries if (seqpos<0) { GMessage("AceParser: error locating first RD entry for contig '%s'\n", ctgdata->name); return false; } //int numseqs=0; //reading the actual component sequence details while (startsWith(linebuf->chars(), "RD ",3)) { char* s=linebuf->chars()+3; char* p=strchrs(s, " \t"); LytSeqInfo* seq; if (p==NULL) { GMessage("AceParser: Error parsing RD header line:\n%s\n", linebuf->chars()); return false; } *p='\0'; if ((seq=seqinfo.Find(s))==NULL) { 
GMessage("AceParser: unknown RD encountered: '%s'\n", s); return false; } p++; //now p is in linebuf after the RD name seq->fpos=seqpos; int len; if (sscanf(p, "%d", &len)!=1) { GMessage("AceParser: cannot parse RD length for '%s'\n", s); return false; } seq->setLength(len); //read the sequence data here if a callback fn was given: char* sseq=NULL; if (seqfn!=NULL) sseq=readSeq(seq); //read full sequence here if (fskipTo("QA ")<0) { GMessage("AceParser: Error finding QA entry for read %s! (fpos=%llu)\n", seq->name, (unsigned long long)f_pos); return false; } //parse QA entry: int tmpa, tmpb; if (sscanf(linebuf->chars()+3, "%d %d %d %d", &tmpa, &tmpb, &seq->left,&seq->right)!=4 || seq->left<=0 || seq->right<=0) { GMessage("AceParser: Error parsing QA entry.\n"); return false; } /* if (fskipTo("DS")<0) { GMessage("AceParser: Error closing RD entry ('DS' not found).\n"); return false; } */ seqpos=getFilePos()+1; bool forgetSeq=false; if (seqfn!=NULL) { forgetSeq=(*seqfn)(numContigs, ctgdata, seq, sseq); GFREE(sseq); } if (forgetSeq) { //parsing the whole stream -- aceconv) ctg_numSeqs--; seqinfo.Remove(seq->name); ctgdata->seqs.RemovePtr(seq); } numseqs++; if (numseqs<ctgdata->numseqs) seqpos=fskipTo("RD ", "CO "); //more sequences left to read } if (numseqs!=ctgdata->numseqs) { GMessage("Error: Invalid number of RD entries found (%d) for contig '%s' " "(length %d, numseqs %d)\n", numseqs, ctgdata->name, ctgdata->len, ctg_numSeqs); return false; } if (forgetCtg) { ctgIDs.Remove(ctgdata->name); ctgdata->seqs.Clear(); seqinfo.Clear(); contigs.RemovePtr(ctgdata); } return true; }
// Program entry point.
// 1) Parses the command line into the global option flags and opens the
//    requested output streams.
// 2) Loads every input file given as a non-option argument (or a single
//    input named "-" when none is given) into g_data through GffLoader.
// 3) Prints the accepted transcripts, either grouped in loci (when
//    clustering was requested) or per genomic sequence, interleaving the
//    non-transcript features (gdata->gfs) by start coordinate.
// Bit 4 of udata marks an object as valid/already printed; bit 8 marks a
// transcript already covered by a reported replacement chain.
int main(int argc, char * const argv[]) {
 GArgs args(argc, argv,
   "debug;merge;cluster-only;help;force-exons;no-pseudo;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
 args.printError(USAGE, true);
 if (args.getOpt('h') || args.getOpt("help")) {
    GMessage("%s",USAGE);
    exit(1);
    }
 // ---- boolean switches
 debugMode=(args.getOpt("debug")!=NULL);
 decodeChars=(args.getOpt('D')!=NULL);
 forceExons=(args.getOpt("force-exons")!=NULL);
 NoPseudo=(args.getOpt("no-pseudo")!=NULL);
 mRNAOnly=(args.getOpt('O')==NULL);
 //sortByLoc=(args.getOpt('S')!=NULL);
 addDescr=(args.getOpt('A')!=NULL);
 verbose=(args.getOpt('v')!=NULL);
 wCDSonly=(args.getOpt('C')!=NULL);
 validCDSonly=(args.getOpt('V')!=NULL);
 altPhases=(args.getOpt('H')!=NULL);
 fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
 bothStrands=(args.getOpt('B')!=NULL);
 fullCDSonly=(args.getOpt('J')!=NULL);
 spliceCheck=(args.getOpt('N')!=NULL);
 bool matchAllIntrons=(args.getOpt('K')==NULL);
 bool fuzzSpan=(args.getOpt('Q')!=NULL);
 // -K / -Q are only accepted together with -M/--merge
 if (args.getOpt('M') || args.getOpt("merge")) {
    doCluster=true;
    doCollapseRedundant=true;
    }
 else {
    if (!matchAllIntrons || fuzzSpan) {
      GMessage("%s",USAGE);
      GMessage("Error: -K or -Q options require -M/--merge option!\n");
      exit(1);
      }
    }
 if (args.getOpt("cluster-only")) {
    doCluster=true;
    doCollapseRedundant=false;
    if (!matchAllIntrons || fuzzSpan) {
      GMessage("%s",USAGE);
      GMessage("Error: -K or -Q options have no effect with --cluster-only.\n");
      exit(1);
      }
    }
 if (fullCDSonly) validCDSonly=true;
 if (verbose) {
    fprintf(stderr, "Command line was:\n");
    args.printCmdLine(stderr);
    }
 // ---- attribute handling
 fullattr=(args.getOpt('F')!=NULL);
 if (args.getOpt('G')==NULL)
    noExonAttr=!fullattr;
 else {
    noExonAttr=true;
    fullattr=true;
    }
 if (NoPseudo && !fullattr) {
    noExonAttr=true;
    fullattr=true;
    }
 ensembl_convert=(args.getOpt('L')!=NULL);
 if (ensembl_convert) {
    fullattr=true;
    noExonAttr=false;
    //sortByLoc=true;
    }
 mergeCloseExons=(args.getOpt('Z')!=NULL);
 multiExon=(args.getOpt('U')!=NULL);
 writeExonSegs=(args.getOpt('W')!=NULL);
 tracklabel=args.getOpt('t');
 GFastaDb gfasta(args.getOpt('g'));
 //if (gfasta.fastaPath!=NULL)
 //    sortByLoc=true; //enforce sorting by chromosome/contig
 GStr s=args.getOpt('i');
 if (!s.is_empty()) maxintron=s.asInt();

 // -d : stream receiving the "replaced by" chains ('-' means stdout)
 FILE* f_repl=NULL;
 s=args.getOpt('d');
 if (!s.is_empty()) {
   if (s=="-") f_repl=stdout;
   else {
     f_repl=fopen(s.chars(), "w");
     if (f_repl==NULL) GError("Error creating file %s\n", s.chars());
     }
   }

 // -r : range filter; the code below accepts an optional leading or
 // trailing strand sign, an optional "<gseq>:" prefix, and a
 // start..end or start-end coordinate pair
 rfltWithin=(args.getOpt('R')!=NULL);
 s=args.getOpt('r');
 if (!s.is_empty()) {
   s.trim();
   if (s[0]=='+' || s[0]=='-') {
     rfltStrand=s[0];
     s.cut(0,1);
     }
   int isep=s.index(':');
   if (isep>0) { //gseq name given
      if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
        isep--;
        rfltStrand=s[isep];
        s.cut(isep,1);
        }
      if (isep>0)
          rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
      s.cut(0,isep+1);
      }
   GStr gsend;
   char slast=s[s.length()-1];
   if (rfltStrand==0 && (slast=='+' || slast=='-')) {
      s.chomp(slast);
      rfltStrand=slast;
      }
   if (s.index("..")>=0) gsend=s.split("..");
                    else gsend=s.split('-');
   if (!s.is_empty()) rfltStart=(uint)s.asInt();
   if (!gsend.is_empty()) {
      rfltEnd=(uint)gsend.asInt();
      if (rfltEnd==0) rfltEnd=MAX_UINT;
      }
   } //gseq/range filtering
 else {
   if (rfltWithin)
     GError("Error: option -R requires -r!\n");
   //if (rfltWholeTranscript)
   //  GError("Error: option -P requires -r!\n");
   }

 // -m : reference table, -s : sequence info file
 s=args.getOpt('m');
 if (!s.is_empty()) {
   FILE* ft=fopen(s,"r");
   if (ft==NULL) GError("Error opening reference table: %s\n",s.chars());
   loadRefTable(ft, reftbl);
   fclose(ft);
   }
 s=args.getOpt('s');
 if (!s.is_empty()) {
   FILE* fsize=fopen(s,"r");
   if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
   loadSeqInfo(fsize, seqinfo);
   fclose(fsize);
   }

 // ---- output streams
 openfw(f_out, args, 'o');
 //if (f_out==NULL) f_out=stdout;
 if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
   GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");
 openfw(f_w, args, 'w');
 openfw(f_x, args, 'x');
 openfw(f_y, args, 'y');
 if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
 //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);

 // ---- load all the input files
 int numfiles = args.startNonOpt();
 //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
 int out_counter=0; //number of records printed
 while (true) {
   GStr infile;
   if (numfiles) {
     infile=args.nextNonOpt();
     if (infile.is_empty()) break;
     if (infile=="-") { f_in=stdin; infile="stdin"; }
     else
       if ((f_in=fopen(infile, "r"))==NULL)
         GError("Error: cannot open input file %s!\n",infile.chars());
     }
   else
     infile="-";
   GffLoader gffloader(infile.chars());
   gffloader.transcriptsOnly=mRNAOnly;
   gffloader.fullAttributes=fullattr;
   gffloader.noExonAttrs=noExonAttr;
   gffloader.mergeCloseExons=mergeCloseExons;
   gffloader.showWarnings=(args.getOpt('E')!=NULL);
   gffloader.noPseudo=NoPseudo;
   gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant,
                  matchAllIntrons, fuzzSpan, forceExons);
   if (doCluster)
     collectLocusData(g_data);
   if (numfiles==0) break; //single pass when no input file names were given
   }

 // ---- output
 GStr loctrack("gffcl");
 if (tracklabel) loctrack=tracklabel;
 g_data.setSorted(&gseqCmpName);
 GffPrintMode exonPrinting;
 if (fmtGTF) {
   exonPrinting = pgtfAny;
   }
 else {
   exonPrinting = forceExons ? pgffBoth : pgffAny;
   }
 bool firstGff3Print=!fmtGTF; //true until the GFF3 header has been printed
 if (doCluster) {
   //grouped in loci
   for (int g=0;g<g_data.Count();g++) {
     GenomicSeqData* gdata=g_data[g];
     int gfs_i=0; //next entry of gdata->gfs still to be considered
     for (int l=0;l<gdata->loci.Count();l++) {
       GffLocus& loc=*(gdata->loci[l]);
       //check all non-replaced transcripts in this locus:
       int numvalid=0;
       int idxfirstvalid=-1;
       for (int i=0;i<loc.rnas.Count();i++) {
         GffObj& t=*(loc.rnas[i]);
         if (f_out) {
           //first print the other features starting at or before this transcript
           while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) {
             GffObj& gfst=*(gdata->gfs[gfs_i]);
             if ((gfst.udata&4)==0) { //never printed
               gfst.udata|=4;
               if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
               if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
                 gfst.addExon(gfst.start,gfst.end);
               gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
               }
             ++gfs_i;
             }
           }
         GTData* tdata=(GTData*)(t.uptr);
         if (tdata->replaced_by!=NULL) {
           if (f_repl && (t.udata & 8)==0) {
             //t.udata|=8;
             //write the whole replacement chain: id => id => ...
             fprintf(f_repl, "%s", t.getID());
             GTData* rby=tdata;
             while (rby->replaced_by!=NULL) {
               fprintf(f_repl," => %s", rby->replaced_by->getID());
               rby->rna->udata|=8;
               rby=(GTData*)(rby->replaced_by->uptr);
               }
             fprintf(f_repl, "\n");
             }
           continue;
           }
         if (process_transcript(gfasta, t)) {
           t.udata|=4; //tag it as valid
           numvalid++;
           if (idxfirstvalid<0) idxfirstvalid=i;
           }
         }
       if (f_out && numvalid>0) {
         GStr locname("RLOC_");
         locname.appendfmt("%08d",loc.locus_num);
         if (!fmtGTF) {
           if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
           fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
                    loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
                    locname.chars(), locname.chars());
           //const char* loc_gname=loc.getGeneName();
           if (loc.gene_names.Count()>0) { //print all gene names associated to this locus
             fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars());
             for (int i=1;i<loc.gene_names.Count();i++) {
               fprintf(f_out, ",%s",loc.gene_names[i]->name.chars());
               }
             }
           if (loc.gene_ids.Count()>0) { //print all GeneIDs names associated to this locus
             fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars());
             for (int i=1;i<loc.gene_ids.Count();i++) {
               fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars());
               }
             }
           // NOTE(review): every transcript after the first valid one is listed
           // here, whether or not it was tagged valid -- confirm this is intended
           fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID());
           for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) {
             fprintf(f_out, ",%s",loc.rnas[i]->getID());
             }
           fprintf(f_out, "\n");
           }
         //now print all valid, non-replaced transcripts in this locus:
         for (int i=0;i<loc.rnas.Count();i++) {
           GffObj& t=*(loc.rnas[i]);
           GTData* tdata=(GTData*)(t.uptr);
           if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
           t.addAttr("locus", locname.chars());
           out_counter++;
           if (fmtGTF) t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
           else {
             if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
             //print the parent first, if any
             if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
               GTData* pdata=(GTData*)(t.parent->uptr);
               if (pdata && pdata->geneinfo!=NULL)
                 pdata->geneinfo->finalize();
               t.parent->addAttr("locus", locname.chars());
               t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
               t.parent->udata|=4;
               }
             t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
             }
           }
         } //have valid transcripts to print
       }//for each locus
     //print the rest of the isolated pseudo/gene/region features not printed yet
     if (f_out) {
       while (gfs_i<gdata->gfs.Count()) {
         GffObj& gfst=*(gdata->gfs[gfs_i]);
         if ((gfst.udata&4)==0) { //never printed
           gfst.udata|=4;
           if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
           if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
             gfst.addExon(gfst.start,gfst.end);
           gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
           }
         ++gfs_i;
         }
       }
     } //for each genomic sequence
   }
 else {
   //not grouped into loci, print the rnas with their parents, if any
   int numvalid=0;
   for (int g=0;g<g_data.Count();g++) {
     GenomicSeqData* gdata=g_data[g];
     int gfs_i=0; //next entry of gdata->gfs still to be considered
     for (int m=0;m<gdata->rnas.Count();m++) {
       GffObj& t=*(gdata->rnas[m]);
       if (f_out) {
         //first print the other features starting at or before this transcript
         while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) {
           GffObj& gfst=*(gdata->gfs[gfs_i]);
           if ((gfst.udata&4)==0) { //never printed
             gfst.udata|=4;
             if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
             if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
               gfst.addExon(gfst.start,gfst.end);
             gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
             }
           ++gfs_i;
           }
         }
       GTData* tdata=(GTData*)(t.uptr);
       if (tdata->replaced_by!=NULL) continue;
       if (process_transcript(gfasta, t)) {
         t.udata|=4; //tag it as valid
         numvalid++;
         if (f_out) {
           if (tdata->geneinfo) tdata->geneinfo->finalize();
           out_counter++;
           if (fmtGTF) t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
           else {
             if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
             //print the parent first, if any
             if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
               GTData* pdata=(GTData*)(t.parent->uptr);
               if (pdata && pdata->geneinfo!=NULL)
                 pdata->geneinfo->finalize();
               t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
               t.parent->udata|=4;
               }
             t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
             }
           }//GFF/GTF output requested
         } //valid transcript
       } //for each rna
     //print the rest of the isolated pseudo/gene/region features not printed yet
     if (f_out) {
       while (gfs_i<gdata->gfs.Count()) {
         GffObj& gfst=*(gdata->gfs[gfs_i]);
         if ((gfst.udata&4)==0) { //never printed
           gfst.udata|=4;
           if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
           if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
             gfst.addExon(gfst.start,gfst.end);
           gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
           }
         ++gfs_i;
         }
       }
     } //for each genomic seq
   } //not clustered

 // ---- cleanup
 if (f_repl && f_repl!=stdout) fclose(f_repl);
 seqinfo.Clear();
 //if (faseq!=NULL) delete faseq;
 //if (gcdb!=NULL) delete gcdb;
 GFREE(rfltGSeq);
 FRCLOSE(f_in);
 FWCLOSE(f_out);
 FWCLOSE(f_w);
 FWCLOSE(f_x);
 FWCLOSE(f_y);
 }
bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) { //returns true if the transcript passed the filter char* gname=gffrec.getGeneName(); if (gname==NULL) gname=gffrec.getGeneID(); GStr defline(gffrec.getID()); if (f_out && !fmtGTF) { const char* tname=NULL; if ((tname=gffrec.getAttr("transcript_name"))!=NULL) { gffrec.addAttr("Name", tname); gffrec.removeAttr("transcript_name"); } } if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) { const char* biotype=gffrec.getAttr("gene_biotype"); if (biotype) { gffrec.addAttr("type", biotype); gffrec.removeAttr("gene_biotype"); } else { //old Ensembl files lacking gene_biotype gffrec.addAttr("type", gffrec.getTrackName()); } //bool is_gene=false; bool is_pseudo=false; if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS()) gffrec.setFeatureName("mRNA"); else { if (strcmp(biotype, "processed_transcript")==0) gffrec.setFeatureName("proc_RNA"); else { //is_gene=endsWith(biotype, "gene"); is_pseudo=strifind(biotype, "pseudo"); if (is_pseudo) { gffrec.setFeatureName("pseudo_RNA"); } else if (endsWith(biotype, "RNA")) { gffrec.setFeatureName(biotype); } else gffrec.setFeatureName("misc_RNA"); } } } if (gname && strcmp(gname, gffrec.getID())!=0) { int* isonum=isoCounter.Find(gname); if (isonum==NULL) { isonum=new int(1); isoCounter.Add(gname,isonum); } else (*isonum)++; defline.appendfmt(" gene=%s", gname); } int seqlen=0; const char* tlabel=tracklabel; if (tlabel==NULL) tlabel=gffrec.getTrackName(); //defline.appendfmt(" track:%s",tlabel); char* cdsnt = NULL; char* cdsaa = NULL; int aalen=0; for (int i=1;i<gffrec.exons.Count();i++) { int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1; if (ilen>4000000) GMessage("Warning: very large intron (%d) for transcript %s\n", ilen, gffrec.getID()); if (ilen>maxintron) { return false; } } GList<GSeg> seglst(false,true); GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec); if (spliceCheck && gffrec.exons.Count()>1) { //check introns for splice site consensi ( GT-AG, GC-AG or 
AT-AC ) if (faseq==NULL) GError("Error: no genomic sequence available!\n"); int glen=gffrec.end-gffrec.start+1; const char* gseq=faseq->subseq(gffrec.start, glen); bool revcompl=(gffrec.strand=='-'); bool ssValid=true; for (int e=1;e<gffrec.exons.Count();e++) { const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start; int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1; GSpliceSite acceptorSite(intron,intronlen,true, revcompl); GSpliceSite donorSite(intron,intronlen, false, revcompl); //GMessage("%c intron %d-%d : %s .. %s\n", // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt); if (acceptorSite=="AG") { // GT-AG or GC-AG if (!donorSite.canonicalDonor()) { ssValid=false;break; } } else if (acceptorSite=="AC") { // if (donorSite!="AT") { ssValid=false; break; } } else { ssValid=false; break; } } //GFREE(gseq); if (!ssValid) { if (verbose) GMessage("Invalid splice sites found for '%s'\n",gffrec.getID()); return false; //don't print this one! } } bool trprint=true; int stopCodonAdjust=0; int mCDphase=0; bool hasStop=false; if (gffrec.CDphase=='1' || gffrec.CDphase=='2') mCDphase = gffrec.CDphase-'0'; if (f_y!=NULL || f_x!=NULL || validCDSonly) { if (faseq==NULL) GError("Error: no genomic sequence provided!\n"); //if (protmap && fullCDSonly) { //if (protmap && (fullCDSonly || (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) { if (validCDSonly) { //make sure the stop codon is always included //adjust_stopcodon(gffrec,3); stopCodonAdjust=adjust_stopcodon(gffrec,3); } int strandNum=0; int phaseNum=0; CDS_CHECK: cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, NULL, &seglst); if (cdsnt==NULL) trprint=false; else { //has CDS if (validCDSonly) { cdsaa=translateDNA(cdsnt, aalen, seqlen); char* p=strchr(cdsaa,'.'); hasStop=false; if (p!=NULL) { if (p-cdsaa>=aalen-2) { //stop found as the last codon *p='0';//remove it hasStop=true; if (aalen-2==p-cdsaa) { //previous to last codon is the stop codon //so correct the CDS stop accordingly 
adjust_stopcodon(gffrec,-3, &seglst); stopCodonAdjust=0; //clear artificial stop adjustment seqlen-=3; cdsnt[seqlen]=0; } aalen=p-cdsaa; } else {//stop found before the last codon trprint=false; } }//stop codon found if (trprint==false) { //failed CDS validity check //in-frame stop codon found if (altPhases && phaseNum<3) { phaseNum++; gffrec.CDphase = '0'+((mCDphase+phaseNum)%3); GFREE(cdsaa); goto CDS_CHECK; } if (gffrec.exons.Count()==1 && bothStrands) { strandNum++; phaseNum=0; if (strandNum<2) { GFREE(cdsaa); gffrec.strand = (gffrec.strand=='-') ? '+':'-'; goto CDS_CHECK; //repeat the CDS check for a different frame } } if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID()); } //has in-frame STOP if (fullCDSonly) { if (!hasStop || cdsaa[0]!='M') trprint=false; } } // CDS check requested } //has CDS } //translation or codon check/output was requested if (!trprint) { GFREE(cdsnt); GFREE(cdsaa); return false; } if (stopCodonAdjust>0 && !hasStop) { //restore stop codon location adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst); if (cdsnt!=NULL && seqlen>0) { seqlen-=stopCodonAdjust; cdsnt[seqlen]=0; } if (cdsaa!=NULL) aalen--; } if (f_y!=NULL) { //CDS translation fasta output requested //char* if (cdsaa==NULL) { //translate now if not done before cdsaa=translateDNA(cdsnt, aalen, seqlen); } if (fullattr && gffrec.attrs!=NULL) { //append all attributes found for each transcripts for (int i=0;i<gffrec.attrs->Count();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); defline.append(gffrec.getAttrValue(i)); } } printFasta(f_y, defline, cdsaa, aalen); } if (f_x!=NULL) { //CDS only if (writeExonSegs) { defline.append(" loc:"); defline.append(gffrec.getGSeqName()); defline.appendfmt("(%c)",gffrec.strand); //warning: not CDS coordinates are written here, but the exon ones defline+=(int)gffrec.start; defline+=(char)'-'; defline+=(int)gffrec.end; // -- here these are CDS substring coordinates on the spliced sequence: 
defline.append(" segs:"); for (int i=0;i<seglst.Count();i++) { if (i>0) defline.append(","); defline+=(int)seglst[i]->start; defline.append("-"); defline+=(int)seglst[i]->end; } } if (fullattr && gffrec.attrs!=NULL) { //append all attributes found for each transcript for (int i=0;i<gffrec.attrs->Count();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); defline.append(gffrec.getAttrValue(i)); } } printFasta(f_x, defline, cdsnt, seqlen); } GFREE(cdsnt); GFREE(cdsaa); if (f_w!=NULL) { //write spliced exons uint cds_start=0; uint cds_end=0; seglst.Clear(); char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst); if (exont!=NULL) { if (gffrec.CDstart>0) { defline.appendfmt(" CDS=%d-%d", cds_start, cds_end); } if (writeExonSegs) { defline.append(" loc:"); defline.append(gffrec.getGSeqName()); defline+=(char)'|'; defline+=(int)gffrec.start; defline+=(char)'-'; defline+=(int)gffrec.end; defline+=(char)'|'; defline+=(char)gffrec.strand; defline.append(" exons:"); for (int i=0;i<gffrec.exons.Count();i++) { if (i>0) defline.append(","); defline+=(int)gffrec.exons[i]->start; defline.append("-"); defline+=(int)gffrec.exons[i]->end; } defline.append(" segs:"); for (int i=0;i<seglst.Count();i++) { if (i>0) defline.append(","); defline+=(int)seglst[i]->start; defline.append("-"); defline+=(int)seglst[i]->end; } } if (fullattr && gffrec.attrs!=NULL) { //append all attributes found for each transcripts for (int i=0;i<gffrec.attrs->Count();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); defline.append(gffrec.getAttrValue(i)); } } printFasta(f_w, defline, exont, seqlen); GFREE(exont); } } //writing f_w (spliced exons) return true; }
int main(int argc, char * const argv[]) { GArgs args(argc, argv, "hFCq:r:o:"); int e; if ((e=args.isError())>0) GError("%s\nInvalid argument: %s\n", USAGE, argv[e]); if (args.getOpt('h')!=NULL){ GMessage("%s\n", USAGE); exit(1); } args.startNonOpt(); GStr fadb(args.nextNonOpt()); if (fadb.is_empty()) GError("%s Error: multi-fasta file expected!\n",USAGE); GStr fname(fadb); fname.append(".fai"); bool createLocal=(args.getOpt('F')!=NULL); const char* idxname=(createLocal)? NULL : fname.chars(); GFastaIndex faidx(fadb.chars(), idxname); //also tried to load the index if exists in the current directory GStr fnamecwd(fname); //name in current directory (without path) int ip=-1; if ((ip=fnamecwd.rindex(CHPATHSEP))>=0) { fnamecwd.cut(0,ip+1); } if (!createLocal) { //look for existing indexes to load //try the same directory as the fasta file first if (!faidx.hasIndex() and fileExists(fnamecwd.chars())>1) { //try current working directory next faidx.loadIndex(fnamecwd.chars()); } if (!faidx.hasIndex()) {//could not load any index data //try to create it in the same directory as the fasta file GMessage("No fasta index found. Rebuilding..\n"); faidx.buildIndex(); if (faidx.getCount()==0) GError("Error: no fasta records to be indexed!\n"); GMessage("Fasta index rebuilt.\n"); //check if we can create a file there FILE* fcreate=fopen(fname.chars(), "w"); if (fcreate==NULL) GMessage("Warning: cannot create fasta index %s! 
(permissions?)\n", fname.chars()); else { fclose(fcreate); if (faidx.storeIndex(fname.chars())<faidx.getCount()) GMessage("Warning: error writing the index file %s!\n",fname.chars()); } //creating index file in the same directory as fasta file }//trying to create the index file } if (createLocal || !faidx.hasIndex()) { //simply rebuild the index in the current directory and use it: //remove directories in path, if any if (faidx.getCount()==0) { faidx.buildIndex(); if (faidx.getCount()==0) GError("Error: no fasta records to be indexed!\n"); } if (faidx.storeIndex(fnamecwd.chars())<faidx.getCount()) GMessage("Warning: error writing the index file %s!\n",fnamecwd.chars()); } GStr qry(args.getOpt('q')); if (qry.is_empty()) exit(0); GFastaRec* farec=faidx.getRecord(qry.chars()); if (farec==NULL) { GMessage("Error: couldn't find fasta record for '%s'!\n",qry.chars()); exit(1); } GFaSeqGet faseq(fadb.chars(),farec->seqlen, farec->fpos, farec->line_len, farec->line_blen); //TODO: read these from -r option uint qstart=0; uint qend=0; //farec->seqlen bool revCompl=(args.getOpt('C')!=NULL); char* s=args.getOpt('r'); if (s!=NULL) { char *p=s; while (isdigit(*p)) p++; if (*p=='-') { sscanf(s,"%u-%u",&qstart, &qend); if (qstart==0 || qend==0) GError("Error parsing sequence range: %s\n",s); } else if (*p==':') { int qlen=0; sscanf(s,"%u:%d", &qstart, &qlen); if (qstart==0 || qlen==0) GError("Error parsing sequence range: %s\n",s); qend=qstart+qlen-1; } else if (*p=='.') { sscanf(s,"%u..%u",&qstart, &qend); if (qstart==0 || qend==0) GError("Error parsing sequence range: %s\n",s); } } if (qstart==0) qstart=1; if (qend==0) qend=farec->seqlen; // call faseq.loadall() here if multiple ranges are to be extracted all // over this genomic sequence char* subseq=faseq.copyRange(qstart, qend, revCompl, true); FILE* f_out=NULL; openfwrite(f_out, args, 'o'); if (f_out==NULL) f_out=stdout; writeFasta(f_out, qry.chars(), NULL, subseq, 70, qend-qstart+1); GFREE(subseq); }
/**
 * Parse one sequence line of a layout file and register the sequence with
 * contig `ctg`.
 *
 * Line format, as read by the code: <name> <strand char> <length> <offset>
 * [<left clip> [<right clip>]] followed by optional "X:" attributes; only
 * "G:" (inter-segment data) is interpreted, the others are skipped.
 *
 * @param s   the line with sequence data; it is modified (the name gets
 *            NUL-terminated in place).
 * @param ctg contig the sequence belongs to.
 * @return the new LytSeqInfo, or NULL when the line cannot be parsed or the
 *         sequence name was already seen.
 *
 * Calls GError() on malformed "G:" segment data.
 */
LytSeqInfo* LayoutParser::addSeq(char* s, LytCtgData* ctg) {
  LytSeqInfo* seq;
  //s must be the line with sequence data
  char* p=strchrs(s," \t");
  if (p==NULL) return NULL;
  p++;
  char c;
  int slen, soffs, clpL, clpR;
  clpL=0;clpR=0;
  if (sscanf(p,"%c %d %d %d %d", &c, &slen, &soffs, &clpL, &clpR)<3) return NULL;
  p--;
  *p='\0'; //s now holds just the sequence name
  if ((seq=seqinfo.Find(s))!=NULL) {
    GMessage("Sequence '%s' already found for contig '%s (%d nt)'\n"
             " so it cannot be added for contig '%s (%d nt)'\n",
             s, seq->contig->name, seq->contig->len,
             ctg->name, ctg->len);
    return NULL;
  }
  seq = new LytSeqInfo(s, ctg, soffs, (c=='-') ? 1 : 0, slen, clpL, clpR);
  seqinfo.shkAdd(seq->name, seq);
  ctg->seqs.Add(seq);
  //parse optional extensions, if any
  //p sits on the terminator just written at the end of the name, so the rest
  //of the line starts one character further. (Adding strlen(s) here moved p
  //to s+2*strlen(s), which can lie beyond the end of the line.)
  p++; //position p after the seqname
  char* m=NULL;
  int segEnd, segRclip,nextsegStart, nextsegLclip;
  char segSplice, nextsegSplice;
  while ((m=strchr(p,':'))!=NULL) {
    switch (*(m-1)) { //the attribute letter precedes the ':'
      case 'G': //segmenting info
        p=m+1; //p to the beginning of G: data
        //one "<end>[c<clip>][S|s]-<start>[c<clip>][S|s]" item per
        //inter-segment gap, comma separated
        while (*p>='1' && *p<='9') {
          segEnd=0;
          segRclip=0;
          nextsegStart=0;
          nextsegLclip=0;
          segSplice=0;
          nextsegSplice=0;
          if (!parseInt(p,segEnd))
            GError("Error [segment] at LayoutParser for %s at: %s\n",
                   s, m-1);
          if (*p=='c') {
            p++;
            if (!parseInt(p,segRclip))
              GError("Error [segment] at LayoutParser for %s at: %s\n",
                     s, m-1);
          }
          if (*p=='S' || *p=='s') {
            segSplice=*p;
            p++;
          }
          if (*p!='-')
            GError("Error [segment] at LayoutParser for %s at: %s\n",
                   s, m-1);
          else p++;
          if (!parseInt(p,nextsegStart))
            GError("Error [segment] at LayoutParser for %s at: %s\n",
                   s, m-1);
          if (*p=='c') {
            p++;
            if (!parseInt(p,nextsegLclip))
              GError("Error [segment] at LayoutParser for %s at: %s\n",
                     s, m-1);
          }
          if (*p=='S' || *p=='s') {
            nextsegSplice=*p;
            p++;
          }
          seq->addInterSeg(segEnd,nextsegStart,segRclip,nextsegLclip,
                           segSplice, nextsegSplice);
          if (*p==',') p++;
          else break;
        } //while inter-segment parsing
        break; // 'G:' case
      case 'L': //clone mates list
        p=m+1; //p to the beginning of L: data
        break;
      case 'D': //difference sequence
        p=m+1; //p to the beginning of D: data
        break;
      case 'S': //actual sequence
        p=m+1; //p to the beginning of S: data
        break;
      default:
        p=m+1;//next attribute
    }
  }
  return seq;
}
/* Load contig data; can be called by parse - and then no fseek is needed and the file position if right after parsing the contig summary data */ bool LayoutParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) { bool forgetCtg=false; char* r=NULL; if (ctgidx>=contigs.Count()) GError("LayoutParser: invalid contig index '%d'\n", ctgidx); LytCtgData* ctgdata=contigs[ctgidx]; if (re_pos && currentContig!=NULL) { //free previous contig data //unless it was a parse() call currentContig->seqs.Clear(); seqinfo.Clear(); } currentContig=ctgdata; if (re_pos) { seek(ctgdata->fpos); //position right where the contig definition starts r=linebuf->getLine(f,f_pos);//skip the first line if (r==NULL) return false; } if (seqfn!=NULL) forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, NULL); int ctg_numSeqs=ctgdata->numseqs; int numseqs=0; while ((r=linebuf->getLine(f,f_pos))!=NULL) { if (linebuf->length()<4) continue; if (linebuf->chars()[0]=='>') { linebuf->pushBack(); break; //reached next contig } //sequence data parsing bool forgetSeq=false; LytSeqInfo* seq=NULL; if ((seq=addSeq(linebuf->chars(), ctgdata))==NULL) { GMessage("LayoutParser: error parsing sequence entry:\n%s\n",linebuf->chars()); return false; } /* // Weird -- why would I MODIFY the given clipping of a sequence? 
//-- bool ctg_clipping = (ctgdata->rpos>ctgdata->lpos); if (ctg_clipping) { if (ctgdata->lpos > seq->offs && ctgdata->lpos < seq->offs+seq->length()) seq->left = ctgdata->lpos - seq->offs+1; if (ctgdata->rpos < seq->offs+seq->length() && ctgdata->rpos>seq->offs ) seq->right = ctgdata->rpos-seq->offs+1; } */ if (seqfn!=NULL) forgetSeq=(*seqfn)(numContigs, ctgdata, seq, NULL); if (forgetSeq) { ctg_numSeqs--; seqinfo.Remove(seq->name); ctgdata->seqs.RemovePtr(seq); } else { numseqs++; } } //while sequences if (forgetCtg) { ctgIDs.Remove(ctgdata->name); contigs.RemovePtr(ctgdata); } if (numseqs!=ctg_numSeqs) { GMessage("Mismatching number of sequences found (%d) for contig '%s' " "(length %d, numseqs %d)\n", numseqs, ctgdata->name, ctgdata->len, ctg_numSeqs); return false; } return true; }
int GFastaIndex::buildIndex() { //this parses the whole fasta file, so it could be slow if (fa_name==NULL) GError("Error: GFastaIndex::buildIndex() called with no fasta file!\n"); FILE* fa=fopen(fa_name,"rb"); if (fa==NULL) { GMessage("Warning: cannot open fasta index file: %s!\n",fa_name); return 0; } records.Clear(); GLineReader fl(fa); char* s=NULL; uint seqlen=0; int line_len=0,line_blen=0; bool newSeq=false; //set to true after defline off_t newSeqOffset=0; int prevOffset=0; char* seqname=NULL; int last_len=0; bool mustbeLastLine=false; //true if the line length decreases while ((s=fl.nextLine())!=NULL) { if (s[0]=='>') { if (seqname!=NULL) { if (seqlen==0) GError("Warning: empty FASTA record skipped (%s)!\n",seqname); else { //seqlen!=0 addRecord(seqname, seqlen,newSeqOffset, line_len, line_blen); } } char *p=s; while (*p > 32) p++; *p=0; GFREE(seqname); seqname=Gstrdup(&s[1]); newSeq=true; newSeqOffset=fl.getfpos(); last_len=0; line_len=0; line_blen=0; seqlen=0; mustbeLastLine=false; } //defline parsing else { //sequence line int llen=fl.length(); int lblen=fl.getFpos()-prevOffset; if (newSeq) { //first sequence line after defline line_len=llen; line_blen=lblen; } else {//next seq lines after first if (mustbeLastLine || llen>last_len) GError(ERR_FALINELEN); if (llen<last_len) mustbeLastLine=true; } seqlen+=llen; last_len=llen; newSeq=false; } //sequence line prevOffset=fl.getfpos(); }//for each line of the fasta file if (seqlen>0) addRecord(seqname, seqlen, newSeqOffset, line_len, line_blen); GFREE(seqname); fclose(fa); return records.Count(); }
void bitError(int idx) { GError("Error bit checking (index %d)!\n", idx); }