Пример #1
0
//read only the contig headers from the file
//also checks for duplicate seqnames (just in case)
bool AceParser::parseContigs() {
    if (f!=stdin)  seek(0);
    off_t ctgpos;
    numContigs=0;
    while ((ctgpos=fskipTo("CO "))>=0) {
        numContigs++;
        LytCtgData* ctgdata=new LytCtgData(ctgpos);
        char* p=ctgdata->readName(linebuf->chars()+3, ctgIDs);
        if (p==NULL) {
            GMessage("AceParser: error parsing contig name!\n");
            return false;
        }
        int ctglen, ctg_numSeqs, numseqs;
        //p must be after contig name within linebuf!
        if (sscanf(p, "%d %d", &ctglen, &numseqs)!=2) {
            GMessage("Error parsing contig len and seq count at:\n%s\n",
                     linebuf->chars());
        }
        ctg_numSeqs=numseqs;
        ctgdata->len=ctglen;
        ctgdata->numseqs=numseqs;
        ctgdata->offs = 1;
        contigs.Add(ctgdata);
        //ctgpos=fskipTo("CO ");
    }
    if (ctgpos==-2) return false; //parsing failed: line too long ?!?
    contigs.setSorted(true);
    return true;
}
Пример #2
0
bool AceParser::parse(fnLytSeq* seqfn) {
    //read all seqs and their positions from the file
    //also checks for duplicate seqnames (just in case)
    if (f!=stdin)  seek(0);
    ctgIDs.Clear();
    //GHash<int> ctgIDs; //contig IDs, to make them unique!
    //
    off_t ctgpos;
    numContigs=0;
    while ((ctgpos=fskipTo("CO "))>=0) {
        numContigs++;
        LytCtgData* ctgdata=new LytCtgData(ctgpos);
        char* p=ctgdata->readName(linebuf->chars()+3, ctgIDs);
        if (p==NULL) {
            GMessage("AceParser: error parsing contig name!\n");
            return false;
        }
        int ctglen, numseqs;
        //p must be after contig name within linebuf!
        if (sscanf(p, "%d %d", &ctglen, &numseqs)!=2) {
            GMessage("Error parsing contig len and seq count at:\n%s\n",
                     linebuf->chars());
        }
        ctgdata->len=ctglen;
        ctgdata->numseqs = numseqs;
        ctgdata->offs = 1;
        int ctgidx=contigs.Add(ctgdata);
        loadContig(ctgidx, seqfn, false);
    } //while contigs
    if (ctgpos==-2) return false; //parsing failed: line too long ?!?
    contigs.setSorted(true);
    return true;
}
Пример #3
0
bool LayoutParser::parseContigs() { //load all the file offsets for contigs
  if (f!=stdin) seek(0);
  ctgIDs.Clear();
  //GHash<int> ctgIDs; //contig IDs, to make them unique!
  //
  int ctgpos; //locate the first contig line
  numContigs=0;
  while ((ctgpos=fskipTo(">"))>=0) {
    numContigs++;
    LytCtgData* ctgdata=new LytCtgData(ctgpos);
    char* p=ctgdata->readName(linebuf->chars()+1, ctgIDs);
    if (p==NULL) {
       GMessage("LayoutParser: error parsing contig name:\n%s\n", linebuf->chars());
       return false;
       }
    int ctg_lpos, ctg_rpos, numseqs;
    //p must be after contig name within linebuf!
    ctg_lpos=0;ctg_rpos=0;
    if (sscanf(p, "%d %d %d", &numseqs, &ctg_lpos, &ctg_rpos)<1) {
      GMessage("Error parsing contig len and seq count at:\n%s\n",
         p);
      return false;
      }
    ctgdata->len=ctg_rpos-ctg_lpos+1;
    ctgdata->numseqs=numseqs;
    ctgdata->lpos=ctg_lpos;
    ctgdata->rpos=ctg_rpos;
    ctgdata->offs=ctg_lpos;
    contigs.Add(ctgdata);
    //ctgpos=fskipTo(">");
    } //while lines
  if (ctgpos==-2) return false;
  contigs.setSorted(true);
  return true;
}
Пример #4
0
void testGPVec() {
 GPVec<Gint> vecs[3];
 vecs[1].Add(new Gint(2));
 vecs[2].Add(new Gint(3));
 GMessage("Added to vecs[1]:%d\n", vecs[1][0]->val());
 GMessage("Added to vecs[2]:%d\n", vecs[2][0]->val());
}
Пример #5
0
void printLocus(GffLocus* loc, const char* pre) {
  if (pre!=NULL) fprintf(stderr, "%s", pre);
  GMessage(" [%d-%d] : ", loc->start, loc->end);
  GMessage("%s",loc->rnas[0]->getID());
  for (int i=1;i<loc->rnas.Count();i++) {
    GMessage(",%s",loc->rnas[i]->getID());
    }
  GMessage("\n");
}
Пример #6
0
int Gmkdir(const char *path, bool recursive, int perms) {
	if (path==NULL || path[0]==0) return -1;
	mode_t process_mask = umask(0); //is this really needed?
	if (!recursive) {
	   int r=G_mkdir(path, perms);
	   if (r!=0) 
	      GMessage("Warning: G_mkdir(%s) failed: %s\n", path, strerror(errno));
	   umask(process_mask);
	   return r;
	   }
	int plen=strlen(path);
	char* gpath=NULL;
	//make sure gpath ends with /
	if (path[plen-1]=='/') {
		gpath=Gstrdup(path);
	}
	else {
		GMALLOC(gpath, plen+2);
		strcpy(gpath,path);
		strcat(gpath, "/");
		++plen;
	}
	//char* ss=gpath+plen-1;
	char* psep = gpath+plen-1; //start at the last /
	GDynArray<char*> dirstack(4); // stack of directories that should be created
	while (psep>gpath && *(psep-1)=='/') --psep; //skip double slashes
    *psep='\0';
    int fexists=0;
	while ((fexists=fileExists(gpath))==0) {
      dirstack.Push(psep);
      do { --psep; } while (psep>gpath && *psep!='/');
      if (psep<=gpath) { psep=NULL; break; }
      while (psep>gpath && *(psep-1)=='/') --psep;
      *psep='\0';
	}
	if (psep) *psep='/';
	while (dirstack.Count()>0) {
		psep=dirstack.Pop();
		int mkdir_err=0;
		if ((mkdir_err=G_mkdir(gpath, perms))!=0) {
				GMessage("Warning: mkdir(%s) failed: %s\n", gpath, strerror(errno));
			GFREE(gpath);
			umask(process_mask);
			return -1;
		}
		*psep='/';
	}
	GFREE(gpath);
	umask(process_mask);
	return 0;
}
Пример #7
0
char* AceParser::getSeq(LytSeqInfo* seq) {
    if (f==stdin || seek(seq->fpos)!=0) {
        GMessage("AceParser: error seeking seq '%s'\n", seq->name);
        return NULL;
    }
//skip the contig header:
    char* r=linebuf->getLine(f,f_pos);
    if (r==NULL || !startsWith(linebuf->chars(), "RD ", 3)) {
        GMessage("AceParser: error seeking seq '%s'\n"
                 " (no RD entry found at location %d)\n", seq->name, seq->fpos);
        return NULL;
    }
    return readSeq(seq);
}
Пример #8
0
char* AceParser::getContigSeq(LytCtgData* ctg) {
    if (f==stdin || seek(ctg->fpos)!=0) {
        GMessage("AceParser: error seeking contig '%s'\n", ctg->name);
        return NULL;
    }
//skip the contig header:
    char* r=linebuf->getLine(f,f_pos);
    if (r==NULL || !startsWith(linebuf->chars(), "CO ", 3)) {
        GMessage("AceParser: error seeking contig '%s'\n"
                 " (no CO entry found at location %d)\n", ctg->name, ctg->fpos);
        return NULL;
    }
    return readSeq();
}
Пример #9
0
char* LytCtgData::readName(char* s, GHash<int>& names) {
   char* p=strchrs(s, " \t");
   if (p!=NULL) {
       char* tmp;
       char* tmp2;
       GMALLOC(tmp, (p-s+30)*sizeof(char));
       strncpy(tmp, s,p-s);
       tmp[p-s]='\0';
       GMALLOC(tmp2, (p-s+30)*sizeof(char));
       strcpy(tmp2, tmp);
       //make it unique (by simple versioning)
       int v=0;
       while (names.hasKey(tmp2)) {
         v++;
         sprintf(tmp2, "%s.%d", tmp, v);
         }
       name=Gstrdup(tmp2);
       GFREE(tmp);
       GFREE(tmp2);
       names.shkAdd(name, new int(1));
       p++;
       }
      else {
       GMessage("LytCtgData::readName: Cannot find the token delimiter in:\n%s\n", s);
       }
     return p;
     }
Пример #10
0
void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
  GLineReader fr(f);
  while (!fr.isEof()) {
      char* line=fr.getLine();
      if (line==NULL) break;
      char* id=line;
      char* lenstr=NULL;
      char* text=NULL;
      char* p=line;
      while (*p!=0 && !isspace(*p)) p++;
      if (*p==0) continue;
      *p=0;p++;
      while (*p==' ' || *p=='\t') p++;
      if (*p==0) continue;
      lenstr=p;
      while (*p!=0 && !isspace(*p)) p++;
      if (*p!=0) { *p=0;p++; }
      while (*p==' ' || *p=='\t') p++;
      if (*p!=0) text=p; //else text remains NULL
      int len=0;
      if (!parseInt(lenstr,len)) {
         GMessage("Warning: could not parse sequence length: %s %s\n",
                  id, lenstr);
         continue;
         }
      // --- here we have finished parsing the line
      si.Add(id, new SeqInfo(len,text));
      } //while lines
}
Пример #11
0
LytSeqInfo* AceParser::addSeq(char* s, LytCtgData* ctg) {
    LytSeqInfo* seq;
    if (!startsWith(s, "AF ", 3))
        return NULL;
    s+=3;
    char* p=strchrs(s," \t");
    if (p==NULL) return NULL;
    p++;
    char c;
    int offs;
    if (sscanf(p,"%c %d", &c, &offs)!=2) return NULL;
    p--;
    *p='\0';
    if ((seq=seqinfo.Find(s))!=NULL) {
        GMessage("Sequence '%s' already found for contig '%s (%d nt)'\n"
                 " so it cannot be added for contig '%s (%d)'\n",
                 s, seq->contig->name, seq->contig->len,
                 ctg->name, ctg->len);
        return NULL;
    }
    seq = new LytSeqInfo(s, ctg, offs, (c=='C') ? 1 : 0);
    seqinfo.shkAdd(seq->name, seq);
    ctg->seqs.Add(seq);
    return seq;
}
Пример #12
0
FILE* Gfopen(const char *path, char *mode) {
	FILE* f=NULL;
	if (mode==NULL) f=fopen(path, "rb");
	    	   else f=fopen(path, mode);
	if (f==NULL)
		GMessage("Error opening file '%s':  %s\n", path, strerror(errno));
	return f;
}
Пример #13
0
 void GBamRecord::set_cigar(const char* cigar) {
   //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call
   int doff=b->core.l_qname;
   uint8_t* after_cigar=NULL;
   int after_cigar_len=0;
   uint8_t* prev_bdata=NULL;
   if (b->data_len>doff) {
      //cigar string already allocated, replace it
      int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
      after_cigar=b->data+d;
      after_cigar_len=b->data_len-d;
      }
   const char *s;
   char *t;
   int i, op;
   long x;
   b->core.n_cigar = 0;
   if (cigar != NULL && strcmp(cigar, "*") != 0) {
        for (s = cigar; *s; ++s) {
            if (isalpha(*s)) b->core.n_cigar++;
            else if (!isdigit(*s)) {
                 GError("Error: invalid CIGAR character (%s)\n",cigar);
                 }
            }
        if (after_cigar_len>0) { //replace/insert into existing full data
             prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
             memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
             free(prev_bdata);
             }
           else {
             realloc_bdata(b, doff + b->core.n_cigar * 4);
             }
        for (i = 0, s = cigar; i != b->core.n_cigar; ++i) {
            x = strtol(s, &t, 10);
            op = toupper(*t);
            if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
            else if (op == 'I') op = BAM_CINS;
            else if (op == 'D') op = BAM_CDEL;
            else if (op == 'N') op = BAM_CREF_SKIP; //has_Introns=true;
            else if (op == 'S') op = BAM_CSOFT_CLIP; //soft_Clipped=true;
            else if (op == 'H') op = BAM_CHARD_CLIP; //hard_Clipped=true;
            else if (op == 'P') op = BAM_CPAD;
            else GError("Error: invalid CIGAR operation (%s)\n",cigar);
            s = t + 1;
            bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
        }
        if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar);
        b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
    } else {//no CIGAR string given
        if (!(b->core.flag&BAM_FUNMAP)) {
            GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
            b->core.flag |= BAM_FUNMAP;
        }
        b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
    }
   setupCoordinates();
   } //set_cigar()
Пример #14
0
bool LayoutParser::parse(fnLytSeq* seqfn) {
  //read all seqs and their positions from the file
  //also checks for duplicate seqnames (just in case)
  if (f!=stdin) seek(0);
  ctgIDs.Clear();
  //GHash<int> ctgIDs; //contig IDs, to make them unique!
  //
  int ctgpos;
  numContigs=0;
  while ((ctgpos=fskipTo(">"))>=0) { //locate the contig line
    numContigs++;
    LytCtgData* ctgdata=new LytCtgData(ctgpos);
    char* p=ctgdata->readName(linebuf->chars()+1, ctgIDs);
    if (p==NULL) {
       GMessage("LayoutParser: error parsing contig name:\n%s\n", linebuf->chars());
       return false;
       }
    int ctg_lpos, ctg_rpos, numseqs;
    //p must be after contig name within linebuf!
    ctg_lpos=0;ctg_rpos=0;
    if (sscanf(p, "%d %d %d", &numseqs, &ctg_lpos, &ctg_rpos)<1) {
      GMessage("Error parsing contig len and seq count at:\n%s\n",
         p);
      return false;
      }
    //ctg_numSeqs=numseqs;
    ctgdata->numseqs=numseqs;
    ctgdata->rpos=ctg_rpos;
    ctgdata->lpos=ctg_lpos;
    ctgdata->len=ctg_rpos-ctg_lpos+1;
    ctgdata->offs=ctg_lpos;
    int ctgidx=contigs.Add(ctgdata);
    //now look and load all the component sequences
    loadContig(ctgidx, seqfn, false);
    } //while lines
  contigs.setSorted(true);
  return true;
  }
Пример #15
0
void GFastaIndex::addRecord(const char* seqname, uint seqlen, off_t foffs, int llen, int llen_full) {
     GFastaRec* farec=records.Find(seqname);
     if (farec!=NULL) {
          GMessage("Warning: duplicate sequence ID (%s) added to the fasta index! Only last entry data will be kept.\n");
          farec->seqlen=seqlen;
          farec->fpos=foffs;
          farec->line_len=llen;
          farec->line_blen=llen_full;
          }
     else {
         farec=new GFastaRec(seqlen,foffs,llen,llen_full);
         records.Add(seqname,farec);
         farec->seqname=records.getLastKey();
         }
     }
Пример #16
0
void GWait::GUIDoWait(int code)
{
	dowait_cnt += code;
	if(dowait_cnt > 0)
	{
		if(code > 0 && dowait_cnt == 1 && NULL == dowait_win)
		{
			dowait_win = new GWait();
			if(dowait_win)
			{
				dowait_win->nextObj =Gdesktop->parent->focus->nextObj;
				Gdesktop->parent->focus->nextObj = dowait_win;								//adds the new item to the Z list
				dowait_win->parent = Gdesktop->parent;										//LCD
				GQueue.push(GMessage(WM_INIT, 0, (long long)((LCD_MULT *)(Gdesktop->parent))->lcd, dowait_win));
			}
			if(!dowait_win)
				dowait_cnt = 0;
		}
		else
		{
			if(dowait_win)
			{
				if(code == 1) // begin wait
					dowait_win->add_owner();
				if(code == 0) // restore wait
					dowait_win->SetTimer(ID_BUSY_CLOCK, BUSY_START_TIME);
			}
		}
	}
	else
	{
		if(dowait_win)
		{
			GMessage msg;
			dowait_win->process_destroy(msg);
			delete dowait_win;
			dowait_win = NULL;
		}
		dowait_cnt = 0; // prevent underflow
	}
	dowait_locker = NULL;
}
Пример #17
0
int GFastaIndex::loadIndex(const char* finame) { //load record info from existing fasta index
    if (finame==NULL) finame=fai_name;
    if (finame!=fai_name) {
      fai_name=Gstrdup(finame);
      }
    if (fai_name==NULL) GError("Error: GFastaIndex::loadIndex() called with no file name!\n");
    records.Clear();
    haveFai=false;
    FILE* fi=fopen(fai_name,"rb");
    if (fi==NULL) {
       GMessage("Warning: cannot open fasta index file: %s!\n",fai_name);
       return 0;
       }
    GLineReader fl(fi);
    char* s=NULL;
    while ((s=fl.nextLine())!=NULL) {
      if (*s=='#') continue;
      char* p=strchrs(s,"\t ");
      if (p==NULL) GError(ERR_FAIDXLINE,s);
      *p=0; //s now holds the genomic sequence name
      p++;
      uint len=0;
      int line_len=0, line_blen=0;
      #ifdef __WIN32__
         long offset=-1;
         sscanf(p, "%d%ld%d%d", &len, &offset, &line_len, &line_blen);
      #else
         long long offset=-1;
         sscanf(p, "%d%lld%d%d", &len, &offset, &line_len, &line_blen);
      #endif
      if (len==0 || line_len==0 || line_blen==0 || line_blen<line_len)
          GError(ERR_FAIDXLINE,p);
      addRecord(s,len,offset,line_len, line_blen);
      }
    fclose(fi);
    haveFai=(records.Count()>0);
    return records.Count();
    }
Пример #18
0
off_t LayoutParser::fskipTo(const char* linestart, const char* butnot) {
  /* reads the file from the current position
  until the next occurence of a line
  starting with linestart
  returns the line in buf and the file offset
  of the beginning of the line;
  the file offset is -1 if linestart was not found
  or -2 if an unwanted line-start came out of order
  */
    off_t lastpos=getFilePos();
    int tlen=strlen(linestart);
    int nlen=(butnot==NULL) ? 0 : strlen(butnot);
    while (linebuf->getLine(f, f_pos)!=NULL) {
     if (nlen>0 && startsWith(linebuf->line(), butnot, nlen)) {
        GMessage("fSkipTo: unwanted line '%s' encountered when searching for '%s'\n",
          linebuf->line(), linestart);
        return -2;
        }
     if (startsWith(linebuf->chars(), linestart, tlen)) return lastpos;
     lastpos=getFilePos();
     }
 return -1;
 }
Пример #19
0
void read_mRNAs(FILE* f, GList<GSeqData>& seqdata, GList<GSeqData>* ref_data,
	         int check_for_dups, int qfidx, const char* fname, bool only_multiexon) {
	//>>>>> read all transcripts/features from a GTF/GFF3 file
	//int imrna_counter=0;
#ifdef HEAPROFILE
    if (IsHeapProfilerRunning())
      HeapProfilerDump("00");
#endif
	int loci_counter=0;
	if (ref_data==NULL) ref_data=&seqdata;
	bool isRefData=(&seqdata==ref_data);
	                          //(f, transcripts_only)
	GffReader* gffr=new GffReader(f, true); //load only transcript annotations
	gffr->showWarnings(gtf_tracking_verbose);
	//            keepAttrs   mergeCloseExons   noExonAttrs
	gffr->readAll(!isRefData,          true,        isRefData || gtf_tracking_largeScale);
	//so it will read exon attributes only for low number of Cufflinks files
#ifdef HEAPROFILE
    if (IsHeapProfilerRunning())
      HeapProfilerDump("post_readAll");
#endif

	int d=parse_mRNAs(gffr->gflst, seqdata, isRefData, check_for_dups, qfidx,only_multiexon);
#ifdef HEAPROFILE
    if (IsHeapProfilerRunning())
      HeapProfilerDump("post_parse_mRNAs");
#endif
	if (gtf_tracking_verbose && d>0) {
	  if (isRefData) GMessage(" %d duplicate reference transcripts discarded.\n",d);
	            else GMessage(" %d redundant query transfrags discarded.\n",d);
	  }
	//imrna_counter=gffr->mrnas.Count();
	delete gffr; //free the extra memory and unused GffObjs
#ifdef HEAPROFILE
    if (IsHeapProfilerRunning())
      HeapProfilerDump("post_del_gffr");
#endif
	
	//for each genomic sequence, cluster transcripts
	int oriented_by_overlap=0;
	int initial_unoriented=0;
	int final_unoriented=0;
	GStr bname(fname);
	GStr s;
	if (!bname.is_empty()) {
		int di=bname.rindex('.');
		if (di>0) bname.cut(di);
		int p=bname.rindex('/');
		if (p<0) p=bname.rindex('\\');
		if (p>=0) bname.remove(0,p);
	}
	FILE* fdis=NULL;
	FILE* frloci=NULL;

	for (int g=0;g<seqdata.Count();g++) {
		//find the corresponding refseqdata with the same gseq_id
		int gseq_id=seqdata[g]->get_gseqid();
		if (!isRefData) { //query data, find corresponding ref data
			GSeqData* rdata=getRefData(gseq_id, *ref_data);
			initial_unoriented+=seqdata[g]->umrnas.Count();
			if (seqdata[g]->umrnas.Count()>0) {
			    oriented_by_overlap+=fix_umrnas(*seqdata[g], rdata, fdis);
			    final_unoriented+=seqdata[g]->umrnas.Count();
			    }
			}
		//>>>>> group mRNAs into locus-clusters (based on exon overlap)
		cluster_mRNAs(seqdata[g]->mrnas_f, seqdata[g]->loci_f, qfidx);
		cluster_mRNAs(seqdata[g]->mrnas_r, seqdata[g]->loci_r, qfidx);
		if (!isRefData) {
			cluster_mRNAs(seqdata[g]->umrnas, seqdata[g]->nloci_u, qfidx);
			}
		loci_counter+=seqdata[g]->loci_f.Count();
		loci_counter+=seqdata[g]->loci_r.Count();
//		if (refData) {
//			if (frloci==NULL) {
//				s=bname;
//				s.append(".loci.lst");
//				frloci=fopen(s.chars(), "w");
//			}
//			writeLoci(frloci, seqdata[g]->loci_f);
//			writeLoci(frloci, seqdata[g]->loci_r);
//		}//write ref loci
	}//for each genomic sequence
	if (fdis!=NULL) fclose(fdis);
	if (frloci!=NULL) fclose(frloci);
	if (initial_unoriented || final_unoriented) {
	  if (gtf_tracking_verbose) GMessage(" Found %d transfrags with undetermined strand (%d out of initial %d were fixed by overlaps)\n",
			    final_unoriented, oriented_by_overlap, initial_unoriented);
	}
	//if (fdis!=NULL) remove(s.chars()); remove 0-length file
#ifdef HEAPROFILE
    if (IsHeapProfilerRunning())
      HeapProfilerDump("post_cluster");
#endif
}
Пример #20
0
void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate, 
                          bool doCluster, bool doCollapseRedundant, 
						  bool matchAllIntrons, bool fuzzSpan, bool forceExons) {
	GffReader* gffr=new GffReader(f, this->transcriptsOnly, false); //not only mRNA features, not sorted
	gffr->showWarnings(this->showWarnings);
	//           keepAttrs   mergeCloseExons  noExonAttr
	gffr->readAll(this->fullAttributes,    this->mergeCloseExons,  this->noExonAttrs);
	GVec<int> pseudoAttrIds;
	GVec<int> pseudoFeatureIds;
	if (this->noPseudo) {
		GffNameList& fnames = gffr->names->feats;
		for (int i=0;i<fnames.Count();i++) {
			char* n=fnames[i]->name;
			if (startsWith(n, "pseudo")) {
				pseudoFeatureIds.Add(fnames[i]->idx);
			}
		}
		GffNameList& attrnames = gffr->names->attrs;
		for (int i=0;i<attrnames.Count();i++) {
			char* n=attrnames[i]->name;
			char* p=strifind(n, "pseudo");
			if (p==n || (p==n+2 && tolower(n[0])=='i' && tolower(n[1])=='s')) {
				pseudoAttrIds.Add(attrnames[i]->idx);
			}
		}
	}

	//int redundant=0; //redundant annotation discarded
	if (verbose) GMessage("   .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars());
	//int rna_deleted=0;
	//add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates
	for (int k=0;k<gffr->gflst.Count();k++) {
		GffObj* m=gffr->gflst[k];
		if (strcmp(m->getFeatureName(), "locus")==0 &&
				m->getAttr("transcripts")!=NULL) {
			continue; //discard locus meta-features
		}
		if (this->noPseudo) {
			bool is_pseudo=false;
			for (int i=0;i<pseudoFeatureIds.Count();++i) {
				if (pseudoFeatureIds[i]==m->ftype_id) {
					is_pseudo=true;
					break;
				}
			}
			if (is_pseudo) continue;
			for (int i=0;i<pseudoAttrIds.Count();++i) {
				char* attrv=NULL;
				if (m->attrs!=NULL) attrv=m->attrs->getAttr(pseudoAttrIds[i]);
				if (attrv!=NULL) {
					char fc=tolower(attrv[0]);
					if (fc=='t' || fc=='y' || fc=='1') {
						is_pseudo=true;
						break;
					}
				}
			}
			if (is_pseudo) continue;
			//last resort:
			//  scan all the attribute values for "pseudogene" keyword (NCBI does that for "product" attr)
			/*
			 if (m->attrs!=NULL) {
				 for (int i=0;i<m->attrs->Count();++i) {
					 GffAttr& a=*(m->attrs->Get(i));
					 if (strifind(a.attr_val, "pseudogene")) {
						 is_pseudo=true;
						 break;
					 }
				 }
			 }
			 if (is_pseudo) continue;
			 */
		} //pseudogene detection requested
		char* rloc=m->getAttr("locus");
		if (rloc!=NULL && startsWith(rloc, "RLOC_")) {
			m->removeAttr("locus", rloc);
		}
		/*
     if (m->exons.Count()==0 && m->children.Count()==0) {
       //a non-mRNA feature with no subfeatures
       //add a dummy exon just to have the generic exon checking work
       m->addExon(m->start,m->end);
       }
		 */
		if (forceExons) {  // && m->children.Count()==0) {
			m->exon_ftype_id=gff_fid_exon;
		}
		//GList<GffObj> gfadd(false,false); -- for gf_validate()?
		if (gf_validate!=NULL && !(*gf_validate)(m, NULL)) {
			continue;
		}
		m->isUsed(true); //so the gffreader won't destroy it
		int i=-1;
		GenomicSeqData f(m->gseq_id);
		GenomicSeqData* gdata=NULL;
		if (seqdata.Found(&f,i)) gdata=seqdata[i];
		else { //entry not created yet for this genomic seq
			gdata=new GenomicSeqData(m->gseq_id);
			seqdata.Add(gdata);
		}
		/*
		for (int k=0;k<gfadd.Count();k++) {
			bool keep=placeGf(gfadd[k], gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
			if (!keep) {
				gfadd[k]->isUsed(false);
				//DEBUG
				GMessage("Feature %s(%d-%d) is going to be discarded..\n",gfadd[k]->getID(), gfadd[k]->start, gfadd[k]->end);
			}
		}
		*/
		bool keep=placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan);
		if (!keep) {
			m->isUsed(false);
			//DEBUG
			//GMessage("Feature %s(%d-%d) is going to be discarded..\n",m->getID(), m->start, m->end);
		}
	} //for each read gffObj
	//if (verbose) GMessage("  .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars());
	if (f!=stdin) { fclose(f); f=NULL; }
	delete gffr;
}
Пример #21
0
int parse_mRNAs(GfList& mrnas,
				 GList<GSeqData>& glstdata,
				 bool is_ref_set,
				 int check_for_dups,
				 int qfidx, bool only_multiexon) {
	int tredundant=0; //redundant transcripts discarded
	int total_kept=0;
	int total_seen=mrnas.Count();
	for (int k=0;k<mrnas.Count();k++) {
		GffObj* m=mrnas[k];
		int i=-1;
		GSeqData f(m->gseq_id);
		GSeqData* gdata=NULL;
		uint tlen=m->len();
		if (m->hasErrors() || (tlen+500>GFF_MAX_LOCUS)) { //should probably report these in a file too..
			if (gtf_tracking_verbose) 
			      GMessage("Warning: transcript %s discarded (structural errors found, length=%d).\n", m->getID(), tlen);
			continue;
			}
		if (only_multiexon && m->exons.Count()<2) {
			continue;
			}
		//GStr feature(m->getFeatureName());
		//feature.lower();
		//bool gene_or_locus=(feature.endsWith("gene") ||feature.index("loc")>=0);
		//if (m->exons.Count()==0 && gene_or_locus) {
		if (m->isDiscarded()) {
			//discard generic "gene" or "locus" features with no other detailed subfeatures
			if (!is_ref_set && gtf_tracking_verbose)
			   GMessage("Warning: discarding non-transfrag (GFF generic gene/locus container?) %s\n",m->getID());
			continue;
			}

		if (m->exons.Count()==0) {
				if (gtf_tracking_verbose && !is_ref_set)
				 GMessage("Warning: %s %s found without exon segments (adding default exon).\n",m->getFeatureName(), m->getID());
				m->addExon(m->start,m->end);
				}
		if (glstdata.Found(&f,i)) gdata=glstdata[i];
		else {
			gdata=new GSeqData(m->gseq_id);
			glstdata.Add(gdata);
			}
		
		double fpkm=0;
		double cov=0;
		double conf_hi=0;
		double conf_lo=0;

		GList<GffObj>* target_mrnas=NULL;
		if (is_ref_set) { //-- ref transcripts
		   if (m->strand=='.') {
		     //unknown strand - discard from reference set (!)
		     continue;
		     }
		   total_kept++;
		   target_mrnas=(m->strand=='+') ? &(gdata->mrnas_f) : &(gdata->mrnas_r);
		   if (check_for_dups) {
		     //check all gdata->mrnas_r (ref_data) for duplicate ref transcripts
		     int rpidx=-1;
		     GffObj* rp= is_RefDup(m, *target_mrnas, rpidx);
		     if (rp!=NULL) { //duplicate found
		      //discard one of them
		      //but let's keep the gene_name if present
		      //DEBUG:
		      //GMessage("Ref duplicates: %s = %s\n", rp->getID(), m->getID());
		      tredundant++;
		      total_kept--;
		      if (betterDupRef(rp, m)) {
		           if (rp->getGeneName()==NULL && m->getGeneName()!=NULL) {
		                  rp->setGeneName(m->getGeneName());
		                  }
		           continue;
		           }
		         else {
		           if (m->getGeneName()==NULL && rp->getGeneName()!=NULL) {
		                  m->setGeneName(rp->getGeneName());
		                  }
		           ((CTData*)(rp->uptr))->mrna=NULL;
		           rp->isUsed(false);
		           target_mrnas->Forget(rpidx);
		           target_mrnas->Delete(rpidx);
		           }
		       }
		     } //check for duplicate ref transcripts
		   } //ref transcripts
		else { //-- query transfrags
		   if (m->strand=='+') { target_mrnas = &(gdata->mrnas_f); }
		     else if (m->strand=='-') { target_mrnas=&(gdata->mrnas_r); }
		        else { m->strand='.'; target_mrnas=&(gdata->umrnas); }
		   total_kept++;
		   if (check_for_dups) { //check for redundancy
		     // check if there is a redundancy between this and another already loaded Cufflinks transcript
		     int cidx =  is_Redundant(m, target_mrnas, (check_for_dups>1));
		     if (cidx>=0) {
		        //always discard the redundant transcript with the fewer exons OR shorter
			     tredundant++;
		         total_kept--;
		    	 if (t_dominates(target_mrnas->Get(cidx),m)) {
		            //new transcript is shorter, discard it
		        	if (gtf_tracking_verbose) {
		        		GMessage(" transfrag %s discarded (made redundant by %s)\n", m->getID(), target_mrnas->Get(cidx)->getID());
		        	}
		            continue;
		        }
		        else {
		            //discard the older transfrag
		        	if (gtf_tracking_verbose) {
		        		GMessage(" transfrag %s discarded (made redundant by %s)\n", target_mrnas->Get(cidx)->getID(), m->getID());
		        	}
		            ((CTData*)(target_mrnas->Get(cidx)->uptr))->mrna=NULL;
		            target_mrnas->Get(cidx)->isUsed(false);
		            target_mrnas->Forget(cidx);
		            target_mrnas->Delete(cidx);
		            //the uptr (CTData) pointer will still be kept in gdata->ctdata and deallocated eventually
		        }
		     }
		   }// redundant transfrag check
		   if (m->gscore==0.0)   
		     m->gscore=m->exons[0]->score; //Cufflinks exon score = isoform abundance
		   //const char* expr = (gtf_tracking_largeScale) ? m->getAttr("FPKM") : m->exons[0]->getAttr(m->names,"FPKM");
		   const char* expr = m->getAttr("FPKM");
		   if (expr!=NULL) {
		       if (expr[0]=='"') expr++;
		       fpkm=strtod(expr, NULL);
		       } else { //backward compatibility: read RPKM if FPKM not found
		       //expr=(gtf_tracking_largeScale) ? m->getAttr("RPKM") : m->exons[0]->getAttr(m->names,"RPKM");
		       expr=m->getAttr("RPKM");
		       if (expr!=NULL) {
		           if (expr[0]=='"') expr++;
		           fpkm=strtod(expr, NULL);
		           }
		       }
		   //const char* scov=(gtf_tracking_largeScale) ? m->getAttr("cov") : m->exons[0]->getAttr(m->names,"cov");
		   const char* scov=m->getAttr("cov");
		   if (scov!=NULL) {
		       if (scov[0]=='"') scov++; 
		       cov=strtod(scov, NULL);
		       }
		   //const char* sconf_hi=(gtf_tracking_largeScale) ? m->getAttr("conf_hi") : m->exons[0]->getAttr(m->names,"conf_hi");
		   const char* sconf_hi=m->getAttr("conf_hi");
		   if (sconf_hi!=NULL){
		       if (sconf_hi[0]=='"') sconf_hi++;
		       conf_hi=strtod(sconf_hi, NULL);
		       }
		   //const char* sconf_lo=(gtf_tracking_largeScale) ? m->getAttr("conf_lo") : m->exons[0]->getAttr(m->names,"conf_lo");
		   const char* sconf_lo=m->getAttr("conf_lo");
		   if (sconf_lo!=NULL) {
		       if (sconf_lo[0]=='"') sconf_lo++;
		       conf_lo=strtod(sconf_lo, NULL);
		       }
		   } //Cufflinks transfrags
		target_mrnas->Add(m);
		m->isUsed(true);
		CTData* mdata=new CTData(m);
		mdata->qset=qfidx;
		gdata->tdata.Add(mdata);
		if (!is_ref_set) {
		// Cufflinks - attributes parsing
		   mdata->FPKM=fpkm;
		   mdata->cov=cov;
		   mdata->conf_hi=conf_hi;
		   mdata->conf_lo=conf_lo;
		   }
	}//for each mrna read
	if (gtf_tracking_verbose) {
		if (is_ref_set)
       GMessage(" Kept %d ref transcripts out of %d\n", total_kept, total_seen);
		else
	   GMessage(" Kept %d transfrags out of %d\n", total_kept, total_seen);
	}
 //if (mrna_deleted>0)
 //  mrnas.Pack();
 
 //return (is_ref_set ? refdiscarded : tredundant);
	return tredundant;
}
Пример #22
0
int main(int argc, char * const argv[]) {
  GArgs args(argc, argv, "hFCq:r:o:");
  int e;
  if ((e=args.isError())>0)
    GError("%s\nInvalid argument: %s\n", USAGE, argv[e]);
    if (args.getOpt('h')!=NULL){
      GMessage("%s\n", USAGE);
                exit(1);
      }
  args.startNonOpt();
  GStr fadb(args.nextNonOpt());
  if (fadb.is_empty()) GError("%s Error: multi-fasta file expected!\n",USAGE);
  GStr fname(fadb);
  fname.append(".fai");
  bool createLocal=(args.getOpt('F')!=NULL);
  const char* idxname=(createLocal)? NULL : fname.chars();
  GFastaIndex faidx(fadb.chars(), idxname);
  //also tried to load the index if exists in the current directory
  GStr fnamecwd(fname); //name in current directory (without path)
  int ip=-1;
  if ((ip=fnamecwd.rindex(CHPATHSEP))>=0) {
    fnamecwd.cut(0,ip+1);
    }
  if (!createLocal) { //look for existing indexes to load
    //try the same directory as the fasta file first
    if (!faidx.hasIndex() and fileExists(fnamecwd.chars())>1) { //try current working directory next
        faidx.loadIndex(fnamecwd.chars());
       }
    if (!faidx.hasIndex()) {//could not load any index data
       //try to create it in the same directory as the fasta file
       GMessage("No fasta index found. Rebuilding..\n");
       faidx.buildIndex();
       if (faidx.getCount()==0) GError("Error: no fasta records to be indexed!\n");
       GMessage("Fasta index rebuilt.\n");
       //check if we can create a file there
       FILE* fcreate=fopen(fname.chars(), "w");
       if (fcreate==NULL)
         GMessage("Warning: cannot create fasta index %s! (permissions?)\n", fname.chars());
       else {
         fclose(fcreate);
         if (faidx.storeIndex(fname.chars())<faidx.getCount())
           GMessage("Warning: error writing the index file %s!\n",fname.chars());
         } //creating index file in the same directory as fasta file
       }//trying to create the index file
    }

  if (createLocal || !faidx.hasIndex()) {
    //simply rebuild the index in the current directory and use it:
    //remove directories in path, if any
    if (faidx.getCount()==0) {
          faidx.buildIndex();
          if (faidx.getCount()==0) GError("Error: no fasta records to be indexed!\n");
          }
    if (faidx.storeIndex(fnamecwd.chars())<faidx.getCount())
        GMessage("Warning: error writing the index file %s!\n",fnamecwd.chars());
    }

  GStr qry(args.getOpt('q'));
  if (qry.is_empty()) exit(0);
  GFastaRec* farec=faidx.getRecord(qry.chars());
  if (farec==NULL) {
      GMessage("Error: couldn't find fasta record for '%s'!\n",qry.chars());
      exit(1);
      }
  GFaSeqGet faseq(fadb.chars(),farec->seqlen, farec->fpos, farec->line_len, farec->line_blen);
  //TODO: read these from -r option
  uint qstart=0;
  uint qend=0; //farec->seqlen
  bool revCompl=(args.getOpt('C')!=NULL);
  char* s=args.getOpt('r');
  if (s!=NULL) {
     char *p=s;
     while (isdigit(*p)) p++;
     if (*p=='-') {
                sscanf(s,"%u-%u",&qstart, &qend);
                if (qstart==0 || qend==0)
                      GError("Error parsing sequence range: %s\n",s);
                }
       else if (*p==':') {
                int qlen=0;
                sscanf(s,"%u:%d", &qstart, &qlen);
                if (qstart==0 || qlen==0)
                     GError("Error parsing sequence range: %s\n",s);
                qend=qstart+qlen-1;
                }
       else if (*p=='.') {
               sscanf(s,"%u..%u",&qstart, &qend);
               if (qstart==0 || qend==0)
               GError("Error parsing sequence range: %s\n",s);
               }
     }
  if (qstart==0) qstart=1;
  if (qend==0) qend=farec->seqlen;
  // call faseq.loadall() here if multiple ranges are to be extracted all
  // over this genomic sequence
  char* subseq=faseq.copyRange(qstart, qend, revCompl, true);
  FILE* f_out=NULL;
  openfwrite(f_out, args, 'o');
  if (f_out==NULL) f_out=stdout;
  writeFasta(f_out, qry.chars(), NULL, subseq, 70, qend-qstart+1);
  GFREE(subseq);
}
Пример #23
0
bool GffLoader::placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant,
                                               bool matchAllIntrons, bool fuzzSpan) {
  bool keep=false;
  GTData* tdata=NULL;
  //int tidx=-1;
  /*
  if (debug) {
     GMessage(">>Placing transcript %s\n", t->getID());
     debugState=true;
     }
    else debugState=false; 
   */
  //dumb TRNA case for RefSeq: gene parent link missing
  //try to restore it here; BUT this only works if gene feature comes first
  ////DEBUG ONLY:
  //if (strcmp(t->getID(),"id24448")==0) { //&& t->start==309180) {
  //	 GMessage("placeGf %s (%d, %d) (%d exons)\n", t->getID(),t->start, t->end, t->exons.Count());
  //}
  //GMessage("DBG>>Placing transcript %s(%d-%d, %d exons)\n", t->getID(), t->start, t->end, t->exons.Count());
  if (t->parent==NULL && t->isTranscript()) {
  	int gidx=gdata->gfs.Count()-1;
  	while (gidx>=0 && gdata->gfs[gidx]->end>=t->start) {
  		GffObj& g = *(gdata->gfs[gidx]);
  		if (g.isGene() && t->strand==g.strand && exonOverlap2Gene(t, g)) {
  			g.children.Add(t);
  			keep=true;
  			if (tdata==NULL) {
  		       tdata=new GTData(t); //additional transcript data
  		       gdata->tdata.Add(tdata);
  			}
  			t->parent=&g;
  			//disable printing of gene if transcriptsOnly
  			if (transcriptsOnly) {
  				g.udata|=4; //tag it as non-printable
  			}
  			const char* geneName=g.getAttr("Name");
  			if (t->getAttr("Name")==NULL && geneName) {
  				t->addAttr("Name", geneName);
  				t->addAttr("gene_name", geneName);
  			}
  			t->addAttr("geneID", g.getID());
  			break;
  		}
  		--gidx;
  	}
  }

  /*
	if (t->exons.Count()==0  && t->children.Count()==0 && forceExons) {
		//a non-mRNA feature with no subfeatures
		//just so we get some sequence functions working, add a dummy "exon"-like subfeature here
		//--this could be a single "pseudogene" entry or another genomic region without exons
		//
		t->addExon(t->start,t->end);
	}
  */
  if (t->exons.Count()>0) {
                gdata->rnas.Add(t); //added it in sorted order
    		    if (tdata==NULL) {
    		       tdata=new GTData(t); //additional transcript data
    		       gdata->tdata.Add(tdata);
    		    }
                keep=true;
              }
            else {
              if (t->isGene() || !this->transcriptsOnly) {
              	   gdata->gfs.Add(t);
              	   keep=true;
              	   //GTData* tdata=new GTData(t); //additional transcript data
        		   if (tdata==NULL) {
        		       tdata=new GTData(t); //additional transcript data
        		       gdata->tdata.Add(tdata);
        		   }
              	   return true;
                 }
              else
                 return false; //nothing to do with these non-transcript objects
              }
  if (!doCluster) return keep;
  if (!keep) return false;
  //---- place into a locus
  if (gdata->loci.Count()==0) {
       gdata->loci.Add(new GffLocus(t));
       return true; //new locus on this ref seq
       }
  int nidx=qsearch_gloci(t->end, gdata->loci); //get index of nearest locus starting just ABOVE t->end
  //GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx);
  if (nidx==0) {
     //cannot have any overlapping loci
     //if (debug) GMessage("  <<no ovls possible, create locus %d-%d \n",t->start, t->end);
     gdata->loci.Add(new GffLocus(t));
     return true;
     }
  if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end
  int lfound=0; //count of parent loci
  GArray<int> mrgloci(false);
  GList<GffLocus> tloci(true); //candidate parent loci to adopt this
  //if (debug) GMessage("\tchecking all loci from %d to 0\n",nidx-1);
  for (int l=nidx-1;l>=0;l--) {
      GffLocus& loc=*(gdata->loci[l]);
      if (loc.strand!='.' && t->strand!='.'&& loc.strand!=t->strand) continue;
      if (t->start>loc.end) {
           if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already
           continue;
           }
      if (loc.start>t->end) {
               //this should never be the case if nidx was found correctly
               GMessage("Warning: qsearch_gloci found loc.start>t.end!(t=%s)\n", t->getID());
               continue;
               }
      if (loc.add_RNA(t)) {
         //will add this transcript to loc
         lfound++;
         mrgloci.Add(l);
         if (collapseRedundant) {
           //compare to every single transcript in this locus
           for (int ti=0;ti<loc.rnas.Count();ti++) {
                 if (loc.rnas[ti]==t) continue;
                 GTData* odata=(GTData*)(loc.rnas[ti]->uptr);
                 //GMessage("  ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID());
                 GffObj* container=NULL;
                 if (odata->replaced_by==NULL && 
                      (container=redundantTranscripts(*t, *(loc.rnas[ti]), matchAllIntrons, fuzzSpan))!=NULL) {
                     if (container==t) {
                        odata->replaced_by=t;
                        preserveContainedCDS(t, loc.rnas[ti]);
                        }
                     else {// t is being replaced by previously defined transcript
                        tdata->replaced_by=loc.rnas[ti];
                        preserveContainedCDS(loc.rnas[ti], t);
                        }
                     }
              }//for each transcript in the exon-overlapping locus
          } //if doCollapseRedundant
         } //overlapping locus
      } //for each existing locus
  if (lfound==0) {
      //overlapping loci not found, create a locus with only this mRNA
      int addidx=gdata->loci.Add(new GffLocus(t));
      if (addidx<0) {
         //should never be the case!
         GMessage("  WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end);
         }
      }
   else { //found at least one overlapping locus
     lfound--;
     int locidx=mrgloci[lfound];
     GffLocus& loc=*(gdata->loci[locidx]);
     //last locus index found is also the smallest index
     if (lfound>0) {
       //more than one loci found parenting this mRNA, merge loci
       /* if (debug)
          GMessage(" merging %d loci \n",lfound);
       */
       for (int l=0;l<lfound;l++) {
          int mlidx=mrgloci[l]; 
          loc.addMerge(*(gdata->loci[mlidx]), t);
          gdata->loci.Delete(mlidx); //highest indices first, so it's safe to remove
          }
       }
     int i=locidx;  
     while (i>0 && loc<*(gdata->loci[i-1])) {
       //bubble down until it's in the proper order
       i--;
       gdata->loci.Swap(i,i+1);
       }
     }//found at least one overlapping locus
  return true;
}
Пример #24
0
 void GBamRecord::add_aux(const char* str) {
     //requires: being called AFTER add_quals()
     int strl=strlen(str);
     //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux;
     //int doff0=doff;
     if (strl < 6 || str[2] != ':' || str[4] != ':')
         parse_error("missing colon in auxiliary data");
     tag[0] = str[0]; tag[1] = str[1];
     uint8_t atype = str[3];
     uint8_t* adata=abuf;
     int alen=0;
     if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility
         atype='A';
         alen=1;
         adata=(uint8_t*)&str[5];
         }
      else if (atype == 'I' || atype == 'i') {

		 long long x=strtoll(str+5, NULL, 10); //(long long)atoll(str + 5);
		 //long x=(long)atol(str + 5);
		 if (x < 0) {
             if (x >= -127) {
                 atype='c';
                 abuf[0] =  (int8_t)x;
                 alen=1;
                 }
             else if (x >= -32767) {
                 atype = 's';
                 *(int16_t*)abuf = (int16_t)x;
                 alen=2;
                 }
             else {
                 atype='i';
                 *(int32_t*)abuf = (int32_t)x;
                 alen=4;
                 if (x < -2147483648ll)
                     GMessage("Parse warning: integer %lld is out of range.", x);
                 }
             } else { //x >=0
             if (x <= 255) {
                 atype = 'C';
                 abuf[0] = (uint8_t)x;
                 alen=1;
                 }
             else if (x <= 65535) {
                 atype='S';
                 *(uint16_t*)abuf = (uint16_t)x;
                 alen=2;
                 }
             else {
                 atype='I';
                 *(uint32_t*)abuf = (uint32_t)x;
                 alen=4;
                 if (x > 4294967295ll)
                     GMessage("Parse warning: integer %lld is out of range.", x);
                 }
             }
         } //integer type
         else if (atype == 'f') {
             *(float*)abuf = (float)atof(str + 5);
             alen = sizeof(float);
             }
         else if (atype == 'd') { //?
             *(float*)abuf = (float)atof(str + 9);
             alen=8;
             }
         else if (atype == 'Z' || atype == 'H') {
             if (atype == 'H') { // check whether the hex string is valid
                 if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even");
                 for (int i = 0; i < strl - 5; ++i) {
                     int c = toupper(str[5 + i]);
                     if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
                         parse_error("invalid hex character");
                     }
                 }
             memcpy(abuf, str + 5, strl - 5);
             abuf[strl-5] = 0;
             alen=strl-4;
             } else parse_error("unrecognized aux type");
  this->add_aux(tag, atype, alen, adata);
  }//add_aux()
Пример #25
0
bool AceParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) {

    bool forgetCtg = false;
    if (ctgidx>=contigs.Count())
        GError("LayoutParser: invalid contig index '%d'\n", ctgidx);
    LytCtgData* ctgdata=contigs[ctgidx];
    if (re_pos && currentContig!=NULL) { //free previously loaded contig data
        currentContig->seqs.Clear();       // unless it was a parse() call
        seqinfo.Clear();
    }
    currentContig=ctgdata;
    int ctg_numSeqs=ctgdata->numseqs;

    if (re_pos) {
        seek(ctgdata->fpos); //position right where the contig definition starts
        char *r = linebuf->getLine(f,f_pos);
        if (r==NULL) return false;
    }

    if (seqfn!=NULL) { //process the contig sequence!
        char* ctgseq=readSeq();
        forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, ctgseq);
        GFREE(ctgseq); //obviously the caller should have made a copy
    }
    //now look for all the component sequences
    if (fskipTo("AF ")<0) {
        GMessage("AceParser: error finding sequence offsets (AF)"
                 " for contig '%s' (%d)\n", ctgdata->name, ctgdata->len);
        return false;
    }
    int numseqs=0;
    while (startsWith(linebuf->chars(), "AF ",3)) {
        if (addSeq(linebuf->chars(), ctgdata)==NULL) {
            GMessage("AceParser: error parsing AF entry:\n%s\n",linebuf->chars());
            return false;
        }
        numseqs++;
        //read next line:
        linebuf->getLine(f,f_pos);
    }
    if (numseqs!=ctg_numSeqs) {
        GMessage("Invalid number of AF entries found (%d) for contig '%s' "
                 "(length %d, numseqs %d)\n", numseqs,
                 ctgdata->name, ctgdata->len, ctg_numSeqs);
        return false;
    }
    //now read each sequence entry
    off_t seqpos=fskipTo("RD ");
    numseqs=0; //count again, now the RD entries
    if (seqpos<0) {
        GMessage("AceParser: error locating first RD entry for contig '%s'\n",
                 ctgdata->name);
        return false;
    }
    //int numseqs=0;
    //reading the actual component sequence details
    while (startsWith(linebuf->chars(), "RD ",3)) {
        char* s=linebuf->chars()+3;
        char* p=strchrs(s, " \t");
        LytSeqInfo* seq;
        if (p==NULL) {
            GMessage("AceParser: Error parsing RD header line:\n%s\n", linebuf->chars());
            return false;
        }
        *p='\0';
        if ((seq=seqinfo.Find(s))==NULL) {
            GMessage("AceParser: unknown RD encountered: '%s'\n", s);
            return false;
        }
        p++; //now p is in linebuf after the RD name
        seq->fpos=seqpos;
        int len;
        if (sscanf(p, "%d", &len)!=1) {
            GMessage("AceParser: cannot parse RD length for '%s'\n", s);
            return false;
        }
        seq->setLength(len);
        //read the sequence data here if a callback fn was given:
        char* sseq=NULL;
        if (seqfn!=NULL)
            sseq=readSeq(seq); //read full sequence here
        if (fskipTo("QA ")<0) {
            GMessage("AceParser: Error finding QA entry for read %s! (fpos=%llu)\n", seq->name, (unsigned long long)f_pos);
            return false;
        }
        //parse QA entry:
        int tmpa, tmpb;
        if (sscanf(linebuf->chars()+3, "%d %d %d %d", &tmpa, &tmpb, &seq->left,&seq->right)!=4 ||
                seq->left<=0 || seq->right<=0) {
            GMessage("AceParser: Error parsing QA entry.\n");
            return false;
        }
        /*
        if (fskipTo("DS")<0) {
             GMessage("AceParser: Error closing RD entry ('DS' not found).\n");
             return false;
             }
             */
        seqpos=getFilePos()+1;
        bool forgetSeq=false;
        if (seqfn!=NULL) {
            forgetSeq=(*seqfn)(numContigs, ctgdata, seq, sseq);
            GFREE(sseq);
        }
        if (forgetSeq) { //parsing the whole stream -- aceconv)
            ctg_numSeqs--;
            seqinfo.Remove(seq->name);
            ctgdata->seqs.RemovePtr(seq);
        }
        numseqs++;
        if (numseqs<ctgdata->numseqs)
            seqpos=fskipTo("RD ", "CO "); //more sequences left to read
    }
    if (numseqs!=ctgdata->numseqs) {
        GMessage("Error: Invalid number of RD entries found (%d) for contig '%s' "
                 "(length %d, numseqs %d)\n", numseqs,
                 ctgdata->name, ctgdata->len, ctg_numSeqs);
        return false;
    }
    if (forgetCtg) {
        ctgIDs.Remove(ctgdata->name);
        ctgdata->seqs.Clear();
        seqinfo.Clear();
        contigs.RemovePtr(ctgdata);
    }
    return true;
}
Пример #26
0
/*
 Load contig data; can be called by parse - and then no fseek is needed and
 the file position if right after parsing the contig summary data
*/
bool LayoutParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) {
    bool forgetCtg=false;
    char* r=NULL;
    if (ctgidx>=contigs.Count())
      GError("LayoutParser: invalid contig index '%d'\n", ctgidx);

    LytCtgData* ctgdata=contigs[ctgidx];
    if (re_pos && currentContig!=NULL) { //free previous contig data
                                          //unless it was a parse() call
      currentContig->seqs.Clear();
      seqinfo.Clear();
      }
    currentContig=ctgdata;
    if (re_pos) {
       seek(ctgdata->fpos); //position right where the contig definition starts
       r=linebuf->getLine(f,f_pos);//skip the first line
       if (r==NULL) return false;
       }
    if (seqfn!=NULL)
       forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, NULL);
    int ctg_numSeqs=ctgdata->numseqs;
    int numseqs=0;
    while ((r=linebuf->getLine(f,f_pos))!=NULL) {
       if (linebuf->length()<4) continue;
       if (linebuf->chars()[0]=='>') {
            linebuf->pushBack();
            break; //reached next contig
            }
       //sequence data parsing

       bool forgetSeq=false;
       LytSeqInfo* seq=NULL;
       if ((seq=addSeq(linebuf->chars(), ctgdata))==NULL) {
         GMessage("LayoutParser: error parsing sequence entry:\n%s\n",linebuf->chars());
         return false;
         }
        /*
        // Weird -- why would I MODIFY the given clipping of a sequence?
        //--
        bool ctg_clipping = (ctgdata->rpos>ctgdata->lpos);
        if (ctg_clipping) {
          if (ctgdata->lpos > seq->offs && ctgdata->lpos < seq->offs+seq->length())
             seq->left = ctgdata->lpos - seq->offs+1;
            if (ctgdata->rpos < seq->offs+seq->length() && ctgdata->rpos>seq->offs )
             seq->right = ctgdata->rpos-seq->offs+1;
          } */
        if (seqfn!=NULL)
          forgetSeq=(*seqfn)(numContigs, ctgdata, seq, NULL);
        if (forgetSeq) {
            ctg_numSeqs--;
            seqinfo.Remove(seq->name);
            ctgdata->seqs.RemovePtr(seq);
            }
          else {
            numseqs++;
            }
       } //while sequences
     if (forgetCtg) {
      ctgIDs.Remove(ctgdata->name);
      contigs.RemovePtr(ctgdata);
      }
    if (numseqs!=ctg_numSeqs) {
       GMessage("Mismatching number of sequences found (%d) for contig '%s' "
         "(length %d, numseqs %d)\n", numseqs,
                ctgdata->name, ctgdata->len, ctg_numSeqs);
       return false;
       }
return true;
}
Пример #27
0
char* AceParser::readSeq(LytSeqInfo* sqinfo) {
//assumes the next line is where a sequence starts!
//stops at the next empty line encountered
    char* buf;
    static char rlenbuf[12]= {0,0,0,0,0,0,0,0,0,0,0,0}; //buffer for parsing the gap length
    int rlenbufacc=0; //how many digits accumulated in rlenbuf so far
    int buflen=512;
    GMALLOC(buf, buflen); //this MUST be freed by the caller
    buf[0]='\0';
    int accrd=0; //accumulated read length so far -- excludes interseg gaps!
    int rgpos=0; //accumulated offset including interseg gaps!
    char *r = linebuf->getLine(f,f_pos);
    int linelen=linebuf->length();
    char r_splice=0, l_splice=0;
    while (linelen>0) {
        if (r==NULL) {
            GMessage("AceParser: error reading sequence data\n");
            return NULL;
        }
        //-- pass the line content for accumulation
        int i=0;
        while (r[i]) {
            while (r[i] && isdigit(r[i])) {
                rlenbuf[rlenbufacc]=r[i];
                rlenbufacc++;
                i++;
            }
            if (r[i]==0) break; //end of line reached already
            //now r[i] is surely a non-digit char
            if (rlenbufacc>0) { //have we just had a number before?
                rlenbuf[rlenbufacc]=0;
                if (r[i]=='=' || r[i]=='-') {
                    i++;
                    if (r[i]==0) break;
                } else { //check for splice site markers for this introns
                    if (r[i]=='('||r[i]=='[') {
                        l_splice=r[i];
                        i++;
                        if (r[i]==0) break;
                    }
                    if (r[i]==')'||r[i]==']') {
                        r_splice=r[i];
                        i++;
                        if (r[i]==0) break;
                    }
                }//splice check
                add_intron(buf, accrd, rgpos, rlenbuf, sqinfo, l_splice, r_splice);
                rlenbufacc=0;
                r_splice=0;
                l_splice=0;
                //i++;//skip the gap character
            }
            //check for digits here and break the linebuf as needed
            int bi=i; //start of non-digit run
            while (r[i] && !isdigit(r[i])) i++;
            int nl=(i-bi); //length of non-digit run
            if (nl>0) {
                int si=accrd;
                accrd+=nl;
                rgpos+=nl;
                if (accrd>=buflen-1) {
                    buflen=accrd+512;
                    GREALLOC(buf,buflen);
                }
                //append these non-digit chars
                for(int b=0; b<nl; b++) {
                    buf[si+b]=r[bi+b];
                }
            }//non-digit run
        } //while line chars

        /*
        //-- append the line to buf
        accrd+=linelen;
        if (accrd>=buflen) {
        	buflen+=1024;
        	GREALLOC(buf,buflen);
        	}
        strcat(buf, r);
        */

        r=linebuf->getLine(f,f_pos);
        linelen=linebuf->length();
    }//while linelen>0

//add the 0-ending
    buf[accrd]=0;
    return buf;
}
Пример #28
0
LytSeqInfo* LayoutParser::addSeq(char* s, LytCtgData* ctg) {
 LytSeqInfo* seq;
 //s must be the line with sequence data
 char* p=strchrs(s," \t");
 if (p==NULL) return NULL;
 p++;
 char c;
 int slen, soffs, clpL, clpR;
 clpL=0;clpR=0;
 if (sscanf(p,"%c %d %d %d %d", &c, &slen, &soffs, &clpL, &clpR)<3) return NULL;
 p--;
 *p='\0';
 if ((seq=seqinfo.Find(s))!=NULL) {
   GMessage("Sequence '%s' already found for contig '%s (%d nt)'\n"
      " so it cannot be added for contig '%s (%d nt)'\n",
     s, seq->contig->name, seq->contig->len,
     ctg->name, ctg->len);
   return NULL;
   }
 seq = new LytSeqInfo(s, ctg, soffs, (c=='-') ? 1 : 0, slen, clpL, clpR);
 seqinfo.shkAdd(seq->name, seq);
 ctg->seqs.Add(seq);
 //parse optional extensions, if any
 p+=strlen(s); //position p after the seqname
 char* m=NULL;
 int segEnd, segRclip,nextsegStart, nextsegLclip, prevSegStart;
 char segSplice, nextsegSplice;
 while ((m=strchr(p,':'))!=NULL) {
  switch (*(m-1)) {
    case 'G': //segmenting info
       prevSegStart=soffs+clpL-1;
       p=m+1;  //p to the beginning of G: data
       //accumulate the total length in lenSegs
       while (*p>='1' && *p<='9') {
         segEnd=0;
         segRclip=0;
         nextsegStart=0;
         nextsegLclip=0;
         segSplice=0;
         nextsegSplice=0;
         if (!parseInt(p,segEnd))
            GError("Error [segment] at LayoutParser for %s at: %s\n",
                 s, m-1);
         if (*p=='c') {
            p++;
            if (!parseInt(p,segRclip))
              GError("Error [segment] at LayoutParser for %s at: %s\n",
                      s, m-1);
            }
         if (*p=='S' || *p=='s') {
            segSplice=*p; p++;
            }
         if (*p!='-')
                GError("Error [segment] at LayoutParser for %s at: %s\n",
                      s, m-1);
            else p++;
         if (!parseInt(p,nextsegStart))
            GError("Error [segment] at LayoutParser for %s at: %s\n",
                 s, m-1);
         if (*p=='c') {
            p++;
            if (!parseInt(p,nextsegLclip))
              GError("Error [segment] at LayoutParser for %s at: %s\n",
                      s, m-1);
            }
         if (*p=='S' || *p=='s') {
            nextsegSplice=*p; p++;
            }
         seq->addInterSeg(segEnd,nextsegStart,segRclip,nextsegLclip,
                                         segSplice, nextsegSplice);
         prevSegStart=nextsegStart;
         //
         if (*p==',') p++;
             else break;
         } //while inter-segment parsing
       break; // 'G:' case
    case 'L': //clone mates list
       p=m+1; //p to the beginning of L: data
       break;
    case 'D': //difference sequence
       p=m+1; //p to the beginning of D: data
       break;
    case 'S': //actual sequence
       p=m+1; //p to the beginning of S: data
       break;
    default:
       p=m+1;//next attribute
    }
  }

 return seq;
}
Пример #29
0
bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
 //returns true if the transcript passed the filter
 char* gname=gffrec.getGeneName();
 if (gname==NULL) gname=gffrec.getGeneID();
 GStr defline(gffrec.getID());
 if (f_out && !fmtGTF) {
     const char* tname=NULL;
     if ((tname=gffrec.getAttr("transcript_name"))!=NULL) {
        gffrec.addAttr("Name", tname);
        gffrec.removeAttr("transcript_name");
        }
     }
 if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
      const char* biotype=gffrec.getAttr("gene_biotype");
      if (biotype) {
         gffrec.addAttr("type", biotype);
         gffrec.removeAttr("gene_biotype");
         }
       else { //old Ensembl files lacking gene_biotype
         gffrec.addAttr("type", gffrec.getTrackName());
         }

      //bool is_gene=false;
      bool is_pseudo=false;
      if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS())
                gffrec.setFeatureName("mRNA");
       else {
          if (strcmp(biotype, "processed_transcript")==0) 
              gffrec.setFeatureName("proc_RNA");
            else {
              //is_gene=endsWith(biotype, "gene");
              is_pseudo=strifind(biotype, "pseudo");
              if (is_pseudo) {
                   gffrec.setFeatureName("pseudo_RNA");
                   }
                else if (endsWith(biotype, "RNA")) {
                   gffrec.setFeatureName(biotype);
                   } else gffrec.setFeatureName("misc_RNA");
              }
          }
      }
 if (gname && strcmp(gname, gffrec.getID())!=0) {
   int* isonum=isoCounter.Find(gname);
   if  (isonum==NULL) {
       isonum=new int(1);
       isoCounter.Add(gname,isonum);
       }
      else (*isonum)++;
   defline.appendfmt(" gene=%s", gname);
   }
  int seqlen=0;

  const char* tlabel=tracklabel;
  if (tlabel==NULL) tlabel=gffrec.getTrackName();
  //defline.appendfmt(" track:%s",tlabel);
  char* cdsnt = NULL;
  char* cdsaa = NULL;
  int aalen=0;
  for (int i=1;i<gffrec.exons.Count();i++) {
     int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
     if (ilen>4000000) 
            GMessage("Warning: very large intron (%d) for transcript %s\n",
                           ilen, gffrec.getID());
     if (ilen>maxintron) {
         return false;
         }
     }
  GList<GSeg> seglst(false,true);
  GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec);
  if (spliceCheck && gffrec.exons.Count()>1) {
    //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
    if (faseq==NULL) GError("Error: no genomic sequence available!\n");
    int glen=gffrec.end-gffrec.start+1;
    const char* gseq=faseq->subseq(gffrec.start, glen);
    bool revcompl=(gffrec.strand=='-');
    bool ssValid=true;
    for (int e=1;e<gffrec.exons.Count();e++) {
      const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
      int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
      GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
      GSpliceSite    donorSite(intron,intronlen, false, revcompl);
      //GMessage("%c intron %d-%d : %s .. %s\n",
      //           gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
      if (acceptorSite=="AG") { // GT-AG or GC-AG
         if (!donorSite.canonicalDonor()) {
            ssValid=false;break;
            }
         }
      else if (acceptorSite=="AC") { //
         if (donorSite!="AT") { ssValid=false; break; }
         }
      else { ssValid=false; break; }
      }
    //GFREE(gseq);
    if (!ssValid) {
      if (verbose)
         GMessage("Invalid splice sites found for '%s'\n",gffrec.getID());
      return false; //don't print this one!
      }
    }

  bool trprint=true;
  int stopCodonAdjust=0;
  int mCDphase=0;
  bool hasStop=false;
  if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
      mCDphase = gffrec.CDphase-'0';
  if (f_y!=NULL || f_x!=NULL || validCDSonly) {
    if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
    //if (protmap && fullCDSonly) {
    //if (protmap && (fullCDSonly ||  (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) {
    
    if (validCDSonly) { //make sure the stop codon is always included 
      //adjust_stopcodon(gffrec,3);
      stopCodonAdjust=adjust_stopcodon(gffrec,3);
      }
    int strandNum=0;
    int phaseNum=0;
  CDS_CHECK:
    cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, NULL, &seglst);
    if (cdsnt==NULL) trprint=false;
    else { //has CDS
      if (validCDSonly) {
         cdsaa=translateDNA(cdsnt, aalen, seqlen);
         char* p=strchr(cdsaa,'.');
         hasStop=false;
         if (p!=NULL) {
              if (p-cdsaa>=aalen-2) { //stop found as the last codon
                      *p='0';//remove it
                      hasStop=true;
                      if (aalen-2==p-cdsaa) {
                        //previous to last codon is the stop codon
                        //so correct the CDS stop accordingly
                        adjust_stopcodon(gffrec,-3, &seglst);
                        stopCodonAdjust=0; //clear artificial stop adjustment
                        seqlen-=3;
                        cdsnt[seqlen]=0;
                        }
                      aalen=p-cdsaa;
                      }
                   else {//stop found before the last codon
                      trprint=false;
                      }
              }//stop codon found
         if (trprint==false) { //failed CDS validity check
           //in-frame stop codon found
           if (altPhases && phaseNum<3) {
              phaseNum++;
              gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
              GFREE(cdsaa);
              goto CDS_CHECK;
              }
           if (gffrec.exons.Count()==1 && bothStrands) {
              strandNum++;
              phaseNum=0;
              if (strandNum<2) {
                 GFREE(cdsaa);
                 gffrec.strand = (gffrec.strand=='-') ? '+':'-';
                 goto CDS_CHECK; //repeat the CDS check for a different frame
                 }
              }
           if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID());
           } //has in-frame STOP
         if (fullCDSonly) {
             if (!hasStop || cdsaa[0]!='M') trprint=false;
             }
         } // CDS check requested
      } //has CDS
    } //translation or codon check/output was requested
  if (!trprint) {
    GFREE(cdsnt);
    GFREE(cdsaa);
    return false;
    }
  if (stopCodonAdjust>0 && !hasStop) {
          //restore stop codon location
          adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
          if (cdsnt!=NULL && seqlen>0) {
             seqlen-=stopCodonAdjust;
             cdsnt[seqlen]=0;
             }
          if (cdsaa!=NULL) aalen--;
          }

  if (f_y!=NULL) { //CDS translation fasta output requested
         //char* 
         if (cdsaa==NULL) { //translate now if not done before
           cdsaa=translateDNA(cdsnt, aalen, seqlen);
           }
         if (fullattr && gffrec.attrs!=NULL) {
             //append all attributes found for each transcripts
              for (int i=0;i<gffrec.attrs->Count();i++) {
                defline.append(" ");
                defline.append(gffrec.getAttrName(i));
                defline.append("=");
                defline.append(gffrec.getAttrValue(i));
                }
              }
         printFasta(f_y, defline, cdsaa, aalen);
         }
   if (f_x!=NULL) { //CDS only
         if (writeExonSegs) {
              defline.append(" loc:");
              defline.append(gffrec.getGSeqName());
              defline.appendfmt("(%c)",gffrec.strand);
              //warning: not CDS coordinates are written here, but the exon ones
              defline+=(int)gffrec.start;
              defline+=(char)'-';
              defline+=(int)gffrec.end;
              // -- here these are CDS substring coordinates on the spliced sequence:
              defline.append(" segs:");
              for (int i=0;i<seglst.Count();i++) {
                  if (i>0) defline.append(",");
                  defline+=(int)seglst[i]->start;
                  defline.append("-");
                  defline+=(int)seglst[i]->end;
                  }
              }
         if (fullattr && gffrec.attrs!=NULL) {
             //append all attributes found for each transcript
              for (int i=0;i<gffrec.attrs->Count();i++) {
                defline.append(" ");
                defline.append(gffrec.getAttrName(i));
                defline.append("=");
                defline.append(gffrec.getAttrValue(i));
                }
              }
         printFasta(f_x, defline, cdsnt, seqlen);
         }
 GFREE(cdsnt);
 GFREE(cdsaa);
 if (f_w!=NULL) { //write spliced exons
    uint cds_start=0;
    uint cds_end=0;
    seglst.Clear();
    char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
    if (exont!=NULL) {
    if (gffrec.CDstart>0) {
        defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
        }
      if (writeExonSegs) {
        defline.append(" loc:");
        defline.append(gffrec.getGSeqName());
        defline+=(char)'|';
        defline+=(int)gffrec.start;
        defline+=(char)'-';
        defline+=(int)gffrec.end;
        defline+=(char)'|';
        defline+=(char)gffrec.strand;
        defline.append(" exons:");
        for (int i=0;i<gffrec.exons.Count();i++) {
                if (i>0) defline.append(",");
                defline+=(int)gffrec.exons[i]->start;
                defline.append("-");
                defline+=(int)gffrec.exons[i]->end;
                }
        defline.append(" segs:");
        for (int i=0;i<seglst.Count();i++) {
            if (i>0) defline.append(",");
            defline+=(int)seglst[i]->start;
            defline.append("-");
            defline+=(int)seglst[i]->end;
            }
        }
      if (fullattr && gffrec.attrs!=NULL) {
       //append all attributes found for each transcripts
        for (int i=0;i<gffrec.attrs->Count();i++) {
          defline.append(" ");
          defline.append(gffrec.getAttrName(i));
          defline.append("=");
          defline.append(gffrec.getAttrValue(i));
          }
        }
      printFasta(f_w, defline, exont, seqlen);
      GFREE(exont);
      }
    } //writing f_w (spliced exons)
 return true;
}
Пример #30
0
int main(int argc, char * const argv[]) {
 GArgs args(argc, argv, 
   "debug;merge;cluster-only;help;force-exons;no-pseudo;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
 args.printError(USAGE, true);
 if (args.getOpt('h') || args.getOpt("help")) {
    GMessage("%s",USAGE);
    exit(1);
    }
 debugMode=(args.getOpt("debug")!=NULL);
 decodeChars=(args.getOpt('D')!=NULL);
 forceExons=(args.getOpt("force-exons")!=NULL);
 NoPseudo=(args.getOpt("no-pseudo")!=NULL);
 mRNAOnly=(args.getOpt('O')==NULL);
 //sortByLoc=(args.getOpt('S')!=NULL);
 addDescr=(args.getOpt('A')!=NULL);
 verbose=(args.getOpt('v')!=NULL);
 wCDSonly=(args.getOpt('C')!=NULL);
 validCDSonly=(args.getOpt('V')!=NULL);
 altPhases=(args.getOpt('H')!=NULL);
 fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
 bothStrands=(args.getOpt('B')!=NULL);
 fullCDSonly=(args.getOpt('J')!=NULL);
 spliceCheck=(args.getOpt('N')!=NULL);
 bool matchAllIntrons=(args.getOpt('K')==NULL);
 bool fuzzSpan=(args.getOpt('Q')!=NULL);
 if (args.getOpt('M') || args.getOpt("merge")) {
    doCluster=true;
    doCollapseRedundant=true;
    }
   else {
    if (!matchAllIntrons || fuzzSpan) {
      GMessage("%s",USAGE);
      GMessage("Error: -K or -Q options require -M/--merge option!\n");
      exit(1);
      }
    }
 if (args.getOpt("cluster-only")) {
    doCluster=true;
    doCollapseRedundant=false;
    if (!matchAllIntrons || fuzzSpan) {
      GMessage("%s",USAGE);
      GMessage("Error: -K or -Q options have no effect with --cluster-only.\n");
      exit(1);
      }
    }
 if (fullCDSonly) validCDSonly=true;
 if (verbose) { 
     fprintf(stderr, "Command line was:\n");
     args.printCmdLine(stderr);
     }

 fullattr=(args.getOpt('F')!=NULL);
 if (args.getOpt('G')==NULL) 
    noExonAttr=!fullattr;
   else {
     noExonAttr=true;
     fullattr=true;
     }
 if (NoPseudo && !fullattr) {
	 noExonAttr=true;
	 fullattr=true;
 }
 ensembl_convert=(args.getOpt('L')!=NULL);
 if (ensembl_convert) {
    fullattr=true;
    noExonAttr=false;
    //sortByLoc=true;
    }
    
 mergeCloseExons=(args.getOpt('Z')!=NULL);
 multiExon=(args.getOpt('U')!=NULL);
 writeExonSegs=(args.getOpt('W')!=NULL);
 tracklabel=args.getOpt('t');
 GFastaDb gfasta(args.getOpt('g'));
 //if (gfasta.fastaPath!=NULL)
 //    sortByLoc=true; //enforce sorting by chromosome/contig
 GStr s=args.getOpt('i');
 if (!s.is_empty()) maxintron=s.asInt();
 
 FILE* f_repl=NULL;
 s=args.getOpt('d');
 if (!s.is_empty()) {
   if (s=="-") f_repl=stdout;
     else {
       f_repl=fopen(s.chars(), "w");
       if (f_repl==NULL) GError("Error creating file %s\n", s.chars());
       }
   }
 
 rfltWithin=(args.getOpt('R')!=NULL);
 s=args.getOpt('r');
 if (!s.is_empty()) {
   s.trim();
   if (s[0]=='+' || s[0]=='-') {
     rfltStrand=s[0];
     s.cut(0,1);
     }
   int isep=s.index(':');
   if (isep>0) { //gseq name given
      if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
        isep--;
        rfltStrand=s[isep];
        s.cut(isep,1);
        }
      if (isep>0) 
          rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
      s.cut(0,isep+1);
      }
   GStr gsend;
   char slast=s[s.length()-1];
   if (rfltStrand==0 && (slast=='+' || slast=='-')) {
      s.chomp(slast);
      rfltStrand=slast;
      }
   if (s.index("..")>=0) gsend=s.split("..");
                    else gsend=s.split('-');
   if (!s.is_empty()) rfltStart=(uint)s.asInt();
   if (!gsend.is_empty()) {
      rfltEnd=(uint)gsend.asInt();
      if (rfltEnd==0) rfltEnd=MAX_UINT;
      }
   } //gseq/range filtering
 else {
   if (rfltWithin)
     GError("Error: option -R requires -r!\n");
   //if (rfltWholeTranscript)
   //  GError("Error: option -P requires -r!\n");
   }
 s=args.getOpt('m');
 if (!s.is_empty()) {
   FILE* ft=fopen(s,"r");
   if (ft==NULL) GError("Error opening reference table: %s\n",s.chars());
   loadRefTable(ft, reftbl);
   fclose(ft);
   }
 s=args.getOpt('s');
 if (!s.is_empty()) {
   FILE* fsize=fopen(s,"r");
   if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
   loadSeqInfo(fsize, seqinfo);
   fclose(fsize);
   }

 openfw(f_out, args, 'o');
 //if (f_out==NULL) f_out=stdout;
 if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
  GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");

 openfw(f_w, args, 'w');
 openfw(f_x, args, 'x');
 openfw(f_y, args, 'y');
 if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
 //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
 
 int numfiles = args.startNonOpt();
 //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
 int out_counter=0; //number of records printed
 while (true) {
   GStr infile;
   if (numfiles) {
          infile=args.nextNonOpt();
          if (infile.is_empty()) break;
          if (infile=="-") { f_in=stdin; infile="stdin"; }
               else 
                 if ((f_in=fopen(infile, "r"))==NULL)
                    GError("Error: cannot open input file %s!\n",infile.chars());
          }
        else 
          infile="-";
   GffLoader gffloader(infile.chars());
   gffloader.transcriptsOnly=mRNAOnly;
   gffloader.fullAttributes=fullattr;
   gffloader.noExonAttrs=noExonAttr;
   gffloader.mergeCloseExons=mergeCloseExons;
   gffloader.showWarnings=(args.getOpt('E')!=NULL);
   gffloader.noPseudo=NoPseudo;
   gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant, 
                             matchAllIntrons, fuzzSpan, forceExons);
   if (doCluster) 
     collectLocusData(g_data);
   if (numfiles==0) break;
   }
   
 GStr loctrack("gffcl");
 if (tracklabel) loctrack=tracklabel;
 g_data.setSorted(&gseqCmpName);
 GffPrintMode exonPrinting;
 if (fmtGTF) {
	 exonPrinting = pgtfAny;
 } else {
	 exonPrinting = forceExons ? pgffBoth : pgffAny;
 }
 bool firstGff3Print=!fmtGTF;
 if (doCluster) {
   //grouped in loci
   for (int g=0;g<g_data.Count();g++) {
     GenomicSeqData* gdata=g_data[g];
     int gfs_i=0;
     for (int l=0;l<gdata->loci.Count();l++) {
       GffLocus& loc=*(gdata->loci[l]);
       //check all non-replaced transcripts in this locus:
       int numvalid=0;
       int idxfirstvalid=-1;
       for (int i=0;i<loc.rnas.Count();i++) {
         GffObj& t=*(loc.rnas[i]);
         if (f_out) {
          while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) {
             GffObj& gfst=*(gdata->gfs[gfs_i]);
             if ((gfst.udata&4)==0) { //never printed
               gfst.udata|=4;
               if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
               if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
                gfst.addExon(gfst.start,gfst.end);
               gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
               }
             ++gfs_i;
          }
         }
         GTData* tdata=(GTData*)(t.uptr);
         if (tdata->replaced_by!=NULL) {
            if (f_repl && (t.udata & 8)==0) {
               //t.udata|=8;
               fprintf(f_repl, "%s", t.getID());
               GTData* rby=tdata;
               while (rby->replaced_by!=NULL) {
                  fprintf(f_repl," => %s", rby->replaced_by->getID());
                  rby->rna->udata|=8;
                  rby=(GTData*)(rby->replaced_by->uptr);
                  }
               fprintf(f_repl, "\n");
               }
            continue;
            }
         if (process_transcript(gfasta, t)) {
             t.udata|=4; //tag it as valid
             numvalid++;
             if (idxfirstvalid<0) idxfirstvalid=i;
             }
         }
       if (f_out && numvalid>0) {
         GStr locname("RLOC_");
         locname.appendfmt("%08d",loc.locus_num);
         if (!fmtGTF) {
           if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
           fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
                    loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
                     locname.chars(), locname.chars());
           //const char* loc_gname=loc.getGeneName();
           if (loc.gene_names.Count()>0) { //print all gene names associated to this locus
              fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars());
              for (int i=1;i<loc.gene_names.Count();i++) {
                fprintf(f_out, ",%s",loc.gene_names[i]->name.chars());
                }
              }
           if (loc.gene_ids.Count()>0) { //print all GeneIDs names associated to this locus
              fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars());
              for (int i=1;i<loc.gene_ids.Count();i++) {
                fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars());
                }
              }
           fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID());
           for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) {
              fprintf(f_out, ",%s",loc.rnas[i]->getID());
              }
           fprintf(f_out, "\n");
           }
         //now print all valid, non-replaced transcripts in this locus:
         for (int i=0;i<loc.rnas.Count();i++) {
           GffObj& t=*(loc.rnas[i]);
           GTData* tdata=(GTData*)(t.uptr);
           if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
           t.addAttr("locus", locname.chars());
           out_counter++;
           if (fmtGTF) t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
               else {
                if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
                //print the parent first, if any
                if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
                    GTData* pdata=(GTData*)(t.parent->uptr);
                    if (pdata && pdata->geneinfo!=NULL) 
                         pdata->geneinfo->finalize();
                    t.parent->addAttr("locus", locname.chars());
                    t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
                    t.parent->udata|=4;
                    }
                t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
                }
            }
          } //have valid transcripts to print
       }//for each locus
     //print the rest of the isolated pseudo/gene/region features not printed yet
     if (f_out) {
      while (gfs_i<gdata->gfs.Count()) {
         GffObj& gfst=*(gdata->gfs[gfs_i]);
         if ((gfst.udata&4)==0) { //never printed
           gfst.udata|=4;
           if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
           if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
             gfst.addExon(gfst.start,gfst.end);
           gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
           }
         ++gfs_i;
      }
     }
    } //for each genomic sequence
   }
  else {
   //not grouped into loci, print the rnas with their parents, if any
   int numvalid=0;
   for (int g=0;g<g_data.Count();g++) {
     GenomicSeqData* gdata=g_data[g];
     int gfs_i=0;
     for (int m=0;m<gdata->rnas.Count();m++) {
        GffObj& t=*(gdata->rnas[m]);
        if (f_out) {
         while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) {
            GffObj& gfst=*(gdata->gfs[gfs_i]);
            if ((gfst.udata&4)==0) { //never printed
              gfst.udata|=4;
              if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
              if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
               gfst.addExon(gfst.start,gfst.end);
              gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
              }
            ++gfs_i;
         }
        }
        GTData* tdata=(GTData*)(t.uptr);
        if (tdata->replaced_by!=NULL) continue;
        if (process_transcript(gfasta, t)) {
           t.udata|=4; //tag it as valid
           numvalid++;
           if (f_out) {
             if (tdata->geneinfo) tdata->geneinfo->finalize();
             out_counter++;
             if (fmtGTF) t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
               else {
                if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
                //print the parent first, if any
                if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
                    GTData* pdata=(GTData*)(t.parent->uptr);
                    if (pdata && pdata->geneinfo!=NULL) 
                         pdata->geneinfo->finalize();
                    t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
                    t.parent->udata|=4;
                    }
                t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
                }
             }//GFF/GTF output requested
           } //valid transcript
        } //for each rna
     //print the rest of the isolated pseudo/gene/region features not printed yet
     if (f_out) {
      while (gfs_i<gdata->gfs.Count()) {
         GffObj& gfst=*(gdata->gfs[gfs_i]);
         if ((gfst.udata&4)==0) { //never printed
           gfst.udata|=4;
           if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
           if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
            gfst.addExon(gfst.start,gfst.end);
           gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
           }
         ++gfs_i;
      }
     }
    } //for each genomic seq
   } //not clustered
 if (f_repl && f_repl!=stdout) fclose(f_repl);
 seqinfo.Clear();
 //if (faseq!=NULL) delete faseq;
 //if (gcdb!=NULL) delete gcdb;
 GFREE(rfltGSeq);
 FRCLOSE(f_in);
 FWCLOSE(f_out);
 FWCLOSE(f_w);
 FWCLOSE(f_x);
 FWCLOSE(f_y);
 }