Ejemplo n.º 1
0
 /* Sets (or replaces) the CIGAR of this record from its SAM text form.
    - cigar: text CIGAR such as "10M2I5M"; NULL or "*" means "no CIGAR".
    Any record data stored after the existing CIGAR is preserved and moved
    behind the newly written CIGAR. Also recomputes b->core.bin and finishes
    with setupCoordinates().
    Reports invalid CIGAR text through GError().
    Fix vs. previous version: '=' is now counted as an operation by the
    validation pass (the parsing pass already accepted it), and the <cctype>
    functions are only given unsigned char values. */
 void GBamRecord::set_cigar(const char* cigar) {
   //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call
   int doff=b->core.l_qname;
   uint8_t* after_cigar=NULL;
   int after_cigar_len=0;
   uint8_t* prev_bdata=NULL;
   if (b->data_len>doff) {
      //cigar string already allocated, replace it
      int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
      after_cigar=b->data+d;
      after_cigar_len=b->data_len-d;
      }
   const char *s;
   char *t;
   int i, op;
   long x;
   b->core.n_cigar = 0;
   if (cigar != NULL && strcmp(cigar, "*") != 0) {
        //first pass: count the operations and reject anything that is
        //neither an operation character nor a digit
        for (s = cigar; *s; ++s) {
            const unsigned char c=(unsigned char)*s;
            if (isalpha(c) || c=='=') b->core.n_cigar++;
            else if (!isdigit(c)) {
                 GError("Error: invalid CIGAR character (%s)\n",cigar);
                 }
            }
        if (after_cigar_len>0) { //replace/insert into existing full data
             //NOTE(review): dupalloc_bdata() presumably installs a new buffer and
             //returns the old one, which after_cigar still points into -- confirm
             prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
             memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
             free(prev_bdata);
             }
           else {
             realloc_bdata(b, doff + b->core.n_cigar * 4);
             }
        //second pass: parse each <length><op> pair and store it packed
        for (i = 0, s = cigar; i != b->core.n_cigar; ++i) {
            x = strtol(s, &t, 10);
            op = toupper((unsigned char)*t);
            if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
            else if (op == 'I') op = BAM_CINS;
            else if (op == 'D') op = BAM_CDEL;
            else if (op == 'N') op = BAM_CREF_SKIP; //has_Introns=true;
            else if (op == 'S') op = BAM_CSOFT_CLIP; //soft_Clipped=true;
            else if (op == 'H') op = BAM_CHARD_CLIP; //hard_Clipped=true;
            else if (op == 'P') op = BAM_CPAD;
            else GError("Error: invalid CIGAR operation (%s)\n",cigar);
            s = t + 1;
            bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
        }
        //trailing digits without an operation character
        if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar);
        b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
    } else {//no CIGAR string given
        if (!(b->core.flag&BAM_FUNMAP)) {
            GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
            b->core.flag |= BAM_FUNMAP;
        }
        b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
    }
   setupCoordinates();
   } //set_cigar()
Ejemplo n.º 2
0
// Writes the index records to the file named finame and stores a copy of that
// name in fai_name. Returns the value returned by storeIndex(FILE*).
// Calls GError() when there are no records or the file cannot be created.
int GFastaIndex::storeIndex(const char* finame) { //write the hash to a file
    if (records.Count()==0)
       GError("Error at GFastaIndex:storeIndex(): no records found!\n");
    FILE* fai=fopen(finame, "w");
    if (fai==NULL) GError("Error creating fasta index file: %s\n",finame);
    // NOTE(review): fai is not closed here; presumably storeIndex(FILE*) closes it -- confirm
    int rcount=storeIndex(fai);
    // When the caller passed fai_name itself, freeing it before duplicating
    // would read freed memory; the stored name is already correct in that case.
    if (finame!=fai_name) {
       GFREE(fai_name);
       fai_name=Gstrdup(finame);
       }
    return rcount;
    }
Ejemplo n.º 3
0
// Turns the name template `templ` (modified in place) into a unique temporary
// name; on non-Windows builds mkdtemp() also creates the directory.
// Calls GError() on failure.
// NOTE(review): the __WIN32__ branch only generates a name via _mktemp_s and
// does not create a directory -- confirm whether callers create it themselves.
void Gmktempdir(char* templ) {
#ifdef __WIN32__
  // _mktemp_s takes the buffer size INCLUDING the terminating NUL character
  size_t bufsize=strlen(templ)+1;
  if (_mktemp_s(templ, bufsize)!=0)
	  GError("Error creating temp dir %s!\n", templ);
#else
  char* cdir=mkdtemp(templ);
  if (cdir==NULL)
	  GError("Error creating temp dir %s!(%s)\n", templ, strerror(errno));
#endif
}
Ejemplo n.º 4
0
// Tells whether the single-exon transcript ti lies within one exon of tj
// without crossing any intron-exon boundary of tj.
//  - fuzzSpan=false: ti must be strictly contained in the exon
//  - fuzzSpan=true : overlapping the exon by at least 80% of ti.len() is enough
// Calls GError() if ti has more than one exon.
bool unsplContained(GffObj& ti, GffObj&  tj, bool fuzzSpan) {
  const int lastTiExon=ti.exons.Count()-1;
  const int lastTjExon=tj.exons.Count()-1;
  if (lastTiExon>0) GError("Error: bad unsplContained() call, 1st param must be single-exon transcript!\n");
  int minovl = (int)(0.8 * ti.len()); //minimum overlap for fuzzSpan
  for (int k=0;k<=lastTjExon;k++) {
     //ti must not reach into the intron before this exon ...
     if (k>0 && ti.start<tj.exons[k]->start)
        return false;
     //... nor into the intron following it
     if (k<lastTjExon && ti.end>tj.exons[k]->end)
        return false;
     bool accepted;
     if (fuzzSpan) {
        accepted=(ti.exons[0]->overlapLen(tj.exons[k])>=minovl);
        }
      else { //strict containment
        accepted=(ti.end<=tj.exons[k]->end && ti.start>=tj.exons[k]->start);
        }
     if (accepted) return true;
     }
  return false;
}
Ejemplo n.º 5
0
void openfw(FILE* &f, GArgs& args, char opt) {
  GStr s=args.getOpt(opt);
  if (!s.is_empty()) {
      if (s=='-')
       f=stdout;
      else {
       f=fopen(s,"w");
       if (f==NULL) GError("Error creating file: %s\n", s.chars());
       }
     }
}
Ejemplo n.º 6
0
int GFastaIndex::loadIndex(const char* finame) { //load record info from existing fasta index
    if (finame==NULL) finame=fai_name;
    if (finame!=fai_name) {
      fai_name=Gstrdup(finame);
      }
    if (fai_name==NULL) GError("Error: GFastaIndex::loadIndex() called with no file name!\n");
    records.Clear();
    haveFai=false;
    FILE* fi=fopen(fai_name,"rb");
    if (fi==NULL) {
       GMessage("Warning: cannot open fasta index file: %s!\n",fai_name);
       return 0;
       }
    GLineReader fl(fi);
    char* s=NULL;
    while ((s=fl.nextLine())!=NULL) {
      if (*s=='#') continue;
      char* p=strchrs(s,"\t ");
      if (p==NULL) GError(ERR_FAIDXLINE,s);
      *p=0; //s now holds the genomic sequence name
      p++;
      uint len=0;
      int line_len=0, line_blen=0;
      #ifdef __WIN32__
         long offset=-1;
         sscanf(p, "%d%ld%d%d", &len, &offset, &line_len, &line_blen);
      #else
         long long offset=-1;
         sscanf(p, "%d%lld%d%d", &len, &offset, &line_len, &line_blen);
      #endif
      if (len==0 || line_len==0 || line_blen==0 || line_blen<line_len)
          GError(ERR_FAIDXLINE,p);
      addRecord(s,len,offset,line_len, line_blen);
      }
    fclose(fi);
    haveFai=(records.Count()>0);
    return records.Count();
    }
Ejemplo n.º 7
0
// Pack n bases into a byte (n can be 1..4), 2 bits per base, using the nt2bit
// lookup table. nt is advanced past every base consumed; packing stops early
// at the end of the string. In GDEBUG builds GError() is called when fewer
// than n bases were available.
byte gdna2bit(char* &nt, int n) {
byte out = 0;
while (n && *nt) {
    n--;
    out <<= 2;
    // index through unsigned char so that bytes >= 0x80 cannot yield a
    // negative index where char is signed
    // NOTE(review): assumes nt2bit has at least 256 entries -- confirm
    out += nt2bit[(unsigned char)*nt];
    nt++;
    }
#ifdef GDEBUG
if (n) {
     GError("Error: attempt to read 6-mer beyond the end of the string!\n");
     }
#endif
return out;
}
Ejemplo n.º 8
0
 /* Stores the query sequence in the record, packed as two 4-bit codes per
    byte (first base in the high nibble), right after the CIGAR.
    - qseq: sequence text; NULL is ignored, "*" sets the sequence length to 0
    - slen: length of qseq, or a negative value to use strlen(qseq)
    Room for slen quality bytes is reserved after the packed sequence but is
    not written here.
    Calls GError() when the CIGAR implies a different query length. */
 void GBamRecord::add_sequence(const char* qseq, int slen) {
   //must be called AFTER set_cigar (cannot replace existing sequence for now)
   if (qseq==NULL) return; //should we ever care about this?
   if (slen<0) slen=strlen(qseq);
   int doff = b->core.l_qname + b->core.n_cigar * 4;
   if (strcmp(qseq, "*")!=0) {
       b->core.l_qseq=slen;
       if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b)))
           GError("Error: CIGAR and sequence length are inconsistent!(%s)\n",
                  qseq);
       uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff;
       //also allocated quals memory
       memset(p, 0, (b->core.l_qseq+1)/2);
       for (int i = 0; i < b->core.l_qseq; ++i) {
           //index through unsigned char: a byte >= 0x80 must not become a
           //negative table index where char is signed
           p[i/2] |= bam_nt16_table[(unsigned char)qseq[i]] << 4*(1-i%2);
           }
       } else b->core.l_qseq = 0;
   }
Ejemplo n.º 9
0
bool AceParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) {

    bool forgetCtg = false;
    if (ctgidx>=contigs.Count())
        GError("LayoutParser: invalid contig index '%d'\n", ctgidx);
    LytCtgData* ctgdata=contigs[ctgidx];
    if (re_pos && currentContig!=NULL) { //free previously loaded contig data
        currentContig->seqs.Clear();       // unless it was a parse() call
        seqinfo.Clear();
    }
    currentContig=ctgdata;
    int ctg_numSeqs=ctgdata->numseqs;

    if (re_pos) {
        seek(ctgdata->fpos); //position right where the contig definition starts
        char *r = linebuf->getLine(f,f_pos);
        if (r==NULL) return false;
    }

    if (seqfn!=NULL) { //process the contig sequence!
        char* ctgseq=readSeq();
        forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, ctgseq);
        GFREE(ctgseq); //obviously the caller should have made a copy
    }
    //now look for all the component sequences
    if (fskipTo("AF ")<0) {
        GMessage("AceParser: error finding sequence offsets (AF)"
                 " for contig '%s' (%d)\n", ctgdata->name, ctgdata->len);
        return false;
    }
    int numseqs=0;
    while (startsWith(linebuf->chars(), "AF ",3)) {
        if (addSeq(linebuf->chars(), ctgdata)==NULL) {
            GMessage("AceParser: error parsing AF entry:\n%s\n",linebuf->chars());
            return false;
        }
        numseqs++;
        //read next line:
        linebuf->getLine(f,f_pos);
    }
    if (numseqs!=ctg_numSeqs) {
        GMessage("Invalid number of AF entries found (%d) for contig '%s' "
                 "(length %d, numseqs %d)\n", numseqs,
                 ctgdata->name, ctgdata->len, ctg_numSeqs);
        return false;
    }
    //now read each sequence entry
    off_t seqpos=fskipTo("RD ");
    numseqs=0; //count again, now the RD entries
    if (seqpos<0) {
        GMessage("AceParser: error locating first RD entry for contig '%s'\n",
                 ctgdata->name);
        return false;
    }
    //int numseqs=0;
    //reading the actual component sequence details
    while (startsWith(linebuf->chars(), "RD ",3)) {
        char* s=linebuf->chars()+3;
        char* p=strchrs(s, " \t");
        LytSeqInfo* seq;
        if (p==NULL) {
            GMessage("AceParser: Error parsing RD header line:\n%s\n", linebuf->chars());
            return false;
        }
        *p='\0';
        if ((seq=seqinfo.Find(s))==NULL) {
            GMessage("AceParser: unknown RD encountered: '%s'\n", s);
            return false;
        }
        p++; //now p is in linebuf after the RD name
        seq->fpos=seqpos;
        int len;
        if (sscanf(p, "%d", &len)!=1) {
            GMessage("AceParser: cannot parse RD length for '%s'\n", s);
            return false;
        }
        seq->setLength(len);
        //read the sequence data here if a callback fn was given:
        char* sseq=NULL;
        if (seqfn!=NULL)
            sseq=readSeq(seq); //read full sequence here
        if (fskipTo("QA ")<0) {
            GMessage("AceParser: Error finding QA entry for read %s! (fpos=%llu)\n", seq->name, (unsigned long long)f_pos);
            return false;
        }
        //parse QA entry:
        int tmpa, tmpb;
        if (sscanf(linebuf->chars()+3, "%d %d %d %d", &tmpa, &tmpb, &seq->left,&seq->right)!=4 ||
                seq->left<=0 || seq->right<=0) {
            GMessage("AceParser: Error parsing QA entry.\n");
            return false;
        }
        /*
        if (fskipTo("DS")<0) {
             GMessage("AceParser: Error closing RD entry ('DS' not found).\n");
             return false;
             }
             */
        seqpos=getFilePos()+1;
        bool forgetSeq=false;
        if (seqfn!=NULL) {
            forgetSeq=(*seqfn)(numContigs, ctgdata, seq, sseq);
            GFREE(sseq);
        }
        if (forgetSeq) { //parsing the whole stream -- aceconv)
            ctg_numSeqs--;
            seqinfo.Remove(seq->name);
            ctgdata->seqs.RemovePtr(seq);
        }
        numseqs++;
        if (numseqs<ctgdata->numseqs)
            seqpos=fskipTo("RD ", "CO "); //more sequences left to read
    }
    if (numseqs!=ctgdata->numseqs) {
        GMessage("Error: Invalid number of RD entries found (%d) for contig '%s' "
                 "(length %d, numseqs %d)\n", numseqs,
                 ctgdata->name, ctgdata->len, ctg_numSeqs);
        return false;
    }
    if (forgetCtg) {
        ctgIDs.Remove(ctgdata->name);
        ctgdata->seqs.Clear();
        seqinfo.Clear();
        contigs.RemovePtr(ctgdata);
    }
    return true;
}
Ejemplo n.º 10
0
// Program entry point.
// Parses the command line into the global option flags, loads every input
// GFF/GTF file (or "-" when no file argument is given) through GffLoader into
// g_data, then prints the records that pass process_transcript() to f_out,
// either grouped into loci (doCluster) or per genomic sequence.
// Prints USAGE and exits with status 1 for -h/--help and for invalid
// combinations of -K/-Q with -M/--merge/--cluster-only.
int main(int argc, char * const argv[]) {
 GArgs args(argc, argv, 
   "debug;merge;cluster-only;help;force-exons;no-pseudo;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
 args.printError(USAGE, true);
 if (args.getOpt('h') || args.getOpt("help")) {
    GMessage("%s",USAGE);
    exit(1);
    }
 // --- boolean switches ---
 debugMode=(args.getOpt("debug")!=NULL);
 decodeChars=(args.getOpt('D')!=NULL);
 forceExons=(args.getOpt("force-exons")!=NULL);
 NoPseudo=(args.getOpt("no-pseudo")!=NULL);
 mRNAOnly=(args.getOpt('O')==NULL);
 //sortByLoc=(args.getOpt('S')!=NULL);
 addDescr=(args.getOpt('A')!=NULL);
 verbose=(args.getOpt('v')!=NULL);
 wCDSonly=(args.getOpt('C')!=NULL);
 validCDSonly=(args.getOpt('V')!=NULL);
 altPhases=(args.getOpt('H')!=NULL);
 fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
 bothStrands=(args.getOpt('B')!=NULL);
 fullCDSonly=(args.getOpt('J')!=NULL);
 spliceCheck=(args.getOpt('N')!=NULL);
 bool matchAllIntrons=(args.getOpt('K')==NULL);
 bool fuzzSpan=(args.getOpt('Q')!=NULL);
 // -K / -Q are only accepted together with -M/--merge
 if (args.getOpt('M') || args.getOpt("merge")) {
    doCluster=true;
    doCollapseRedundant=true;
    }
   else {
    if (!matchAllIntrons || fuzzSpan) {
      GMessage("%s",USAGE);
      GMessage("Error: -K or -Q options require -M/--merge option!\n");
      exit(1);
      }
    }
 if (args.getOpt("cluster-only")) {
    doCluster=true;
    doCollapseRedundant=false;
    if (!matchAllIntrons || fuzzSpan) {
      GMessage("%s",USAGE);
      GMessage("Error: -K or -Q options have no effect with --cluster-only.\n");
      exit(1);
      }
    }
 if (fullCDSonly) validCDSonly=true;
 if (verbose) { 
     fprintf(stderr, "Command line was:\n");
     args.printCmdLine(stderr);
     }

 // --- attribute handling: -F, -G, --no-pseudo and -L all affect fullattr/noExonAttr ---
 fullattr=(args.getOpt('F')!=NULL);
 if (args.getOpt('G')==NULL) 
    noExonAttr=!fullattr;
   else {
     noExonAttr=true;
     fullattr=true;
     }
 if (NoPseudo && !fullattr) {
	 noExonAttr=true;
	 fullattr=true;
 }
 ensembl_convert=(args.getOpt('L')!=NULL);
 if (ensembl_convert) {
    fullattr=true;
    noExonAttr=false;
    //sortByLoc=true;
    }
    
 mergeCloseExons=(args.getOpt('Z')!=NULL);
 multiExon=(args.getOpt('U')!=NULL);
 writeExonSegs=(args.getOpt('W')!=NULL);
 tracklabel=args.getOpt('t');
 GFastaDb gfasta(args.getOpt('g'));
 //if (gfasta.fastaPath!=NULL)
 //    sortByLoc=true; //enforce sorting by chromosome/contig
 GStr s=args.getOpt('i');
 if (!s.is_empty()) maxintron=s.asInt();
 
 // -d : file receiving the "replaced by" chains ("-" means stdout)
 FILE* f_repl=NULL;
 s=args.getOpt('d');
 if (!s.is_empty()) {
   if (s=="-") f_repl=stdout;
     else {
       f_repl=fopen(s.chars(), "w");
       if (f_repl==NULL) GError("Error creating file %s\n", s.chars());
       }
   }
 
 // -r : range filter; the value is parsed as an optional strand character,
 // an optional sequence name followed by ':', and a start/end pair separated
 // by ".." or '-'
 rfltWithin=(args.getOpt('R')!=NULL);
 s=args.getOpt('r');
 if (!s.is_empty()) {
   s.trim();
   if (s[0]=='+' || s[0]=='-') {
     rfltStrand=s[0];
     s.cut(0,1);
     }
   int isep=s.index(':');
   if (isep>0) { //gseq name given
      if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
        isep--;
        rfltStrand=s[isep];
        s.cut(isep,1);
        }
      if (isep>0) 
          rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
      s.cut(0,isep+1);
      }
   GStr gsend;
   // NOTE(review): if s is empty at this point the index below is -1 -- confirm GStr tolerates that
   char slast=s[s.length()-1];
   if (rfltStrand==0 && (slast=='+' || slast=='-')) {
      s.chomp(slast);
      rfltStrand=slast;
      }
   if (s.index("..")>=0) gsend=s.split("..");
                    else gsend=s.split('-');
   if (!s.is_empty()) rfltStart=(uint)s.asInt();
   if (!gsend.is_empty()) {
      rfltEnd=(uint)gsend.asInt();
      if (rfltEnd==0) rfltEnd=MAX_UINT;
      }
   } //gseq/range filtering
 else {
   if (rfltWithin)
     GError("Error: option -R requires -r!\n");
   //if (rfltWholeTranscript)
   //  GError("Error: option -P requires -r!\n");
   }
 // -m : reference table file, -s : sequence info file
 s=args.getOpt('m');
 if (!s.is_empty()) {
   FILE* ft=fopen(s,"r");
   if (ft==NULL) GError("Error opening reference table: %s\n",s.chars());
   loadRefTable(ft, reftbl);
   fclose(ft);
   }
 s=args.getOpt('s');
 if (!s.is_empty()) {
   FILE* fsize=fopen(s,"r");
   if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
   loadSeqInfo(fsize, seqinfo);
   fclose(fsize);
   }

 // --- output files ---
 openfw(f_out, args, 'o');
 //if (f_out==NULL) f_out=stdout;
 if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
  GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");

 openfw(f_w, args, 'w');
 openfw(f_x, args, 'x');
 openfw(f_y, args, 'y');
 if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
 //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
 
 // --- load every input file; with no file argument a single pass is made with "-" ---
 int numfiles = args.startNonOpt();
 //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
 int out_counter=0; //number of records printed
 while (true) {
   GStr infile;
   if (numfiles) {
          infile=args.nextNonOpt();
          if (infile.is_empty()) break;
          if (infile=="-") { f_in=stdin; infile="stdin"; }
               else 
                 if ((f_in=fopen(infile, "r"))==NULL)
                    GError("Error: cannot open input file %s!\n",infile.chars());
          }
        else 
          infile="-";
   GffLoader gffloader(infile.chars());
   gffloader.transcriptsOnly=mRNAOnly;
   gffloader.fullAttributes=fullattr;
   gffloader.noExonAttrs=noExonAttr;
   gffloader.mergeCloseExons=mergeCloseExons;
   gffloader.showWarnings=(args.getOpt('E')!=NULL);
   gffloader.noPseudo=NoPseudo;
   gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant, 
                             matchAllIntrons, fuzzSpan, forceExons);
   if (doCluster) 
     collectLocusData(g_data);
   if (numfiles==0) break;
   }
   
 // --- output ---
 // udata bit 4 marks a record as valid/already printed; bit 8 marks a
 // transcript as already reached while writing a replacement chain
 GStr loctrack("gffcl");
 if (tracklabel) loctrack=tracklabel;
 g_data.setSorted(&gseqCmpName);
 GffPrintMode exonPrinting;
 if (fmtGTF) {
	 exonPrinting = pgtfAny;
 } else {
	 exonPrinting = forceExons ? pgffBoth : pgffAny;
 }
 bool firstGff3Print=!fmtGTF; //the GFF3 header is printed once, before the first GFF3 record
 if (doCluster) {
   //grouped in loci
   for (int g=0;g<g_data.Count();g++) {
     GenomicSeqData* gdata=g_data[g];
     int gfs_i=0; //next entry of gdata->gfs still to be considered for printing
     for (int l=0;l<gdata->loci.Count();l++) {
       GffLocus& loc=*(gdata->loci[l]);
       //check all non-replaced transcripts in this locus:
       int numvalid=0;
       int idxfirstvalid=-1;
       for (int i=0;i<loc.rnas.Count();i++) {
         GffObj& t=*(loc.rnas[i]);
         if (f_out) {
          //first print the gfs entries that start at or before this transcript
          while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) {
             GffObj& gfst=*(gdata->gfs[gfs_i]);
             if ((gfst.udata&4)==0) { //never printed
               gfst.udata|=4;
               if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
               if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
                gfst.addExon(gfst.start,gfst.end);
               gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
               }
             ++gfs_i;
          }
         }
         GTData* tdata=(GTData*)(t.uptr);
         if (tdata->replaced_by!=NULL) {
            //replaced transcript: only log its replacement chain, if requested
            if (f_repl && (t.udata & 8)==0) {
               //t.udata|=8;
               fprintf(f_repl, "%s", t.getID());
               GTData* rby=tdata;
               while (rby->replaced_by!=NULL) {
                  fprintf(f_repl," => %s", rby->replaced_by->getID());
                  rby->rna->udata|=8;
                  rby=(GTData*)(rby->replaced_by->uptr);
                  }
               fprintf(f_repl, "\n");
               }
            continue;
            }
         if (process_transcript(gfasta, t)) {
             t.udata|=4; //tag it as valid
             numvalid++;
             if (idxfirstvalid<0) idxfirstvalid=i;
             }
         }
       if (f_out && numvalid>0) {
         GStr locname("RLOC_");
         locname.appendfmt("%08d",loc.locus_num);
         if (!fmtGTF) {
           //GFF3 output: write a "locus" line listing gene names, gene IDs and transcripts
           if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
           fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
                    loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
                     locname.chars(), locname.chars());
           //const char* loc_gname=loc.getGeneName();
           if (loc.gene_names.Count()>0) { //print all gene names associated to this locus
              fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars());
              for (int i=1;i<loc.gene_names.Count();i++) {
                fprintf(f_out, ",%s",loc.gene_names[i]->name.chars());
                }
              }
           if (loc.gene_ids.Count()>0) { //print all GeneIDs names associated to this locus
              fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars());
              for (int i=1;i<loc.gene_ids.Count();i++) {
                fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars());
                }
              }
           fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID());
           // NOTE(review): this lists every rna after the first valid one, without checking its valid/replaced state
           for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) {
              fprintf(f_out, ",%s",loc.rnas[i]->getID());
              }
           fprintf(f_out, "\n");
           }
         //now print all valid, non-replaced transcripts in this locus:
         for (int i=0;i<loc.rnas.Count();i++) {
           GffObj& t=*(loc.rnas[i]);
           GTData* tdata=(GTData*)(t.uptr);
           if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
           t.addAttr("locus", locname.chars());
           out_counter++;
           if (fmtGTF) t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
               else {
                if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
                //print the parent first, if any
                if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
                    GTData* pdata=(GTData*)(t.parent->uptr);
                    if (pdata && pdata->geneinfo!=NULL) 
                         pdata->geneinfo->finalize();
                    t.parent->addAttr("locus", locname.chars());
                    t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
                    t.parent->udata|=4;
                    }
                t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
                }
            }
          } //have valid transcripts to print
       }//for each locus
     //print the rest of the isolated pseudo/gene/region features not printed yet
     if (f_out) {
      while (gfs_i<gdata->gfs.Count()) {
         GffObj& gfst=*(gdata->gfs[gfs_i]);
         if ((gfst.udata&4)==0) { //never printed
           gfst.udata|=4;
           if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
           if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
             gfst.addExon(gfst.start,gfst.end);
           gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
           }
         ++gfs_i;
      }
     }
    } //for each genomic sequence
   }
  else {
   //not grouped into loci, print the rnas with their parents, if any
   int numvalid=0;
   for (int g=0;g<g_data.Count();g++) {
     GenomicSeqData* gdata=g_data[g];
     int gfs_i=0; //next entry of gdata->gfs still to be considered for printing
     for (int m=0;m<gdata->rnas.Count();m++) {
        GffObj& t=*(gdata->rnas[m]);
        if (f_out) {
         //first print the gfs entries that start at or before this transcript
         while (gfs_i<gdata->gfs.Count() && gdata->gfs[gfs_i]->start<=t.start) {
            GffObj& gfst=*(gdata->gfs[gfs_i]);
            if ((gfst.udata&4)==0) { //never printed
              gfst.udata|=4;
              if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
              if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
               gfst.addExon(gfst.start,gfst.end);
              gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
              }
            ++gfs_i;
         }
        }
        GTData* tdata=(GTData*)(t.uptr);
        if (tdata->replaced_by!=NULL) continue;
        if (process_transcript(gfasta, t)) {
           t.udata|=4; //tag it as valid
           numvalid++;
           if (f_out) {
             if (tdata->geneinfo) tdata->geneinfo->finalize();
             out_counter++;
             if (fmtGTF) t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
               else {
                if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
                //print the parent first, if any
                if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
                    GTData* pdata=(GTData*)(t.parent->uptr);
                    if (pdata && pdata->geneinfo!=NULL) 
                         pdata->geneinfo->finalize();
                    t.parent->printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
                    t.parent->udata|=4;
                    }
                t.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
                }
             }//GFF/GTF output requested
           } //valid transcript
        } //for each rna
     //print the rest of the isolated pseudo/gene/region features not printed yet
     if (f_out) {
      while (gfs_i<gdata->gfs.Count()) {
         GffObj& gfst=*(gdata->gfs[gfs_i]);
         if ((gfst.udata&4)==0) { //never printed
           gfst.udata|=4;
           if (firstGff3Print) { printGff3Header(f_out, args);firstGff3Print=false; }
           if (gfst.exons.Count()==0 && gfst.children.Count()==0 && forceExons)
            gfst.addExon(gfst.start,gfst.end);
           gfst.printGxf(f_out, exonPrinting, tracklabel, NULL, decodeChars);
           }
         ++gfs_i;
      }
     }
    } //for each genomic seq
   } //not clustered
 // --- cleanup ---
 if (f_repl && f_repl!=stdout) fclose(f_repl);
 seqinfo.Clear();
 //if (faseq!=NULL) delete faseq;
 //if (gcdb!=NULL) delete gcdb;
 GFREE(rfltGSeq);
 FRCLOSE(f_in);
 FWCLOSE(f_out);
 FWCLOSE(f_w);
 FWCLOSE(f_x);
 FWCLOSE(f_y);
 }
Ejemplo n.º 11
0
bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
 //returns true if the transcript passed the filter
 char* gname=gffrec.getGeneName();
 if (gname==NULL) gname=gffrec.getGeneID();
 GStr defline(gffrec.getID());
 if (f_out && !fmtGTF) {
     const char* tname=NULL;
     if ((tname=gffrec.getAttr("transcript_name"))!=NULL) {
        gffrec.addAttr("Name", tname);
        gffrec.removeAttr("transcript_name");
        }
     }
 if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
      const char* biotype=gffrec.getAttr("gene_biotype");
      if (biotype) {
         gffrec.addAttr("type", biotype);
         gffrec.removeAttr("gene_biotype");
         }
       else { //old Ensembl files lacking gene_biotype
         gffrec.addAttr("type", gffrec.getTrackName());
         }

      //bool is_gene=false;
      bool is_pseudo=false;
      if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS())
                gffrec.setFeatureName("mRNA");
       else {
          if (strcmp(biotype, "processed_transcript")==0) 
              gffrec.setFeatureName("proc_RNA");
            else {
              //is_gene=endsWith(biotype, "gene");
              is_pseudo=strifind(biotype, "pseudo");
              if (is_pseudo) {
                   gffrec.setFeatureName("pseudo_RNA");
                   }
                else if (endsWith(biotype, "RNA")) {
                   gffrec.setFeatureName(biotype);
                   } else gffrec.setFeatureName("misc_RNA");
              }
          }
      }
 if (gname && strcmp(gname, gffrec.getID())!=0) {
   int* isonum=isoCounter.Find(gname);
   if  (isonum==NULL) {
       isonum=new int(1);
       isoCounter.Add(gname,isonum);
       }
      else (*isonum)++;
   defline.appendfmt(" gene=%s", gname);
   }
  int seqlen=0;

  const char* tlabel=tracklabel;
  if (tlabel==NULL) tlabel=gffrec.getTrackName();
  //defline.appendfmt(" track:%s",tlabel);
  char* cdsnt = NULL;
  char* cdsaa = NULL;
  int aalen=0;
  for (int i=1;i<gffrec.exons.Count();i++) {
     int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
     if (ilen>4000000) 
            GMessage("Warning: very large intron (%d) for transcript %s\n",
                           ilen, gffrec.getID());
     if (ilen>maxintron) {
         return false;
         }
     }
  GList<GSeg> seglst(false,true);
  GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec);
  if (spliceCheck && gffrec.exons.Count()>1) {
    //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
    if (faseq==NULL) GError("Error: no genomic sequence available!\n");
    int glen=gffrec.end-gffrec.start+1;
    const char* gseq=faseq->subseq(gffrec.start, glen);
    bool revcompl=(gffrec.strand=='-');
    bool ssValid=true;
    for (int e=1;e<gffrec.exons.Count();e++) {
      const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
      int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
      GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
      GSpliceSite    donorSite(intron,intronlen, false, revcompl);
      //GMessage("%c intron %d-%d : %s .. %s\n",
      //           gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
      if (acceptorSite=="AG") { // GT-AG or GC-AG
         if (!donorSite.canonicalDonor()) {
            ssValid=false;break;
            }
         }
      else if (acceptorSite=="AC") { //
         if (donorSite!="AT") { ssValid=false; break; }
         }
      else { ssValid=false; break; }
      }
    //GFREE(gseq);
    if (!ssValid) {
      if (verbose)
         GMessage("Invalid splice sites found for '%s'\n",gffrec.getID());
      return false; //don't print this one!
      }
    }

  bool trprint=true;
  int stopCodonAdjust=0;
  int mCDphase=0;
  bool hasStop=false;
  if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
      mCDphase = gffrec.CDphase-'0';
  if (f_y!=NULL || f_x!=NULL || validCDSonly) {
    if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
    //if (protmap && fullCDSonly) {
    //if (protmap && (fullCDSonly ||  (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) {
    
    if (validCDSonly) { //make sure the stop codon is always included 
      //adjust_stopcodon(gffrec,3);
      stopCodonAdjust=adjust_stopcodon(gffrec,3);
      }
    int strandNum=0;
    int phaseNum=0;
  CDS_CHECK:
    cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, NULL, &seglst);
    if (cdsnt==NULL) trprint=false;
    else { //has CDS
      if (validCDSonly) {
         cdsaa=translateDNA(cdsnt, aalen, seqlen);
         char* p=strchr(cdsaa,'.');
         hasStop=false;
         if (p!=NULL) {
              if (p-cdsaa>=aalen-2) { //stop found as the last codon
                      *p='0';//remove it
                      hasStop=true;
                      if (aalen-2==p-cdsaa) {
                        //previous to last codon is the stop codon
                        //so correct the CDS stop accordingly
                        adjust_stopcodon(gffrec,-3, &seglst);
                        stopCodonAdjust=0; //clear artificial stop adjustment
                        seqlen-=3;
                        cdsnt[seqlen]=0;
                        }
                      aalen=p-cdsaa;
                      }
                   else {//stop found before the last codon
                      trprint=false;
                      }
              }//stop codon found
         if (trprint==false) { //failed CDS validity check
           //in-frame stop codon found
           if (altPhases && phaseNum<3) {
              phaseNum++;
              gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
              GFREE(cdsaa);
              goto CDS_CHECK;
              }
           if (gffrec.exons.Count()==1 && bothStrands) {
              strandNum++;
              phaseNum=0;
              if (strandNum<2) {
                 GFREE(cdsaa);
                 gffrec.strand = (gffrec.strand=='-') ? '+':'-';
                 goto CDS_CHECK; //repeat the CDS check for a different frame
                 }
              }
           if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID());
           } //has in-frame STOP
         if (fullCDSonly) {
             if (!hasStop || cdsaa[0]!='M') trprint=false;
             }
         } // CDS check requested
      } //has CDS
    } //translation or codon check/output was requested
  if (!trprint) {
    GFREE(cdsnt);
    GFREE(cdsaa);
    return false;
    }
  if (stopCodonAdjust>0 && !hasStop) {
          //restore stop codon location
          adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
          if (cdsnt!=NULL && seqlen>0) {
             seqlen-=stopCodonAdjust;
             cdsnt[seqlen]=0;
             }
          if (cdsaa!=NULL) aalen--;
          }

  if (f_y!=NULL) { //CDS translation fasta output requested
         //char* 
         if (cdsaa==NULL) { //translate now if not done before
           cdsaa=translateDNA(cdsnt, aalen, seqlen);
           }
         if (fullattr && gffrec.attrs!=NULL) {
             //append all attributes found for each transcripts
              for (int i=0;i<gffrec.attrs->Count();i++) {
                defline.append(" ");
                defline.append(gffrec.getAttrName(i));
                defline.append("=");
                defline.append(gffrec.getAttrValue(i));
                }
              }
         printFasta(f_y, defline, cdsaa, aalen);
         }
   if (f_x!=NULL) { //CDS only
         if (writeExonSegs) {
              defline.append(" loc:");
              defline.append(gffrec.getGSeqName());
              defline.appendfmt("(%c)",gffrec.strand);
              //warning: not CDS coordinates are written here, but the exon ones
              defline+=(int)gffrec.start;
              defline+=(char)'-';
              defline+=(int)gffrec.end;
              // -- here these are CDS substring coordinates on the spliced sequence:
              defline.append(" segs:");
              for (int i=0;i<seglst.Count();i++) {
                  if (i>0) defline.append(",");
                  defline+=(int)seglst[i]->start;
                  defline.append("-");
                  defline+=(int)seglst[i]->end;
                  }
              }
         if (fullattr && gffrec.attrs!=NULL) {
             //append all attributes found for each transcript
              for (int i=0;i<gffrec.attrs->Count();i++) {
                defline.append(" ");
                defline.append(gffrec.getAttrName(i));
                defline.append("=");
                defline.append(gffrec.getAttrValue(i));
                }
              }
         printFasta(f_x, defline, cdsnt, seqlen);
         }
 GFREE(cdsnt);
 GFREE(cdsaa);
 if (f_w!=NULL) { //write spliced exons
    uint cds_start=0;
    uint cds_end=0;
    seglst.Clear();
    char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
    if (exont!=NULL) {
    if (gffrec.CDstart>0) {
        defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
        }
      if (writeExonSegs) {
        defline.append(" loc:");
        defline.append(gffrec.getGSeqName());
        defline+=(char)'|';
        defline+=(int)gffrec.start;
        defline+=(char)'-';
        defline+=(int)gffrec.end;
        defline+=(char)'|';
        defline+=(char)gffrec.strand;
        defline.append(" exons:");
        for (int i=0;i<gffrec.exons.Count();i++) {
                if (i>0) defline.append(",");
                defline+=(int)gffrec.exons[i]->start;
                defline.append("-");
                defline+=(int)gffrec.exons[i]->end;
                }
        defline.append(" segs:");
        for (int i=0;i<seglst.Count();i++) {
            if (i>0) defline.append(",");
            defline+=(int)seglst[i]->start;
            defline.append("-");
            defline+=(int)seglst[i]->end;
            }
        }
      if (fullattr && gffrec.attrs!=NULL) {
       //append all attributes found for each transcripts
        for (int i=0;i<gffrec.attrs->Count();i++) {
          defline.append(" ");
          defline.append(gffrec.getAttrName(i));
          defline.append("=");
          defline.append(gffrec.getAttrValue(i));
          }
        }
      printFasta(f_w, defline, exont, seqlen);
      GFREE(exont);
      }
    } //writing f_w (spliced exons)
 return true;
}
Ejemplo n.º 12
0
/**
 * Command-line entry point: fetches a record (or a sub-range of it) from a
 * multi-FASTA file through a GFastaIndex and writes it out as FASTA.
 *
 * Options, per the "hFCq:r:o:" spec handed to GArgs:
 *   -h         print USAGE and exit(1)
 *   -F         do not look for an existing index; build one and store it under
 *              the index file name with its directory part removed
 *   -C         forwarded to copyRange() as the reverse-complement flag
 *   -q <id>    name of the record to extract; without it the program exits
 *              right after the index has been loaded/built
 *   -r <range> "start-end", "start..end" or "start:length"; a coordinate of 0
 *              is rejected, so positions start at 1
 *   -o <file>  output file (stdout is used when openfwrite leaves f_out NULL)
 *
 * The first non-option argument is the FASTA file; its index is expected at
 * "<fasta>.fai".
 */
int main(int argc, char * const argv[]) {
  GArgs args(argc, argv, "hFCq:r:o:");
  int e;
  if ((e=args.isError())>0)
    GError("%s\nInvalid argument: %s\n", USAGE, argv[e]);
  //note: this check is independent of the error check above
  if (args.getOpt('h')!=NULL) {
    GMessage("%s\n", USAGE);
    exit(1);
    }
  args.startNonOpt();
  GStr fadb(args.nextNonOpt());
  if (fadb.is_empty()) GError("%s Error: multi-fasta file expected!\n",USAGE);
  GStr fname(fadb);
  fname.append(".fai");
  bool createLocal=(args.getOpt('F')!=NULL);
  //with -F no index name is given to the constructor
  //NOTE(review): presumably the constructor tries to load idxname when it is
  //not NULL -- confirm against GFastaIndex
  const char* idxname=(createLocal)? NULL : fname.chars();
  GFastaIndex faidx(fadb.chars(), idxname);
  //also tried to load the index if exists in the current directory
  GStr fnamecwd(fname); //name in current directory (without path)
  int ip=-1;
  if ((ip=fnamecwd.rindex(CHPATHSEP))>=0) {
    fnamecwd.cut(0,ip+1);
    }
  if (!createLocal) { //look for existing indexes to load
    //the same directory as the fasta file was tried first (constructor),
    //so try the current working directory next
    if (!faidx.hasIndex() and fileExists(fnamecwd.chars())>1) {
        faidx.loadIndex(fnamecwd.chars());
       }
    if (!faidx.hasIndex()) {//could not load any index data
       //try to create it in the same directory as the fasta file
       GMessage("No fasta index found. Rebuilding..\n");
       faidx.buildIndex();
       if (faidx.getCount()==0) GError("Error: no fasta records to be indexed!\n");
       GMessage("Fasta index rebuilt.\n");
       //check if we can create a file there
       FILE* fcreate=fopen(fname.chars(), "w");
       if (fcreate==NULL)
         GMessage("Warning: cannot create fasta index %s! (permissions?)\n", fname.chars());
       else {
         fclose(fcreate);
         if (faidx.storeIndex(fname.chars())<faidx.getCount())
           GMessage("Warning: error writing the index file %s!\n",fname.chars());
         } //creating index file in the same directory as fasta file
       }//trying to create the index file
    }

  if (createLocal || !faidx.hasIndex()) {
    //simply rebuild the index in the current directory and use it:
    //remove directories in path, if any
    if (faidx.getCount()==0) {
          faidx.buildIndex();
          if (faidx.getCount()==0) GError("Error: no fasta records to be indexed!\n");
          }
    if (faidx.storeIndex(fnamecwd.chars())<faidx.getCount())
        GMessage("Warning: error writing the index file %s!\n",fnamecwd.chars());
    }

  GStr qry(args.getOpt('q'));
  if (qry.is_empty()) exit(0); //index-only run
  GFastaRec* farec=faidx.getRecord(qry.chars());
  if (farec==NULL) {
      GMessage("Error: couldn't find fasta record for '%s'!\n",qry.chars());
      exit(1);
      }
  GFaSeqGet faseq(fadb.chars(),farec->seqlen, farec->fpos, farec->line_len, farec->line_blen);
  //requested range; 0 means "not given" and is replaced by a default below
  uint qstart=0;
  uint qend=0; //farec->seqlen
  bool revCompl=(args.getOpt('C')!=NULL);
  char* s=args.getOpt('r');
  if (s!=NULL) {
     char *p=s;
     //the cast keeps isdigit() defined for bytes >= 0x80 when char is signed
     while (isdigit((unsigned char)*p)) p++;
     //the first non-digit character selects the range syntax
     if (*p=='-') {
                sscanf(s,"%u-%u",&qstart, &qend);
                if (qstart==0 || qend==0)
                      GError("Error parsing sequence range: %s\n",s);
                }
       else if (*p==':') {
                int qlen=0;
                sscanf(s,"%u:%d", &qstart, &qlen);
                //a negative length would wrap the unsigned end coordinate
                if (qstart==0 || qlen<=0)
                     GError("Error parsing sequence range: %s\n",s);
                qend=qstart+qlen-1;
                }
       else if (*p=='.') {
               sscanf(s,"%u..%u",&qstart, &qend);
               if (qstart==0 || qend==0)
                     GError("Error parsing sequence range: %s\n",s);
               }
       else {
               //no recognized separator: tell the user instead of silently
               //falling back to the whole sequence
               GMessage("Warning: unrecognized sequence range '%s' ignored\n", s);
               }
     }
  if (qstart==0) qstart=1;
  if (qend==0) qend=farec->seqlen;
  // call faseq.loadall() here if multiple ranges are to be extracted all
  // over this genomic sequence
  char* subseq=faseq.copyRange(qstart, qend, revCompl, true);
  FILE* f_out=NULL;
  openfwrite(f_out, args, 'o');
  if (f_out==NULL) f_out=stdout;
  writeFasta(f_out, qry.chars(), NULL, subseq, 70, qend-qstart+1);
  GFREE(subseq);
  //close the -o file explicitly so write errors are not left to process exit
  if (f_out!=stdout) fclose(f_out);
  return 0;
}
Ejemplo n.º 13
0
/*
 Parses one sequence line of a layout and registers the sequence with ctg.

 Expected line shape, as read by the sscanf() below:
     <name> <c> <slen> <soffs> [<clpL> [<clpR>]] [<X>:<data> ...]
 where <c> is a single character ('-' is passed to LytSeqInfo as 1, anything
 else as 0) and the trailing "<X>:" attributes are optional. Only "G:" is
 actually interpreted: a comma separated list of
     <segEnd>[c<rclip>][S|s]-<nextsegStart>[c<lclip>][S|s]
 items, each forwarded to seq->addInterSeg(). "L:", "D:", "S:" and any other
 attribute are just skipped.

 s is modified in place: the first blank/tab is overwritten with NUL so that
 s becomes the sequence name.

 Returns the new LytSeqInfo (also added to seqinfo and ctg->seqs), or NULL if
 the line cannot be parsed or the name is already present in seqinfo.
 Malformed "G:" data is reported through GError().
*/
LytSeqInfo* LayoutParser::addSeq(char* s, LytCtgData* ctg) {
 LytSeqInfo* seq;
 //s must be the line with sequence data
 char* p=strchrs(s," \t");
 if (p==NULL) return NULL;
 p++;
 char c;
 int slen, soffs, clpL, clpR;
 clpL=0;clpR=0; //the two clipping fields are optional
 if (sscanf(p,"%c %d %d %d %d", &c, &slen, &soffs, &clpL, &clpR)<3) return NULL;
 p--;
 *p='\0'; //terminate the name; from here on p == s+strlen(s)
 if ((seq=seqinfo.Find(s))!=NULL) {
   GMessage("Sequence '%s' already found for contig '%s (%d nt)'\n"
      " so it cannot be added for contig '%s (%d nt)'\n",
     s, seq->contig->name, seq->contig->len,
     ctg->name, ctg->len);
   return NULL;
   }
 seq = new LytSeqInfo(s, ctg, soffs, (c=='-') ? 1 : 0, slen, clpL, clpR);
 seqinfo.shkAdd(seq->name, seq);
 ctg->seqs.Add(seq);
 //parse optional extensions, if any
 //p sits on the NUL that ends the name, so one step forward is the first
 //character after it. (Advancing by strlen(s) instead would land at
 //s+2*strlen(s), skipping attributes or even running past the line end.)
 p++;
 char* m=NULL;
 int segEnd, segRclip,nextsegStart, nextsegLclip;
 char segSplice, nextsegSplice;
 while ((m=strchr(p,':'))!=NULL) {
  //the character right before ':' is the attribute type
  switch (*(m-1)) {
    case 'G': //segmenting info
       p=m+1;  //p to the beginning of G: data
       //NOTE(review): parseInt() is assumed to advance p past the digits it
       //consumed (the code inspects *p right after each call) -- confirm
       while (*p>='1' && *p<='9') {
         segEnd=0;
         segRclip=0;
         nextsegStart=0;
         nextsegLclip=0;
         segSplice=0;
         nextsegSplice=0;
         if (!parseInt(p,segEnd))
            GError("Error [segment] at LayoutParser for %s at: %s\n",
                 s, m-1);
         if (*p=='c') { //optional clipping at the end of this segment
            p++;
            if (!parseInt(p,segRclip))
              GError("Error [segment] at LayoutParser for %s at: %s\n",
                      s, m-1);
            }
         if (*p=='S' || *p=='s') { //optional splice marker
            segSplice=*p; p++;
            }
         if (*p!='-') //mandatory separator between the two segment ends
                GError("Error [segment] at LayoutParser for %s at: %s\n",
                      s, m-1);
            else p++;
         if (!parseInt(p,nextsegStart))
            GError("Error [segment] at LayoutParser for %s at: %s\n",
                 s, m-1);
         if (*p=='c') { //optional clipping at the start of the next segment
            p++;
            if (!parseInt(p,nextsegLclip))
              GError("Error [segment] at LayoutParser for %s at: %s\n",
                      s, m-1);
            }
         if (*p=='S' || *p=='s') {
            nextsegSplice=*p; p++;
            }
         seq->addInterSeg(segEnd,nextsegStart,segRclip,nextsegLclip,
                                         segSplice, nextsegSplice);
         //a comma means another inter-segment item follows
         if (*p==',') p++;
             else break;
         } //while inter-segment parsing
       break; // 'G:' case
    case 'L': //clone mates list
       p=m+1; //p to the beginning of L: data
       break;
    case 'D': //difference sequence
       p=m+1; //p to the beginning of D: data
       break;
    case 'S': //actual sequence
       p=m+1; //p to the beginning of S: data
       break;
    default:
       p=m+1;//next attribute
    }
  }

 return seq;
}
Ejemplo n.º 14
0
/*
 Load contig data; can be called by parse() - in which case no seek is needed
 because the file position is right after the parsed contig summary data.

 ctgidx : index into contigs; an out of range value is reported via GError()
 seqfn  : optional callback; it is invoked once for the contig itself
          (seq argument NULL) and once for every sequence line parsed.
          A true return value means "forget": the sequence is removed again
          from seqinfo and ctgdata->seqs, or - for the contig call - the
          contig is removed from ctgIDs and contigs once loading is done.
 re_pos : when true, the previously loaded contig data is cleared and the
          file is re-positioned at ctgdata->fpos (skipping the contig line)

 Returns false if the contig line cannot be re-read, a sequence line fails
 to parse, or the number of kept sequences differs from ctgdata->numseqs
 (less the forgotten ones); true otherwise.
*/
bool LayoutParser::loadContig(int ctgidx, fnLytSeq* seqfn, bool re_pos) {
    bool forgetCtg=false;
    char* r=NULL;
    //negative indexes are just as invalid as too large ones
    if (ctgidx<0 || ctgidx>=contigs.Count())
      GError("LayoutParser: invalid contig index '%d'\n", ctgidx);

    LytCtgData* ctgdata=contigs[ctgidx];
    if (re_pos && currentContig!=NULL) { //free previous contig data
                                          //unless it was a parse() call
      currentContig->seqs.Clear();
      seqinfo.Clear();
      }
    currentContig=ctgdata;
    if (re_pos) {
       seek(ctgdata->fpos); //position right where the contig definition starts
       r=linebuf->getLine(f,f_pos);//skip the first line
       if (r==NULL) return false;
       }
    if (seqfn!=NULL)
       forgetCtg=(*seqfn)(numContigs, ctgdata, NULL, NULL);
    int ctg_numSeqs=ctgdata->numseqs; //expected count, lowered for each forgotten sequence
    int numseqs=0;                    //sequences actually kept
    while ((r=linebuf->getLine(f,f_pos))!=NULL) {
       if (linebuf->length()<4) continue; //too short to be a sequence line
       if (linebuf->chars()[0]=='>') {
            linebuf->pushBack();
            break; //reached next contig
            }
       //sequence data parsing

       bool forgetSeq=false;
       LytSeqInfo* seq=NULL;
       if ((seq=addSeq(linebuf->chars(), ctgdata))==NULL) {
         GMessage("LayoutParser: error parsing sequence entry:\n%s\n",linebuf->chars());
         return false;
         }
        /*
        // Weird -- why would I MODIFY the given clipping of a sequence?
        //--
        bool ctg_clipping = (ctgdata->rpos>ctgdata->lpos);
        if (ctg_clipping) {
          if (ctgdata->lpos > seq->offs && ctgdata->lpos < seq->offs+seq->length())
             seq->left = ctgdata->lpos - seq->offs+1;
            if (ctgdata->rpos < seq->offs+seq->length() && ctgdata->rpos>seq->offs )
             seq->right = ctgdata->rpos-seq->offs+1;
          } */
        if (seqfn!=NULL)
          forgetSeq=(*seqfn)(numContigs, ctgdata, seq, NULL);
        if (forgetSeq) {
            ctg_numSeqs--;
            seqinfo.Remove(seq->name);
            ctgdata->seqs.RemovePtr(seq);
            }
          else {
            numseqs++;
            }
       } //while sequences
    //Report a count mismatch BEFORE the contig is possibly removed below, so
    //that ctgdata is never read after it left the containers.
    //NOTE(review): whether contigs/ctgIDs free the removed item is not
    //visible here -- this ordering is safe either way
    bool countsMatch=(numseqs==ctg_numSeqs);
    if (!countsMatch) {
       GMessage("Mismatching number of sequences found (%d) for contig '%s' "
         "(length %d, numseqs %d)\n", numseqs,
                ctgdata->name, ctgdata->len, ctg_numSeqs);
       }
    if (forgetCtg) {
      ctgIDs.Remove(ctgdata->name);
      contigs.RemovePtr(ctgdata);
      }
    return countsMatch;
}
Ejemplo n.º 15
0
/*
 (Re)builds the index by scanning the whole FASTA file fa_name line by line.

 For every record it passes to addRecord():
   - the name: defline text after '>' up to the first blank/control character
   - the total sequence length (sum of the sequence line lengths)
   - the file offset reported by the line reader right after the defline
   - line_len / line_blen: length of the first sequence line as reported by
     the reader, and the number of file bytes that line took
     (presumably the latter includes the line terminator -- confirm)

 All sequence lines of a record must have the same length, except that a
 shorter line may close the record; anything else raises ERR_FALINELEN.
 Records with no sequence are skipped with a warning.

 Returns the number of records indexed, or 0 if the file cannot be opened.
*/
int GFastaIndex::buildIndex() {
    //this parses the whole fasta file, so it could be slow
    if (fa_name==NULL)
       GError("Error: GFastaIndex::buildIndex() called with no fasta file!\n");
    FILE* fa=fopen(fa_name,"rb");
    if (fa==NULL) {
       //fa_name is the fasta file itself, not its index
       GMessage("Warning: cannot open fasta file: %s!\n",fa_name);
       return 0;
       }
    records.Clear();
    GLineReader fl(fa);
    char* s=NULL;
    uint seqlen=0;
    int line_len=0,line_blen=0;
    bool newSeq=false; //set to true after defline
    off_t newSeqOffset=0;
    off_t prevOffset=0; //reader offset after the previous line (a file offset, so off_t)
    char* seqname=NULL;
    int last_len=0;
    bool mustbeLastLine=false; //true if the line length decreases
    while ((s=fl.nextLine())!=NULL) {
     if (s[0]=='>') {
        //a new defline closes the record collected so far
        if (seqname!=NULL) {
         if (seqlen==0) {
            //only a warning: the record is skipped, as the message says and as
            //already happens for an empty record at the end of the file.
            //(GError appears to be fatal -- callers never continue after it)
            GMessage("Warning: empty FASTA record skipped (%s)!\n",seqname);
            }
         else { //seqlen!=0
           addRecord(seqname, seqlen,newSeqOffset, line_len, line_blen);
           }
         }
        //cut the name at the first blank/control character; compare as
        //unsigned so that bytes >= 0x80 do not end the name where char is signed
        char *p=s;
        while ((unsigned char)*p > 32) p++;
        *p=0;
        GFREE(seqname);
        seqname=Gstrdup(&s[1]);
        newSeq=true;
        newSeqOffset=fl.getfpos();
        last_len=0;
        line_len=0;
        line_blen=0;
        seqlen=0;
        mustbeLastLine=false;
        } //defline parsing
     else { //sequence line
       int llen=fl.length();
       //bytes consumed from the file by this line
       int lblen=(int)(fl.getfpos()-prevOffset);
        if (newSeq) { //first sequence line after defline
          line_len=llen;
          line_blen=lblen;
          }
        else {//next seq lines after first
          if (mustbeLastLine || llen>last_len)
             GError(ERR_FALINELEN);
          if (llen<last_len) mustbeLastLine=true;
          }
        seqlen+=llen;
        last_len=llen;
        newSeq=false;
        } //sequence line
     prevOffset=fl.getfpos();
     }//for each line of the fasta file
    //last record; seqname is NULL if the file had no defline at all
    if (seqname!=NULL && seqlen>0)
       addRecord(seqname, seqlen, newSeqOffset, line_len, line_blen);
    GFREE(seqname);
    fclose(fa);
    return records.Count();
    }
Ejemplo n.º 16
0
void bitError(int idx) {
 GError("Error bit checking (index %d)!\n", idx);
}