Exemplo n.º 1
0
int main(int argc, char* argv[]) {

    if (argc == 1 or argc > 2) {
        std::cerr << "Usage: TestGFFParse input\n";
        std::exit(1);
    }

    GffReader reader(argv[1]);
    reader.readAll(true);

    std::cerr << "had count of " << reader.gflst.Count() << "\n";
    size_t nfeat = reader.gflst.Count();
    for (size_t i=0; i < nfeat; ++i) {
       GffObj* f = reader.gflst[i];
       if (f->isTranscript()) {

           std::cout << f->getID() << '\t' << f->getGeneID() << '\t';
           if (f->attrs) {
               for (size_t j=0; j < f->attrs->Count(); ++j) {
                   std::cout << f->getAttrName(j) << "\t" << f->getAttrValue(j) << "\t";
               }
           }
           std::cout << "\n";
       }
    }
    std::exit(0);
}
Exemplo n.º 2
0
bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
 //returns true if the transcript passed the filter
 char* gname=gffrec.getGeneName();
 if (gname==NULL) gname=gffrec.getGeneID();
 GStr defline(gffrec.getID());
 if (f_out && !fmtGTF) {
     const char* tname=NULL;
     if ((tname=gffrec.getAttr("transcript_name"))!=NULL) {
        gffrec.addAttr("Name", tname);
        gffrec.removeAttr("transcript_name");
        }
     }
 if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
      const char* biotype=gffrec.getAttr("gene_biotype");
      if (biotype) {
         gffrec.addAttr("type", biotype);
         gffrec.removeAttr("gene_biotype");
         }
       else { //old Ensembl files lacking gene_biotype
         gffrec.addAttr("type", gffrec.getTrackName());
         }

      //bool is_gene=false;
      bool is_pseudo=false;
      if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS())
                gffrec.setFeatureName("mRNA");
       else {
          if (strcmp(biotype, "processed_transcript")==0) 
              gffrec.setFeatureName("proc_RNA");
            else {
              //is_gene=endsWith(biotype, "gene");
              is_pseudo=strifind(biotype, "pseudo");
              if (is_pseudo) {
                   gffrec.setFeatureName("pseudo_RNA");
                   }
                else if (endsWith(biotype, "RNA")) {
                   gffrec.setFeatureName(biotype);
                   } else gffrec.setFeatureName("misc_RNA");
              }
          }
      }
 if (gname && strcmp(gname, gffrec.getID())!=0) {
   int* isonum=isoCounter.Find(gname);
   if  (isonum==NULL) {
       isonum=new int(1);
       isoCounter.Add(gname,isonum);
       }
      else (*isonum)++;
   defline.appendfmt(" gene=%s", gname);
   }
  int seqlen=0;

  const char* tlabel=tracklabel;
  if (tlabel==NULL) tlabel=gffrec.getTrackName();
  //defline.appendfmt(" track:%s",tlabel);
  char* cdsnt = NULL;
  char* cdsaa = NULL;
  int aalen=0;
  for (int i=1;i<gffrec.exons.Count();i++) {
     int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
     if (ilen>4000000) 
            GMessage("Warning: very large intron (%d) for transcript %s\n",
                           ilen, gffrec.getID());
     if (ilen>maxintron) {
         return false;
         }
     }
  GList<GSeg> seglst(false,true);
  GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec);
  if (spliceCheck && gffrec.exons.Count()>1) {
    //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
    if (faseq==NULL) GError("Error: no genomic sequence available!\n");
    int glen=gffrec.end-gffrec.start+1;
    const char* gseq=faseq->subseq(gffrec.start, glen);
    bool revcompl=(gffrec.strand=='-');
    bool ssValid=true;
    for (int e=1;e<gffrec.exons.Count();e++) {
      const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
      int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
      GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
      GSpliceSite    donorSite(intron,intronlen, false, revcompl);
      //GMessage("%c intron %d-%d : %s .. %s\n",
      //           gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
      if (acceptorSite=="AG") { // GT-AG or GC-AG
         if (!donorSite.canonicalDonor()) {
            ssValid=false;break;
            }
         }
      else if (acceptorSite=="AC") { //
         if (donorSite!="AT") { ssValid=false; break; }
         }
      else { ssValid=false; break; }
      }
    //GFREE(gseq);
    if (!ssValid) {
      if (verbose)
         GMessage("Invalid splice sites found for '%s'\n",gffrec.getID());
      return false; //don't print this one!
      }
    }

  bool trprint=true;
  int stopCodonAdjust=0;
  int mCDphase=0;
  bool hasStop=false;
  if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
      mCDphase = gffrec.CDphase-'0';
  if (f_y!=NULL || f_x!=NULL || validCDSonly) {
    if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
    //if (protmap && fullCDSonly) {
    //if (protmap && (fullCDSonly ||  (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) {
    
    if (validCDSonly) { //make sure the stop codon is always included 
      //adjust_stopcodon(gffrec,3);
      stopCodonAdjust=adjust_stopcodon(gffrec,3);
      }
    int strandNum=0;
    int phaseNum=0;
  CDS_CHECK:
    cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, NULL, &seglst);
    if (cdsnt==NULL) trprint=false;
    else { //has CDS
      if (validCDSonly) {
         cdsaa=translateDNA(cdsnt, aalen, seqlen);
         char* p=strchr(cdsaa,'.');
         hasStop=false;
         if (p!=NULL) {
              if (p-cdsaa>=aalen-2) { //stop found as the last codon
                      *p='0';//remove it
                      hasStop=true;
                      if (aalen-2==p-cdsaa) {
                        //previous to last codon is the stop codon
                        //so correct the CDS stop accordingly
                        adjust_stopcodon(gffrec,-3, &seglst);
                        stopCodonAdjust=0; //clear artificial stop adjustment
                        seqlen-=3;
                        cdsnt[seqlen]=0;
                        }
                      aalen=p-cdsaa;
                      }
                   else {//stop found before the last codon
                      trprint=false;
                      }
              }//stop codon found
         if (trprint==false) { //failed CDS validity check
           //in-frame stop codon found
           if (altPhases && phaseNum<3) {
              phaseNum++;
              gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
              GFREE(cdsaa);
              goto CDS_CHECK;
              }
           if (gffrec.exons.Count()==1 && bothStrands) {
              strandNum++;
              phaseNum=0;
              if (strandNum<2) {
                 GFREE(cdsaa);
                 gffrec.strand = (gffrec.strand=='-') ? '+':'-';
                 goto CDS_CHECK; //repeat the CDS check for a different frame
                 }
              }
           if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID());
           } //has in-frame STOP
         if (fullCDSonly) {
             if (!hasStop || cdsaa[0]!='M') trprint=false;
             }
         } // CDS check requested
      } //has CDS
    } //translation or codon check/output was requested
  if (!trprint) {
    GFREE(cdsnt);
    GFREE(cdsaa);
    return false;
    }
  if (stopCodonAdjust>0 && !hasStop) {
          //restore stop codon location
          adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
          if (cdsnt!=NULL && seqlen>0) {
             seqlen-=stopCodonAdjust;
             cdsnt[seqlen]=0;
             }
          if (cdsaa!=NULL) aalen--;
          }

  if (f_y!=NULL) { //CDS translation fasta output requested
         //char* 
         if (cdsaa==NULL) { //translate now if not done before
           cdsaa=translateDNA(cdsnt, aalen, seqlen);
           }
         if (fullattr && gffrec.attrs!=NULL) {
             //append all attributes found for each transcripts
              for (int i=0;i<gffrec.attrs->Count();i++) {
                defline.append(" ");
                defline.append(gffrec.getAttrName(i));
                defline.append("=");
                defline.append(gffrec.getAttrValue(i));
                }
              }
         printFasta(f_y, defline, cdsaa, aalen);
         }
   if (f_x!=NULL) { //CDS only
         if (writeExonSegs) {
              defline.append(" loc:");
              defline.append(gffrec.getGSeqName());
              defline.appendfmt("(%c)",gffrec.strand);
              //warning: not CDS coordinates are written here, but the exon ones
              defline+=(int)gffrec.start;
              defline+=(char)'-';
              defline+=(int)gffrec.end;
              // -- here these are CDS substring coordinates on the spliced sequence:
              defline.append(" segs:");
              for (int i=0;i<seglst.Count();i++) {
                  if (i>0) defline.append(",");
                  defline+=(int)seglst[i]->start;
                  defline.append("-");
                  defline+=(int)seglst[i]->end;
                  }
              }
         if (fullattr && gffrec.attrs!=NULL) {
             //append all attributes found for each transcript
              for (int i=0;i<gffrec.attrs->Count();i++) {
                defline.append(" ");
                defline.append(gffrec.getAttrName(i));
                defline.append("=");
                defline.append(gffrec.getAttrValue(i));
                }
              }
         printFasta(f_x, defline, cdsnt, seqlen);
         }
 GFREE(cdsnt);
 GFREE(cdsaa);
 if (f_w!=NULL) { //write spliced exons
    uint cds_start=0;
    uint cds_end=0;
    seglst.Clear();
    char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
    if (exont!=NULL) {
    if (gffrec.CDstart>0) {
        defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
        }
      if (writeExonSegs) {
        defline.append(" loc:");
        defline.append(gffrec.getGSeqName());
        defline+=(char)'|';
        defline+=(int)gffrec.start;
        defline+=(char)'-';
        defline+=(int)gffrec.end;
        defline+=(char)'|';
        defline+=(char)gffrec.strand;
        defline.append(" exons:");
        for (int i=0;i<gffrec.exons.Count();i++) {
                if (i>0) defline.append(",");
                defline+=(int)gffrec.exons[i]->start;
                defline.append("-");
                defline+=(int)gffrec.exons[i]->end;
                }
        defline.append(" segs:");
        for (int i=0;i<seglst.Count();i++) {
            if (i>0) defline.append(",");
            defline+=(int)seglst[i]->start;
            defline.append("-");
            defline+=(int)seglst[i]->end;
            }
        }
      if (fullattr && gffrec.attrs!=NULL) {
       //append all attributes found for each transcripts
        for (int i=0;i<gffrec.attrs->Count();i++) {
          defline.append(" ");
          defline.append(gffrec.getAttrName(i));
          defline.append("=");
          defline.append(gffrec.getAttrValue(i));
          }
        }
      printFasta(f_w, defline, exont, seqlen);
      GFREE(exont);
      }
    } //writing f_w (spliced exons)
 return true;
}