Esempio n. 1
0
/*
** tranalign entry point.
**
** Reads a stream of nucleic sequences ("asequence") and a set of aligned
** guide protein sequences ("bsequence"). The Nth nucleic sequence is
** paired with the Nth protein. For each pair the nucleic sequence is
** translated in frames 1..3 and searched (case-insensitively) for the
** degapped guide protein. When a match is found, tranalign_AddGaps() is
** called with the matching nucleotide offset to build the output sequence,
** which is appended to the output set. The whole set is written to
** "outseq" once all input has been processed.
**
** A guide protein that starts with 'M' is also accepted when everything
** after that first residue matches the translation and the codon
** immediately before the match is classified as type 1 by
** ajTrnCodonstrTypeS() (treated here as a START codon).
**
** Pairs with no guide protein, or with no match in any frame, are reported
** with ajErr() and skipped.
**
** Returns 0.
*/
int main(int argc, char **argv)
{
    AjPSeqall nucseq;           /* input nucleic sequences */
    AjPSeqset protseq;          /* input aligned protein sequences */
    AjPSeqout seqout;
    AjPSeq nseq;                /* next nucleic sequence to align */
    const AjPSeq pseq;          /* next protein sequence use in alignment */
    AjPTrn trnTable;
    AjPSeq pep;                 /* translation of nseq */
    AjPStr tablelist;
    ajint table;
    AjPSeqset outseqset;        /* set of aligned nucleic sequences */
    ajint proteinseqcount = 0;  /* index of the next guide protein */
    AjPStr degapstr = NULL;     /* guide protein with gaps removed */
    /* used to check if it matches with START removed */
    AjPStr degapstr2 = NULL;
    AjPStr codon = NULL;        /* holds temporary codon to check if is START */
    char aa;                    /* translated putative START codon */
    ajint type;                 /* returned type of the putative START codon */
    /* start position of guide protein in translation */
    ajlong pos = 0;
    AjPSeq newseq = NULL;       /* output aligned nucleic sequence */
    ajint frame;

    embInit("tranalign", argc, argv);

    nucseq    = ajAcdGetSeqall("asequence");
    protseq   = ajAcdGetSeqset("bsequence");
    tablelist = ajAcdGetListSingle("table");
    seqout    = ajAcdGetSeqoutset("outseq");

    outseqset = ajSeqsetNew();
    degapstr  = ajStrNew();

    /* initialise the translation table */
    ajStrToInt(tablelist, &table);
    trnTable = ajTrnNewI(table);

    ajSeqsetFill(protseq);

    while(ajSeqallNext(nucseq, &nseq))
    {
        pseq = ajSeqsetGetseqSeq(protseq, proteinseqcount++);

        if(pseq == NULL)
        {
            ajErr("No guide protein sequence available for "
                  "nucleic sequence %S",
                  ajSeqGetNameS(nseq));

            /*
            ** Nothing to align against. ajErr() is expected to return
            ** to the caller, so skip this nucleic sequence instead of
            ** carrying on with a NULL guide protein.
            */
            continue;
        }

        ajDebug("Aligning %S and %S\n",
                ajSeqGetNameS(nseq), ajSeqGetNameS(pseq));

        /* get copy of pseq string with no gaps */
        ajStrAssignS(&degapstr, ajSeqGetSeqS(pseq));
        ajStrRemoveGap(&degapstr);

        /*
        ** for each translation frame look for subset of pep that
        ** matches pseq
        */
        for(frame = 1; frame < 4; frame++)
        {
            ajDebug("trying frame %d\n", frame);
            pep = ajTrnSeqOrig(trnTable, nseq, frame);

            /* working copy of the degapped protein for the START test */
            degapstr2 = ajStrNew();
            ajStrAssignRef(&degapstr2, degapstr);

            pos = ajStrFindCaseS(ajSeqGetSeqS(pep), degapstr);

            /*
            ** we might have a START codon that should be translated as 'M'
            ** we need to check if there is a match after a possible START
            ** codon
            */
            if(pos == -1 && ajStrGetLen(degapstr) > 1 &&
               (ajStrGetPtr(degapstr)[0] == 'M' ||
                ajStrGetPtr(degapstr)[0] == 'm'))
            {
                /* see if pep minus the first character is a match */
                ajStrCutStart(&degapstr2, 1);
                pos = ajStrFindCaseS(ajSeqGetSeqS(pep), degapstr2);

                /*
                ** pos is >= 1 if we have a match that is after the first
                ** residue
                */
                if(pos >= 1)
                {
                    /* point back at the putative START Methionine */
                    pos--;

                    /* test if first codon is a START */
                    codon = ajStrNew();
                    ajStrAssignSubS(&codon, ajSeqGetSeqS(nseq),
                                    (pos*3)+frame-1, (pos*3)+frame+2);
                    type = ajTrnCodonstrTypeS(trnTable, codon, &aa);

                    if(type != 1)
                    {
                        /* first codon is not a valid START, force a mismatch */
                        pos = -1;
                    }

                    ajStrDel(&codon);
                }
                else
                {
                    /* force 'pos == 0' to be treated as a mismatch */
                    pos = -1;
                }
            }

            ajStrDel(&degapstr2);
            ajSeqDel(&pep);

            /* stop at the first frame that matches; 'frame' is used below */
            if(pos != -1)
                break;
        }

        if(pos == -1)
        {
            ajErr("Guide protein sequence %S not found in nucleic sequence %S",
                  ajSeqGetNameS(pseq), ajSeqGetNameS(nseq));
        }
        else
        {
            ajDebug("got a match with frame=%d\n", frame);

            /* extract the coding region of nseq with gaps */
            newseq = ajSeqNew();
            ajSeqSetNuc(newseq);
            ajSeqAssignNameS(newseq, ajSeqGetNameS(nseq));
            ajSeqAssignDescS(newseq, ajSeqGetDescS(nseq));

            /* convert residue offset and 1-based frame to a base offset */
            tranalign_AddGaps(newseq, nseq, pseq, (pos*3)+frame-1);

            /* output the gapped nucleic sequence */
            ajSeqsetApp(outseqset, newseq);

            ajSeqDel(&newseq);
        }

        ajStrRemoveWhiteExcess(&degapstr);
    }

    ajSeqoutWriteSet(seqout, outseqset);
    ajSeqoutClose(seqout);

    ajTrnDel(&trnTable);
    ajSeqsetDel(&outseqset);
    ajStrDel(&degapstr);
    ajStrDel(&degapstr2);

    ajSeqallDel(&nucseq);
    ajSeqDel(&nseq);
    ajSeqoutDel(&seqout);
    ajSeqsetDel(&protseq);
    ajStrDel(&tablelist);

    embExit();

    return 0;
}
Esempio n. 2
0
/*
** getorf_FindORFs
**
** Walks the sequence one base at a time, classifying the codon at each
** position with ajTrnStartStopC(), and tracks one growing ORF per reading
** frame (frame = pos % 3). Completed ORFs whose accumulated length lies in
** [minsize, maxsize] are passed to getorf_WriteORF().
**
** Two scanning modes are selected by 'find':
**   - P_STOP2STOP, N_STOP2STOP, AROUND_INIT_STOP, AROUND_END_STOP:
**     an ORF runs from after one STOP to the next STOP (or the end of the
**     sequence).
**   - anything else (P_START2STOP, N_START2STOP, AROUND_START):
**     an ORF runs from a START codon to the next STOP (or the end of the
**     sequence).
**
** For circular STOP-to-STOP searches, a frame containing no STOP at all is
** translated whole and reported after the scan.
**
** seq        sequence to scan
** len        genome length; with 'circular', a position >= len counts as
**            past the end of the genome. Also passed to getorf_WriteORF()
** trnTable   translation table used to classify and translate codons
** minsize    minimum accepted ORF length (length of the accumulated string)
** maxsize    maximum accepted ORF length
** seqout     output stream handed to getorf_WriteORF()
** sense      passed through to getorf_WriteORF()
** circular   true if the sequence is to be treated as circular
** find       search mode (see above); also passed to the helper functions
** orf_no     running ORF counter, passed through to getorf_WriteORF()
** methionine if true, the START codon that opens an ORF is appended as 'M'
**            instead of its translated residue
** around     passed through to getorf_WriteORF()
** record     not used by this function
*/
void getorf_FindORFs(const AjPSeq seq, ajint len, const AjPTrn trnTable,
                     ajuint minsize, ajuint maxsize, AjPSeqout seqout,
                     AjBool sense, AjBool circular, ajint find,
                     ajint *orf_no, AjBool methionine, ajint around,
                     ORFrec *record) {
  AjBool ORF[3];      /* true if currently inside an ORF in this frame */
  AjBool LASTORF[3];  /* true if hit the end of an ORF past the end of
                         the genome in this frame */
  AjBool GOTSTOP[3];  /* true if found a STOP in this frame while in a
                         STOP-to-STOP style search */
  ajint start[3];     /* possible starting position of the three frames;
                         only read while ORF[frame] is true */
  ajint pos;
  ajint codon;        /* START/STOP classification of the current codon */
  char aa;            /* residue translated from the current codon */
  ajint frame;
  AjPStr newstr[3];   /* strings of the three frames of ORF sequences
                         that we are growing */
  AjPSeq pep = NULL;
  ajint i;

  ajint seqlen;
  const char *chrseq;

  seqlen = ajSeqGetLen(seq);
  chrseq = ajSeqGetSeqC(seq);

  for (i = 0; i < 3; ++i) {
    /* initialise the ORF sequences */
    newstr[i] = NULL;

    /*
    ** initialise flags for found the last ORF past the end of a circular
    ** genome
    */
    LASTORF[i] = ajFalse;

    /* initialise flags for found at least one STOP codon in a frame */
    GOTSTOP[i] = ajFalse;
  }

  if (circular || find == P_START2STOP || find == N_START2STOP ||
      find == AROUND_START) {
    ORF[0] = ajFalse;
    ORF[1] = ajFalse;
    ORF[2] = ajFalse;
  } else {
    /*
    ** assume already in a ORF so we get ORFs at the start of the
    ** sequence
    */
    ORF[0] = ajTrue;
    ORF[1] = ajTrue;
    ORF[2] = ajTrue;
    start[0] = 0;
    start[1] = 1;
    start[2] = 2;
  }

  /*
  ** pos stops at seqlen-3 so a full codon is always available. The test
  ** 'pos >= seqlen-5' used below is therefore true for exactly the last
  ** three positions visited, i.e. the final codon of each frame.
  */
  for (pos = 0; pos < seqlen-2; pos++) {
    codon = ajTrnStartStopC(trnTable, &chrseq[pos], &aa);
    frame = pos % 3;
    ajDebug("len=%d, Pos=%d, Frame=%d start/stop=%d, aa=%c",
            len, pos, frame, codon, aa);

    /* don't want to find extra ORFs when already been round circ */
    if (LASTORF[frame]) {
      continue;
    }

    if (find == P_STOP2STOP || find == N_STOP2STOP ||
        find == AROUND_INIT_STOP || find == AROUND_END_STOP) {
      /* look for stop codon to begin reporting ORF */

      /* note that there was at least one STOP in a circular genome */
      if (codon == STOP) {
        GOTSTOP[frame] = ajTrue;
      }

      /* write details if a STOP is hit or the end of the sequence */
      if (codon == STOP || pos >= seqlen-5) {

        /*
        ** End of the sequence? If so, append any
        ** last codon to the sequence - otherwise, ignore the STOP
        ** codon
        */
        if (codon != STOP) {
          getorf_AppORF(find, &newstr[frame], chrseq, pos, aa);
        }

        /* Already have a sequence to write out? */
        if (ORF[frame]) {
          if (ajStrGetLen(newstr[frame]) >= minsize &&
              ajStrGetLen(newstr[frame]) <= maxsize) {
            /*
            ** create a new sequence: the ORF ends just before a STOP,
            ** or on the last base of the final codon otherwise
            */
            getorf_WriteORF(seq, len, seqlen, sense,
                            find, orf_no, start[frame],
                            (codon == STOP) ? pos-1 : pos+2,
                            newstr[frame],
                            seqout, around);
          }

          ajStrSetClear(&newstr[frame]);
        }

        /*
        ** if its a circular genome and the STOP codon hits past
        ** the end of the genome in all frames, then break
        */
        if (circular && pos >= len) {
          ORF[frame] = ajFalse; /* past the end of the genome */
          LASTORF[frame] = ajTrue; /* finished getting ORFs */
          if (LASTORF[0] && LASTORF[1] && LASTORF[2]) {
            break;
          }
        } else {
          /*
          ** hit a STOP, therefore a potential ORF to write
          ** out next time, even if the genome is circular
          */
          ORF[frame]   = ajTrue;
          start[frame] = pos+3; /* next start of the ORF */
        }

      } else if (ORF[frame]) {
        /* append sequence to newstr if in an ORF */
        getorf_AppORF(find, &newstr[frame], chrseq, pos, aa);
      }
    } else { /* Look for start: P_START2STOP N_START2STOP AROUND_START */

      if (codon == START && !ORF[frame]) {
        /* not in a ORF already and found a START */
        if (pos < len) {
          /*
          **  reset the newstr to zero length to enable
          **  storing the ORF for this
          */
          ajStrSetClear(&newstr[frame]);
          ORF[frame] = ajTrue; /* now in an ORF */
          start[frame] = pos;    /* start of the ORF for this frame */
          if (methionine) {
            getorf_AppORF(find, &newstr[frame], chrseq,
                          pos, 'M');
          } else {
            getorf_AppORF(find, &newstr[frame], chrseq,
                          pos, aa);
          }
        }
      } else if (codon == STOP) {
        /* hit a STOP */

        /* Already have a sequence to write out? */
        if (ORF[frame]) {
          ORF[frame] = ajFalse; /* not in an ORF */

          if (ajStrGetLen(newstr[frame]) >= minsize &&
              ajStrGetLen(newstr[frame]) <= maxsize) {
            /* create a new sequence */
            getorf_WriteORF(seq, len, seqlen, sense,
                            find, orf_no, start[frame],
                            pos-1, newstr[frame],
                            seqout, around);
          }
        }

        /*
        ** if a circular genome and hit the STOP past
        ** the end of the genome in all frames, then break
        */
        if (circular && pos >= len) {
          LASTORF[frame] = ajTrue; /* finished getting ORFs */
          if (LASTORF[0] && LASTORF[1] && LASTORF[2]) {
            break;
          }
        }

        ajStrSetClear(&newstr[frame]);
      } else if (pos >= seqlen-5) {
        /* hit the end of the sequence  without a stop */

        /* Already have a sequence to write out? */
        if (ORF[frame]) {
          ORF[frame] = ajFalse; /* not in an ORF */

          /*
          ** End of the sequence? If so, append any
          ** last codon to the sequence - otherwise, ignore the
          ** STOP codon
          */
          if (pos >= seqlen-5 && pos < seqlen-2) {
            getorf_AppORF(find, &newstr[frame], chrseq,
                          pos, aa);
          }

          if (ajStrGetLen(newstr[frame]) >= minsize &&
              ajStrGetLen(newstr[frame]) <= maxsize) {
            /* create a new sequence */
            getorf_WriteORF(seq, len, seqlen, sense,
                            find, orf_no, start[frame],
                            pos+2, newstr[frame],
                            seqout, around);
          }
        }

        /*
        ** if a circular genome and hit the end past
        ** the end of the genome in all frames, then break
        */
        if (circular && pos >= len) {
          LASTORF[frame] = ajTrue; /* finished getting ORFs */
          if (LASTORF[0] && LASTORF[1] && LASTORF[2]) {
            break;
          }
        }

        ajStrSetClear(&newstr[frame]);
      } else if (ORF[frame]) {
        /* ordinary codon inside an ORF: keep growing it */
        getorf_AppORF(find, &newstr[frame], chrseq, pos,
                      aa);
      }

    }
  }

  /*
  ** The scan above never reports a STOP-to-STOP ORF that is the full
  ** length of a circular genome when there are no STOP codons in that
  ** frame. For each such frame, translate the whole sequence in that
  ** frame and report it if its length is within [minsize, maxsize].
  ** All three frames use the same bounds test.
  */
  if ((find == P_STOP2STOP || find == N_STOP2STOP) && circular) {
    for (i = 0; i < 3; ++i) {
      if (GOTSTOP[i]) {
        continue;
      }

      /* ajTrnSeqOrig() takes a 1-based frame; the ORF start is 0-based */
      pep = ajTrnSeqOrig(trnTable, seq, i+1);
      if (ajSeqGetLen(pep) >= minsize &&
          ajSeqGetLen(pep) <= maxsize) {
        getorf_WriteORF(seq, len, seqlen, sense, find, orf_no,
                        i, seqlen-1, ajSeqGetSeqS(pep), seqout,
                        around);
      }
      ajSeqDel(&pep);
    }
  }

  for (i = 0; i < 3; ++i) {
    ajStrDel(&newstr[i]);
  }

  return;
}