int main(int argc, char **argv) { AjPSeqall nucseq; /* input nucleic sequences */ AjPSeqset protseq; /* input aligned protein sequences */ AjPSeqout seqout; AjPSeq nseq; /* next nucleic sequence to align */ const AjPSeq pseq; /* next protein sequence use in alignment */ AjPTrn trnTable; AjPSeq pep; /* translation of nseq */ AjPStr tablelist; ajint table; AjPSeqset outseqset; /* set of aligned nucleic sequences */ ajint proteinseqcount = 0; AjPStr degapstr = NULL; /* used to check if it matches with START removed */ AjPStr degapstr2 = NULL; AjPStr codon = NULL; /* holds temporary codon to check if is START */ char aa; /* translated putative START codon */ ajint type; /* returned type of the putative START codon */ /* start position of guide protein in translation */ ajlong pos = 0; AjPSeq newseq = NULL; /* output aligned nucleic sequence */ ajint frame; embInit("tranalign", argc, argv); nucseq = ajAcdGetSeqall("asequence"); protseq = ajAcdGetSeqset("bsequence"); tablelist = ajAcdGetListSingle("table"); seqout = ajAcdGetSeqoutset("outseq"); outseqset = ajSeqsetNew(); degapstr = ajStrNew(); /* initialise the translation table */ ajStrToInt(tablelist, &table); trnTable = ajTrnNewI(table); ajSeqsetFill(protseq); while(ajSeqallNext(nucseq, &nseq)) { if((pseq = ajSeqsetGetseqSeq(protseq, proteinseqcount++)) == NULL) ajErr("No guide protein sequence available for " "nucleic sequence %S", ajSeqGetNameS(nseq)); ajDebug("Aligning %S and %S\n", ajSeqGetNameS(nseq), ajSeqGetNameS(pseq)); /* get copy of pseq string with no gaps */ ajStrAssignS(°apstr, ajSeqGetSeqS(pseq)); ajStrRemoveGap(°apstr); /* ** for each translation frame look for subset of pep that ** matches pseq */ for(frame = 1; frame <4; frame++) { ajDebug("trying frame %d\n", frame); pep = ajTrnSeqOrig(trnTable, nseq, frame); degapstr2 = ajStrNew(); ajStrAssignRef(°apstr2, degapstr); pos = ajStrFindCaseS(ajSeqGetSeqS(pep), degapstr); /* ** we might have a START codon that should be translated as 'M' ** we need to check if there is a match after a possible START ** codon */ if(pos == -1 && ajStrGetLen(degapstr) > 1 && (ajStrGetPtr(degapstr)[0] == 'M' || ajStrGetPtr(degapstr)[0] == 'm')) { /* see if pep minus the first character is a match */ ajStrCutStart(°apstr2, 1); pos = ajStrFindCaseS(ajSeqGetSeqS(pep), degapstr2); /* ** pos is >= 1 if we have a match that is after the first ** residue */ if(pos >= 1) { /* point back at the putative START Methionine */ pos--; /* test if first codon is a START */ codon = ajStrNew(); ajStrAssignSubS(&codon, ajSeqGetSeqS(nseq), (pos*3)+frame-1, (pos*3)+frame+2); type = ajTrnCodonstrTypeS(trnTable, codon, &aa); if(type != 1) { /* first codon is not a valid START, force a mismatch */ pos = -1; } ajStrDel(&codon); } else { /* force 'pos == 0' to be treated as a mismatch */ pos = -1; } } ajStrDel(°apstr2); ajSeqDel(&pep); if(pos != -1) break; } if(pos == -1) ajErr("Guide protein sequence %S not found in nucleic sequence %S", ajSeqGetNameS(pseq), ajSeqGetNameS(nseq)); else { ajDebug("got a match with frame=%d\n", frame); /* extract the coding region of nseq with gaps */ newseq = ajSeqNew(); ajSeqSetNuc(newseq); ajSeqAssignNameS(newseq, ajSeqGetNameS(nseq)); ajSeqAssignDescS(newseq, ajSeqGetDescS(nseq)); tranalign_AddGaps(newseq, nseq, pseq, (pos*3)+frame-1); /* output the gapped nucleic sequence */ ajSeqsetApp(outseqset, newseq); ajSeqDel(&newseq); } ajStrRemoveWhiteExcess(°apstr); } ajSeqoutWriteSet(seqout, outseqset); ajSeqoutClose(seqout); ajTrnDel(&trnTable); ajSeqsetDel(&outseqset); ajStrDel(°apstr); ajStrDel(°apstr2); ajSeqallDel(&nucseq); ajSeqDel(&nseq); ajSeqoutDel(&seqout); ajSeqsetDel(&protseq); ajStrDel(&tablelist); embExit(); return 0; }
void getorf_FindORFs(const AjPSeq seq, ajint len, const AjPTrn trnTable, ajuint minsize, ajuint maxsize, AjPSeqout seqout, AjBool sense, AjBool circular, ajint find, ajint *orf_no, AjBool methionine, ajint around, ORFrec *record) { AjBool ORF[3]; /* true if found an ORF */ AjBool LASTORF[3]; /* true if hit the end of an ORF past the end on the genome in this frame */ AjBool GOTSTOP[3]; /* true if found a STOP in a circular genome's frame when find = P_STOP2STOP or N_STOP2STOP */ ajint start[3]; /* possible starting position of the three frames */ ajint pos; ajint codon; char aa; ajint frame; AjPStr newstr[3]; /* strings of the three frames of ORF sequences that we are growing */ AjPSeq pep = NULL; ajint i; ajint seqlen; const char *chrseq; seqlen = ajSeqGetLen(seq); chrseq = ajSeqGetSeqC(seq); /* initialise the ORF sequences */ newstr[0] = NULL; newstr[1] = NULL; newstr[2] = NULL; /* ** initialise flags for found the last ORF past the end of a circular ** genome */ LASTORF[0] = ajFalse; LASTORF[1] = ajFalse; LASTORF[2] = ajFalse; /* initialise flags for found at least one STOP codon in a frame */ GOTSTOP[0] = ajFalse; GOTSTOP[1] = ajFalse; GOTSTOP[2] = ajFalse; if (circular || find == P_START2STOP || find == N_START2STOP || find == AROUND_START) { ORF[0] = ajFalse; ORF[1] = ajFalse; ORF[2] = ajFalse; } else { /* ** assume already in a ORF so we get ORFs at the start of the ** sequence */ ORF[0] = ajTrue; ORF[1] = ajTrue; ORF[2] = ajTrue; start[0] = 0; start[1] = 1; start[2] = 2; } for (pos=0; pos<seqlen-2; pos++) { codon = ajTrnStartStopC(trnTable, &chrseq[pos], &aa); frame = pos % 3; ajDebug("len=%d, Pos=%d, Frame=%d start/stop=%d, aa=%c", len, pos, frame, codon, aa); /* don't want to find extra ORFs when already been round circ */ if (LASTORF[frame]) continue; if (find == P_STOP2STOP || find == N_STOP2STOP || find == AROUND_INIT_STOP || find == AROUND_END_STOP) { /* look for stop codon to begin reporting ORF */ /* note that there was at least one STOP in a circular genome */ if (codon == STOP) { GOTSTOP[frame] = ajTrue; } /* write details if a STOP is hit or the end of the sequence */ if (codon == STOP || pos >= seqlen-5) { /* ** End of the sequence? If so, append any ** last codon to the sequence - otherwise, ignore the STOP ** codon */ if (codon != STOP) getorf_AppORF(find, &newstr[frame], chrseq, pos, aa); /* Already have a sequence to write out? */ if (ORF[frame]) { if (ajStrGetLen(newstr[frame]) >= minsize && ajStrGetLen(newstr[frame]) <= maxsize) { /* create a new sequence */ if (codon == STOP) getorf_WriteORF(seq, len, seqlen, sense, find, orf_no, start[frame], pos-1, newstr[frame], seqout, around); else getorf_WriteORF(seq, len, seqlen, sense, find, orf_no, start[frame], pos+2, newstr[frame], seqout, around); } ajStrSetClear(&newstr[frame]); } /* ** if its a circular genome and the STOP codon hits past ** the end of the genome in all frames, then break */ if (circular && pos >= len) { ORF[frame] = ajFalse; /* past the end of the genome */ LASTORF[frame] = ajTrue; /* finished getting ORFs */ if (LASTORF[0] && LASTORF[1] && LASTORF[2]) break; } else { /* ** hit a STOP, therefore a potential ORF to write ** out next time, even if the genome is circular */ ORF[frame] = ajTrue; start[frame] = pos+3; /* next start of the ORF */ } } else if (ORF[frame]) /* append sequence to newstr if in an ORF */ getorf_AppORF(find, &newstr[frame], chrseq, pos, aa); } else { /* Look for start: P_START2STOP N_START2STOP AROUND_START */ if (codon == START && !ORF[frame]) { /* not in a ORF already and found a START */ if (pos < len) { /* ** reset the newstr to zero length to enable ** storing the ORF for this */ ajStrSetClear(&newstr[frame]); ORF[frame] = ajTrue; /* now in an ORF */ start[frame] = pos; /* start of the ORF for this frame */ if (methionine) getorf_AppORF(find, &newstr[frame], chrseq, pos, 'M'); else getorf_AppORF(find, &newstr[frame], chrseq, pos, aa); } } else if (codon == STOP) { /* hit a STOP */ /* Already have a sequence to write out? */ if (ORF[frame]) { ORF[frame] = ajFalse; /* not in an ORF */ if (ajStrGetLen(newstr[frame]) >= minsize && ajStrGetLen(newstr[frame]) <= maxsize) { /* create a new sequence */ getorf_WriteORF(seq, len, seqlen, sense, find, orf_no, start[frame], pos-1, newstr[frame], seqout, around); } } /* ** if a circular genome and hit the STOP past ** the end of the genome in all frames, then break */ if (circular && pos >= len) { LASTORF[frame] = ajTrue; /* finished getting ORFs */ if (LASTORF[0] && LASTORF[1] && LASTORF[2]) break; } ajStrSetClear(&newstr[frame]); } else if (pos >= seqlen-5) { /* hit the end of the sequence without a stop */ /* Already have a sequence to write out? */ if (ORF[frame]) { ORF[frame] = ajFalse; /* not in an ORF */ /* ** End of the sequence? If so, append any ** last codon to the sequence - otherwise, ignore the ** STOP codon */ if (pos >= seqlen-5 && pos < seqlen-2) getorf_AppORF(find, &newstr[frame], chrseq, pos, aa); if (ajStrGetLen(newstr[frame]) >= minsize && ajStrGetLen(newstr[frame]) <= maxsize) { /* create a new sequence */ getorf_WriteORF(seq, len, seqlen, sense, find, orf_no, start[frame], pos+2, newstr[frame], seqout, around); } } /* ** if a circular genome and hit the STOP past ** the end of the genome in all frames, then break */ if (circular && pos >= len) { LASTORF[frame] = ajTrue; /* finished getting ORFs */ if (LASTORF[0] && LASTORF[1] && LASTORF[2]) break; } ajStrSetClear(&newstr[frame]); } else if (ORF[frame]) getorf_AppORF(find, &newstr[frame], chrseq, pos, aa); } } /* ** Currently miss reporting a STOP-to-STOP ORF that is ** the full length of a circular genome when there are no STOP codons in ** that frame */ if ((find == P_STOP2STOP || find == N_STOP2STOP) && circular) { if (!GOTSTOP[0]) { /* translate frame 1 into pep */ pep = ajTrnSeqOrig(trnTable, seq, 1); if (ajSeqGetLen(pep) >= minsize && ajSeqGetLen(pep) <= maxsize) getorf_WriteORF(seq, len, seqlen, sense, find, orf_no, 0, seqlen-1, ajSeqGetSeqS(pep), seqout, around); ajSeqDel(&pep); } if (!GOTSTOP[1]) { /* translate frame 2 into pep */ pep = ajTrnSeqOrig(trnTable, seq, 2); if (ajSeqGetLen(pep) >= minsize && ajSeqGetLen(pep) <= maxsize) getorf_WriteORF(seq, len, seqlen, sense, find, orf_no, 1, seqlen-1, ajSeqGetSeqS(pep), seqout, around); ajSeqDel(&pep); } if (!GOTSTOP[2]) { /* translate frame 3 into pep */ pep = ajTrnSeqOrig(trnTable, seq, 3); if (ajSeqGetLen(pep) >= minsize && ajSeqGetLen(pep) >= maxsize) getorf_WriteORF(seq, len, seqlen, sense, find, orf_no, 2, seqlen-1, ajSeqGetSeqS(pep), seqout, around); ajSeqDel(&pep); } } for (i=0;i<3;++i) ajStrDel(&newstr[i]); return; }