int main(int argc, char **argv) { ajint begin, end; AjPSeqall seqall; AjPSeq seq; EmbPShow ss; AjPFile outfile; AjPStr tablename; ajint table; AjPRange uppercase; AjPRange highlight; AjBool threeletter; AjBool numberseq; AjBool nameseq; ajint width; ajint length; ajint margin; AjBool description; ajint offset; AjBool html; AjPStr descriptionline; ajint orfminsize; AjPTrn trnTable; AjBool translation; AjBool reverse; AjBool cutlist; AjBool flat; EmbPMatMatch mm = NULL; AjPStr *framelist; AjBool frames[6]; /* frames to be translated 1 to 3, -1 to -3 */ /* stuff for tables and lists of enzymes and hits */ ajint default_mincuts = 1; ajint default_maxcuts = 2000000000; AjPTable hittable; /* enzyme hits */ /* stuff lifted from Alan's 'restrict.c' */ AjPStr enzymes = NULL; ajint mincuts; ajint maxcuts; ajint sitelen; AjBool single; AjBool blunt; AjBool sticky; AjBool ambiguity; AjBool plasmid; AjBool commercial; AjBool limit; AjBool methyl; AjPFile enzfile = NULL; AjPFile equfile = NULL; AjPFile methfile = NULL; AjPTable retable = NULL; ajint hits; AjPList restrictlist = NULL; embInit("remap", argc, argv); seqall = ajAcdGetSeqall("sequence"); outfile = ajAcdGetOutfile("outfile"); tablename = ajAcdGetListSingle("table"); uppercase = ajAcdGetRange("uppercase"); highlight = ajAcdGetRange("highlight"); threeletter = ajAcdGetBoolean("threeletter"); numberseq = ajAcdGetBoolean("number"); width = ajAcdGetInt("width"); length = ajAcdGetInt("length"); margin = ajAcdGetInt("margin"); nameseq = ajAcdGetBoolean("name"); description = ajAcdGetBoolean("description"); offset = ajAcdGetInt("offset"); html = ajAcdGetBoolean("html"); orfminsize = ajAcdGetInt("orfminsize"); translation = ajAcdGetBoolean("translation"); reverse = ajAcdGetBoolean("reverse"); cutlist = ajAcdGetBoolean("cutlist"); flat = ajAcdGetBoolean("flatreformat"); framelist = ajAcdGetList("frame"); /* restriction enzyme stuff */ mincuts = ajAcdGetInt("mincuts"); maxcuts = ajAcdGetInt("maxcuts"); sitelen = ajAcdGetInt("sitelen"); single = ajAcdGetBoolean("single"); blunt = ajAcdGetBoolean("blunt"); sticky = ajAcdGetBoolean("sticky"); ambiguity = ajAcdGetBoolean("ambiguity"); plasmid = ajAcdGetBoolean("plasmid"); commercial = ajAcdGetBoolean("commercial"); limit = ajAcdGetBoolean("limit"); enzymes = ajAcdGetString("enzymes"); methfile = ajAcdGetDatafile("mfile"); methyl = ajAcdGetBoolean("methylation"); if(!blunt && !sticky) ajFatal("Blunt/Sticky end cutters shouldn't both be disabled."); /* get the number of the genetic code used */ ajStrToInt(tablename, &table); trnTable = ajTrnNewI(table); /* read the local file of enzymes names */ remap_read_file_of_enzyme_names(&enzymes); /* get the frames to be translated */ remap_GetFrames(framelist, frames); while(ajSeqallNext(seqall, &seq)) { /* get begin and end positions */ begin = ajSeqGetBegin(seq)-1; end = ajSeqGetEnd(seq)-1; /* do the name and description */ if(nameseq) { if(html) ajFmtPrintF(outfile, "<H2>%S</H2>\n", ajSeqGetNameS(seq)); else ajFmtPrintF(outfile, "%S\n", ajSeqGetNameS(seq)); } if(description) { /* ** wrap the description line at the width of the sequence ** plus margin */ if(html) ajFmtPrintF(outfile, "<H3>%S</H3>\n", ajSeqGetDescS(seq)); else { descriptionline = ajStrNew(); ajStrAssignS(&descriptionline, ajSeqGetDescS(seq)); ajStrFmtWrap(&descriptionline, width+margin); ajFmtPrintF(outfile, "%S\n", descriptionline); ajStrDel(&descriptionline); } } /* get the restriction cut sites */ /* ** most of this is lifted from the program 'restrict.c' by Alan ** Bleasby */ if(single) maxcuts=mincuts=1; retable = ajTablestrNew(EQUGUESS); enzfile = ajDatafileNewInNameC(ENZDATA); if(!enzfile) ajFatal("Cannot locate enzyme file. Run REBASEEXTRACT"); if(limit) { equfile = ajDatafileNewInNameC(EQUDATA); if(!equfile) limit = ajFalse; else remap_read_equiv(&equfile, &retable, commercial); } ajFileSeek(enzfile, 0L, 0); restrictlist = ajListNew(); /* search for hits, but don't use mincuts and maxcuts criteria yet */ hits = embPatRestrictMatch(seq, begin+1, end+1, enzfile, methfile, enzymes, sitelen,plasmid, ambiguity, default_mincuts, default_maxcuts, blunt, sticky, commercial, methyl, restrictlist); ajDebug("Remap found %d hits\n", hits); if(hits) { /* this bit is lifted from printHits */ embPatRestrictRestrict(restrictlist, hits, !limit, ajFalse); if(limit) remap_RestrictPreferred(restrictlist,retable); } ajFileClose(&enzfile); ajFileClose(&methfile); /* ** Remove those violating the mincuts and maxcuts ** criteria, but save them in hittable for printing out later. ** Keep a count of how many hits each enzyme gets in hittable. */ hittable = ajTablestrNewCase(TABLEGUESS); remap_RemoveMinMax(restrictlist, hittable, mincuts, maxcuts); /* make the Show Object */ ss = embShowNew(seq, begin, end, width, length, margin, html, offset); if(html) ajFmtPrintF(outfile, "<PRE>"); /* create the format to display */ embShowAddBlank(ss); embShowAddRE(ss, 1, restrictlist, plasmid, flat); embShowAddSeq(ss, numberseq, threeletter, uppercase, highlight); if(!numberseq) embShowAddTicknum(ss); embShowAddTicks(ss); if(reverse) { embShowAddComp(ss, numberseq); embShowAddRE(ss, -1, restrictlist, plasmid, flat); } if(translation) { if(reverse) embShowAddBlank(ss); if(frames[0]) embShowAddTran(ss, trnTable, 1, threeletter, numberseq, NULL, orfminsize, AJFALSE, AJFALSE, AJFALSE, AJFALSE); if(frames[1]) embShowAddTran(ss, trnTable, 2, threeletter, numberseq, NULL, orfminsize, AJFALSE, AJFALSE, AJFALSE, AJFALSE); if(frames[2]) embShowAddTran(ss, trnTable, 3, threeletter, numberseq, NULL, orfminsize, AJFALSE, AJFALSE, AJFALSE, AJFALSE); if(reverse) { embShowAddTicks(ss); if(frames[5]) embShowAddTran(ss, trnTable, -3, threeletter, numberseq, NULL, orfminsize, AJFALSE, AJFALSE, AJFALSE, AJFALSE); if(frames[4]) embShowAddTran(ss, trnTable, -2, threeletter, numberseq, NULL, orfminsize, AJFALSE, AJFALSE, AJFALSE, AJFALSE); if(frames[3]) embShowAddTran(ss, trnTable, -1, threeletter, numberseq, NULL, orfminsize, AJFALSE, AJFALSE, AJFALSE, AJFALSE); } } embShowPrint(outfile, ss); /* display a list of the Enzymes that cut and don't cut */ if(cutlist) { remap_CutList(outfile, hittable, limit, html, mincuts, maxcuts); remap_NoCutList(outfile, hittable, html, enzymes, blunt, sticky, sitelen, commercial, ambiguity, limit, retable); } /* add a gratuitous newline at the end of the sequence */ ajFmtPrintF(outfile, "\n"); /* tidy up */ embShowDel(&ss); while(ajListPop(restrictlist,(void **)&mm)) embMatMatchDel(&mm); ajListFree(&restrictlist); remap_DelTable(&hittable); ajTablestrFree(&retable); } ajTrnDel(&trnTable); ajSeqallDel(&seqall); ajSeqDel(&seq); ajFileClose(&outfile); ajStrDel(&tablename); ajStrDel(&enzymes); ajStrDelarray(&framelist); ajRangeDel(&uppercase); ajRangeDel(&highlight); embExit(); return 0; }
int main(int argc, char **argv) { AjPSeqall nucseq; /* input nucleic sequences */ AjPSeqset protseq; /* input aligned protein sequences */ AjPSeqout seqout; AjPSeq nseq; /* next nucleic sequence to align */ const AjPSeq pseq; /* next protein sequence use in alignment */ AjPTrn trnTable; AjPSeq pep; /* translation of nseq */ AjPStr tablelist; ajint table; AjPSeqset outseqset; /* set of aligned nucleic sequences */ ajint proteinseqcount = 0; AjPStr degapstr = NULL; /* used to check if it matches with START removed */ AjPStr degapstr2 = NULL; AjPStr codon = NULL; /* holds temporary codon to check if is START */ char aa; /* translated putative START codon */ ajint type; /* returned type of the putative START codon */ /* start position of guide protein in translation */ ajlong pos = 0; AjPSeq newseq = NULL; /* output aligned nucleic sequence */ ajint frame; embInit("tranalign", argc, argv); nucseq = ajAcdGetSeqall("asequence"); protseq = ajAcdGetSeqset("bsequence"); tablelist = ajAcdGetListSingle("table"); seqout = ajAcdGetSeqoutset("outseq"); outseqset = ajSeqsetNew(); degapstr = ajStrNew(); /* initialise the translation table */ ajStrToInt(tablelist, &table); trnTable = ajTrnNewI(table); ajSeqsetFill(protseq); while(ajSeqallNext(nucseq, &nseq)) { if((pseq = ajSeqsetGetseqSeq(protseq, proteinseqcount++)) == NULL) ajErr("No guide protein sequence available for " "nucleic sequence %S", ajSeqGetNameS(nseq)); ajDebug("Aligning %S and %S\n", ajSeqGetNameS(nseq), ajSeqGetNameS(pseq)); /* get copy of pseq string with no gaps */ ajStrAssignS(°apstr, ajSeqGetSeqS(pseq)); ajStrRemoveGap(°apstr); /* ** for each translation frame look for subset of pep that ** matches pseq */ for(frame = 1; frame <4; frame++) { ajDebug("trying frame %d\n", frame); pep = ajTrnSeqOrig(trnTable, nseq, frame); degapstr2 = ajStrNew(); ajStrAssignRef(°apstr2, degapstr); pos = ajStrFindCaseS(ajSeqGetSeqS(pep), degapstr); /* ** we might have a START codon that should be translated as 'M' ** we need to check if there is a match after a possible START ** codon */ if(pos == -1 && ajStrGetLen(degapstr) > 1 && (ajStrGetPtr(degapstr)[0] == 'M' || ajStrGetPtr(degapstr)[0] == 'm')) { /* see if pep minus the first character is a match */ ajStrCutStart(°apstr2, 1); pos = ajStrFindCaseS(ajSeqGetSeqS(pep), degapstr2); /* ** pos is >= 1 if we have a match that is after the first ** residue */ if(pos >= 1) { /* point back at the putative START Methionine */ pos--; /* test if first codon is a START */ codon = ajStrNew(); ajStrAssignSubS(&codon, ajSeqGetSeqS(nseq), (pos*3)+frame-1, (pos*3)+frame+2); type = ajTrnCodonstrTypeS(trnTable, codon, &aa); if(type != 1) { /* first codon is not a valid START, force a mismatch */ pos = -1; } ajStrDel(&codon); } else { /* force 'pos == 0' to be treated as a mismatch */ pos = -1; } } ajStrDel(°apstr2); ajSeqDel(&pep); if(pos != -1) break; } if(pos == -1) ajErr("Guide protein sequence %S not found in nucleic sequence %S", ajSeqGetNameS(pseq), ajSeqGetNameS(nseq)); else { ajDebug("got a match with frame=%d\n", frame); /* extract the coding region of nseq with gaps */ newseq = ajSeqNew(); ajSeqSetNuc(newseq); ajSeqAssignNameS(newseq, ajSeqGetNameS(nseq)); ajSeqAssignDescS(newseq, ajSeqGetDescS(nseq)); tranalign_AddGaps(newseq, nseq, pseq, (pos*3)+frame-1); /* output the gapped nucleic sequence */ ajSeqsetApp(outseqset, newseq); ajSeqDel(&newseq); } ajStrRemoveWhiteExcess(°apstr); } ajSeqoutWriteSet(seqout, outseqset); ajSeqoutClose(seqout); ajTrnDel(&trnTable); ajSeqsetDel(&outseqset); ajStrDel(°apstr); ajStrDel(°apstr2); ajSeqallDel(&nucseq); ajSeqDel(&nseq); ajSeqoutDel(&seqout); ajSeqsetDel(&protseq); ajStrDel(&tablelist); embExit(); return 0; }
static AjPList silent_mismatch(const AjPStr sstr, AjPList relist, AjPStr* tailstr, const AjPStr sname, ajint RStotal, ajint begin, ajint radj,AjBool rev, ajint end, AjBool tshow) { PSilent res; AjPList results; AjPStr str; /*holds RS patterns*/ AjPStr tstr; AjBool dummy = ajFalse; /*need a bool for ajPatClassify*/ ajint mm; /*number of mismatches*/ ajint hits; /*number of pattern matches found*/ ajint i; ajint aw; ajint start; AjPList patlist = NULL; /*a list for pattern matches. a list of ..*/ EmbPMatMatch match; /*..AjMatMatch structures*/ PRinfo rlp = NULL; AjPStr pep = NULL; /*string to hold protein*/ AjPTrn table = NULL; /*object to hold translation table*/ str = ajStrNew(); tstr = ajStrNew(); pep = ajStrNew(); table = ajTrnNewI(0); /*0 stands for standard DNA- see fuzztran.acd*/ results = ajListNew(); start = 1; if(!rev) /* forward strand */ { ajTrnSeqFrameS(table,sstr,1,&pep); /*1 stands for frame number*/ if(tshow) { silent_fmt_sequence("TRANSLATED SEQUENCE", pep,tailstr,start,ajFalse); } } else /* reverse strand */ { ajStrAssignC(&tstr,ajStrGetPtr(sstr)+(end-begin+1)%3); ajTrnSeqFrameS(table,tstr,1,&pep); if(tshow) { silent_fmt_sequence("REVERSE TRANSLATED SEQUENCE", pep,tailstr,start,ajFalse); } } for(aw=0;aw<RStotal;aw++) /* loop to read in restriction enzyme patterns */ { /* Pop off the first RE */ ajListPop(relist,(void **)&rlp); /* ignore unknown cut sites and zero cutters */ if(*ajStrGetPtr(rlp->site)=='?'||!rlp->ncuts) { ajListPushAppend(relist,(void*)rlp); continue; } ajStrFmtUpper(&rlp->site); /* convert all RS to upper case */ ajStrAssignS(&str,rlp->site); /* str now holds RS patterns */ patlist = ajListNew(); if(!embPatClassify(str,&str, &dummy,&dummy,&dummy,&dummy,&dummy,&dummy,0)) continue; mm = 0; hits=embPatBruteForce(sstr,str,ajFalse,ajFalse,patlist,begin+1,mm, sname); if(hits) while(ajListPop(patlist,(void**)&match)) embMatMatchDel(&match); else { mm = 1; hits = embPatBruteForce(sstr,str,ajFalse,ajFalse,patlist, begin+1,mm,sname); if(hits) for(i=0;i<hits;++i) { ajListPop(patlist,(void**)&match); res = silent_checktrans(sstr,match,rlp,begin,radj, rev,end); if (res) ajListPushAppend(results,(void *)res); embMatMatchDel(&match); } } ajListPushAppend(relist,(void *)rlp); ajListFree(&patlist); } ajStrDel(&str); ajStrDel(&tstr); ajStrDel(&pep); ajTrnDel(&table); return (results); }
static PSilent silent_checktrans(const AjPStr seq,const EmbPMatMatch match, const PRinfo rlp, ajint begin, ajint radj, AjBool rev, ajint end) { PSilent ret; const char *p = NULL; const char *q = NULL; const char *s = NULL; char *t; const char *u; ajint matchpos; ajint framep; ajint count; AjPTrn table = NULL; AjPStr s1 = NULL; AjPStr s2 = NULL; char c; char rc; ajint min = INT_MAX; /* Reverse sense intentional! */ ajint max = -INT_MAX; ajint fpos; ajint rpos; ajint x; AjPStr tstr = NULL; matchpos = match->start; fpos = matchpos; rpos=radj-fpos-match->len; tstr = ajStrNewS(seq); t = ajStrGetuniquePtr(&tstr); p = t+fpos-(begin+1); u = q = ajStrGetPtr(rlp->site); /* Test here for whether cut site is within sequence substring */ if(rlp->ncuts==4) { min = AJMIN(rlp->cut1,rlp->cut2); max = AJMAX(rlp->cut3,rlp->cut4); } else if(rlp->ncuts==2) { min = AJMIN(rlp->cut1,rlp->cut2); max = AJMAX(rlp->cut1,rlp->cut2); } else { ajWarn("Possibly corrupt RE file"); ajStrDel(&tstr); return NULL; } if(!rev) /* forward strand */ { if(matchpos+min<0||matchpos+max>end+1) { /*Cut site not in sequence range*/ ajStrDel(&tstr); return NULL; } } else /* reverse strand */ { if(radj-matchpos-1-min>end+1||radj-matchpos-1-max<begin) { /*Cut site not in sequence range*/ ajStrDel(&tstr); return NULL; } } count=0; while(ajBaseAlphaToBin(*q++) & ajBaseAlphaToBin(*p++)) ++count; /* Changed base postion */ x = fpos+count-(begin+1); /* Where the frame starts on the reverse strand */ framep = (end-begin+1)%3; c = t[x]; rc = u[count]; if(!rev) /* forward strand */ s = t+x-x%3; else /* reverse strand */ s = t+x-(x-framep)%3; table = ajTrnNewI(0); /* translates codon pointed to by s (original seq) */ s1 = ajStrNewK(ajTrnCodonC(table,s)); t[x] = rc; /* translates codon pointed to by s (mutated base from RS pattern */ s2 = ajStrNewK(ajTrnCodonC(table,s)); t[x] = c; /* changes mutated base in seq back to original base */ AJNEW(ret); ret->obase = c; ret->nbase = rc; ret->code = ajStrNewC(ajStrGetPtr(rlp->code)); ret->site = ajStrNewC(ajStrGetPtr(rlp->site)); ret->seqaa = ajStrNewC(ajStrGetPtr(s1)); ret->reaa = ajStrNewC(ajStrGetPtr(s2)); if(ajStrMatchS(s1,s2)) ret->issilent = ajTrue; else ret->issilent = ajFalse; if(!rev) { ret->match = matchpos; ret->base = matchpos+count; } else { ret->match = rpos; ret->base = rpos+match->len-1-count; } ajStrDel(&tstr); ajStrDel(&s1); ajStrDel(&s2); ajTrnDel(&table); return ret; }
AjPSeqout get_orf( AjPSeqout seqout, AjPSeqall seqall, AjPStr tablestr, ajuint minsize, ajuint maxsize, AjPStr findstr, AjBool methionine, AjBool circular, AjBool reverse, ajint around ) { ajint table; ajint find; AjPSeq seq = NULL; AjPTrn trnTable; AjPStr sseq = NULL; /* sequence string */ /* ORF number to append to name of sequence to create unique name */ ajint orf_no; AjBool sense; /* ajTrue = forward sense */ ajint len; /* initialise the translation table */ ajStrToInt(tablestr, &table); trnTable = ajTrnNewI(table); /* what sort of ORF are we looking for */ ajStrToInt(findstr, &find); /* ** get the minimum size converted to protein length if storing ** protein sequences */ if (find == P_STOP2STOP || find == P_START2STOP || find == AROUND_START) { minsize /= 3; maxsize /= 3; } while (ajSeqallNext(seqall, &seq)) { orf_no = 1; /* number of the next ORF */ sense = ajTrue; /* forward sense initially */ /* get the length of the sequence */ len = ajSeqGetLen(seq); /* ** if the sequence is circular, append it to itself to triple its ** length so can deal easily with wrapped ORFs, but don't update ** len */ if (circular) { ajStrAssignS(&sseq, ajSeqGetSeqS(seq)); ajStrAppendS(&sseq, ajSeqGetSeqS(seq)); ajStrAppendS(&sseq, ajSeqGetSeqS(seq)); ajSeqAssignSeqS(seq, sseq); } /* find the ORFs */ getorf_FindORFs(seq, len, trnTable, minsize, maxsize, seqout, sense, circular, find, &orf_no, methionine, around, &record); /* now reverse complement the sequence and do it again */ if (reverse) { sense = ajFalse; ajSeqReverseForce(seq); getorf_FindORFs(seq, len, trnTable, minsize, maxsize, seqout, sense, circular, find, &orf_no, methionine, around); } } ajTrnDel(&trnTable); ajSeqDel(&seq); ajStrDel(&sseq); }