/* @funcstatic seqwords_keysearch ******************************************** ** ** Search swissprot with terms structure and writes a hitlist structure ** ** @param [r] inf [AjPFile] File pointer to swissprot database ** @param [r] terms [AjPTerms] Terms object pointer ** @param [w] hits [EmbPHitlist*] Hitlist object pointer ** ** @return [AjBool] True on success ** @@ ******************************************************************************/ static AjBool seqwords_keysearch(AjPFile inf, AjPTerms terms, EmbPHitlist *hits) { AjPStr line =NULL; /* Line of text. */ AjPStr id =NULL; /* Line of text. */ AjPStr temp =NULL; ajint s =0; /* Temp. start of hit value. */ ajint e =0; /* Temp. end of hit value. */ AjPInt start =NULL; /* Array of start of hit(s). */ AjPInt end =NULL; /* Array of end of hit(s). */ ajint nhits =0; /* Number of hits. */ ajint x =0; AjBool foundkw =ajFalse; AjBool foundft =ajFalse; /* Check for valid args. */ if(!inf) return ajFalse; /* Allocate strings and arrays. */ line = ajStrNew(); id = ajStrNew(); start = ajIntNew(); end = ajIntNew(); /* Start of main loop. */ while((ajReadlineTrim(inf,&line))) { /* Parse the AC line. */ if(ajStrPrefixC(line,"AC")) { /* Copy accesion number and remove the ';' from the end. */ ajFmtScanS(line, "%*s %S", &id); ajStrExchangeCC(&id, ";", "\0"); /* Reset flags & no. hits. */ foundkw=ajFalse; foundft=ajFalse; nhits=0; } /* Search the description and keyword lines with search terms. */ else if((ajStrPrefixC(line,"DE") || (ajStrPrefixC(line,"KW")))) { /* ** Search terms have a leading and trailing space to prevent ** them being found as substrings within other words. To ** catch cases where a DE or KW line begins with a search ** term, we must add a leading and trailing space to line. ** We must first remove punctation from the line to be parsed. */ ajStrExchangeSetCC(&line, ".,;:", " "); ajStrAppendK(&line, ' '); ajStrInsertC(&line, 0, " "); for (x = 0; x < terms->N; x++) /* Search term is found. */ if((ajStrFindCaseS(line, terms->Keywords[x])!=-1)) { foundkw=ajTrue; break; } } /* Search the feature table line with search terms. */ else if((ajStrPrefixC(line,"FT DOMAIN"))) { /* ** Search terms have a leading and trailing space to prevent ** them being found as substrings within other words. To ** catch cases where a FT line ends with a search ** term, we must add a trailing space to line ** We must first remove punctation from the line to be parsed. */ ajStrExchangeSetCC(&line, ".,;:", " "); ajStrAppendK(&line, ' '); for (x = 0; x < terms->N; x++) if((ajStrFindCaseS(line, terms->Keywords[x])!=-1)) { /* Search term is found. */ foundft = ajTrue; nhits++; /* Assign start and end of hit. */ ajFmtScanS(line, "%*s %*s %d %d", &s, &e); ajIntPut(&start, nhits-1, s); ajIntPut(&end, nhits-1, e); break; } } /* Parse the sequence. */ else if((ajStrPrefixC(line,"SQ") && ((foundkw == ajTrue) || (foundft == ajTrue)))) { /* Allocate memory for temp. sequence. */ temp = ajStrNew(); /* Read the sequence into hitlist structure. */ while((ajReadlineTrim(inf,&line)) && !ajStrPrefixC(line,"//")) /* Read sequence line into temp. */ ajStrAppendC(&temp,ajStrGetPtr(line)+3); /* Clean up temp. sequence. */ ajStrRemoveWhite(&temp); /*Priority is given to domain (rather than full length) sequence.*/ if(foundft) { for(x=0;x<nhits;x++) { /* Increment counter of hits for subsequent hits*/ (*hits)->N++; /* Reallocate memory for array of hits in hitlist structure. */ AJCRESIZE((*hits)->hits, (*hits)->N); (*hits)->hits[(*hits)->N-1]=embHitNew(); ajStrAssignC(&(*hits)->hits[(*hits)->N-1]->Model, "KEYWORD"); /* Assign start and end of hit. */ (*hits)->hits[(*hits)->N-1]->Start = ajIntGet(start, x); (*hits)->hits[(*hits)->N-1]->End = ajIntGet(end, x); /* Extract sequence within specified range */ ajStrAssignSubS(&(*hits)->hits[(*hits)->N - 1]->Seq, temp, (*hits)->hits[(*hits)->N - 1]->Start - 1, (*hits)->hits[(*hits)->N - 1]->End - 1); /* Put id into structure */ ajStrAssignRef(&(*hits)->hits[(*hits)->N - 1]->Acc, id); } } else { /* Increment counter of hits */ (*hits)->N++; /* Reallocate memory for array of hits in hitlist structure */ AJCRESIZE((*hits)->hits, (*hits)->N); (*hits)->hits[(*hits)->N-1]=embHitNew(); ajStrAssignC(&(*hits)->hits[(*hits)->N-1]->Model, "KEYWORD"); /* Extract whole sequence */ ajStrAssignRef(&(*hits)->hits[(*hits)->N - 1]->Seq, temp); (*hits)->hits[(*hits)->N - 1]->Start = 1; (*hits)->hits[(*hits)->N - 1]->End = ajStrGetLen((*hits)->hits[(*hits)->N - 1]->Seq); /* Put id into structure */ ajStrAssignRef(&(*hits)->hits[(*hits)->N - 1]->Acc, id); } /* Free temp. sequence */ ajStrDel(&temp); } } /* Clean up */ ajStrDel(&line); ajStrDel(&id); ajIntDel(&start); ajIntDel(&end); return ajTrue; }
int main(int argc, char **argv) { AjPSeqall nucseq; /* input nucleic sequences */ AjPSeqset protseq; /* input aligned protein sequences */ AjPSeqout seqout; AjPSeq nseq; /* next nucleic sequence to align */ const AjPSeq pseq; /* next protein sequence use in alignment */ AjPTrn trnTable; AjPSeq pep; /* translation of nseq */ AjPStr tablelist; ajint table; AjPSeqset outseqset; /* set of aligned nucleic sequences */ ajint proteinseqcount = 0; AjPStr degapstr = NULL; /* used to check if it matches with START removed */ AjPStr degapstr2 = NULL; AjPStr codon = NULL; /* holds temporary codon to check if is START */ char aa; /* translated putative START codon */ ajint type; /* returned type of the putative START codon */ /* start position of guide protein in translation */ ajlong pos = 0; AjPSeq newseq = NULL; /* output aligned nucleic sequence */ ajint frame; embInit("tranalign", argc, argv); nucseq = ajAcdGetSeqall("asequence"); protseq = ajAcdGetSeqset("bsequence"); tablelist = ajAcdGetListSingle("table"); seqout = ajAcdGetSeqoutset("outseq"); outseqset = ajSeqsetNew(); degapstr = ajStrNew(); /* initialise the translation table */ ajStrToInt(tablelist, &table); trnTable = ajTrnNewI(table); ajSeqsetFill(protseq); while(ajSeqallNext(nucseq, &nseq)) { if((pseq = ajSeqsetGetseqSeq(protseq, proteinseqcount++)) == NULL) ajErr("No guide protein sequence available for " "nucleic sequence %S", ajSeqGetNameS(nseq)); ajDebug("Aligning %S and %S\n", ajSeqGetNameS(nseq), ajSeqGetNameS(pseq)); /* get copy of pseq string with no gaps */ ajStrAssignS(°apstr, ajSeqGetSeqS(pseq)); ajStrRemoveGap(°apstr); /* ** for each translation frame look for subset of pep that ** matches pseq */ for(frame = 1; frame <4; frame++) { ajDebug("trying frame %d\n", frame); pep = ajTrnSeqOrig(trnTable, nseq, frame); degapstr2 = ajStrNew(); ajStrAssignRef(°apstr2, degapstr); pos = ajStrFindCaseS(ajSeqGetSeqS(pep), degapstr); /* ** we might have a START codon that should be translated as 'M' ** we need to check if there is a match after a possible START ** codon */ if(pos == -1 && ajStrGetLen(degapstr) > 1 && (ajStrGetPtr(degapstr)[0] == 'M' || ajStrGetPtr(degapstr)[0] == 'm')) { /* see if pep minus the first character is a match */ ajStrCutStart(°apstr2, 1); pos = ajStrFindCaseS(ajSeqGetSeqS(pep), degapstr2); /* ** pos is >= 1 if we have a match that is after the first ** residue */ if(pos >= 1) { /* point back at the putative START Methionine */ pos--; /* test if first codon is a START */ codon = ajStrNew(); ajStrAssignSubS(&codon, ajSeqGetSeqS(nseq), (pos*3)+frame-1, (pos*3)+frame+2); type = ajTrnCodonstrTypeS(trnTable, codon, &aa); if(type != 1) { /* first codon is not a valid START, force a mismatch */ pos = -1; } ajStrDel(&codon); } else { /* force 'pos == 0' to be treated as a mismatch */ pos = -1; } } ajStrDel(°apstr2); ajSeqDel(&pep); if(pos != -1) break; } if(pos == -1) ajErr("Guide protein sequence %S not found in nucleic sequence %S", ajSeqGetNameS(pseq), ajSeqGetNameS(nseq)); else { ajDebug("got a match with frame=%d\n", frame); /* extract the coding region of nseq with gaps */ newseq = ajSeqNew(); ajSeqSetNuc(newseq); ajSeqAssignNameS(newseq, ajSeqGetNameS(nseq)); ajSeqAssignDescS(newseq, ajSeqGetDescS(nseq)); tranalign_AddGaps(newseq, nseq, pseq, (pos*3)+frame-1); /* output the gapped nucleic sequence */ ajSeqsetApp(outseqset, newseq); ajSeqDel(&newseq); } ajStrRemoveWhiteExcess(°apstr); } ajSeqoutWriteSet(seqout, outseqset); ajSeqoutClose(seqout); ajTrnDel(&trnTable); ajSeqsetDel(&outseqset); ajStrDel(°apstr); ajStrDel(°apstr2); ajSeqallDel(&nucseq); ajSeqDel(&nseq); ajSeqoutDel(&seqout); ajSeqsetDel(&protseq); ajStrDel(&tablelist); embExit(); return 0; }