/* Translate a nucleotide sequence into protein in a single reading frame.
 *
 * in         - nucleotide sequence, either text mode (in->seq) or digital
 *              mode (in->dsq with a DNA or RNA alphabet). When rcFlag is set
 *              it is reverse complemented in place for the duration of the
 *              translation and reverse complemented again before returning.
 * out        - receives a newly created protein sequence named
 *              "<in->name>_s<frameshift>", with "_rc" appended when rcFlag is
 *              set. Set to NULL on every failure. The caller owns the result.
 * frameshift - number of leading nucleotides skipped before the first codon.
 * rcFlag     - nonzero to translate the reverse complement.
 *
 * Translation walks the sequence three nucleotides at a time and stops at the
 * end of the sequence, when fewer than three nucleotides remain, or at the
 * first codon containing anything other than the four canonical nucleotides.
 *
 * Returns eslOK on success, eslFAIL when frameshift is outside [0, in->n),
 * and eslEMEM for every other failure (the historical catch-all code of this
 * function, kept so existing callers see the same values).
 */
int esl_trans_s2p(ESL_SQ *in, ESL_SQ **out, int frameshift, int rcFlag)
{
  /* Codon table, encoding taken from squid: A=0, C=1, G=2, U/T=3 and
   * index = 16*first + 4*second + third, so code[0] is AAA, code[1] is AAC,
   * code[4] is ACA, ... code[63] is UUU. The 20 amino acids plus '*' for stop.
   * The nucleotide indices match the easel alphabet indices, but the
   * translation itself has to be hard coded.
   */
  static const char code[64] = {
    'K','N','K','N','T','T','T','T','R','S','R','S',
    'I','I','M','I','Q','H','Q','H','P','P','P','P',
    'R','R','R','R','L','L','L','L','E','D','E','D',
    'A','A','A','A','G','G','G','G','V','V','V','V',
    '*','Y','*','Y','L','F','L','F','*','C','W','C',
    'L','F','L','F'
  };
  enum { DSQ_END = 255 };           /* value the digital loops treat as end-of-sequence */

  int           status;             /* assigned by ESL_ALLOC on allocation failure */
  int           n1, n2, n3;         /* the three nucleotide codes of the current codon */
  int           reversed = 0;       /* nonzero while `in` is held reverse complemented */
  char         *aaseq    = NULL;    /* protein sequence being built */
  char         *aaptr;              /* write cursor into aaseq */
  const char   *readseq;            /* read cursor into the text sequence */
  int           read_dg;            /* read index into the digital sequence */
  ESL_ALPHABET *abc      = NULL;    /* DNA alphabet, only needed for text mode */
  char          errbuf[256];        /* esl_abc_ValidateSeq demands this */
  char          namestring[256];

  (*out) = NULL;

  /* Nothing has been allocated yet, so a plain return cannot leak. A negative
   * frameshift would index in front of the sequence buffer. */
  if (frameshift < 0 || frameshift >= in->n) return eslFAIL;

  /* Make sure we really have a nucleotide sequence. */
  if (in->seq) {
    abc = esl_alphabet_Create(eslDNA);
    if (!abc) goto ERROR;
    if (esl_abc_ValidateSeq(abc, in->seq, in->n, errbuf) != eslOK) goto ERROR;
  } else if (in->dsq) {
    if (in->abc->type != eslRNA && in->abc->type != eslDNA) goto ERROR;
  } else {
    goto ERROR;
  }

  /* Apply the reverse complement; remember it so every exit path undoes it. */
  if (rcFlag) {
    if (esl_sq_ReverseComplement(in) != eslOK) goto ERROR;
    reversed = 1;
  }

  ESL_ALLOC(aaseq, (in->n + 1) * sizeof(char));
  aaptr = aaseq;

  if (in->seq) {
    /* Text mode: map each character through the DNA alphabet. */
    for (readseq = in->seq + frameshift;
         readseq[0] != '\0' && readseq[1] != '\0' && readseq[2] != '\0';
         readseq += 3)
      {
        n1 = abc->inmap[(int) readseq[0]];
        n2 = abc->inmap[(int) readseq[1]];
        n3 = abc->inmap[(int) readseq[2]];
        /* Check each position on its own: testing only the combined index
         * lets a code >= 4 in a low position alias a real codon
         * (e.g. 0,0,4 would land on index 4). */
        if (n1 < 0 || n1 > 3 || n2 < 0 || n2 > 3 || n3 < 0 || n3 > 3) break;
        *aaptr++ = code[n1 * 16 + n2 * 4 + n3];
      }
  } else {
    /* Digital mode: index 0 is a sentinel, so residues start at index 1. */
    for (read_dg = 1 + frameshift;
         in->dsq[read_dg]     != DSQ_END &&
         in->dsq[read_dg + 1] != DSQ_END &&
         in->dsq[read_dg + 2] != DSQ_END;
         read_dg += 3)
      {
        n1 = in->dsq[read_dg];
        n2 = in->dsq[read_dg + 1];
        n3 = in->dsq[read_dg + 2];
        if (n1 > 3 || n2 > 3 || n3 > 3) break;
        *aaptr++ = code[n1 * 16 + n2 * 4 + n3];
      }
  }
  *aaptr = '\0';

  /* Return the input to its original state before creating the output, so a
   * failure here cannot strand an already-created sequence. The flag is
   * cleared first so the error path does not attempt a second reversal. */
  if (reversed) {
    reversed = 0;
    if (esl_sq_ReverseComplement(in) != eslOK) goto ERROR;
  }

  /* Record the reading frame in the name; snprintf truncates instead of
   * overrunning the buffer when in->name is long. */
  snprintf(namestring, sizeof namestring, "%s_s%d%s",
           in->name, frameshift, rcFlag ? "_rc" : "");

  /* NOTE(review): in->ss belongs to the nucleotide sequence and is passed
   * through unchanged, as before - confirm esl_sq_CreateFrom accepts it for a
   * protein of a different length. */
  *out = esl_sq_CreateFrom(namestring, aaseq, in->desc, in->acc, in->ss);
  if (*out == NULL) goto ERROR;

  free(aaseq);
  if (abc) esl_alphabet_Destroy(abc);
  return eslOK;

 ERROR:
  /* Best effort: do not hand the caller back a reverse complemented input. */
  if (reversed) esl_sq_ReverseComplement(in);
  if (abc) esl_alphabet_Destroy(abc);
  free(aaseq);
  (*out) = NULL;
  return eslEMEM;
}
int esl_trans_seq_stop_split(ESL_SQ *in, ESL_SQ ***out, int *outCount) { int status; int x, y; //loop counters int nextSeqOut; //index of the next open location in the output sequence array int front; //front of the segment of sequence currently being read char* buff; //temporary home of output sequence before calling createFrom char name[256]; //workbench for building the name of each output sequence ESL_ALLOC(buff, (in->n+1) * sizeof(char)); *outCount = 1; if(in->seq) //text mode { //count how many sequences are present. minimum size is one non-stop residue x = 1; while(in->seq[x] != '\0') { if(in->seq[x] == '*' && in->seq[x-1] != '*') (*outCount)++; x++; } ESL_ALLOC(*out, sizeof(ESL_SQ*) * *outCount); x = front = 0; nextSeqOut = 0; //continue until the sequence front steps past the end of the list while(front < in->n) { //x is the location currently being read, current segment is from front to x x++; if(in->seq[x] == '\0' || in->seq[x] == '*') //if we see something that ends a segment { if(x - front > 0) //if there is at least one residue { //build name sprintf(name, "%s_%dto%d", in->name, front+1, x); //build temporary sequence string strncpy(buff, in->seq+front, x-front); buff[x-front] = '\0'; //load output array (*out)[nextSeqOut++] = esl_sq_CreateFrom(name, buff, in->desc, in->acc, in->ss); } //step the front to the beginning of the next sequence front = x+1; } } } else if(in->dsq) //digital mode { //start a little different because dsq has a sentinel in position 0 x = 2; while(in->dsq[x] != 255) //until the end sentinal, count sequences with at least one residue { if(in->abc->inmap[(int)'*'] == in->dsq[x] && in->abc->inmap[(int)'*'] != in->dsq[x-1]) (*outCount)++; x++; } ESL_ALLOC(*out, sizeof(ESL_SQ*) * *outCount); x = front = 1; nextSeqOut = 0; while(front < in->n+2) //as long as we have residues left { x++; if(in->dsq[x] == 255 || in->abc->inmap[(int)'*'] == in->dsq[x]) //if we see something that finishes a sequence { if(x - front > 0) //have at least 
one residue in the sequence { //build name sprintf(name, "%s_%dto%d", in->name, front, x-1); //build temporary sequence for(y = 0; y < x-front; y++) buff[y] = in->abc->sym[in->dsq[front+y]]; buff[x-front] = '\0'; //load output (*out)[nextSeqOut++] = esl_sq_CreateFrom(name, buff, in->desc, in->acc, in->ss); } front = x+1; } } } else { goto ERROR; } free(buff); return eslOK; ERROR: if(buff) free(buff); return eslFAIL; }
void run_hmmer_pipeline(const char* seq) { int index, i, status; ESL_SQ* sq = esl_sq_CreateFrom(NULL, seq, NULL, NULL, NULL); P7_OPROFILE *om = NULL; P7_PROFILE *gm = NULL; float usc, vfsc, fwdsc; /* filter scores */ float filtersc; /* HMM null filter score */ float nullsc; /* null model score */ float seqbias; float seq_score; /* the corrected per-seq bit score */ double P; WRAPPER_RESULT* result; num_results = 0; if(sq->n == 0) { esl_sq_Destroy(sq); return; } esl_sq_Digitize(abc, sq); int n = 0; float oasc; for(index = 0;index < num_models;index++) { om = models[index]; p7_omx_Reuse(oxf); p7_omx_Reuse(oxb); p7_omx_GrowTo(oxf, om->M, sq->n, sq->n); p7_omx_GrowTo(oxb, om->M, sq->n, sq->n); p7_oprofile_ReconfigLength(om, sq->n); p7_bg_SetFilter(bg, om->M, om->compo); p7_bg_SetLength(bg, sq->n); //Calibrate null model p7_bg_NullOne(bg, sq->dsq, sq->n, &nullsc); //MSV Filter p7_MSVFilter(sq->dsq, sq->n, om, oxf, &usc); seq_score = (usc - nullsc) / eslCONST_LOG2; P = esl_gumbel_surv(seq_score, om->evparam[p7_MMU], om->evparam[p7_MLAMBDA]); if (P > f1) continue; //Bias filter (model compo) p7_bg_FilterScore(bg, sq->dsq, sq->n, &filtersc); seq_score = (usc - filtersc) / eslCONST_LOG2; P = esl_gumbel_surv(seq_score, om->evparam[p7_MMU], om->evparam[p7_MLAMBDA]); if (P > f1) continue; //Viterbi filter (Only do if P value from Bias is high) if(P > f2) { p7_ViterbiFilter(sq->dsq, sq->n, om, oxf, &vfsc); seq_score = (vfsc - filtersc) / eslCONST_LOG2; P = esl_gumbel_surv(seq_score, om->evparam[p7_VMU], om->evparam[p7_VLAMBDA]); if (P > f2) continue; } //Get the real probability (forward) p7_Forward(sq->dsq, sq->n, om, oxf, &fwdsc); seq_score = (fwdsc - filtersc) / eslCONST_LOG2; P = esl_exp_surv(seq_score, om->evparam[p7_FTAU], om->evparam[p7_FLAMBDA]); if(hmmer_error) { fprintf(stderr, "HMM: %s, seq: %s", om->name, seq); hmmer_error = 0; continue; } if (P > f3) continue; //Real hit, go in to posterior decoding and alignment p7_omx_Reuse(oxb); p7_trace_Reuse(tr); 
p7_Backward(sq->dsq, sq->n, om, oxf, oxb, NULL); status = p7_Decoding(om, oxf, oxb, oxb); if(status == eslOK) { //And then trace the result p7_OptimalAccuracy(om, oxb, oxf, &oasc); p7_OATrace(om, oxb, oxf, tr); } else if(status == eslERANGE) { fprintf(stderr, "Decoding overflow on model %s\n", om->name); gm = gmodels[index]; if(gxf == NULL) { gxf = p7_gmx_Create(gm->M, sq->n); gxb = p7_gmx_Create(gm->M, sq->n); } else { p7_gmx_GrowTo(gxf, gm->M, sq->n); p7_gmx_GrowTo(gxb, gm->M, sq->n); } p7_ReconfigLength(gm, sq->n); p7_GForward (sq->dsq, sq->n, gm, gxf, &fwdsc); p7_GBackward(sq->dsq, sq->n, gm, gxb, NULL); p7_GDecoding(gm, gxf, gxb, gxb); p7_GOptimalAccuracy(gm, gxb, gxf, &oasc); p7_GOATrace (gm, gxb, gxf, tr); p7_gmx_Reuse(gxf); p7_gmx_Reuse(gxb); } if(hmmer_error) { fprintf(stderr, "HMM: %s, seq: %s", om->name, seq); hmmer_error = 0; continue; } result = wrapper_results[num_results]; reuse_result(result, tr->N + om->M, om->name); //We're way overallocating here, but it's hard to know at this point how much space we'll need for the alignment (plus leading and trailing gaps) trace_into(tr, result, sq, abc, om->M); result->bits = seq_score; num_results++; } esl_sq_Destroy(sq); }