Esempio n. 1
0
/* Function:  esl_trans_s2p()
 *
 * Purpose:   Translate the nucleotide sequence <in> into a protein sequence,
 *            reading codons starting <frameshift> residues into the sequence.
 *            If <rcFlag> is nonzero the reverse complement strand is
 *            translated; <in> is reverse complemented in place for the
 *            duration of the call and then restored.
 *
 *            Both text mode (<in->seq>) and digital mode (<in->dsq>) inputs
 *            are handled. Translation stops at the first codon that contains
 *            anything other than the four canonical nucleotides, or when
 *            fewer than three residues remain. Stop codons are emitted as '*'.
 *
 *            The new sequence is named "<name>_s<frameshift>" with "_rc"
 *            appended when <rcFlag> is set.
 *
 * Args:      in         - nucleotide sequence to translate
 *            out        - RETURN: newly created protein sequence; caller frees
 *            frameshift - 0-based offset of the first codon
 *            rcFlag     - nonzero to translate the reverse complement
 *
 * Returns:   <eslOK> on success.
 *            <eslFAIL> if <frameshift> is negative or not smaller than <in->n>.
 *            <eslEMEM> on any other failure (kept for compatibility with
 *            existing callers). <*out> is NULL on every failure.
 */
int esl_trans_s2p(ESL_SQ *in, ESL_SQ **out, int frameshift, int rcFlag)
{
  // Codon table in squid order: A=0, C=1, G=2, U/T=3 and
  // index = 16*first + 4*second + third, so code[0] is AAA, code[1] is AAC,
  // code[4] is ACA ... code[63] is UUU. '*' marks a stop codon.
  static const char code[] = {'K','N','K','N','T','T','T','T','R','S','R','S',
                              'I','I','M','I','Q','H','Q','H','P','P','P','P',
                              'R','R','R','R','L','L','L','L','E','D','E','D',
                              'A','A','A','A','G','G','G','G','V','V','V','V',
                              '*','Y','*','Y','L','F','L','F','*','C','W','C',
                              'L','F','L','F'};

  int status;                 //assigned by ESL_ALLOC when an allocation fails

  int n1, n2, n3;             //alphabet indices of the three codon positions
  int reversed = 0;           //nonzero while <in> is held reverse complemented
  char *aaseq = NULL;         //hold the protein sequence to be output
  char *aaptr;                //pointer records progress in writing to output
  const char *readseq;        //pointer records progress in reading nucleotide text
  int read_dg;                //index into digital sequence
  char *namestring = NULL;    //name of the output sequence
  size_t namesize;

  ESL_ALPHABET *abc = NULL;   //DNA alphabet, only needed for text mode lookups
  char errbuf[256];           //esl_abc_ValidateSeq wants somewhere to write

  (*out) = NULL;

  //nothing has been allocated yet, so returning directly is safe
  if(frameshift < 0 || frameshift >= in->n) return eslFAIL;

  //make sure we have a nucleotide sequence
  if(in->seq)
  {
    //a plain text sequence carries no alphabet, so build one for validation and lookup
    abc = esl_alphabet_Create(eslDNA);
    if(!abc) goto ERROR;
    if(eslOK != esl_abc_ValidateSeq(abc, in->seq, in->n, errbuf)) goto ERROR;
  }
  else if(in->dsq)
  {
    if(in->abc->type != eslRNA && in->abc->type != eslDNA) goto ERROR;
  }
  else
  {
    goto ERROR;
  }

  //apply the reverse complement
  if(rcFlag)
  {
    if(esl_sq_ReverseComplement(in) != eslOK) goto ERROR;
    reversed = 1;
  }

  ESL_ALLOC(aaseq, (in->n+1) * sizeof(char));
  aaptr = aaseq;

  if(in->seq) //text sequence
  {
    //as long as there are at least 3 nucleotides left, pull and translate another codon
    for(readseq = in->seq+frameshift;
        readseq[0] != '\0' && readseq[1] != '\0' && readseq[2] != '\0';
        readseq += 3)
    {
      n1 = abc->inmap[(int)readseq[0]];
      n2 = abc->inmap[(int)readseq[1]];
      n3 = abc->inmap[(int)readseq[2]];

      //test every position on its own: a combined index <= 63 does not prove
      //that all three residues are canonical (e.g. 0*16 + 0*4 + 15 is 15)
      if(n1 < 0 || n1 > 3 || n2 < 0 || n2 > 3 || n3 < 0 || n3 > 3) break;

      *aaptr++ = code[n1 * 16 + n2 * 4 + n3];
    }
  }
  else //digital sequence; the validation above guarantees in->dsq is set
  {
    //add one because digital index 0 is a sentinel; 255 is the sentinel byte value
    for(read_dg = 1+frameshift;
        in->dsq[read_dg] != 255 && in->dsq[read_dg+1] != 255 && in->dsq[read_dg+2] != 255;
        read_dg += 3)
    {
      n1 = in->dsq[read_dg];
      n2 = in->dsq[read_dg+1];
      n3 = in->dsq[read_dg+2];
      if(n1 > 3 || n2 > 3 || n3 > 3) break;

      *aaptr++ = code[n1 * 16 + n2 * 4 + n3];
    }
  }
  *aaptr = '\0';

  //return the input to its original state before building the output, so a
  //failure here cannot leave a finished result behind that nobody owns
  if(reversed)
  {
    reversed = 0; //whatever happens, do not attempt a second restore during cleanup
    if(esl_sq_ReverseComplement(in) != eslOK) goto ERROR;
  }

  //modify name to record any reading frame adjustments;
  //32 spare bytes cover "_s", the decimal frameshift, "_rc" and the terminator
  namesize = strlen(in->name) + 32;
  ESL_ALLOC(namestring, namesize);
  snprintf(namestring, namesize, "%s_s%d%s", in->name, frameshift, rcFlag ? "_rc" : "");

  //NOTE(review): in->ss describes the nucleotide sequence, not the protein;
  //confirm that forwarding it to the translated sequence is really intended
  *out = esl_sq_CreateFrom(namestring, aaseq, in->desc, in->acc, in->ss);
  if(*out == NULL) goto ERROR;

  free(namestring);
  free(aaseq);
  if(abc) esl_alphabet_Destroy(abc);
  return eslOK;

  ERROR:

  //best effort: do not hand the caller back a flipped input
  if(reversed) esl_sq_ReverseComplement(in);
  if(abc) esl_alphabet_Destroy(abc);
  free(namestring);
  free(aaseq);
  (*out) = NULL;

  return eslEMEM;
}
Esempio n. 2
0
/* Function:  esl_trans_seq_stop_split()
 *
 * Purpose:   Split the sequence <in> at every stop symbol ('*') and create
 *            one new text mode sequence for each non-empty run of residues
 *            between stops. Runs of consecutive stops, and stops at either
 *            end of the sequence, produce no empty sequences.
 *
 *            Each piece is named "<name>_<from>to<to>", where <from> and <to>
 *            are the 1-based coordinates of its first and last residue in <in>.
 *            Both text mode (<in->seq>) and digital mode (<in->dsq>) inputs
 *            are handled.
 *
 * Args:      in       - sequence to split
 *            out      - RETURN: array of newly created sequences; the caller
 *                       frees the sequences and the array
 *            outCount - RETURN: number of sequences stored in <*out>
 *
 * Returns:   <eslOK> on success; <*outCount> may be 0 when <in> holds no
 *            residues other than stops, in which case <*out> is still an
 *            allocated (empty) array.
 *            <eslFAIL> on failure. <*out> is then either NULL or an array
 *            whose first <*outCount> entries are valid sequences.
 */
int esl_trans_seq_stop_split(ESL_SQ *in, ESL_SQ ***out, int *outCount)
{
  int status;          //assigned by ESL_ALLOC when an allocation fails

  int x, y;            //loop counters
  int nextSeqOut = 0;  //index of the next open location in the output sequence array
  int front;           //front of the segment of sequence currently being read
  int segments = 0;    //number of non-empty segments found by the counting pass
  int stopCode;        //digital code of '*' in the input alphabet

  char *buff = NULL;   //temporary home of output sequence before calling createFrom
  char *name = NULL;   //workbench for building the name of each output sequence
  size_t nameSize;

  *out = NULL;
  *outCount = 0;

  ESL_ALLOC(buff, (in->n+1) * sizeof(char));

  //32 spare bytes cover "_", two decimal coordinates, "to" and the terminator
  nameSize = strlen(in->name) + 32;
  ESL_ALLOC(name, nameSize);

  if(in->seq) //text mode
  {
    //a segment starts at every residue that is not a stop and is either the
    //first residue or directly follows a stop
    for(x = 0; in->seq[x] != '\0'; x++)
    {
      if(in->seq[x] != '*' && (x == 0 || in->seq[x-1] == '*')) segments++;
    }

    //always allocate at least one slot so the caller can free *out unconditionally
    ESL_ALLOC(*out, sizeof(ESL_SQ*) * (segments > 0 ? segments : 1));

    front = 0;
    while(in->seq[front] != '\0')
    {
      if(in->seq[front] == '*') { front++; continue; } //skip stops between segments

      //advance x to the stop or terminator that closes the segment [front, x)
      for(x = front; in->seq[x] != '\0' && in->seq[x] != '*'; x++) ;

      //build name
      snprintf(name, nameSize, "%s_%dto%d", in->name, front+1, x);

      //build temporary sequence string
      memcpy(buff, in->seq+front, x-front);
      buff[x-front] = '\0';

      //load output array
      (*out)[nextSeqOut] = esl_sq_CreateFrom(name, buff, in->desc, in->acc, in->ss);
      if((*out)[nextSeqOut] == NULL) goto ERROR;
      nextSeqOut++;

      front = x;
    }
  }
  else if(in->dsq) //digital mode
  {
    stopCode = in->abc->inmap[(int)'*'];

    //dsq has a sentinel in position 0, residues start at 1, and the byte
    //value 255 closes the sequence
    for(x = 1; in->dsq[x] != 255; x++)
    {
      if(in->dsq[x] != stopCode && (x == 1 || in->dsq[x-1] == stopCode)) segments++;
    }

    ESL_ALLOC(*out, sizeof(ESL_SQ*) * (segments > 0 ? segments : 1));

    front = 1;
    while(in->dsq[front] != 255)
    {
      if(in->dsq[front] == stopCode) { front++; continue; }

      for(x = front; in->dsq[x] != 255 && in->dsq[x] != stopCode; x++) ;

      //build name
      snprintf(name, nameSize, "%s_%dto%d", in->name, front, x-1);

      //build temporary sequence by mapping each digital residue back to its symbol
      for(y = 0; y < x-front; y++) buff[y] = in->abc->sym[in->dsq[front+y]];
      buff[x-front] = '\0';

      //load output
      (*out)[nextSeqOut] = esl_sq_CreateFrom(name, buff, in->desc, in->acc, in->ss);
      if((*out)[nextSeqOut] == NULL) goto ERROR;
      nextSeqOut++;

      front = x;
    }
  }
  else
  {
    goto ERROR;
  }

  //report what was actually stored, not an estimate
  *outCount = nextSeqOut;

  free(name);
  free(buff);

  return eslOK;

  ERROR:

  //anything already built stays reachable through *out and *outCount
  *outCount = nextSeqOut;

  free(name);
  free(buff);

  return eslFAIL;
}
Esempio n. 3
0
/*
 * Score one text sequence against every loaded profile and record the hits.
 *
 * The sequence is digitized with the file-level alphabet `abc` and then run,
 * for each of the `num_models` entries of `models`, through a cascade of
 * filters: MSV, bias (composition), Viterbi and Forward. A model is dropped as
 * soon as a P-value exceeds its threshold (`f1`, `f2` or `f3`). Survivors go
 * through Backward, posterior decoding and an optimal accuracy trace; if
 * decoding reports eslERANGE the generic (non-optimized) routines are used on
 * the matching entry of `gmodels` instead.
 *
 * Results are written to `wrapper_results`, and `num_results` is reset to 0
 * on entry and incremented once per recorded hit. The function returns with
 * `num_results == 0` when the sequence is empty or cannot be created or
 * digitized.
 *
 * NOTE(review): `wrapper_results[num_results]` is indexed without a bound
 * check here; confirm the array holds at least `num_models` entries.
 */
void run_hmmer_pipeline(const char* seq) {
  int index, status;
  ESL_SQ* sq = esl_sq_CreateFrom(NULL, seq, NULL, NULL, NULL);
  P7_OPROFILE *om = NULL;
  P7_PROFILE *gm = NULL;
  float usc, vfsc, fwdsc;   /* filter scores                           */
  float filtersc;           /* HMM null filter score                   */
  float nullsc;             /* null model score                        */
  float seq_score;          /* the corrected per-seq bit score         */
  float oasc;               /* optimal accuracy score (not reported)   */
  double P;
  WRAPPER_RESULT* result;

  num_results = 0;

  /* Creation can fail; there is nothing to score and nothing to free. */
  if(sq == NULL) {
    return;
  }
  if(sq->n == 0) {
    esl_sq_Destroy(sq);
    return;
  }

  /* The return code is deliberately not treated as fatal; only a missing
   * digital sequence makes the steps below impossible. */
  esl_sq_Digitize(abc, sq);
  if(sq->dsq == NULL) {
    esl_sq_Destroy(sq);
    return;
  }

  for(index = 0;index < num_models;index++) {
    om = models[index];

    p7_omx_Reuse(oxf);
    p7_omx_Reuse(oxb);

    p7_omx_GrowTo(oxf, om->M, sq->n, sq->n);
    p7_omx_GrowTo(oxb, om->M, sq->n, sq->n);

    p7_oprofile_ReconfigLength(om, sq->n);

    p7_bg_SetFilter(bg, om->M, om->compo);
    p7_bg_SetLength(bg, sq->n);

    //Calibrate null model
    p7_bg_NullOne(bg, sq->dsq, sq->n, &nullsc);

    //MSV Filter
    p7_MSVFilter(sq->dsq, sq->n, om, oxf, &usc);
    seq_score = (usc - nullsc) / eslCONST_LOG2;
    P = esl_gumbel_surv(seq_score,  om->evparam[p7_MMU],  om->evparam[p7_MLAMBDA]);
    if (P > f1) continue;

    //Bias filter (model compo)
    p7_bg_FilterScore(bg, sq->dsq, sq->n, &filtersc);
    seq_score = (usc - filtersc) / eslCONST_LOG2;
    P = esl_gumbel_surv(seq_score,  om->evparam[p7_MMU],  om->evparam[p7_MLAMBDA]);
    if (P > f1) continue;

    //Viterbi filter (Only do if P value from Bias is high)
    if(P > f2) {
      p7_ViterbiFilter(sq->dsq, sq->n, om, oxf, &vfsc);
      seq_score = (vfsc - filtersc) / eslCONST_LOG2;
      P = esl_gumbel_surv(seq_score,  om->evparam[p7_VMU],  om->evparam[p7_VLAMBDA]);
      if (P > f2) continue;
    }

    //Get the real probability (forward)
    p7_Forward(sq->dsq, sq->n, om, oxf, &fwdsc);
    seq_score = (fwdsc - filtersc) / eslCONST_LOG2;
    P = esl_exp_surv(seq_score,  om->evparam[p7_FTAU],  om->evparam[p7_FLAMBDA]);
    //hmmer_error is a flag defined outside this function; clear it and skip the model
    if(hmmer_error) {
      fprintf(stderr, "HMM: %s, seq: %s", om->name, seq);
      hmmer_error = 0;
      continue;
    }
    if (P > f3) continue;

    //Real hit, go in to posterior decoding and alignment
    p7_omx_Reuse(oxb);
    p7_trace_Reuse(tr);

    p7_Backward(sq->dsq, sq->n, om, oxf, oxb, NULL);

    //Decoding overwrites oxb in place
    status = p7_Decoding(om, oxf, oxb, oxb);

    if(status == eslOK) {
      //And then trace the result
      p7_OptimalAccuracy(om, oxb, oxf, &oasc);
      p7_OATrace(om, oxb, oxf, tr);
    } else if(status == eslERANGE) {
      //Fall back to the generic implementation, creating its matrices on first use
      fprintf(stderr, "Decoding overflow on model %s\n", om->name);
      gm = gmodels[index];
      if(gxf == NULL) {
        gxf = p7_gmx_Create(gm->M, sq->n);
        gxb = p7_gmx_Create(gm->M, sq->n);
      } else {
        p7_gmx_GrowTo(gxf, gm->M, sq->n);
        p7_gmx_GrowTo(gxb, gm->M, sq->n);
      }

      p7_ReconfigLength(gm, sq->n);

      p7_GForward (sq->dsq, sq->n, gm, gxf, &fwdsc);
      p7_GBackward(sq->dsq, sq->n, gm, gxb, NULL);

      p7_GDecoding(gm, gxf, gxb, gxb);
      p7_GOptimalAccuracy(gm, gxb, gxf, &oasc);
      p7_GOATrace        (gm, gxb, gxf, tr);

      p7_gmx_Reuse(gxf);
      p7_gmx_Reuse(gxb);
    }
    //NOTE(review): any other decoding status falls through with an empty trace; confirm that is intended

    if(hmmer_error) {
      fprintf(stderr, "HMM: %s, seq: %s", om->name, seq);
      hmmer_error = 0;
      continue;
    }

    result = wrapper_results[num_results];
    reuse_result(result, tr->N + om->M, om->name); //We're way overallocating here, but it's hard to know at this point how much space we'll need for the alignment (plus leading and trailing gaps)
    trace_into(tr, result, sq, abc, om->M);
    result->bits = seq_score;
    num_results++;
  }

  esl_sq_Destroy(sq);
}