Exemplo n.º 1
0
/* Function:  esl_trans_s2p()
 *
 * Purpose:   Translate the nucleotide sequence <in> into a newly created
 *            protein sequence, returned through <*out>. Translation starts
 *            <frameshift> residues into the sequence and, when <rcFlag> is
 *            nonzero, is performed on the reverse complement. <in> may be
 *            in text mode (in->seq) or digital mode (in->dsq).
 *
 *            Translation stops at the first codon that is incomplete or
 *            that contains anything other than the four canonical
 *            nucleotides (gap, degenerate code, ...); whatever was
 *            translated up to that point is returned.
 *
 *            The output is named "<in->name>_s<frameshift>", with "_rc"
 *            appended when <rcFlag> is set.
 *
 *            <in> is reverse complemented in place while translating and
 *            is complemented back before returning, on the error path too
 *            (best effort).
 *
 * Args:      in         - nucleotide sequence to translate
 *            out        - RETURN: new protein sequence; NULL on any failure
 *            frameshift - number of leading residues to skip; 0 <= frameshift < in->n
 *            rcFlag     - nonzero to translate the reverse complement
 *
 * Returns:   <eslOK> on success; caller owns <*out>.
 *            <eslFAIL> if <frameshift> is negative or not smaller than in->n.
 *            <eslEMEM> on every other failure (invalid input, failed reverse
 *            complement, allocation failure). The single catch-all code is
 *            kept for compatibility with existing callers.
 */
int esl_trans_s2p(ESL_SQ *in, ESL_SQ **out, int frameshift, int rcFlag)
{
  // Codon table, encoding taken from squid: A=0, C=1, G=2, U/T=3;
  // index = 16*first + 4*second + third, so code[0] is AAA, code[1] is AAC,
  // code[4] is ACA ... code[63] is UUU. The 20 amino acids plus '*' for stop.
  // This relies on the alphabet mapping the four canonical nucleotides to 0..3.
  static const char code[] = {'K','N','K','N','T','T','T','T','R','S','R','S',
                              'I','I','M','I','Q','H','Q','H','P','P','P','P',
                              'R','R','R','R','L','L','L','L','E','D','E','D',
                              'A','A','A','A','G','G','G','G','V','V','V','V',
                              '*','Y','*','Y','L','F','L','F','*','C','W','C',
                              'L','F','L','F'};

  ESL_ALPHABET *abc        = NULL; //only needed to map text residues
  char         *aaseq      = NULL; //holds the protein sequence to be output
  char         *namestring = NULL; //name of the output sequence
  char         *aaptr;             //records progress in writing to aaseq
  char          errbuf[256];       //esl_abc_ValidateSeq demands this; contents unused
  size_t        namelen;
  int           n1, n2, n3;        //the three residue codes of the current codon
  int           reversed   = 0;    //nonzero while <in> is held reverse complemented
  int           status;            //assigned by ESL_ALLOC

  (*out) = NULL;

  //reject before anything is allocated so this early return cannot leak
  if(frameshift < 0 || frameshift >= in->n) return eslFAIL;

  //make sure we have a nucleotide sequence
  if(in->seq)
  {
    if((abc = esl_alphabet_Create(eslDNA)) == NULL) goto ERROR;
    if(eslOK != esl_abc_ValidateSeq(abc, in->seq, in->n, errbuf)) goto ERROR;
  }
  else if(in->dsq)
  {
    if(in->abc == NULL) goto ERROR;
    if(in->abc->type != eslRNA && in->abc->type != eslDNA) goto ERROR;
  }
  else
  {
    goto ERROR;
  }

  //apply the reverse complement
  if(rcFlag)
  {
    if(esl_sq_ReverseComplement(in) != eslOK) goto ERROR;
    reversed = 1;
  }

  //one char per nucleotide is more than enough room for the protein
  ESL_ALLOC(aaseq, (in->n+1) * sizeof(char));
  aaptr = aaseq;

  if(in->seq) //text sequence
  {
    const char *readseq = in->seq + frameshift;

    //as long as there are at least 3 nucleotides left, pull and translate another codon
    while(readseq[0] != '\0' && readseq[1] != '\0' && readseq[2] != '\0')
    {
      n1 = abc->inmap[(int) readseq[0]];
      n2 = abc->inmap[(int) readseq[1]];
      n3 = abc->inmap[(int) readseq[2]];

      //every position must be checked on its own: testing only the combined
      //index lets e.g. a gap in the third position pass as a valid codon
      if(n1 > 3 || n2 > 3 || n3 > 3) break;

      *aaptr++ = code[n1 * 16 + n2 * 4 + n3];
      readseq += 3;
    }
  }
  else //do it digitally
  {
    int64_t read_dg = 1 + (int64_t) frameshift; //add one because digital index 0 is a sentinel

    //255 marks the end of the digital sequence
    while(in->dsq[read_dg] != 255 && in->dsq[read_dg+1] != 255 && in->dsq[read_dg+2] != 255)
    {
      n1 = in->dsq[read_dg];
      n2 = in->dsq[read_dg+1];
      n3 = in->dsq[read_dg+2];
      if(n1 > 3 || n2 > 3 || n3 > 3) break;

      *aaptr++ = code[n1 * 16 + n2 * 4 + n3];
      read_dg += 3;
    }
  }
  *aaptr = '\0';

  //return the input to its original state before creating the output,
  //so a failure here leaves nothing to clean up but our own buffers
  if(reversed)
  {
    reversed = 0;
    if(esl_sq_ReverseComplement(in) != eslOK) goto ERROR;
  }

  //modify name to record any reading frame adjustments;
  //32 extra bytes cover "_s", the digits of an int, "_rc" and the terminator
  namelen = strlen(in->name) + 32;
  ESL_ALLOC(namestring, namelen);
  snprintf(namestring, namelen, "%s_s%d%s", in->name, frameshift, rcFlag ? "_rc" : "");

  *out = esl_sq_CreateFrom(namestring, aaseq, in->desc, in->acc, in->ss);
  if(*out == NULL) goto ERROR;

  free(namestring);
  free(aaseq);
  if(abc) esl_alphabet_Destroy(abc);
  return eslOK;

  ERROR:

  if(reversed) esl_sq_ReverseComplement(in); //best effort: undo the in-place complement
  if(abc) esl_alphabet_Destroy(abc);
  free(namestring);
  free(aaseq);
  (*out) = NULL;

  return eslEMEM;
}
/// Read a galosh profile and wrap its consensus sequence in a one-sequence MSA.
///
/// The profile is loaded into <*profile_ptr> via profile_ptr->fromFile( afp->bf->filename ).
/// The consensus is the maximumValueType() of the Match emission at every profile
/// position; it becomes the first-and-only sequence of the new MSA, named
/// "Galosh Profile Consensus", in an MSA named "Galosh Profile". The MSA is
/// digital when afp->abc is set (and eslAUGMENT_ALPHABET is defined), text otherwise.
///
/// \param afp          open MSA file; only afp->bf->filename, afp->abc, afp->errmsg and afp->linenumber are used here
/// \param ret_msa      optRETURN: the new MSA (caller frees); set to NULL on the ERROR path. May be NULL, in which case the MSA is built and destroyed.
/// \param profile_ptr  profile object to fill from the file; must not be NULL
///
/// \return eslOK on success; eslEMEM on allocation failure; eslEFORMAT if the
///         consensus fails alphabet validation; otherwise the status of the
///         failing Easel call. A NULL profile_ptr raises eslEINCONCEIVABLE.
///
/// \note Right now this isn't actually using the open file pointer; for convenience I just use the profile.fromFile( <filename> ) method.
/// \note (review) ProfileType is used as a dependent type below, so this definition presumably sits under a template header that is not visible here -- confirm.
/// \todo Use convenience fns in esl_buffer.h; see eg hmmer-3.1/easel/esl_msafile_stockholm.c for examples...
static int
profillic_esl_msafile_profile_Read(ESLX_MSAFILE *afp, ESL_MSA **ret_msa, ProfileType * profile_ptr )
{
  // Everything is declared before the first `goto ERROR` so that no jump
  // bypasses an initialization.
  ESL_MSA                 *msa      = NULL;
  string                   consensus_string;
  int                      seqidx;
  int                      status;
  char       errmsg2[eslERRBUFSIZE];
  const char * const seqname = "Galosh Profile Consensus";
  const char * const msaname = "Galosh Profile";
  uint32_t profile_length;
  uint32_t pos_i;
  galosh::Sequence<typename ProfileType::ProfileResidueType> consensus_sequence;
  stringstream tmp_consensus_output_stream;

  ESL_DASSERT1((afp->format == eslMSAFILE_PROFILLIC));

  if (profile_ptr == NULL)  { ESL_EXCEPTION(eslEINCONCEIVABLE, "profile_ptr is NULL in profillic_esl_msafile_profile_Read(..)!"); }
  afp->errmsg[0] = '\0';

  // Read in the galosh profile (from profillic), by filename rather than through afp's buffer.
  profile_ptr->fromFile( afp->bf->filename );
  /// \todo Seeking afp->bf->fp to the end here (to signal there are no more profiles in the file the next time we come to this function) did not work.  See HACKs in profillic-hmmbuild.cpp to work around it.

  // Calculate the consensus sequence.
  profile_length = profile_ptr->length();
  consensus_sequence.reinitialize( profile_length );
  for( pos_i = 0; pos_i < profile_length; pos_i++ ) {
    consensus_sequence[ pos_i ] =
      ( *profile_ptr )[ pos_i ][ galosh::Emission::Match ].maximumValueType();
  }
  tmp_consensus_output_stream << consensus_sequence;
  // Keep one copy of the text so the pointer handed to Easel below stays valid across calls.
  consensus_string = tmp_consensus_output_stream.str();

  /* Allocate a growable MSA, and auxiliary parse data coupled to the MSA allocation */
#ifdef eslAUGMENT_ALPHABET
  if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
#endif
  if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
  // Neither branch runs when afp->abc is set but digital support is compiled out.
  if (msa == NULL) { status = eslEINCONCEIVABLE; goto ERROR; }

  // Set first-and-only seq to the consensus.  This should set sqlen[0] to the profile's length and set ax to have length 1 and ax[0] to be the sequence itself.  Also msa->sqname[0] to the "name" of that consensus sequence.

  /* if nec, make room for the new seq */
  if (msa->nseq >= msa->sqalloc && (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
  seqidx = msa->nseq; // 0
  msa->nseq++; // = 1
  if ((status = esl_strdup(seqname, -1, &(msa->sqname[seqidx]))) != eslOK) goto ERROR;
  // NOTE: Could add description of this "sequence" here, using esl_msa_SetSeqDescription(msa, seqidx, desc).
#ifdef eslAUGMENT_ALPHABET
  if (msa->flags & eslMSA_DIGITAL)
    {
      // NOTE (profillic): There was a bug in this; it had said .."esl_abc_dsqcat(msa->abc, " where it should have said .."esl_abc_dsqcat(msa->abc->inmap, "
      if((status = esl_abc_dsqcat(msa->abc->inmap, &(msa->ax[seqidx]), &(msa->sqlen[seqidx]), consensus_string.c_str(), profile_length)) != eslOK) {
        /* invalid char(s), get informative error message */
        // The formatted message goes to the local errmsg2, because afp->errmsg is
        // itself one of the format arguments; afp->errmsg keeps whatever the
        // validation call wrote into it.
        if (esl_abc_ValidateSeq(msa->abc, consensus_string.c_str(), profile_length, afp->errmsg) != eslOK)
          ESL_XFAIL(eslEFORMAT, errmsg2, "%s (line %d): %s", msa->sqname[0], afp->linenumber, afp->errmsg);
        // The text validated, so the failure was something else: still fail, with its own status.
        goto ERROR;
      }
    }
#endif
  if (! (msa->flags & eslMSA_DIGITAL))
    {
      if ((status = esl_strcat(&(msa->aseq[seqidx]), 0, consensus_string.c_str(), profile_length)) != eslOK) goto ERROR;
      msa->sqlen[seqidx] = profile_length;
    }
  msa->alen = profile_length;

  /// \todo OR read in a fasta file of sequences too.
  /// \todo (Optional?) Set msa->name to the name of the profile (file?)
  if ((status = esl_strdup(msaname, -1, &(msa->name))) != eslOK) goto ERROR;
  /// \todo make sure eslMSA_HASWGTS is FALSE .. OR set it to TRUE and set msa->wgt[idx] to 1.0.
  /// \note Could have secondary structure (per sequence) too. msa->ss[0]. msa->sslen[0] should be the same as msa->sqlen[0].
  /// \todo Investigate what msa->sa and msa->pp are for.
  /// \todo Give the newly parsed MSA a going-over with verify_parse(msa, afp->errmsg), failing with eslEFORMAT; currently disabled.

  if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;

  if (ret_msa != NULL) *ret_msa = msa; else esl_msa_Destroy(msa);
  return eslOK;

 ERROR:
  if (msa != NULL)      esl_msa_Destroy(msa);
  if (ret_msa != NULL) *ret_msa = NULL;
  return status;
}