/* Function:  esl_dst_CJukesCantor()
 * Synopsis:  Jukes-Cantor distance for two aligned strings.
 * Incept:    SRE, Tue Apr 18 14:00:37 2006 [St. Louis]
 *
 * Purpose:   Calculate the generalized Jukes-Cantor distance between
 *            two aligned character strings <as1> and <as2>, in
 *            substitutions/site, for an alphabet of <K> residues
 *            (<K=4> for nucleic acid, <K=20> for proteins). The
 *            maximum likelihood estimate for the distance is
 *            optionally returned in <opt_distance>. The large-sample
 *            variance for the distance estimate is
 *            optionally returned in <opt_variance>.
 *            
 *            Alphabetic symbols <[a-zA-Z]> are compared
 *            case-insensitively to count the number of identities
 *            (<n1>) and mismatches (<n2>>). Any nonalphabetic
 *            character is assumed to be a gap symbol, and aligned
 *            columns containing gap symbols are ignored.  The
 *            fractional difference <D> used to calculate the
 *            Jukes/Cantor distance is <n2/n1+n2>.
 *            
 * Args:      K            - size of the alphabet (4 or 20)
 *            as1          - 1st aligned seq, 0..L-1, \0-terminated
 *            as2          - 2nd aligned seq, 0..L-1, \0-terminated 
 *            opt_distance - optRETURN: ML estimate of distance d
 *            opt_variance - optRETURN: large-sample variance of d
 *
 * Returns:   <eslOK> on success.
 * 
 *            Infinite distances are possible, in which case distance
 *            and variance are both <HUGE_VAL>. Caller has to deal
 *            with this case as it sees fit, perhaps by enforcing
 *            an arbitrary maximum distance.
 *
 * Throws:    <eslEINVAL> if the two strings aren't the same length (and
 *            thus can't have been properly aligned).
 *            <eslEDIVZERO> if no aligned residues were counted.
 *            On either failure, distance and variance are both returned
 *            as <HUGE_VAL>.
 */
int
esl_dst_CJukesCantor(int K, const char *as1, const char *as2, 
		     double *opt_distance, double *opt_variance)
{
  int     status;
  int     n1, n2;               /* number of observed identities, substitutions */
  int     i;                    /* position in aligned seqs   */

  /* 1. Count identities, mismatches.
   */
  n1 = n2 = 0;
  for (i = 0; as1[i] != '\0' && as2[i] != '\0'; i++) 
    {
      if (isalpha(as1[i]) && isalpha(as2[i]))
	{
	  if (toupper(as1[i]) == toupper(as2[i])) n1++; else n2++;
	}
    }
  if (as1[i] != '\0' || as2[i] != '\0') 
    ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
  
  return jukescantor(n1, n2, K, opt_distance, opt_variance); /* can throw eslEDIVZERO */

 ERROR:
  if (opt_distance != NULL)  *opt_distance = HUGE_VAL;
  if (opt_variance != NULL)  *opt_variance = HUGE_VAL;
  return status;
}
Exemple #2
0
/* Function: p7_Handmodelmaker()
 * 
 * Purpose:  Manual model construction.
 *           Construct an HMM from a digital alignment, where the
 *           <#=RF> line of the alignment file is used to indicate the
 *           columns assigned to matches vs. inserts.
 *           
 *           The <msa> must be in digital mode, and it must have
 *           a reference annotation line.
 *           
 *           NOTE: <p7_Handmodelmaker()> will slightly revise the
 *           alignment if necessary, if the assignment of columns
 *           implies DI and ID transitions.
 *           
 *           Returns both the HMM in counts form (ready for applying
 *           Dirichlet priors as the next step), and fake tracebacks
 *           for each aligned sequence. 
 *
 *           Models must have at least one node, so if the <msa> defined 
 *           no consensus columns, a <eslENORESULT> error is returned.
 *           
 * Args:     msa     - multiple sequence alignment
 *           bld       - holds information on regions requiring masking, optionally NULL -> no masking
 *           ret_hmm - RETURN: counts-form HMM
 *           opt_tr  - optRETURN: array of tracebacks for aseq's
 *           
 * Return:   <eslOK> on success. <ret_hmm> and <opt_tr> are allocated 
 *           here, and must be free'd by caller.
 *
 *           Returns <eslENORESULT> if no consensus columns were annotated;
 *           in this case, <ret_hmm> and <opt_tr> are returned NULL. 
 *           
 *           Returns <eslEFORMAT> if the <msa> doesn't have a reference
 *           annotation line.
 *           
 * Throws:   <eslEMEM> on allocation failure. Throws <eslEINVAL> if the <msa>
 *           isn't in digital mode.
 */            
int
p7_Handmodelmaker(ESL_MSA *msa, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***opt_tr)
{
  int        status;
  int       *matassign = NULL;    /* MAT state assignments if 1; 1..alen */
  int        apos;                /* counter for aligned columns         */

  if (! (msa->flags & eslMSA_DIGITAL)) ESL_XEXCEPTION(eslEINVAL, "need a digital msa");
  if (msa->rf == NULL)                 return eslEFORMAT;

  ESL_ALLOC(matassign, sizeof(int) * (msa->alen+1));
 
  /* Watch for off-by-one. rf is [0..alen-1]; matassign is [1..alen] */
  for (apos = 1; apos <= msa->alen; apos++)
    matassign[apos] = (esl_abc_CIsGap(msa->abc, msa->rf[apos-1])? FALSE : TRUE);

  /* matassign2hmm leaves ret_hmm, opt_tr in their proper state: */
  if ((status = matassign2hmm(msa, matassign, ret_hmm, opt_tr)) != eslOK) goto ERROR;

  free(matassign);
  return eslOK;

 ERROR:
  if (matassign != NULL) free(matassign);
  return status;
}
/* Function:  esl_dst_XPairId()
 * Synopsis:  Pairwise identity of two aligned digital seqs.
 * Incept:    SRE, Tue Apr 18 09:24:05 2006 [St. Louis]
 *
 * Purpose:   Digital version of <esl_dst_PairId()>: <adsq1> and
 *            <adsq2> are digitized aligned sequences, in alphabet
 *            <abc>. Otherwise, same as <esl_dst_PairId()>.
 *            
 * Args:      abc          - digital alphabet in use
 *            ax1          - aligned digital seq 1
 *            ax2          - aligned digital seq 2
 *            opt_pid      - optRETURN: pairwise identity, 0<=x<=1
 *            opt_nid      - optRETURN: # of identities
 *            opt_n        - optRETURN: denominator MIN(len1,len2)
 *
 * Returns:   <eslOK> on success. <opt_distance>, <opt_nid>, <opt_n>
 *            contain the answers, for any of these that were passed
 *            non-<NULL> pointers.
 *
 * Throws:    <eslEINVAL> if the strings are different lengths (not aligned).
 */
int
esl_dst_XPairId(const ESL_ALPHABET *abc, const ESL_DSQ *ax1, const ESL_DSQ *ax2, 
		double *opt_distance, int *opt_nid, int *opt_n)
{
  int     status;
  int     idents;               /* total identical positions  */
  int     len1, len2;           /* lengths of seqs            */
  int     i;                    /* position in aligned seqs   */

  idents = len1 = len2 = 0;
  for (i = 1; ax1[i] != eslDSQ_SENTINEL && ax2[i] != eslDSQ_SENTINEL; i++) 
    {
      if (esl_abc_XIsCanonical(abc, ax1[i])) len1++;
      if (esl_abc_XIsCanonical(abc, ax2[i])) len2++;

      if (esl_abc_XIsCanonical(abc, ax1[i]) && esl_abc_XIsCanonical(abc, ax2[i])
	  && ax1[i] == ax2[i])
	idents++;
    }
  if (len2 < len1) len1 = len2;

  if (ax1[i] != eslDSQ_SENTINEL || ax2[i] != eslDSQ_SENTINEL) 
    ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");

  if (opt_distance != NULL)  *opt_distance = ( len1==0 ? 0. : (double) idents / (double) len1 );
  if (opt_nid      != NULL)  *opt_nid      = idents;
  if (opt_n        != NULL)  *opt_n        = len1;
  return eslOK;

 ERROR:
  if (opt_distance != NULL)  *opt_distance = 0.;
  if (opt_nid      != NULL)  *opt_nid      = 0;
  if (opt_n        != NULL)  *opt_n        = 0;
  return status;
}
/* Function:  esl_dst_XJukesCantor()
 * Synopsis:  Jukes-Cantor distance for two aligned digitized seqs.
 * Incept:    SRE, Tue Apr 18 15:26:51 2006 [St. Louis]
 *
 * Purpose:   Calculate the generalized Jukes-Cantor distance between two
 *            aligned digital strings <ax> and <ay>, in substitutions/site, 
 *            using alphabet <abc> to evaluate identities and differences.
 *            The maximum likelihood estimate for the distance is optionally returned in
 *            <opt_distance>. The large-sample variance for the distance
 *            estimate is optionally returned in <opt_variance>.
 *            
 *            Identical to <esl_dst_CJukesCantor()>, except that it takes
 *            digital sequences instead of character strings.
 *
 * Args:      abc          - bioalphabet to use for comparisons
 *            ax           - 1st digital aligned seq
 *            ay           - 2nd digital aligned seq
 *            opt_distance - optRETURN: ML estimate of distance d
 *            opt_variance - optRETURN: large-sample variance of d
 *
 * Returns:   <eslOK> on success. As in <esl_dst_CJukesCantor()>, the
 *            distance and variance may be infinite, in which case they
 *            are returned as <HUGE_VAL>.
 *
 * Throws:    <eslEINVAL> if the two strings aren't the same length (and
 *            thus can't have been properly aligned).
 *            <eslEDIVZERO> if no aligned residues were counted.
 *            On either failure, the distance and variance are set
 *            to <HUGE_VAL>.
 */
int
esl_dst_XJukesCantor(const ESL_ALPHABET *abc, const ESL_DSQ *ax, const ESL_DSQ *ay, 
		     double *opt_distance, double *opt_variance)
{
  int     status;
  int     n1, n2;               /* number of observed identities, substitutions */
  int     i;                    /* position in aligned seqs   */

  n1 = n2 = 0;
  for (i = 1; ax[i] != eslDSQ_SENTINEL && ay[i] != eslDSQ_SENTINEL; i++) 
    {
      if (esl_abc_XIsCanonical(abc, ax[i]) && esl_abc_XIsCanonical(abc, ay[i]))
	{
	  if (ax[i] == ay[i]) n1++;
	  else                n2++;
	}
    }
  if (ax[i] != eslDSQ_SENTINEL || ay[i] != eslDSQ_SENTINEL) 
    ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
  
  return jukescantor(n1, n2, abc->K, opt_distance, opt_variance);

 ERROR:
  if (opt_distance != NULL)  *opt_distance = HUGE_VAL;
  if (opt_variance != NULL)  *opt_variance = HUGE_VAL;
  return status;
}
/* Function:  esl_dst_CPairId()
 * Synopsis:  Pairwise identity of two aligned text strings.
 * Incept:    SRE, Mon Apr 17 20:06:07 2006 [St. Louis]
 *
 * Purpose:   Calculates pairwise fractional identity between two
 *            aligned character strings <asq1> and <asq2>. 
 *            Return this distance in <opt_pid>; return the
 *            number of identities counted in <opt_nid>; and
 *            return the denominator <MIN(len1,len2)> in
 *            <opt_n>.
 *            
 *            Alphabetic symbols <[a-zA-Z]> are compared
 *            case-insensitively for identity. Any nonalphabetic
 *            character is assumed to be a gap symbol.
 *            
 *            This simple comparison rule is unaware of synonyms and
 *            degeneracies in biological alphabets.  For a more
 *            sophisticated and biosequence-aware comparison, use
 *            digitized sequences and the <esl_dst_XPairId()> function
 *            instead.
 *
 * Args:      asq1         - aligned character string 1
 *            asq2         - aligned character string 2
 *            opt_pid      - optRETURN: pairwise identity, 0<=x<=1
 *            opt_nid      - optRETURN: # of identities
 *            opt_n        - optRETURN: denominator MIN(len1,len2)
 *
 * Returns:   <eslOK> on success. <opt_pid>, <opt_nid>, <opt_n>
 *            contain the answers (for whichever were passed non-NULL). 
 *
 * Throws:    <eslEINVAL> if the strings are different lengths
 *            (not aligned).
 */
int
esl_dst_CPairId(const char *asq1, const char *asq2, 
		double *opt_pid, int *opt_nid, int *opt_n)
{
  int     status;
  int     idents;               /* total identical positions  */
  int     len1, len2;           /* lengths of seqs            */
  int     i;                    /* position in aligned seqs   */

  idents = len1 = len2 = 0;
  for (i = 0; asq1[i] != '\0' && asq2[i] != '\0'; i++) 
    {
      if (isalpha(asq1[i])) len1++;
      if (isalpha(asq2[i])) len2++;
      if (isalpha(asq1[i]) && isalpha(asq2[i])
	  && toupper(asq1[i]) == toupper(asq2[i])) 
	idents++;
    }
  if (asq1[i] != '\0' || asq2[i] != '\0') 
    ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");

  if (opt_pid  != NULL)  *opt_pid = ( len1==0 ? 0. : (double) idents / (double) ESL_MIN(len1,len2));
  if (opt_nid  != NULL)  *opt_nid = idents;
  if (opt_n    != NULL)  *opt_n   = len1;
  return eslOK;

 ERROR:
  if (opt_pid  != NULL)  *opt_pid = 0.;
  if (opt_nid  != NULL)  *opt_nid = 0;
  if (opt_n    != NULL)  *opt_n   = 0;
  return status;
}
/* Function:  esl_dst_XPairIdMx()
 * Synopsis:  NxN identity matrix for N aligned digital seqs.
 * Incept:    SRE, Thu Apr 27 09:08:11 2006 [New York]
 *
 * Purpose:   Given a digitized multiple sequence alignment <ax>, consisting
 *            of <N> aligned digital sequences in alphabet <abc>; calculate
 *            a symmetric pairwise fractional identity matrix by $N(N-1)/2$
 *            calls to <esl_dst_XPairId()>, and return it in <ret_S>.
 *            
 * Args:      abc   - digital alphabet in use
 *            ax    - aligned dsq's, [0..N-1][1..alen]                  
 *            N     - number of aligned sequences
 *            ret_S - RETURN: NxN matrix of fractional identities
 *
 * Returns:   <eslOK> on success, and <ret_S> contains the distance
 *            matrix. Caller is obligated to free <S> with 
 *            <esl_dmatrix_Destroy()>. 
 *
 * Throws:    <eslEINVAL> if a seq has a different
 *            length than others. On failure, <ret_S> is returned <NULL>
 *            and state of inputs is unchanged.
 */
int
esl_dst_XPairIdMx(const ESL_ALPHABET *abc,  ESL_DSQ **ax, int N, ESL_DMATRIX **ret_S)
{
  int status;
  ESL_DMATRIX *S = NULL;
  int i,j;

  if (( S = esl_dmatrix_Create(N,N) ) == NULL) goto ERROR;
  
  for (i = 0; i < N; i++)
    {
      S->mx[i][i] = 1.;
      for (j = i+1; j < N; j++)
	{
	  status = esl_dst_XPairId(abc, ax[i], ax[j], &(S->mx[i][j]), NULL, NULL);
	  if (status != eslOK)
	    ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j);
	  S->mx[j][i] =  S->mx[i][j];
	}
    }
  if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S);
  return eslOK;

 ERROR:
  if (S     != NULL)  esl_dmatrix_Destroy(S);
  if (ret_S != NULL) *ret_S = NULL;
  return status;
}
Exemple #7
0
/* Function:  p7_oprofile_MPIRecv()
 * Synopsis:  Receives an OPROFILE as a work unit from an MPI sender.
 * Incept:    MSF, Wed Oct 21, 2009 [Janelia]
 *
 * Purpose:   Receive a work unit that consists of a single OPROFILE
 *            sent by MPI <source> (<0..nproc-1>, or
 *            <MPI_ANY_SOURCE>) tagged as <tag> for MPI communicator <comm>.
 *            
 *            Work units are prefixed by a status code. If the unit's
 *            code is <eslOK> and no errors are encountered, this
 *            routine will return <eslOK> and a non-<NULL> <*ret_om>.
 *            If the unit's code is <eslEOD> (a shutdown signal), 
 *            this routine returns <eslEOD> and <*ret_om> is <NULL>.
 *   
 *            Caller provides a working buffer <*buf> of size
 *            <*nalloc> characters. These are passed by reference, so
 *            that <*buf> can be reallocated and <*nalloc> increased
 *            if necessary. As a special case, if <*buf> is <NULL> and
 *            <*nalloc> is 0, the buffer will be allocated
 *            appropriately, but the caller is still responsible for
 *            free'ing it.
 *            
 *            Caller may or may not already know what alphabet the OPROFILE
 *            is expected to be in.  A reference to the current
 *            alphabet is passed in <abc>. If the alphabet is unknown,
 *            pass <*abc = NULL>, and when the OPROFILE is received, an
 *            appropriate new alphabet object is allocated and passed
 *            back to the caller via <*abc>.  If the alphabet is
 *            already known, <*ret_abc> is that alphabet, and the new
 *            OPROFILE's alphabet type is verified to agree with it. This
 *            mechanism allows an application to let the first OPROFILE
 *            determine the alphabet type for the application, while
 *            still keeping the alphabet under the application's scope
 *            of control.
 *
 * Returns:   <eslOK> on success. <*ret_om> contains the received OPROFILE;
 *            it is allocated here, and the caller is responsible for
 *            free'ing it.  <*buf> may have been reallocated to a
 *            larger size, and <*nalloc> may have been increased.  If
 *            <*abc> was passed as <NULL>, it now points to an
 *            <ESL_ALPHABET> object that was allocated here; caller is
 *            responsible for free'ing this.
 *            
 *            Returns <eslEOD> if an end-of-data signal was received.
 *            In this case, <*buf>, <*nalloc>, and <*abc> are left unchanged,
 *            and <*ret_om> is <NULL>.
 *            
 *            Returns <eslEINCOMPAT> if the OPROFILE is in a different alphabet
 *            than <*abc> said to expect. In this case, <*abc> is unchanged,
 *            <*buf> and <*nalloc> may have been changed, and <*ret_om> is
 *            <NULL>.
 *            
 * Throws:    <eslEMEM> on allocation error, in which case <*ret_om> is 
 *            <NULL>.           
 */
int
p7_oprofile_MPIRecv(int source, int tag, MPI_Comm comm, char **buf, int *nalloc, ESL_ALPHABET **abc, P7_OPROFILE **ret_om)
{
  int         status;
  int         code;
  P7_OPROFILE     *om     = NULL;
  int         n;
  int         pos;
  MPI_Status  mpistatus;

  /* Probe first, because we need to know if our buffer is big enough. */
  MPI_Probe(source, tag, comm, &mpistatus);
  MPI_Get_count(&mpistatus, MPI_PACKED, &n);

  /* Make sure the buffer is allocated appropriately */
  if (*buf == NULL || n > *nalloc) {
    void *tmp;
    ESL_RALLOC(*buf, tmp, sizeof(char) * n);
    *nalloc = n; 
  }

  /* Receive the packed work unit */
  MPI_Recv(*buf, n, MPI_PACKED, source, tag, comm, &mpistatus);

  /* Unpack it, looking at the status code prefix for EOD/EOK  */
  pos = 0;
  if (MPI_Unpack(*buf, n, &pos, &code, 1, MPI_INT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (code == eslEOD)  { *ret_om = NULL;  return eslEOD; }

  return p7_oprofile_MPIUnpack(*buf, *nalloc, &pos, comm, abc, ret_om);

 ERROR:
  if (om != NULL) p7_oprofile_Destroy(om);
  return status;
}
/* Function:  esl_workqueue_Create()
 * Synopsis:  Create a work queue object.
 * Incept:    MSF, Thu Jun 18 11:51:39 2009
 *
 * Purpose:   Creates an <ESL_WORK_QUEUE> object of <size>.  The
 *            queues are used to handle objects <void *> that
 *            are ready to be processed and that have been
 *            processed by worker threads.
 *
 * Returns:   ptr to the new <ESL_WORK_QUEUE> object.
 * 
 * Throws:    <eslESYS> on allocation or initialization failure.
 */
ESL_WORK_QUEUE *
esl_workqueue_Create(int size)
{
  int             i;
  int             status;
  ESL_WORK_QUEUE *queue = NULL;

  ESL_ALLOC(queue, sizeof(ESL_WORK_QUEUE));

  queue->readerQueue     = NULL;
  queue->readerQueueCnt  = 0;
  queue->readerQueueHead = 0;

  queue->workerQueue     = NULL;
  queue->workerQueueCnt  = 0;
  queue->workerQueueHead = 0;

  queue->queueSize       = size;
  queue->pendingWorkers  = 0;

  if (pthread_mutex_init(&queue->queueMutex, NULL) != 0)     ESL_XEXCEPTION(eslESYS, "mutex init failed");

  if (pthread_cond_init(&queue->readerQueueCond, NULL) != 0) ESL_XEXCEPTION(eslESYS, "cond reader init failed");
  if (pthread_cond_init(&queue->workerQueueCond, NULL) != 0) ESL_XEXCEPTION(eslESYS, "cond worker init failed");

  ESL_ALLOC(queue->readerQueue, sizeof(void *) * size);
  ESL_ALLOC(queue->workerQueue, sizeof(void *) * size);

  for (i = 0; i < queue->queueSize; ++i)
    {
      queue->readerQueue[i] = NULL;
      queue->workerQueue[i] = NULL;
    }

  return queue;

 ERROR:
  esl_workqueue_Destroy(queue);
  return NULL;
}
Exemple #9
0
/* Function:  esl_workqueue_WorkerUpdate()
 * Synopsis:  Consumer routine.
 * Incept:    MSF, Thu Jun 18 11:51:39 2009
 *
 * Purpose:   The consumer (i.e. Worker) places an object that has
 *            been processed on the producers (i.e. Readers) queue.
 *
 *            If the <in> object is not null, it is placed on the
 *            readers queue.  If the reader is waiting for an object, 
 *            it is signaled it to wake up.
 *
 *            If the worker routine has supplied an <out> pointer,
 *            an object that is ready for processing by a worker,
 *            is placed in <out> so the worker thread can continue.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslESYS> if thread synchronization fails somewhere.
 *            <eslEINVAL> if something's wrong with <queue>.
 */
int esl_workqueue_WorkerUpdate(ESL_WORK_QUEUE *queue, void *in, void **out)
{
  int cnt;
  int inx;
  int queueSize;
  int status;

  if (queue == NULL)                                ESL_XEXCEPTION(eslEINVAL, "Invalid queue object");
  if (pthread_mutex_lock (&queue->queueMutex) != 0) ESL_XEXCEPTION(eslESYS,   "mutex lock failed");

  queueSize = queue->queueSize;

  /* check if the caller is queuing up an item */
  if (in != NULL)
    {

      /* check to make sure we don't overflow */
      if (queue->readerQueueCnt >= queueSize) ESL_XEXCEPTION(eslEINVAL, "Reader queue overflow");

      inx = (queue->readerQueueHead + queue->readerQueueCnt) % queueSize;
      queue->readerQueue[inx] = in;
      cnt = queue->readerQueueCnt++;
      if (cnt == 0)
	{
	  if (pthread_cond_signal (&queue->readerQueueCond) != 0) ESL_XEXCEPTION(eslESYS, "cond signal failed");
	}
    }

  /* check if the caller is waiting for a queued item */
  if (out != NULL)
    {

      if (queue->workerQueueCnt == 0)
	{
	  /* wait for a processed buffers to be returned */
	  ++queue->pendingWorkers;
	  while (queue->workerQueueCnt == 0)
	    {
	      if (pthread_cond_wait (&queue->workerQueueCond, &queue->queueMutex) != 0) ESL_XEXCEPTION(eslESYS, "cond wait failed");
	    }
	  --queue->pendingWorkers;
	}

      inx = queue->workerQueueHead;
      *out = queue->workerQueue[inx];
      queue->workerQueue[inx] = NULL;
      queue->workerQueueHead = (queue->workerQueueHead + 1) % queueSize;
      --queue->workerQueueCnt;
    }

  if (pthread_mutex_unlock (&queue->queueMutex) != 0) ESL_XEXCEPTION(eslESYS, "mutex unlock failed");
  return eslOK;

 ERROR:
  if (out) *out = NULL;
  return status;
}
Exemple #10
0
/* Function:  p7_SingleBuilder()
 * Synopsis:  Build a new HMM from a single sequence.
 *
 * Purpose:   Take the sequence <sq> and a build configuration <bld>, and
 *            build a new HMM.
 *            
 *            The single sequence scoring system in the <bld>
 *            configuration must have been previously initialized by
 *            <p7_builder_SetScoreSystem()>.
 *            
 * Args:      bld       - build configuration
 *            sq        - query sequence
 *            bg        - null model (needed to paramaterize insert emission probs)
 *            opt_hmm   - optRETURN: new HMM
 *            opt_gm    - optRETURN: profile corresponding to <hmm>
 *            opt_om    - optRETURN: optimized profile corresponding to <gm>
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation error.
 *            <eslEINVAL> if <bld> isn't properly configured somehow.
 */
int
p7_SingleBuilder(P7_BUILDER *bld, ESL_SQ *sq, P7_BG *bg, P7_HMM **opt_hmm,
		 P7_TRACE **opt_tr, P7_PROFILE **opt_gm, P7_OPROFILE **opt_om)
{
  P7_HMM   *hmm = NULL;
  P7_TRACE *tr  = NULL;
  int       k;
  int       status;
  
  bld->errbuf[0] = '\0';
  if (! bld->Q) ESL_XEXCEPTION(eslEINVAL, "score system not initialized");

  if ((status = p7_Seqmodel(bld->abc, sq->dsq, sq->n, sq->name, bld->Q, bg->f, bld->popen, bld->pextend, &hmm)) != eslOK) goto ERROR;
  if ((status = p7_hmm_SetComposition(hmm))                                                                     != eslOK) goto ERROR;
  if ((status = p7_hmm_SetConsensus(hmm, sq))                                                                   != eslOK) goto ERROR; 
  if ((status = calibrate(bld, hmm, bg, opt_gm, opt_om))                                                        != eslOK) goto ERROR;

  if ( bld->abc->type == eslDNA ||  bld->abc->type == eslRNA ) {
    if (bld->w_len > 0)           hmm->max_length = bld->w_len;
    else if (bld->w_beta == 0.0)  hmm->max_length = hmm->M *4;
    else if ( (status =  p7_Builder_MaxLength(hmm, bld->w_beta)) != eslOK) goto ERROR;
  }


  /* build a faux trace: relative to core model (B->M_1..M_L->E) */
  if (opt_tr != NULL) 
    {
      if ((tr = p7_trace_Create())                      == NULL)  goto ERROR;
      if ((status = p7_trace_Append(tr, p7T_B, 0, 0))   != eslOK) goto ERROR; 
      for (k = 1; k <= sq->n; k++)
        if ((status = p7_trace_Append(tr, p7T_M, k, k)) != eslOK) goto ERROR;
      if ((status = p7_trace_Append(tr, p7T_E, 0, 0))   != eslOK) goto ERROR; 
      tr->M = sq->n;
      tr->L = sq->n;
    }

  /* note that <opt_gm> and <opt_om> were already set by calibrate() call above. */
  if (opt_hmm   != NULL) *opt_hmm = hmm; else p7_hmm_Destroy(hmm);
  if (opt_tr    != NULL) *opt_tr  = tr;
  return eslOK;

 ERROR:
  p7_hmm_Destroy(hmm);
  if (tr        != NULL) p7_trace_Destroy(tr);
  if (opt_gm    != NULL) p7_profile_Destroy(*opt_gm);
  if (opt_om    != NULL) p7_oprofile_Destroy(*opt_om);
  return status;
}
Exemple #11
0
/* Function: p7_Fastmodelmaker()
 * 
 * Purpose:  Heuristic model construction.
 *           Construct an HMM from an alignment by a simple rule,
 *           based on the fractional occupancy of each columns w/
 *           residues vs gaps. Any column w/ a fractional
 *           occupancy of $\geq$ <symfrac> is assigned as a MATCH column;
 *           for instance, if thresh = 0.5, columns w/ $\geq$ 50\% 
 *           residues are assigned to match... roughly speaking.
 *           
 *           "Roughly speaking" because sequences may be weighted
 *           in the input <msa>, and because missing data symbols are
 *           ignored, in order to deal with sequence fragments.
 *
 *           The <msa> must be in digital mode. 
 *
 *           If the caller wants to designate any sequences as
 *           fragments, it does so by converting all N-terminal and
 *           C-terminal flanking gap symbols to missing data symbols.
 *
 *           NOTE: p7_Fastmodelmaker() will slightly revise the
 *           alignment if the assignment of columns implies
 *           DI and ID transitions.
 *           
 *           Returns the HMM in counts form (ready for applying Dirichlet
 *           priors as the next step). Also returns fake traceback
 *           for each training sequence.
 *           
 *           Models must have at least one node, so if the <msa> defined 
 *           no consensus columns, a <eslENORESULT> error is returned.
 *           
 * Args:     msa       - multiple sequence alignment
 *           symfrac   - threshold for residue occupancy; >= assigns MATCH
 *           bld       - holds information on regions requiring masking, optionally NULL -> no masking
 *           ret_hmm   - RETURN: counts-form HMM
 *           opt_tr    - optRETURN: array of tracebacks for aseq's
 *           
 * Return:   <eslOK> on success. ret_hmm and opt_tr allocated here,
 *           and must be free'd by the caller (FreeTrace(tr[i]), free(tr),
 *           FreeHMM(hmm)).       
 *
 *           Returns <eslENORESULT> if no consensus columns were annotated;
 *           in this case, <ret_hmm> and <opt_tr> are returned NULL.
 *           
 * Throws:   <eslEMEM> on allocation failure; <eslEINVAL> if the 
 *           <msa> isn't in digital mode.
 */
int
p7_Fastmodelmaker(ESL_MSA *msa, float symfrac, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***opt_tr)
{
  int      status;	     /* return status flag                  */
  int     *matassign = NULL; /* MAT state assignments if 1; 1..alen */
  int      idx;              /* counter over sequences              */
  int      apos;             /* counter for aligned columns         */
  float    r;		         /* weighted residue count              */
  float    totwgt;	     /* weighted residue+gap count          */

  if (! (msa->flags & eslMSA_DIGITAL)) ESL_XEXCEPTION(eslEINVAL, "need digital MSA");

  /* Allocations: matassign is 1..alen array of bit flags.
   */
  ESL_ALLOC(matassign, sizeof(int)     * (msa->alen+1));

  /* Determine weighted sym freq in each column, set matassign[] accordingly.
   */
  for (apos = 1; apos <= msa->alen; apos++) 
    {  
      r = totwgt = 0.;
      for (idx = 0; idx < msa->nseq; idx++) 
      {
        if       (esl_abc_XIsResidue(msa->abc, msa->ax[idx][apos])) { r += msa->wgt[idx]; totwgt += msa->wgt[idx]; }
        else if  (esl_abc_XIsGap(msa->abc,     msa->ax[idx][apos])) {                     totwgt += msa->wgt[idx]; }
        else if  (esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) continue;
      }
      if (r > 0. && r / totwgt >= symfrac) matassign[apos] = TRUE;
      else                                 matassign[apos] = FALSE;
    }


  /* Once we have matassign calculated, modelmakers behave
   * the same; matassign2hmm() does this stuff (traceback construction,
   * trace counting) and sets up ret_hmm and opt_tr.
   */
  if ((status = matassign2hmm(msa, matassign, ret_hmm, opt_tr)) != eslOK) {
    fprintf (stderr, "hmm construction error during trace counting\n");
    goto ERROR;
  }

  free(matassign);
  return eslOK;

 ERROR:
  if (matassign != NULL) free(matassign);
  return status;
}
/* Function:  esl_randomness_Create()
 * Synopsis:  Create an RNG with a given seed.
 * Incept:    SRE, Wed Jul 14 13:02:18 2004 [St. Louis]
 *
 * Purpose:   Create a random number generator using
 *            a given random seed. Seed must be $>0$.
 *            
 * Args:      seed $>= 0$.
 *
 * Returns:   an initialized <ESL_RANDOMNESS *> on success.
 *            Caller free's with <esl_randomness_Destroy()>.
 *              
 * Throws:    <NULL> on failure.
 * 
 * Xref:      STL8/p57.
 */
ESL_RANDOMNESS *
esl_randomness_Create(long seed)
{
  ESL_RANDOMNESS *r      = NULL;
  int             burnin = 7;
  int             status;

  if (seed <= 0) ESL_XEXCEPTION(eslEINVAL, "bad seed");
  ESL_ALLOC(r, sizeof(ESL_RANDOMNESS));
  r->seed      = seed;
  r->reseeding = TRUE;

  /* we observe that the first random number isn't very random, if
   * closely spaced seeds are used, like what we get with using
   * time().  So, "burn in" the random chain just a little.
   */
  while (burnin--) esl_random(r);
  return r;

 ERROR:
  return NULL;
}
/* Function:  esl_dst_XJukesCantorMx()
 * Synopsis:  NxN Jukes/Cantor distance matrix for N aligned digital seqs.
 * Incept:    SRE, Thu Apr 27 08:38:08 2006 [New York City]
 *
 * Purpose:   Given a digitized multiple sequence alignment <ax>,
 *            consisting of <nseq> aligned digital sequences in
 *            bioalphabet <abc>, calculate a symmetric Jukes/Cantor
 *            pairwise distance matrix for all sequence pairs;
 *            optionally return the distance matrix in <ret_D> and 
 *            a matrix of the large-sample variances for those ML distance
 *            estimates in <ret_V>.
 *            
 *            Infinite distances (and variances) are possible. They
 *            are represented as <HUGE_VAL> in <D> and <V>. Caller must
 *            be prepared to deal with them as appropriate.
 *
 * Args:      abc    - bioalphabet for <aseq>
 *            ax     - aligned digital sequences [0.nseq-1][1..L]
 *            nseq   - number of aseqs
 *            opt_D  - optRETURN: [0..nseq-1]x[0..nseq-1] symmetric distance mx
 *            opt_V  - optRETURN: matrix of variances.
 *
 * Returns:   <eslOK> on success. <D> (and optionally <V>) contain the
 *            distance matrix (and variances). Caller frees these with
 *            <esl_dmatrix_Destroy()>. 
 *
 * Throws:    <eslEINVAL> if any pair of sequences have differing lengths
 *            (and thus cannot have been properly aligned). 
 *            <eslEDIVZERO> if some pair of sequences had no aligned
 *            residues. On failure, <D> and <V> are both returned <NULL>
 *            and state of inputs is unchanged.
 */
int
esl_dst_XJukesCantorMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int nseq, 
		       ESL_DMATRIX **opt_D, ESL_DMATRIX **opt_V)
{
  ESL_DMATRIX *D = NULL;
  ESL_DMATRIX *V = NULL;
  int          status;
  int          i,j;

  if (( D = esl_dmatrix_Create(nseq, nseq) ) == NULL) goto ERROR;
  if (( V = esl_dmatrix_Create(nseq, nseq) ) == NULL) goto ERROR;

  for (i = 0; i < nseq; i++)
    {
      D->mx[i][i] = 0.;
      V->mx[i][i] = 0.;
      for (j = i+1; j < nseq; j++)
	{
	  status = esl_dst_XJukesCantor(abc, ax[i], ax[j], 
					&(D->mx[i][j]), &(V->mx[i][j]));
	  if (status != eslOK) 
	    ESL_XEXCEPTION(status, "J/C calculation failed at digital aseqs %d,%d", i,j);

	  D->mx[j][i] = D->mx[i][j];
	  V->mx[j][i] = V->mx[i][j];
	}
    }
  if (opt_D != NULL) *opt_D = D;  else esl_dmatrix_Destroy(D);
  if (opt_V != NULL) *opt_V = V;  else esl_dmatrix_Destroy(V);
  return eslOK;

 ERROR:
  if (D     != NULL) esl_dmatrix_Destroy(D);
  if (V     != NULL) esl_dmatrix_Destroy(V);
  if (opt_D != NULL) *opt_D = NULL;
  if (opt_V != NULL) *opt_V = NULL;
  return status;
}
Exemple #14
0
/* Function:  p7_oprofile_MPISend()
 * Synopsis:  Send an OPROFILE as an MPI work unit.
 * Incept:    MSF, Wed Oct 21, 2009 [Janelia]
 *
 * Purpose:   Sends an OPROFILE <om> as a work unit to MPI process
 *            <dest> (where <dest> ranges from 0..<nproc-1>), tagged
 *            with MPI tag <tag>, for MPI communicator <comm>, as 
 *            the sole workunit or result. 
 *            
 *            Work units are prefixed by a status code. If <hmm> is
 *            <non-NULL>, the work unit is an <eslOK> code followed by
 *            the packed HMM. If <hmm> is NULL, the work unit is an
 *            <eslEOD> code, which <p7_hmm_MPIRecv()> knows how to
 *            interpret; this is typically used for an end-of-data
 *            signal to cleanly shut down worker processes.
 *            
 *            In order to minimize alloc/free cycles in this routine,
 *            caller passes a pointer to a working buffer <*buf> of
 *            size <*nalloc> characters. If necessary (i.e. if <hmm> is
 *            too big to fit), <*buf> will be reallocated and <*nalloc>
 *            increased to the new size. As a special case, if <*buf>
 *            is <NULL> and <*nalloc> is 0, the buffer will be
 *            allocated appropriately, but the caller is still
 *            responsible for free'ing it.
 *            
 * Returns:   <eslOK> on success; <*buf> may have been reallocated and
 *            <*nalloc> may have been increased.
 * 
 * Throws:    <eslESYS> if an MPI call fails; <eslEMEM> if a malloc/realloc
 *            fails. In either case, <*buf> and <*nalloc> remain valid and useful
 *            memory (though the contents of <*buf> are undefined). 
 * 
 * Note:      Compare to p7_hmmfile_WriteBinary(). The two operations (sending
 *            an HMM via MPI, or saving it as a binary file to disk) are
 *            similar.
 */
int
p7_oprofile_MPISend(P7_OPROFILE *om, int dest, int tag, MPI_Comm comm, char **buf, int *nalloc)
{
  int   status;
  int   code;
  int   sz, n, pos;

  /* Figure out size */
  if (MPI_Pack_size(1, MPI_INT, comm, &n) != 0) ESL_XEXCEPTION(eslESYS, "mpi pack size failed");
  if (om != NULL) {
    if ((status = p7_oprofile_MPIPackSize(om, comm, &sz)) != eslOK) return status;
    n += sz;
  }

  /* Make sure the buffer is allocated appropriately */
  if (*buf == NULL || n > *nalloc) {
    void *tmp;
    ESL_RALLOC(*buf, tmp, sizeof(char) * n);
    *nalloc = n; 
  }

  /* Pack the status code and OPROFILE into the buffer */
  pos  = 0;
  code = (om == NULL) ? eslEOD : eslOK;
  if (MPI_Pack(&code, 1, MPI_INT, *buf, n, &pos, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi pack failed");
  if (om != NULL) {
    if ((status = p7_oprofile_MPIPack(om, *buf, n, &pos, comm)) != eslOK) return status;
  }

  /* Send the packed OPROFILE to the destination. */
  if (MPI_Send(*buf, n, MPI_PACKED, dest, tag, comm) != 0)  ESL_EXCEPTION(eslESYS, "mpi send failed");
  return eslOK;

 ERROR:
  return status;
}
Exemple #15
0
/* Function: esl_msashuffle_CQRNA()
 * Synopsis: Gap-preserving column shuffle of a pairwise alignment.
 * Incept:   SRE, Tue Jan 22 08:45:34 2008 [Market Street Cafe, Leesburg]
 *
 * Purpose:  Shuffle a pairwise alignment <x>,<y> while preserving the
 *           position of gaps, using the random number generator <r>.
 *           Return the shuffled alignment in <xs>,
 *           <ys>. Caller provides allocated space for <xs> and <ys>.
 *           
 *           An alphabet <abc> must also be provided, solely for the
 *           definition of gap characters. Because Easel's default
 *           alphabets (DNA, RNA, and protein) all use the same
 *           definition of gap characters <-_.>, you can actually
 *           provide any alphabet here, and get the same results.
 *           (This may save having to determine the alphabet of input
 *           sequences.)
 *           
 *           Works by doing three separate
 *           shuffles, of (1) columns with residues in both
 *           <x> and <y>, (2) columns with residue in <x> and gap in <y>,
 *           and (3) columns with gap in <x> and residue in <y>.
 *           
 *           <xs>,<x> and <ys>,<y> may be identical: that is, to shuffle
 *           an alignment "in place", destroying the original
 *           alignment, just call <esl_msashuffle_CQRNA(r, abc, x,y,x,y)>.
 *
 * Returns:  <eslOK> on success, and the shuffled alignment is 
 *           returned in <xs>, <ys>.
 *           
 * Throws:   <eslEMEM> on allocation failure.          
 */
int
esl_msashuffle_CQRNA(ESL_RANDOMNESS *r, ESL_ALPHABET *abc, char *x, char *y, char *xs, char *ys)
{
  int  L;
  int *xycol = NULL;
  int *xcol  = NULL;
  int *ycol  = NULL;
  int  nxy, nx, ny;
  int  i;
  int  pos, c;
  char xsym, ysym;
  int  status;

  if (xs != x) strcpy(xs, x);
  if (ys != y) strcpy(ys, y);

  /* First, construct three arrays containing lists of the column positions
   * of the three types of columns. (If a column contains gaps in both x and y,
   * we've already simply copied it to the shuffled sequence.)
   */
  L = strlen(x);
  if (strlen(y) != L) ESL_XEXCEPTION(eslEINVAL, "sequences of different lengths in qrna shuffle");
  ESL_ALLOC(xycol, sizeof(int) * L);
  ESL_ALLOC(xcol,  sizeof(int) * L);
  ESL_ALLOC(ycol,  sizeof(int) * L);
  nxy = nx = ny = 0;

  for (i = 0; i < L; i++)
    {
      if      (  esl_abc_CIsGap(abc, x[i]) &&   esl_abc_CIsGap(abc, y[i])) { continue; }
      else if (! esl_abc_CIsGap(abc, x[i]) && ! esl_abc_CIsGap(abc, y[i])) { xycol[nxy] = i; nxy++; }
      else if (  esl_abc_CIsGap(abc, x[i]))                                { ycol[ny] = i;   ny++;  }
      else if (  esl_abc_CIsGap(abc, y[i]))                                { xcol[nx] = i;   nx++;  }
    }

  /* Second, shuffle the sequences indirectly, via shuffling these arrays.
   * Yow, careful with those indices, and with order of the statements...
   */
  for (; nxy > 1; nxy--) {
    pos              = esl_rnd_Roll(r, nxy);
    xsym             = xs[xycol[pos]];   ysym             = ys[xycol[pos]];    c            = xycol[pos];   
    xs[xycol[pos]]   = xs[xycol[nxy-1]]; ys[xycol[pos]]   = ys[xycol[nxy-1]];  xycol[pos]   = xycol[nxy-1];
    xs[xycol[nxy-1]] = xsym;             ys[xycol[nxy-1]] = ysym;              xycol[pos]   = xycol[nxy-1];
  }
  for (; nx > 1; nx--) {
    pos            = esl_rnd_Roll(r, nx); 
    xsym           = xs[xcol[pos]];  ysym           = ys[xcol[pos]];  c          = xcol[pos];  
    xs[xcol[pos]]  = xs[xcol[nx-1]]; ys[xcol[pos]]  = ys[xcol[nx-1]]; xcol[pos]  = xcol[nx-1]; 
    xs[xcol[nx-1]] = xsym;           ys[xcol[nx-1]] = ysym;           xcol[nx-1] = c;          
  }
  for (; ny > 1; ny--) {
    pos            = esl_rnd_Roll(r, ny); 
    xsym           = xs[ycol[pos]];  ysym           = ys[ycol[pos]];  c          = ycol[pos]; 
    xs[ycol[pos]]  = xs[ycol[ny-1]]; ys[ycol[pos]]  = ys[ycol[ny-1]]; ycol[pos]  = ycol[ny-1];
    xs[ycol[ny-1]] = xsym;           ys[ycol[ny-1]] = ysym;           ycol[ny-1] = c;          
  }

  free(xycol); free(xcol); free(ycol);
  return eslOK;

 ERROR:
  if (xycol != NULL) free(xycol);
  if (xcol  != NULL) free(xcol);
  if (ycol  != NULL) free(ycol);
  return status;
}
Exemple #16
0
/* ideal_local_endpoints()
 *
 * Purpose:  Implementation of the "two-step" fragment sampling
 *           algorithm, sampling a uniform local fragment w.r.t.
 *           sequence coords, by first sampling a complete
 *           sequence of length L from <hmm>; then choosing
 *           a random fragment <i1..i2> uniformly from all
 *           possible $\frac{L(L+1)/2}$ fragments;  then finding
 *           local alignment coordinates wrt model and sequence,
 *           using convention that local alignment starts/stops
 *           with match states. (Thus, if the initially selected
 *           i1 or i2 were generated by insert states, bounds
 *           are moved to reach first/last match state.)
 *           
 *           The caller also provides an allocated sequence <sq> and
 *           traceback <tr>, as storage to be provided to
 *           <p7_CoreEmit()>. They contain the generated global
 *           sequence and trace upon return (not a local trace, note).
 *           
 *           i endpoints are normalized/discretized to 1..<Lbins>, so
 *           we can collate i statistics from sampled sequences of
 *           varying L. Note this causes discretization artifacts,
 *           leading to underrepresentation of j=M and
 *           overrepresentation of i=1.
 *           
 *           This routine is only intended for collecting endpoint
 *           statistics (i1,i2,k1,k2); it does not generate a local
 *           alignment trace. (xref milestone 2, STL11/115).
 *           
 * Returns:  <eslOK> on success; returns normalized/binned sequence
 *           coords in <*ret_i1> and <*ret_i2> in range <1..Lbins> and
 *           the model entry/exit coords in <*ret_k1> and <*ret_k2> in
 *           range <1..M>. By internal def'n of local alignment endpoints,
 *           M_k1 emits residue x_i1, M_k2 emits residue x_i2.
 *           
 * Xref:     STL11/142-143 
 */
static int
ideal_local_endpoints(ESL_RANDOMNESS *r, P7_HMM *hmm, ESL_SQ *sq, P7_TRACE *tr, int Lbins,
		      int *ret_i1, int *ret_i2, int *ret_k1, int *ret_k2)
{
  int status;
  int tpos;
  int i1, i2, k1,k2, t1,t2;
  int all_insert;
  int failsafe = 0;		/* a failsafe timer for rejection sampling */

  do {
    if (failsafe++ == 1000) ESL_XEXCEPTION(eslENOHALT, "failed to obtain local alignment that wasn't all inserts");

    if ((status = p7_CoreEmit(r, hmm, sq, tr)) != eslOK) goto ERROR;

    /* a simple way to sample uniformly from upper triangle is by rejection 
     * this do/while cannot infinite loop, doesn't need failsafe 
     */
    do {
      i1 = 1 + esl_rnd_Roll(r, sq->n);
      i2 = 1 + esl_rnd_Roll(r, sq->n);
    } while (i1 > i2);

    /* Get initial k1,k2 coords: this step must work in a core model, 
     * i1/i2 were generated by an M or I. Also record t1,t2 endpoints
     * on core's trace.
     */
    for (tpos = 0; tpos < tr->N; tpos++)
      if (tr->i[tpos] == i1) { t1 = tpos; k1 = tr->k[tpos]; break; }
    for (tpos = tr->N-1; tpos >= 0; tpos--)
      if (tr->i[tpos] == i2) { t2 = tpos; k2 = tr->k[tpos]; break; }

    /* Enforce the definition of local alignment endpoints being
     * match-delimited - roll up any leading/trailing I states. 
     * Watch out for pathological case of a local fragment that
     * includes no M state at all.
     */
    all_insert = FALSE;
    for (; t1 <= t2; t1++) if (tr->st[t1] == p7T_M) break;
    for (; t2 >= t1; t2--) if (tr->st[t2] == p7T_M) break;
    if (t2 < t1) all_insert = TRUE; /* sufficient to check both. */
    i1 = tr->i[t1];  i2 = tr->i[t2];
    k1 = tr->k[t1];  k2 = tr->k[t2];
  } while (all_insert);

  /* Normalize sequence coords.
   * They're 1..L now; make them 1..Lbins
   */
  *ret_i1 = ((i1-1) * Lbins / sq->n) + 1;
  *ret_i2 = ((i2-1) * Lbins / sq->n) + 1;
  *ret_k1 = k1;
  *ret_k2 = k2;
  return eslOK;

 ERROR:
  *ret_i1 = 0.;
  *ret_i2 = 0.;
  *ret_k1 = 0;
  *ret_k2 = 0;
  return status;
}
Exemple #17
0
/* profile_local_endpoints()
 *
 * Purpose:   Wrapper around <p7_ProfileEmit()>, sampling a local
 *            alignment fragment from the profile's probabilistic model
 *            (which may be the implicit model of HMMER3, or the
 *            Plan7 model of HMMER2), and reporting coordinates
 *            of the fragment w.r.t. both model and sequence.
 *            
 *            To simplify the implementation, the profile must be in
 *            <p7_UNILOCAL> mode, not <p7_LOCAL> mode, so we know we
 *            only have to deal with a single hit per sampled
 *            sequence. 
 *            
 *            We want <i1..i2> to be relative to the sequence coords
 *            of a complete (global) sampled sequence that we could
 *            have sampled this local alignment from; but the <i1..i2>
 *            we initially get are relative to our profile-sampled
 *            trace, so they are offset both by N-generated residues
 *            that occur in the profile and by residues that the
 *            profile's local entry skipped. To translate from
 *            profile/sequence coords to core model/sequence coords,
 *            we use rejection sampling: sample traces from the core
 *            model until we find one that uses the same statetypes
 *            at *initial* entry/exit points <k1>,<k2>, then use
 *            that sample's sequence to determine offsets and correct
 *            <i1..i2> reference frame.
 *            
 *            Local alignment endpoints are defined to be
 *            match-delimited. However, an H3 model allows exit on
 *            either a D or M state. Thus, the initially sampled end
 *            point k2 may need to be rolled back to last M state, to
 *            satisfy local alignment endpoint definition. Entries are
 *            not a problem; both H2 and H3 profiles can only enter on
 *            a M state. (This rollback has to occur after we've
 *            matched a core trace to the profile trace to determine
 *            i offsets.)
 *            
 *            Then, sampling from both the core model and the profile
 *            in the same routine introduces a complication:
 *            conceivably, profile configuration alters the transition
 *            probabilities in the core model (by adding <M->E>
 *            transitions and renormalizing the M transition
 *            distributions, for example; H2 configuration does this,
 *            though H3 does not). So you can't <CoreSample()> the
 *            <gm->hmm> safely. To avoid such things, the caller
 *            provides a clean copy of the core model in <core>.
 *            
 *           i endpoints are normalized/discretized to 1..<Lbins>, so
 *           we can collate i statistics from sampled sequences of
 *           varying L. Note this causes discretization artifacts,
 *           leading to underrepresentation of j=M and
 *           overrepresentation of i=1.
 *           
 * Returns:  <eslOK> on success; returns normalized sequence coords in
 *           <*ret_i1> and <*ret_i2>, and the model entry/exit coords
 *           in <*ret_k1> and <*ret_k2>. 
 *           
 * Xref:     STL11/142-143 
 */
static int
profile_local_endpoints(ESL_RANDOMNESS *r, P7_HMM *core, P7_PROFILE *gm, ESL_SQ *sq, P7_TRACE *tr, int Lbins,
			int *ret_i1, int *ret_i2, int *ret_k1, int *ret_k2)
{
  int status;
  int i1,i2;
  int k1,k2;
  int t1,t2;			/* entry/exit positions in local trace, tr */
  int tg1, tg2;			/* entry/exit positions in global trace, tr2 */
  int tpos;
  int nterm, cterm;		/* offsets at N, C terminus. */
  int L;			/* inferred length from 3-part patching */
  ESL_SQ *sq2   = NULL;
  P7_TRACE *tr2 = NULL;
  int failsafe  = 0;
  
  if (gm->mode != p7_UNILOCAL) ESL_XEXCEPTION(eslEINVAL, "profile must be unilocal");
  if ((sq2 = esl_sq_CreateDigital(gm->abc))  == NULL)   { status = eslEMEM; goto ERROR; }
  if ((tr  = p7_trace_Create())              == NULL)   { status = eslEMEM; goto ERROR; }

  /* sample local alignment from the implicit model */
  if (gm->h2_mode) {
    if ((status = p7_H2_ProfileEmit(r, gm, sq, tr)) != eslOK) goto ERROR;
  } else {
    if ((status = p7_ProfileEmit(r, gm, sq, tr)) != eslOK) goto ERROR;
  }
    
  /* Get initial trace coords */
  for (tpos = 0;       tpos < tr->N; tpos++)  if (tr->st[tpos] == p7T_B) { t1 = tpos+1; break; }
  for (tpos = tr->N-1; tpos >= 0;    tpos--)  if (tr->st[tpos] == p7T_E) { t2 = tpos-1; break; }
  
  /* Match a core trace to this local trace by rejection sampling;
   * this is to let us calculate sequence offsets; see comments above in preamble
   */
  do {
    if (failsafe++ == 100000) ESL_XEXCEPTION(eslENOHALT, "failed to match core,local traces in %d tries\n", failsafe);

    if ((status = p7_CoreEmit(r, core, sq2, tr2)) != eslOK) goto ERROR;
    for (tpos = 0; tpos < tr2->N; tpos++)
      if (tr2->k[tpos] == tr->k[t1]) { tg1 = tpos; break; }
    for (tpos = tr2->N-1; tpos >= 0; tpos--)
      if (tr2->k[tpos] == tr->k[t2]) { tg2 = tpos; break; }
  }  while (tr2->st[tg1] != tr->st[t1] && tr2->st[tg2] != tr->st[t2]);

  /* tg1..tg2 in core trace is now matched to t1..t2 in the profile trace.
   * Calculate # of residues preceding tg1 and following tg2 in the core trace.
   * A core trace can only generate residues from M or I states.
   */
  for (nterm = 0, tpos = 0; tpos < tg1; tpos++) 
    if (tr2->st[tpos] == p7T_M || tr2->st[tpos] == p7T_I) nterm++;
  for (cterm = 0, tpos = tr2->N-1; tpos > tg2; tpos--)
    if (tr2->st[tpos] == p7T_M || tr2->st[tpos] == p7T_I) cterm++;

  /* rectify the t2 endpoint, rolling back any trailing D path 
   */
  for (; t2 >= 0; t2--) if (tr->st[t2] == p7T_M) break;
  if (t2 < t1) ESL_XEXCEPTION(eslEINCONCEIVABLE, "this only happens on an all-D path through profile");  
  
  /* determine initial endpoint coords from t1 and t2 */
  i1 = tr->i[t1];  i2 = tr->i[t2];
  k1 = tr->k[t1];  k2 = tr->k[t2];

  /* offset the i coords. */
  L  = (i2-i1+1) + nterm + cterm;
  i2 = (i2-i1+1) + nterm;
  i1 = nterm+1;

  /* normalize the i coords into range 1..Lbins, instead of 1..L */
  i1 = ((i1-1) * Lbins / L) + 1;
  i2 = ((i2-1) * Lbins / L) + 1;

  *ret_i1 = i1;
  *ret_i2 = i2;
  *ret_k1 = k1;
  *ret_k2 = k2;
  p7_trace_Destroy(tr2);
  esl_sq_Destroy(sq2);
  return eslOK;

 ERROR:
  if (sq2 != NULL)  esl_sq_Destroy(sq2);
  if (tr2 != NULL)  p7_trace_Destroy(tr2);
  *ret_i1 = 0.;
  *ret_i2 = 0.;
  *ret_k1 = 0;
  *ret_k2 = 0;
  return status;
}
/* All input sources funnel through here.
 * Here, <afp> is already allocated and initialized, and the input
 * <bf> is opened successfully.
 */
static int
profillic_msafile_OpenBuffer(ESL_ALPHABET **byp_abc, ESL_BUFFER *bf, int format, ESLX_MSAFILE_FMTDATA *fmtd,  ESLX_MSAFILE *afp)
{
  ESL_ALPHABET        *abc       = NULL;
  int                  alphatype = eslUNKNOWN;
  int                  status;

  /* if caller provided <fmtd>, copy it into afp->fmtd */
  if (fmtd) eslx_msafile_fmtdata_Copy(fmtd, &(afp->fmtd));

  /* Determine the format */
  if (format == eslMSAFILE_UNKNOWN) 
    {
      status = eslx_msafile_GuessFileFormat(afp->bf, &format, &(afp->fmtd));
      if      (status == eslENOFORMAT) ESL_XFAIL(eslENOFORMAT, afp->errmsg, "couldn't determine alignment input format"); /* ENOFORMAT is normal failure */
      else if (status != eslOK)        goto ERROR;
    }
  afp->format = format;

  /* Determine the alphabet; set <abc>. (<abc> == NULL means text mode.)  */
  /* Note that GuessAlphabet() functions aren't allowed to use the inmap, because it isn't set yet */
#ifdef eslAUGMENT_ALPHABET
  if (byp_abc && *byp_abc)	/* Digital mode, and caller provided the alphabet */
    { 
      abc       = *byp_abc;
      alphatype = abc->type;
    } 
  else if (byp_abc)		/* Digital mode, and caller wants us to guess and create an alphabet */
    {
      status = eslx_msafile_GuessAlphabet(afp, &alphatype);
      if      (status == eslENOALPHABET) ESL_XFAIL(eslENOALPHABET, afp->errmsg, "couldn't guess alphabet (maybe try --dna/--rna/--amino if available)");
      else if (status != eslOK)          goto ERROR;
      if ( (abc = esl_alphabet_Create(alphatype))                == NULL) { status = eslEMEM; goto ERROR; }
    }    
#endif
  if (abc && ! byp_abc) ESL_EXCEPTION(eslEINCONCEIVABLE, "Your version of Easel does not include digital alphabet code."); 
  /* ^^^^^^^^^^^^^^^^^  this test interacts tricksily with the #ifdef above */
  afp->abc = abc;	/* with afp->abc set, the inmap config functions know whether to do digital/text    */

  /**
   * <pre>
   * Configure the format-specific, digital or text mode character
   * input map in afp->inmap.
   * All of these must:
   *    
   *    set inmap[0] to an appropriate 'unknown' character, to replace
   *       invalid input with.
   *    set ' ' to eslDSQ_IGNORE (if we're supposed to accept and skip
   *       it), or map it to a gap, or set it as eslDSQ_ILLEGAL.
   *    in digital mode, copy the abc->inmap
   *    in text mode, decide if we should accept most any
   *        non-whitespace character (isgraph()), or if the format is
   *        inherently restrictive and we should go with isalpha() +
   *        some other valid characters "_-.~*" instead.
   * </pre>
   */
  switch (afp->format) {
  case eslMSAFILE_A2M:          status = esl_msafile_a2m_SetInmap(      afp); break;
  case eslMSAFILE_AFA:          status = esl_msafile_afa_SetInmap(      afp); break;
  case eslMSAFILE_CLUSTAL:      status = esl_msafile_clustal_SetInmap(  afp); break;
  case eslMSAFILE_CLUSTALLIKE:  status = esl_msafile_clustal_SetInmap(  afp); break;
  case eslMSAFILE_PFAM:         status = esl_msafile_stockholm_SetInmap(afp); break;
  case eslMSAFILE_PHYLIP:       status = esl_msafile_phylip_SetInmap(   afp); break;
  case eslMSAFILE_PHYLIPS:      status = esl_msafile_phylip_SetInmap(   afp); break;
  case eslMSAFILE_PSIBLAST:     status = esl_msafile_psiblast_SetInmap( afp); break;
  case eslMSAFILE_SELEX:        status = esl_msafile_selex_SetInmap(    afp); break;
  case eslMSAFILE_STOCKHOLM:    status = esl_msafile_stockholm_SetInmap(afp); break;
  case eslMSAFILE_PROFILLIC:    status = eslOK;                               break; /// \todo status = profillic_esl_msafile_profile_SetInmap(afp); */ break;
  default: ESL_XEXCEPTION(eslENOFORMAT, "no such alignment file format");     break;
  }

  if (esl_byp_IsReturned(byp_abc)) *byp_abc = abc;
  return eslOK;

 ERROR:  /* on normal errors, afp is returned in an error state */
  if (abc && ! esl_byp_IsProvided(byp_abc)) { esl_alphabet_Destroy(abc); }
  if (esl_byp_IsReturned(byp_abc)) *byp_abc = NULL;
  afp->abc = NULL;
  return status;
}
Exemple #19
0
/* Function:  p7_ProfileConfig()
 * Synopsis:  Configure a search profile.
 *
 * Purpose:   Given a model <hmm> with core probabilities, the null1
 *            model <bg>, a desired search <mode> (one of <p7_LOCAL>,
 *            <p7_GLOCAL>, <p7_UNILOCAL>, or <p7_UNIGLOCAL>), and an
 *            expected target sequence length <L>; configure the
 *            search model in <gm> with lod scores relative to the
 *            background frequencies in <bg>.
 *            
 * Returns:   <eslOK> on success; the profile <gm> now contains 
 *            scores and is ready for searching target sequences.
 *            
 * Throws:    <eslEMEM> on allocation error.
 */
int
p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode)
{
  int   k, x, z;	/* counters over states, residues, annotation */
  int   status;
  float *occ = NULL;
  float *tp, *rp;
  float  sc[p7_MAXCODE];
  float  Z;
 
  /* Contract checks */
  if (gm->abc->type != hmm->abc->type) ESL_XEXCEPTION(eslEINVAL, "HMM and profile alphabet don't match");
  if (hmm->M > gm->allocM)             ESL_XEXCEPTION(eslEINVAL, "profile too small to hold HMM");
  if (! (hmm->flags & p7H_CONS))       ESL_XEXCEPTION(eslEINVAL, "HMM must have a consensus to transfer to the profile");

  /* Copy some pointer references and other info across from HMM  */
  gm->M                = hmm->M;
  gm->max_length       = hmm->max_length;
  gm->mode             = mode;
  gm->roff             = -1;
  gm->eoff             = -1;
  gm->offs[p7_MOFFSET] = -1;
  gm->offs[p7_FOFFSET] = -1;
  gm->offs[p7_POFFSET] = -1;
  if (gm->name != NULL) free(gm->name);
  if (gm->acc  != NULL) free(gm->acc);
  if (gm->desc != NULL) free(gm->desc);
  if ((status = esl_strdup(hmm->name,   -1, &(gm->name))) != eslOK) goto ERROR;
  if ((status = esl_strdup(hmm->acc,    -1, &(gm->acc)))  != eslOK) goto ERROR;
  if ((status = esl_strdup(hmm->desc,   -1, &(gm->desc))) != eslOK) goto ERROR;
  if (hmm->flags & p7H_RF)    strcpy(gm->rf,        hmm->rf);
  if (hmm->flags & p7H_MMASK) strcpy(gm->mm,        hmm->mm);
  if (hmm->flags & p7H_CONS)  strcpy(gm->consensus, hmm->consensus); /* must be present, actually, so the flag test is just for symmetry w/ other optional HMM fields */
  if (hmm->flags & p7H_CS)    strcpy(gm->cs,        hmm->cs);
  for (z = 0; z < p7_NEVPARAM; z++) gm->evparam[z] = hmm->evparam[z];
  for (z = 0; z < p7_NCUTOFFS; z++) gm->cutoff[z]  = hmm->cutoff[z];
  for (z = 0; z < p7_MAXABET;  z++) gm->compo[z]   = hmm->compo[z];

  /* Entry scores. */
  if (p7_profile_IsLocal(gm))
    {
      /* Local mode entry:  occ[k] /( \sum_i occ[i] * (M-i+1))
       * (Reduces to uniform 2/(M(M+1)) for occupancies of 1.0)  */
      Z = 0.;
      ESL_ALLOC(occ, sizeof(float) * (hmm->M+1));

      if ((status = p7_hmm_CalculateOccupancy(hmm, occ, NULL)) != eslOK) goto ERROR;
      for (k = 1; k <= hmm->M; k++) 
	Z += occ[k] * (float) (hmm->M-k+1);
      for (k = 1; k <= hmm->M; k++) 
	p7P_TSC(gm, k-1, p7P_BM) = log(occ[k] / Z); /* note off-by-one: entry at Mk stored as [k-1][BM] */

      free(occ);
    }
  else	/* glocal modes: left wing retraction; must be in log space for precision */
    {
      Z = log(hmm->t[0][p7H_MD]);
      p7P_TSC(gm, 0, p7P_BM) = log(1.0 - hmm->t[0][p7H_MD]);
      for (k = 1; k < hmm->M; k++) 
	{
	   p7P_TSC(gm, k, p7P_BM) = Z + log(hmm->t[k][p7H_DM]);
	   Z += log(hmm->t[k][p7H_DD]);
	}
    }

  /* E state loop/move probabilities: nonzero for MOVE allows loops/multihits
   * N,C,J transitions are set later by length config 
   */
  if (p7_profile_IsMultihit(gm)) {
    gm->xsc[p7P_E][p7P_MOVE] = -eslCONST_LOG2;   
    gm->xsc[p7P_E][p7P_LOOP] = -eslCONST_LOG2;   
    gm->nj                   = 1.0f;
  } else {
    gm->xsc[p7P_E][p7P_MOVE] = 0.0f;   
    gm->xsc[p7P_E][p7P_LOOP] = -eslINFINITY;  
    gm->nj                   = 0.0f;
  }

  /* Transition scores. */
  for (k = 1; k < gm->M; k++) {
    tp = gm->tsc + k * p7P_NTRANS;
    tp[p7P_MM] = log(hmm->t[k][p7H_MM]);
    tp[p7P_MI] = log(hmm->t[k][p7H_MI]);
    tp[p7P_MD] = log(hmm->t[k][p7H_MD]);
    tp[p7P_IM] = log(hmm->t[k][p7H_IM]);
    tp[p7P_II] = log(hmm->t[k][p7H_II]);
    tp[p7P_DM] = log(hmm->t[k][p7H_DM]);
    tp[p7P_DD] = log(hmm->t[k][p7H_DD]);
  }
  
  /* Match emission scores. */
  sc[hmm->abc->K]     = -eslINFINITY; /* gap character */
  sc[hmm->abc->Kp-2]  = -eslINFINITY; /* nonresidue character */
  sc[hmm->abc->Kp-1]  = -eslINFINITY; /* missing data character */
  for (k = 1; k <= hmm->M; k++) {
    for (x = 0; x < hmm->abc->K; x++)
     sc[x] = log((double)hmm->mat[k][x] / bg->f[x]);

    esl_abc_FExpectScVec(hmm->abc, sc, bg->f); 

    for (x = 0; x < hmm->abc->Kp; x++) {
      rp = gm->rsc[x] + k * p7P_NR;
      rp[p7P_MSC] = sc[x];
    }
  }

  /* Insert emission scores */
  /* SRE, Fri Dec 5 08:41:08 2008: We currently hardwire insert scores
   * to 0, i.e. corresponding to the insertion emission probabilities
   * being equal to the background probabilities. Benchmarking shows
   * that setting inserts to informative emission distributions causes
   * more problems than it's worth: polar biased composition hits
   * driven by stretches of "insertion" occur, and are difficult to
   * correct for.
   */
  for (x = 0; x < gm->abc->Kp; x++)
    {
      for (k = 1; k < hmm->M; k++) p7P_ISC(gm, k, x) = 0.0f;
      p7P_ISC(gm, hmm->M, x) = -eslINFINITY;   /* init I_M to impossible.   */
    }
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->K)    = -eslINFINITY; /* gap symbol */
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->Kp-2) = -eslINFINITY; /* nonresidue symbol */
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->Kp-1) = -eslINFINITY; /* missing data symbol */


#if 0
  /* original (informative) insert setting: relies on sc[K, Kp-1] initialization to -inf above */
  for (k = 1; k < hmm->M; k++) {
    for (x = 0; x < hmm->abc->K; x++) 
      sc[x] = log(hmm->ins[k][x] / bg->f[x]); 
    esl_abc_FExpectScVec(hmm->abc, sc, bg->f); 
    for (x = 0; x < hmm->abc->Kp; x++) {
      rp = gm->rsc[x] + k*p7P_NR;
      rp[p7P_ISC] = sc[x];
    }
  }    
  for (x = 0; x < hmm->abc->Kp; x++)
    p7P_ISC(gm, hmm->M, x) = -eslINFINITY;   /* init I_M to impossible.   */
#endif

  /* Remaining specials, [NCJ][MOVE | LOOP] are set by ReconfigLength()
   */
  gm->L = 0;			/* force ReconfigLength to reconfig */
  if ((status = p7_ReconfigLength(gm, L)) != eslOK) goto ERROR;
  return eslOK;

 ERROR:
  if (occ != NULL) free(occ);
  return status;
}
Exemple #20
0
/* Function:  p7_ProfileConfig()
* Synopsis:  Configure a search profile.
* Incept:    SRE, Sun Sep 25 12:21:25 2005 [St. Louis]
*
* Purpose:   Given a model <hmm> with core probabilities, the null1
*            model <bg>, a desired search <mode> (one of <p7_LOCAL>,
*            <p7_GLOCAL>, <p7_UNILOCAL>, or <p7_UNIGLOCAL>), and an
*            expected target sequence length <L>; configure the
*            search model in <gm> with lod scores relative to the
*            background frequencies in <bg>.
*            
* Returns:   <eslOK> on success; the profile <gm> now contains 
*            scores and is ready for searching target sequences.
*            
* Throws:    <eslEMEM> on allocation error.
*/
int
p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode)
{
    int   k, x, z;	/* counters over states, residues, annotation */
    int   status;
    float *occ = NULL;
    float *tp, *rp;
    float  sc[p7_MAXCODE];
    float  mthresh;
    float  Z;

    /* Contract checks */
    if (gm->abc->type != hmm->abc->type) ESL_XEXCEPTION(eslEINVAL, "HMM and profile alphabet don't match");
    if (hmm->M > gm->allocM)             ESL_XEXCEPTION(eslEINVAL, "profile too small to hold HMM");

    /* Copy some pointer references and other info across from HMM  */
    gm->M      = hmm->M;
    gm->mode   = mode;
    gm->roff   = -1;
    gm->eoff   = -1;
    gm->offs[p7_MOFFSET] = -1;
    gm->offs[p7_FOFFSET] = -1;
    gm->offs[p7_POFFSET] = -1;
    if (gm->name != NULL) free(gm->name);
    if (gm->acc  != NULL) free(gm->acc);
    if (gm->desc != NULL) free(gm->desc);
    if ((status = esl_strdup(hmm->name,   -1, &(gm->name))) != eslOK) goto ERROR;
    if ((status = esl_strdup(hmm->acc,    -1, &(gm->acc)))  != eslOK) goto ERROR;
    if ((status = esl_strdup(hmm->desc,   -1, &(gm->desc))) != eslOK) goto ERROR;
    if (hmm->flags & p7H_RF) strcpy(gm->rf, hmm->rf);
    if (hmm->flags & p7H_CS) strcpy(gm->cs, hmm->cs);
    for (z = 0; z < p7_NEVPARAM; z++) gm->evparam[z] = hmm->evparam[z];
    for (z = 0; z < p7_NCUTOFFS; z++) gm->cutoff[z]  = hmm->cutoff[z];
    for (z = 0; z < p7_MAXABET;  z++) gm->compo[z]   = hmm->compo[z];

    /* Determine the "consensus" residue for each match position.
    * This is only used for alignment displays, not in any calculations.
    */
    if      (hmm->abc->type == eslAMINO) mthresh = 0.5;
    else if (hmm->abc->type == eslDNA)   mthresh = 0.9;
    else if (hmm->abc->type == eslRNA)   mthresh = 0.9;
    else                                 mthresh = 0.5;
    gm->consensus[0] = ' ';
    for (k = 1; k <= hmm->M; k++) {
        x = esl_vec_FArgMax(hmm->mat[k], hmm->abc->K);
        gm->consensus[k] = ((hmm->mat[k][x] > mthresh) ? toupper(hmm->abc->sym[x]) : tolower(hmm->abc->sym[x]));
    }
    gm->consensus[hmm->M+1] = '\0';

    /* Entry scores. */
    if (p7_profile_IsLocal(gm))
    {
        /* Local mode entry:  occ[k] /( \sum_i occ[i] * (M-i+1))
        * (Reduces to uniform 2/(M(M+1)) for occupancies of 1.0)  */
        Z = 0.;
        ESL_ALLOC_WITH_TYPE(occ, float*, sizeof(float) * (hmm->M+1));

        if ((status = p7_hmm_CalculateOccupancy(hmm, occ, NULL)) != eslOK) goto ERROR;
        for (k = 1; k <= hmm->M; k++) 
            Z += occ[k] * (float) (hmm->M-k+1);
        for (k = 1; k <= hmm->M; k++) 
            p7P_TSC(gm, k-1, p7P_BM) = log((double)(occ[k] / Z)); /* note off-by-one: entry at Mk stored as [k-1][BM] */

        free(occ);
    }
    else	/* glocal modes: left wing retraction; must be in log space for precision */
    {
Exemple #21
0
/* Function:  p7_hmm_mpi_Unpack()
 * Synopsis:  Unpacks one HMM from an MPI buffer.
 *
 * Purpose:   Unpack one HMM from MPI packed buffer
 *            <buf>, starting from position <*pos>, where the total length
 *            of the buffer in bytes is <n>. The new HMM is allocated here.
 *            
 *            Caller may or may not already know what alphabet the HMM
 *            is expected to be in.  A reference to the current
 *            alphabet is passed in <byp_abc>. If the alphabet is unknown,
 *            pass <*byp_abc = NULL>, and when the HMM is received, an
 *            appropriate new alphabet object is allocated and passed
 *            back to the caller via <*byp_abc>.  If the alphabet is
 *            already known, <*byp_abc> is that alphabet, and the new
 *            HMM's alphabet type is verified to agree with it. This
 *            mechanism allows an application to let the first HMM
 *            determine the alphabet type for the application, while
 *            still keeping the alphabet under the application's scope
 *            of control.
 *
 * Args:      buf      - MPI packed buffer to unpack
 *            n        - total length of <buf> in bytes
 *            pos      - current parsing/unpacking position in <buf>
 *            comm     - MPI communicator
 *            byp_abc  - BYPASS: <*byp_abc> == ESL_ALPHABET *> if known;
 *                               <*byp_abc> == NULL> if alphabet unknown;
 *            ret_hmm  - RETURN: ptr to newly allocated, unpacked profile
 *
 * Returns:   <eslOK> on success. <*pos> is updated to the position of
 *            the next element in <buf> to unpack (if any). <*ret_hmm>
 *            contains a newly allocated HMM, which the caller is
 *            responsible for free'ing.  If <*byp_abc> was passed as
 *            <NULL>, it now points to an <ESL_ALPHABET> object that
 *            was allocated here; caller is responsible for free'ing
 *            this.
 *            
 *            Returns <eslEINCOMPAT> if the HMM is in a different
 *            alphabet than <*byp_abc> said to expect. In this case,
 *            <*byp_abc> is unchanged, <*buf> and <*nalloc> may have been
 *            changed, and <*ret_hmm> is <NULL>.
 *            
 * Throws:    <eslESYS> on an MPI call failure. <eslEMEM> on allocation failure.
 *            In either case, <*ret_hmm> is <NULL>, and the state of <buf>
 *            and <*pos> is undefined and should be considered to be corrupted.
 */
int
p7_hmm_mpi_Unpack(char *buf, int n, int *pos, MPI_Comm comm, ESL_ALPHABET **byp_abc, P7_HMM **ret_hmm)
{
  P7_HMM       *hmm = NULL;
  ESL_ALPHABET *abc = NULL;
  int64_t offset;
  int     M, K, atype;
  int     status;

  /* Use the CreateShell/CreateBody interface, because that interface allocates our optional fields, using <flags> */
  if (( hmm = p7_hmm_CreateShell() ) == NULL) { status = eslEMEM; goto ERROR; }

  /* First, unpack info that we need for HMM body allocation */
  if (MPI_Unpack(buf, n, pos, &M,             1, MPI_INT, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &(hmm->flags),  1, MPI_INT, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &atype,         1, MPI_INT, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");

  /* Set or verify the alphabet */
  if (*byp_abc == NULL) {	/* alphabet unknown. create new one */
    if ( (abc = esl_alphabet_Create(atype)) == NULL)  { status = eslEMEM; goto ERROR; }
  } else {			/* already known: check it */
    abc = *byp_abc;
    if (abc->type != atype){ status = eslEINCOMPAT; goto ERROR; }
  }
  K = abc->K; /* For convenience below. */

  /* Allocate the HMM body */
  if ((status = p7_hmm_CreateBody(hmm, M, abc)) != eslOK) goto ERROR;

  /* Unpack the rest of the HMM */
  if (MPI_Unpack( buf, n, pos, hmm->t[0],   p7H_NTRANSITIONS*(M+1), MPI_FLOAT, comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack( buf, n, pos, hmm->mat[0],                K*(M+1), MPI_FLOAT, comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack( buf, n, pos, hmm->ins[0],                K*(M+1), MPI_FLOAT, comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed");

  if ((status = esl_mpi_UnpackOpt( buf, n, pos,  (void**) &(hmm->name), NULL, MPI_CHAR,  comm)) != eslOK) goto ERROR;
  if ((status = esl_mpi_UnpackOpt( buf, n, pos,  (void**)  &(hmm->acc), NULL, MPI_CHAR,  comm)) != eslOK) goto ERROR;
  if ((status = esl_mpi_UnpackOpt( buf, n, pos,  (void**) &(hmm->desc), NULL, MPI_CHAR,  comm)) != eslOK) goto ERROR;

  if (hmm->flags & p7H_RF)    { if (MPI_Unpack(buf, n, pos,  hmm->rf,        M+2, MPI_CHAR,  comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); }
  if (hmm->flags & p7H_MMASK) { if (MPI_Unpack(buf, n, pos,  hmm->mm,        M+2, MPI_CHAR,  comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); }
  if (hmm->flags & p7H_CONS)  { if (MPI_Unpack(buf, n, pos,  hmm->consensus, M+2, MPI_CHAR,  comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); }
  if (hmm->flags & p7H_CS)    { if (MPI_Unpack(buf, n, pos,  hmm->cs,        M+2, MPI_CHAR,  comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); }
  if (hmm->flags & p7H_CA)    { if (MPI_Unpack(buf, n, pos,  hmm->ca,        M+2, MPI_CHAR,  comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); }

  if ((status = esl_mpi_UnpackOpt( buf, n, pos, (void**)&(hmm->comlog),  NULL, MPI_CHAR,  comm)) != eslOK) goto ERROR;
  if (MPI_Unpack(                  buf, n, pos,           &(hmm->nseq),     1, MPI_INT,   comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); 
  if (MPI_Unpack(                  buf, n, pos,       &(hmm->eff_nseq),     1, MPI_FLOAT, comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); 
  if (MPI_Unpack(                  buf, n, pos,     &(hmm->max_length),     1, MPI_FLOAT, comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); 
  if ((status = esl_mpi_UnpackOpt( buf, n, pos, (void**) &(hmm->ctime),  NULL, MPI_CHAR,  comm)) != eslOK) goto ERROR;

  if (hmm->flags & p7H_MAP)   { if (MPI_Unpack(buf, n, pos,               hmm->map,         M+1, MPI_INT,   comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); }

  if (MPI_Unpack( buf, n, pos, &(hmm->checksum),           1, MPI_UINT32_T,  comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); 
  if (MPI_Unpack( buf, n, pos,   hmm->evparam,   p7_NEVPARAM, MPI_FLOAT,     comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); 
  if (MPI_Unpack( buf, n, pos,   hmm->cutoff,    p7_NCUTOFFS, MPI_FLOAT,     comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); 
  if (MPI_Unpack( buf, n, pos,   hmm->compo,      p7_MAXABET, MPI_FLOAT,     comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); 
  if (MPI_Unpack( buf, n, pos,  &offset,                   1, MPI_INT64_T,   comm)  != MPI_SUCCESS)     ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); 
  hmm->offset = offset;		/* receive as int64_t, then cast to off_t, which is probably int64_t (but not guaranteed) */

  *byp_abc = abc; 	/* works even if caller provided *byp_abc, because then abc==*byp_abc already */
  *ret_hmm = hmm;
  return eslOK;

 ERROR:
  if (hmm) p7_hmm_Destroy(hmm);
  if (abc && *byp_abc == NULL) esl_alphabet_Destroy(abc);
  *ret_hmm = NULL;
  return status;
}
Exemple #22
0
/* Function:  esl_stats_LinearRegression()
 * Synopsis:  Fit data to a straight line.
 * Incept:    SRE, Sat May 26 11:33:46 2007 [Janelia]
 *
 * Purpose:   Fit <n> points <x[i]>, <y[i]> to a straight line
 *            $y = a + bx$ by linear regression. 
 *            
 *            The $x_i$ are taken to be known, and the $y_i$ are taken
 *            to be observed quantities associated with a sampling
 *            error $\sigma_i$. If known, the standard deviations
 *            $\sigma_i$ for $y_i$ are provided in the <sigma> array.
 *            If they are unknown, pass <sigma = NULL>, and the
 *            routine will proceed with the assumption that $\sigma_i
 *            = 1$ for all $i$.
 *            
 *            The maximum likelihood estimates for $a$ and $b$ are
 *            optionally returned in <opt_a> and <opt_b>.
 *            
 *            The estimated standard deviations of $a$ and $b$ and
 *            their estimated covariance are optionally returned in
 *            <opt_sigma_a>, <opt_sigma_b>, and <opt_cov_ab>.
 *            
 *            The Pearson correlation coefficient is optionally
 *            returned in <opt_cc>. 
 *            
 *            The $\chi^2$ P-value for the regression fit is
 *            optionally returned in <opt_Q>. This P-value may only be
 *            obtained when the $\sigma_i$ are known. If <sigma> is
 *            passed as <NULL> and <opt_Q> is requested, <*opt_Q> is
 *            set to 1.0.
 *            
 *            This routine follows the description and algorithm in
 *            \citep[pp.661-666]{Press93}.
 *
 *            <n> must be greater than 2; at least two x[i] must
 *            differ; and if <sigma> is provided, all <sigma[i]> must
 *            be $>0$. If any of these conditions isn't met, the
 *            routine throws <eslEINVAL>.
 *
 * Args:      x            - x[0..n-1]
 *            y            - y[0..n-1]
 *            sigma        - sample error in observed y_i
 *            n            - number of data points
 *            opt_a        - optRETURN: intercept estimate		
 *            opt_b        - optRETURN: slope estimate
 *            opt_sigma_a  - optRETURN: error in estimate of a
 *            opt_sigma_b  - optRETURN: error in estimate of b
 *            opt_cov_ab   - optRETURN: covariance of a,b estimates
 *            opt_cc       - optRETURN: Pearson correlation coefficient for x,y
 *            opt_Q        - optRETURN: X^2 P-value for linear fit
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation error;
 *            <eslEINVAL> if a contract condition isn't met;
 *            <eslENORESULT> if the chi-squared test fails.
 *            In these cases, all optional return values are set to 0.
 */
int
esl_stats_LinearRegression(const double *x, const double *y, const double *sigma, int n,
			   double *opt_a,       double *opt_b,
			   double *opt_sigma_a, double *opt_sigma_b, double *opt_cov_ab,
			   double *opt_cc,      double *opt_Q)
{
  int     status;
  double *t      = NULL;
  double  S, Sx, Sy, Stt;
  double  Sxy, Sxx, Syy;
  double  a, b, sigma_a, sigma_b, cov_ab, cc, X2, Q;
  double  xdev, ydev;
  double  tmp;
  int     i;

  /* Contract checks. */
  if (n <= 2) ESL_XEXCEPTION(eslEINVAL, "n must be > 2 for linear regression fitting");
  if (sigma != NULL) 
    for (i = 0; i < n; i++) if (sigma[i] <= 0.) ESL_XEXCEPTION(eslEINVAL, "sigma[%d] <= 0", i);
  status = eslEINVAL;
  for (i = 0; i < n; i++) if (x[i] != 0.) { status = eslOK; break; }
  if (status != eslOK) ESL_XEXCEPTION(eslEINVAL, "all x[i] are 0.");

  /* Allocations */
  ESL_ALLOC(t, sizeof(double) * n);

  /* S = \sum_{i=1}{n} \frac{1}{\sigma_i^2}.  (S > 0.) */
  if (sigma != NULL) { for (S = 0., i = 0; i < n; i++) S += 1./ (sigma[i] * sigma[i]);  }
  else S = (double) n;

  /* S_x = \sum_{i=1}{n} \frac{x[i]}{ \sigma_i^2}  (Sx real.) */
  for (Sx = 0., i = 0; i < n; i++) { 
    if (sigma == NULL) Sx += x[i];
    else               Sx += x[i] / (sigma[i] * sigma[i]);
  }

  /* S_y = \sum_{i=1}{n} \frac{y[i]}{\sigma_i^2}  (Sy real.) */
  for (Sy = 0., i = 0; i < n; i++) { 
    if (sigma == NULL) Sy += y[i];
    else               Sy += y[i] / (sigma[i] * sigma[i]);
  }

  /* t_i = \frac{1}{\sigma_i} \left( x_i - \frac{S_x}{S} \right)   (t_i real) */
  for (i = 0; i < n; i++) {
    t[i] = x[i] - Sx/S;
    if (sigma != NULL) t[i] /= sigma[i];
  }

  /* S_{tt} = \sum_{i=1}^n t_i^2  (if at least one x is != 0, Stt > 0) */
  for (Stt = 0., i = 0; i < n; i++) { Stt += t[i] * t[i]; }

  /* b = \frac{1}{S_{tt}} \sum_{i=1}^{N} \frac{t_i y_i}{\sigma_i}  */
  for (b = 0., i = 0; i < n; i++) {
    if (sigma != NULL) { b += t[i]*y[i] / sigma[i]; }
    else               { b += t[i]*y[i]; }
  }
  b /= Stt;

  /* a = \frac{ S_y - S_x b } {S}   */
  a = (Sy - Sx * b) / S;
  
  /* \sigma_a^2 = \frac{1}{S} \left( 1 + \frac{ S_x^2 }{S S_{tt}} \right) */
  sigma_a = sqrt ((1. + (Sx*Sx) / (S*Stt)) / S);

  /* \sigma_b = \frac{1}{S_{tt}} */
  sigma_b = sqrt (1. / Stt);

  /* Cov(a,b) = - \frac{S_x}{S S_{tt}}    */
  cov_ab = -Sx / (S * Stt);
  
  /* Pearson correlation coefficient */
  Sxy = Sxx = Syy = 0.;
  for (i = 0; i < n; i++) {
    if (sigma != NULL) { 
      xdev = (x[i] / (sigma[i] * sigma[i])) - (Sx / n);
      ydev = (y[i] / (sigma[i] * sigma[i])) - (Sy / n);
    } else {
      xdev = x[i] - (Sx / n);
      ydev = y[i] - (Sy / n);
    }
    Sxy += xdev * ydev;
    Sxx += xdev * xdev;
    Syy += ydev * ydev;
  }
  cc = Sxy / (sqrt(Sxx) * sqrt(Syy));

  /* \chi^2 */
  for (X2 = 0., i = 0; i < n; i++) {
    tmp =  y[i] - a - b*x[i];
    if (sigma != NULL) tmp /= sigma[i];
    X2 += tmp*tmp;
  }
  
  /* We can calculate a goodness of fit if we know the \sigma_i */
  if (sigma != NULL) {
    if (esl_stats_ChiSquaredTest(n-2, X2, &Q) != eslOK) { status = eslENORESULT; goto ERROR; }
  } else Q = 1.0;

  /* If we didn't use \sigma_i, adjust the sigmas for a,b */
  if (sigma == NULL) {
    tmp = sqrt(X2 / (double)(n-2));
    sigma_a *= tmp;
    sigma_b *= tmp;
  }
    
  /* Done. Set up for normal return.
   */
  free(t);
  if (opt_a       != NULL) *opt_a       = a;
  if (opt_b       != NULL) *opt_b       = b;
  if (opt_sigma_a != NULL) *opt_sigma_a = sigma_a;
  if (opt_sigma_b != NULL) *opt_sigma_b = sigma_b;
  if (opt_cov_ab  != NULL) *opt_cov_ab  = cov_ab;
  if (opt_cc      != NULL) *opt_cc      = cc;
  if (opt_Q       != NULL) *opt_Q       = Q;
  return eslOK;
  
 ERROR:
  if (t != NULL) free(t);
  if (opt_a       != NULL) *opt_a       = 0.;
  if (opt_b       != NULL) *opt_b       = 0.;
  if (opt_sigma_a != NULL) *opt_sigma_a = 0.;
  if (opt_sigma_b != NULL) *opt_sigma_b = 0.;
  if (opt_cov_ab  != NULL) *opt_cov_ab  = 0.;
  if (opt_cc      != NULL) *opt_cc      = 0.;
  if (opt_Q       != NULL) *opt_Q       = 0.;
  return status;
}
Exemple #23
0
/* Function: matassign2hmm()
 * 
 * Purpose:  Given an assignment of alignment columns to match vs.
 *           insert, finish the final part of the model construction 
 *           calculation that is constant between model construction
 *           algorithms.
 *           
 * Args:     msa       - multiple sequence alignment
 *           matassign - 1..alen bit flags for column assignments
 *           ret_hmm   - RETURN: counts-form HMM
 *           opt_tr    - optRETURN: array of tracebacks for aseq's
 *                         
 * Return:   <eslOK> on success.
 *           <eslENORESULT> if no consensus columns are identified.
 *
 *           ret_hmm and opt_tr alloc'ed here.
 */
static int
matassign2hmm(ESL_MSA *msa, int *matassign, P7_HMM **ret_hmm, P7_TRACE ***opt_tr)
{
  int        status;		/* return status                       */
  P7_HMM    *hmm = NULL;        /* RETURN: new hmm                     */
  P7_TRACE **tr  = NULL;        /* RETURN: 0..nseq-1 fake traces       */
  int      M;                   /* length of new model in match states */
  int      idx;                 /* counter over sequences              */
  int      apos;                /* counter for aligned columns         */
#ifdef p7_DEBUGGING
  char     errbuf[eslERRBUFSIZE];
#endif

  /* apply the model mask in the 'GC MM' row */
  do_modelmask(msa);

  /* How many match states in the HMM? */
  for (M = 0, apos = 1; apos <= msa->alen; apos++) 
    if (matassign[apos]) M++;
  if (M == 0) { status = eslENORESULT; goto ERROR; }

  /* Make fake tracebacks for each seq */
  ESL_ALLOC(tr, sizeof(P7_TRACE *) * msa->nseq);
  if ((status = p7_trace_FauxFromMSA(msa, matassign, p7_MSA_COORDS, tr))        != eslOK) goto ERROR;
  for (idx = 0; idx < msa->nseq; idx++)
    {
      if ((status = p7_trace_Doctor(tr[idx], NULL, NULL))                       != eslOK) goto ERROR;
#ifdef p7_DEBUGGING
      if ((status = p7_trace_Validate(tr[idx], msa->abc, msa->ax[idx], errbuf)) != eslOK) 
	ESL_XEXCEPTION(eslFAIL, "validation failed: %s", errbuf);
#endif
    }

  /* Build count model from tracebacks */
  if ((hmm    = p7_hmm_Create(M, msa->abc)) == NULL)  { status = eslEMEM; goto ERROR; }
  if ((status = p7_hmm_Zero(hmm))           != eslOK) goto ERROR;
  for (idx = 0; idx < msa->nseq; idx++) {
    if (tr[idx] == NULL) continue; /* skip rare examples of empty sequences */
    if ((status = p7_trace_Count(hmm, msa->ax[idx], msa->wgt[idx], tr[idx])) != eslOK) goto ERROR;
  }

  hmm->nseq     = msa->nseq;
  hmm->eff_nseq = msa->nseq;

  /* Transfer annotation from the MSA to the new model
   */
  if ((status = annotate_model(hmm, matassign, msa)) != eslOK) goto ERROR;

  /* Reset #=RF line of alignment to reflect our assignment
   * of match, delete. matassign is valid from 1..alen and is off
   * by one from msa->rf.
   */
  if (msa->rf == NULL)  ESL_ALLOC(msa->rf, sizeof(char) * (msa->alen + 1));
  for (apos = 1; apos <= msa->alen; apos++)
    msa->rf[apos-1] = matassign[apos] ? 'x' : '.';
  msa->rf[msa->alen] = '\0';

  if (opt_tr  != NULL) *opt_tr  = tr; 
  else                  p7_trace_DestroyArray(tr, msa->nseq);
  *ret_hmm = hmm;
  return eslOK;

 ERROR:
  if (tr     != NULL) p7_trace_DestroyArray(tr, msa->nseq);
  if (hmm    != NULL) p7_hmm_Destroy(hmm);
  if (opt_tr != NULL) *opt_tr = NULL;
  *ret_hmm = NULL;
  return status;
}
Exemple #24
0
/* Function:  p7_CoreEmit()
 * Incept:    SRE, Tue Jan  9 10:20:51 2007 [Janelia]
 *
 * Purpose:   Generate (sample) a sequence from a core HMM <hmm>.
 *            
 *            Optionally return the sequence and/or its trace in <sq>
 *            and <tr>, respectively, which the caller has
 *            allocated. Having the caller provide these reusable
 *            objects allows re-use of both <sq> and <tr> in repeated
 *            calls, saving malloc/free wastage. Either can be passed
 *            as <NULL> if it isn't needed.
 *            
 *            This does not set any fields in the <sq> except for the
 *            sequence itself. Caller must set the name, and any other
 *            annotation it wants to add.
 *
 *            Trace is relative to the core model: it may include
 *            I_0 and I_M states, B->DD->M entry is explicit, and a
 *            0 length generated sequence is possible.
 *            
 * Args:      r     -  source of randomness
 *            hmm   -  core HMM to generate from
 *            sq    -  opt: digital sequence sampled (or NULL)
 *            tr    -  opt: trace sampled            (or NULL)
 *
 * Returns:   <eslOK> on success; 
 *            optionally return the digital sequence through <ret_sq>,
 *            and optionally return its trace in <ret_tr>.
 *
 * Throws:    <eslECORRUPT> if emission gets us into an illegal state, 
 *            probably indicating that a probability that should have
 *            been zero wasn't. 
 *
 *            Throws <eslEMEM> on a reallocation error.
 * 
 *            In these cases, the contents of <sq> and <tr> may be
 *            corrupted. Caller should not trust their data, but may
 *            safely reuse them.
 *
 * Xref:      STL11/124.
 */
int
p7_CoreEmit(ESL_RANDOMNESS *r, const P7_HMM *hmm, ESL_SQ *sq, P7_TRACE *tr)
{
  int       k   = 0;		/* position in model nodes 1..M */
  int       i   = 0;		/* position in sequence 1..L */
  char      st  = p7T_B;	/* state type */
  int       x;			/* sampled residue */
  int       status;

  if (sq != NULL) esl_sq_Reuse(sq);    
  if (tr != NULL) {
    if ((status = p7_trace_Reuse(tr))            != eslOK) goto ERROR;
    if ((status = p7_trace_Append(tr, st, k, i)) != eslOK) goto ERROR;
  }
  while (st != p7T_E)
    {
      /* Sample next state type, given current state type (and current k) */
      switch (st) {
      case p7T_B:
      case p7T_M:
	switch (esl_rnd_FChoose(r, hmm->t[k], 3)) {
	case 0:  st = p7T_M; break;
	case 1:  st = p7T_I; break;
	case 2:  st = p7T_D; break;
	default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible.");  	    
	}
	break;

      case p7T_I:
	switch (esl_rnd_FChoose(r, hmm->t[k]+3, 2)) {
	case 0: st = p7T_M; break;
	case 1: st = p7T_I; break;
	default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible.");  	    
	}
	break;

      case p7T_D:
	switch (esl_rnd_FChoose(r, hmm->t[k]+5, 2)) {
	case 0: st = p7T_M; break;
	case 1: st = p7T_D; break;
	default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible.");  	    
	}
	break;

      default: ESL_XEXCEPTION(eslECORRUPT, "impossible state reached during emission");
      }

      /* Bump k,i if needed, depending on new state type */
      if (st == p7T_M || st == p7T_D) k++;
      if (st == p7T_M || st == p7T_I) i++;

      /* a transit to M_M+1 is a transit to the E state */
      if (k == hmm->M+1) {
	if   (st == p7T_M) { st = p7T_E; k = 0; }
	else ESL_XEXCEPTION(eslECORRUPT, "failed to reach E state properly");
      }

      /* Sample new residue x if in match or insert */
      if      (st == p7T_M) x = esl_rnd_FChoose(r, hmm->mat[k], hmm->abc->K);
      else if (st == p7T_I) x = esl_rnd_FChoose(r, hmm->ins[k], hmm->abc->K);
      else                   x = eslDSQ_SENTINEL;

      /* Add state to trace */
      if (tr != NULL) {
	if ((status = p7_trace_Append(tr, st, k, i)) != eslOK) goto ERROR;
      }
      /* Add x to sequence */
      if (sq != NULL && x != eslDSQ_SENTINEL) 
	if ((status = esl_sq_XAddResidue(sq, x)) != eslOK) goto ERROR;
    }

  /* Terminate the trace and sequence (both are optional, remember) */
  if (tr != NULL) {  tr->M = hmm->M; tr->L = i; }
  if (sq != NULL && (status = esl_sq_XAddResidue(sq, eslDSQ_SENTINEL)) != eslOK) goto ERROR;
  return eslOK;

ERROR:
  return status;
}
Exemple #25
0
/* Function:  p7_ProfileEmit()
 * Synopsis:  Sample a sequence from the search form of the model.
 * Incept:    SRE, Mon Jan 22 10:23:28 2007 [Janelia]
 *
 * Purpose:   Sample a sequence from the implicit 
 *            probabilistic model of a Plan7 profile <gm>. This
 *            requires also having the core probabilities of
 *            the accompanying <hmm>, and the background 
 *            frequencies of null1 model <bg>.
 *            
 *            Optionally return the sequence and/or its trace in <sq>
 *            and <tr>, respectively. Caller has allocated space for
 *            both of these, though they may get reallocated/grown
 *            here. Either can be passed as <NULL> if unneeded.
 *            
 *            Only the sequence field is set in the <sq>. Caller must
 *            set the name, plus any other fields it wants to set. If
 *            the <sq> was created in digital mode, this is the <sq->dsq>;
 *            if the <sq> was created in text mode, this is <sq->seq>.
 *            
 *            <p7_ProfileEmit()> deliberately uses an <ESL_SQ> object
 *            instead of a plain <ESL_DSQ *> or <char *> string, to
 *            take advantage of the object's support for dynamic
 *            reallocation of seq length, and to allow both digital and
 *            text mode generation.
 *
 * Args:      r    - source of randomness
 *            hmm  - core probabilities of the profile
 *            gm   - configured search profile
 *            sq   - optRETURN: sampled sequence
 *            tr   - optRETURN: sampled trace
 *
 * Throws:    (no abnormal error conditions)
 */
int
p7_ProfileEmit(ESL_RANDOMNESS *r, const P7_HMM *hmm, const P7_PROFILE *gm, const P7_BG *bg, ESL_SQ *sq, P7_TRACE *tr)
{
  char      prv, st;		/* prev, current state type */
  int       k = 0;	        /* position in model nodes 1..M */
  int       i = 0;		/* position in sequence 1..L */
  int       x;			/* sampled residue */
  int       kend = hmm->M;      /* predestined end node */
  int       status;
  float     xt[p7P_NXSTATES][p7P_NXTRANS];

  /* Backcalculate the probabilities in the special states (loop and length model) */
  for (i = 0; i < p7P_NXSTATES; i++)
    for (x = 0; x < p7P_NXTRANS; x++)
      xt[i][x] = exp(gm->xsc[i][x]);

  if (sq != NULL) esl_sq_Reuse(sq);    
  if (tr != NULL) {
    if ((status = p7_trace_Reuse(tr))               != eslOK) goto ERROR;
    if ((status = p7_trace_Append(tr, p7T_S, k, i)) != eslOK) goto ERROR;
    if ((status = p7_trace_Append(tr, p7T_N, k, i)) != eslOK) goto ERROR;
  }
  st    = p7T_N;
  i     = 0;
  while (st != p7T_T)
    {
      /* Sample a state transition. After this section, prv and st (prev->current state) are set;
       * k also gets set if we make a B->Mk entry transition.
       */
      prv = st;
      switch (st) {
      case p7T_B:  
	if (p7_profile_IsLocal(gm)) 
	  { /* local mode: enter the implicit profile: choose our entry and our predestined exit */
	    if ((status = sample_endpoints(r, gm, &k, &kend)) != eslOK) goto ERROR;
	    st = p7T_M;		/* must be, because left wing is retracted */
	  }
	else
	  { /* glocal mode: treat B as M_0, use its transitions to MID. */
	    /* FIXME: this is wrong. It should sample from B->Mk distribution! */
	    switch (esl_rnd_FChoose(r, P7H_TMAT(hmm, 0), p7H_NTMAT)) {
	    case 0:  st = p7T_M; k = 1; break;
	    case 1:  st = p7T_I; k = 0; break;
	    case 2:  st = p7T_D; k = 1; break;
	    default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible.");  	    
	    }
	  }
	break;
	
      case p7T_M:
	if (k == kend) st = p7T_E; /* check our preordained fate */
	else {
	  switch (esl_rnd_FChoose(r, P7H_TMAT(hmm, k), p7H_NTMAT)) {
	  case 0:  st = p7T_M; break;
	  case 1:  st = p7T_I; break;
	  case 2:  st = p7T_D; break;
	  default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible.");  	    
	  }
	}
	break;

      case p7T_D:
	if (k == kend) st = p7T_E; 
	else           st = (esl_rnd_FChoose(r, P7H_TDEL(hmm, k), p7H_NTDEL) == 0) ? p7T_M : p7T_D; 
	break;

      case p7T_I: st = (esl_rnd_FChoose(r, P7H_TINS(hmm, k), p7H_NTINS) == 0)        ? p7T_M : p7T_I;  break;
      case p7T_N: st = (esl_rnd_FChoose(r, xt[p7P_N],     p7P_NXTRANS)  == p7P_MOVE) ? p7T_B : p7T_N;  break;
      case p7T_E: st = (esl_rnd_FChoose(r, xt[p7P_E],     p7P_NXTRANS)  == p7P_MOVE) ? p7T_C : p7T_J;  break;
      case p7T_C: st = (esl_rnd_FChoose(r, xt[p7P_C],     p7P_NXTRANS)  == p7P_MOVE) ? p7T_T : p7T_C;  break;
      case p7T_J: st = (esl_rnd_FChoose(r, xt[p7P_J],     p7P_NXTRANS)  == p7P_MOVE) ? p7T_B : p7T_J;  break;
      default:     ESL_XEXCEPTION(eslECORRUPT, "impossible state reached during emission");
      }
     
      /* Based on the transition we just sampled, update k. */
      if      (st == p7T_E)                 k = 0;
      else if (st == p7T_M && prv != p7T_B) k++;    /* be careful about B->Mk, where we already set k */
      else if (st == p7T_D)                 k++;

      /* Based on the transition we just sampled, generate a residue. */
      if      (st == p7T_M)                                            x = esl_rnd_FChoose(r, hmm->mat[k], hmm->abc->K);
      else if (st == p7T_I)                                            x = esl_rnd_FChoose(r, hmm->ins[k], hmm->abc->K);
      else if ((st == p7T_N || st == p7T_C || st == p7T_J) && prv==st) x = esl_rnd_FChoose(r, bg->f,       hmm->abc->K);
      else    x = eslDSQ_SENTINEL;

      if (x != eslDSQ_SENTINEL) i++;

      /* Add residue (if any) to sequence */
      if (sq != NULL && x != eslDSQ_SENTINEL && (status = esl_sq_XAddResidue(sq, x)) != eslOK) goto ERROR;

      /* Add state to trace. */
      if (tr != NULL) {
	if ((status = p7_trace_Append(tr, st, k, i)) != eslOK) goto ERROR;
      } 
    }
  /* Terminate the trace and sequence (both are optional, remember) */
  if (tr != NULL) {  tr->M = hmm->M; tr->L = i; }
  if (sq != NULL && (status = esl_sq_XAddResidue(sq, eslDSQ_SENTINEL)) != eslOK) goto ERROR;
  return eslOK;

 ERROR:
  return status;
}
Exemple #26
0
/* Function:  p7_alidisplay_Create()
 * Synopsis:  Create an alignment display, from trace and oprofile.
 * Incept:    SRE, Sun Dec 30 09:13:31 2007 [Janelia]
 *
 * Purpose:   Creates and returns an alignment display for domain number
 *            <which> in traceback <tr>, where the traceback
 *            corresponds to an alignment of optimized profile <om> to digital sequence
 *            <dsq>, and the unique name of that target
 *            sequence <dsq> is <sqname>. The <which> index starts at 0.
 *            
 *            It will be a little faster if the trace is indexed with
 *            <p7_trace_Index()> first. The number of domains is then
 *            in <tr->ndom>. If the caller wants to create alidisplays
 *            for all of these, it would loop <which> from
 *            <0..tr->ndom-1>.
 *           
 *            However, even without an index, the routine will work fine.
 *
 * Args:      tr     - traceback
 *            which  - domain number, 0..tr->ndom-1
 *            om     - optimized profile (query)
 *            sq     - digital sequence (target)
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <NULL> on allocation failure, or if something's internally corrupt 
 *            in the data.
 */
P7_ALIDISPLAY *
p7_alidisplay_Create(const P7_TRACE *tr, int which, const P7_OPROFILE *om, const ESL_SQ *sq)
{
  P7_ALIDISPLAY *ad       = NULL;
  char          *Alphabet = om->abc->sym;
  int            n, pos, z;
  int            z1,z2;
  int            k,x,i,s;
  int            hmm_namelen, hmm_acclen, hmm_desclen;
  int            sq_namelen,  sq_acclen,  sq_desclen;
  int            status;
  
  /* First figure out which piece of the trace (from first match to last match) 
   * we're going to represent, and how big it is.
   */
  if (tr->ndom > 0) {		/* if we have an index, this is a little faster: */
    for (z1 = tr->tfrom[which]; z1 < tr->N; z1++) if (tr->st[z1] == p7T_M) break;  /* find next M state      */
    if (z1 == tr->N) return NULL;                                                  /* no M? corrupt trace    */
    for (z2 = tr->tto[which];   z2 >= 0 ;   z2--) if (tr->st[z2] == p7T_M) break;  /* find prev M state      */
    if (z2 == -1) return NULL;                                                     /* no M? corrupt trace    */
  } else {			/* without an index, we can still do it fine:    */
    for (z1 = 0; which >= 0 && z1 < tr->N; z1++) if (tr->st[z1] == p7T_B) which--; /* find the right B state */
    if (z1 == tr->N) return NULL;                                                  /* no such domain <which> */
    for (; z1 < tr->N; z1++) if (tr->st[z1] == p7T_M) break;                       /* find next M state      */
    if (z1 == tr->N) return NULL;                                                  /* no M? corrupt trace    */
    for (z2 = z1; z2 < tr->N; z2++) if (tr->st[z2] == p7T_E) break;                /* find the next E state  */
    for (; z2 >= 0;    z2--) if (tr->st[z2] == p7T_M) break;                       /* find prev M state      */
    if (z2 == -1) return NULL;                                                     /* no M? corrupt trace    */
  }

  /* Now we know that z1..z2 in the trace will be represented in the
   * alidisplay; that's z2-z1+1 positions. We need a \0 trailer on all
   * our display lines, so allocate z2-z1+2. We know each position is
   * M, D, or I, so there's a 1:1 correspondence of trace positions
   * with alignment display positions.  We also know the display
   * starts and ends with M states.
   * 
   * So now let's allocate. The alidisplay is packed into a single
   * memory space, so this appears to be intricate, but it's just
   * bookkeeping.  
   */
  n = (z2-z1+2) * 3;                     /* model, mline, aseq mandatory         */
  if (om->rf[0]  != 0)    n += z2-z1+2;  /* optional reference line              */
  if (om->cs[0]  != 0)    n += z2-z1+2;  /* optional structure line              */
  if (tr->pp     != NULL) n += z2-z1+2;  /* optional posterior prob line         */
  hmm_namelen = strlen(om->name);                           n += hmm_namelen + 1;
  hmm_acclen  = (om->acc  != NULL ? strlen(om->acc)  : 0);  n += hmm_acclen  + 1;
  hmm_desclen = (om->desc != NULL ? strlen(om->desc) : 0);  n += hmm_desclen + 1;
  sq_namelen  = strlen(sq->name);                           n += sq_namelen  + 1;
  sq_acclen   = strlen(sq->acc);                            n += sq_acclen   + 1; /* sq->acc is "\0" when unset */
  sq_desclen  = strlen(sq->desc);                           n += sq_desclen  + 1; /* same for desc              */
  
  ESL_ALLOC(ad, sizeof(P7_ALIDISPLAY));
  ad->mem = NULL;

  pos = 0; 
  ad->memsize = sizeof(char) * n;
  ESL_ALLOC(ad->mem, ad->memsize);
  if (om->rf[0]  != 0) { ad->rfline = ad->mem + pos; pos += z2-z1+2; } else { ad->rfline = NULL; }
  if (om->cs[0]  != 0) { ad->csline = ad->mem + pos; pos += z2-z1+2; } else { ad->csline = NULL; }
  ad->model   = ad->mem + pos;  pos += z2-z1+2;
  ad->mline   = ad->mem + pos;  pos += z2-z1+2;
  ad->aseq    = ad->mem + pos;  pos += z2-z1+2;
  if (tr->pp != NULL)  { ad->ppline = ad->mem + pos;  pos += z2-z1+2;} else { ad->ppline = NULL; }
  ad->hmmname = ad->mem + pos;  pos += hmm_namelen +1;
  ad->hmmacc  = ad->mem + pos;  pos += hmm_acclen +1;
  ad->hmmdesc = ad->mem + pos;  pos += hmm_desclen +1;
  ad->sqname  = ad->mem + pos;  pos += sq_namelen +1;
  ad->sqacc   = ad->mem + pos;  pos += sq_acclen +1;
  ad->sqdesc  = ad->mem + pos;  pos += sq_desclen +1;

  strcpy(ad->hmmname, om->name);
  if (om->acc  != NULL) strcpy(ad->hmmacc,  om->acc);  else ad->hmmacc[0]  = 0;
  if (om->desc != NULL) strcpy(ad->hmmdesc, om->desc); else ad->hmmdesc[0] = 0;
  strcpy(ad->sqname,  sq->name);
  strcpy(ad->sqacc,   sq->acc);
  strcpy(ad->sqdesc,  sq->desc);

  /* Determine hit coords */
  ad->hmmfrom = tr->k[z1];
  ad->hmmto   = tr->k[z2];
  ad->M       = om->M;
  ad->sqfrom  = tr->i[z1];
  ad->sqto    = tr->i[z2];
  ad->L       = sq->n;

  /* optional rf line */
  if (ad->rfline != NULL) {
    for (z = z1; z <= z2; z++) ad->rfline[z-z1] = ((tr->st[z] == p7T_I) ? '.' : om->rf[tr->k[z]]);
    ad->rfline[z-z1] = '\0';
  }

  /* optional cs line */
  if (ad->csline != NULL) {
    for (z = z1; z <= z2; z++) ad->csline[z-z1] = ((tr->st[z] == p7T_I) ? '.' : om->cs[tr->k[z]]);
    ad->csline[z-z1] = '\0';
  }

  /* optional pp line */
  if (ad->ppline != NULL) {
    for (z = z1; z <= z2; z++) ad->ppline[z-z1] = ( (tr->st[z] == p7T_D) ? '.' : p7_alidisplay_EncodePostProb(tr->pp[z]));
    ad->ppline[z-z1] = '\0';
  }

  /* mandatory three alignment display lines: model, mline, aseq */
  for (z = z1; z <= z2; z++) 
    {
      k = tr->k[z];
      i = tr->i[z];
      x = sq->dsq[i];
      s = tr->st[z];

      switch (s) {
      case p7T_M:
	ad->model[z-z1] = om->consensus[k]; 
	if      (x == esl_abc_DigitizeSymbol(om->abc, om->consensus[k])) ad->mline[z-z1] = ad->model[z-z1];
	else if (p7_oprofile_FGetEmission(om, k, x) > 1.0)               ad->mline[z-z1] = '+'; /* >1 not >0; om has odds ratios, not scores */
	else                                                             ad->mline[z-z1] = ' ';
	ad->aseq  [z-z1] = toupper(Alphabet[x]);
	break;
	
      case p7T_I:
	ad->model [z-z1] = '.';
	ad->mline [z-z1] = ' ';
	ad->aseq  [z-z1] = tolower(Alphabet[x]);
	break;
	
      case p7T_D:
	ad->model [z-z1] = om->consensus[k]; 
	ad->mline [z-z1] = ' ';
	ad->aseq  [z-z1] = '-';
	break;

      default: ESL_XEXCEPTION(eslEINVAL, "invalid state in trace: not M,D,I");
      }
    }
  ad->model [z2-z1+1] = '\0';
  ad->mline [z2-z1+1] = '\0';
  ad->aseq  [z2-z1+1] = '\0';
  ad->N = z2-z1+1;
  return ad;

 ERROR:
  p7_alidisplay_Destroy(ad);
  return NULL;
}
Exemple #27
0
/* Function:  p7_alidisplay_Backconvert()
 * Synopsis:  Convert an alidisplay to a faux trace and subsequence.
 * Incept:    SRE, Wed Dec 10 09:49:28 2008 [Janelia]
 *
 * Purpose:   Convert alignment display object <ad> to a faux subsequence
 *            and faux subsequence trace, returning them in <ret_sq> and
 *            <ret_tr>. 
 *            
 *            The subsequence <*ret_sq> is digital; ascii residues in
 *            <ad> are digitized using digital alphabet <abc>.
 *            
 *            The subsequence and trace are suitable for passing as
 *            array elements to <p7_MultipleAlignment>. This is the
 *            main purpose of backconversion. Results of a profile
 *            search are stored in a hit list as a processed
 *            <P7_ALIDISPLAY>, not as a <P7_TRACE> and <ESL_SQ>, to
 *            reduce space and to reduce communication overhead in
 *            parallelized search implementations. After reduction
 *            to a final hit list, a master may want to construct a
 *            multiple alignment of all the significant hits. 
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation failures. <eslECORRUPT> on unexpected internal
 *            data corruption. On any exception, <*ret_sq> and <*ret_tr> are
 *            <NULL>.
 *
 * Xref:      J4/29.
 */
int
p7_alidisplay_Backconvert(const P7_ALIDISPLAY *ad, const ESL_ALPHABET *abc, ESL_SQ **ret_sq, P7_TRACE **ret_tr)
{
  ESL_SQ   *sq   = NULL;	/* RETURN: faux subsequence          */
  P7_TRACE *tr   = NULL;	/* RETURN: faux trace                */
  int       subL = 0;		/* subsequence length in the <ad>    */
  int       a, i, k;        	/* coords for <ad>, <sq->dsq>, model */
  char      st;			/* state type: MDI                   */
  int       status;
  
  /* Make a first pass over <ad> just to calculate subseq length */
  for (a = 0; a < ad->N; a++)
    if (! esl_abc_CIsGap(abc, ad->aseq[a])) subL++;

  /* Allocations */
  if ((sq = esl_sq_CreateDigital(abc)) == NULL)   { status = eslEMEM; goto ERROR; }
  if ((status = esl_sq_GrowTo(sq, subL)) != eslOK) goto ERROR;

  if ((tr = (ad->ppline == NULL) ?  p7_trace_Create() : p7_trace_CreateWithPP()) == NULL) { status = eslEMEM; goto ERROR; }
  if ((status = p7_trace_GrowTo(tr, subL+6)) != eslOK) goto ERROR;   /* +6 is for SNB/ECT */
  
  /* Construction of dsq, trace */
  sq->dsq[0] = eslDSQ_SENTINEL;
  if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_S, 0, 0) : p7_trace_AppendWithPP(tr, p7T_S, 0, 0, 0.0))) != eslOK) goto ERROR;
  if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_N, 0, 0) : p7_trace_AppendWithPP(tr, p7T_N, 0, 0, 0.0))) != eslOK) goto ERROR;
  if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_B, 0, 0) : p7_trace_AppendWithPP(tr, p7T_B, 0, 0, 0.0))) != eslOK) goto ERROR;
  k = ad->hmmfrom;
  i = 1; 
  for (a = 0; a < ad->N; a++)
    {
      if (esl_abc_CIsResidue(abc, ad->model[a])) { st = (esl_abc_CIsResidue(abc, ad->aseq[a]) ? p7T_M : p7T_D); } else st = p7T_I;

      if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, st, k, i) : p7_trace_AppendWithPP(tr, st, k, i, p7_alidisplay_DecodePostProb(ad->ppline[a])))) != eslOK) goto ERROR;

      switch (st) {
      case p7T_M: sq->dsq[i] = esl_abc_DigitizeSymbol(abc, ad->aseq[a]); k++; i++; break;
      case p7T_I: sq->dsq[i] = esl_abc_DigitizeSymbol(abc, ad->aseq[a]);      i++; break;
      case p7T_D:                                                        k++;      break;
      }
    }
  if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_E, 0, 0) : p7_trace_AppendWithPP(tr, p7T_E, 0, 0, 0.0))) != eslOK) goto ERROR;
  if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_C, 0, 0) : p7_trace_AppendWithPP(tr, p7T_C, 0, 0, 0.0))) != eslOK) goto ERROR;
  if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_T, 0, 0) : p7_trace_AppendWithPP(tr, p7T_T, 0, 0, 0.0))) != eslOK) goto ERROR;
  sq->dsq[i] = eslDSQ_SENTINEL;

  /* some sanity checks */
  if (tr->N != ad->N + 6)      ESL_XEXCEPTION(eslECORRUPT, "backconverted trace ended up with unexpected size (%s/%s)",         ad->sqname, ad->hmmname);
  if (k     != ad->hmmto + 1)  ESL_XEXCEPTION(eslECORRUPT, "backconverted trace didn't end at expected place on model (%s/%s)", ad->sqname, ad->hmmname);
  if (i     != subL + 1)       ESL_XEXCEPTION(eslECORRUPT, "backconverted subseq didn't end at expected length (%s/%s)",        ad->sqname, ad->hmmname);

  /* Set up <sq> annotation as a subseq of a source sequence */
  if ((status = esl_sq_FormatName(sq, "%s/%ld-%ld", ad->sqname, ad->sqfrom, ad->sqto))                      != eslOK) goto ERROR;
  if ((status = esl_sq_FormatDesc(sq, "[subseq from] %s", ad->sqdesc[0] != '\0' ? ad->sqdesc : ad->sqname)) != eslOK) goto ERROR;
  if ((status = esl_sq_SetSource (sq, ad->sqname))                                                          != eslOK) goto ERROR;
  if (ad->sqacc[0]  != '\0') { if ((status = esl_sq_SetAccession  (sq, ad->sqacc)) != eslOK) goto ERROR; }
  sq->n     = subL;
  sq->start = ad->sqfrom;
  sq->end   = ad->sqto;
  sq->C     = 0;
  sq->W     = subL;
  sq->L     = ad->L;
  
  tr->M     = ad->M;
  tr->L     = ad->L;

  *ret_sq = sq;
  *ret_tr = tr;
  return eslOK;

 ERROR:
  if (sq != NULL) esl_sq_Destroy(sq);
  if (tr != NULL) p7_trace_Destroy(tr);
  *ret_sq = NULL;
  *ret_tr = NULL;
  return status;
}
/* Function:  p7_GStochasticTrace()
 * Synopsis:  Stochastic traceback of a Forward matrix.
 * Incept:    SRE, Thu Jan  3 15:39:20 2008 [Janelia]
 *
 * Purpose:   Stochastic traceback of Forward matrix <gx> to
 *            sample an alignment of digital sequence <dsq>
 *            (of length <L>) to the profile <gm>. 
 *            
 *            The sampled traceback is returned in <tr>, which the
 *            caller must have at least made an initial allocation of
 *            (the <tr> will be grown as needed here).
 *
 * Args:      r      - source of random numbers
 *            dsq    - digital sequence aligned to, 1..L 
 *            L      - length of dsq
 *            gm     - profile
 *            mx     - Forward matrix to trace, L x M
 *            tr     - storage for the recovered traceback.
 *
 * Returns:   <eslOK> on success.
 */
int
p7_GStochasticTrace(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, const P7_GMX *gx, P7_TRACE *tr)
{
  int     status;
  int     i;			/* position in seq (1..L) */
  int     k;			/* position in model (1..M) */
  int     M   = gm->M;
  float **dp  = gx->dp;
  float  *xmx = gx->xmx;
  float const *tsc  = gm->tsc;
  float  *sc;			/* scores of possible choices: up to 2M-1, in the case of exits to E  */
  int     scur, sprv;

  /* we'll index M states as 1..M, and D states as 2..M = M+2..2M: M0, D1 are impossibles. */
  ESL_ALLOC(sc, sizeof(float) * (2*M+1)); 

  k = 0;
  i = L;			
  if ((status = p7_trace_Append(tr, p7T_T, k, i)) != eslOK) goto ERROR;
  if ((status = p7_trace_Append(tr, p7T_C, k, i)) != eslOK) goto ERROR;
  sprv = p7T_C;
  while (sprv != p7T_S) 
    {
      switch (tr->st[tr->N-1]) {
      /* C(i) comes from C(i-1) or E(i) */
      case p7T_C:		
	if   (XMX(i,p7G_C) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible C reached at i=%d", i);

	sc[0] = XMX(i-1, p7G_C) + gm->xsc[p7P_C][p7P_LOOP];
	sc[1] = XMX(i,   p7G_E) + gm->xsc[p7P_E][p7P_MOVE];
	esl_vec_FLogNorm(sc, 2); 
	scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_C : p7T_E;
	break;

      /* E connects from any M or D state. k set here */
      case p7T_E:	
	if (XMX(i, p7G_E) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible E reached at i=%d", i);
	
	if (p7_profile_IsLocal(gm)) { /* local models come from any M, D */
	  sc[0] = sc[M+1] = -eslINFINITY;
	  for (k = 1; k <= M; k++) sc[k]   = MMX(i,k);
	  for (k = 2; k <= M; k++) sc[k+M] = DMX(i,k);
	  esl_vec_FLogNorm(sc, 2*M+1); /* now sc is a prob vector */
	  k = esl_rnd_FChoose(r, sc, 2*M+1);
	  if (k <= M)    scur = p7T_M;
	  else { k -= M; scur = p7T_D; }
	} else { 		/* glocal models come from M_M or D_M  */
	  k     = M;
	  sc[0] = MMX(i,M);
	  sc[1] = DMX(i,M);
	  esl_vec_FLogNorm(sc, 2); /* now sc is a prob vector */
	  scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_D;
	}
	break;

      /* M connects from {MDI} i-1,k-1, or B */
      case p7T_M:
	if (MMX(i,k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible M reached at k=%d,i=%d", k,i);
	
	sc[0] = XMX(i-1,p7G_B) + TSC(p7P_BM, k-1);
	sc[1] = MMX(i-1,k-1)   + TSC(p7P_MM, k-1);
	sc[2] = IMX(i-1,k-1)   + TSC(p7P_IM, k-1);
	sc[3] = DMX(i-1,k-1)   + TSC(p7P_DM, k-1);
	esl_vec_FLogNorm(sc, 4); 
	switch (esl_rnd_FChoose(r, sc, 4)) {
	case 0: scur = p7T_B;   break;
	case 1: scur = p7T_M;   break;
	case 2: scur = p7T_I;   break;
	case 3: scur = p7T_D;   break;
	}
	k--; 
	i--;
	break;

      /* D connects from M,D at i,k-1 */
      case p7T_D:
	if (DMX(i, k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible D reached at k=%d,i=%d", k,i);

	sc[0] = MMX(i, k-1) + TSC(p7P_MD, k-1);
	sc[1] = DMX(i, k-1) + TSC(p7P_DD, k-1);
	esl_vec_FLogNorm(sc, 2); 
	scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_D;
	k--;
	break;

      /* I connects from M,I at i-1,k */
      case p7T_I:
	if (IMX(i,k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible I reached at k=%d,i=%d", k,i);
	
	sc[0] = MMX(i-1,k) + TSC(p7P_MI, k);
	sc[1] = IMX(i-1,k) + TSC(p7P_II, k);
	esl_vec_FLogNorm(sc, 2); 
	scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_I;
	i--;
	break;

      /* N connects from S, N */
      case p7T_N:
	if (XMX(i, p7G_N) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible N reached at i=%d", i);
	scur = (i == 0) ? p7T_S : p7T_N;
	break;

      /* B connects from N, J */
      case p7T_B:			
	if (XMX(i,p7G_B) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible B reached at i=%d", i);

	sc[0] = XMX(i, p7G_N) + gm->xsc[p7P_N][p7P_MOVE];
	sc[1] = XMX(i, p7G_J) + gm->xsc[p7P_J][p7P_MOVE];
	esl_vec_FLogNorm(sc, 2); 
	scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_N : p7T_J;
	break;

      /* J connects from E(i) or J(i-1) */
      case p7T_J:	
	if (XMX(i,p7G_J) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible J reached at i=%d", i);
	
	sc[0] = XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP];
	sc[1] = XMX(i,  p7G_E) + gm->xsc[p7P_E][p7P_LOOP];
	esl_vec_FLogNorm(sc, 2); 
	scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_J : p7T_E;
	break;

      default: ESL_XEXCEPTION(eslFAIL, "bogus state in traceback");
      } /* end switch over statetype[tpos-1] */

      /* Append this state and the current i,k to be explained to the growing trace */
      if ((status = p7_trace_Append(tr, scur, k, i)) != eslOK) goto ERROR;

      /* For NCJ, we had to defer i decrement. */
      if ( (scur == p7T_N || scur == p7T_J || scur == p7T_C) && scur == sprv) i--;

      sprv = scur;
    } /* end traceback, at S state */

  if ((status = p7_trace_Reverse(tr)) != eslOK) goto ERROR;
  tr->M = gm->M;
  tr->L = L;
  free(sc);
  return eslOK;

 ERROR:
  if (sc != NULL) free(sc);
  return status;
}
Exemple #29
0
/* Function:  p7_oprofile_MPIPackSize()
 * Synopsis:  Calculates size needed to pack an OPROFILE.
 * Incept:    MSF, Wed Oct 21, 2009 [Janelia]
 *
 * Purpose:   Calculate an upper bound on the number of bytes
 *            that <p7_oprofile_MPIPack()> will need to pack an 
 *            OPROFILE <om> in a packed MPI message for MPI 
 *            communicator <comm>; return that number of bytes
 *            in <*ret_n>.
 *
 * Returns:   <eslOK> on success, and <*ret_n> contains the answer.
 *
 * Throws:    <eslESYS> if an MPI call fails, and <*ret_n> is 0.
 */
int
p7_oprofile_MPIPackSize(P7_OPROFILE *om, MPI_Comm comm, int *ret_n)
{
  int   status;
  int   n = 0;
  int   K = om->abc->Kp;
  int   len = 0;
  int   cnt;
  int   sz;

  int   Q4  = p7O_NQF(om->M);
  int   Q8  = p7O_NQW(om->M);
  int   Q16 = p7O_NQB(om->M);
  int   vsz = sizeof(vector float);

  /* MSV Filter information */
  if (MPI_Pack_size(5,          MPI_CHAR, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += sz;
  if (MPI_Pack_size(1,         MPI_FLOAT, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += sz;
  if (MPI_Pack_size(vsz*Q16,    MPI_CHAR, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += (K*sz);

  /* Viterbi Filter information */
  if (MPI_Pack_size(1,         MPI_SHORT, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += ((p7O_NXSTATES*p7O_NXTRANS+2)*sz);
  if (MPI_Pack_size(2,         MPI_FLOAT, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += sz;
  if (MPI_Pack_size(K*vsz*Q8,   MPI_CHAR, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += sz;
  if (MPI_Pack_size(8*vsz*Q8,   MPI_CHAR, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += sz;

  /* Forward/Backward information */
  if (MPI_Pack_size(1,         MPI_FLOAT, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += (p7O_NXSTATES*p7O_NXTRANS*sz);
  if (MPI_Pack_size(K*vsz*Q4,   MPI_CHAR, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += sz;
  if (MPI_Pack_size(8*vsz*Q4,   MPI_CHAR, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += sz;

  /* disk offsets */
  if (MPI_Pack_size(1, MPI_LONG_LONG_INT, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += ((p7_NOFFSETS+2)*sz);

  /* annotation info */
  if (om->name      != NULL) len += strlen(om->name)      + 1;
  if (om->acc       != NULL) len += strlen(om->acc)       + 1;
  if (om->desc      != NULL) len += strlen(om->desc)      + 1;
  if (om->rf        != NULL) len += strlen(om->rf)        + 1;
  if (om->cs        != NULL) len += strlen(om->cs)        + 1;
  if (om->consensus != NULL) len += strlen(om->consensus) + 1;
  if (MPI_Pack_size(6,           MPI_INT, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += sz;
  if (MPI_Pack_size(len,        MPI_CHAR, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += sz;
  cnt = p7_NEVPARAM + p7_NCUTOFFS + p7_MAXABET;
  if (MPI_Pack_size(cnt,       MPI_FLOAT, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += sz;

  /* current model size */
  if (MPI_Pack_size(4,           MPI_INT, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += sz;
  if (MPI_Pack_size(1,         MPI_FLOAT, comm, &sz) != 0) ESL_XEXCEPTION(eslESYS, "pack size failed");   n += sz;

  *ret_n = n;
  return eslOK;

 ERROR:
  *ret_n = 0;
  return status;

}
Exemple #30
0
/* Function:  p7_oprofile_MPIUnpack()
 * Synopsis:  Unpacks an OPROFILE from an MPI buffer.
 * Incept:    MSF, Wed Oct 21, 2009 [Janelia]
 *
 * Purpose:   Unpack a newly allocated OPROFILE from MPI packed buffer
 *            <buf>, starting from position <*pos>, where the total length
 *            of the buffer in bytes is <n>. 
 *            
 *            Caller may or may not already know what alphabet the OPROFILE
 *            is expected to be in.  A reference to the current
 *            alphabet is passed in <abc>. If the alphabet is unknown,
 *            pass <*abc = NULL>, and when the OPROFILE is received, an
 *            appropriate new alphabet object is allocated and passed
 *            back to the caller via <*abc>.  If the alphabet is
 *            already known, <*abc> is that alphabet, and the new
 *            OPROFILE's alphabet type is verified to agree with it. This
 *            mechanism allows an application to let the first OPROFILE
 *            determine the alphabet type for the application, while
 *            still keeping the alphabet under the application's scope
 *            of control.
 *
 * Returns:   <eslOK> on success. <*pos> is updated to the position of
 *            the next element in <buf> to unpack (if any). <*ret_om>
 *            contains a newly allocated OPROFILE, which the caller is
 *            responsible for free'ing.  If <*abc> was passed as
 *            <NULL>, it now points to an <ESL_ALPHABET> object that
 *            was allocated here; caller is responsible for free'ing
 *            this.
 *            
 *            Returns <eslEINCOMPAT> if the OPROFILE is in a different
 *            alphabet than <*abc> said to expect. In this case,
 *            <*abc> is unchanged, <*buf> and <*nalloc> may have been
 *            changed, and <*ret_om> is <NULL>.
 *            
 * Throws:    <eslESYS> on an MPI call failure. <eslEMEM> on allocation failure.
 *            In either case, <*ret_om> is <NULL>, and the state of <buf>
 *            and <*pos> is undefined and should be considered to be corrupted.
 */
int
p7_oprofile_MPIUnpack(char *buf, int n, int *pos, MPI_Comm comm, ESL_ALPHABET **abc, P7_OPROFILE **ret_om)
{
  int   status;
  int   M, K, atype;
  int   len;
  int   x;

  int   Q4, Q8, Q16;
  int   vsz = sizeof(vector float);

  P7_OPROFILE *om = NULL;

  if (MPI_Unpack(buf, n, pos, &M,                1,                      MPI_INT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &atype,            1,                      MPI_INT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");

  /* Set or verify the alphabet */
  if (*abc == NULL)	{	/* still unknown: set it, pass control of it back to caller */
    if ((*abc = esl_alphabet_Create(atype)) == NULL)       { status = eslEMEM;      goto ERROR; }
  } else {			/* already known: check it */
    if ((*abc)->type != atype)                             { status = eslEINCOMPAT; goto ERROR; }
  }

  Q4  = p7O_NQF(M);
  Q8  = p7O_NQW(M);
  Q16 = p7O_NQB(M);

  if ((om = p7_oprofile_Create(M, *abc)) == NULL) { status = eslEMEM; goto ERROR;    }
  om->M = M;

  K = (*abc)->Kp;

  /* model configuration */
  if (MPI_Unpack(buf, n, pos, &om->L,            1,                      MPI_INT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->mode,         1,                      MPI_INT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->nj,           1,                    MPI_FLOAT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");

  /* MSV Filter information */
  if (MPI_Unpack(buf, n, pos, &om->tbm_b,        1,                     MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->tec_b,        1,                     MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->tjb_b,        1,                     MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->scale_b,      1,                    MPI_FLOAT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->base_b,       1,                     MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->bias_b,       1,                     MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  for (x = 0; x < K; x++)
    if (MPI_Unpack(buf, n, pos,  om->rbv[x],     vsz*Q16,               MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");

  /* Viterbi Filter information */
  if (MPI_Unpack(buf, n, pos, &om->scale_w,      1,                    MPI_FLOAT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->base_w,       1,                    MPI_SHORT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->ddbound_w,    1,                    MPI_SHORT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->ncj_roundoff, 1,                    MPI_FLOAT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos,  om->twv,          8*vsz*Q8,              MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  for (x = 0; x < p7O_NXSTATES; x++)
    if (MPI_Unpack(buf, n, pos,  om->xw[x],      p7O_NXTRANS,          MPI_SHORT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  for (x = 0; x < K; x++)
    if (MPI_Unpack(buf, n, pos,  om->rwv[x],     vsz*Q8,                MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");

  /* Forward/Backward information */
  if (MPI_Unpack(buf, n, pos,  om->tfv,          8*vsz*Q4,              MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  for (x = 0; x < p7O_NXSTATES; x++)
    if (MPI_Unpack(buf, n, pos,  om->xf[x],      p7O_NXTRANS,          MPI_FLOAT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  for (x = 0; x < K; x++)
    if (MPI_Unpack(buf, n, pos,  om->rfv[x],     vsz*Q4,                MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");

  /* Forward/Backward information */
  if (MPI_Unpack(buf, n, pos,  om->offs,         p7_NOFFSETS,  MPI_LONG_LONG_INT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->roff,         1,            MPI_LONG_LONG_INT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->eoff,         1,            MPI_LONG_LONG_INT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");

  /* Annotation information */
  if (MPI_Unpack(buf, n, pos, &len,              1,                      MPI_INT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (len > 0) {
    ESL_ALLOC(om->name, len);
    if (MPI_Unpack(buf, n, pos,  om->name,       len,                   MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
    om->name[len-1] = '\0';
  }
  if (MPI_Unpack(buf, n, pos, &len,              1,                      MPI_INT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (len > 0) {
    ESL_ALLOC(om->acc, len);
    if (MPI_Unpack(buf, n, pos,  om->acc,        len,                   MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
    om->acc[len-1] = '\0';
  }
  if (MPI_Unpack(buf, n, pos, &len,              1,                      MPI_INT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (len > 0) {
    ESL_ALLOC(om->desc, len);
    if (MPI_Unpack(buf, n, pos,  om->desc,       len,                   MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
    om->desc[len-1] = '\0';
  }
  if (MPI_Unpack(buf, n, pos, &len,              1,                      MPI_INT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (len > 0) {
    ESL_ALLOC(om->rf, len);
    if (MPI_Unpack(buf, n, pos,  om->rf,         len,                   MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
    om->rf[len-1] = '\0';
  }
  if (MPI_Unpack(buf, n, pos, &len,              1,                      MPI_INT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (len > 0) {
    ESL_ALLOC(om->cs, len);
    if (MPI_Unpack(buf, n, pos,  om->cs,         len,                   MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
    om->cs[len-1] = '\0';
  }
  if (MPI_Unpack(buf, n, pos, &len,              1,                      MPI_INT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (len > 0) {
    ESL_ALLOC(om->consensus, len);
    if (MPI_Unpack(buf, n, pos,  om->consensus,  len,                   MPI_CHAR, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
    om->consensus[len-1] = '\0';
  }

  if (MPI_Unpack(buf, n, pos,  om->evparam,      p7_NEVPARAM,          MPI_FLOAT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos,  om->cutoff,       p7_NCUTOFFS,          MPI_FLOAT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos,  om->compo,        p7_MAXABET,           MPI_FLOAT, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi unpack failed");

  *ret_om = om;
  return eslOK;

 ERROR:
  if (om != NULL) p7_oprofile_Destroy(om);
  return status;
}