/* Function:  esl_dst_CJukesCantor()
 * Synopsis:  Jukes-Cantor distance for two aligned strings.
 * Incept:    SRE, Tue Apr 18 14:00:37 2006 [St. Louis]
 *
 * Purpose:   Calculate the generalized Jukes-Cantor distance between
 *            two aligned character strings <as1> and <as2>, in
 *            substitutions/site, for an alphabet of <K> residues
 *            (<K=4> for nucleic acid, <K=20> for proteins). The
 *            maximum likelihood estimate for the distance is
 *            optionally returned in <opt_distance>. The large-sample
 *            variance for the distance estimate is
 *            optionally returned in <opt_variance>.
 *
 *            Alphabetic symbols <[a-zA-Z]> are compared
 *            case-insensitively to count the number of identities
 *            (<n1>) and mismatches (<n2>). Any nonalphabetic
 *            character is assumed to be a gap symbol, and aligned
 *            columns containing gap symbols are ignored. The
 *            fractional difference <D> used to calculate the
 *            Jukes/Cantor distance is <n2/(n1+n2)>.
 *
 * Args:      K            - size of the alphabet (4 or 20)
 *            as1          - 1st aligned seq, 0..L-1, \0-terminated
 *            as2          - 2nd aligned seq, 0..L-1, \0-terminated
 *            opt_distance - optRETURN: ML estimate of distance d
 *            opt_variance - optRETURN: large-sample variance of d
 *
 * Returns:   <eslOK> on success.
 *
 *            Infinite distances are possible, in which case distance
 *            and variance are both <HUGE_VAL>. Caller has to deal
 *            with this case as it sees fit, perhaps by enforcing
 *            an arbitrary maximum distance.
 *
 * Throws:    <eslEINVAL> if the two strings aren't the same length (and
 *            thus can't have been properly aligned); distance and
 *            variance are then set to <HUGE_VAL> here.
 *            <eslEDIVZERO> if no aligned residues were counted; that
 *            case is detected inside jukescantor(), whose return
 *            status (and output values) are passed through unchanged.
 */
int
esl_dst_CJukesCantor(int K, const char *as1, const char *as2,
                     double *opt_distance, double *opt_variance)
{
  int status;
  int n1 = 0;                   /* number of observed identities    */
  int n2 = 0;                   /* number of observed substitutions */
  int i;                        /* position in aligned seqs         */

  /* 1. Count identities, mismatches. */
  for (i = 0; as1[i] != '\0' && as2[i] != '\0'; i++)
    {
      /* The <ctype.h> classifiers are only defined for EOF and values
       * representable as unsigned char; a plain char may be negative.
       */
      unsigned char c1 = (unsigned char) as1[i];
      unsigned char c2 = (unsigned char) as2[i];

      if (isalpha(c1) && isalpha(c2))
        {
          if (toupper(c1) == toupper(c2)) n1++;
          else                            n2++;
        }
    }

  /* The loop stops at the first terminator; both must end together. */
  if (as1[i] != '\0' || as2[i] != '\0')
    ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");

  /* 2. Convert the counts to a distance; can throw eslEDIVZERO. */
  return jukescantor(n1, n2, K, opt_distance, opt_variance);

 ERROR:
  if (opt_distance != NULL) *opt_distance = HUGE_VAL;
  if (opt_variance != NULL) *opt_variance = HUGE_VAL;
  return status;
}
/* Function: p7_Handmodelmaker() * * Purpose: Manual model construction. * Construct an HMM from a digital alignment, where the * <#=RF> line of the alignment file is used to indicate the * columns assigned to matches vs. inserts. * * The <msa> must be in digital mode, and it must have * a reference annotation line. * * NOTE: <p7_Handmodelmaker()> will slightly revise the * alignment if necessary, if the assignment of columns * implies DI and ID transitions. * * Returns both the HMM in counts form (ready for applying * Dirichlet priors as the next step), and fake tracebacks * for each aligned sequence. * * Models must have at least one node, so if the <msa> defined * no consensus columns, a <eslENORESULT> error is returned. * * Args: msa - multiple sequence alignment * bld - holds information on regions requiring masking, optionally NULL -> no masking * ret_hmm - RETURN: counts-form HMM * opt_tr - optRETURN: array of tracebacks for aseq's * * Return: <eslOK> on success. <ret_hmm> and <opt_tr> are allocated * here, and must be free'd by caller. * * Returns <eslENORESULT> if no consensus columns were annotated; * in this case, <ret_hmm> and <opt_tr> are returned NULL. * * Returns <eslEFORMAT> if the <msa> doesn't have a reference * annotation line. * * Throws: <eslEMEM> on allocation failure. Throws <eslEINVAL> if the <msa> * isn't in digital mode. */ int p7_Handmodelmaker(ESL_MSA *msa, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***opt_tr) { int status; int *matassign = NULL; /* MAT state assignments if 1; 1..alen */ int apos; /* counter for aligned columns */ if (! (msa->flags & eslMSA_DIGITAL)) ESL_XEXCEPTION(eslEINVAL, "need a digital msa"); if (msa->rf == NULL) return eslEFORMAT; ESL_ALLOC(matassign, sizeof(int) * (msa->alen+1)); /* Watch for off-by-one. rf is [0..alen-1]; matassign is [1..alen] */ for (apos = 1; apos <= msa->alen; apos++) matassign[apos] = (esl_abc_CIsGap(msa->abc, msa->rf[apos-1])? 
FALSE : TRUE); /* matassign2hmm leaves ret_hmm, opt_tr in their proper state: */ if ((status = matassign2hmm(msa, matassign, ret_hmm, opt_tr)) != eslOK) goto ERROR; free(matassign); return eslOK; ERROR: if (matassign != NULL) free(matassign); return status; }
/* Function: esl_dst_XPairId() * Synopsis: Pairwise identity of two aligned digital seqs. * Incept: SRE, Tue Apr 18 09:24:05 2006 [St. Louis] * * Purpose: Digital version of <esl_dst_PairId()>: <adsq1> and * <adsq2> are digitized aligned sequences, in alphabet * <abc>. Otherwise, same as <esl_dst_PairId()>. * * Args: abc - digital alphabet in use * ax1 - aligned digital seq 1 * ax2 - aligned digital seq 2 * opt_pid - optRETURN: pairwise identity, 0<=x<=1 * opt_nid - optRETURN: # of identities * opt_n - optRETURN: denominator MIN(len1,len2) * * Returns: <eslOK> on success. <opt_distance>, <opt_nid>, <opt_n> * contain the answers, for any of these that were passed * non-<NULL> pointers. * * Throws: <eslEINVAL> if the strings are different lengths (not aligned). */ int esl_dst_XPairId(const ESL_ALPHABET *abc, const ESL_DSQ *ax1, const ESL_DSQ *ax2, double *opt_distance, int *opt_nid, int *opt_n) { int status; int idents; /* total identical positions */ int len1, len2; /* lengths of seqs */ int i; /* position in aligned seqs */ idents = len1 = len2 = 0; for (i = 1; ax1[i] != eslDSQ_SENTINEL && ax2[i] != eslDSQ_SENTINEL; i++) { if (esl_abc_XIsCanonical(abc, ax1[i])) len1++; if (esl_abc_XIsCanonical(abc, ax2[i])) len2++; if (esl_abc_XIsCanonical(abc, ax1[i]) && esl_abc_XIsCanonical(abc, ax2[i]) && ax1[i] == ax2[i]) idents++; } if (len2 < len1) len1 = len2; if (ax1[i] != eslDSQ_SENTINEL || ax2[i] != eslDSQ_SENTINEL) ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); if (opt_distance != NULL) *opt_distance = ( len1==0 ? 0. : (double) idents / (double) len1 ); if (opt_nid != NULL) *opt_nid = idents; if (opt_n != NULL) *opt_n = len1; return eslOK; ERROR: if (opt_distance != NULL) *opt_distance = 0.; if (opt_nid != NULL) *opt_nid = 0; if (opt_n != NULL) *opt_n = 0; return status; }
/* Function: esl_dst_XJukesCantor() * Synopsis: Jukes-Cantor distance for two aligned digitized seqs. * Incept: SRE, Tue Apr 18 15:26:51 2006 [St. Louis] * * Purpose: Calculate the generalized Jukes-Cantor distance between two * aligned digital strings <ax> and <ay>, in substitutions/site, * using alphabet <abc> to evaluate identities and differences. * The maximum likelihood estimate for the distance is optionally returned in * <opt_distance>. The large-sample variance for the distance * estimate is optionally returned in <opt_variance>. * * Identical to <esl_dst_CJukesCantor()>, except that it takes * digital sequences instead of character strings. * * Args: abc - bioalphabet to use for comparisons * ax - 1st digital aligned seq * ay - 2nd digital aligned seq * opt_distance - optRETURN: ML estimate of distance d * opt_variance - optRETURN: large-sample variance of d * * Returns: <eslOK> on success. As in <esl_dst_CJukesCantor()>, the * distance and variance may be infinite, in which case they * are returned as <HUGE_VAL>. * * Throws: <eslEINVAL> if the two strings aren't the same length (and * thus can't have been properly aligned). * <eslEDIVZERO> if no aligned residues were counted. * On either failure, the distance and variance are set * to <HUGE_VAL>. 
*/ int esl_dst_XJukesCantor(const ESL_ALPHABET *abc, const ESL_DSQ *ax, const ESL_DSQ *ay, double *opt_distance, double *opt_variance) { int status; int n1, n2; /* number of observed identities, substitutions */ int i; /* position in aligned seqs */ n1 = n2 = 0; for (i = 1; ax[i] != eslDSQ_SENTINEL && ay[i] != eslDSQ_SENTINEL; i++) { if (esl_abc_XIsCanonical(abc, ax[i]) && esl_abc_XIsCanonical(abc, ay[i])) { if (ax[i] == ay[i]) n1++; else n2++; } } if (ax[i] != eslDSQ_SENTINEL || ay[i] != eslDSQ_SENTINEL) ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); return jukescantor(n1, n2, abc->K, opt_distance, opt_variance); ERROR: if (opt_distance != NULL) *opt_distance = HUGE_VAL; if (opt_variance != NULL) *opt_variance = HUGE_VAL; return status; }
/* Function: esl_dst_CPairId() * Synopsis: Pairwise identity of two aligned text strings. * Incept: SRE, Mon Apr 17 20:06:07 2006 [St. Louis] * * Purpose: Calculates pairwise fractional identity between two * aligned character strings <asq1> and <asq2>. * Return this distance in <opt_pid>; return the * number of identities counted in <opt_nid>; and * return the denominator <MIN(len1,len2)> in * <opt_n>. * * Alphabetic symbols <[a-zA-Z]> are compared * case-insensitively for identity. Any nonalphabetic * character is assumed to be a gap symbol. * * This simple comparison rule is unaware of synonyms and * degeneracies in biological alphabets. For a more * sophisticated and biosequence-aware comparison, use * digitized sequences and the <esl_dst_XPairId()> function * instead. * * Args: asq1 - aligned character string 1 * asq2 - aligned character string 2 * opt_pid - optRETURN: pairwise identity, 0<=x<=1 * opt_nid - optRETURN: # of identities * opt_n - optRETURN: denominator MIN(len1,len2) * * Returns: <eslOK> on success. <opt_pid>, <opt_nid>, <opt_n> * contain the answers (for whichever were passed non-NULL). * * Throws: <eslEINVAL> if the strings are different lengths * (not aligned). */ int esl_dst_CPairId(const char *asq1, const char *asq2, double *opt_pid, int *opt_nid, int *opt_n) { int status; int idents; /* total identical positions */ int len1, len2; /* lengths of seqs */ int i; /* position in aligned seqs */ idents = len1 = len2 = 0; for (i = 0; asq1[i] != '\0' && asq2[i] != '\0'; i++) { if (isalpha(asq1[i])) len1++; if (isalpha(asq2[i])) len2++; if (isalpha(asq1[i]) && isalpha(asq2[i]) && toupper(asq1[i]) == toupper(asq2[i])) idents++; } if (asq1[i] != '\0' || asq2[i] != '\0') ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned"); if (opt_pid != NULL) *opt_pid = ( len1==0 ? 0. 
: (double) idents / (double) ESL_MIN(len1,len2)); if (opt_nid != NULL) *opt_nid = idents; if (opt_n != NULL) *opt_n = len1; return eslOK; ERROR: if (opt_pid != NULL) *opt_pid = 0.; if (opt_nid != NULL) *opt_nid = 0; if (opt_n != NULL) *opt_n = 0; return status; }
/* Function:  esl_dst_XPairIdMx()
 * Synopsis:  NxN identity matrix for N aligned digital seqs.
 * Incept:    SRE, Thu Apr 27 09:08:11 2006 [New York]
 *
 * Purpose:   Given a digitized multiple sequence alignment <ax>, consisting
 *            of <N> aligned digital sequences in alphabet <abc>; calculate
 *            a symmetric pairwise fractional identity matrix by $N(N-1)/2$
 *            calls to <esl_dst_XPairId()>, and return it in <ret_S>.
 *            Diagonal elements are set to 1.
 *
 * Args:      abc   - digital alphabet in use
 *            ax    - aligned dsq's, [0..N-1][1..alen]
 *            N     - number of aligned sequences
 *            ret_S - RETURN: NxN matrix of fractional identities.
 *                    May be passed <NULL>, in which case the matrix is
 *                    computed and then destroyed.
 *
 * Returns:   <eslOK> on success, and <ret_S> contains the identity
 *            matrix. Caller is obligated to free <S> with
 *            <esl_dmatrix_Destroy()>.
 *
 * Throws:    <eslEMEM> if the matrix could not be created
 *            (<esl_dmatrix_Create()> returned <NULL>).
 *            Any failure status of <esl_dst_XPairId()> is rethrown;
 *            that is <eslEINVAL> if a seq has a different length than
 *            others. On failure, <ret_S> is returned <NULL>
 *            and state of inputs is unchanged.
 */
int
esl_dst_XPairIdMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **ret_S)
{
  int          status;
  ESL_DMATRIX *S = NULL;
  int          i,j;

  /* Set <status> before jumping: the ERROR block returns it, and it
   * would otherwise be read uninitialized on this path.
   */
  if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; }

  for (i = 0; i < N; i++)
    {
      S->mx[i][i] = 1.;
      for (j = i+1; j < N; j++)
        {
          status = esl_dst_XPairId(abc, ax[i], ax[j], &(S->mx[i][j]), NULL, NULL);
          if (status != eslOK)
            ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j);
          S->mx[j][i] = S->mx[i][j];    /* mirror into the lower triangle */
        }
    }

  if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S);
  return eslOK;

 ERROR:
  if (S     != NULL) esl_dmatrix_Destroy(S);
  if (ret_S != NULL) *ret_S = NULL;
  return status;
}
/* Function:  p7_oprofile_MPIRecv()
 * Synopsis:  Receives an OPROFILE as a work unit from an MPI sender.
 * Incept:    MSF, Wed Oct 21, 2009 [Janelia]
 *
 * Purpose:   Receive a work unit that consists of a single OPROFILE
 *            sent by MPI <source> (<0..nproc-1>, or
 *            <MPI_ANY_SOURCE>) tagged as <tag> for MPI communicator <comm>.
 *
 *            Work units are prefixed by a status code. If the unit's
 *            code is <eslOK> and no errors are encountered, this
 *            routine will return <eslOK> and a non-<NULL> <*ret_om>.
 *            If the unit's code is <eslEOD> (a shutdown signal),
 *            this routine returns <eslEOD> and <*ret_om> is <NULL>.
 *
 *            Caller provides a working buffer <*buf> of size
 *            <*nalloc> characters. These are passed by reference, so
 *            that <*buf> can be reallocated and <*nalloc> increased
 *            if necessary. As a special case, if <*buf> is <NULL> and
 *            <*nalloc> is 0, the buffer will be allocated
 *            appropriately, but the caller is still responsible for
 *            free'ing it.
 *
 *            Caller may or may not already know what alphabet the OPROFILE
 *            is expected to be in. A reference to the current
 *            alphabet is passed in <abc>. If the alphabet is unknown,
 *            pass <*abc = NULL>, and when the OPROFILE is received, an
 *            appropriate new alphabet object is allocated and passed
 *            back to the caller via <*abc>. If the alphabet is
 *            already known, <*abc> is that alphabet, and the new
 *            OPROFILE's alphabet type is verified to agree with it. This
 *            mechanism allows an application to let the first OPROFILE
 *            determine the alphabet type for the application, while
 *            still keeping the alphabet under the application's scope
 *            of control. (Alphabet handling is done by
 *            <p7_oprofile_MPIUnpack()>, to which <abc> is forwarded.)
 *
 * Returns:   <eslOK> on success. <*ret_om> contains the received OPROFILE;
 *            it is allocated here, and the caller is responsible for
 *            free'ing it.  <*buf> may have been reallocated to a
 *            larger size, and <*nalloc> may have been increased.  If
 *            <*abc> was passed as <NULL>, it now points to an
 *            <ESL_ALPHABET> object that was allocated here; caller is
 *            responsible for free'ing this.
 *
 *            Returns <eslEOD> if an end-of-data signal was received;
 *            <*ret_om> is <NULL> and <*abc> is left unchanged.
 *
 *            Returns <eslEINCOMPAT> if the OPROFILE is in a different alphabet
 *            than <*abc> said to expect. In this case, <*abc> is unchanged,
 *            <*buf> and <*nalloc> may have been changed, and <*ret_om> is
 *            <NULL>.
 *
 * Throws:    <eslEMEM> on allocation error, <eslESYS> if an MPI call
 *            fails; in both cases <*ret_om> is <NULL>.
 */
int
p7_oprofile_MPIRecv(int source, int tag, MPI_Comm comm, char **buf, int *nalloc, ESL_ALPHABET **abc, P7_OPROFILE **ret_om)
{
  int         status;
  int         code;             /* status code that prefixes the work unit */
  int         n;                /* size of the incoming message, in bytes  */
  int         pos;              /* unpacking position within <*buf>        */
  MPI_Status  mpistatus;

  /* Probe first, because we need to know if our buffer is big enough. */
  if (MPI_Probe(source, tag, comm, &mpistatus)  != 0) ESL_XEXCEPTION(eslESYS, "mpi probe failed");
  if (MPI_Get_count(&mpistatus, MPI_PACKED, &n) != 0) ESL_XEXCEPTION(eslESYS, "mpi get count failed");

  /* Make sure the buffer is allocated appropriately */
  if (*buf == NULL || n > *nalloc)
    {
      void *tmp;
      ESL_RALLOC(*buf, tmp, sizeof(char) * n);
      *nalloc = n;
    }

  /* Receive the packed work unit. Use the source and tag that the
   * probe matched, not the (possibly wildcard) ones we were called
   * with. Otherwise a wildcard receive could pick up a different
   * message than the one whose size <n> we just measured.
   */
  if (MPI_Recv(*buf, n, MPI_PACKED, mpistatus.MPI_SOURCE, mpistatus.MPI_TAG, comm, &mpistatus) != 0)
    ESL_XEXCEPTION(eslESYS, "mpi recv failed");

  /* Unpack it, looking at the status code prefix for EOD/EOK */
  pos = 0;
  if (MPI_Unpack(*buf, n, &pos, &code, 1, MPI_INT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (code == eslEOD) { *ret_om = NULL; return eslEOD; }

  /* The rest of the buffer is the packed OPROFILE itself. */
  return p7_oprofile_MPIUnpack(*buf, *nalloc, &pos, comm, abc, ret_om);

 ERROR:
  *ret_om = NULL;               /* honor the documented contract on failure */
  return status;
}
/* Function: esl_workqueue_Create() * Synopsis: Create a work queue object. * Incept: MSF, Thu Jun 18 11:51:39 2009 * * Purpose: Creates an <ESL_WORK_QUEUE> object of <size>. The * queues are used to handle objects <void *> that * are ready to be processed and that have been * processed by worker threads. * * Returns: ptr to the new <ESL_WORK_QUEUE> object. * * Throws: <eslESYS> on allocation or initialization failure. */ ESL_WORK_QUEUE * esl_workqueue_Create(int size) { int i; int status; ESL_WORK_QUEUE *queue = NULL; ESL_ALLOC(queue, sizeof(ESL_WORK_QUEUE)); queue->readerQueue = NULL; queue->readerQueueCnt = 0; queue->readerQueueHead = 0; queue->workerQueue = NULL; queue->workerQueueCnt = 0; queue->workerQueueHead = 0; queue->queueSize = size; queue->pendingWorkers = 0; if (pthread_mutex_init(&queue->queueMutex, NULL) != 0) ESL_XEXCEPTION(eslESYS, "mutex init failed"); if (pthread_cond_init(&queue->readerQueueCond, NULL) != 0) ESL_XEXCEPTION(eslESYS, "cond reader init failed"); if (pthread_cond_init(&queue->workerQueueCond, NULL) != 0) ESL_XEXCEPTION(eslESYS, "cond worker init failed"); ESL_ALLOC(queue->readerQueue, sizeof(void *) * size); ESL_ALLOC(queue->workerQueue, sizeof(void *) * size); for (i = 0; i < queue->queueSize; ++i) { queue->readerQueue[i] = NULL; queue->workerQueue[i] = NULL; } return queue; ERROR: esl_workqueue_Destroy(queue); return NULL; }
/* Function:  esl_workqueue_WorkerUpdate()
 * Synopsis:  Consumer routine.
 * Incept:    MSF, Thu Jun 18 11:51:39 2009
 *
 * Purpose:   The consumer (i.e. Worker) places an object that has
 *            been processed on the producers (i.e. Readers) queue.
 *
 *            If the <in> object is not null, it is placed on the
 *            readers queue. If that queue was empty beforehand, the
 *            reader condition variable is signaled, so that a reader
 *            waiting for an object wakes up.
 *
 *            If the worker routine has supplied an <out> pointer,
 *            an object that is ready for processing by a worker
 *            is placed in <*out> so the worker thread can continue.
 *            If the worker queue is empty, the call blocks until an
 *            object becomes available.
 *
 *            The whole update runs with the queue mutex held. The
 *            mutex is released on every exit path, including errors.
 *
 * Args:      queue - the work queue
 *            in    - processed object to hand back to the reader, or NULL
 *            out   - RETURN: next object to process; or NULL if the
 *                    caller does not want one
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslESYS> if thread synchronization fails somewhere.
 *            <eslEINVAL> if something's wrong with <queue> (it is
 *            <NULL>, or its reader queue is already full).
 *            On failure, <*out> (if requested) is set to <NULL>.
 */
int
esl_workqueue_WorkerUpdate(ESL_WORK_QUEUE *queue, void *in, void **out)
{
  int cnt;
  int inx;
  int queueSize;
  int locked = 0;               /* 1 while this thread holds queue->queueMutex */
  int status;

  if (queue == NULL)                                ESL_XEXCEPTION(eslEINVAL, "Invalid queue object");
  if (pthread_mutex_lock (&queue->queueMutex) != 0) ESL_XEXCEPTION(eslESYS,   "mutex lock failed");
  locked    = 1;
  queueSize = queue->queueSize;

  /* check if the caller is queuing up an item */
  if (in != NULL)
    {
      /* check to make sure we don't overflow */
      if (queue->readerQueueCnt >= queueSize) ESL_XEXCEPTION(eslEINVAL, "Reader queue overflow");

      /* append at the tail of the circular reader queue */
      inx = (queue->readerQueueHead + queue->readerQueueCnt) % queueSize;
      queue->readerQueue[inx] = in;
      cnt = queue->readerQueueCnt++;

      /* the queue just went from empty to non-empty: wake the reader */
      if (cnt == 0)
        {
          if (pthread_cond_signal (&queue->readerQueueCond) != 0) ESL_XEXCEPTION(eslESYS, "cond signal failed");
        }
    }

  /* check if the caller is waiting for a queued item */
  if (out != NULL)
    {
      if (queue->workerQueueCnt == 0)
        {
          /* wait for a processed buffer to be returned */
          ++queue->pendingWorkers;
          while (queue->workerQueueCnt == 0)
            {
              if (pthread_cond_wait (&queue->workerQueueCond, &queue->queueMutex) != 0)
                {
                  --queue->pendingWorkers;      /* we are giving up, so no longer pending */
                  ESL_XEXCEPTION(eslESYS, "cond wait failed");
                }
            }
          --queue->pendingWorkers;
        }

      /* pop from the head of the circular worker queue */
      inx = queue->workerQueueHead;
      *out = queue->workerQueue[inx];
      queue->workerQueue[inx] = NULL;
      queue->workerQueueHead = (queue->workerQueueHead + 1) % queueSize;
      --queue->workerQueueCnt;
    }

  /* Clear the flag first, so a failed unlock is not retried in ERROR. */
  locked = 0;
  if (pthread_mutex_unlock (&queue->queueMutex) != 0) ESL_XEXCEPTION(eslESYS, "mutex unlock failed");

  return eslOK;

 ERROR:
  /* Don't leave the queue locked behind us: every other reader and
   * worker would block forever. Best effort; we are already failing.
   */
  if (locked) pthread_mutex_unlock(&queue->queueMutex);
  if (out)    *out = NULL;
  return status;
}
/* Function: p7_SingleBuilder() * Synopsis: Build a new HMM from a single sequence. * * Purpose: Take the sequence <sq> and a build configuration <bld>, and * build a new HMM. * * The single sequence scoring system in the <bld> * configuration must have been previously initialized by * <p7_builder_SetScoreSystem()>. * * Args: bld - build configuration * sq - query sequence * bg - null model (needed to paramaterize insert emission probs) * opt_hmm - optRETURN: new HMM * opt_gm - optRETURN: profile corresponding to <hmm> * opt_om - optRETURN: optimized profile corresponding to <gm> * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation error. * <eslEINVAL> if <bld> isn't properly configured somehow. */ int p7_SingleBuilder(P7_BUILDER *bld, ESL_SQ *sq, P7_BG *bg, P7_HMM **opt_hmm, P7_TRACE **opt_tr, P7_PROFILE **opt_gm, P7_OPROFILE **opt_om) { P7_HMM *hmm = NULL; P7_TRACE *tr = NULL; int k; int status; bld->errbuf[0] = '\0'; if (! bld->Q) ESL_XEXCEPTION(eslEINVAL, "score system not initialized"); if ((status = p7_Seqmodel(bld->abc, sq->dsq, sq->n, sq->name, bld->Q, bg->f, bld->popen, bld->pextend, &hmm)) != eslOK) goto ERROR; if ((status = p7_hmm_SetComposition(hmm)) != eslOK) goto ERROR; if ((status = p7_hmm_SetConsensus(hmm, sq)) != eslOK) goto ERROR; if ((status = calibrate(bld, hmm, bg, opt_gm, opt_om)) != eslOK) goto ERROR; if ( bld->abc->type == eslDNA || bld->abc->type == eslRNA ) { if (bld->w_len > 0) hmm->max_length = bld->w_len; else if (bld->w_beta == 0.0) hmm->max_length = hmm->M *4; else if ( (status = p7_Builder_MaxLength(hmm, bld->w_beta)) != eslOK) goto ERROR; } /* build a faux trace: relative to core model (B->M_1..M_L->E) */ if (opt_tr != NULL) { if ((tr = p7_trace_Create()) == NULL) goto ERROR; if ((status = p7_trace_Append(tr, p7T_B, 0, 0)) != eslOK) goto ERROR; for (k = 1; k <= sq->n; k++) if ((status = p7_trace_Append(tr, p7T_M, k, k)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_E, 0, 0)) != eslOK) goto ERROR; tr->M 
= sq->n; tr->L = sq->n; } /* note that <opt_gm> and <opt_om> were already set by calibrate() call above. */ if (opt_hmm != NULL) *opt_hmm = hmm; else p7_hmm_Destroy(hmm); if (opt_tr != NULL) *opt_tr = tr; return eslOK; ERROR: p7_hmm_Destroy(hmm); if (tr != NULL) p7_trace_Destroy(tr); if (opt_gm != NULL) p7_profile_Destroy(*opt_gm); if (opt_om != NULL) p7_oprofile_Destroy(*opt_om); return status; }
/* Function: p7_Fastmodelmaker()
 *
 * Purpose:  Heuristic model construction.
 *           Build an HMM from a digital alignment <msa> by a simple
 *           occupancy rule. For each column, sum the sequence weights
 *           of the sequences that have a residue there (<r>) and of
 *           those that have a residue or a gap there (<totwgt>).
 *           The column is a MATCH column if <r> is positive and
 *           <r/totwgt> is $\geq$ <symfrac>; for instance, with
 *           <symfrac> = 0.5, columns with $\geq$ 50\% residues are
 *           assigned to match... roughly speaking.
 *
 *           "Roughly speaking" because sequences may be weighted
 *           in the input <msa>, and because symbols that are neither
 *           residue nor gap (such as missing data symbols) are
 *           ignored, in order to deal with sequence fragments.
 *
 *           If the caller wants to designate any sequences as
 *           fragments, it does so by converting all N-terminal and
 *           C-terminal flanking gap symbols to missing data symbols.
 *
 *           The column assignment is handed to matassign2hmm(), which
 *           does the traceback construction and trace counting. Per
 *           the notes that accompany this routine, that step may
 *           slightly revise the alignment if the assignment implies
 *           DI and ID transitions, yields the HMM in counts form
 *           (ready for Dirichlet priors) plus a fake traceback for
 *           each training sequence, and reports <eslENORESULT> when
 *           no consensus columns were defined.
 *
 * Args:     msa       - multiple sequence alignment (digital mode)
 *           symfrac   - threshold for residue occupancy; >= assigns MATCH
 *           bld       - build configuration (masking information);
 *                       not referenced in this function's body
 *           ret_hmm   - RETURN: counts-form HMM
 *           opt_tr    - optRETURN: array of tracebacks for aseq's
 *
 * Return:   <eslOK> on success. <ret_hmm> and <opt_tr> are set by
 *           matassign2hmm() and must be free'd by the caller.
 *
 *           Any non-<eslOK> status from matassign2hmm() (for example
 *           <eslENORESULT>) is returned unchanged, after a message is
 *           printed to stderr.
 *
 * Throws:   <eslEMEM> on allocation failure; <eslEINVAL> if the
 *           <msa> isn't in digital mode.
 */
int
p7_Fastmodelmaker(ESL_MSA *msa, float symfrac, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***opt_tr)
{
  int   *matassign = NULL;      /* matassign[1..alen]: TRUE for match columns */
  int    col;                   /* alignment column, 1..alen                  */
  int    seqidx;                /* sequence index, 0..nseq-1                  */
  float  reswgt;                /* weighted residue count in this column      */
  float  colwgt;                /* weighted residue+gap count in this column  */
  int    status;

  if ((msa->flags & eslMSA_DIGITAL) == 0) ESL_XEXCEPTION(eslEINVAL, "need digital MSA");

  /* One flag per column, indexed 1..alen (element 0 is unused). */
  ESL_ALLOC(matassign, sizeof(int) * (msa->alen+1));

  for (col = 1; col <= msa->alen; col++)
    {
      reswgt = 0.;
      colwgt = 0.;

      for (seqidx = 0; seqidx < msa->nseq; seqidx++)
        {
          if (esl_abc_XIsResidue(msa->abc, msa->ax[seqidx][col]))
            {
              reswgt += msa->wgt[seqidx];
              colwgt += msa->wgt[seqidx];
            }
          else if (esl_abc_XIsGap(msa->abc, msa->ax[seqidx][col]))
            {
              colwgt += msa->wgt[seqidx];
            }
          else if (esl_abc_XIsMissing(msa->abc, msa->ax[seqidx][col]))
            {
              /* missing data: contributes to neither count */
            }
        }

      /* reswgt > 0 also guarantees that the division below is safe */
      matassign[col] = (reswgt > 0. && reswgt / colwgt >= symfrac) ? TRUE : FALSE;
    }

  /* From here on all modelmakers behave the same: matassign2hmm()
   * builds the tracebacks, counts them, and sets ret_hmm and opt_tr.
   */
  status = matassign2hmm(msa, matassign, ret_hmm, opt_tr);
  if (status != eslOK)
    {
      fprintf (stderr, "hmm construction error during trace counting\n");
      goto ERROR;
    }

  free(matassign);
  return eslOK;

 ERROR:
  if (matassign != NULL) free(matassign);
  return status;
}
/* Function: esl_randomness_Create() * Synopsis: Create an RNG with a given seed. * Incept: SRE, Wed Jul 14 13:02:18 2004 [St. Louis] * * Purpose: Create a random number generator using * a given random seed. Seed must be $>0$. * * Args: seed $>= 0$. * * Returns: an initialized <ESL_RANDOMNESS *> on success. * Caller free's with <esl_randomness_Destroy()>. * * Throws: <NULL> on failure. * * Xref: STL8/p57. */ ESL_RANDOMNESS * esl_randomness_Create(long seed) { ESL_RANDOMNESS *r = NULL; int burnin = 7; int status; if (seed <= 0) ESL_XEXCEPTION(eslEINVAL, "bad seed"); ESL_ALLOC(r, sizeof(ESL_RANDOMNESS)); r->seed = seed; r->reseeding = TRUE; /* we observe that the first random number isn't very random, if * closely spaced seeds are used, like what we get with using * time(). So, "burn in" the random chain just a little. */ while (burnin--) esl_random(r); return r; ERROR: return NULL; }
/* Function:  esl_dst_XJukesCantorMx()
 * Synopsis:  NxN Jukes/Cantor distance matrix for N aligned digital seqs.
 * Incept:    SRE, Thu Apr 27 08:38:08 2006 [New York City]
 *
 * Purpose:   Given a digitized multiple sequence alignment <ax>,
 *            consisting of <nseq> aligned digital sequences in
 *            bioalphabet <abc>, calculate a symmetric Jukes/Cantor
 *            pairwise distance matrix for all sequence pairs, by one
 *            call to <esl_dst_XJukesCantor()> per pair;
 *            optionally return the distance matrix in <opt_D> and
 *            a matrix of the large-sample variances for those ML distance
 *            estimates in <opt_V>. Diagonal elements of both matrices
 *            are set to 0.
 *
 *            Infinite distances (and variances) are possible. They
 *            are represented as <HUGE_VAL> in <D> and <V>. Caller must
 *            be prepared to deal with them as appropriate.
 *
 * Args:      abc    - bioalphabet for <ax>
 *            ax     - aligned digital sequences [0..nseq-1][1..L]
 *            nseq   - number of aseqs
 *            opt_D  - optRETURN: [0..nseq-1]x[0..nseq-1] symmetric distance mx
 *            opt_V  - optRETURN: matrix of variances.
 *
 * Returns:   <eslOK> on success. <D> and <V> contain the distance
 *            matrix and variances, for whichever of <opt_D>, <opt_V>
 *            were passed non-<NULL>. Caller frees these with
 *            <esl_dmatrix_Destroy()>.
 *
 * Throws:    <eslEMEM> if a matrix could not be created
 *            (<esl_dmatrix_Create()> returned <NULL>).
 *            Any failure status of <esl_dst_XJukesCantor()> is
 *            rethrown: <eslEINVAL> if any pair of sequences have
 *            differing lengths (and thus cannot have been properly
 *            aligned); <eslEDIVZERO> if some pair of sequences had no
 *            aligned residues. On failure, <D> and <V> are both
 *            returned <NULL> and state of inputs is unchanged.
 */
int
esl_dst_XJukesCantorMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int nseq,
                       ESL_DMATRIX **opt_D, ESL_DMATRIX **opt_V)
{
  ESL_DMATRIX *D = NULL;
  ESL_DMATRIX *V = NULL;
  int          status;
  int          i,j;

  /* Set <status> before jumping: the ERROR block returns it, and it
   * would otherwise be read uninitialized on these paths.
   */
  if (( D = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }
  if (( V = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }

  for (i = 0; i < nseq; i++)
    {
      D->mx[i][i] = 0.;
      V->mx[i][i] = 0.;
      for (j = i+1; j < nseq; j++)
        {
          status = esl_dst_XJukesCantor(abc, ax[i], ax[j],
                                        &(D->mx[i][j]), &(V->mx[i][j]));
          if (status != eslOK)
            ESL_XEXCEPTION(status, "J/C calculation failed at digital aseqs %d,%d", i,j);

          /* mirror into the lower triangle */
          D->mx[j][i] = D->mx[i][j];
          V->mx[j][i] = V->mx[i][j];
        }
    }

  if (opt_D != NULL) *opt_D = D; else esl_dmatrix_Destroy(D);
  if (opt_V != NULL) *opt_V = V; else esl_dmatrix_Destroy(V);
  return eslOK;

 ERROR:
  if (D     != NULL) esl_dmatrix_Destroy(D);
  if (V     != NULL) esl_dmatrix_Destroy(V);
  if (opt_D != NULL) *opt_D = NULL;
  if (opt_V != NULL) *opt_V = NULL;
  return status;
}
/* Function:  p7_oprofile_MPISend()
 * Synopsis:  Send an OPROFILE as an MPI work unit.
 * Incept:    MSF, Wed Oct 21, 2009 [Janelia]
 *
 * Purpose:   Sends an OPROFILE <om> as a work unit to MPI process
 *            <dest> (where <dest> ranges from 0..<nproc-1>), tagged
 *            with MPI tag <tag>, for MPI communicator <comm>, as
 *            the sole workunit or result.
 *
 *            Work units are prefixed by a status code. If <om> is
 *            non-<NULL>, the work unit is an <eslOK> code followed by
 *            the packed OPROFILE. If <om> is <NULL>, the work unit is
 *            just an <eslEOD> code, which the receiving side
 *            (<p7_oprofile_MPIRecv()>) treats as an end-of-data
 *            signal; this is typically used to cleanly shut down
 *            worker processes.
 *
 *            In order to minimize alloc/free cycles in this routine,
 *            caller passes a pointer to a working buffer <*buf> of
 *            size <*nalloc> characters. If necessary (i.e. if <om> is
 *            too big to fit), <*buf> will be reallocated and <*nalloc>
 *            increased to the new size. As a special case, if <*buf>
 *            is <NULL> and <*nalloc> is 0, the buffer will be
 *            allocated appropriately, but the caller is still
 *            responsible for free'ing it.
 *
 * Args:      om     - OPROFILE to send, or NULL to send end-of-data
 *            dest   - rank of the destination MPI process
 *            tag    - MPI tag for the message
 *            comm   - MPI communicator
 *            buf    - working buffer, passed by reference
 *            nalloc - allocated size of <*buf>, passed by reference
 *
 * Returns:   <eslOK> on success; <*buf> may have been reallocated and
 *            <*nalloc> may have been increased.
 *
 *            A non-<eslOK> status from <p7_oprofile_MPIPackSize()> or
 *            <p7_oprofile_MPIPack()> is returned unchanged.
 *
 * Throws:    <eslESYS> if an MPI call fails; <eslEMEM> if a malloc/realloc
 *            fails. In either case, <*buf> and <*nalloc> remain valid and useful
 *            memory (though the contents of <*buf> are undefined).
 */
int
p7_oprofile_MPISend(P7_OPROFILE *om, int dest, int tag, MPI_Comm comm, char **buf, int *nalloc)
{
  int status;
  int code;                     /* status code that prefixes the work unit  */
  int nbytes;                   /* total packed size of the work unit       */
  int ombytes;                  /* packed size of the OPROFILE part         */
  int offset;                   /* packing position within <*buf>           */

  /* Size of the code prefix, plus the OPROFILE if there is one. */
  if (MPI_Pack_size(1, MPI_INT, comm, &nbytes) != 0) ESL_XEXCEPTION(eslESYS, "mpi pack size failed");
  if (om != NULL)
    {
      status = p7_oprofile_MPIPackSize(om, comm, &ombytes);
      if (status != eslOK) return status;
      nbytes += ombytes;
    }

  /* Grow (or create) the caller's buffer if the unit doesn't fit. */
  if (*buf == NULL || *nalloc < nbytes)
    {
      void *tmp;
      ESL_RALLOC(*buf, tmp, sizeof(char) * nbytes);
      *nalloc = nbytes;
    }

  /* Pack: the code first, then the OPROFILE (if any). */
  offset = 0;
  if (om != NULL) code = eslOK;
  else            code = eslEOD;
  if (MPI_Pack(&code, 1, MPI_INT, *buf, nbytes, &offset, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi pack failed");
  if (om != NULL)
    {
      status = p7_oprofile_MPIPack(om, *buf, nbytes, &offset, comm);
      if (status != eslOK) return status;
    }

  /* Ship it. */
  if (MPI_Send(*buf, nbytes, MPI_PACKED, dest, tag, comm) != 0) ESL_EXCEPTION(eslESYS, "mpi send failed");
  return eslOK;

 ERROR:
  return status;
}
/* cqrna_shuffle_columns()
 *
 * Shuffle the contents of the <n> alignment columns listed in
 * <col[0..n-1]> among themselves, in place in <xs> and <ys>, keeping
 * each column's two symbols together. Helper for
 * <esl_msashuffle_CQRNA()>; <col> is used as scratch space and its
 * contents are not meaningful afterwards.
 *
 * Each pass picks a random column from the remaining pool, exchanges
 * its symbols with those of the pool's last column, and retires the
 * picked column. The last column's index takes over the vacated slot.
 */
static void
cqrna_shuffle_columns(ESL_RANDOMNESS *r, char *xs, char *ys, int *col, int n)
{
  int  pos;                     /* randomly chosen slot in col[0..n-1] */
  int  a, b;                    /* chosen column; last column in pool  */
  char xsym, ysym;

  for (; n > 1; n--)
    {
      pos = esl_rnd_Roll(r, n);
      a   = col[pos];
      b   = col[n-1];

      xsym  = xs[a];  ysym  = ys[a];
      xs[a] = xs[b];  ys[a] = ys[b];
      xs[b] = xsym;   ys[b] = ysym;

      /* Column <a> is done. Column <b> stays in the pool, in a's old
       * slot. Slot n-1 drops out of the pool, so needs no update.
       */
      col[pos] = b;
    }
}

/* Function:  esl_msashuffle_CQRNA()
 * Synopsis:  Gap-preserving column shuffle of a pairwise alignment.
 * Incept:    SRE, Tue Jan 22 08:45:34 2008 [Market Street Cafe, Leesburg]
 *
 * Purpose:   Shuffle a pairwise alignment <x>,<y> while preserving the
 *            position of gaps, using the random number generator <r>.
 *            Return the shuffled alignment in <xs>,
 *            <ys>. Caller provides allocated space for <xs> and <ys>.
 *
 *            An alphabet <abc> must also be provided, solely for the
 *            definition of gap characters. Because Easel's default
 *            alphabets (DNA, RNA, and protein) all use the same
 *            definition of gap characters <-_.>, you can actually
 *            provide any alphabet here, and get the same results.
 *            (This may save having to determine the alphabet of input
 *            sequences.)
 *
 *            Works by doing three separate
 *            shuffles, of (1) columns with residues in both
 *            <x> and <y>, (2) columns with residue in <x> and gap in <y>,
 *            and (3) columns with gap in <x> and residue in <y>.
 *            Columns with gaps in both are copied unchanged.
 *
 *            <xs>,<x> and <ys>,<y> may be identical: that is, to shuffle
 *            an alignment "in place", destroying the original
 *            alignment, just call <esl_msashuffle_CQRNA(r, abc, x,y,x,y)>.
 *
 * Args:      r   - source of randomness
 *            abc - alphabet, used only to recognize gap characters
 *            x   - 1st aligned sequence, \0-terminated
 *            y   - 2nd aligned sequence, \0-terminated, same length as <x>
 *            xs  - RETURN: shuffled <x>; caller-allocated (or <x> itself)
 *            ys  - RETURN: shuffled <y>; caller-allocated (or <y> itself)
 *
 * Returns:   <eslOK> on success, and the shuffled alignment is
 *            returned in <xs>, <ys>. An empty alignment is trivially
 *            "shuffled" into an empty alignment.
 *
 * Throws:    <eslEINVAL> if <x> and <y> differ in length; <xs> and <ys>
 *            are not written in that case.
 *            <eslEMEM> on allocation failure.
 */
int
esl_msashuffle_CQRNA(ESL_RANDOMNESS *r, ESL_ALPHABET *abc, char *x, char *y, char *xs, char *ys)
{
  int     L;
  int    *xycol = NULL;         /* columns with residues in both x and y */
  int    *xcol  = NULL;         /* columns with residue in x, gap in y   */
  int    *ycol  = NULL;         /* columns with gap in x, residue in y   */
  int     nxy = 0, nx = 0, ny = 0;
  int     i;
  size_t  xlen;
  int     status;

  /* Validate before touching the caller's output buffers. */
  xlen = strlen(x);
  if (strlen(y) != xlen) ESL_XEXCEPTION(eslEINVAL, "sequences of different lengths in qrna shuffle");
  L = (int) xlen;

  if (xs != x) strcpy(xs, x);
  if (ys != y) strcpy(ys, y);

  /* Nothing to shuffle; this also avoids asking for zero-size arrays. */
  if (L == 0) return eslOK;

  /* First, construct three arrays containing lists of the column positions
   * of the three types of columns. (If a column contains gaps in both x and y,
   * we've already simply copied it to the shuffled sequence.)
   */
  ESL_ALLOC(xycol, sizeof(int) * L);
  ESL_ALLOC(xcol,  sizeof(int) * L);
  ESL_ALLOC(ycol,  sizeof(int) * L);

  for (i = 0; i < L; i++)
    {
      int xgap = esl_abc_CIsGap(abc, x[i]);
      int ygap = esl_abc_CIsGap(abc, y[i]);

      if      (  xgap &&   ygap) continue;
      else if (! xgap && ! ygap) xycol[nxy++] = i;
      else if (  xgap)           ycol[ny++]   = i;
      else                       xcol[nx++]   = i;
    }

  /* Second, shuffle each class of columns among itself. The order of
   * these three calls fixes the order in which random numbers are
   * consumed, so keep it if results are to stay reproducible for a
   * given generator state.
   */
  cqrna_shuffle_columns(r, xs, ys, xycol, nxy);
  cqrna_shuffle_columns(r, xs, ys, xcol,  nx);
  cqrna_shuffle_columns(r, xs, ys, ycol,  ny);

  free(xycol);
  free(xcol);
  free(ycol);
  return eslOK;

 ERROR:
  if (xycol != NULL) free(xycol);
  if (xcol  != NULL) free(xcol);
  if (ycol  != NULL) free(ycol);
  return status;
}
/* ideal_local_endpoints()
 *
 * Purpose:  Implementation of the "two-step" fragment sampling
 *           algorithm, sampling a uniform local fragment w.r.t.
 *           sequence coords, by first sampling a complete
 *           sequence of length L from <hmm>; then choosing
 *           a random fragment <i1..i2> uniformly from all
 *           possible $\frac{L(L+1)}{2}$ fragments; then finding
 *           local alignment coordinates wrt model and sequence,
 *           using convention that local alignment starts/stops
 *           with match states. (Thus, if the initially selected
 *           i1 or i2 were generated by insert states, bounds
 *           are moved to reach first/last match state.)
 *
 *           The caller also provides an allocated sequence <sq> and
 *           traceback <tr>, as storage to be provided to
 *           <p7_CoreEmit()>. They contain the generated global
 *           sequence and trace upon return (not a local trace, note).
 *
 *           i endpoints are normalized/discretized to 1..<Lbins>, so
 *           we can collate i statistics from sampled sequences of
 *           varying L. Note this causes discretization artifacts,
 *           leading to underrepresentation of j=M and
 *           overrepresentation of i=1.
 *
 *           This routine is only intended for collecting endpoint
 *           statistics (i1,i2,k1,k2); it does not generate a local
 *           alignment trace. (xref milestone 2, STL11/115).
 *
 *           Samples are rejected and redrawn when the emitted
 *           sequence has zero length (no fragment can be chosen, and
 *           normalizing by L would divide by zero) or when the chosen
 *           fragment contains no match state. At most 1000 samples
 *           are tried.
 *
 * Returns:  <eslOK> on success; returns normalized/binned sequence
 *           coords in <*ret_i1> and <*ret_i2> in range <1..Lbins> and
 *           the model entry/exit coords in <*ret_k1> and <*ret_k2> in
 *           range <1..M>. By internal def'n of local alignment endpoints,
 *           M_k1 emits residue x_i1, M_k2 emits residue x_i2.
 *
 * Throws:   <eslENOHALT> if no acceptable sample is found within the
 *           failsafe limit; any error status from <p7_CoreEmit()>.
 *           On failure all four return values are set to 0.
 *
 * Xref:     STL11/142-143
 */
static int
ideal_local_endpoints(ESL_RANDOMNESS *r, P7_HMM *hmm, ESL_SQ *sq, P7_TRACE *tr, int Lbins,
                      int *ret_i1, int *ret_i2, int *ret_k1, int *ret_k2)
{
  int status;
  int tpos;
  int i1 = 0, i2 = 0;
  int k1 = 0, k2 = 0;
  int t1 = 0, t2 = 0;
  int all_insert;
  int failsafe = 0;     /* a failsafe timer for rejection sampling */

  do {
    if (failsafe++ == 1000) ESL_XEXCEPTION(eslENOHALT, "failed to obtain local alignment that wasn't all inserts");

    if ((status = p7_CoreEmit(r, hmm, sq, tr)) != eslOK) goto ERROR;

    /* A zero-length sequence has no fragment to choose: reject it.
     * <all_insert> is set first because <continue> in a do/while jumps
     * to the loop condition.
     */
    all_insert = TRUE;
    if (sq->n == 0) continue;

    /* a simple way to sample uniformly from upper triangle is by rejection
     * this do/while cannot infinite loop, doesn't need failsafe
     */
    do {
      i1 = 1 + esl_rnd_Roll(r, sq->n);
      i2 = 1 + esl_rnd_Roll(r, sq->n);
    } while (i1 > i2);

    /* Get initial k1,k2 coords: this step must work in a core model,
     * i1/i2 were generated by an M or I. Also record t1,t2 endpoints
     * on core's trace.
     */
    for (tpos = 0; tpos < tr->N; tpos++)
      if (tr->i[tpos] == i1) { t1 = tpos; k1 = tr->k[tpos]; break; }
    for (tpos = tr->N-1; tpos >= 0; tpos--)
      if (tr->i[tpos] == i2) { t2 = tpos; k2 = tr->k[tpos]; break; }

    /* Enforce the definition of local alignment endpoints being
     * match-delimited - roll up any leading/trailing I states.
     * Watch out for pathological case of a local fragment that
     * includes no M state at all.
     */
    all_insert = FALSE;
    for (; t1 <= t2; t1++) if (tr->st[t1] == p7T_M) break;
    for (; t2 >= t1; t2--) if (tr->st[t2] == p7T_M) break;
    if (t2 < t1) all_insert = TRUE; /* sufficient to check both. */

    i1 = tr->i[t1];  i2 = tr->i[t2];
    k1 = tr->k[t1];  k2 = tr->k[t2];
  } while (all_insert);

  /* Normalize sequence coords.
   * They're 1..L now; make them 1..Lbins
   */
  *ret_i1 = ((i1-1) * Lbins / sq->n) + 1;
  *ret_i2 = ((i2-1) * Lbins / sq->n) + 1;
  *ret_k1 = k1;
  *ret_k2 = k2;
  return eslOK;

 ERROR:
  *ret_i1 = 0;
  *ret_i2 = 0;
  *ret_k1 = 0;
  *ret_k2 = 0;
  return status;
}
/* profile_local_endpoints() * * Purpose: Wrapper around <p7_ProfileEmit()>, sampling a local * alignment fragment from the profile's probabilistic model * (which may be the implicit model of HMMER3, or the * Plan7 model of HMMER2), and reporting coordinates * of the fragment w.r.t. both model and sequence. * * To simplify the implementation, the profile must be in * <p7_UNILOCAL> mode, not <p7_LOCAL> mode, so we know we * only have to deal with a single hit per sampled * sequence. * * We want <i1..i2> to be relative to the sequence coords * of a complete (global) sampled sequence that we could * have sampled this local alignment from; but the <i1..i2> * we initially get are relative to our profile-sampled * trace, so they are offset both by N-generated residues * that occur in the profile and by residues that the * profile's local entry skipped. To translate from * profile/sequence coords to core model/sequence coords, * we use rejection sampling: sample traces from the core * model until we find one that uses the same statetypes * at *initial* entry/exit points <k1>,<k2>, then use * that sample's sequence to determine offsets and correct * <i1..i2> reference frame. * * Local alignment endpoints are defined to be * match-delimited. However, an H3 model allows exit on * either a D or M state. Thus, the initially sampled end * point k2 may need to be rolled back to last M state, to * satisfy local alignment endpoint definition. Entries are * not a problem; both H2 and H3 profiles can only enter on * a M state. (This rollback has to occur after we've * matched a core trace to the profile trace to determine * i offsets.) * * Then, sampling from both the core model and the profile * in the same routine introduces a complication: * conceivably, profile configuration alters the transition * probabilities in the core model (by adding <M->E> * transitions and renormalizing the M transition * distributions, for example; H2 configuration does this, * though H3 does not). 
So you can't <CoreSample()> the * <gm->hmm> safely. To avoid such things, the caller * provides a clean copy of the core model in <core>. * * i endpoints are normalized/discretized to 1..<Lbins>, so * we can collate i statistics from sampled sequences of * varying L. Note this causes discretization artifacts, * leading to underrepresentation of j=M and * overrepresentation of i=1. * * Returns: <eslOK> on success; returns normalized sequence coords in * <*ret_i1> and <*ret_i2>, and the model entry/exit coords * in <*ret_k1> and <*ret_k2>. * * Xref: STL11/142-143 */ static int profile_local_endpoints(ESL_RANDOMNESS *r, P7_HMM *core, P7_PROFILE *gm, ESL_SQ *sq, P7_TRACE *tr, int Lbins, int *ret_i1, int *ret_i2, int *ret_k1, int *ret_k2) { int status; int i1,i2; int k1,k2; int t1,t2; /* entry/exit positions in local trace, tr */ int tg1, tg2; /* entry/exit positions in global trace, tr2 */ int tpos; int nterm, cterm; /* offsets at N, C terminus. */ int L; /* inferred length from 3-part patching */ ESL_SQ *sq2 = NULL; P7_TRACE *tr2 = NULL; int failsafe = 0; if (gm->mode != p7_UNILOCAL) ESL_XEXCEPTION(eslEINVAL, "profile must be unilocal"); if ((sq2 = esl_sq_CreateDigital(gm->abc)) == NULL) { status = eslEMEM; goto ERROR; } if ((tr = p7_trace_Create()) == NULL) { status = eslEMEM; goto ERROR; } /* sample local alignment from the implicit model */ if (gm->h2_mode) { if ((status = p7_H2_ProfileEmit(r, gm, sq, tr)) != eslOK) goto ERROR; } else { if ((status = p7_ProfileEmit(r, gm, sq, tr)) != eslOK) goto ERROR; } /* Get initial trace coords */ for (tpos = 0; tpos < tr->N; tpos++) if (tr->st[tpos] == p7T_B) { t1 = tpos+1; break; } for (tpos = tr->N-1; tpos >= 0; tpos--) if (tr->st[tpos] == p7T_E) { t2 = tpos-1; break; } /* Match a core trace to this local trace by rejection sampling; * this is to let us calculate sequence offsets; see comments above in preamble */ do { if (failsafe++ == 100000) ESL_XEXCEPTION(eslENOHALT, "failed to match core,local traces in %d tries\n", 
failsafe); if ((status = p7_CoreEmit(r, core, sq2, tr2)) != eslOK) goto ERROR; for (tpos = 0; tpos < tr2->N; tpos++) if (tr2->k[tpos] == tr->k[t1]) { tg1 = tpos; break; } for (tpos = tr2->N-1; tpos >= 0; tpos--) if (tr2->k[tpos] == tr->k[t2]) { tg2 = tpos; break; } } while (tr2->st[tg1] != tr->st[t1] && tr2->st[tg2] != tr->st[t2]); /* tg1..tg2 in core trace is now matched to t1..t2 in the profile trace. * Calculate # of residues preceding tg1 and following tg2 in the core trace. * A core trace can only generate residues from M or I states. */ for (nterm = 0, tpos = 0; tpos < tg1; tpos++) if (tr2->st[tpos] == p7T_M || tr2->st[tpos] == p7T_I) nterm++; for (cterm = 0, tpos = tr2->N-1; tpos > tg2; tpos--) if (tr2->st[tpos] == p7T_M || tr2->st[tpos] == p7T_I) cterm++; /* rectify the t2 endpoint, rolling back any trailing D path */ for (; t2 >= 0; t2--) if (tr->st[t2] == p7T_M) break; if (t2 < t1) ESL_XEXCEPTION(eslEINCONCEIVABLE, "this only happens on an all-D path through profile"); /* determine initial endpoint coords from t1 and t2 */ i1 = tr->i[t1]; i2 = tr->i[t2]; k1 = tr->k[t1]; k2 = tr->k[t2]; /* offset the i coords. */ L = (i2-i1+1) + nterm + cterm; i2 = (i2-i1+1) + nterm; i1 = nterm+1; /* normalize the i coords into range 1..Lbins, instead of 1..L */ i1 = ((i1-1) * Lbins / L) + 1; i2 = ((i2-1) * Lbins / L) + 1; *ret_i1 = i1; *ret_i2 = i2; *ret_k1 = k1; *ret_k2 = k2; p7_trace_Destroy(tr2); esl_sq_Destroy(sq2); return eslOK; ERROR: if (sq2 != NULL) esl_sq_Destroy(sq2); if (tr2 != NULL) p7_trace_Destroy(tr2); *ret_i1 = 0.; *ret_i2 = 0.; *ret_k1 = 0; *ret_k2 = 0; return status; }
/* All input sources funnel through here.
 * Here, <afp> is already allocated and initialized, and the input
 * <bf> is opened successfully.
 *
 * Determines the file format (guessing it if <format> is
 * <eslMSAFILE_UNKNOWN>), determines the alphabet (digital mode if
 * <byp_abc> is non-NULL, text mode otherwise), and configures the
 * format-specific input map.
 *
 * Returns <eslOK> on success. On failure, returns the error status,
 * sets <afp->abc> to NULL, and frees any alphabet that was created
 * here.
 */
static int
profillic_msafile_OpenBuffer(ESL_ALPHABET **byp_abc, ESL_BUFFER *bf, int format, ESLX_MSAFILE_FMTDATA *fmtd, ESLX_MSAFILE *afp)
{
  ESL_ALPHABET *abc       = NULL;
  int           alphatype = eslUNKNOWN;
  int           status;

  /* if caller provided <fmtd>, copy it into afp->fmtd */
  if (fmtd) eslx_msafile_fmtdata_Copy(fmtd, &(afp->fmtd));

  /* Determine the format */
  if (format == eslMSAFILE_UNKNOWN)
    {
      status = eslx_msafile_GuessFileFormat(afp->bf, &format, &(afp->fmtd));
      if      (status == eslENOFORMAT) ESL_XFAIL(eslENOFORMAT, afp->errmsg, "couldn't determine alignment input format"); /* ENOFORMAT is normal failure */
      else if (status != eslOK)        goto ERROR;
    }
  afp->format = format;

  /* Determine the alphabet; set <abc>. (<abc> == NULL means text mode.)  */
  /* Note that GuessAlphabet() functions aren't allowed to use the inmap, because it isn't set yet */
#ifdef eslAUGMENT_ALPHABET
  if (byp_abc && *byp_abc)      /* Digital mode, and caller provided the alphabet */
    {
      abc       = *byp_abc;
      alphatype = abc->type;
    }
  else if (byp_abc)             /* Digital mode, and caller wants us to guess and create an alphabet */
    {
      status = eslx_msafile_GuessAlphabet(afp, &alphatype);
      if      (status == eslENOALPHABET) ESL_XFAIL(eslENOALPHABET, afp->errmsg, "couldn't guess alphabet (maybe try --dna/--rna/--amino if available)");
      else if (status != eslOK)          goto ERROR;
      if ( (abc = esl_alphabet_Create(alphatype)) == NULL) { status = eslEMEM; goto ERROR; }
    }
#endif
  /* Digital mode was requested (<byp_abc> != NULL) but no alphabet was
   * set: that can only happen when the #ifdef section above was compiled
   * out, since with it every digital-mode path either sets <abc> or
   * jumps to ERROR.
   */
  if (byp_abc && ! abc) ESL_XEXCEPTION(eslEINCONCEIVABLE, "Your version of Easel does not include digital alphabet code.");
  afp->abc = abc;       /* with afp->abc set, the inmap config functions know whether to do digital/text */

  /**
   * <pre>
   * Configure the format-specific, digital or text mode character
   * input map in afp->inmap.
   * All of these must:
   *
   *    set inmap[0] to an appropriate 'unknown' character, to replace
   *       invalid input with.
   *    set ' ' to eslDSQ_IGNORE (if we're supposed to accept and skip
   *       it), or map it to a gap, or set it as eslDSQ_ILLEGAL.
   *    in digital mode, copy the abc->inmap
   *    in text mode, decide if we should accept most any
   *        non-whitespace character (isgraph()), or if the format is
   *        inherently restrictive and we should go with isalpha() +
   *        some other valid characters "_-.~*" instead.
   * </pre>
   */
  switch (afp->format) {
  case eslMSAFILE_A2M:          status = esl_msafile_a2m_SetInmap(      afp); break;
  case eslMSAFILE_AFA:          status = esl_msafile_afa_SetInmap(      afp); break;
  case eslMSAFILE_CLUSTAL:      status = esl_msafile_clustal_SetInmap(  afp); break;
  case eslMSAFILE_CLUSTALLIKE:  status = esl_msafile_clustal_SetInmap(  afp); break;
  case eslMSAFILE_PFAM:         status = esl_msafile_stockholm_SetInmap(afp); break;
  case eslMSAFILE_PHYLIP:       status = esl_msafile_phylip_SetInmap(   afp); break;
  case eslMSAFILE_PHYLIPS:      status = esl_msafile_phylip_SetInmap(   afp); break;
  case eslMSAFILE_PSIBLAST:     status = esl_msafile_psiblast_SetInmap( afp); break;
  case eslMSAFILE_SELEX:        status = esl_msafile_selex_SetInmap(    afp); break;
  case eslMSAFILE_STOCKHOLM:    status = esl_msafile_stockholm_SetInmap(afp); break;
  case eslMSAFILE_PROFILLIC:    status = eslOK;                               break; /* \todo status = profillic_esl_msafile_profile_SetInmap(afp); */
  default:                      ESL_XEXCEPTION(eslENOFORMAT, "no such alignment file format"); break;
  }
  if (status != eslOK) goto ERROR;      /* don't silently drop an inmap configuration failure */

  if (esl_byp_IsReturned(byp_abc)) *byp_abc = abc;
  return eslOK;

 ERROR:  /* on normal errors, afp is returned in an error state */
  if (abc && ! esl_byp_IsProvided(byp_abc)) { esl_alphabet_Destroy(abc); }
  if (esl_byp_IsReturned(byp_abc)) *byp_abc = NULL;
  afp->abc = NULL;
  return status;
}
/* Function:  p7_ProfileConfig()
 * Synopsis:  Configure a search profile.
 *
 * Purpose:   Given a model <hmm> with core probabilities, the null1
 *            model <bg>, a desired search <mode> (one of <p7_LOCAL>,
 *            <p7_GLOCAL>, <p7_UNILOCAL>, or <p7_UNIGLOCAL>), and an
 *            expected target sequence length <L>; configure the
 *            search model in <gm> with lod scores relative to the
 *            background frequencies in <bg>.
 *
 * Returns:   <eslOK> on success; the profile <gm> now contains
 *            scores and is ready for searching target sequences.
 *
 * Throws:    <eslEINVAL> if the alphabets of <hmm> and <gm> differ,
 *            if <gm> is too small to hold <hmm>, or if <hmm> has no
 *            consensus annotation.
 *            <eslEMEM> on allocation error.
 */
int
p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode)
{
  int    k, x, z;       /* counters over states, residues, annotation */
  int    status;
  float *occ = NULL;
  float *tp, *rp;
  float  sc[p7_MAXCODE];
  float  Z;

  /* Contract checks */
  if (gm->abc->type != hmm->abc->type) ESL_XEXCEPTION(eslEINVAL, "HMM and profile alphabet don't match");
  if (hmm->M > gm->allocM)             ESL_XEXCEPTION(eslEINVAL, "profile too small to hold HMM");
  if (! (hmm->flags & p7H_CONS))       ESL_XEXCEPTION(eslEINVAL, "HMM must have a consensus to transfer to the profile");

  /* Copy some pointer references and other info across from HMM  */
  gm->M                = hmm->M;
  gm->max_length       = hmm->max_length;
  gm->mode             = mode;
  gm->roff             = -1;
  gm->eoff             = -1;
  gm->offs[p7_MOFFSET] = -1;
  gm->offs[p7_FOFFSET] = -1;
  gm->offs[p7_POFFSET] = -1;

  /* Release the old annotation strings and NULL the pointers right away,
   * so <gm> never holds a dangling pointer if a strdup below fails.
   */
  free(gm->name);  gm->name = NULL;
  free(gm->acc);   gm->acc  = NULL;
  free(gm->desc);  gm->desc = NULL;
  if ((status = esl_strdup(hmm->name,   -1, &(gm->name))) != eslOK) goto ERROR;
  if ((status = esl_strdup(hmm->acc,    -1, &(gm->acc)))  != eslOK) goto ERROR;
  if ((status = esl_strdup(hmm->desc,   -1, &(gm->desc))) != eslOK) goto ERROR;
  if (hmm->flags & p7H_RF)    strcpy(gm->rf,        hmm->rf);
  if (hmm->flags & p7H_MMASK) strcpy(gm->mm,        hmm->mm);
  if (hmm->flags & p7H_CONS)  strcpy(gm->consensus, hmm->consensus); /* must be present, actually, so the flag test is just for symmetry w/ other optional HMM fields */
  if (hmm->flags & p7H_CS)    strcpy(gm->cs,        hmm->cs);
  for (z = 0; z < p7_NEVPARAM; z++) gm->evparam[z] = hmm->evparam[z];
  for (z = 0; z < p7_NCUTOFFS; z++) gm->cutoff[z]  = hmm->cutoff[z];
  for (z = 0; z < p7_MAXABET;  z++) gm->compo[z]   = hmm->compo[z];

  /* Entry scores. */
  if (p7_profile_IsLocal(gm))
    {
      /* Local mode entry:  occ[k] /( \sum_i occ[i] * (M-i+1))
       * (Reduces to uniform 2/(M(M+1)) for occupancies of 1.0)
       */
      Z = 0.;
      ESL_ALLOC(occ, sizeof(float) * (hmm->M+1));

      if ((status = p7_hmm_CalculateOccupancy(hmm, occ, NULL)) != eslOK) goto ERROR;
      for (k = 1; k <= hmm->M; k++)
        Z += occ[k] * (float) (hmm->M-k+1);
      for (k = 1; k <= hmm->M; k++)
        p7P_TSC(gm, k-1, p7P_BM) = log(occ[k] / Z); /* note off-by-one: entry at Mk stored as [k-1][BM] */

      free(occ);
      occ = NULL;       /* the ERROR path frees <occ> too; avoid a double free on a later failure */
    }
  else  /* glocal modes: left wing retraction; must be in log space for precision */
    {
      Z = log(hmm->t[0][p7H_MD]);
      p7P_TSC(gm, 0, p7P_BM) = log(1.0 - hmm->t[0][p7H_MD]);
      for (k = 1; k < hmm->M; k++)
        {
          p7P_TSC(gm, k, p7P_BM) = Z + log(hmm->t[k][p7H_DM]);
          Z += log(hmm->t[k][p7H_DD]);
        }
    }

  /* E state loop/move probabilities: nonzero for MOVE allows loops/multihits
   * N,C,J transitions are set later by length config
   */
  if (p7_profile_IsMultihit(gm)) {
    gm->xsc[p7P_E][p7P_MOVE] = -eslCONST_LOG2;
    gm->xsc[p7P_E][p7P_LOOP] = -eslCONST_LOG2;
    gm->nj                   = 1.0f;
  } else {
    gm->xsc[p7P_E][p7P_MOVE] = 0.0f;
    gm->xsc[p7P_E][p7P_LOOP] = -eslINFINITY;
    gm->nj                   = 0.0f;
  }

  /* Transition scores. */
  for (k = 1; k < gm->M; k++) {
    tp = gm->tsc + k * p7P_NTRANS;
    tp[p7P_MM] = log(hmm->t[k][p7H_MM]);
    tp[p7P_MI] = log(hmm->t[k][p7H_MI]);
    tp[p7P_MD] = log(hmm->t[k][p7H_MD]);
    tp[p7P_IM] = log(hmm->t[k][p7H_IM]);
    tp[p7P_II] = log(hmm->t[k][p7H_II]);
    tp[p7P_DM] = log(hmm->t[k][p7H_DM]);
    tp[p7P_DD] = log(hmm->t[k][p7H_DD]);
  }

  /* Match emission scores. */
  sc[hmm->abc->K]    = -eslINFINITY; /* gap character */
  sc[hmm->abc->Kp-2] = -eslINFINITY; /* nonresidue character */
  sc[hmm->abc->Kp-1] = -eslINFINITY; /* missing data character */
  for (k = 1; k <= hmm->M; k++) {
    for (x = 0; x < hmm->abc->K; x++)
      sc[x] = log((double)hmm->mat[k][x] / bg->f[x]);
    esl_abc_FExpectScVec(hmm->abc, sc, bg->f);
    for (x = 0; x < hmm->abc->Kp; x++) {
      rp = gm->rsc[x] + k * p7P_NR;
      rp[p7P_MSC] = sc[x];
    }
  }

  /* Insert emission scores */
  /* SRE, Fri Dec 5 08:41:08 2008: We currently hardwire insert scores
   * to 0, i.e. corresponding to the insertion emission probabilities
   * being equal to the background probabilities. Benchmarking shows
   * that setting inserts to informative emission distributions causes
   * more problems than it's worth: polar biased composition hits
   * driven by stretches of "insertion" occur, and are difficult to
   * correct for.
   */
  for (x = 0; x < gm->abc->Kp; x++)
    {
      for (k = 1; k < hmm->M; k++) p7P_ISC(gm, k, x) = 0.0f;
      p7P_ISC(gm, hmm->M, x) = -eslINFINITY;   /* init I_M to impossible.   */
    }
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->K)    = -eslINFINITY; /* gap symbol */
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->Kp-2) = -eslINFINITY; /* nonresidue symbol */
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->Kp-1) = -eslINFINITY; /* missing data symbol */

#if 0
  /* original (informative) insert setting: relies on sc[K, Kp-1] initialization to -inf above */
  for (k = 1; k < hmm->M; k++) {
    for (x = 0; x < hmm->abc->K; x++)
      sc[x] = log(hmm->ins[k][x] / bg->f[x]);
    esl_abc_FExpectScVec(hmm->abc, sc, bg->f);
    for (x = 0; x < hmm->abc->Kp; x++) {
      rp = gm->rsc[x] + k*p7P_NR;
      rp[p7P_ISC] = sc[x];
    }
  }
  for (x = 0; x < hmm->abc->Kp; x++)
    p7P_ISC(gm, hmm->M, x) = -eslINFINITY;   /* init I_M to impossible.   */
#endif

  /* Remaining specials, [NCJ][MOVE | LOOP] are set by ReconfigLength()
   */
  gm->L = 0;                    /* force ReconfigLength to reconfig */
  if ((status = p7_ReconfigLength(gm, L)) != eslOK) goto ERROR;
  return eslOK;

 ERROR:
  free(occ);    /* NULL unless we failed between allocating and releasing it */
  return status;
}
/* Function:  p7_ProfileConfig()
 * Synopsis:  Configure a search profile.
 * Incept:    SRE, Sun Sep 25 12:21:25 2005 [St. Louis]
 *
 * Purpose:   Given a model <hmm> with core probabilities, the null1
 *            model <bg>, a desired search <mode> (one of <p7_LOCAL>,
 *            <p7_GLOCAL>, <p7_UNILOCAL>, or <p7_UNIGLOCAL>), and an
 *            expected target sequence length <L>; configure the
 *            search model in <gm> with lod scores relative to the
 *            background frequencies in <bg>.
 *
 * Returns:   <eslOK> on success; the profile <gm> now contains
 *            scores and is ready for searching target sequences.
 *
 * Throws:    <eslEINVAL> if the alphabets of <hmm> and <gm> differ or
 *            <gm> is too small to hold <hmm>.
 *            <eslEMEM> on allocation error.
 */
int
p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode)
{
  int   k, x, z;        /* counters over states, residues, annotation */
  int   status;
  float *occ = NULL;
  float *tp, *rp;
  float  sc[p7_MAXCODE];
  float  mthresh;
  float  Z;

  /* Contract checks */
  if (gm->abc->type != hmm->abc->type) ESL_XEXCEPTION(eslEINVAL, "HMM and profile alphabet don't match");
  if (hmm->M > gm->allocM)             ESL_XEXCEPTION(eslEINVAL, "profile too small to hold HMM");

  /* Copy some pointer references and other info across from HMM  */
  gm->M                = hmm->M;
  gm->mode             = mode;
  gm->roff             = -1;
  gm->eoff             = -1;
  gm->offs[p7_MOFFSET] = -1;
  gm->offs[p7_FOFFSET] = -1;
  gm->offs[p7_POFFSET] = -1;
  /* Old annotation strings are released, then replaced by copies of the HMM's. */
  if (gm->name != NULL) free(gm->name);
  if (gm->acc  != NULL) free(gm->acc);
  if (gm->desc != NULL) free(gm->desc);
  if ((status = esl_strdup(hmm->name,   -1, &(gm->name))) != eslOK) goto ERROR;
  if ((status = esl_strdup(hmm->acc,    -1, &(gm->acc)))  != eslOK) goto ERROR;
  if ((status = esl_strdup(hmm->desc,   -1, &(gm->desc))) != eslOK) goto ERROR;
  if (hmm->flags & p7H_RF)    strcpy(gm->rf,        hmm->rf);
  if (hmm->flags & p7H_CS)    strcpy(gm->cs,        hmm->cs);
  for (z = 0; z < p7_NEVPARAM; z++) gm->evparam[z] = hmm->evparam[z];
  for (z = 0; z < p7_NCUTOFFS; z++) gm->cutoff[z]  = hmm->cutoff[z];
  for (z = 0; z < p7_MAXABET;  z++) gm->compo[z]   = hmm->compo[z];

  /* Determine the "consensus" residue for each match position.
   * This is only used for alignment displays, not in any calculations.
   * The residue is the highest-probability match emission; it is shown
   * in upper case when its probability exceeds <mthresh>, else lower case.
   */
  if      (hmm->abc->type == eslAMINO) mthresh = 0.5;
  else if (hmm->abc->type == eslDNA)   mthresh = 0.9;
  else if (hmm->abc->type == eslRNA)   mthresh = 0.9;
  else                                 mthresh = 0.5;
  gm->consensus[0] = ' ';       /* position 0 is a placeholder; match positions are 1..M */
  for (k = 1; k <= hmm->M; k++) {
    x = esl_vec_FArgMax(hmm->mat[k], hmm->abc->K);
    gm->consensus[k] = ((hmm->mat[k][x] > mthresh) ? toupper(hmm->abc->sym[x]) : tolower(hmm->abc->sym[x]));
  }
  gm->consensus[hmm->M+1] = '\0';

  /* Entry scores. */
  if (p7_profile_IsLocal(gm))
    {
      /* Local mode entry:  occ[k] /( \sum_i occ[i] * (M-i+1))
       * (Reduces to uniform 2/(M(M+1)) for occupancies of 1.0)
       */
      Z = 0.;
      ESL_ALLOC_WITH_TYPE(occ, float*, sizeof(float) * (hmm->M+1));

      if ((status = p7_hmm_CalculateOccupancy(hmm, occ, NULL)) != eslOK) goto ERROR;
      for (k = 1; k <= hmm->M; k++)
        Z += occ[k] * (float) (hmm->M-k+1);
      for (k = 1; k <= hmm->M; k++)
        p7P_TSC(gm, k-1, p7P_BM) = log((double)(occ[k] / Z)); /* note off-by-one: entry at Mk stored as [k-1][BM] */

      free(occ);
    }
  else  /* glocal modes: left wing retraction; must be in log space for precision */
    {
      /* NOTE(review): this copy of the definition stops here in the file; the
       * glocal branch and the rest of the function body are not present.
       */
/* Function:  p7_hmm_mpi_Unpack()
 * Synopsis:  Unpacks one HMM from an MPI buffer.
 *
 * Purpose:   Unpack one HMM from MPI packed buffer
 *            <buf>, starting from position <*pos>, where the total length
 *            of the buffer in bytes is <n>. The new HMM is allocated here.
 *
 *            Caller may or may not already know what alphabet the HMM
 *            is expected to be in.  A reference to the current
 *            alphabet is passed in <byp_abc>. If the alphabet is unknown,
 *            pass <*byp_abc = NULL>, and when the HMM is received, an
 *            appropriate new alphabet object is allocated and passed
 *            back to the caller via <*byp_abc>.  If the alphabet is
 *            already known, <*byp_abc> is that alphabet, and the new
 *            HMM's alphabet type is verified to agree with it. This
 *            mechanism allows an application to let the first HMM
 *            determine the alphabet type for the application, while
 *            still keeping the alphabet under the application's scope
 *            of control.
 *
 * Args:      buf      - MPI packed buffer to unpack
 *            n        - total length of <buf> in bytes
 *            pos      - current parsing/unpacking position in <buf>
 *            comm     - MPI communicator
 *            byp_abc  - BYPASS: <*byp_abc> is an <ESL_ALPHABET *> if known;
 *                               <*byp_abc == NULL> if alphabet unknown
 *            ret_hmm  - RETURN: ptr to newly allocated, unpacked HMM
 *
 * Returns:   <eslOK> on success. <*pos> is updated to the position of
 *            the next element in <buf> to unpack (if any). <*ret_hmm>
 *            contains a newly allocated HMM, which the caller is
 *            responsible for free'ing.  If <*byp_abc> was passed as
 *            <NULL>, it now points to an <ESL_ALPHABET> object that
 *            was allocated here; caller is responsible for free'ing
 *            this.
 *
 *            Returns <eslEINCOMPAT> if the HMM is in a different
 *            alphabet than <*byp_abc> said to expect. In this case,
 *            <*byp_abc> is unchanged, <*pos> has been advanced past
 *            the fields already read, and <*ret_hmm> is <NULL>.
 *
 * Throws:    <eslESYS> on an MPI call failure. <eslEMEM> on allocation failure.
 *            In either case, <*ret_hmm> is <NULL>, and the state of <buf>
 *            and <*pos> is undefined and should be considered to be corrupted.
 */
int
p7_hmm_mpi_Unpack(char *buf, int n, int *pos, MPI_Comm comm, ESL_ALPHABET **byp_abc, P7_HMM **ret_hmm)
{
  P7_HMM       *hmm = NULL;
  ESL_ALPHABET *abc = NULL;
  int64_t       offset = 0;
  int           M, K, atype;
  int           status;

  /* Use the CreateShell/CreateBody interface, because that interface allocates our optional fields, using <flags> */
  if (( hmm = p7_hmm_CreateShell() ) == NULL) { status = eslEMEM; goto ERROR; }

  /* First, unpack info that we need for HMM body allocation */
  if (MPI_Unpack(buf, n, pos, &M,            1, MPI_INT, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &(hmm->flags), 1, MPI_INT, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &atype,        1, MPI_INT, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");

  /* Set or verify the alphabet */
  if (*byp_abc == NULL) {       /* alphabet unknown. create new one */
    if ( (abc = esl_alphabet_Create(atype)) == NULL) { status = eslEMEM; goto ERROR; }
  } else {                      /* already known: check it */
    abc = *byp_abc;
    if (abc->type != atype) { status = eslEINCOMPAT; goto ERROR; }
  }
  K = abc->K;                   /* For convenience below. */

  /* Allocate the HMM body */
  if ((status = p7_hmm_CreateBody(hmm, M, abc)) != eslOK) goto ERROR;

  /* Unpack the rest of the HMM. Optional annotation is present in the
   * buffer only when the corresponding bit is set in <hmm->flags>.
   */
  if (MPI_Unpack( buf, n, pos, hmm->t[0],   p7H_NTRANSITIONS*(M+1), MPI_FLOAT, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack( buf, n, pos, hmm->mat[0],                K*(M+1), MPI_FLOAT, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack( buf, n, pos, hmm->ins[0],                K*(M+1), MPI_FLOAT, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");

  if ((status = esl_mpi_UnpackOpt( buf, n, pos, (void**) &(hmm->name), NULL, MPI_CHAR, comm)) != eslOK) goto ERROR;
  if ((status = esl_mpi_UnpackOpt( buf, n, pos, (void**) &(hmm->acc),  NULL, MPI_CHAR, comm)) != eslOK) goto ERROR;
  if ((status = esl_mpi_UnpackOpt( buf, n, pos, (void**) &(hmm->desc), NULL, MPI_CHAR, comm)) != eslOK) goto ERROR;

  if (hmm->flags & p7H_RF)    { if (MPI_Unpack(buf, n, pos, hmm->rf,        M+2, MPI_CHAR, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); }
  if (hmm->flags & p7H_MMASK) { if (MPI_Unpack(buf, n, pos, hmm->mm,        M+2, MPI_CHAR, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); }
  if (hmm->flags & p7H_CONS)  { if (MPI_Unpack(buf, n, pos, hmm->consensus, M+2, MPI_CHAR, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); }
  if (hmm->flags & p7H_CS)    { if (MPI_Unpack(buf, n, pos, hmm->cs,        M+2, MPI_CHAR, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); }
  if (hmm->flags & p7H_CA)    { if (MPI_Unpack(buf, n, pos, hmm->ca,        M+2, MPI_CHAR, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); }

  if ((status = esl_mpi_UnpackOpt( buf, n, pos, (void**)&(hmm->comlog), NULL, MPI_CHAR, comm)) != eslOK) goto ERROR;
  if (MPI_Unpack( buf, n, pos, &(hmm->nseq),       1, MPI_INT,   comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack( buf, n, pos, &(hmm->eff_nseq),   1, MPI_FLOAT, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  /* <max_length> is an integer field: it must be unpacked as MPI_INT, not MPI_FLOAT. */
  if (MPI_Unpack( buf, n, pos, &(hmm->max_length), 1, MPI_INT,   comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if ((status = esl_mpi_UnpackOpt( buf, n, pos, (void**) &(hmm->ctime), NULL, MPI_CHAR, comm)) != eslOK) goto ERROR;

  if (hmm->flags & p7H_MAP)   { if (MPI_Unpack(buf, n, pos, hmm->map,       M+1, MPI_INT,  comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed"); }

  if (MPI_Unpack( buf, n, pos, &(hmm->checksum), 1,           MPI_UINT32_T, comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack( buf, n, pos, hmm->evparam,     p7_NEVPARAM, MPI_FLOAT,    comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack( buf, n, pos, hmm->cutoff,      p7_NCUTOFFS, MPI_FLOAT,    comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack( buf, n, pos, hmm->compo,       p7_MAXABET,  MPI_FLOAT,    comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack( buf, n, pos, &offset,          1,           MPI_INT64_T,  comm) != MPI_SUCCESS) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  hmm->offset = offset;         /* receive as int64_t, then cast to off_t, which is probably int64_t (but not guaranteed) */

  *byp_abc = abc;               /* works even if caller provided *byp_abc, because then abc==*byp_abc already */
  *ret_hmm = hmm;
  return eslOK;

 ERROR:
  if (hmm) p7_hmm_Destroy(hmm);
  if (abc && *byp_abc == NULL) esl_alphabet_Destroy(abc);       /* only free an alphabet we created here */
  *ret_hmm = NULL;
  return status;
}
/* Function: esl_stats_LinearRegression() * Synopsis: Fit data to a straight line. * Incept: SRE, Sat May 26 11:33:46 2007 [Janelia] * * Purpose: Fit <n> points <x[i]>, <y[i]> to a straight line * $y = a + bx$ by linear regression. * * The $x_i$ are taken to be known, and the $y_i$ are taken * to be observed quantities associated with a sampling * error $\sigma_i$. If known, the standard deviations * $\sigma_i$ for $y_i$ are provided in the <sigma> array. * If they are unknown, pass <sigma = NULL>, and the * routine will proceed with the assumption that $\sigma_i * = 1$ for all $i$. * * The maximum likelihood estimates for $a$ and $b$ are * optionally returned in <opt_a> and <opt_b>. * * The estimated standard deviations of $a$ and $b$ and * their estimated covariance are optionally returned in * <opt_sigma_a>, <opt_sigma_b>, and <opt_cov_ab>. * * The Pearson correlation coefficient is optionally * returned in <opt_cc>. * * The $\chi^2$ P-value for the regression fit is * optionally returned in <opt_Q>. This P-value may only be * obtained when the $\sigma_i$ are known. If <sigma> is * passed as <NULL> and <opt_Q> is requested, <*opt_Q> is * set to 1.0. * * This routine follows the description and algorithm in * \citep[pp.661-666]{Press93}. * * <n> must be greater than 2; at least two x[i] must * differ; and if <sigma> is provided, all <sigma[i]> must * be $>0$. If any of these conditions isn't met, the * routine throws <eslEINVAL>. * * Args: x - x[0..n-1] * y - y[0..n-1] * sigma - sample error in observed y_i * n - number of data points * opt_a - optRETURN: intercept estimate * opt_b - optRETURN: slope estimate * opt_sigma_a - optRETURN: error in estimate of a * opt_sigma_b - optRETURN: error in estimate of b * opt_cov_ab - optRETURN: covariance of a,b estimates * opt_cc - optRETURN: Pearson correlation coefficient for x,y * opt_Q - optRETURN: X^2 P-value for linear fit * * Returns: <eslOK> on success. 
* * Throws: <eslEMEM> on allocation error; * <eslEINVAL> if a contract condition isn't met; * <eslENORESULT> if the chi-squared test fails. * In these cases, all optional return values are set to 0. */ int esl_stats_LinearRegression(const double *x, const double *y, const double *sigma, int n, double *opt_a, double *opt_b, double *opt_sigma_a, double *opt_sigma_b, double *opt_cov_ab, double *opt_cc, double *opt_Q) { int status; double *t = NULL; double S, Sx, Sy, Stt; double Sxy, Sxx, Syy; double a, b, sigma_a, sigma_b, cov_ab, cc, X2, Q; double xdev, ydev; double tmp; int i; /* Contract checks. */ if (n <= 2) ESL_XEXCEPTION(eslEINVAL, "n must be > 2 for linear regression fitting"); if (sigma != NULL) for (i = 0; i < n; i++) if (sigma[i] <= 0.) ESL_XEXCEPTION(eslEINVAL, "sigma[%d] <= 0", i); status = eslEINVAL; for (i = 0; i < n; i++) if (x[i] != 0.) { status = eslOK; break; } if (status != eslOK) ESL_XEXCEPTION(eslEINVAL, "all x[i] are 0."); /* Allocations */ ESL_ALLOC(t, sizeof(double) * n); /* S = \sum_{i=1}{n} \frac{1}{\sigma_i^2}. (S > 0.) */ if (sigma != NULL) { for (S = 0., i = 0; i < n; i++) S += 1./ (sigma[i] * sigma[i]); } else S = (double) n; /* S_x = \sum_{i=1}{n} \frac{x[i]}{ \sigma_i^2} (Sx real.) */ for (Sx = 0., i = 0; i < n; i++) { if (sigma == NULL) Sx += x[i]; else Sx += x[i] / (sigma[i] * sigma[i]); } /* S_y = \sum_{i=1}{n} \frac{y[i]}{\sigma_i^2} (Sy real.) 
*/ for (Sy = 0., i = 0; i < n; i++) { if (sigma == NULL) Sy += y[i]; else Sy += y[i] / (sigma[i] * sigma[i]); } /* t_i = \frac{1}{\sigma_i} \left( x_i - \frac{S_x}{S} \right) (t_i real) */ for (i = 0; i < n; i++) { t[i] = x[i] - Sx/S; if (sigma != NULL) t[i] /= sigma[i]; } /* S_{tt} = \sum_{i=1}^n t_i^2 (if at least one x is != 0, Stt > 0) */ for (Stt = 0., i = 0; i < n; i++) { Stt += t[i] * t[i]; } /* b = \frac{1}{S_{tt}} \sum_{i=1}^{N} \frac{t_i y_i}{\sigma_i} */ for (b = 0., i = 0; i < n; i++) { if (sigma != NULL) { b += t[i]*y[i] / sigma[i]; } else { b += t[i]*y[i]; } } b /= Stt; /* a = \frac{ S_y - S_x b } {S} */ a = (Sy - Sx * b) / S; /* \sigma_a^2 = \frac{1}{S} \left( 1 + \frac{ S_x^2 }{S S_{tt}} \right) */ sigma_a = sqrt ((1. + (Sx*Sx) / (S*Stt)) / S); /* \sigma_b = \frac{1}{S_{tt}} */ sigma_b = sqrt (1. / Stt); /* Cov(a,b) = - \frac{S_x}{S S_{tt}} */ cov_ab = -Sx / (S * Stt); /* Pearson correlation coefficient */ Sxy = Sxx = Syy = 0.; for (i = 0; i < n; i++) { if (sigma != NULL) { xdev = (x[i] / (sigma[i] * sigma[i])) - (Sx / n); ydev = (y[i] / (sigma[i] * sigma[i])) - (Sy / n); } else { xdev = x[i] - (Sx / n); ydev = y[i] - (Sy / n); } Sxy += xdev * ydev; Sxx += xdev * xdev; Syy += ydev * ydev; } cc = Sxy / (sqrt(Sxx) * sqrt(Syy)); /* \chi^2 */ for (X2 = 0., i = 0; i < n; i++) { tmp = y[i] - a - b*x[i]; if (sigma != NULL) tmp /= sigma[i]; X2 += tmp*tmp; } /* We can calculate a goodness of fit if we know the \sigma_i */ if (sigma != NULL) { if (esl_stats_ChiSquaredTest(n-2, X2, &Q) != eslOK) { status = eslENORESULT; goto ERROR; } } else Q = 1.0; /* If we didn't use \sigma_i, adjust the sigmas for a,b */ if (sigma == NULL) { tmp = sqrt(X2 / (double)(n-2)); sigma_a *= tmp; sigma_b *= tmp; } /* Done. Set up for normal return. 
*/ free(t); if (opt_a != NULL) *opt_a = a; if (opt_b != NULL) *opt_b = b; if (opt_sigma_a != NULL) *opt_sigma_a = sigma_a; if (opt_sigma_b != NULL) *opt_sigma_b = sigma_b; if (opt_cov_ab != NULL) *opt_cov_ab = cov_ab; if (opt_cc != NULL) *opt_cc = cc; if (opt_Q != NULL) *opt_Q = Q; return eslOK; ERROR: if (t != NULL) free(t); if (opt_a != NULL) *opt_a = 0.; if (opt_b != NULL) *opt_b = 0.; if (opt_sigma_a != NULL) *opt_sigma_a = 0.; if (opt_sigma_b != NULL) *opt_sigma_b = 0.; if (opt_cov_ab != NULL) *opt_cov_ab = 0.; if (opt_cc != NULL) *opt_cc = 0.; if (opt_Q != NULL) *opt_Q = 0.; return status; }
/* Function: matassign2hmm()
 *
 * Purpose:  Shared back end of model construction. Once the caller
 *           has decided which alignment columns are match columns
 *           (<matassign>), this routine does everything that is
 *           the same regardless of how that decision was made:
 *           builds faux traces for the aligned sequences, collects
 *           counts into a new HMM, annotates it, and rewrites the
 *           alignment's RF line to reflect the assignment.
 *
 * Args:     msa       - multiple sequence alignment (its <rf> line is
 *                       overwritten, and allocated if it was NULL)
 *           matassign - 1..alen flags; nonzero means "match column"
 *           ret_hmm   - RETURN: counts-form HMM
 *           opt_tr    - optRETURN: array of tracebacks for aseq's
 *
 * Return:   <eslOK> on success.
 *           <eslENORESULT> if no consensus columns are identified.
 *           Other non-OK codes are passed up from the routines called
 *           here. On any failure <*ret_hmm> is NULL, and <*opt_tr>
 *           (if requested) is NULL.
 *
 *           ret_hmm and opt_tr alloc'ed here.
 */
static int
matassign2hmm(ESL_MSA *msa, int *matassign, P7_HMM **ret_hmm, P7_TRACE ***opt_tr)
{
  P7_TRACE **traces = NULL;     /* RETURN: 0..nseq-1 faux traces        */
  P7_HMM    *model  = NULL;     /* RETURN: the count model being built  */
  int        nmatch = 0;        /* number of match columns = model len  */
  int        col;               /* alignment column, 1..alen            */
  int        s;                 /* sequence index, 0..nseq-1            */
  int        status;
#ifdef p7_DEBUGGING
  char       errbuf[eslERRBUFSIZE];
#endif

  /* apply the model mask in the 'GC MM' row */
  do_modelmask(msa);

  /* Model length is the number of match-assigned columns; zero means nothing to build. */
  for (col = 1; col <= msa->alen; col++)
    if (matassign[col]) nmatch++;
  if (nmatch == 0) { status = eslENORESULT; goto ERROR; }

  /* One faux traceback per aligned sequence. */
  ESL_ALLOC(traces, sizeof(P7_TRACE *) * msa->nseq);
  status = p7_trace_FauxFromMSA(msa, matassign, p7_MSA_COORDS, traces);
  if (status != eslOK) goto ERROR;

  for (s = 0; s < msa->nseq; s++)
    {
      status = p7_trace_Doctor(traces[s], NULL, NULL);
      if (status != eslOK) goto ERROR;
#ifdef p7_DEBUGGING
      status = p7_trace_Validate(traces[s], msa->abc, msa->ax[s], errbuf);
      if (status != eslOK) ESL_XEXCEPTION(eslFAIL, "validation failed: %s", errbuf);
#endif
    }

  /* Collect weighted counts from the tracebacks into a zeroed model. */
  model = p7_hmm_Create(nmatch, msa->abc);
  if (model == NULL) { status = eslEMEM; goto ERROR; }

  status = p7_hmm_Zero(model);
  if (status != eslOK) goto ERROR;

  for (s = 0; s < msa->nseq; s++)
    {
      if (traces[s] != NULL)    /* a NULL trace marks a (rare) empty sequence: skip it */
        {
          status = p7_trace_Count(model, msa->ax[s], msa->wgt[s], traces[s]);
          if (status != eslOK) goto ERROR;
        }
    }
  model->nseq     = msa->nseq;
  model->eff_nseq = msa->nseq;

  /* Carry annotation over from the alignment. */
  status = annotate_model(model, matassign, msa);
  if (status != eslOK) goto ERROR;

  /* Rewrite the alignment's RF line from our assignment. <matassign>
   * is 1-based and <msa->rf> is 0-based, hence the col-1.
   */
  if (msa->rf == NULL) ESL_ALLOC(msa->rf, sizeof(char) * (msa->alen + 1));
  for (col = 1; col <= msa->alen; col++)
    {
      if (matassign[col]) msa->rf[col-1] = 'x';
      else                msa->rf[col-1] = '.';
    }
  msa->rf[msa->alen] = '\0';

  /* Hand the traces to the caller if wanted; else they're ours to free. */
  if (opt_tr != NULL) *opt_tr = traces;
  else                p7_trace_DestroyArray(traces, msa->nseq);
  *ret_hmm = model;
  return eslOK;

 ERROR:
  if (traces != NULL) p7_trace_DestroyArray(traces, msa->nseq);
  if (model  != NULL) p7_hmm_Destroy(model);
  if (opt_tr != NULL) *opt_tr = NULL;
  *ret_hmm = NULL;
  return status;
}
/* Function:  p7_CoreEmit()
 * Incept:    SRE, Tue Jan  9 10:20:51 2007 [Janelia]
 *
 * Purpose:   Generate (sample) a sequence from a core HMM <hmm>.
 *
 *            Optionally return the sequence and/or its trace in <sq>
 *            and <tr>, respectively, which the caller has
 *            allocated. Having the caller provide these reusable
 *            objects allows re-use of both <sq> and <tr> in repeated
 *            calls, saving malloc/free wastage. Either can be passed
 *            as <NULL> if it isn't needed.
 *
 *            This does not set any fields in the <sq> except for the
 *            sequence itself. Caller must set the name, and any other
 *            annotation it wants to add.
 *
 *            Trace is relative to the core model: it may include
 *            I_0 and I_M states, B->DD->M entry is explicit, and a
 *            0 length generated sequence is possible.
 *
 *            On return, <tr->L> is the number of residues that were
 *            emitted. The final transit into the virtual M_{M+1}
 *            node is recorded as the E state and emits nothing, so
 *            it does not advance the sequence position.
 *
 * Args:      r     -  source of randomness
 *            hmm   -  core HMM to generate from
 *            sq    -  opt: digital sequence sampled (or NULL)
 *            tr    -  opt: trace sampled            (or NULL)
 *
 * Returns:   <eslOK> on success;
 *            optionally return the digital sequence through <sq>,
 *            and optionally return its trace in <tr>.
 *
 * Throws:    <eslECORRUPT> if emission gets us into an illegal state,
 *            probably indicating that a probability that should have
 *            been zero wasn't.
 *
 *            <eslEINCONCEIVABLE> if a sampled transition index is
 *            outside the range that was asked for.
 *
 *            Throws <eslEMEM> on a reallocation error.
 *
 *            In these cases, the contents of <sq> and <tr> may be
 *            corrupted. Caller should not trust their data, but may
 *            safely reuse them.
 *
 * Xref:      STL11/124.
 */
int
p7_CoreEmit(ESL_RANDOMNESS *r, const P7_HMM *hmm, ESL_SQ *sq, P7_TRACE *tr)
{
  int       k   = 0;            /* position in model nodes 1..M */
  int       i   = 0;            /* position in sequence 1..L    */
  char      st  = p7T_B;        /* state type                   */
  int       x;                  /* sampled residue              */
  int       status;

  if (sq != NULL) esl_sq_Reuse(sq);
  if (tr != NULL) {
    if ((status = p7_trace_Reuse(tr))            != eslOK) goto ERROR;
    if ((status = p7_trace_Append(tr, st, k, i)) != eslOK) goto ERROR;
  }

  while (st != p7T_E)
    {
      /* Sample next state type, given current state type (and current k).
       * hmm->t[k] is read as 3 match-origin transitions, then 2
       * insert-origin (offset 3), then 2 delete-origin (offset 5).
       */
      switch (st) {
      case p7T_B:
      case p7T_M:
        switch (esl_rnd_FChoose(r, hmm->t[k], 3)) {
        case 0:  st = p7T_M; break;
        case 1:  st = p7T_I; break;
        case 2:  st = p7T_D; break;
        default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible.");
        }
        break;

      case p7T_I:
        switch (esl_rnd_FChoose(r, hmm->t[k]+3, 2)) {
        case 0:  st = p7T_M; break;
        case 1:  st = p7T_I; break;
        default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible.");
        }
        break;

      case p7T_D:
        switch (esl_rnd_FChoose(r, hmm->t[k]+5, 2)) {
        case 0:  st = p7T_M; break;
        case 1:  st = p7T_D; break;
        default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible.");
        }
        break;

      default: ESL_XEXCEPTION(eslECORRUPT, "impossible state reached during emission");
      }

      /* Bump k if the new state consumes a model node. */
      if (st == p7T_M || st == p7T_D) k++;

      /* a transit to M_M+1 is a transit to the E state */
      if (k == hmm->M+1) {
        if (st == p7T_M) { st = p7T_E; k = 0; }
        else             { ESL_XEXCEPTION(eslECORRUPT, "failed to reach E state properly"); }
      }

      /* Bump i only for a state that really emits. This must come
       * after the E conversion above: the M_M+1 -> E transit emits no
       * residue, and counting it would leave i (and tr->L) one too large.
       */
      if (st == p7T_M || st == p7T_I) i++;

      /* Sample new residue x if in match or insert */
      if      (st == p7T_M) x = esl_rnd_FChoose(r, hmm->mat[k], hmm->abc->K);
      else if (st == p7T_I) x = esl_rnd_FChoose(r, hmm->ins[k], hmm->abc->K);
      else                  x = eslDSQ_SENTINEL;

      /* Add state to trace */
      if (tr != NULL) {
        if ((status = p7_trace_Append(tr, st, k, i)) != eslOK) goto ERROR;
      }
      /* Add x to sequence */
      if (sq != NULL && x != eslDSQ_SENTINEL) {
        if ((status = esl_sq_XAddResidue(sq, x)) != eslOK) goto ERROR;
      }
    }

  /* Terminate the trace and sequence (both are optional, remember) */
  if (tr != NULL) { tr->M = hmm->M; tr->L = i; }
  if (sq != NULL && (status = esl_sq_XAddResidue(sq, eslDSQ_SENTINEL)) != eslOK) goto ERROR;
  return eslOK;

 ERROR:
  return status;
}
/* Function: p7_ProfileEmit() * Synopsis: Sample a sequence from the search form of the model. * Incept: SRE, Mon Jan 22 10:23:28 2007 [Janelia] * * Purpose: Sample a sequence from the implicit * probabilistic model of a Plan7 profile <gm>. This * requires also having the core probabilities of * the accompanying <hmm>, and the background * frequencies of null1 model <bg>. * * Optionally return the sequence and/or its trace in <sq> * and <tr>, respectively. Caller has allocated space for * both of these, though they may get reallocated/grown * here. Either can be passed as <NULL> if unneeded. * * Only the sequence field is set in the <sq>. Caller must * set the name, plus any other fields it wants to set. If * the <sq> was created in digital mode, this is the <sq->dsq>; * if the <sq> was created in text mode, this is <sq->seq>. * * <p7_ProfileEmit()> deliberately uses an <ESL_SQ> object * instead of a plain <ESL_DSQ *> or <char *> string, to * take advantage of the object's support for dynamic * reallocation of seq length, and to allow both digital and * text mode generation. 
* * Args: r - source of randomness * hmm - core probabilities of the profile * gm - configured search profile * sq - optRETURN: sampled sequence * tr - optRETURN: sampled trace * * Throws: (no abnormal error conditions) */ int p7_ProfileEmit(ESL_RANDOMNESS *r, const P7_HMM *hmm, const P7_PROFILE *gm, const P7_BG *bg, ESL_SQ *sq, P7_TRACE *tr) { char prv, st; /* prev, current state type */ int k = 0; /* position in model nodes 1..M */ int i = 0; /* position in sequence 1..L */ int x; /* sampled residue */ int kend = hmm->M; /* predestined end node */ int status; float xt[p7P_NXSTATES][p7P_NXTRANS]; /* Backcalculate the probabilities in the special states (loop and length model) */ for (i = 0; i < p7P_NXSTATES; i++) for (x = 0; x < p7P_NXTRANS; x++) xt[i][x] = exp(gm->xsc[i][x]); if (sq != NULL) esl_sq_Reuse(sq); if (tr != NULL) { if ((status = p7_trace_Reuse(tr)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_S, k, i)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_N, k, i)) != eslOK) goto ERROR; } st = p7T_N; i = 0; while (st != p7T_T) { /* Sample a state transition. After this section, prv and st (prev->current state) are set; * k also gets set if we make a B->Mk entry transition. */ prv = st; switch (st) { case p7T_B: if (p7_profile_IsLocal(gm)) { /* local mode: enter the implicit profile: choose our entry and our predestined exit */ if ((status = sample_endpoints(r, gm, &k, &kend)) != eslOK) goto ERROR; st = p7T_M; /* must be, because left wing is retracted */ } else { /* glocal mode: treat B as M_0, use its transitions to MID. */ /* FIXME: this is wrong. It should sample from B->Mk distribution! 
*/ switch (esl_rnd_FChoose(r, P7H_TMAT(hmm, 0), p7H_NTMAT)) { case 0: st = p7T_M; k = 1; break; case 1: st = p7T_I; k = 0; break; case 2: st = p7T_D; k = 1; break; default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible."); } } break; case p7T_M: if (k == kend) st = p7T_E; /* check our preordained fate */ else { switch (esl_rnd_FChoose(r, P7H_TMAT(hmm, k), p7H_NTMAT)) { case 0: st = p7T_M; break; case 1: st = p7T_I; break; case 2: st = p7T_D; break; default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "impossible."); } } break; case p7T_D: if (k == kend) st = p7T_E; else st = (esl_rnd_FChoose(r, P7H_TDEL(hmm, k), p7H_NTDEL) == 0) ? p7T_M : p7T_D; break; case p7T_I: st = (esl_rnd_FChoose(r, P7H_TINS(hmm, k), p7H_NTINS) == 0) ? p7T_M : p7T_I; break; case p7T_N: st = (esl_rnd_FChoose(r, xt[p7P_N], p7P_NXTRANS) == p7P_MOVE) ? p7T_B : p7T_N; break; case p7T_E: st = (esl_rnd_FChoose(r, xt[p7P_E], p7P_NXTRANS) == p7P_MOVE) ? p7T_C : p7T_J; break; case p7T_C: st = (esl_rnd_FChoose(r, xt[p7P_C], p7P_NXTRANS) == p7P_MOVE) ? p7T_T : p7T_C; break; case p7T_J: st = (esl_rnd_FChoose(r, xt[p7P_J], p7P_NXTRANS) == p7P_MOVE) ? p7T_B : p7T_J; break; default: ESL_XEXCEPTION(eslECORRUPT, "impossible state reached during emission"); } /* Based on the transition we just sampled, update k. */ if (st == p7T_E) k = 0; else if (st == p7T_M && prv != p7T_B) k++; /* be careful about B->Mk, where we already set k */ else if (st == p7T_D) k++; /* Based on the transition we just sampled, generate a residue. */ if (st == p7T_M) x = esl_rnd_FChoose(r, hmm->mat[k], hmm->abc->K); else if (st == p7T_I) x = esl_rnd_FChoose(r, hmm->ins[k], hmm->abc->K); else if ((st == p7T_N || st == p7T_C || st == p7T_J) && prv==st) x = esl_rnd_FChoose(r, bg->f, hmm->abc->K); else x = eslDSQ_SENTINEL; if (x != eslDSQ_SENTINEL) i++; /* Add residue (if any) to sequence */ if (sq != NULL && x != eslDSQ_SENTINEL && (status = esl_sq_XAddResidue(sq, x)) != eslOK) goto ERROR; /* Add state to trace. 
*/ if (tr != NULL) { if ((status = p7_trace_Append(tr, st, k, i)) != eslOK) goto ERROR; } } /* Terminate the trace and sequence (both are optional, remember) */ if (tr != NULL) { tr->M = hmm->M; tr->L = i; } if (sq != NULL && (status = esl_sq_XAddResidue(sq, eslDSQ_SENTINEL)) != eslOK) goto ERROR; return eslOK; ERROR: return status; }
/* Function: p7_alidisplay_Create() * Synopsis: Create an alignment display, from trace and oprofile. * Incept: SRE, Sun Dec 30 09:13:31 2007 [Janelia] * * Purpose: Creates and returns an alignment display for domain number * <which> in traceback <tr>, where the traceback * corresponds to an alignment of optimized profile <om> to digital sequence * <dsq>, and the unique name of that target * sequence <dsq> is <sqname>. The <which> index starts at 0. * * It will be a little faster if the trace is indexed with * <p7_trace_Index()> first. The number of domains is then * in <tr->ndom>. If the caller wants to create alidisplays * for all of these, it would loop <which> from * <0..tr->ndom-1>. * * However, even without an index, the routine will work fine. * * Args: tr - traceback * which - domain number, 0..tr->ndom-1 * om - optimized profile (query) * sq - digital sequence (target) * * Returns: <eslOK> on success. * * Throws: <NULL> on allocation failure, or if something's internally corrupt * in the data. */ P7_ALIDISPLAY * p7_alidisplay_Create(const P7_TRACE *tr, int which, const P7_OPROFILE *om, const ESL_SQ *sq) { P7_ALIDISPLAY *ad = NULL; char *Alphabet = om->abc->sym; int n, pos, z; int z1,z2; int k,x,i,s; int hmm_namelen, hmm_acclen, hmm_desclen; int sq_namelen, sq_acclen, sq_desclen; int status; /* First figure out which piece of the trace (from first match to last match) * we're going to represent, and how big it is. */ if (tr->ndom > 0) { /* if we have an index, this is a little faster: */ for (z1 = tr->tfrom[which]; z1 < tr->N; z1++) if (tr->st[z1] == p7T_M) break; /* find next M state */ if (z1 == tr->N) return NULL; /* no M? corrupt trace */ for (z2 = tr->tto[which]; z2 >= 0 ; z2--) if (tr->st[z2] == p7T_M) break; /* find prev M state */ if (z2 == -1) return NULL; /* no M? 
corrupt trace */ } else { /* without an index, we can still do it fine: */ for (z1 = 0; which >= 0 && z1 < tr->N; z1++) if (tr->st[z1] == p7T_B) which--; /* find the right B state */ if (z1 == tr->N) return NULL; /* no such domain <which> */ for (; z1 < tr->N; z1++) if (tr->st[z1] == p7T_M) break; /* find next M state */ if (z1 == tr->N) return NULL; /* no M? corrupt trace */ for (z2 = z1; z2 < tr->N; z2++) if (tr->st[z2] == p7T_E) break; /* find the next E state */ for (; z2 >= 0; z2--) if (tr->st[z2] == p7T_M) break; /* find prev M state */ if (z2 == -1) return NULL; /* no M? corrupt trace */ } /* Now we know that z1..z2 in the trace will be represented in the * alidisplay; that's z2-z1+1 positions. We need a \0 trailer on all * our display lines, so allocate z2-z1+2. We know each position is * M, D, or I, so there's a 1:1 correspondence of trace positions * with alignment display positions. We also know the display * starts and ends with M states. * * So now let's allocate. The alidisplay is packed into a single * memory space, so this appears to be intricate, but it's just * bookkeeping. */ n = (z2-z1+2) * 3; /* model, mline, aseq mandatory */ if (om->rf[0] != 0) n += z2-z1+2; /* optional reference line */ if (om->cs[0] != 0) n += z2-z1+2; /* optional structure line */ if (tr->pp != NULL) n += z2-z1+2; /* optional posterior prob line */ hmm_namelen = strlen(om->name); n += hmm_namelen + 1; hmm_acclen = (om->acc != NULL ? strlen(om->acc) : 0); n += hmm_acclen + 1; hmm_desclen = (om->desc != NULL ? 
strlen(om->desc) : 0); n += hmm_desclen + 1; sq_namelen = strlen(sq->name); n += sq_namelen + 1; sq_acclen = strlen(sq->acc); n += sq_acclen + 1; /* sq->acc is "\0" when unset */ sq_desclen = strlen(sq->desc); n += sq_desclen + 1; /* same for desc */ ESL_ALLOC(ad, sizeof(P7_ALIDISPLAY)); ad->mem = NULL; pos = 0; ad->memsize = sizeof(char) * n; ESL_ALLOC(ad->mem, ad->memsize); if (om->rf[0] != 0) { ad->rfline = ad->mem + pos; pos += z2-z1+2; } else { ad->rfline = NULL; } if (om->cs[0] != 0) { ad->csline = ad->mem + pos; pos += z2-z1+2; } else { ad->csline = NULL; } ad->model = ad->mem + pos; pos += z2-z1+2; ad->mline = ad->mem + pos; pos += z2-z1+2; ad->aseq = ad->mem + pos; pos += z2-z1+2; if (tr->pp != NULL) { ad->ppline = ad->mem + pos; pos += z2-z1+2;} else { ad->ppline = NULL; } ad->hmmname = ad->mem + pos; pos += hmm_namelen +1; ad->hmmacc = ad->mem + pos; pos += hmm_acclen +1; ad->hmmdesc = ad->mem + pos; pos += hmm_desclen +1; ad->sqname = ad->mem + pos; pos += sq_namelen +1; ad->sqacc = ad->mem + pos; pos += sq_acclen +1; ad->sqdesc = ad->mem + pos; pos += sq_desclen +1; strcpy(ad->hmmname, om->name); if (om->acc != NULL) strcpy(ad->hmmacc, om->acc); else ad->hmmacc[0] = 0; if (om->desc != NULL) strcpy(ad->hmmdesc, om->desc); else ad->hmmdesc[0] = 0; strcpy(ad->sqname, sq->name); strcpy(ad->sqacc, sq->acc); strcpy(ad->sqdesc, sq->desc); /* Determine hit coords */ ad->hmmfrom = tr->k[z1]; ad->hmmto = tr->k[z2]; ad->M = om->M; ad->sqfrom = tr->i[z1]; ad->sqto = tr->i[z2]; ad->L = sq->n; /* optional rf line */ if (ad->rfline != NULL) { for (z = z1; z <= z2; z++) ad->rfline[z-z1] = ((tr->st[z] == p7T_I) ? '.' : om->rf[tr->k[z]]); ad->rfline[z-z1] = '\0'; } /* optional cs line */ if (ad->csline != NULL) { for (z = z1; z <= z2; z++) ad->csline[z-z1] = ((tr->st[z] == p7T_I) ? '.' : om->cs[tr->k[z]]); ad->csline[z-z1] = '\0'; } /* optional pp line */ if (ad->ppline != NULL) { for (z = z1; z <= z2; z++) ad->ppline[z-z1] = ( (tr->st[z] == p7T_D) ? '.' 
: p7_alidisplay_EncodePostProb(tr->pp[z])); ad->ppline[z-z1] = '\0'; } /* mandatory three alignment display lines: model, mline, aseq */ for (z = z1; z <= z2; z++) { k = tr->k[z]; i = tr->i[z]; x = sq->dsq[i]; s = tr->st[z]; switch (s) { case p7T_M: ad->model[z-z1] = om->consensus[k]; if (x == esl_abc_DigitizeSymbol(om->abc, om->consensus[k])) ad->mline[z-z1] = ad->model[z-z1]; else if (p7_oprofile_FGetEmission(om, k, x) > 1.0) ad->mline[z-z1] = '+'; /* >1 not >0; om has odds ratios, not scores */ else ad->mline[z-z1] = ' '; ad->aseq [z-z1] = toupper(Alphabet[x]); break; case p7T_I: ad->model [z-z1] = '.'; ad->mline [z-z1] = ' '; ad->aseq [z-z1] = tolower(Alphabet[x]); break; case p7T_D: ad->model [z-z1] = om->consensus[k]; ad->mline [z-z1] = ' '; ad->aseq [z-z1] = '-'; break; default: ESL_XEXCEPTION(eslEINVAL, "invalid state in trace: not M,D,I"); } } ad->model [z2-z1+1] = '\0'; ad->mline [z2-z1+1] = '\0'; ad->aseq [z2-z1+1] = '\0'; ad->N = z2-z1+1; return ad; ERROR: p7_alidisplay_Destroy(ad); return NULL; }
/* Function: p7_alidisplay_Backconvert() * Synopsis: Convert an alidisplay to a faux trace and subsequence. * Incept: SRE, Wed Dec 10 09:49:28 2008 [Janelia] * * Purpose: Convert alignment display object <ad> to a faux subsequence * and faux subsequence trace, returning them in <ret_sq> and * <ret_tr>. * * The subsequence <*ret_sq> is digital; ascii residues in * <ad> are digitized using digital alphabet <abc>. * * The subsequence and trace are suitable for passing as * array elements to <p7_MultipleAlignment>. This is the * main purpose of backconversion. Results of a profile * search are stored in a hit list as a processed * <P7_ALIDISPLAY>, not as a <P7_TRACE> and <ESL_SQ>, to * reduce space and to reduce communication overhead in * parallelized search implementations. After reduction * to a final hit list, a master may want to construct a * multiple alignment of all the significant hits. * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation failures. <eslECORRUPT> on unexpected internal * data corruption. On any exception, <*ret_sq> and <*ret_tr> are * <NULL>. * * Xref: J4/29. */ int p7_alidisplay_Backconvert(const P7_ALIDISPLAY *ad, const ESL_ALPHABET *abc, ESL_SQ **ret_sq, P7_TRACE **ret_tr) { ESL_SQ *sq = NULL; /* RETURN: faux subsequence */ P7_TRACE *tr = NULL; /* RETURN: faux trace */ int subL = 0; /* subsequence length in the <ad> */ int a, i, k; /* coords for <ad>, <sq->dsq>, model */ char st; /* state type: MDI */ int status; /* Make a first pass over <ad> just to calculate subseq length */ for (a = 0; a < ad->N; a++) if (! esl_abc_CIsGap(abc, ad->aseq[a])) subL++; /* Allocations */ if ((sq = esl_sq_CreateDigital(abc)) == NULL) { status = eslEMEM; goto ERROR; } if ((status = esl_sq_GrowTo(sq, subL)) != eslOK) goto ERROR; if ((tr = (ad->ppline == NULL) ? 
p7_trace_Create() : p7_trace_CreateWithPP()) == NULL) { status = eslEMEM; goto ERROR; } if ((status = p7_trace_GrowTo(tr, subL+6)) != eslOK) goto ERROR; /* +6 is for SNB/ECT */ /* Construction of dsq, trace */ sq->dsq[0] = eslDSQ_SENTINEL; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_S, 0, 0) : p7_trace_AppendWithPP(tr, p7T_S, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_N, 0, 0) : p7_trace_AppendWithPP(tr, p7T_N, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_B, 0, 0) : p7_trace_AppendWithPP(tr, p7T_B, 0, 0, 0.0))) != eslOK) goto ERROR; k = ad->hmmfrom; i = 1; for (a = 0; a < ad->N; a++) { if (esl_abc_CIsResidue(abc, ad->model[a])) { st = (esl_abc_CIsResidue(abc, ad->aseq[a]) ? p7T_M : p7T_D); } else st = p7T_I; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, st, k, i) : p7_trace_AppendWithPP(tr, st, k, i, p7_alidisplay_DecodePostProb(ad->ppline[a])))) != eslOK) goto ERROR; switch (st) { case p7T_M: sq->dsq[i] = esl_abc_DigitizeSymbol(abc, ad->aseq[a]); k++; i++; break; case p7T_I: sq->dsq[i] = esl_abc_DigitizeSymbol(abc, ad->aseq[a]); i++; break; case p7T_D: k++; break; } } if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_E, 0, 0) : p7_trace_AppendWithPP(tr, p7T_E, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_C, 0, 0) : p7_trace_AppendWithPP(tr, p7T_C, 0, 0, 0.0))) != eslOK) goto ERROR; if ((status = ((ad->ppline == NULL) ? 
p7_trace_Append(tr, p7T_T, 0, 0) : p7_trace_AppendWithPP(tr, p7T_T, 0, 0, 0.0))) != eslOK) goto ERROR; sq->dsq[i] = eslDSQ_SENTINEL; /* some sanity checks */ if (tr->N != ad->N + 6) ESL_XEXCEPTION(eslECORRUPT, "backconverted trace ended up with unexpected size (%s/%s)", ad->sqname, ad->hmmname); if (k != ad->hmmto + 1) ESL_XEXCEPTION(eslECORRUPT, "backconverted trace didn't end at expected place on model (%s/%s)", ad->sqname, ad->hmmname); if (i != subL + 1) ESL_XEXCEPTION(eslECORRUPT, "backconverted subseq didn't end at expected length (%s/%s)", ad->sqname, ad->hmmname); /* Set up <sq> annotation as a subseq of a source sequence */ if ((status = esl_sq_FormatName(sq, "%s/%ld-%ld", ad->sqname, ad->sqfrom, ad->sqto)) != eslOK) goto ERROR; if ((status = esl_sq_FormatDesc(sq, "[subseq from] %s", ad->sqdesc[0] != '\0' ? ad->sqdesc : ad->sqname)) != eslOK) goto ERROR; if ((status = esl_sq_SetSource (sq, ad->sqname)) != eslOK) goto ERROR; if (ad->sqacc[0] != '\0') { if ((status = esl_sq_SetAccession (sq, ad->sqacc)) != eslOK) goto ERROR; } sq->n = subL; sq->start = ad->sqfrom; sq->end = ad->sqto; sq->C = 0; sq->W = subL; sq->L = ad->L; tr->M = ad->M; tr->L = ad->L; *ret_sq = sq; *ret_tr = tr; return eslOK; ERROR: if (sq != NULL) esl_sq_Destroy(sq); if (tr != NULL) p7_trace_Destroy(tr); *ret_sq = NULL; *ret_tr = NULL; return status; }
/* Function: p7_GStochasticTrace() * Synopsis: Stochastic traceback of a Forward matrix. * Incept: SRE, Thu Jan 3 15:39:20 2008 [Janelia] * * Purpose: Stochastic traceback of Forward matrix <gx> to * sample an alignment of digital sequence <dsq> * (of length <L>) to the profile <gm>. * * The sampled traceback is returned in <tr>, which the * caller must have at least made an initial allocation of * (the <tr> will be grown as needed here). * * Args: r - source of random numbers * dsq - digital sequence aligned to, 1..L * L - length of dsq * gm - profile * mx - Forward matrix to trace, L x M * tr - storage for the recovered traceback. * * Returns: <eslOK> on success. */ int p7_GStochasticTrace(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, const P7_GMX *gx, P7_TRACE *tr) { int status; int i; /* position in seq (1..L) */ int k; /* position in model (1..M) */ int M = gm->M; float **dp = gx->dp; float *xmx = gx->xmx; float const *tsc = gm->tsc; float *sc; /* scores of possible choices: up to 2M-1, in the case of exits to E */ int scur, sprv; /* we'll index M states as 1..M, and D states as 2..M = M+2..2M: M0, D1 are impossibles. */ ESL_ALLOC(sc, sizeof(float) * (2*M+1)); k = 0; i = L; if ((status = p7_trace_Append(tr, p7T_T, k, i)) != eslOK) goto ERROR; if ((status = p7_trace_Append(tr, p7T_C, k, i)) != eslOK) goto ERROR; sprv = p7T_C; while (sprv != p7T_S) { switch (tr->st[tr->N-1]) { /* C(i) comes from C(i-1) or E(i) */ case p7T_C: if (XMX(i,p7G_C) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible C reached at i=%d", i); sc[0] = XMX(i-1, p7G_C) + gm->xsc[p7P_C][p7P_LOOP]; sc[1] = XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_MOVE]; esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_C : p7T_E; break; /* E connects from any M or D state. 
k set here */ case p7T_E: if (XMX(i, p7G_E) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible E reached at i=%d", i); if (p7_profile_IsLocal(gm)) { /* local models come from any M, D */ sc[0] = sc[M+1] = -eslINFINITY; for (k = 1; k <= M; k++) sc[k] = MMX(i,k); for (k = 2; k <= M; k++) sc[k+M] = DMX(i,k); esl_vec_FLogNorm(sc, 2*M+1); /* now sc is a prob vector */ k = esl_rnd_FChoose(r, sc, 2*M+1); if (k <= M) scur = p7T_M; else { k -= M; scur = p7T_D; } } else { /* glocal models come from M_M or D_M */ k = M; sc[0] = MMX(i,M); sc[1] = DMX(i,M); esl_vec_FLogNorm(sc, 2); /* now sc is a prob vector */ scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_D; } break; /* M connects from {MDI} i-1,k-1, or B */ case p7T_M: if (MMX(i,k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible M reached at k=%d,i=%d", k,i); sc[0] = XMX(i-1,p7G_B) + TSC(p7P_BM, k-1); sc[1] = MMX(i-1,k-1) + TSC(p7P_MM, k-1); sc[2] = IMX(i-1,k-1) + TSC(p7P_IM, k-1); sc[3] = DMX(i-1,k-1) + TSC(p7P_DM, k-1); esl_vec_FLogNorm(sc, 4); switch (esl_rnd_FChoose(r, sc, 4)) { case 0: scur = p7T_B; break; case 1: scur = p7T_M; break; case 2: scur = p7T_I; break; case 3: scur = p7T_D; break; } k--; i--; break; /* D connects from M,D at i,k-1 */ case p7T_D: if (DMX(i, k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible D reached at k=%d,i=%d", k,i); sc[0] = MMX(i, k-1) + TSC(p7P_MD, k-1); sc[1] = DMX(i, k-1) + TSC(p7P_DD, k-1); esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_D; k--; break; /* I connects from M,I at i-1,k */ case p7T_I: if (IMX(i,k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible I reached at k=%d,i=%d", k,i); sc[0] = MMX(i-1,k) + TSC(p7P_MI, k); sc[1] = IMX(i-1,k) + TSC(p7P_II, k); esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_M : p7T_I; i--; break; /* N connects from S, N */ case p7T_N: if (XMX(i, p7G_N) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible N reached at i=%d", i); scur = (i == 0) ? 
p7T_S : p7T_N; break; /* B connects from N, J */ case p7T_B: if (XMX(i,p7G_B) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible B reached at i=%d", i); sc[0] = XMX(i, p7G_N) + gm->xsc[p7P_N][p7P_MOVE]; sc[1] = XMX(i, p7G_J) + gm->xsc[p7P_J][p7P_MOVE]; esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_N : p7T_J; break; /* J connects from E(i) or J(i-1) */ case p7T_J: if (XMX(i,p7G_J) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible J reached at i=%d", i); sc[0] = XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP]; sc[1] = XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_LOOP]; esl_vec_FLogNorm(sc, 2); scur = (esl_rnd_FChoose(r, sc, 2) == 0) ? p7T_J : p7T_E; break; default: ESL_XEXCEPTION(eslFAIL, "bogus state in traceback"); } /* end switch over statetype[tpos-1] */ /* Append this state and the current i,k to be explained to the growing trace */ if ((status = p7_trace_Append(tr, scur, k, i)) != eslOK) goto ERROR; /* For NCJ, we had to defer i decrement. */ if ( (scur == p7T_N || scur == p7T_J || scur == p7T_C) && scur == sprv) i--; sprv = scur; } /* end traceback, at S state */ if ((status = p7_trace_Reverse(tr)) != eslOK) goto ERROR; tr->M = gm->M; tr->L = L; free(sc); return eslOK; ERROR: if (sc != NULL) free(sc); return status; }
/* Function:  p7_oprofile_MPIPackSize()
 * Synopsis:  Calculates size needed to pack an OPROFILE.
 * Incept:    MSF, Wed Oct 21, 2009 [Janelia]
 *
 * Purpose:   Work out an upper bound on how many bytes
 *            <p7_oprofile_MPIPack()> needs in order to pack
 *            OPROFILE <om> into a packed MPI message for
 *            communicator <comm>. The bound is the sum of
 *            <MPI_Pack_size()> answers for each piece of the
 *            profile, and is returned in <*ret_n>.
 *
 * Args:      om    - optimized profile to be packed
 *            comm  - MPI communicator the message will use
 *            ret_n - RETURN: upper bound on packed size, in bytes
 *
 * Returns:   <eslOK> on success, and <*ret_n> contains the answer.
 *
 * Throws:    <eslESYS> if an MPI call fails, and <*ret_n> is 0.
 */
int
p7_oprofile_MPIPackSize(P7_OPROFILE *om, MPI_Comm comm, int *ret_n)
{
  int   status;
  int   total   = 0;                      /* running byte total                         */
  int   part;                             /* answer from the latest MPI_Pack_size()     */
  int   nchars  = 0;                      /* total length of annotation strings + NULs  */
  int   nfloats;                          /* number of floats in the stats arrays       */
  int   Kp      = om->abc->Kp;
  int   nqf     = p7O_NQF(om->M);         /* vector counts for float, word, byte rows   */
  int   nqw     = p7O_NQW(om->M);
  int   nqb     = p7O_NQB(om->M);
  int   vsz     = sizeof(vector float);

  /* MSV Filter information: five chars, one float, then Kp rows of byte vectors */
  if (MPI_Pack_size(5, MPI_CHAR, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += part;

  if (MPI_Pack_size(1, MPI_FLOAT, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += part;

  if (MPI_Pack_size(vsz*nqb, MPI_CHAR, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += (Kp*part);

  /* Viterbi Filter information */
  if (MPI_Pack_size(1, MPI_SHORT, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += ((p7O_NXSTATES*p7O_NXTRANS+2)*part);

  if (MPI_Pack_size(2, MPI_FLOAT, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += part;

  if (MPI_Pack_size(Kp*vsz*nqw, MPI_CHAR, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += part;

  if (MPI_Pack_size(8*vsz*nqw, MPI_CHAR, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += part;

  /* Forward/Backward information */
  if (MPI_Pack_size(1, MPI_FLOAT, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += (p7O_NXSTATES*p7O_NXTRANS*part);

  if (MPI_Pack_size(Kp*vsz*nqf, MPI_CHAR, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += part;

  if (MPI_Pack_size(8*vsz*nqf, MPI_CHAR, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += part;

  /* disk offsets */
  if (MPI_Pack_size(1, MPI_LONG_LONG_INT, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += ((p7_NOFFSETS+2)*part);

  /* annotation info: six ints, plus every string that is present (with its NUL) */
  if (om->name      != NULL) nchars += strlen(om->name)      + 1;
  if (om->acc       != NULL) nchars += strlen(om->acc)       + 1;
  if (om->desc      != NULL) nchars += strlen(om->desc)      + 1;
  if (om->rf        != NULL) nchars += strlen(om->rf)        + 1;
  if (om->cs        != NULL) nchars += strlen(om->cs)        + 1;
  if (om->consensus != NULL) nchars += strlen(om->consensus) + 1;

  if (MPI_Pack_size(6, MPI_INT, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += part;

  if (MPI_Pack_size(nchars, MPI_CHAR, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += part;

  nfloats = p7_NEVPARAM + p7_NCUTOFFS + p7_MAXABET;
  if (MPI_Pack_size(nfloats, MPI_FLOAT, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += part;

  /* current model size */
  if (MPI_Pack_size(4, MPI_INT, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += part;

  if (MPI_Pack_size(1, MPI_FLOAT, comm, &part) != 0)
    ESL_XEXCEPTION(eslESYS, "pack size failed");
  total += part;

  *ret_n = total;
  return eslOK;

 ERROR:
  *ret_n = 0;
  return status;
}
/* Function:  p7_oprofile_MPIUnpack()
 * Synopsis:  Unpacks an OPROFILE from an MPI buffer.
 * Incept:    MSF, Wed Oct 21, 2009 [Janelia]
 *
 * Purpose:   Unpack a newly allocated OPROFILE from MPI packed buffer
 *            <buf>, starting from position <*pos>, where the total length
 *            of the buffer in bytes is <n>.
 *
 *            Caller may or may not already know what alphabet the OPROFILE
 *            is expected to be in.  A reference to the current
 *            alphabet is passed in <abc>. If the alphabet is unknown,
 *            pass <*abc = NULL>, and when the OPROFILE is received, an
 *            appropriate new alphabet object is allocated and passed
 *            back to the caller via <*abc>.  If the alphabet is
 *            already known, <*abc> is that alphabet, and the new
 *            OPROFILE's alphabet type is verified to agree with it. This
 *            mechanism allows an application to let the first OPROFILE
 *            determine the alphabet type for the application, while
 *            still keeping the alphabet under the application's scope
 *            of control.
 *
 * Args:      buf    - MPI packed buffer to read from
 *            n      - total length of <buf> in bytes
 *            pos    - current position in <buf>; advanced as data is read
 *            comm   - MPI communicator the buffer was packed for
 *            abc    - alphabet to verify against, or <*abc = NULL> to
 *                     have one created here
 *            ret_om - RETURN: the newly allocated OPROFILE
 *
 * Returns:   <eslOK> on success. <*pos> is updated to the position of
 *            the next element in <buf> to unpack (if any). <*ret_om>
 *            contains a newly allocated OPROFILE, which the caller is
 *            responsible for free'ing.  If <*abc> was passed as
 *            <NULL>, it now points to an <ESL_ALPHABET> object that
 *            was allocated here; caller is responsible for free'ing
 *            this.
 *
 *            Returns <eslEINCOMPAT> if the OPROFILE is in a different
 *            alphabet than <*abc> said to expect. In this case,
 *            <*abc> is unchanged, <*pos> has already been advanced
 *            past the model length and alphabet type fields, and
 *            <*ret_om> is <NULL>.
 *
 * Throws:    <eslESYS> on an MPI call failure. <eslEMEM> on allocation failure.
 *            In either case, any partially built OPROFILE is destroyed,
 *            <*ret_om> is <NULL>, and the state of <buf> and <*pos> is
 *            undefined and should be considered to be corrupted. If
 *            <*abc> was <NULL> on entry and an alphabet had already been
 *            created before the failure, <*abc> stays set and the caller
 *            still owns it.
 */
int
p7_oprofile_MPIUnpack(char *buf, int n, int *pos, MPI_Comm comm, ESL_ALPHABET **abc, P7_OPROFILE **ret_om)
{
  int           status;
  int           M, K, atype;
  int           len;
  int           x;
  int           Q4, Q8, Q16;
  int           vsz = sizeof(vector float);
  char        **annot[6];          /* annotation string fields, in the order they appear in the buffer */
  P7_OPROFILE  *om  = NULL;

  if (MPI_Unpack(buf, n, pos, &M,     1, MPI_INT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &atype, 1, MPI_INT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");

  /* Set or verify the alphabet */
  if (*abc == NULL)
    { /* still unknown: set it, pass control of it back to caller */
      if ((*abc = esl_alphabet_Create(atype)) == NULL) { status = eslEMEM; goto ERROR; }
    }
  else
    { /* already known: check it */
      if ((*abc)->type != atype) { status = eslEINCOMPAT; goto ERROR; }
    }

  Q4  = p7O_NQF(M);
  Q8  = p7O_NQW(M);
  Q16 = p7O_NQB(M);

  if ((om = p7_oprofile_Create(M, *abc)) == NULL) { status = eslEMEM; goto ERROR; }
  om->M = M;

  K = (*abc)->Kp;

  /* From here on <om> is live: every failure must go through ERROR
   * (ESL_XEXCEPTION, not ESL_EXCEPTION) so that <om> is destroyed
   * rather than leaked.
   */

  /* model configuration */
  if (MPI_Unpack(buf, n, pos, &om->L,    1, MPI_INT,   comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->mode, 1, MPI_INT,   comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->nj,   1, MPI_FLOAT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");

  /* MSV Filter information */
  if (MPI_Unpack(buf, n, pos, &om->tbm_b,   1, MPI_CHAR,  comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->tec_b,   1, MPI_CHAR,  comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->tjb_b,   1, MPI_CHAR,  comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->scale_b, 1, MPI_FLOAT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->base_b,  1, MPI_CHAR,  comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->bias_b,  1, MPI_CHAR,  comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  for (x = 0; x < K; x++)
    if (MPI_Unpack(buf, n, pos, om->rbv[x], vsz*Q16, MPI_CHAR, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");

  /* Viterbi Filter information */
  if (MPI_Unpack(buf, n, pos, &om->scale_w,      1, MPI_FLOAT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->base_w,       1, MPI_SHORT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->ddbound_w,    1, MPI_SHORT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->ncj_roundoff, 1, MPI_FLOAT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, om->twv, 8*vsz*Q8, MPI_CHAR, comm)     != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  for (x = 0; x < p7O_NXSTATES; x++)
    if (MPI_Unpack(buf, n, pos, om->xw[x], p7O_NXTRANS, MPI_SHORT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  for (x = 0; x < K; x++)
    if (MPI_Unpack(buf, n, pos, om->rwv[x], vsz*Q8, MPI_CHAR, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");

  /* Forward/Backward information */
  if (MPI_Unpack(buf, n, pos, om->tfv, 8*vsz*Q4, MPI_CHAR, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  for (x = 0; x < p7O_NXSTATES; x++)
    if (MPI_Unpack(buf, n, pos, om->xf[x], p7O_NXTRANS, MPI_FLOAT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  for (x = 0; x < K; x++)
    if (MPI_Unpack(buf, n, pos, om->rfv[x], vsz*Q4, MPI_CHAR, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");

  /* Disk offsets */
  if (MPI_Unpack(buf, n, pos, om->offs,  p7_NOFFSETS, MPI_LONG_LONG_INT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->roff, 1,           MPI_LONG_LONG_INT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, &om->eoff, 1,           MPI_LONG_LONG_INT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");

  /* Annotation information: each field is a length (which counts the
   * trailing NUL) followed by that many bytes; a length <= 0 means the
   * field is absent and nothing follows it.
   */
  annot[0] = &om->name;
  annot[1] = &om->acc;
  annot[2] = &om->desc;
  annot[3] = &om->rf;
  annot[4] = &om->cs;
  annot[5] = &om->consensus;
  for (x = 0; x < 6; x++)
    {
      if (MPI_Unpack(buf, n, pos, &len, 1, MPI_INT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
      if (len > 0)
        {
          ESL_ALLOC(*annot[x], len);
          if (MPI_Unpack(buf, n, pos, *annot[x], len, MPI_CHAR, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
          (*annot[x])[len-1] = '\0';   /* force termination regardless of buffer contents */
        }
    }

  /* Statistical parameters, score cutoffs, composition */
  if (MPI_Unpack(buf, n, pos, om->evparam, p7_NEVPARAM, MPI_FLOAT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, om->cutoff,  p7_NCUTOFFS, MPI_FLOAT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");
  if (MPI_Unpack(buf, n, pos, om->compo,   p7_MAXABET,  MPI_FLOAT, comm) != 0) ESL_XEXCEPTION(eslESYS, "mpi unpack failed");

  *ret_om = om;
  return eslOK;

 ERROR:
  if (om != NULL) p7_oprofile_Destroy(om);
  *ret_om = NULL;   /* documented contract: no OPROFILE is handed back on failure */
  return status;
}